# Authors: Olivier Grisel <olivier.grisel@ensta.org>
#          Mathieu Blondel <mathieu@mblondel.org>
#          Lars Buitinck
#          Robert Layton <robertlayton@gmail.com>
#          Jochen Wersdörfer <jochen@wersdoerfer.de>
#          Roman Sinayev <roman.sinayev@gmail.com>
#
# License: BSD 3 clause
"""
The :mod:`sklearn.feature_extraction.text` submodule gathers utilities to
build feature vectors from text documents.
"""

import array
import re
import unicodedata
import warnings
from collections import defaultdict
from collections.abc import Mapping
from functools import partial
from numbers import Integral
from operator import itemgetter

import numpy as np
import scipy.sparse as sp

from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context
from ..exceptions import NotFittedError
from ..preprocessing import normalize
from ..utils import _IS_32BIT
from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions
from ..utils.validation import FLOAT_DTYPES, check_array, check_is_fitted
from ._hash import FeatureHasher
from ._stop_words import ENGLISH_STOP_WORDS

__all__ = [
    "HashingVectorizer",
    "CountVectorizer",
    "ENGLISH_STOP_WORDS",
    "TfidfTransformer",
    "TfidfVectorizer",
    "strip_accents_ascii",
    "strip_accents_unicode",
    "strip_tags",
]


def _preprocess(doc, accent_function=None, lower=False):
    """Chain together an optional series of text preprocessing steps to
    apply to a document.

    Parameters
    ----------
    doc: str
        The string to preprocess
    accent_function: callable, default=None
        Function for handling accented characters. Common strategies include
        normalizing and removing.
    lower: bool, default=False
        Whether to use str.lower to lowercase all of the text

    Returns
    -------
    doc: str
        preprocessed string
    """
    if lower:
        doc = doc.lower()
    if accent_function is not None:
        doc = accent_function(doc)
    return doc


def _analyze(
    doc,
    analyzer=None,
    tokenizer=None,
    ngrams=None,
    preprocessor=None,
    decoder=None,
    stop_words=None,
):
    """Chain together an optional series of text processing steps to go from
    a single document to ngrams, with or without tokenizing or preprocessing.

    If analyzer is used, only the decoder argument is used, as the analyzer is
    intended to replace the preprocessor, tokenizer, and ngrams steps.

    Parameters
    ----------
    analyzer: callable, default=None
    tokenizer: callable, default=None
    ngrams: callable, default=None
    preprocessor: callable, default=None
    decoder: callable, default=None
    stop_words: list, default=None

    Returns
    -------
    ngrams: list
        A sequence of tokens, possibly with pairs, triples, etc.
    """
    if decoder is not None:
        doc = decoder(doc)
    if analyzer is not None:
        doc = analyzer(doc)
    else:
        if preprocessor is not None:
            doc = preprocessor(doc)
        if tokenizer is not None:
            doc = tokenizer(doc)
        if ngrams is not None:
            if stop_words is not None:
                doc = ngrams(doc, stop_words)
            else:
                doc = ngrams(doc)
    return doc


def strip_accents_unicode(s):
    """Transform accentuated unicode symbols into their simple counterpart.

    Warning: the python-level loop and join operations make this
    implementation 20 times slower than the strip_accents_ascii basic
    normalization.

    Parameters
    ----------
    s : str
        The string to strip.

    Returns
    -------
    s : str
        The stripped string.

    See Also
    --------
    strip_accents_ascii : Remove accentuated char for any unicode symbol that
        has a direct ASCII equivalent.
    """
    try:
        # If `s` is ASCII-compatible, then it does not contain any accented
        # characters and we can avoid an expensive list comprehension
        s.encode("ASCII", errors="strict")
        return s
    except UnicodeEncodeError:
        normalized = unicodedata.normalize("NFKD", s)
        return "".join([c for c in normalized if not unicodedata.combining(c)])


def strip_accents_ascii(s):
    """Transform accentuated unicode symbols into ascii or nothing.

    Warning: this solution is only suited for languages that have a direct
    transliteration to ASCII symbols.

    Parameters
    ----------
    s : str
        The string to strip.

    Returns
    -------
    s : str
        The stripped string.

    See Also
    --------
    strip_accents_unicode : Remove accentuated char for any unicode symbol.
    """
    nkfd_form = unicodedata.normalize("NFKD", s)
    return nkfd_form.encode("ASCII", "ignore").decode("ASCII")
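
# A minimal usage sketch (illustrative only): characters without an ASCII
# counterpart are dropped rather than transliterated, e.g.
#
#   >>> strip_accents_ascii("Málaga")
#   'Malaga'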


def strip_tags(s):
    """Basic regexp based HTML / XML tag stripper function.

    For serious HTML/XML preprocessing you should rather use an external
    library such as lxml or BeautifulSoup.

    Parameters
    ----------
    s : str
        The string to strip.

    Returns
    -------
    s : str
        The stripped string.
    """
    return re.compile(r"<([^>]+)>", flags=re.UNICODE).sub(" ", s)
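
# A minimal usage sketch (illustrative only): each tag is replaced by a single
# space, so markup boundaries become whitespace instead of vanishing, e.g.
#
#   >>> strip_tags("<b>bold</b> text")
#   ' bold  text'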


def _check_stop_list(stop):
    if stop == "english":
        return ENGLISH_STOP_WORDS
    elif isinstance(stop, str):
        raise ValueError("not a built-in stop list: %s" % stop)
    elif stop is None:
        return None
    else:  # assume it's a collection
        return frozenset(stop)


class _VectorizerMixin:
    """Provides common code for text vectorizers (tokenization logic)."""

    _white_spaces = re.compile(r"\s\s+")

    def decode(self, doc):
        """Decode the input into a string of unicode symbols.

        The decoding strategy depends on the vectorizer parameters.

        Parameters
        ----------
        doc : bytes or str
            The string to decode.

        Returns
        -------
        doc: str
            A string of unicode symbols.
        """
        if self.input == "filename":
            with open(doc, "rb") as fh:
                doc = fh.read()
        elif self.input == "file":
            doc = doc.read()
        if isinstance(doc, bytes):
            doc = doc.decode(self.encoding, self.decode_error)
        if doc is np.nan:
            raise ValueError(
                "np.nan is an invalid document, expected byte or unicode string."
            )
        return doc

    def _word_ngrams(self, tokens, stop_words=None):
        """Turn tokens into a sequence of n-grams after stop words filtering"""
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]

        # handle token n-grams
        min_n, max_n = self.ngram_range
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                # no need to do any slicing for unigrams
                # just iterate through the original tokens
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []

            n_original_tokens = len(original_tokens)

            # bind method outside of loop to reduce overhead
            tokens_append = tokens.append
            space_join = " ".join

            for n in range(min_n, min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    tokens_append(space_join(original_tokens[i : i + n]))

        return tokens
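
    # A minimal end-to-end sketch (illustrative only, using the default word
    # analyzer): unigrams are kept and higher-order n-grams are appended, e.g.
    #
    #   >>> CountVectorizer(ngram_range=(1, 2)).build_analyzer()("the quick fox")
    #   ['the', 'quick', 'fox', 'the quick', 'quick fox']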

    def _char_ngrams(self, text_document):
        """Tokenize text_document into a sequence of character n-grams"""
        # normalize white spaces
        text_document = self._white_spaces.sub(" ", text_document)

        text_len = len(text_document)
        min_n, max_n = self.ngram_range
        if min_n == 1:
            # no need to do any slicing for unigrams
            # iterate through the string
            ngrams = list(text_document)
            min_n += 1
        else:
            ngrams = []

        # bind method outside of loop to reduce overhead
        ngrams_append = ngrams.append

        for n in range(min_n, min(max_n + 1, text_len + 1)):
            for i in range(text_len - n + 1):
                ngrams_append(text_document[i : i + n])
        return ngrams

    def _char_wb_ngrams(self, text_document):
        """Whitespace sensitive char-n-gram tokenization.

        Tokenize text_document into a sequence of character n-grams
        operating only inside word boundaries. n-grams at the edges
        of words are padded with space."""
        # normalize white spaces
        text_document = self._white_spaces.sub(" ", text_document)

        min_n, max_n = self.ngram_range
        ngrams = []

        # bind method outside of loop to reduce overhead
        ngrams_append = ngrams.append

        for w in text_document.split():
            w = " " + w + " "
            w_len = len(w)
            for n in range(min_n, max_n + 1):
                offset = 0
                ngrams_append(w[offset : offset + n])
                while offset + n < w_len:
                    offset += 1
                    ngrams_append(w[offset : offset + n])
                if offset == 0:  # count a short word (w_len < n) only once
                    break
        return ngrams
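
    # A minimal sketch (illustrative only): each word is padded with a leading
    # and trailing space before the character windows are taken, e.g.
    #
    #   >>> analyze = CountVectorizer(analyzer="char_wb", ngram_range=(3, 3)).build_analyzer()
    #   >>> analyze("hi you")
    #   [' hi', 'hi ', ' yo', 'you', 'ou ']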

    def build_preprocessor(self):
        """Return a function to preprocess the text before tokenization.

        Returns
        -------
        preprocessor: callable
            A function to preprocess the text before tokenization.
        """
        if self.preprocessor is not None:
            return self.preprocessor

        # accent stripping
        if not self.strip_accents:
            strip_accents = None
        elif callable(self.strip_accents):
            strip_accents = self.strip_accents
        elif self.strip_accents == "ascii":
            strip_accents = strip_accents_ascii
        elif self.strip_accents == "unicode":
            strip_accents = strip_accents_unicode
        else:
            raise ValueError(
                'Invalid value for "strip_accents": %s' % self.strip_accents
            )

        return partial(_preprocess, accent_function=strip_accents, lower=self.lowercase)

    def build_tokenizer(self):
        """Return a function that splits a string into a sequence of tokens.

        Returns
        -------
        tokenizer: callable
            A function to split a string into a sequence of tokens.
        """
        if self.tokenizer is not None:
            return self.tokenizer
        token_pattern = re.compile(self.token_pattern)

        if token_pattern.groups > 1:
            raise ValueError(
                "More than 1 capturing group in token pattern. Only a single "
                "group should be captured."
            )

        return token_pattern.findall
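
    # A minimal sketch (illustrative only, assuming the default token_pattern):
    # the default regexp keeps alphanumeric runs of two or more characters and
    # drops punctuation and single-character tokens, e.g.
    #
    #   >>> CountVectorizer().build_tokenizer()("A sample sentence, tokenized!")
    #   ['sample', 'sentence', 'tokenized']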

    def get_stop_words(self):
        """Build or fetch the effective stop words list.

        Returns
        -------
        stop_words: list or None
            A list of stop words.
        """
        return _check_stop_list(self.stop_words)

    def _check_stop_words_consistency(self, stop_words, preprocess, tokenize):
        """Check if stop words are consistent

        Returns
        -------
        is_consistent : True if stop words are consistent with the preprocessor
                        and tokenizer, False if they are not, None if the check
                        was previously performed, "error" if it could not be
                        performed (e.g. because of the use of a custom
                        preprocessor / tokenizer)
        """
        if id(self.stop_words) == getattr(self, "_stop_words_id", None):
            # Stop words were previously validated
            return None

        # NB: stop_words is validated, unlike self.stop_words
        try:
            inconsistent = set()
            for w in stop_words or ():
                tokens = list(tokenize(preprocess(w)))
                for token in tokens:
                    if token not in stop_words:
                        inconsistent.add(token)
            self._stop_words_id = id(self.stop_words)

            if inconsistent:
                warnings.warn(
                    "Your stop_words may be inconsistent with "
                    "your preprocessing. Tokenizing the stop "
                    "words generated tokens %r not in "
                    "stop_words."
                    % sorted(inconsistent)
                )
            return not inconsistent
        except Exception:
            # Failed to check stop words consistency (e.g. because a custom
            # preprocessor or tokenizer was used)
            self._stop_words_id = id(self.stop_words)
            return "error"

    def build_analyzer(self):
        """Return a callable to process input data.

        The callable handles preprocessing, tokenization, and n-grams generation.

        Returns
        -------
        analyzer: callable
            A function to handle preprocessing, tokenization
            and n-grams generation.
        """
        if callable(self.analyzer):
            return partial(_analyze, analyzer=self.analyzer, decoder=self.decode)

        preprocess = self.build_preprocessor()

        if self.analyzer == "char":
            return partial(
                _analyze,
                ngrams=self._char_ngrams,
                preprocessor=preprocess,
                decoder=self.decode,
            )
        elif self.analyzer == "char_wb":
            return partial(
                _analyze,
                ngrams=self._char_wb_ngrams,
                preprocessor=preprocess,
                decoder=self.decode,
            )
        elif self.analyzer == "word":
            stop_words = self.get_stop_words()
            tokenize = self.build_tokenizer()
            self._check_stop_words_consistency(stop_words, preprocess, tokenize)
            return partial(
                _analyze,
                ngrams=self._word_ngrams,
                tokenizer=tokenize,
                preprocessor=preprocess,
                decoder=self.decode,
                stop_words=stop_words,
            )
        else:
            raise ValueError(
                "%s is not a valid tokenization scheme/analyzer" % self.analyzer
            )

    def _validate_vocabulary(self):
        vocabulary = self.vocabulary
        if vocabulary is not None:
            if isinstance(vocabulary, set):
                vocabulary = sorted(vocabulary)
            if not isinstance(vocabulary, Mapping):
                vocab = {}
                for i, t in enumerate(vocabulary):
                    if vocab.setdefault(t, i) != i:
                        msg = "Duplicate term in vocabulary: %r" % t
                        raise ValueError(msg)
                vocabulary = vocab
            else:
                indices = set(vocabulary.values())
                if len(indices) != len(vocabulary):
                    raise ValueError("Vocabulary contains repeated indices.")
                for i in range(len(vocabulary)):
                    if i not in indices:
                        msg = "Vocabulary of size %d doesn't contain index %d." % (
                            len(vocabulary),
                            i,
                        )
                        raise ValueError(msg)
            if not vocabulary:
                raise ValueError("empty vocabulary passed to fit")
            self.fixed_vocabulary_ = True
            self.vocabulary_ = dict(vocabulary)
        else:
            self.fixed_vocabulary_ = False

    def _check_vocabulary(self):
        """Check if vocabulary is empty or missing (not fitted)"""
        if not hasattr(self, "vocabulary_"):
            self._validate_vocabulary()
            if not self.fixed_vocabulary_:
                raise NotFittedError("Vocabulary not fitted or provided")

        if len(self.vocabulary_) == 0:
            raise ValueError("Vocabulary is empty")

    def _validate_ngram_range(self):
        """Check validity of ngram_range parameter"""
        min_n, max_m = self.ngram_range
        if min_n > max_m:
            raise ValueError(
                "Invalid value for ngram_range=%s "
                "lower boundary larger than the upper boundary."
                % str(self.ngram_range)
            )

    def _warn_for_unused_params(self):
        if self.tokenizer is not None and self.token_pattern is not None:
            warnings.warn(
                "The parameter 'token_pattern' will not be used"
                " since 'tokenizer' is not None"
            )

        if self.preprocessor is not None and callable(self.analyzer):
            warnings.warn(
                "The parameter 'preprocessor' will not be used"
                " since 'analyzer' is callable"
            )

        if (
            self.ngram_range != (1, 1)
            and self.ngram_range is not None
            and callable(self.analyzer)
        ):
            warnings.warn(
                "The parameter 'ngram_range' will not be used"
                " since 'analyzer' is callable"
            )

        if self.analyzer != "word" or callable(self.analyzer):
            if self.stop_words is not None:
                warnings.warn(
                    "The parameter 'stop_words' will not be used"
                    " since 'analyzer' != 'word'"
                )
            if (
                self.token_pattern is not None
                and self.token_pattern != r"(?u)\b\w\w+\b"
            ):
                warnings.warn(
                    "The parameter 'token_pattern' will not be used"
                    " since 'analyzer' != 'word'"
                )
            if self.tokenizer is not None:
                warnings.warn(
                    "The parameter 'tokenizer' will not be used"
                    " since 'analyzer' != 'word'"
                )


class HashingVectorizer(
    TransformerMixin, _VectorizerMixin, BaseEstimator, auto_wrap_output_keys=None
):
    r"""Convert a collection of text documents to a matrix of token occurrences.

    It turns a collection of text documents into a scipy.sparse matrix holding
    token occurrence counts (or binary occurrence information), possibly
    normalized as token frequencies if norm='l1' or projected on the euclidean
    unit sphere if norm='l2'.

    This text vectorizer implementation uses the hashing trick to find the
    token string name to feature integer index mapping.

    This strategy has several advantages:

    - it is very low memory scalable to large datasets as there is no need to
      store a vocabulary dictionary in memory.

    - it is fast to pickle and un-pickle as it holds no state besides the
      constructor parameters.

    - it can be used in a streaming (partial fit) or parallel pipeline as there
      is no state computed during fit.

    There are also a couple of cons (vs using a CountVectorizer with an
    in-memory vocabulary):

    - there is no way to compute the inverse transform (from feature indices to
      string feature names) which can be a problem when trying to introspect
      which features are most important to a model.

    - there can be collisions: distinct tokens can be mapped to the same
      feature index. However in practice this is rarely an issue if n_features
      is large enough (e.g. 2 ** 18 for text classification problems).

    - no IDF weighting as this would render the transformer stateful.

    The hash function employed is the signed 32-bit version of Murmurhash3.

    For an efficiency comparison of the different feature extractors, see
    :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    input : {'filename', 'file', 'content'}, default='content'
        - If `'filename'`, the sequence passed as an argument to fit is
          expected to be a list of filenames that need reading to fetch
          the raw content to analyze.

        - If `'file'`, the sequence items must have a 'read' method (file-like
          object) that is called to fetch the bytes in memory.

        - If `'content'`, the input is expected to be a sequence of items that
          can be of type string or byte.

    encoding : str, default='utf-8'
        If bytes or files are given to analyze, this encoding is used to
        decode.

    decode_error : {'strict', 'ignore', 'replace'}, default='strict'
        Instruction on what to do if a byte sequence is given to analyze that
        contains characters not of the given `encoding`. By default, it is
        'strict', meaning that a UnicodeDecodeError will be raised. Other
        values are 'ignore' and 'replace'.

    strip_accents : {'ascii', 'unicode'} or callable, default=None
        Remove accents and perform other character normalization
        during the preprocessing step.
        'ascii' is a fast method that only works on characters that have
        a direct ASCII mapping.
        'unicode' is a slightly slower method that works on any character.
        None (default) means no character normalization is performed.

        Both 'ascii' and 'unicode' use NFKD normalization from
        :func:`unicodedata.normalize`.

    lowercase : bool, default=True
        Convert all characters to lowercase before tokenizing.

    preprocessor : callable, default=None
        Override the preprocessing (string transformation) stage while
        preserving the tokenizing and n-grams generation steps.
        Only applies if ``analyzer`` is not callable.

    tokenizer : callable, default=None
        Override the string tokenization step while preserving the
        preprocessing and n-grams generation steps.
        Only applies if ``analyzer == 'word'``.

    stop_words : {'english'}, list, default=None
        If 'english', a built-in stop word list for English is used.
        There are several known issues with 'english' and you should
        consider an alternative (see :ref:`stop_words`).

        If a list, that list is assumed to contain stop words, all of which
        will be removed from the resulting tokens.
        Only applies if ``analyzer == 'word'``.

    token_pattern : str or None, default=r"(?u)\\b\\w\\w+\\b"
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

        If there is a capturing group in token_pattern then the
        captured group content, not the entire match, becomes the token.
        At most one capturing group is permitted.

    ngram_range : tuple (min_n, max_n), default=(1, 1)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used. For example an ``ngram_range`` of ``(1, 1)`` means only
        unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means
        only bigrams.
        Only applies if ``analyzer`` is not callable.

    analyzer : {'word', 'char', 'char_wb'} or callable, default='word'
        Whether the feature should be made of word or character n-grams.
        Option 'char_wb' creates character n-grams only from text inside
        word boundaries; n-grams at the edges of words are padded with space.

        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

        .. versionchanged:: 0.21
            Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data
            is first read from the file and then passed to the given callable
            analyzer.

    n_features : int, default=(2 ** 20)
        The number of features (columns) in the output matrices. Small numbers
        of features are likely to cause hash collisions, but large numbers
        will cause larger coefficient dimensions in linear learners.

    binary : bool, default=False
        If True, all non zero counts are set to 1. This is useful for discrete
        probabilistic models that model binary events rather than integer
        counts.

    norm : {'l1', 'l2'}, default='l2'
        Norm used to normalize term vectors. None for no normalization.

    alternate_sign : bool, default=True
        When True, an alternating sign is added to the features so as to
        approximately conserve the inner product in the hashed space even for
        small n_features. This approach is similar to sparse random projection.

        .. versionadded:: 0.19

    dtype : type, default=np.float64
        Type of the matrix returned by fit_transform() or transform().

    See Also
    --------
    CountVectorizer : Convert a collection of text documents to a matrix of
        token counts.
    TfidfVectorizer : Convert a collection of raw documents to a matrix of
        TF-IDF features.

    Notes
    -----
    This estimator is :term:`stateless` and does not need to be fitted.
    However, we recommend calling :meth:`fit_transform` instead of
    :meth:`transform`, as parameter validation is only performed in
    :meth:`fit`.

    Examples
    --------
    >>> from sklearn.feature_extraction.text import HashingVectorizer
    >>> corpus = [
    ...     'This is the first document.',
    ...     'This document is the second document.',
    ...     'And this is the third one.',
    ...     'Is this the first document?',
    ... ]
    >>> vectorizer = HashingVectorizer(n_features=2**4)
    >>> X = vectorizer.fit_transform(corpus)
    >>> print(X.shape)
    (4, 16)
    """

    _parameter_constraints: dict = {
        "input": [StrOptions({"filename", "file", "content"})],
        "encoding": [str],
        "decode_error": [StrOptions({"strict", "ignore", "replace"})],
        "strip_accents": [StrOptions({"ascii", "unicode"}), None, callable],
        "lowercase": ["boolean"],
        "preprocessor": [callable, None],
        "tokenizer": [callable, None],
        "stop_words": [StrOptions({"english"}), list, None],
        "token_pattern": [str, None],
        "ngram_range": [tuple],
        "analyzer": [StrOptions({"word", "char", "char_wb"}), callable],
        "n_features": [Interval(Integral, 1, np.iinfo(np.int32).max, closed="left")],
        "binary": ["boolean"],
        "norm": [StrOptions({"l1", "l2"}), None],
        "alternate_sign": ["boolean"],
        "dtype": "no_validation",  # delegate to numpy
    }

    def __init__(
        self,
        *,
        input="content",
        encoding="utf-8",
        decode_error="strict",
        strip_accents=None,
        lowercase=True,
        preprocessor=None,
        tokenizer=None,
        stop_words=None,
        token_pattern=r"(?u)\b\w\w+\b",
        ngram_range=(1, 1),
        analyzer="word",
        n_features=(2**20),
        binary=False,
        norm="l2",
        alternate_sign=True,
        dtype=np.float64,
    ):
        self.input = input
        self.encoding = encoding
        self.decode_error = decode_error
        self.strip_accents = strip_accents
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer
        self.analyzer = analyzer
        self.lowercase = lowercase
        self.token_pattern = token_pattern
        self.stop_words = stop_words
        self.n_features = n_features
        self.ngram_range = ngram_range
        self.binary = binary
        self.norm = norm
        self.alternate_sign = alternate_sign
        self.dtype = dtype

    @_fit_context(prefer_skip_nested_validation=True)
    def partial_fit(self, X, y=None):
        """Only validates estimator's parameters.

        This method allows one to: (i) validate the estimator's parameters and
        (ii) be consistent with the scikit-learn transformer API.

        Parameters
        ----------
        X : ndarray of shape [n_samples, n_features]
            Training data.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            HashingVectorizer instance.
        """
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Only validates estimator's parameters.

        This method allows one to: (i) validate the estimator's parameters and
        (ii) be consistent with the scikit-learn transformer API.

        Parameters
        ----------
        X : ndarray of shape [n_samples, n_features]
            Training data.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        self : object
            HashingVectorizer instance.
        """
        # triggers a parameter validation
        if isinstance(X, str):
            raise ValueError(
                "Iterable over raw text documents expected, string object received."
            )

        self._warn_for_unused_params()
        self._validate_ngram_range()

        self._get_hasher().fit(X, y=y)
        return self

    def transform(self, X):
        """Transform a sequence of documents to a document-term matrix.

        Parameters
        ----------
        X : iterable over raw text documents, length = n_samples
            Samples. Each sample must be a text document (either bytes or
            unicode strings, file name or file object depending on the
            constructor argument) which will be tokenized and hashed.

        Returns
        -------
        X : sparse matrix of shape (n_samples, n_features)
            Document-term matrix.
        """
        if isinstance(X, str):
            raise ValueError(
                "Iterable over raw text documents expected, string object received."
            )

        self._validate_ngram_range()

        analyzer = self.build_analyzer()
        X = self._get_hasher().transform(analyzer(doc) for doc in X)
        if self.binary:
            X.data.fill(1)
        if self.norm is not None:
            X = normalize(X, norm=self.norm, copy=False)
        return X

    def fit_transform(self, X, y=None):
        """Transform a sequence of documents to a document-term matrix.

        Parameters
        ----------
        X : iterable over raw text documents, length = n_samples
            Samples. Each sample must be a text document (either bytes or
            unicode strings, file name or file object depending on the
            constructor argument) which will be tokenized and hashed.

        y : any
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        X : sparse matrix of shape (n_samples, n_features)
            Document-term matrix.
        """
        return self.fit(X, y).transform(X)

    def _get_hasher(self):
        return FeatureHasher(
            n_features=self.n_features,
            input_type="string",
            dtype=self.dtype,
            alternate_sign=self.alternate_sign,
        )

    def _more_tags(self):
        return {"X_types": ["string"]}


def _document_frequency(X):
    """Count the number of non-zero values for each feature in sparse X."""
    if sp.issparse(X) and X.format == "csr":
        return np.bincount(X.indices, minlength=X.shape[1])
    else:
        return np.diff(X.indptr)
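
# A minimal sketch (illustrative only): for a CSR matrix, the document
# frequency of a column is the number of rows with a nonzero entry in it, e.g.
#
#   >>> X = sp.csr_matrix([[1, 0, 3], [2, 0, 0]])
#   >>> _document_frequency(X)
#   array([2, 0, 1])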


class CountVectorizer(_VectorizerMixin, BaseEstimator):
    r"""Convert a collection of text documents to a matrix of token counts.

    This implementation produces a sparse representation of the counts using
    scipy.sparse.csr_matrix.

    If you do not provide an a-priori dictionary and you do not use an analyzer
    that does some kind of feature selection then the number of features will
    be equal to the vocabulary size found by analyzing the data.

    For an efficiency comparison of the different feature extractors, see
    :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    input : {'filename', 'file', 'content'}, default='content'
        - If `'filename'`, the sequence passed as an argument to fit is
          expected to be a list of filenames that need reading to fetch
          the raw content to analyze.

        - If `'file'`, the sequence items must have a 'read' method (file-like
          object) that is called to fetch the bytes in memory.

        - If `'content'`, the input is expected to be a sequence of items that
          can be of type string or byte.

    encoding : str, default='utf-8'
        If bytes or files are given to analyze, this encoding is used to
        decode.

    decode_error : {'strict', 'ignore', 'replace'}, default='strict'
        Instruction on what to do if a byte sequence is given to analyze that
        contains characters not of the given `encoding`. By default, it is
        'strict', meaning that a UnicodeDecodeError will be raised. Other
        values are 'ignore' and 'replace'.

    strip_accents : {'ascii', 'unicode'} or callable, default=None
        Remove accents and perform other character normalization
        during the preprocessing step.
        'ascii' is a fast method that only works on characters that have
        a direct ASCII mapping.
        'unicode' is a slightly slower method that works on any character.
        None (default) means no character normalization is performed.

        Both 'ascii' and 'unicode' use NFKD normalization from
        :func:`unicodedata.normalize`.

    lowercase : bool, default=True
        Convert all characters to lowercase before tokenizing.

    preprocessor : callable, default=None
        Override the preprocessing (strip_accents and lowercase) stage while
        preserving the tokenizing and n-grams generation steps.
        Only applies if ``analyzer`` is not callable.

    tokenizer : callable, default=None
        Override the string tokenization step while preserving the
        preprocessing and n-grams generation steps.
        Only applies if ``analyzer == 'word'``.

    stop_words : {'english'}, list, default=None
        If 'english', a built-in stop word list for English is used.
        There are several known issues with 'english' and you should
        consider an alternative (see :ref:`stop_words`).

        If a list, that list is assumed to contain stop words, all of which
        will be removed from the resulting tokens.
        Only applies if ``analyzer == 'word'``.

        If None, no stop words will be used. In this case, setting `max_df`
        to a higher value, such as in the range (0.7, 1.0), can automatically
        detect and filter stop words based on intra-corpus document frequency
        of terms.

    token_pattern : str or None, default=r"(?u)\\b\\w\\w+\\b"
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

        If there is a capturing group in token_pattern then the
        captured group content, not the entire match, becomes the token.
        At most one capturing group is permitted.

    ngram_range : tuple (min_n, max_n), default=(1, 1)
        The lower and upper boundary of the range of n-values for different
        word n-grams or char n-grams to be extracted. All values of n such
        that min_n <= n <= max_n will be used. For example an
        ``ngram_range`` of ``(1, 1)`` means only unigrams, ``(1, 2)`` means
        unigrams and bigrams, and ``(2, 2)`` means only bigrams.
        Only applies if ``analyzer`` is not callable.

    analyzer : {'word', 'char', 'char_wb'} or callable, default='word'
        Whether the feature should be made of word n-gram or character
        n-grams.
        Option 'char_wb' creates character n-grams only from text inside
        word boundaries; n-grams at the edges of words are padded with space.

        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

        .. versionchanged:: 0.21
            Since v0.21, if ``input`` is ``filename`` or ``file``, the data is
            first read from the file and then passed to the given callable
            analyzer.

    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words).
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document
        frequency strictly lower than the given threshold. This value is also
        called cut-off in the literature.
        If float, the parameter represents a proportion of documents, integer
        absolute counts.
        This parameter is ignored if vocabulary is not None.

    max_features : int, default=None
        If not None, build a vocabulary that only considers the top
        `max_features` ordered by term frequency across the corpus.
        Otherwise, all features are used.

        This parameter is ignored if vocabulary is not None.

    vocabulary : Mapping or iterable, default=None
        Either a Mapping (e.g., a dict) where keys are terms and values are
        indices in the feature matrix, or an iterable over terms. If not
        given, a vocabulary is determined from the input documents. Indices
        in the mapping should not be repeated and should not have any gap
        between 0 and the largest index.

    binary : bool, default=False
        If True, all non zero counts are set to 1. This is useful for discrete
        probabilistic models that model binary events rather than integer
        counts.

    dtype : dtype, default=np.int64
        Type of the matrix returned by fit_transform() or transform().

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of terms to feature indices.

    fixed_vocabulary_ : bool
        True if a fixed vocabulary of term to indices mapping
        is provided by the user.

    stop_words_ : set
        Terms that were ignored because they either:

          - occurred in too many documents (`max_df`)
          - occurred in too few documents (`min_df`)
          - were cut off by feature selection (`max_features`).

        This is only available if no vocabulary was given.

    See Also
    --------
    HashingVectorizer : Convert a collection of text documents to a
        matrix of token counts.

    TfidfVectorizer : Convert a collection of raw documents to a matrix
        of TF-IDF features.

    Notes
    -----
    The ``stop_words_`` attribute can get large and increase the model size
    when pickling. This attribute is provided only for introspection and can
    be safely removed using delattr or set to None before pickling.

    Examples
    --------
    >>> from sklearn.feature_extraction.text import CountVectorizer
    >>> corpus = [
    ...     'This is the first document.',
    ...     'This document is the second document.',
    ...     'And this is the third one.',
    ...     'Is this the first document?',
    ... ]
    >>> vectorizer = CountVectorizer()
    >>> X = vectorizer.fit_transform(corpus)
    >>> vectorizer.get_feature_names_out()
    array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
           'this'], ...)
    >>> print(X.toarray())
    [[0 1 1 1 0 0 1 0 1]
     [0 2 0 1 0 1 1 0 1]
     [1 0 0 1 1 0 1 1 1]
     [0 1 1 1 0 0 1 0 1]]
    >>> vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
    >>> X2 = vectorizer2.fit_transform(corpus)
    >>> vectorizer2.get_feature_names_out()
    array(['and this', 'document is', 'first document', 'is the', 'is this',
           'second document', 'the first', 'the second', 'the third', 'third one',
           'this document', 'this is', 'this the'], ...)
    >>> print(X2.toarray())
    [[0 0 1 1 0 0 1 0 0 0 0 1 0]
     [0 1 0 1 0 1 0 1 0 0 1 0 0]
     [1 0 0 1 0 0 0 0 1 1 0 1 0]
     [0 0 1 0 1 0 1 0 0 0 0 0 1]]
    """

    _parameter_constraints: dict = {
        "input": [StrOptions({"filename", "file", "content"})],
        "encoding": [str],
        "decode_error": [StrOptions({"strict", "ignore", "replace"})],
        "strip_accents": [StrOptions({"ascii", "unicode"}), None, callable],
        "lowercase": ["boolean"],
        "preprocessor": [callable, None],
        "tokenizer": [callable, None],
        "stop_words": [StrOptions({"english"}), list, None],
        "token_pattern": [str, None],
        "ngram_range": [tuple],
        "analyzer": [StrOptions({"word", "char", "char_wb"}), callable],
        "max_df": [
            Interval(RealNotInt, 0, 1, closed="both"),
            Interval(Integral, 1, None, closed="left"),
        ],
        "min_df": [
            Interval(RealNotInt, 0, 1, closed="both"),
            Interval(Integral, 1, None, closed="left"),
        ],
        "max_features": [Interval(Integral, 1, None, closed="left"), None],
        "vocabulary": [Mapping, HasMethods("__iter__"), None],
        "binary": ["boolean"],
        "dtype": "no_validation",  # delegate to numpy
    }

    def __init__(
        self,
        *,
        input="content",
        encoding="utf-8",
        decode_error="strict",
        strip_accents=None,
        lowercase=True,
        preprocessor=None,
        tokenizer=None,
        stop_words=None,
        token_pattern=r"(?u)\b\w\w+\b",
        ngram_range=(1, 1),
        analyzer="word",
        max_df=1.0,
        min_df=1,
        max_features=None,
        vocabulary=None,
        binary=False,
        dtype=np.int64,
    ):
        self.input = input
        self.encoding = encoding
        self.decode_error = decode_error
        self.strip_accents = strip_accents
        self.preprocessor = preprocessor
        self.tokenizer = tokenizer
        self.analyzer = analyzer
        self.lowercase = lowercase
        self.token_pattern = token_pattern
        self.stop_words = stop_words
        self.max_df = max_df
        self.min_df = min_df
        self.max_features = max_features
        self.ngram_range = ngram_range
        self.vocabulary = vocabulary
        self.binary = binary
        self.dtype = dtype

    def _sort_features(self, X, vocabulary):
        """Sort features by name

        Returns a reordered matrix and modifies the vocabulary in place
        """
        sorted_features = sorted(vocabulary.items())
        map_index = np.empty(len(sorted_features), dtype=X.indices.dtype)
        for new_val, (term, old_val) in enumerate(sorted_features):
            vocabulary[term] = new_val
            map_index[old_val] = new_val

        X.indices = map_index.take(X.indices, mode="clip")
        return X

    def _limit_features(self, X, vocabulary, high=None, low=None, limit=None):
        """Remove too rare or too common features.

        Prune features that are non zero in more documents than high or in
        fewer documents than low, modifying the vocabulary, and restricting
        it to at most the limit most frequent.

        This does not prune samples with zero features.
        """
        if high is None and low is None and limit is None:
            return X, set()

        # Calculate a mask based on document frequencies
        dfs = _document_frequency(X)
        mask = np.ones(len(dfs), dtype=bool)
        if high is not None:
            mask &= dfs <= high
        if low is not None:
            mask &= dfs >= low
        if limit is not None and mask.sum() > limit:
            tfs = np.asarray(X.sum(axis=0)).ravel()
            mask_inds = (-tfs[mask]).argsort()[:limit]
            new_mask = np.zeros(len(dfs), dtype=bool)
            new_mask[np.where(mask)[0][mask_inds]] = True
            mask = new_mask

        new_indices = np.cumsum(mask) - 1  # maps old indices to new
        removed_terms = set()
        for term, old_index in list(vocabulary.items()):
            if mask[old_index]:
                vocabulary[term] = new_indices[old_index]
            else:
                del vocabulary[term]
                removed_terms.add(term)
        kept_indices = np.where(mask)[0]
        if len(kept_indices) == 0:
            raise ValueError(
                "After pruning, no terms remain. Try a lower min_df or a higher max_df."
            )
        return X[:, kept_indices], removed_terms
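
    # A minimal sketch (illustrative only) of how df-based pruning surfaces in
    # the public API: terms below min_df are moved to ``stop_words_``, e.g.
    #
    #   >>> cv = CountVectorizer(min_df=2).fit(["apple banana", "apple cherry"])
    #   >>> cv.vocabulary_
    #   {'apple': 0}
    #   >>> sorted(cv.stop_words_)
    #   ['banana', 'cherry']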

    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False"""
        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            # Add a new value when a new vocabulary item is seen
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        analyze = self.build_analyzer()
        j_indices = []
        indptr = []

        values = _make_int_array()
        indptr.append(0)
        for doc in raw_documents:
            feature_counter = {}
            for feature in analyze(doc):
                try:
                    feature_idx = vocabulary[feature]
                    if feature_idx not in feature_counter:
                        feature_counter[feature_idx] = 1
                    else:
                        feature_counter[feature_idx] += 1
                except KeyError:
                    # Ignore out-of-vocabulary items for fixed_vocab=True
                    continue

            j_indices.extend(feature_counter.keys())
            values.extend(feature_counter.values())
            indptr.append(len(j_indices))

        if not fixed_vocab:
            # disable defaultdict behaviour
            vocabulary = dict(vocabulary)
            if not vocabulary:
                raise ValueError(
                    "empty vocabulary; perhaps the documents only contain stop words"
                )

        if indptr[-1] > np.iinfo(np.int32).max:  # = 2**31 - 1
            if _IS_32BIT:
                raise ValueError(
                    (
                        "sparse CSR array has {} non-zero "
                        "elements and requires 64 bit indexing, "
                        "which is unsupported with 32 bit Python."
                    ).format(indptr[-1])
                )
            indices_dtype = np.int64
        else:
            indices_dtype = np.int32

        j_indices = np.asarray(j_indices, dtype=indices_dtype)
        indptr = np.asarray(indptr, dtype=indices_dtype)
        values = np.frombuffer(values, dtype=np.intc)

        X = sp.csr_matrix(
            (values, j_indices, indptr),
            shape=(len(indptr) - 1, len(vocabulary)),
            dtype=self.dtype,
        )
        X.sort_indices()
        return vocabulary, X
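
    # A minimal sketch (illustrative only) of the CSR triplet this method
    # builds: for documents ["aa bb aa", "bb cc"] and the learned vocabulary
    # {'aa': 0, 'bb': 1, 'cc': 2}, the per-document counters are flattened to
    #
    #   values    = [2, 1, 1, 1]   # counts, in insertion order per document
    #   j_indices = [0, 1, 1, 2]   # column (feature) index of each count
    #   indptr    = [0, 2, 4]      # document boundaries into values/j_indices
    #
    # which is exactly the (data, indices, indptr) form accepted by
    # scipy.sparse.csr_matrix above.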

    def fit(self, raw_documents, y=None):
        """Learn a vocabulary dictionary of all tokens in the raw documents.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which generates either str, unicode or file objects.

        y : None
            This parameter is ignored.

        Returns
        -------
        self : object
            Fitted vectorizer.
        """
        self.fit_transform(raw_documents)
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, raw_documents, y=None):
        """Learn the vocabulary dictionary and return document-term matrix.

        This is equivalent to fit followed by transform, but more efficiently
        implemented.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which generates either str, unicode or file objects.

        y : None
            This parameter is ignored.

        Returns
        -------
        X : array of shape (n_samples, n_features)
            Document-term matrix.
        """
        # We intentionally don't call the transform method to make
        # fit_transform overridable without unwanted side effects in
        # TfidfVectorizer.
        if isinstance(raw_documents, str):
            raise ValueError(
                "Iterable over raw text documents expected, string object received."
            )

        self._validate_ngram_range()
        self._warn_for_unused_params()
        self._validate_vocabulary()
        max_df = self.max_df
        min_df = self.min_df
        max_features = self.max_features

        if self.fixed_vocabulary_ and self.lowercase:
            for term in self.vocabulary:
                if any(map(str.isupper, term)):
                    warnings.warn(
                        "Upper case characters found in"
                        " vocabulary while 'lowercase'"
                        " is True. These entries will not"
                        " be matched with any documents"
                    )
                    break

        vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary_)

        if self.binary:
            X.data.fill(1)

        if not self.fixed_vocabulary_:
            n_doc = X.shape[0]
            max_doc_count = max_df if isinstance(max_df, Integral) else max_df * n_doc
            min_doc_count = min_df if isinstance(min_df, Integral) else min_df * n_doc
            if max_doc_count < min_doc_count:
                raise ValueError("max_df corresponds to fewer documents than min_df")

            if max_features is not None:
                X = self._sort_features(X, vocabulary)

            X, self.stop_words_ = self._limit_features(
                X, vocabulary, max_doc_count, min_doc_count, max_features
            )
            if max_features is None:
                X = self._sort_features(X, vocabulary)

            self.vocabulary_ = vocabulary

        return X

    def transform(self, raw_documents):
        """Transform documents to document-term matrix.

        Extract token counts out of raw text documents using the vocabulary
        fitted with fit or the one provided to the constructor.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which generates either str, unicode or file objects.

        Returns
        -------
        X : sparse matrix of shape (n_samples, n_features)
            Document-term matrix.
        """
        if isinstance(raw_documents, str):
            raise ValueError(
                "Iterable over raw text documents expected, string object received."
            )
        self._check_vocabulary()

        # use the same matrix-building strategy as fit_transform
        _, X = self._count_vocab(raw_documents, fixed_vocab=True)
        if self.binary:
            X.data.fill(1)
        return X

    def inverse_transform(self, X):
        """Return terms per document with nonzero entries in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Document-term matrix.

        Returns
        -------
        X_inv : list of arrays of shape (n_samples,)
            List of arrays of terms.
        """
        self._check_vocabulary()
        # We need CSR format for fast row manipulations.
        X = check_array(X, accept_sparse="csr")
        n_samples = X.shape[0]

        terms = np.array(list(self.vocabulary_.keys()))
        indices = np.array(list(self.vocabulary_.values()))
        inverse_vocabulary = terms[np.argsort(indices)]

        if sp.issparse(X):
            return [
                inverse_vocabulary[X[i, :].nonzero()[1]].ravel()
                for i in range(n_samples)
            ]
        else:
            return [
                inverse_vocabulary[np.flatnonzero(X[i, :])].ravel()
                for i in range(n_samples)
            ]

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Not used, present here for API consistency by convention.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        self._check_vocabulary()
        return np.asarray(
            [t for t, i in sorted(self.vocabulary_.items(), key=itemgetter(1))],
            dtype=object,
        )

    def _more_tags(self):
        return {"X_types": ["string"]}


def _make_int_array():
    """Construct an array.array of a type suitable for scipy.sparse indices."""
    return array.array(str("i"))
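

# --- Editorial sketch (assumption-laden, not used anywhere in this module) ---
# The docstring above only says the array type is "suitable for scipy.sparse
# indices"; the hypothetical helper below sketches the typical pattern: grow
# array.array("i") buffers cheaply in pure Python, then wrap them as NumPy
# arrays to build a CSR matrix. Names and the toy data are illustrative only.
def _make_int_array_sketch():
    import array

    import numpy as np
    import scipy.sparse as sp

    j_indices = array.array("i")    # column indices, appended term by term
    values = array.array("i")       # matching counts
    indptr = array.array("i", [0])  # CSR row pointer

    # Pretend we counted two documents over a three-term vocabulary.
    for doc in ([(0, 2), (2, 1)], [(1, 3)]):
        for col, count in doc:
            j_indices.append(col)
            values.append(count)
        indptr.append(len(j_indices))

    # array.array exposes the buffer protocol, so frombuffer creates ndarray
    # views over the underlying C buffers without copying.
    data = np.frombuffer(values, dtype=np.intc)
    indices = np.frombuffer(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    return sp.csr_matrix((data, indices, indptr), shape=(2, 3))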


class TfidfTransformer(
    OneToOneFeatureMixin, TransformerMixin, BaseEstimator, auto_wrap_output_keys=None
):
    """Transform a count matrix to a normalized tf or tf-idf representation.

    Tf means term-frequency while tf-idf means term-frequency times inverse
    document-frequency. This is a common term weighting scheme in information
    retrieval, that has also found good use in document classification.

    The goal of using tf-idf instead of the raw frequencies of occurrence of a
    token in a given document is to scale down the impact of tokens that occur
    very frequently in a given corpus and that are hence empirically less
    informative than features that occur in a small fraction of the training
    corpus.

    The formula that is used to compute the tf-idf for a term t of a document d
    in a document set is tf-idf(t, d) = tf(t, d) * idf(t), and the idf is
    computed as idf(t) = log [ n / df(t) ] + 1 (if ``smooth_idf=False``), where
    n is the total number of documents in the document set and df(t) is the
    document frequency of t; the document frequency is the number of documents
    in the document set that contain the term t. The effect of adding "1" to
    the idf in the equation above is that terms with zero idf, i.e., terms
    that occur in all documents in a training set, will not be entirely
    ignored.
    (Note that the idf formula above differs from the standard textbook
    notation that defines the idf as
    idf(t) = log [ n / (df(t) + 1) ]).

    If ``smooth_idf=True`` (the default), the constant "1" is added to the
    numerator and denominator of the idf as if an extra document was seen
    containing every term in the collection exactly once, which prevents
    zero divisions: idf(t) = log [ (1 + n) / (1 + df(t)) ] + 1.

    Furthermore, the formulas used to compute tf and idf depend
    on parameter settings that correspond to the SMART notation used in IR
    as follows:

    Tf is "n" (natural) by default, "l" (logarithmic) when
    ``sublinear_tf=True``.
    Idf is "t" when use_idf is given, "n" (none) otherwise.
    Normalization is "c" (cosine) when ``norm='l2'``, "n" (none)
    when ``norm=None``.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    norm : {'l1', 'l2'} or None, default='l2'
        Each output row will have unit norm, either:

        - 'l2': Sum of squares of vector elements is 1. The cosine
          similarity between two vectors is their dot product when l2 norm has
          been applied.
        - 'l1': Sum of absolute values of vector elements is 1.
          See :func:`~sklearn.preprocessing.normalize`.
        - None: No normalization.

    use_idf : bool, default=True
        Enable inverse-document-frequency reweighting. If False, idf(t) = 1.

    smooth_idf : bool, default=True
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : bool, default=False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    Attributes
    ----------
    idf_ : array of shape (n_features,)
        The inverse document frequency (IDF) vector; only defined
        if ``use_idf`` is True.

        .. versionadded:: 0.20

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 1.0

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    CountVectorizer : Transforms text into a sparse matrix of n-gram counts.

    TfidfVectorizer : Convert a collection of raw documents to a matrix of
        TF-IDF features.

    HashingVectorizer : Convert a collection of text documents to a matrix
        of token occurrences.

    References
    ----------
    .. [Yates2011] R. Baeza-Yates and B. Ribeiro-Neto (2011). Modern
       Information Retrieval. Addison Wesley, pp. 68-74.

    .. [MRS2008] C.D. Manning, P. Raghavan and H. Schütze (2008).
       Introduction to Information Retrieval. Cambridge University
       Press, pp. 118-120.

    Examples
    --------
    >>> from sklearn.feature_extraction.text import TfidfTransformer
    >>> from sklearn.feature_extraction.text import CountVectorizer
    >>> from sklearn.pipeline import Pipeline
    >>> corpus = ['this is the first document',
    ...           'this document is the second document',
    ...           'and this is the third one',
    ...           'is this the first document']
    >>> vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',
    ...               'and', 'one']
    >>> pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
    ...                  ('tfid', TfidfTransformer())]).fit(corpus)
    >>> pipe['count'].transform(corpus).toarray()
    array([[1, 1, 1, 1, 0, 1, 0, 0],
           [1, 2, 0, 1, 1, 1, 0, 0],
           [1, 0, 0, 1, 0, 1, 1, 1],
           [1, 1, 1, 1, 0, 1, 0, 0]])
    >>> pipe['tfid'].idf_
    array([1.        , 1.22314355, 1.51082562, 1.        , 1.91629073,
           1.        , 1.91629073, 1.91629073])
    >>> pipe.transform(corpus).shape
    (4, 8)
    """

    _parameter_constraints: dict = {
        "norm": [StrOptions({"l1", "l2"}), None],
        "use_idf": ["boolean"],
        "smooth_idf": ["boolean"],
        "sublinear_tf": ["boolean"],
    }

    def __init__(self, *, norm="l2", use_idf=True, smooth_idf=True, sublinear_tf=False):
        self.norm = norm
        self.use_idf = use_idf
        self.smooth_idf = smooth_idf
        self.sublinear_tf = sublinear_tf

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Learn the idf vector (global term weights).

        Parameters
        ----------
        X : sparse matrix of shape (n_samples, n_features)
            A matrix of term/token counts.

        y : None
            This parameter is not needed to compute tf-idf.

        Returns
        -------
        self : object
            Fitted transformer.
        """
        # large sparse data is not supported for 32bit platforms because
        # _document_frequency uses np.bincount which works on arrays of
        # dtype NPY_INTP which is int32 for 32bit platforms. See #20923
        X = self._validate_data(
            X, accept_sparse=("csr", "csc"), accept_large_sparse=not _IS_32BIT
        )
        if not sp.issparse(X):
            X = sp.csr_matrix(X)
        dtype = X.dtype if X.dtype in FLOAT_DTYPES else np.float64

        if self.use_idf:
            n_samples, n_features = X.shape
            df = _document_frequency(X)
            df = df.astype(dtype, copy=False)

            # perform idf smoothing if required
            df += int(self.smooth_idf)
            n_samples += int(self.smooth_idf)

            # log+1 instead of log makes sure terms with zero idf don't get
            # suppressed entirely.
            idf = np.log(n_samples / df) + 1

            self._idf_diag = sp.diags(
                idf,
                offsets=0,
                shape=(n_features, n_features),
                format="csr",
                dtype=dtype,
            )

        return self

    def transform(self, X, copy=True):
        """Transform a count matrix to a tf or tf-idf representation.

        Parameters
        ----------
        X : sparse matrix of shape (n_samples, n_features)
            A matrix of term/token counts.

        copy : bool, default=True
            Whether to copy X and operate on the copy or perform in-place
            operations.

        Returns
        -------
        vectors : sparse matrix of shape (n_samples, n_features)
            Tf-idf-weighted document-term matrix.
        """
        X = self._validate_data(
            X, accept_sparse="csr", dtype=FLOAT_DTYPES, copy=copy, reset=False
        )
        if not sp.issparse(X):
            X = sp.csr_matrix(X, dtype=np.float64)

        if self.sublinear_tf:
            np.log(X.data, X.data)
            X.data += 1

        if self.use_idf:
            # idf_ being a property, the automatic attributes detection
            # does not work as usual and we need to specify the attribute
            # name:
            check_is_fitted(self, attributes=["idf_"], msg="idf vector is not fitted")

            # *= doesn't work
            X = X * self._idf_diag

        if self.norm is not None:
            X = normalize(X, norm=self.norm, copy=False)

        return X
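
    # Editorial note: transform applies the steps above in the order implied by
    # the SMART notation in the class docstring -- optional sublinear tf
    # (1 + log(tf)), then optional idf scaling through the precomputed diagonal
    # matrix, then optional l1/l2 row normalization.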

    @property
    def idf_(self):
        """Inverse document frequency vector, only defined if `use_idf=True`.

        Returns
        -------
        ndarray of shape (n_features,)
        """
        # if _idf_diag is not set, this will raise an attribute error,
        # which means hasattr(self, "idf_") is False
        return np.ravel(self._idf_diag.sum(axis=0))

    @idf_.setter
    def idf_(self, value):
        value = np.asarray(value, dtype=np.float64)
        n_features = value.shape[0]
        self._idf_diag = sp.spdiags(
            value, diags=0, m=n_features, n=n_features, format="csr"
        )

    def _more_tags(self):
        return {"X_types": ["2darray", "sparse"]}
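

# --- Editorial sketch (not part of the estimator API) -------------------------
# A minimal check of the idf formulas quoted in the TfidfTransformer docstring:
# idf(t) = ln(n / df(t)) + 1 when smooth_idf=False and
# idf(t) = ln((1 + n) / (1 + df(t))) + 1 when smooth_idf=True. The helper and
# its toy counts are hypothetical and are never called by the module.
def _idf_formula_sketch():
    import numpy as np
    import scipy.sparse as sp

    # Three documents over a two-term vocabulary: term 0 occurs in every
    # document, term 1 in a single one.
    counts = sp.csr_matrix(np.array([[1, 0], [2, 0], [1, 3]]))
    n = counts.shape[0]
    df = np.bincount(counts.indices, minlength=counts.shape[1])  # document frequency

    manual_plain = np.log(n / df) + 1
    manual_smooth = np.log((1 + n) / (1 + df)) + 1

    fitted_plain = TfidfTransformer(smooth_idf=False).fit(counts).idf_
    fitted_smooth = TfidfTransformer(smooth_idf=True).fit(counts).idf_

    assert np.allclose(manual_plain, fitted_plain)
    assert np.allclose(manual_smooth, fitted_smooth)
    return manual_plain, manual_smooth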


class TfidfVectorizer(CountVectorizer):
    r"""Convert a collection of raw documents to a matrix of TF-IDF features.

    Equivalent to :class:`CountVectorizer` followed by
    :class:`TfidfTransformer`.

    For an example of usage, see
    :ref:`sphx_glr_auto_examples_text_plot_document_classification_20newsgroups.py`.

    For an efficiency comparison of the different feature extractors, see
    :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    input : {'filename', 'file', 'content'}, default='content'
        - If `'filename'`, the sequence passed as an argument to fit is
          expected to be a list of filenames that need reading to fetch
          the raw content to analyze.

        - If `'file'`, the sequence items must have a 'read' method (file-like
          object) that is called to fetch the bytes in memory.

        - If `'content'`, the input is expected to be a sequence of items that
          can be of type string or byte.

    encoding : str, default='utf-8'
        If bytes or files are given to analyze, this encoding is used to
        decode.

    decode_error : {'strict', 'ignore', 'replace'}, default='strict'
        Instruction on what to do if a byte sequence is given to analyze that
        contains characters not of the given `encoding`. By default, it is
        'strict', meaning that a UnicodeDecodeError will be raised. Other
        values are 'ignore' and 'replace'.

    strip_accents : {'ascii', 'unicode'} or callable, default=None
        Remove accents and perform other character normalization
        during the preprocessing step.
        'ascii' is a fast method that only works on characters that have
        a direct ASCII mapping.
        'unicode' is a slightly slower method that works on any characters.
        None (default) means no character normalization is performed.

        Both 'ascii' and 'unicode' use NFKD normalization from
        :func:`unicodedata.normalize`.

    lowercase : bool, default=True
        Convert all characters to lowercase before tokenizing.

    preprocessor : callable, default=None
        Override the preprocessing (string transformation) stage while
        preserving the tokenizing and n-grams generation steps.
        Only applies if ``analyzer`` is not callable.

    tokenizer : callable, default=None
        Override the string tokenization step while preserving the
        preprocessing and n-grams generation steps.
        Only applies if ``analyzer == 'word'``.

    analyzer : {'word', 'char', 'char_wb'} or callable, default='word'
        Whether the feature should be made of word or character n-grams.
        Option 'char_wb' creates character n-grams only from text inside
        word boundaries; n-grams at the edges of words are padded with space.

        If a callable is passed it is used to extract the sequence of features
        out of the raw, unprocessed input.

        .. versionchanged:: 0.21
            Since v0.21, if ``input`` is ``'filename'`` or ``'file'``, the data
            is first read from the file and then passed to the given callable
            analyzer.

    stop_words : {'english'}, list, default=None
        If a string, it is passed to _check_stop_list and the appropriate stop
        list is returned. 'english' is currently the only supported string
        value.
        There are several known issues with 'english' and you should
        consider an alternative (see :ref:`stop_words`).

        If a list, that list is assumed to contain stop words, all of which
        will be removed from the resulting tokens.
        Only applies if ``analyzer == 'word'``.

        If None, no stop words will be used. In this case, setting `max_df`
        to a higher value, such as in the range (0.7, 1.0), can automatically detect
        and filter stop words based on intra corpus document frequency of terms.

    token_pattern : str, default=r"(?u)\\b\\w\\w+\\b"
        Regular expression denoting what constitutes a "token", only used
        if ``analyzer == 'word'``. The default regexp selects tokens of 2
        or more alphanumeric characters (punctuation is completely ignored
        and always treated as a token separator).

        If there is a capturing group in token_pattern then the
        captured group content, not the entire match, becomes the token.
        At most one capturing group is permitted.

    ngram_range : tuple (min_n, max_n), default=(1, 1)
        The lower and upper boundary of the range of n-values for different
        n-grams to be extracted. All values of n such that min_n <= n <= max_n
        will be used. For example an ``ngram_range`` of ``(1, 1)`` means only
        unigrams, ``(1, 2)`` means unigrams and bigrams, and ``(2, 2)`` means
        only bigrams.
        Only applies if ``analyzer`` is not callable.

    max_df : float or int, default=1.0
        When building the vocabulary ignore terms that have a document
        frequency strictly higher than the given threshold (corpus-specific
        stop words).
        If float in range [0.0, 1.0], the parameter represents a proportion of
        documents; if integer, absolute counts.
        This parameter is ignored if vocabulary is not None.

    min_df : float or int, default=1
        When building the vocabulary ignore terms that have a document
        frequency strictly lower than the given threshold. This value is also
        called cut-off in the literature.
        If float in range of [0.0, 1.0], the parameter represents a proportion
        of documents; if integer, absolute counts.
        This parameter is ignored if vocabulary is not None.

    max_features : int, default=None
        If not None, build a vocabulary that only considers the top
        `max_features` ordered by term frequency across the corpus.
        Otherwise, all features are used.

        This parameter is ignored if vocabulary is not None.

    vocabulary : Mapping or iterable, default=None
        Either a Mapping (e.g., a dict) where keys are terms and values are
        indices in the feature matrix, or an iterable over terms. If not
        given, a vocabulary is determined from the input documents.

    binary : bool, default=False
        If True, all non-zero term counts are set to 1. This does not mean
        outputs will have only 0/1 values, only that the tf term in tf-idf
        is binary. (Set `binary` to True, `use_idf` to False and
        `norm` to None to get 0/1 outputs).

    dtype : dtype, default=float64
        Type of the matrix returned by fit_transform() or transform().

    norm : {'l1', 'l2'} or None, default='l2'
        Each output row will have unit norm, either:

        - 'l2': Sum of squares of vector elements is 1. The cosine
          similarity between two vectors is their dot product when l2 norm has
          been applied.
        - 'l1': Sum of absolute values of vector elements is 1.
          See :func:`~sklearn.preprocessing.normalize`.
        - None: No normalization.

    use_idf : bool, default=True
        Enable inverse-document-frequency reweighting. If False, idf(t) = 1.

    smooth_idf : bool, default=True
        Smooth idf weights by adding one to document frequencies, as if an
        extra document was seen containing every term in the collection
        exactly once. Prevents zero divisions.

    sublinear_tf : bool, default=False
        Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf).

    Attributes
    ----------
    vocabulary_ : dict
        A mapping of terms to feature indices.

    fixed_vocabulary_ : bool
        True if a fixed vocabulary of term to indices mapping
        is provided by the user.

    idf_ : array of shape (n_features,)
        The inverse document frequency (IDF) vector; only defined
        if ``use_idf`` is True.

    stop_words_ : set
        Terms that were ignored because they either:

          - occurred in too many documents (`max_df`)
          - occurred in too few documents (`min_df`)
          - were cut off by feature selection (`max_features`).

        This is only available if no vocabulary was given.

    See Also
    --------
    CountVectorizer : Transforms text into a sparse matrix of n-gram counts.

    TfidfTransformer : Performs the TF-IDF transformation from a provided
        matrix of counts.

    Notes
    -----
    The ``stop_words_`` attribute can get large and increase the model size
    when pickling. This attribute is provided only for introspection and can
    be safely removed using delattr or set to None before pickling.

    Examples
    --------
    >>> from sklearn.feature_extraction.text import TfidfVectorizer
    >>> corpus = [
    ...     'This is the first document.',
    ...     'This document is the second document.',
    ...     'And this is the third one.',
    ...     'Is this the first document?',
    ... ]
    >>> vectorizer = TfidfVectorizer()
    >>> X = vectorizer.fit_transform(corpus)
    >>> vectorizer.get_feature_names_out()
    array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
           'this'], ...)
    >>> print(X.shape)
    (4, 9)
    """

    _parameter_constraints: dict = {**CountVectorizer._parameter_constraints}
    _parameter_constraints.update(
        {
            "norm": [StrOptions({"l1", "l2"}), None],
            "use_idf": ["boolean"],
            "smooth_idf": ["boolean"],
            "sublinear_tf": ["boolean"],
        }
    )

    def __init__(
        self,
        *,
        input="content",
        encoding="utf-8",
        decode_error="strict",
        strip_accents=None,
        lowercase=True,
        preprocessor=None,
        tokenizer=None,
        analyzer="word",
        stop_words=None,
        token_pattern=r"(?u)\b\w\w+\b",
        ngram_range=(1, 1),
        max_df=1.0,
        min_df=1,
        max_features=None,
        vocabulary=None,
        binary=False,
        dtype=np.float64,
        norm="l2",
        use_idf=True,
        smooth_idf=True,
        sublinear_tf=False,
    ):
        super().__init__(
            input=input,
            encoding=encoding,
            decode_error=decode_error,
            strip_accents=strip_accents,
            lowercase=lowercase,
            preprocessor=preprocessor,
            tokenizer=tokenizer,
            analyzer=analyzer,
            stop_words=stop_words,
            token_pattern=token_pattern,
            ngram_range=ngram_range,
            max_df=max_df,
            min_df=min_df,
            max_features=max_features,
            vocabulary=vocabulary,
            binary=binary,
            dtype=dtype,
        )
        self.norm = norm
        self.use_idf = use_idf
        self.smooth_idf = smooth_idf
        self.sublinear_tf = sublinear_tf

    # Broadcast the TF-IDF parameters to the underlying transformer instance
    # for easy grid search and repr

    @property
    def idf_(self):
        """Inverse document frequency vector, only defined if `use_idf=True`.

        Returns
        -------
        ndarray of shape (n_features,)
        """
        if not hasattr(self, "_tfidf"):
            raise NotFittedError(
                f"{self.__class__.__name__} is not fitted yet. Call 'fit' with "
                "appropriate arguments before using this attribute."
            )
        return self._tfidf.idf_

    @idf_.setter
    def idf_(self, value):
        if not self.use_idf:
            raise ValueError("`idf_` cannot be set when `use_idf=False`.")
        if not hasattr(self, "_tfidf"):
            # We should support transferring `idf_` from another `TfidfTransformer`
            # and therefore, we need to create the transformer instance if it does
            # not exist yet.
            self._tfidf = TfidfTransformer(
                norm=self.norm,
                use_idf=self.use_idf,
                smooth_idf=self.smooth_idf,
                sublinear_tf=self.sublinear_tf,
            )
        self._validate_vocabulary()
        if hasattr(self, "vocabulary_"):
            if len(self.vocabulary_) != len(value):
                raise ValueError(
                    "idf length = %d must be equal to vocabulary size = %d"
                    % (len(value), len(self.vocabulary_))
                )
        self._tfidf.idf_ = value
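
    # Editorial note (hypothetical usage): the setter above allows transferring
    # a precomputed idf vector, e.g. ``new_vectorizer.idf_ = fitted_vectorizer.idf_``,
    # as long as any fixed vocabulary has the same length as the idf vector.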

    def _check_params(self):
        if self.dtype not in FLOAT_DTYPES:
            warnings.warn(
                "Only {} 'dtype' should be used. {} 'dtype' will "
                "be converted to np.float64.".format(FLOAT_DTYPES, self.dtype),
                UserWarning,
            )

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, raw_documents, y=None):
        """Learn vocabulary and idf from training set.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which generates either str, unicode or file objects.

        y : None
            This parameter is not needed to compute tfidf.

        Returns
        -------
        self : object
            Fitted vectorizer.
        """
        self._check_params()
        self._warn_for_unused_params()
        self._tfidf = TfidfTransformer(
            norm=self.norm,
            use_idf=self.use_idf,
            smooth_idf=self.smooth_idf,
            sublinear_tf=self.sublinear_tf,
        )
        X = super().fit_transform(raw_documents)
        self._tfidf.fit(X)
        return self

    def fit_transform(self, raw_documents, y=None):
        """Learn vocabulary and idf, return document-term matrix.

        This is equivalent to fit followed by transform, but more efficiently
        implemented.

        Parameters
        ----------
        raw_documents : iterable
            An iterable which generates either str, unicode or file objects.

        y : None
            This parameter is ignored.

        Returns
        -------
        X : sparse matrix of shape (n_samples, n_features)
            Tf-idf-weighted document-term matrix.
        """
        self._check_params()
        self._tfidf = TfidfTransformer(
            norm=self.norm,
            use_idf=self.use_idf,
            smooth_idf=self.smooth_idf,
            sublinear_tf=self.sublinear_tf,
        )
        X = super().fit_transform(raw_documents)
        self._tfidf.fit(X)
        # X is already a transformed view of raw_documents so
        # we set copy to False
        return self._tfidf.transform(X, copy=False)

    def transform(self, raw_documents):
        """Transform documents to document-term matrix.

        Uses the vocabulary and document frequencies (df) learned by fit (or
        fit_transform).

        Parameters
        ----------
        raw_documents : iterable
            An iterable which generates either str, unicode or file objects.

        Returns
        -------
        X : sparse matrix of shape (n_samples, n_features)
            Tf-idf-weighted document-term matrix.
        """
        check_is_fitted(self, msg="The TF-IDF vectorizer is not fitted")

        X = super().transform(raw_documents)
        return self._tfidf.transform(X, copy=False)

    def _more_tags(self):
        return {"X_types": ["string"], "_skip_test": True}
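

# --- Editorial sketch (not part of the estimator API) -------------------------
# The TfidfVectorizer docstring states that the class is equivalent to
# CountVectorizer followed by TfidfTransformer. The hypothetical helper below
# spells that equivalence out on a toy corpus; it is illustrative only and is
# never called by the module.
def _tfidf_vectorizer_equivalence_sketch():
    import numpy as np

    corpus = [
        "this is the first document",
        "this document is the second document",
        "and this is the third one",
        "is this the first document",
    ]

    # One-step path.
    direct = TfidfVectorizer().fit_transform(corpus)

    # Two-step path with the same (default) parameters.
    counts = CountVectorizer().fit_transform(corpus)
    two_step = TfidfTransformer().fit_transform(counts)

    assert np.allclose(direct.toarray(), two_step.toarray())
    return direct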