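"""Tests for text feature extraction vectorizers.

Covers CountVectorizer, HashingVectorizer, TfidfTransformer and TfidfVectorizer
from sklearn.feature_extraction.text.
"""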
import pickle
import re
import warnings
from collections import defaultdict
from collections.abc import Mapping
from functools import partial
from io import StringIO

import numpy as np
import pytest
from numpy.testing import assert_array_almost_equal, assert_array_equal
from scipy import sparse

from sklearn.base import clone
from sklearn.feature_extraction.text import (
    ENGLISH_STOP_WORDS,
    CountVectorizer,
    HashingVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
    strip_accents_ascii,
    strip_accents_unicode,
    strip_tags,
)
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.utils import IS_PYPY
from sklearn.utils._testing import (
    assert_allclose_dense_sparse,
    assert_almost_equal,
    fails_if_pypy,
    skip_if_32bit,
)
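

# Toy corpora used throughout the tests: "junk food" documents and
# "not junk food" documents that share a few common terms.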
JUNK_FOOD_DOCS = (
    "the pizza pizza beer copyright",
    "the pizza burger beer copyright",
    "the the pizza beer beer copyright",
    "the burger beer beer copyright",
    "the coke burger coke copyright",
    "the coke burger burger",
)

NOTJUNK_FOOD_DOCS = (
    "the salad celeri copyright",
    "the salad salad sparkling water copyright",
    "the the celeri celeri copyright",
    "the tomato tomato salad water",
    "the tomato salad water copyright",
)

ALL_FOOD_DOCS = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS
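

# Small helper callables used as custom preprocessor, tokenizer or analyzer in
# the tests below.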
def uppercase(s):
    return strip_accents_unicode(s).upper()


def strip_eacute(s):
    return s.replace("é", "e")


def split_tokenize(s):
    return s.split()


def lazy_analyze(s):
    return ["the_ultimate_feature"]


def test_strip_accents():
    # check some classical latin accentuated symbols
    a = "àáâãäåçèéêë"
    expected = "aaaaaaceeee"
    assert strip_accents_unicode(a) == expected

    a = "ìíîïñòóôõöùúûüý"
    expected = "iiiinooooouuuuy"
    assert strip_accents_unicode(a) == expected

    # check some arabic
    a = "\u0625"  # alef with a hamza below: إ
    expected = "\u0627"  # simple alef: ا
    assert strip_accents_unicode(a) == expected

    # mix letters accentuated and not
    a = "this is à test"
    expected = "this is a test"
    assert strip_accents_unicode(a) == expected

    # strings that are already decomposed
    a = "o\u0308"  # o with diaeresis
    expected = "o"
    assert strip_accents_unicode(a) == expected

    # combining marks by themselves
    a = "\u0300\u0301\u0302\u0303"
    expected = ""
    assert strip_accents_unicode(a) == expected

    # Multiple combining marks on one character
    a = "o\u0308\u0304"
    expected = "o"
    assert strip_accents_unicode(a) == expected


def test_to_ascii():
    # check some classical latin accentuated symbols
    a = "àáâãäåçèéêë"
    expected = "aaaaaaceeee"
    assert strip_accents_ascii(a) == expected

    a = "ìíîïñòóôõöùúûüý"
    expected = "iiiinooooouuuuy"
    assert strip_accents_ascii(a) == expected

    # check some arabic
    a = "\u0625"  # alef with a hamza below
    expected = ""  # alef has no direct ascii match
    assert strip_accents_ascii(a) == expected
    # mix letters accentuated and not
    a = "this is à test"
    expected = "this is a test"
    assert strip_accents_ascii(a) == expected


@pytest.mark.parametrize("Vectorizer", (CountVectorizer, HashingVectorizer))
def test_word_analyzer_unigrams(Vectorizer):
    wa = Vectorizer(strip_accents="ascii").build_analyzer()
    text = "J'ai mangé du kangourou ce midi, c'était pas très bon."
    expected = [
        "ai",
        "mange",
        "du",
        "kangourou",
        "ce",
        "midi",
        "etait",
        "pas",
        "tres",
        "bon",
    ]
    assert wa(text) == expected

    text = "This is a test, really.\n\n I met Harry yesterday."
    expected = ["this", "is", "test", "really", "met", "harry", "yesterday"]
    assert wa(text) == expected

    wa = Vectorizer(input="file").build_analyzer()
    text = StringIO("This is a test with a file-like object!")
    expected = ["this", "is", "test", "with", "file", "like", "object"]
    assert wa(text) == expected

    # with custom preprocessor
    wa = Vectorizer(preprocessor=uppercase).build_analyzer()
    text = "J'ai mangé du kangourou ce midi, c'était pas très bon."
    expected = [
        "AI",
        "MANGE",
        "DU",
        "KANGOUROU",
        "CE",
        "MIDI",
        "ETAIT",
        "PAS",
        "TRES",
        "BON",
    ]
    assert wa(text) == expected

    # with custom tokenizer
    wa = Vectorizer(tokenizer=split_tokenize, strip_accents="ascii").build_analyzer()
    text = "J'ai mangé du kangourou ce midi, c'était pas très bon."
    expected = [
        "j'ai",
        "mange",
        "du",
        "kangourou",
        "ce",
        "midi,",
        "c'etait",
        "pas",
        "tres",
        "bon.",
    ]
    assert wa(text) == expected


def test_word_analyzer_unigrams_and_bigrams():
    wa = CountVectorizer(
        analyzer="word", strip_accents="unicode", ngram_range=(1, 2)
    ).build_analyzer()

    text = "J'ai mangé du kangourou ce midi, c'était pas très bon."
    expected = [
        "ai",
        "mange",
        "du",
        "kangourou",
        "ce",
        "midi",
        "etait",
        "pas",
        "tres",
        "bon",
        "ai mange",
        "mange du",
        "du kangourou",
        "kangourou ce",
        "ce midi",
        "midi etait",
        "etait pas",
        "pas tres",
        "tres bon",
    ]
    assert wa(text) == expected


def test_unicode_decode_error():
    # decode_error defaults to strict, so this should fail
    # First, encode (as bytes) a unicode string.
    text = "J'ai mangé du kangourou ce midi, c'était pas très bon."
    text_bytes = text.encode("utf-8")

    # Then let the Analyzer try to decode it as ascii. It should fail,
    # because we have given it an incorrect encoding.
    wa = CountVectorizer(ngram_range=(1, 2), encoding="ascii").build_analyzer()
    with pytest.raises(UnicodeDecodeError):
        wa(text_bytes)

    ca = CountVectorizer(
        analyzer="char", ngram_range=(3, 6), encoding="ascii"
    ).build_analyzer()
    with pytest.raises(UnicodeDecodeError):
        ca(text_bytes)


def test_char_ngram_analyzer():
    cnga = CountVectorizer(
        analyzer="char", strip_accents="unicode", ngram_range=(3, 6)
    ).build_analyzer()

    text = "J'ai mangé du kangourou ce midi, c'était pas très bon"
    expected = ["j'a", "'ai", "ai ", "i m", " ma"]
    assert cnga(text)[:5] == expected
    expected = ["s tres", " tres ", "tres b", "res bo", "es bon"]
    assert cnga(text)[-5:] == expected

    text = "This \n\tis a test, really.\n\n I met Harry yesterday"
    expected = ["thi", "his", "is ", "s i", " is"]
    assert cnga(text)[:5] == expected
    expected = [" yeste", "yester", "esterd", "sterda", "terday"]
    assert cnga(text)[-5:] == expected

    cnga = CountVectorizer(
        input="file", analyzer="char", ngram_range=(3, 6)
    ).build_analyzer()
    text = StringIO("This is a test with a file-like object!")
    expected = ["thi", "his", "is ", "s i", " is"]
    assert cnga(text)[:5] == expected


def test_char_wb_ngram_analyzer():
    cnga = CountVectorizer(
        analyzer="char_wb", strip_accents="unicode", ngram_range=(3, 6)
    ).build_analyzer()

    text = "This \n\tis a test, really.\n\n I met Harry yesterday"
    expected = [" th", "thi", "his", "is ", " thi"]
    assert cnga(text)[:5] == expected
    expected = ["yester", "esterd", "sterda", "terday", "erday "]
    assert cnga(text)[-5:] == expected

    cnga = CountVectorizer(
        input="file", analyzer="char_wb", ngram_range=(3, 6)
    ).build_analyzer()
    text = StringIO("A test with a file-like object!")
    expected = [" a ", " te", "tes", "est", "st ", " tes"]
    assert cnga(text)[:6] == expected


def test_word_ngram_analyzer():
    cnga = CountVectorizer(
        analyzer="word", strip_accents="unicode", ngram_range=(3, 6)
    ).build_analyzer()

    text = "This \n\tis a test, really.\n\n I met Harry yesterday"
    expected = ["this is test", "is test really", "test really met"]
    assert cnga(text)[:3] == expected

    expected = [
        "test really met harry yesterday",
        "this is test really met harry",
        "is test really met harry yesterday",
    ]
    assert cnga(text)[-3:] == expected

    cnga_file = CountVectorizer(
        input="file", analyzer="word", ngram_range=(3, 6)
    ).build_analyzer()
    file = StringIO(text)
    assert cnga_file(file) == cnga(text)


def test_countvectorizer_custom_vocabulary():
    vocab = {"pizza": 0, "beer": 1}
    terms = set(vocab.keys())

    # Try a few of the supported types.
    for typ in [dict, list, iter, partial(defaultdict, int)]:
        v = typ(vocab)
        vect = CountVectorizer(vocabulary=v)
        vect.fit(JUNK_FOOD_DOCS)
        if isinstance(v, Mapping):
            assert vect.vocabulary_ == vocab
        else:
            assert set(vect.vocabulary_) == terms
        X = vect.transform(JUNK_FOOD_DOCS)
        assert X.shape[1] == len(terms)
        v = typ(vocab)
        vect = CountVectorizer(vocabulary=v)
        inv = vect.inverse_transform(X)
        assert len(inv) == X.shape[0]


def test_countvectorizer_custom_vocabulary_pipeline():
    what_we_like = ["pizza", "beer"]
    pipe = Pipeline(
        [
            ("count", CountVectorizer(vocabulary=what_we_like)),
            ("tfidf", TfidfTransformer()),
        ]
    )
    X = pipe.fit_transform(ALL_FOOD_DOCS)
    assert set(pipe.named_steps["count"].vocabulary_) == set(what_we_like)
    assert X.shape[1] == len(what_we_like)


def test_countvectorizer_custom_vocabulary_repeated_indices():
    vocab = {"pizza": 0, "beer": 0}
    msg = "Vocabulary contains repeated indices"
    with pytest.raises(ValueError, match=msg):
        vect = CountVectorizer(vocabulary=vocab)
        vect.fit(["pasta_siziliana"])


def test_countvectorizer_custom_vocabulary_gap_index():
    vocab = {"pizza": 1, "beer": 2}
    with pytest.raises(ValueError, match="doesn't contain index"):
        vect = CountVectorizer(vocabulary=vocab)
        vect.fit(["pasta_verdura"])


def test_countvectorizer_stop_words():
    cv = CountVectorizer()
    cv.set_params(stop_words="english")
    assert cv.get_stop_words() == ENGLISH_STOP_WORDS
    cv.set_params(stop_words="_bad_str_stop_")
    with pytest.raises(ValueError):
        cv.get_stop_words()
    cv.set_params(stop_words="_bad_unicode_stop_")
    with pytest.raises(ValueError):
        cv.get_stop_words()
    stoplist = ["some", "other", "words"]
    cv.set_params(stop_words=stoplist)
    assert cv.get_stop_words() == set(stoplist)


def test_countvectorizer_empty_vocabulary():
    with pytest.raises(ValueError, match="empty vocabulary"):
        vect = CountVectorizer(vocabulary=[])
        vect.fit(["foo"])

    with pytest.raises(ValueError, match="empty vocabulary"):
        v = CountVectorizer(max_df=1.0, stop_words="english")
        # fit on stopwords only
        v.fit(["to be or not to be", "and me too", "and so do you"])


def test_fit_countvectorizer_twice():
    cv = CountVectorizer()
    X1 = cv.fit_transform(ALL_FOOD_DOCS[:5])
    X2 = cv.fit_transform(ALL_FOOD_DOCS[5:])
    assert X1.shape[1] != X2.shape[1]


def test_countvectorizer_custom_token_pattern():
    """Check `get_feature_names_out()` when a custom token pattern is passed.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    """
    corpus = [
        "This is the 1st document in my corpus.",
        "This document is the 2nd sample.",
        "And this is the 3rd one.",
        "Is this the 4th document?",
    ]
    token_pattern = r"[0-9]{1,3}(?:st|nd|rd|th)\s\b(\w{2,})\b"
    vectorizer = CountVectorizer(token_pattern=token_pattern)
    vectorizer.fit_transform(corpus)
    expected = ["document", "one", "sample"]
    feature_names_out = vectorizer.get_feature_names_out()
    assert_array_equal(feature_names_out, expected)


def test_countvectorizer_custom_token_pattern_with_several_group():
    """Check that we raise an error if the token pattern captures several groups.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    """
    corpus = [
        "This is the 1st document in my corpus.",
        "This document is the 2nd sample.",
        "And this is the 3rd one.",
        "Is this the 4th document?",
    ]
    token_pattern = r"([0-9]{1,3}(?:st|nd|rd|th))\s\b(\w{2,})\b"
    err_msg = "More than 1 capturing group in token pattern"
    vectorizer = CountVectorizer(token_pattern=token_pattern)
    with pytest.raises(ValueError, match=err_msg):
        vectorizer.fit(corpus)


def test_countvectorizer_uppercase_in_vocab():
    # Check that the check for uppercase in the provided vocabulary is only done at fit
    # time and not at transform time (#21251)
    vocabulary = ["Sample", "Upper", "Case", "Vocabulary"]
    message = (
        "Upper case characters found in"
        " vocabulary while 'lowercase'"
        " is True. These entries will not"
        " be matched with any documents"
    )

    vectorizer = CountVectorizer(lowercase=True, vocabulary=vocabulary)
    with pytest.warns(UserWarning, match=message):
        vectorizer.fit(vocabulary)

    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        vectorizer.transform(vocabulary)


def test_tf_transformer_feature_names_out():
    """Check get_feature_names_out for TfidfTransformer"""
    X = [[1, 1, 1], [1, 1, 0], [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm="l2").fit(X)

    feature_names_in = ["a", "c", "b"]
    feature_names_out = tr.get_feature_names_out(feature_names_in)
    assert_array_equal(feature_names_in, feature_names_out)


def test_tf_idf_smoothing():
    X = [[1, 1, 1], [1, 1, 0], [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm="l2")
    tfidf = tr.fit_transform(X).toarray()
    assert (tfidf >= 0).all()

    # check normalization
    assert_array_almost_equal((tfidf**2).sum(axis=1), [1.0, 1.0, 1.0])

    # this is robust to features with only zeros
    X = [[1, 1, 0], [1, 1, 0], [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm="l2")
    tfidf = tr.fit_transform(X).toarray()
    assert (tfidf >= 0).all()


def test_tfidf_no_smoothing():
    X = [[1, 1, 1], [1, 1, 0], [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm="l2")
    tfidf = tr.fit_transform(X).toarray()
    assert (tfidf >= 0).all()

    # check normalization
    assert_array_almost_equal((tfidf**2).sum(axis=1), [1.0, 1.0, 1.0])
    # the lack of smoothing makes IDF fragile in the presence of features with
    # only zeros
    X = [[1, 1, 0], [1, 1, 0], [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm="l2")

    in_warning_message = "divide by zero"
    with pytest.warns(RuntimeWarning, match=in_warning_message):
        tr.fit_transform(X).toarray()
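

# With sublinear_tf=True the raw term frequency tf is replaced by 1 + log(tf),
# so repeated occurrences of a term contribute sub-linearly to its weight.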
def test_sublinear_tf():
    X = [[1], [2], [3]]
    tr = TfidfTransformer(sublinear_tf=True, use_idf=False, norm=None)
    tfidf = tr.fit_transform(X).toarray()
    assert tfidf[0] == 1
    assert tfidf[1] > tfidf[0]
    assert tfidf[2] > tfidf[1]
    assert tfidf[1] < 2
    assert tfidf[2] < 3


def test_vectorizer():
    # raw documents as an iterator
    train_data = iter(ALL_FOOD_DOCS[:-1])
    test_data = [ALL_FOOD_DOCS[-1]]
    n_train = len(ALL_FOOD_DOCS) - 1

    # test without vocabulary
    v1 = CountVectorizer(max_df=0.5)
    counts_train = v1.fit_transform(train_data)
    if hasattr(counts_train, "tocsr"):
        counts_train = counts_train.tocsr()
    assert counts_train[0, v1.vocabulary_["pizza"]] == 2
    # build a vectorizer v2 with the same vocabulary as the one fitted by v1
    v2 = CountVectorizer(vocabulary=v1.vocabulary_)

    # compare that the two vectorizers give the same output on the test sample
    for v in (v1, v2):
        counts_test = v.transform(test_data)
        if hasattr(counts_test, "tocsr"):
            counts_test = counts_test.tocsr()

        vocabulary = v.vocabulary_
        assert counts_test[0, vocabulary["salad"]] == 1
        assert counts_test[0, vocabulary["tomato"]] == 1
        assert counts_test[0, vocabulary["water"]] == 1

        # stop word from the fixed list
        assert "the" not in vocabulary
        # stop word found automatically by the vectorizer DF thresholding:
        # words that are highly frequent across the complete corpus are likely
        # to be uninformative (either real stop words or extraction artifacts)
        assert "copyright" not in vocabulary

        # not present in the sample
        assert counts_test[0, vocabulary["coke"]] == 0
        assert counts_test[0, vocabulary["burger"]] == 0
        assert counts_test[0, vocabulary["beer"]] == 0
        assert counts_test[0, vocabulary["pizza"]] == 0

    # test tf-idf
    t1 = TfidfTransformer(norm="l1")
    tfidf = t1.fit(counts_train).transform(counts_train).toarray()
    assert len(t1.idf_) == len(v1.vocabulary_)
    assert tfidf.shape == (n_train, len(v1.vocabulary_))

    # test tf-idf with new data
    tfidf_test = t1.transform(counts_test).toarray()
    assert tfidf_test.shape == (len(test_data), len(v1.vocabulary_))

    # test tf alone
    t2 = TfidfTransformer(norm="l1", use_idf=False)
    tf = t2.fit(counts_train).transform(counts_train).toarray()
    assert not hasattr(t2, "idf_")

    # test idf transform with unlearned idf vector
    t3 = TfidfTransformer(use_idf=True)
    with pytest.raises(ValueError):
        t3.transform(counts_train)

    # L1-normalized term frequencies sum to one
    assert_array_almost_equal(np.sum(tf, axis=1), [1.0] * n_train)

    # test the direct tfidf vectorizer
    # (equivalent to term count vectorizer + tfidf transformer)
    train_data = iter(ALL_FOOD_DOCS[:-1])
    tv = TfidfVectorizer(norm="l1")

    tv.max_df = v1.max_df
    tfidf2 = tv.fit_transform(train_data).toarray()
    assert not tv.fixed_vocabulary_
    assert_array_almost_equal(tfidf, tfidf2)

    # test the direct tfidf vectorizer with new data
    tfidf_test2 = tv.transform(test_data).toarray()
    assert_array_almost_equal(tfidf_test, tfidf_test2)

    # test transform on unfitted vectorizer with empty vocabulary
    v3 = CountVectorizer(vocabulary=None)
    with pytest.raises(ValueError):
        v3.transform(train_data)

    # ascii preprocessor?
    v3.set_params(strip_accents="ascii", lowercase=False)
    processor = v3.build_preprocessor()
    text = "J'ai mangé du kangourou ce midi, c'était pas très bon."
    expected = strip_accents_ascii(text)
    result = processor(text)
    assert expected == result

    # error on bad strip_accents param
    v3.set_params(strip_accents="_gabbledegook_", preprocessor=None)
    with pytest.raises(ValueError):
        v3.build_preprocessor()
    # error with bad analyzer type
    v3.set_params(strip_accents=None, analyzer="_invalid_analyzer_type_")
    with pytest.raises(ValueError):
        v3.build_analyzer()


def test_tfidf_vectorizer_setters():
    norm, use_idf, smooth_idf, sublinear_tf = "l2", False, False, False
    tv = TfidfVectorizer(
        norm=norm, use_idf=use_idf, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf
    )
    tv.fit(JUNK_FOOD_DOCS)
    assert tv._tfidf.norm == norm
    assert tv._tfidf.use_idf == use_idf
    assert tv._tfidf.smooth_idf == smooth_idf
    assert tv._tfidf.sublinear_tf == sublinear_tf

    # assigning value to `TfidfTransformer` should not have any effect until
    # fitting
    tv.norm = "l1"
    tv.use_idf = True
    tv.smooth_idf = True
    tv.sublinear_tf = True
    assert tv._tfidf.norm == norm
    assert tv._tfidf.use_idf == use_idf
    assert tv._tfidf.smooth_idf == smooth_idf
    assert tv._tfidf.sublinear_tf == sublinear_tf

    tv.fit(JUNK_FOOD_DOCS)
    assert tv._tfidf.norm == tv.norm
    assert tv._tfidf.use_idf == tv.use_idf
    assert tv._tfidf.smooth_idf == tv.smooth_idf
    assert tv._tfidf.sublinear_tf == tv.sublinear_tf


@fails_if_pypy
def test_hashing_vectorizer():
    v = HashingVectorizer()
    X = v.transform(ALL_FOOD_DOCS)
    token_nnz = X.nnz
    assert X.shape == (len(ALL_FOOD_DOCS), v.n_features)
    assert X.dtype == v.dtype

    # By default the hashed values receive a random sign and l2 normalization
    # makes the feature values bounded
    assert np.min(X.data) > -1
    assert np.min(X.data) < 0
    assert np.max(X.data) > 0
    assert np.max(X.data) < 1

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[i].data, 2), 1.0)
    # Check vectorization with some non-default parameters
    v = HashingVectorizer(ngram_range=(1, 2), norm="l1")
    X = v.transform(ALL_FOOD_DOCS)
    assert X.shape == (len(ALL_FOOD_DOCS), v.n_features)
    assert X.dtype == v.dtype

    # ngrams generate more non zeros
    ngrams_nnz = X.nnz
    assert ngrams_nnz > token_nnz
    assert ngrams_nnz < 2 * token_nnz

    # makes the feature values bounded
    assert np.min(X.data) > -1
    assert np.max(X.data) < 1

    # Check that the rows are normalized
    for i in range(X.shape[0]):
        assert_almost_equal(np.linalg.norm(X[i].data, 1), 1.0)


def test_feature_names():
    cv = CountVectorizer(max_df=0.5)
    # test for ValueError on unfitted/empty vocabulary
    with pytest.raises(ValueError):
        cv.get_feature_names_out()
    assert not cv.fixed_vocabulary_

    # test for vocabulary learned from data
    X = cv.fit_transform(ALL_FOOD_DOCS)
    n_samples, n_features = X.shape
    assert len(cv.vocabulary_) == n_features

    feature_names = cv.get_feature_names_out()
    assert isinstance(feature_names, np.ndarray)
    assert feature_names.dtype == object

    assert len(feature_names) == n_features
    assert_array_equal(
        [
            "beer",
            "burger",
            "celeri",
            "coke",
            "pizza",
            "salad",
            "sparkling",
            "tomato",
            "water",
        ],
        feature_names,
    )

    for idx, name in enumerate(feature_names):
        assert idx == cv.vocabulary_.get(name)

    # test for custom vocabulary
    vocab = [
        "beer",
        "burger",
        "celeri",
        "coke",
        "pizza",
        "salad",
        "sparkling",
        "tomato",
        "water",
    ]

    cv = CountVectorizer(vocabulary=vocab)
    feature_names = cv.get_feature_names_out()
    assert_array_equal(
        [
            "beer",
            "burger",
            "celeri",
            "coke",
            "pizza",
            "salad",
            "sparkling",
            "tomato",
            "water",
        ],
        feature_names,
    )
    assert cv.fixed_vocabulary_

    for idx, name in enumerate(feature_names):
        assert idx == cv.vocabulary_.get(name)


@pytest.mark.parametrize("Vectorizer", (CountVectorizer, TfidfVectorizer))
def test_vectorizer_max_features(Vectorizer):
    expected_vocabulary = {"burger", "beer", "salad", "pizza"}
    expected_stop_words = {
        "celeri",
        "tomato",
        "copyright",
        "coke",
        "sparkling",
        "water",
        "the",
    }

    # test bounded number of extracted features
    vectorizer = Vectorizer(max_df=0.6, max_features=4)
    vectorizer.fit(ALL_FOOD_DOCS)
    assert set(vectorizer.vocabulary_) == expected_vocabulary
    assert vectorizer.stop_words_ == expected_stop_words


def test_count_vectorizer_max_features():
    # Regression test: max_features didn't work correctly in 0.14.
    cv_1 = CountVectorizer(max_features=1)
    cv_3 = CountVectorizer(max_features=3)
    cv_None = CountVectorizer(max_features=None)

    counts_1 = cv_1.fit_transform(JUNK_FOOD_DOCS).sum(axis=0)
    counts_3 = cv_3.fit_transform(JUNK_FOOD_DOCS).sum(axis=0)
    counts_None = cv_None.fit_transform(JUNK_FOOD_DOCS).sum(axis=0)

    features_1 = cv_1.get_feature_names_out()
    features_3 = cv_3.get_feature_names_out()
    features_None = cv_None.get_feature_names_out()

    # The most common feature is "the", with frequency 7.
    assert 7 == counts_1.max()
    assert 7 == counts_3.max()
    assert 7 == counts_None.max()

    # The most common feature should be the same
    assert "the" == features_1[np.argmax(counts_1)]
    assert "the" == features_3[np.argmax(counts_3)]
    assert "the" == features_None[np.argmax(counts_None)]


def test_vectorizer_max_df():
    test_data = ["abc", "dea", "eat"]
    vect = CountVectorizer(analyzer="char", max_df=1.0)
    vect.fit(test_data)
    assert "a" in vect.vocabulary_.keys()
    assert len(vect.vocabulary_.keys()) == 6
    assert len(vect.stop_words_) == 0

    vect.max_df = 0.5  # 0.5 * 3 documents -> max_doc_count == 1.5
    vect.fit(test_data)
    assert "a" not in vect.vocabulary_.keys()  # {ae} ignored
    assert len(vect.vocabulary_.keys()) == 4  # {bcdt} remain
    assert "a" in vect.stop_words_
    assert len(vect.stop_words_) == 2

    vect.max_df = 1
    vect.fit(test_data)
    assert "a" not in vect.vocabulary_.keys()  # {ae} ignored
    assert len(vect.vocabulary_.keys()) == 4  # {bcdt} remain
    assert "a" in vect.stop_words_
    assert len(vect.stop_words_) == 2


def test_vectorizer_min_df():
    test_data = ["abc", "dea", "eat"]
    vect = CountVectorizer(analyzer="char", min_df=1)
    vect.fit(test_data)
    assert "a" in vect.vocabulary_.keys()
    assert len(vect.vocabulary_.keys()) == 6
    assert len(vect.stop_words_) == 0

    vect.min_df = 2
    vect.fit(test_data)
    assert "c" not in vect.vocabulary_.keys()  # {bcdt} ignored
    assert len(vect.vocabulary_.keys()) == 2  # {ae} remain
    assert "c" in vect.stop_words_
    assert len(vect.stop_words_) == 4

    vect.min_df = 0.8  # 0.8 * 3 documents -> min_doc_count == 2.4
    vect.fit(test_data)
    assert "c" not in vect.vocabulary_.keys()  # {bcdet} ignored
    assert len(vect.vocabulary_.keys()) == 1  # {a} remains
    assert "c" in vect.stop_words_
    assert len(vect.stop_words_) == 5


def test_count_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ["aaabc", "abbde"]
    vect = CountVectorizer(analyzer="char", max_df=1.0)
    X = vect.fit_transform(test_data).toarray()
    assert_array_equal(["a", "b", "c", "d", "e"], vect.get_feature_names_out())
    assert_array_equal([[3, 1, 1, 0, 0], [1, 2, 0, 1, 1]], X)

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = CountVectorizer(analyzer="char", max_df=1.0, binary=True)
    X = vect.fit_transform(test_data).toarray()
    assert_array_equal([[1, 1, 1, 0, 0], [1, 1, 0, 1, 1]], X)

    # check the ability to change the dtype
    vect = CountVectorizer(analyzer="char", max_df=1.0, binary=True, dtype=np.float32)
    X_sparse = vect.fit_transform(test_data)
    assert X_sparse.dtype == np.float32


@fails_if_pypy
def test_hashed_binary_occurrences():
    # by default multiple occurrences are counted as longs
    test_data = ["aaabc", "abbde"]
    vect = HashingVectorizer(alternate_sign=False, analyzer="char", norm=None)
    X = vect.transform(test_data)
    assert np.max(X[0:1].data) == 3
    assert np.max(X[1:2].data) == 2
    assert X.dtype == np.float64

    # using boolean features, we can fetch the binary occurrence info
    # instead.
    vect = HashingVectorizer(
        analyzer="char", alternate_sign=False, binary=True, norm=None
    )
    X = vect.transform(test_data)
    assert np.max(X.data) == 1
    assert X.dtype == np.float64

    # check the ability to change the dtype
    vect = HashingVectorizer(
        analyzer="char", alternate_sign=False, binary=True, norm=None, dtype=np.float64
    )
    X = vect.transform(test_data)
    assert X.dtype == np.float64


@pytest.mark.parametrize("Vectorizer", (CountVectorizer, TfidfVectorizer))
def test_vectorizer_inverse_transform(Vectorizer):
    # raw documents
    data = ALL_FOOD_DOCS
    vectorizer = Vectorizer()
    transformed_data = vectorizer.fit_transform(data)
    inversed_data = vectorizer.inverse_transform(transformed_data)
    assert isinstance(inversed_data, list)

    analyze = vectorizer.build_analyzer()
    for doc, inversed_terms in zip(data, inversed_data):
        terms = np.sort(np.unique(analyze(doc)))
        inversed_terms = np.sort(np.unique(inversed_terms))
        assert_array_equal(terms, inversed_terms)

    assert sparse.issparse(transformed_data)
    assert transformed_data.format == "csr"

    # Test that inverse_transform also works with numpy arrays and
    # scipy
    transformed_data2 = transformed_data.toarray()
    inversed_data2 = vectorizer.inverse_transform(transformed_data2)
    for terms, terms2 in zip(inversed_data, inversed_data2):
        assert_array_equal(np.sort(terms), np.sort(terms2))

    # Check that inverse_transform also works on non CSR sparse data:
    transformed_data3 = transformed_data.tocsc()
    inversed_data3 = vectorizer.inverse_transform(transformed_data3)
    for terms, terms3 in zip(inversed_data, inversed_data3):
        assert_array_equal(np.sort(terms), np.sort(terms3))


def test_count_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # label junk food as -1, the others as +1
    target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)

    # split the dataset for model development and final evaluation
    train_data, test_data, target_train, target_test = train_test_split(
        data, target, test_size=0.2, random_state=0
    )

    pipeline = Pipeline([("vect", CountVectorizer()), ("svc", LinearSVC(dual="auto"))])

    parameters = {
        "vect__ngram_range": [(1, 1), (1, 2)],
        "svc__loss": ("hinge", "squared_hinge"),
    }

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1, cv=3)

    # Check that the best model found by grid search is 100% correct on the
    # held out evaluation set.
    pred = grid_search.fit(train_data, target_train).predict(test_data)
    assert_array_equal(pred, target_test)
    # on this toy dataset, several parameter combinations reach 100% accuracy,
    # so the grid search keeps the first of the best candidates: the unigram
    # representation
    assert grid_search.best_score_ == 1.0
    best_vectorizer = grid_search.best_estimator_.named_steps["vect"]
    assert best_vectorizer.ngram_range == (1, 1)


def test_vectorizer_pipeline_grid_selection():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # label junk food as -1, the others as +1
    target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)

    # split the dataset for model development and final evaluation
    train_data, test_data, target_train, target_test = train_test_split(
        data, target, test_size=0.1, random_state=0
    )

    pipeline = Pipeline([("vect", TfidfVectorizer()), ("svc", LinearSVC(dual="auto"))])

    parameters = {
        "vect__ngram_range": [(1, 1), (1, 2)],
        "vect__norm": ("l1", "l2"),
        "svc__loss": ("hinge", "squared_hinge"),
    }

    # find the best parameters for both the feature extraction and the
    # classifier
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1)

    # Check that the best model found by grid search is 100% correct on the
    # held out evaluation set.
    pred = grid_search.fit(train_data, target_train).predict(test_data)
    assert_array_equal(pred, target_test)
    # on this toy dataset, several parameter combinations reach 100% accuracy,
    # so the grid search keeps the first of the best candidates: the unigram
    # representation
    assert grid_search.best_score_ == 1.0
    best_vectorizer = grid_search.best_estimator_.named_steps["vect"]
    assert best_vectorizer.ngram_range == (1, 1)
    assert best_vectorizer.norm == "l2"
    assert not best_vectorizer.fixed_vocabulary_


def test_vectorizer_pipeline_cross_validation():
    # raw documents
    data = JUNK_FOOD_DOCS + NOTJUNK_FOOD_DOCS

    # label junk food as -1, the others as +1
    target = [-1] * len(JUNK_FOOD_DOCS) + [1] * len(NOTJUNK_FOOD_DOCS)

    pipeline = Pipeline([("vect", TfidfVectorizer()), ("svc", LinearSVC(dual="auto"))])

    cv_scores = cross_val_score(pipeline, data, target, cv=3)
    assert_array_equal(cv_scores, [1.0, 1.0, 1.0])


@fails_if_pypy
def test_vectorizer_unicode():
    # tests that the count vectorizer works with cyrillic.
    document = (
        "Машинное обучение — обширный подраздел искусственного "
        "интеллекта, изучающий методы построения алгоритмов, "
        "способных обучаться."
    )

    vect = CountVectorizer()
    X_counted = vect.fit_transform([document])
    assert X_counted.shape == (1, 12)

    vect = HashingVectorizer(norm=None, alternate_sign=False)
    X_hashed = vect.transform([document])
    assert X_hashed.shape == (1, 2**20)

    # No collisions on such a small dataset
    assert X_counted.nnz == X_hashed.nnz

    # When norm is None and not alternate_sign, the tokens are counted up to
    # collisions
    assert_array_equal(np.sort(X_counted.data), np.sort(X_hashed.data))


def test_tfidf_vectorizer_with_fixed_vocabulary():
    # non regression smoke test for inheritance issues
    vocabulary = ["pizza", "celeri"]
    vect = TfidfVectorizer(vocabulary=vocabulary)
    X_1 = vect.fit_transform(ALL_FOOD_DOCS)
    X_2 = vect.transform(ALL_FOOD_DOCS)
    assert_array_almost_equal(X_1.toarray(), X_2.toarray())
    assert vect.fixed_vocabulary_


def test_pickling_vectorizer():
    instances = [
        HashingVectorizer(),
        HashingVectorizer(norm="l1"),
        HashingVectorizer(binary=True),
        HashingVectorizer(ngram_range=(1, 2)),
        CountVectorizer(),
        CountVectorizer(preprocessor=strip_tags),
        CountVectorizer(analyzer=lazy_analyze),
        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
        TfidfVectorizer(),
        TfidfVectorizer(analyzer=lazy_analyze),
        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
    ]

    for orig in instances:
        s = pickle.dumps(orig)
        copy = pickle.loads(s)
        assert type(copy) == orig.__class__
        assert copy.get_params() == orig.get_params()
        if IS_PYPY and isinstance(orig, HashingVectorizer):
            continue
        else:
            assert_allclose_dense_sparse(
                copy.fit_transform(JUNK_FOOD_DOCS),
                orig.fit_transform(JUNK_FOOD_DOCS),
            )


@pytest.mark.parametrize(
    "factory",
    [
        CountVectorizer.build_analyzer,
        CountVectorizer.build_preprocessor,
        CountVectorizer.build_tokenizer,
    ],
)
def test_pickling_built_processors(factory):
    """Check that the built analyzers, preprocessors and tokenizers can be pickled.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12833
    """
    vec = CountVectorizer()
    function = factory(vec)
    text = "J'ai mangé du kangourou ce midi, c'était pas très bon."
    roundtripped_function = pickle.loads(pickle.dumps(function))
    expected = function(text)
    result = roundtripped_function(text)
    assert result == expected


def test_countvectorizer_vocab_sets_when_pickling():
    # ensure that vocabulary of type set is coerced to a list to
    # preserve iteration ordering after deserialization
    rng = np.random.RandomState(0)
    vocab_words = np.array(
        [
            "beer",
            "burger",
            "celeri",
            "coke",
            "pizza",
            "salad",
            "sparkling",
            "tomato",
            "water",
        ]
    )
    for x in range(0, 100):
        vocab_set = set(rng.choice(vocab_words, size=5, replace=False))
        cv = CountVectorizer(vocabulary=vocab_set)
        unpickled_cv = pickle.loads(pickle.dumps(cv))
        cv.fit(ALL_FOOD_DOCS)
        unpickled_cv.fit(ALL_FOOD_DOCS)
        assert_array_equal(
            cv.get_feature_names_out(), unpickled_cv.get_feature_names_out()
        )
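

# Same check as above, but with vocabularies passed as dicts: the explicit
# term-to-index mapping should survive a pickle round-trip unchanged.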
def test_countvectorizer_vocab_dicts_when_pickling():
    rng = np.random.RandomState(0)
    vocab_words = np.array(
        [
            "beer",
            "burger",
            "celeri",
            "coke",
            "pizza",
            "salad",
            "sparkling",
            "tomato",
            "water",
        ]
    )
    for x in range(0, 100):
        vocab_dict = dict()
        words = rng.choice(vocab_words, size=5, replace=False)
        for y in range(0, 5):
            vocab_dict[words[y]] = y
        cv = CountVectorizer(vocabulary=vocab_dict)
        unpickled_cv = pickle.loads(pickle.dumps(cv))
        cv.fit(ALL_FOOD_DOCS)
        unpickled_cv.fit(ALL_FOOD_DOCS)
        assert_array_equal(
            cv.get_feature_names_out(), unpickled_cv.get_feature_names_out()
        )


def test_stop_words_removal():
    # Ensure that deleting the stop_words_ attribute doesn't affect transform
    fitted_vectorizers = (
        TfidfVectorizer().fit(JUNK_FOOD_DOCS),
        CountVectorizer(preprocessor=strip_tags).fit(JUNK_FOOD_DOCS),
        CountVectorizer(strip_accents=strip_eacute).fit(JUNK_FOOD_DOCS),
    )

    for vect in fitted_vectorizers:
        vect_transform = vect.transform(JUNK_FOOD_DOCS).toarray()

        vect.stop_words_ = None
        stop_None_transform = vect.transform(JUNK_FOOD_DOCS).toarray()

        delattr(vect, "stop_words_")
        stop_del_transform = vect.transform(JUNK_FOOD_DOCS).toarray()

        assert_array_equal(stop_None_transform, vect_transform)
        assert_array_equal(stop_del_transform, vect_transform)


def test_pickling_transformer():
    X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
    orig = TfidfTransformer().fit(X)
    s = pickle.dumps(orig)
    copy = pickle.loads(s)
    assert type(copy) == orig.__class__
    assert_array_equal(copy.fit_transform(X).toarray(), orig.fit_transform(X).toarray())


def test_transformer_idf_setter():
    X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
    orig = TfidfTransformer().fit(X)
    copy = TfidfTransformer()
    copy.idf_ = orig.idf_
    assert_array_equal(copy.transform(X).toarray(), orig.transform(X).toarray())


def test_tfidf_vectorizer_setter():
    orig = TfidfVectorizer(use_idf=True)
    orig.fit(JUNK_FOOD_DOCS)
    copy = TfidfVectorizer(vocabulary=orig.vocabulary_, use_idf=True)
    copy.idf_ = orig.idf_
    assert_array_equal(
        copy.transform(JUNK_FOOD_DOCS).toarray(),
        orig.transform(JUNK_FOOD_DOCS).toarray(),
    )

    # `idf_` cannot be set with `use_idf=False`
    copy = TfidfVectorizer(vocabulary=orig.vocabulary_, use_idf=False)
    err_msg = "`idf_` cannot be set when `user_idf=False`."
    with pytest.raises(ValueError, match=err_msg):
        copy.idf_ = orig.idf_


def test_tfidfvectorizer_invalid_idf_attr():
    vect = TfidfVectorizer(use_idf=True)
    vect.fit(JUNK_FOOD_DOCS)
    copy = TfidfVectorizer(vocabulary=vect.vocabulary_, use_idf=True)
    expected_idf_len = len(vect.idf_)
    invalid_idf = [1.0] * (expected_idf_len + 1)
    with pytest.raises(ValueError):
        setattr(copy, "idf_", invalid_idf)


def test_non_unique_vocab():
    vocab = ["a", "b", "c", "a", "a"]
    vect = CountVectorizer(vocabulary=vocab)
    with pytest.raises(ValueError):
        vect.fit([])


@fails_if_pypy
def test_hashingvectorizer_nan_in_docs():
    # np.nan can appear when using pandas to load text fields from a csv file
    # with missing values.
    message = "np.nan is an invalid document, expected byte or unicode string."
    exception = ValueError

    def func():
        hv = HashingVectorizer()
        hv.fit_transform(["hello world", np.nan, "hello hello"])

    with pytest.raises(exception, match=message):
        func()


def test_tfidfvectorizer_binary():
    # Non-regression test: TfidfVectorizer used to ignore its "binary" param.
    v = TfidfVectorizer(binary=True, use_idf=False, norm=None)
    assert v.binary

    X = v.fit_transform(["hello world", "hello hello"]).toarray()
    assert_array_equal(X.ravel(), [1, 1, 1, 0])
    X2 = v.transform(["hello world", "hello hello"]).toarray()
    assert_array_equal(X2.ravel(), [1, 1, 1, 0])


def test_tfidfvectorizer_export_idf():
    vect = TfidfVectorizer(use_idf=True)
    vect.fit(JUNK_FOOD_DOCS)
    assert_array_almost_equal(vect.idf_, vect._tfidf.idf_)
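

# clone() copies the vocabulary parameter, so a cloned vectorizer should learn
# exactly the same restricted vocabulary as the original.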
def test_vectorizer_vocab_clone():
    vect_vocab = TfidfVectorizer(vocabulary=["the"])
    vect_vocab_clone = clone(vect_vocab)
    vect_vocab.fit(ALL_FOOD_DOCS)
    vect_vocab_clone.fit(ALL_FOOD_DOCS)
    assert vect_vocab_clone.vocabulary_ == vect_vocab.vocabulary_


@pytest.mark.parametrize(
    "Vectorizer", (CountVectorizer, TfidfVectorizer, HashingVectorizer)
)
def test_vectorizer_string_object_as_input(Vectorizer):
    message = "Iterable over raw text documents expected, string object received."
    vec = Vectorizer()

    with pytest.raises(ValueError, match=message):
        vec.fit_transform("hello world!")

    with pytest.raises(ValueError, match=message):
        vec.fit("hello world!")
    vec.fit(["some text", "some other text"])

    with pytest.raises(ValueError, match=message):
        vec.transform("hello world!")


@pytest.mark.parametrize("X_dtype", [np.float32, np.float64])
def test_tfidf_transformer_type(X_dtype):
    X = sparse.rand(10, 20000, dtype=X_dtype, random_state=42)
    X_trans = TfidfTransformer().fit_transform(X)
    assert X_trans.dtype == X.dtype


def test_tfidf_transformer_sparse():
    X = sparse.rand(10, 20000, dtype=np.float64, random_state=42)
    X_csc = sparse.csc_matrix(X)
    X_csr = sparse.csr_matrix(X)

    X_trans_csc = TfidfTransformer().fit_transform(X_csc)
    X_trans_csr = TfidfTransformer().fit_transform(X_csr)
    assert_allclose_dense_sparse(X_trans_csc, X_trans_csr)
    assert X_trans_csc.format == X_trans_csr.format


@pytest.mark.parametrize(
    "vectorizer_dtype, output_dtype, warning_expected",
    [
        (np.int32, np.float64, True),
        (np.int64, np.float64, True),
        (np.float32, np.float32, False),
        (np.float64, np.float64, False),
    ],
)
def test_tfidf_vectorizer_type(vectorizer_dtype, output_dtype, warning_expected):
    X = np.array(["numpy", "scipy", "sklearn"])
    vectorizer = TfidfVectorizer(dtype=vectorizer_dtype)

    warning_msg_match = "'dtype' should be used."
    if warning_expected:
        with pytest.warns(UserWarning, match=warning_msg_match):
            X_idf = vectorizer.fit_transform(X)
    else:
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            X_idf = vectorizer.fit_transform(X)
    assert X_idf.dtype == output_dtype


@pytest.mark.parametrize(
    "vec",
    [
        HashingVectorizer(ngram_range=(2, 1)),
        CountVectorizer(ngram_range=(2, 1)),
        TfidfVectorizer(ngram_range=(2, 1)),
    ],
)
def test_vectorizers_invalid_ngram_range(vec):
    # vectorizers could be initialized with invalid ngram range
    # test for raising error message
    invalid_range = vec.ngram_range
    message = re.escape(
        f"Invalid value for ngram_range={invalid_range} "
        "lower boundary larger than the upper boundary."
    )
    if isinstance(vec, HashingVectorizer) and IS_PYPY:
        pytest.xfail(reason="HashingVectorizer is not supported on PyPy")

    with pytest.raises(ValueError, match=message):
        vec.fit(["good news everyone"])

    with pytest.raises(ValueError, match=message):
        vec.fit_transform(["good news everyone"])

    if isinstance(vec, HashingVectorizer):
        with pytest.raises(ValueError, match=message):
            vec.transform(["good news everyone"])
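

# Helper mirroring the vectorizers' internal stop-word consistency check, using
# the estimator's own preprocessor and tokenizer.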
def _check_stop_words_consistency(estimator):
    stop_words = estimator.get_stop_words()
    tokenize = estimator.build_tokenizer()
    preprocess = estimator.build_preprocessor()
    return estimator._check_stop_words_consistency(stop_words, preprocess, tokenize)


@fails_if_pypy
def test_vectorizer_stop_words_inconsistent():
    lstr = r"\['and', 'll', 've'\]"
    message = (
        "Your stop_words may be inconsistent with your "
        "preprocessing. Tokenizing the stop words generated "
        "tokens %s not in stop_words." % lstr
    )
    for vec in [CountVectorizer(), TfidfVectorizer(), HashingVectorizer()]:
        vec.set_params(stop_words=["you've", "you", "you'll", "AND"])
        with pytest.warns(UserWarning, match=message):
            vec.fit_transform(["hello world"])
        # reset stop word validation
        del vec._stop_words_id
        assert _check_stop_words_consistency(vec) is False

        # Only one warning per stop list
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            vec.fit_transform(["hello world"])
        assert _check_stop_words_consistency(vec) is None

        # Test caching of inconsistency assessment
        vec.set_params(stop_words=["you've", "you", "you'll", "blah", "AND"])
        with pytest.warns(UserWarning, match=message):
            vec.fit_transform(["hello world"])


@skip_if_32bit
def test_countvectorizer_sort_features_64bit_sparse_indices():
    """
    Check that CountVectorizer._sort_features preserves the dtype of its sparse
    feature matrix.

    This test is skipped on 32bit platforms, see:
    https://github.com/scikit-learn/scikit-learn/pull/11295
    for more details.
    """
    X = sparse.csr_matrix((5, 5), dtype=np.int64)

    # force indices and indptr to int64.
    INDICES_DTYPE = np.int64
    X.indices = X.indices.astype(INDICES_DTYPE)
    X.indptr = X.indptr.astype(INDICES_DTYPE)

    vocabulary = {"scikit-learn": 0, "is": 1, "great!": 2}

    Xs = CountVectorizer()._sort_features(X, vocabulary)

    assert INDICES_DTYPE == Xs.indices.dtype


@fails_if_pypy
@pytest.mark.parametrize(
    "Estimator", [CountVectorizer, TfidfVectorizer, HashingVectorizer]
)
def test_stop_word_validation_custom_preprocessor(Estimator):
    data = [{"text": "some text"}]

    vec = Estimator()
    assert _check_stop_words_consistency(vec) is True

    vec = Estimator(preprocessor=lambda x: x["text"], stop_words=["and"])
    assert _check_stop_words_consistency(vec) == "error"
    # checks are cached
    assert _check_stop_words_consistency(vec) is None
    vec.fit_transform(data)

    class CustomEstimator(Estimator):
        def build_preprocessor(self):
            return lambda x: x["text"]

    vec = CustomEstimator(stop_words=["and"])
    assert _check_stop_words_consistency(vec) == "error"

    vec = Estimator(
        tokenizer=lambda doc: re.compile(r"\w{1,}").findall(doc), stop_words=["and"]
    )
    assert _check_stop_words_consistency(vec) is True


@pytest.mark.parametrize(
    "Estimator", [CountVectorizer, TfidfVectorizer, HashingVectorizer]
)
@pytest.mark.parametrize(
    "input_type, err_type, err_msg",
    [
        ("filename", FileNotFoundError, ""),
        ("file", AttributeError, "'str' object has no attribute 'read'"),
    ],
)
def test_callable_analyzer_error(Estimator, input_type, err_type, err_msg):
    if issubclass(Estimator, HashingVectorizer) and IS_PYPY:
        pytest.xfail("HashingVectorizer is not supported on PyPy")
    data = ["this is text, not file or filename"]
    with pytest.raises(err_type, match=err_msg):
        Estimator(analyzer=lambda x: x.split(), input=input_type).fit_transform(data)


@pytest.mark.parametrize(
    "Estimator",
    [
        CountVectorizer,
        TfidfVectorizer,
        pytest.param(HashingVectorizer, marks=fails_if_pypy),
    ],
)
@pytest.mark.parametrize(
    "analyzer", [lambda doc: open(doc, "r"), lambda doc: doc.read()]
)
@pytest.mark.parametrize("input_type", ["file", "filename"])
def test_callable_analyzer_change_behavior(Estimator, analyzer, input_type):
    data = ["this is text, not file or filename"]
    with pytest.raises((FileNotFoundError, AttributeError)):
        Estimator(analyzer=analyzer, input=input_type).fit_transform(data)


@pytest.mark.parametrize(
    "Estimator", [CountVectorizer, TfidfVectorizer, HashingVectorizer]
)
def test_callable_analyzer_reraise_error(tmpdir, Estimator):
    # check if a custom exception from the analyzer is shown to the user
    def analyzer(doc):
        raise Exception("testing")

    if issubclass(Estimator, HashingVectorizer) and IS_PYPY:
        pytest.xfail("HashingVectorizer is not supported on PyPy")

    f = tmpdir.join("file.txt")
    f.write("sample content\n")

    with pytest.raises(Exception, match="testing"):
        Estimator(analyzer=analyzer, input="file").fit_transform([f])
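

# Each case below sets one parameter (unused_name) that is expected to be
# ignored because another parameter (ovrd_name) overrides it; fitting should
# raise the corresponding "will not be used" UserWarning.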
@pytest.mark.parametrize(
    "Vectorizer", [CountVectorizer, HashingVectorizer, TfidfVectorizer]
)
@pytest.mark.parametrize(
    (
        "stop_words, tokenizer, preprocessor, ngram_range, token_pattern,"
        "analyzer, unused_name, ovrd_name, ovrd_msg"
    ),
    [
        (
            ["you've", "you'll"],
            None,
            None,
            (1, 1),
            None,
            "char",
            "'stop_words'",
            "'analyzer'",
            "!= 'word'",
        ),
        (
            None,
            lambda s: s.split(),
            None,
            (1, 1),
            None,
            "char",
            "'tokenizer'",
            "'analyzer'",
            "!= 'word'",
        ),
        (
            None,
            lambda s: s.split(),
            None,
            (1, 1),
            r"\w+",
            "word",
            "'token_pattern'",
            "'tokenizer'",
            "is not None",
        ),
        (
            None,
            None,
            lambda s: s.upper(),
            (1, 1),
            r"\w+",
            lambda s: s.upper(),
            "'preprocessor'",
            "'analyzer'",
            "is callable",
        ),
        (
            None,
            None,
            None,
            (1, 2),
            None,
            lambda s: s.upper(),
            "'ngram_range'",
            "'analyzer'",
            "is callable",
        ),
        (
            None,
            None,
            None,
            (1, 1),
            r"\w+",
            "char",
            "'token_pattern'",
            "'analyzer'",
            "!= 'word'",
        ),
    ],
)
def test_unused_parameters_warn(
    Vectorizer,
    stop_words,
    tokenizer,
    preprocessor,
    ngram_range,
    token_pattern,
    analyzer,
    unused_name,
    ovrd_name,
    ovrd_msg,
):
    train_data = JUNK_FOOD_DOCS
    # setting parameter and checking for corresponding warning messages
    vect = Vectorizer()
    vect.set_params(
        stop_words=stop_words,
        tokenizer=tokenizer,
        preprocessor=preprocessor,
        ngram_range=ngram_range,
        token_pattern=token_pattern,
        analyzer=analyzer,
    )
    msg = "The parameter %s will not be used since %s %s" % (
        unused_name,
        ovrd_name,
        ovrd_msg,
    )
    with pytest.warns(UserWarning, match=msg):
        vect.fit(train_data)


@pytest.mark.parametrize(
    "Vectorizer, X",
    (
        (HashingVectorizer, [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}]),
        (CountVectorizer, JUNK_FOOD_DOCS),
    ),
)
def test_n_features_in(Vectorizer, X):
    # For vectorizers, n_features_in_ does not make sense
    vectorizer = Vectorizer()
    assert not hasattr(vectorizer, "n_features_in_")
    vectorizer.fit(X)
    assert not hasattr(vectorizer, "n_features_in_")


def test_tie_breaking_sample_order_invariance():
    # Checks the sample order invariance when setting max_features
    # non-regression test for #17939
    vec = CountVectorizer(max_features=1)
    vocab1 = vec.fit(["hello", "world"]).vocabulary_
    vocab2 = vec.fit(["world", "hello"]).vocabulary_
    assert vocab1 == vocab2


@fails_if_pypy
def test_nonnegative_hashing_vectorizer_result_indices():
    # add test for pr 19035
    hashing = HashingVectorizer(n_features=1000000, ngram_range=(2, 3))
    indices = hashing.transform(["22pcs efuture"]).indices
    assert indices[0] >= 0


@pytest.mark.parametrize(
    "Estimator", [CountVectorizer, TfidfVectorizer, TfidfTransformer, HashingVectorizer]
)
def test_vectorizers_do_not_have_set_output(Estimator):
    """Check that vectorizers do not define set_output."""
    est = Estimator()
    assert not hasattr(est, "set_output")
|