| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325 |
- import re
- import numpy as np
- import pytest
- from scipy import sparse
- from sklearn.exceptions import NotFittedError
- from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
- from sklearn.utils import is_scalar_nan
- from sklearn.utils._testing import (
- _convert_container,
- assert_allclose,
- assert_array_equal,
- )
def test_one_hot_encoder_sparse_dense():
    """Sparse and dense outputs of `OneHotEncoder` must hold the same values."""
    data = np.array([[3, 2, 1], [0, 1, 1]])
    expected = [[0.0, 1.0, 0.0, 1.0, 1.0], [1.0, 0.0, 1.0, 0.0, 1.0]]

    encoded_sparse = OneHotEncoder().fit_transform(data)
    encoded_dense = OneHotEncoder(sparse_output=False).fit_transform(data)

    # Both outputs share a shape; only the container type differs.
    for encoded in (encoded_sparse, encoded_dense):
        assert encoded.shape == (2, 5)
    assert sparse.issparse(encoded_sparse)
    assert not sparse.issparse(encoded_dense)

    # Compare against the expected encoding, then against each other.
    densified = encoded_sparse.toarray()
    assert_array_equal(densified, expected)
    assert_array_equal(densified, encoded_dense)
@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"])
def test_one_hot_encoder_handle_unknown(handle_unknown):
    """Unknown categories raise with "error" and encode as zeros otherwise."""
    X_train = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
    X_unseen = np.array([[4, 1, 1]])

    # With handle_unknown="error", categories unseen during fit must raise.
    strict_encoder = OneHotEncoder(handle_unknown="error").fit(X_train)
    with pytest.raises(ValueError, match="Found unknown categories"):
        strict_encoder.transform(X_unseen)

    # With the lenient options, unknown categories are encoded as all zeros.
    lenient_encoder = OneHotEncoder(handle_unknown=handle_unknown).fit(X_train)
    X_input = X_unseen.copy()
    encoded = lenient_encoder.transform(X_input).toarray()
    assert_array_equal(encoded, np.array([[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]]))

    # The array passed to transform must not have been modified in place.
    assert_allclose(X_unseen, X_input)
def test_one_hot_encoder_not_fitted():
    """Calling `transform` on an unfitted encoder raises `NotFittedError`."""
    X = np.array([["a"], ["b"]])
    # `categories` takes one list per feature, so the single feature of `X`
    # needs a nested list.
    enc = OneHotEncoder(categories=[["a", "b"]])
    # `match` is interpreted as a regular expression; escape the message so
    # that its "." characters are matched literally.
    msg = re.escape(
        "This OneHotEncoder instance is not fitted yet. "
        "Call 'fit' with appropriate arguments before using this "
        "estimator."
    )
    with pytest.raises(NotFittedError, match=msg):
        enc.transform(X)
@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"])
def test_one_hot_encoder_handle_unknown_strings(handle_unknown):
    """Unknown strings are ignored when categories have a numpy string dtype.

    Non-regression test for issue #12470: covers in particular the case where
    the known category strings are larger than the unknown category strings.
    """
    X_train = np.array(["11111111", "22", "333", "4444"]).reshape((-1, 1))
    X_query = np.array(["55555", "22"]).reshape((-1, 1))
    expected = np.array([[0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]])

    encoder = OneHotEncoder(handle_unknown=handle_unknown).fit(X_train)
    X_input = X_query.copy()
    assert_array_equal(encoder.transform(X_input).toarray(), expected)

    # The array passed to transform must not have been modified in place.
    assert_array_equal(X_query, X_input)
@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
@pytest.mark.parametrize("input_dtype", [np.int32, np.float32, np.float64])
def test_one_hot_encoder_dtype(input_dtype, output_dtype):
    """Encode every combination of input dtype and requested output dtype."""
    X = np.asarray([[0, 1]], dtype=input_dtype).T
    X_expected = np.asarray([[1, 0], [0, 1]], dtype=output_dtype)

    # Sparse output: `fit_transform` and `fit` + `transform` must agree.
    sparse_enc = OneHotEncoder(categories="auto", dtype=output_dtype)
    for encoded in (sparse_enc.fit_transform(X), sparse_enc.fit(X).transform(X)):
        assert_array_equal(encoded.toarray(), X_expected)

    # Dense output: same two code paths.
    dense_enc = OneHotEncoder(
        categories="auto", dtype=output_dtype, sparse_output=False
    )
    for encoded in (dense_enc.fit_transform(X), dense_enc.fit(X).transform(X)):
        assert_array_equal(encoded, X_expected)
@pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
def test_one_hot_encoder_dtype_pandas(output_dtype):
    """Encode a mixed-type DataFrame for each requested output dtype."""
    pd = pytest.importorskip("pandas")

    X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})
    X_expected = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=output_dtype)

    # Sparse output: `fit_transform` and `fit` + `transform` must agree.
    sparse_enc = OneHotEncoder(dtype=output_dtype)
    for encoded in (
        sparse_enc.fit_transform(X_df),
        sparse_enc.fit(X_df).transform(X_df),
    ):
        assert_array_equal(encoded.toarray(), X_expected)

    # Dense output: same two code paths.
    dense_enc = OneHotEncoder(dtype=output_dtype, sparse_output=False)
    for encoded in (
        dense_enc.fit_transform(X_df),
        dense_enc.fit(X_df).transform(X_df),
    ):
        assert_array_equal(encoded, X_expected)
def test_one_hot_encoder_feature_names():
    """Check `get_feature_names_out` with default and custom input names."""
    X = [
        ["Male", 1, "girl", 2, 3],
        ["Female", 41, "girl", 1, 10],
        ["Male", 51, "boy", 12, 3],
        ["Male", 91, "girl", 21, 30],
    ]
    enc = OneHotEncoder().fit(X)

    # Categories of each column of `X`, in the order the output names use.
    ordered_categories = [
        ["Female", "Male"],
        [1, 41, 51, 91],
        ["boy", "girl"],
        [1, 2, 12, 21],
        [3, 10, 30],
    ]

    def expected_names(prefixes):
        # One "<prefix>_<category>" name per category, grouped by feature.
        return [
            f"{prefix}_{category}"
            for prefix, categories in zip(prefixes, ordered_categories)
            for category in categories
        ]

    default_names = ["x0", "x1", "x2", "x3", "x4"]
    assert_array_equal(expected_names(default_names), enc.get_feature_names_out())

    custom_names = ["one", "two", "three", "four", "five"]
    assert_array_equal(
        expected_names(custom_names), enc.get_feature_names_out(custom_names)
    )

    # The number of input feature names must match the number of features.
    with pytest.raises(ValueError, match="input_features should have length"):
        enc.get_feature_names_out(["one", "two"])
def test_one_hot_encoder_feature_names_unicode():
    """Non-ASCII characters are kept in category and input feature names."""
    X = np.array([["c❤t1", "dat2"]], dtype=object).T
    enc = OneHotEncoder().fit(X)

    default_names = enc.get_feature_names_out()
    assert_array_equal(["x0_c❤t1", "x0_dat2"], default_names)

    custom_names = enc.get_feature_names_out(input_features=["n👍me"])
    assert_array_equal(["n👍me_c❤t1", "n👍me_dat2"], custom_names)
def test_one_hot_encoder_custom_feature_name_combiner():
    """Check the behaviour of `feature_name_combiner` as a callable."""

    def name_combiner(feature, category):
        # `repr` tells the string "None" apart from the `None` object.
        return f"{feature}_{category!r}"

    def wrong_combiner(feature, category):
        # A combiner should return a Python string, not an integer.
        return 0

    X = np.array([["None", None]], dtype=object).T

    enc = OneHotEncoder(feature_name_combiner=name_combiner).fit(X)
    assert_array_equal(["x0_'None'", "x0_None"], enc.get_feature_names_out())
    assert_array_equal(
        ["a_'None'", "a_None"], enc.get_feature_names_out(input_features=["a"])
    )

    enc = OneHotEncoder(feature_name_combiner=wrong_combiner).fit(X)
    err_msg = (
        "When `feature_name_combiner` is a callable, it should return a Python string."
    )
    with pytest.raises(TypeError, match=err_msg):
        enc.get_feature_names_out()
def test_one_hot_encoder_set_params():
    """`categories` set through `set_params` is used by the next fit."""
    X = np.array([[1, 2]]).T
    oh = OneHotEncoder()

    # Set the parameter on an encoder that has not been fitted yet.
    four_categories = [[0, 1, 2, 3]]
    oh.set_params(categories=four_categories)
    assert oh.get_params()["categories"] == four_categories
    assert oh.fit_transform(X).toarray().shape == (2, 4)

    # Change the parameter on the already fitted encoder.
    oh.set_params(categories=[[0, 1, 2, 3, 4]])
    assert oh.fit_transform(X).toarray().shape == (2, 5)
def check_categorical_onehot(X):
    """Encode `X` with sparse and dense output and return the dense values.

    Asserts that both outputs hold the same values and that the sparse output
    is in CSR format.
    """
    encoded_sparse = OneHotEncoder(categories="auto").fit_transform(X)
    encoded_dense = OneHotEncoder(
        categories="auto", sparse_output=False
    ).fit_transform(X)

    assert_allclose(encoded_sparse.toarray(), encoded_dense)
    assert sparse.issparse(encoded_sparse)
    assert encoded_sparse.format == "csr"
    return encoded_sparse.toarray()
@pytest.mark.parametrize(
    "X",
    [
        [["def", 1, 55], ["abc", 2, 55]],
        np.array([[10, 1, 55], [5, 2, 55]]),
        np.array([["b", "A", "cat"], ["a", "B", "cat"]], dtype=object),
        np.array([["b", 1, "cat"], ["a", np.nan, "cat"]], dtype=object),
        np.array([["b", 1, "cat"], ["a", float("nan"), "cat"]], dtype=object),
        np.array([[None, 1, "cat"], ["a", 2, "cat"]], dtype=object),
        np.array([[None, 1, None], ["a", np.nan, None]], dtype=object),
        np.array([[None, 1, None], ["a", float("nan"), None]], dtype=object),
    ],
    ids=[
        "mixed",
        "numeric",
        "object",
        "mixed-nan",
        "mixed-float-nan",
        "mixed-None",
        "mixed-None-nan",
        "mixed-None-float-nan",
    ],
)
def test_one_hot_encoder(X):
    """Encode the first column, the first two columns, then all of `X`."""
    X_array = np.array(X)

    first_column = check_categorical_onehot(X_array[:, [0]])
    assert_allclose(first_column, [[0, 1], [1, 0]])

    first_two_columns = check_categorical_onehot(X_array[:, [0, 1]])
    assert_allclose(first_two_columns, [[0, 1, 1, 0], [1, 0, 0, 1]])

    # The full input is passed as given, without converting it to an array.
    all_columns = OneHotEncoder(categories="auto").fit_transform(X)
    assert_allclose(all_columns.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]])
@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"])
@pytest.mark.parametrize("sparse_", [False, True])
@pytest.mark.parametrize("drop", [None, "first"])
def test_one_hot_encoder_inverse(handle_unknown, sparse_, drop):
    """Round-trip data through `fit_transform` and `inverse_transform`."""
    mixed_rows = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]]
    numeric_rows = [[2, 55], [1, 55], [3, 55]]

    # Mixed string/integer input is compared against an object array.
    enc = OneHotEncoder(sparse_output=sparse_, drop=drop)
    encoded = enc.fit_transform(mixed_rows)
    assert_array_equal(
        enc.inverse_transform(encoded), np.array(mixed_rows, dtype=object)
    )

    # Purely numeric input.
    enc = OneHotEncoder(sparse_output=sparse_, categories="auto", drop=drop)
    encoded = enc.fit_transform(numeric_rows)
    assert_array_equal(enc.inverse_transform(encoded), np.array(numeric_rows))

    if drop is None:
        # with unknown categories
        # drop is incompatible with handle_unknown=ignore
        enc = OneHotEncoder(
            sparse_output=sparse_,
            handle_unknown=handle_unknown,
            categories=[["abc", "def"], [1, 2], [54, 55, 56]],
        )
        encoded = enc.fit_transform(mixed_rows)
        expected = np.array(mixed_rows, dtype=object)
        # 3 is not among the specified categories of the second feature.
        expected[2, 1] = None
        assert_array_equal(enc.inverse_transform(encoded), expected)

        # with an otherwise numerical output, still object if unknown
        enc = OneHotEncoder(
            sparse_output=sparse_,
            categories=[[1, 2], [54, 56]],
            handle_unknown=handle_unknown,
        )
        encoded = enc.fit_transform(numeric_rows)
        expected = np.array(numeric_rows, dtype=object)
        expected[2, 0] = None
        expected[:, 1] = None
        assert_array_equal(enc.inverse_transform(encoded), expected)

    # incorrect shape raises (checked on the most recently fitted encoder)
    wrong_shape = np.array([[0, 1, 1], [1, 0, 1]])
    msg = re.escape("Shape of the passed X data is not correct")
    with pytest.raises(ValueError, match=msg):
        enc.inverse_transform(wrong_shape)
@pytest.mark.parametrize("sparse_", [False, True])
@pytest.mark.parametrize(
    "X, X_trans",
    [
        ([[2, 55], [1, 55], [2, 55]], [[0, 1, 1], [0, 0, 0], [0, 1, 1]]),
        (
            [["one", "a"], ["two", "a"], ["three", "b"], ["two", "a"]],
            [[0, 0, 0, 0, 0], [0, 0, 0, 0, 1], [0, 1, 0, 0, 0]],
        ),
    ],
)
def test_one_hot_encoder_inverse_transform_raise_error_with_unknown(
    X, X_trans, sparse_
):
    """Check that `inverse_transform` raises an error with unknown samples, no
    dropped feature, and `handle_unknown="error"`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14934
    """
    encoder = OneHotEncoder(sparse_output=sparse_).fit(X)

    if sparse_:
        # Mimic the sparse data that a sparse one-hot encoder would output.
        X_trans = _convert_container(X_trans, "sparse")

    expected_msg = (
        r"Samples \[(\d )*\d\] can not be inverted when drop=None and "
        r"handle_unknown='error' because they contain all zeros"
    )
    with pytest.raises(ValueError, match=expected_msg):
        encoder.inverse_transform(X_trans)
def test_one_hot_encoder_inverse_if_binary():
    """`inverse_transform` recovers the input when `drop="if_binary"`."""
    X = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object)
    encoder = OneHotEncoder(drop="if_binary", sparse_output=False)
    round_trip = encoder.inverse_transform(encoder.fit_transform(X))
    assert_array_equal(round_trip, X)
@pytest.mark.parametrize("drop", ["if_binary", "first", None])
@pytest.mark.parametrize("reset_drop", ["if_binary", "first", None])
def test_one_hot_encoder_drop_reset(drop, reset_drop):
    """Resetting `drop` without refitting must not throw an error."""
    X = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object)
    ohe = OneHotEncoder(drop=drop, sparse_output=False).fit(X)
    encoded_before = ohe.transform(X)
    names_before = ohe.get_feature_names_out()

    ohe.set_params(drop=reset_drop)

    # Without a refit, the outputs must match those from before the reset.
    assert_array_equal(ohe.inverse_transform(encoded_before), X)
    assert_allclose(ohe.transform(X), encoded_before)
    assert_array_equal(ohe.get_feature_names_out(), names_before)
@pytest.mark.parametrize("method", ["fit", "fit_transform"])
@pytest.mark.parametrize("X", [[1, 2], np.array([3.0, 4.0])])
def test_X_is_not_1D(X, method):
    """1D lists and arrays are rejected by `fit` and `fit_transform`."""
    fit_method = getattr(OneHotEncoder(), method)
    with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"):
        fit_method(X)
@pytest.mark.parametrize("method", ["fit", "fit_transform"])
def test_X_is_not_1D_pandas(method):
    """A pandas Series is rejected by `fit` and `fit_transform`."""
    pd = pytest.importorskip("pandas")
    series = pd.Series([6, 3, 4, 6])
    fit_method = getattr(OneHotEncoder(), method)
    with pytest.raises(ValueError, match="Expected 2D array, got 1D array instead"):
        fit_method(series)
@pytest.mark.parametrize(
    "X, cat_exp, cat_dtype",
    [
        ([["abc", 55], ["def", 55]], [["abc", "def"], [55]], np.object_),
        (np.array([[1, 2], [3, 2]]), [[1, 3], [2]], np.integer),
        (
            np.array([["A", "cat"], ["B", "cat"]], dtype=object),
            [["A", "B"], ["cat"]],
            np.object_,
        ),
        (np.array([["A", "cat"], ["B", "cat"]]), [["A", "B"], ["cat"]], np.str_),
        (np.array([[1, 2], [np.nan, 2]]), [[1, np.nan], [2]], np.float64),
        (
            np.array([["A", np.nan], [None, np.nan]], dtype=object),
            [["A", None], [np.nan]],
            np.object_,
        ),
        (
            np.array([["A", float("nan")], [None, float("nan")]], dtype=object),
            [["A", None], [float("nan")]],
            np.object_,
        ),
    ],
    ids=[
        "mixed",
        "numeric",
        "object",
        "string",
        "missing-float",
        "missing-np.nan-object",
        "missing-float-nan-object",
    ],
)
def test_one_hot_encoder_categories(X, cat_exp, cat_dtype):
    """Check the categories learned with `categories="auto"`.

    The order of categories should not depend on the order of samples, so
    the checks run on `X` and on `X` reversed.
    """
    for Xi in [X, X[::-1]]:
        enc = OneHotEncoder(categories="auto")
        enc.fit(Xi)
        assert isinstance(enc.categories_, list)
        # `zip` silently stops at the shorter input, so make sure every
        # feature has an expected entry before comparing pairwise.
        assert len(enc.categories_) == len(cat_exp)
        for res, exp in zip(enc.categories_, cat_exp):
            res_list = res.tolist()
            if is_scalar_nan(exp[-1]):
                # nan != nan, so a trailing nan is checked separately from
                # the remaining categories.
                assert is_scalar_nan(res_list[-1])
                assert res_list[:-1] == exp[:-1]
            else:
                assert res_list == exp
            assert np.issubdtype(res.dtype, cat_dtype)
@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"])
@pytest.mark.parametrize(
    "X, X2, cats, cat_dtype",
    [
        (
            np.array([["a", "b"]], dtype=object).T,
            np.array([["a", "d"]], dtype=object).T,
            [["a", "b", "c"]],
            np.object_,
        ),
        (
            np.array([[1, 2]], dtype="int64").T,
            np.array([[1, 4]], dtype="int64").T,
            [[1, 2, 3]],
            np.int64,
        ),
        (
            np.array([["a", "b"]], dtype=object).T,
            np.array([["a", "d"]], dtype=object).T,
            [np.array(["a", "b", "c"])],
            np.object_,
        ),
        (
            np.array([[None, "a"]], dtype=object).T,
            np.array([[None, "b"]], dtype=object).T,
            [[None, "a", "z"]],
            object,
        ),
        (
            np.array([["a", "b"]], dtype=object).T,
            np.array([["a", np.nan]], dtype=object).T,
            [["a", "b", "z"]],
            object,
        ),
        (
            np.array([["a", None]], dtype=object).T,
            np.array([["a", np.nan]], dtype=object).T,
            [["a", None, "z"]],
            object,
        ),
        (
            np.array([["a", np.nan]], dtype=object).T,
            np.array([["a", None]], dtype=object).T,
            [["a", np.nan, "z"]],
            object,
        ),
    ],
    ids=[
        "object",
        "numeric",
        "object-string",
        "object-string-none",
        "object-string-nan",
        "object-None-and-nan",
        "object-nan-and-None",
    ],
)
def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype, handle_unknown):
    """Check encoding and unknown handling with user-specified categories."""
    requested = list(cats[0])

    strict_enc = OneHotEncoder(categories=cats)
    encoded = strict_enc.fit_transform(X).toarray()
    assert_array_equal(encoded, np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]))
    assert list(strict_enc.categories[0]) == requested
    fitted_categories = strict_enc.categories_[0]
    assert fitted_categories.tolist() == requested
    # Manually specified categories should have the same dtype as the data
    # when coerced from lists.
    assert fitted_categories.dtype == cat_dtype

    # When categories are specified manually, unknown categories should
    # already raise when fitting.
    with pytest.raises(ValueError, match="Found unknown categories"):
        OneHotEncoder(categories=cats).fit(X2)

    # With a lenient `handle_unknown`, the unknown sample encodes as zeros.
    lenient_enc = OneHotEncoder(categories=cats, handle_unknown=handle_unknown)
    encoded = lenient_enc.fit(X2).transform(X2).toarray()
    assert_array_equal(encoded, np.array([[1.0, 0.0, 0.0], [0.0, 0.0, 0.0]]))
def test_one_hot_encoder_unsorted_categories():
    """Unsorted categories are accepted for strings but not for numbers."""
    X = np.array([["a", "b"]], dtype=object).T
    unsorted_strings = ["b", "a", "c"]
    enc = OneHotEncoder(categories=[unsorted_strings])
    exp = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0]])

    assert_array_equal(enc.fit(X).transform(X).toarray(), exp)
    assert_array_equal(enc.fit_transform(X).toarray(), exp)
    fitted_categories = enc.categories_[0]
    assert fitted_categories.tolist() == unsorted_strings
    assert np.issubdtype(fitted_categories.dtype, np.object_)

    msg = "Unsorted categories are not supported"
    numeric_cases = [
        # unsorted passed categories still raise for numerical values
        (np.array([[1, 2]]).T, [[2, 1, 3]]),
        # np.nan must be the last category in categories[0] to be considered
        # sorted
        (np.array([[1, 2, np.nan]]).T, [[1, np.nan, 2]]),
    ]
    for X_numeric, categories in numeric_cases:
        enc = OneHotEncoder(categories=categories)
        with pytest.raises(ValueError, match=msg):
            enc.fit_transform(X_numeric)
def test_one_hot_encoder_specified_categories_mixed_columns():
    """Specified categories with a string column and an integer column."""
    X = np.array([["a", "b"], [0, 2]], dtype=object).T
    string_cats = ["a", "b", "c"]
    integer_cats = [0, 1, 2]
    enc = OneHotEncoder(categories=[string_cats, integer_cats])

    encoded = enc.fit_transform(X).toarray()
    assert_array_equal(
        encoded,
        np.array([[1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 1.0]]),
    )

    fitted_strings = enc.categories_[0]
    assert fitted_strings.tolist() == string_cats
    assert np.issubdtype(fitted_strings.dtype, np.object_)

    fitted_integers = enc.categories_[1]
    assert fitted_integers.tolist() == integer_cats
    # Integer categories, but coming from object dtype data.
    assert np.issubdtype(fitted_integers.dtype, np.object_)
def test_one_hot_encoder_pandas():
    """Encode a DataFrame holding a string column and an integer column."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})
    encoded = check_categorical_onehot(frame)
    assert_allclose(encoded, [[1, 0, 1, 0], [0, 1, 0, 1]])
@pytest.mark.parametrize(
    "drop, expected_names",
    [
        ("first", ["x0_c", "x2_b"]),
        ("if_binary", ["x0_c", "x1_2", "x2_b"]),
        (["c", 2, "b"], ["x0_b", "x2_a"]),
    ],
    ids=["first", "binary", "manual"],
)
def test_one_hot_encoder_feature_names_drop(drop, expected_names):
    """Output feature names leave out the dropped categories."""
    X = [["c", 2, "a"], ["b", 2, "b"]]
    encoder = OneHotEncoder(drop=drop).fit(X)
    assert_array_equal(expected_names, encoder.get_feature_names_out())
def test_one_hot_encoder_drop_equals_if_binary():
    """Check `drop_idx_` and the encoded output when `drop="if_binary"`."""
    cases = [
        # Canonical case
        (
            [[10, "yes"], [20, "no"], [30, "yes"]],
            [[1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 1.0]],
            [None, 0],
        ),
        # with only one cat, the behaviour is equivalent to drop=None
        (
            [["true", "a"], ["false", "a"], ["false", "a"]],
            [[1.0, 1.0], [0.0, 1.0], [0.0, 1.0]],
            [0, None],
        ),
    ]
    for X, expected, expected_drop_idx in cases:
        ohe = OneHotEncoder(drop="if_binary", sparse_output=False)
        result = ohe.fit_transform(X)
        assert_array_equal(ohe.drop_idx_, np.array(expected_drop_idx))
        assert_allclose(result, np.array(expected))
@pytest.mark.parametrize(
    "X",
    [
        [["abc", 2, 55], ["def", 1, 55]],
        np.array([[10, 2, 55], [20, 1, 55]]),
        np.array([["a", "B", "cat"], ["b", "A", "cat"]], dtype=object),
    ],
    ids=["mixed", "numeric", "object"],
)
def test_ordinal_encoder(X):
    """Ordinal codes are the same with the default and with an int64 dtype."""
    expected_codes = np.array([[0, 1, 0], [1, 0, 0]], dtype="int64")

    default_encoded = OrdinalEncoder().fit_transform(X)
    assert_array_equal(default_encoded, expected_codes.astype("float64"))

    int_encoded = OrdinalEncoder(dtype="int64").fit_transform(X)
    assert_array_equal(int_encoded, expected_codes)
@pytest.mark.parametrize(
    "X, X2, cats, cat_dtype",
    [
        (
            np.array([["a", "b"]], dtype=object).T,
            np.array([["a", "d"]], dtype=object).T,
            [["a", "b", "c"]],
            np.object_,
        ),
        (
            np.array([[1, 2]], dtype="int64").T,
            np.array([[1, 4]], dtype="int64").T,
            [[1, 2, 3]],
            np.int64,
        ),
        (
            np.array([["a", "b"]], dtype=object).T,
            np.array([["a", "d"]], dtype=object).T,
            [np.array(["a", "b", "c"])],
            np.object_,
        ),
    ],
    ids=["object", "numeric", "object-string-cat"],
)
def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
    """Check `OrdinalEncoder` with user-specified categories."""
    requested = list(cats[0])

    enc = OrdinalEncoder(categories=cats)
    assert_array_equal(enc.fit_transform(X), np.array([[0.0], [1.0]]))
    assert list(enc.categories[0]) == requested
    fitted_categories = enc.categories_[0]
    assert fitted_categories.tolist() == requested
    # Manually specified categories should have the same dtype as the data
    # when coerced from lists.
    assert fitted_categories.dtype == cat_dtype

    # When categories are specified manually, unknown categories should
    # already raise when fitting.
    with pytest.raises(ValueError, match="Found unknown categories"):
        OrdinalEncoder(categories=cats).fit(X2)
def test_ordinal_encoder_inverse():
    """`inverse_transform` recovers the input and rejects a wrong shape."""
    rows = [["abc", 2, 55], ["def", 1, 55]]
    enc = OrdinalEncoder()
    round_trip = enc.inverse_transform(enc.fit_transform(rows))
    assert_array_equal(round_trip, np.array(rows, dtype=object))

    # Four columns are passed to an encoder fitted on three features.
    wrong_shape = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
    msg = re.escape("Shape of the passed X data is not correct")
    with pytest.raises(ValueError, match=msg):
        enc.inverse_transform(wrong_shape)
def test_ordinal_encoder_handle_unknowns_string():
    """Unknown strings encode to `unknown_value` and invert to `None`."""
    X_fit = np.array([["a", "x"], ["b", "y"], ["c", "z"]], dtype=object)
    X_trans = np.array([["c", "xy"], ["bla", "y"], ["a", "x"]], dtype=object)
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-2)
    enc.fit(X_fit)

    encoded = enc.transform(X_trans)
    assert_array_equal(encoded, np.array([[2, -2], [-2, 1], [0, 0]], dtype="int64"))

    inverted = enc.inverse_transform(encoded)
    expected_inverse = np.array(
        [["c", None], [None, "y"], ["a", "x"]], dtype=object
    )
    assert_array_equal(inverted, expected_inverse)
@pytest.mark.parametrize("dtype", [float, int])
def test_ordinal_encoder_handle_unknowns_numeric(dtype):
    """Unknown numbers encode to `unknown_value` and invert to `None`."""
    X_fit = np.array([[1, 7], [2, 8], [3, 9]], dtype=dtype)
    X_trans = np.array([[3, 12], [23, 8], [1, 7]], dtype=dtype)
    enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)
    enc.fit(X_fit)

    encoded = enc.transform(X_trans)
    expected_codes = np.array([[2, -999], [-999, 1], [0, 0]], dtype="int64")
    assert_array_equal(encoded, expected_codes)

    inverted = enc.inverse_transform(encoded)
    expected_inverse = np.array([[3, None], [None, 8], [1, 7]], dtype=object)
    assert_array_equal(inverted, expected_inverse)
def test_ordinal_encoder_handle_unknowns_nan():
    """Make sure that `unknown_value=np.nan` works properly."""
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=np.nan
    )
    encoder.fit(np.array([[1], [2], [3]]))

    # 4 was not seen during fit and must be encoded as nan.
    encoded = encoder.transform([[1], [2], [4]])
    assert_array_equal(encoded, [[0], [1], [np.nan]])
def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype():
    """An error is raised for `unknown_value=np.nan` with a non-float dtype."""
    X_fit = np.array([[1], [2], [3]])
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=np.nan, dtype=int
    )
    expected_msg = "dtype parameter should be a float dtype"
    with pytest.raises(ValueError, match=expected_msg):
        encoder.fit(X_fit)
def test_ordinal_encoder_raise_categories_shape():
    """A flat list of categories is rejected with a shape-mismatch error."""
    X = np.array([["Low", "Medium", "High", "Medium", "Low"]], dtype=object).T
    # A flat list is passed here, not one list of categories per feature.
    flat_categories = ["Low", "Medium", "High"]
    encoder = OrdinalEncoder(categories=flat_categories)
    with pytest.raises(
        ValueError, match="Shape mismatch: if categories is an array,"
    ):
        encoder.fit(X)
def test_encoder_dtypes():
    """Check that dtypes are preserved when determining categories."""
    enc = OneHotEncoder(categories="auto")
    exp = np.array([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0]], dtype="float64")

    typed_arrays = [
        np.array([[1, 2], [3, 4]], dtype="int64"),
        np.array([[1, 2], [3, 4]], dtype="float64"),
        np.array([["a", "b"], ["c", "d"]]),  # str dtype
        np.array([[b"a", b"b"], [b"c", b"d"]]),  # bytes dtype
        np.array([[1, "a"], [3, "b"]], dtype="object"),
    ]
    for X in typed_arrays:
        enc.fit(X)
        for i in range(2):
            assert enc.categories_[i].dtype == X.dtype
        assert_array_equal(enc.transform(X).toarray(), exp)

    # A plain list of integers yields integer categories.
    X = [[1, 2], [3, 4]]
    enc.fit(X)
    for i in range(2):
        assert np.issubdtype(enc.categories_[i].dtype, np.integer)
    assert_array_equal(enc.transform(X).toarray(), exp)

    # A plain list mixing integers and strings yields object categories.
    X = [[1, "a"], [3, "b"]]
    enc.fit(X)
    for i in range(2):
        assert enc.categories_[i].dtype == "object"
    assert_array_equal(enc.transform(X).toarray(), exp)
def test_encoder_dtypes_pandas():
    """Check that DataFrame column dtypes carry over to ``categories_``.

    DataFrame counterpart of ``test_encoder_dtypes``. Skipped when pandas is
    not installed.
    """
    pd = pytest.importorskip("pandas")

    enc = OneHotEncoder(categories="auto")
    exp = np.array(
        [[1.0, 0.0, 1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0, 0.0, 1.0]],
        dtype="float64",
    )

    # Homogeneous int64 frame: all three columns must keep the int64 dtype.
    X = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}, dtype="int64")
    enc.fit(X)
    assert len(enc.categories_) == X.shape[1]
    assert all(cats.dtype == "int64" for cats in enc.categories_)
    assert_array_equal(enc.transform(X).toarray(), exp)

    # Heterogeneous frame: each column's own dtype must be kept.
    X = pd.DataFrame({"A": [1, 2], "B": ["a", "b"], "C": [3.0, 4.0]})
    X_type = [X["A"].dtype, X["B"].dtype, X["C"].dtype]
    enc.fit(X)
    assert all(enc.categories_[i].dtype == X_type[i] for i in range(3))
    assert_array_equal(enc.transform(X).toarray(), exp)
def test_one_hot_encoder_warning():
    """``fit_transform`` on mixed str/int rows must not emit any warning."""
    X = [["Male", 1], ["Female", 3]]
    encoder = OneHotEncoder()
    np.testing.assert_no_warnings(encoder.fit_transform, X)
@pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")])
def test_one_hot_encoder_drop_manual(missing_value):
    """Check a user-provided ``drop`` list whose last entry is a missing value."""
    cats_to_drop = ["def", 12, 3, 56, missing_value]
    X = [
        ["abc", 12, 2, 55, "a"],
        ["def", 12, 1, 55, "a"],
        ["def", 12, 3, 56, missing_value],
    ]
    encoder = OneHotEncoder(drop=cats_to_drop)

    encoded = encoder.fit_transform(X).toarray()
    expected = [[1, 0, 1, 1, 1], [0, 1, 0, 1, 1], [0, 0, 0, 0, 0]]
    assert_array_equal(encoded, expected)
    # The parameter object itself must be left untouched by fit.
    assert encoder.drop is cats_to_drop

    dropped = [
        feature_cats[idx]
        for feature_cats, idx in zip(encoder.categories_, encoder.drop_idx_)
    ]
    decoded = encoder.inverse_transform(encoded)
    X_obj = np.array(X, dtype=object)

    if not is_scalar_nan(cats_to_drop[-1]):
        # Non-nan missing value: plain equality comparisons work.
        assert_array_equal(dropped, cats_to_drop)
        assert_array_equal(X_obj, decoded)
        return

    # Last value is nan: compare it with is_scalar_nan instead of equality.
    assert_array_equal(dropped[:-1], cats_to_drop[:-1])
    assert is_scalar_nan(dropped[-1])
    assert is_scalar_nan(cats_to_drop[-1])

    # Compare every column except the last one, which holds missing values.
    assert_array_equal(X_obj[:, :-1], decoded[:, :-1])

    # Last row: non-missing columns match, and the final entry is nan.
    assert_array_equal(X_obj[-1, :-1], decoded[-1, :-1])
    assert is_scalar_nan(X_obj[-1, -1])
    assert is_scalar_nan(decoded[-1, -1])
@pytest.mark.parametrize("drop", [["abc", 3], ["abc", 3, 41, "a"]])
def test_invalid_drop_length(drop):
    """A ``drop`` list with 2 or 4 entries is rejected for 3-column data."""
    X = [["abc", 2, 55], ["def", 1, 55], ["def", 3, 59]]
    encoder = OneHotEncoder(drop=drop)

    expected_msg = "`drop` should have length equal to the number"
    with pytest.raises(ValueError, match=expected_msg):
        encoder.fit(X)
@pytest.mark.parametrize("density", [True, False], ids=["sparse", "dense"])
@pytest.mark.parametrize("drop", ["first", ["a", 2, "b"]], ids=["first", "manual"])
def test_categories(density, drop):
    """``drop`` must not change ``categories_``; ``drop_idx_`` must match it."""
    X = [["c", 1, "a"], ["a", 2, "b"]]

    reference = OneHotEncoder(sparse_output=density)
    dropping = OneHotEncoder(sparse_output=density, drop=drop)
    reference.fit(X)
    dropping.fit(X)

    assert_array_equal(reference.categories_, dropping.categories_)

    if drop != "first":
        # Manual drop: each index must point at the requested category.
        triples = zip(drop, dropping.drop_idx_, dropping.categories_)
        for drop_cat, drop_idx, cat_list in triples:
            assert cat_list[int(drop_idx)] == drop_cat
    else:
        assert_array_equal(dropping.drop_idx_, 0)

    assert isinstance(dropping.drop_idx_, np.ndarray)
    assert dropping.drop_idx_.dtype == object
@pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder])
def test_encoders_has_categorical_tags(Encoder):
    """Both encoders list "categorical" in their ``X_types`` tag."""
    x_types = Encoder()._get_tags()["X_types"]
    assert "categorical" in x_types
@pytest.mark.parametrize(
    "kwargs",
    [
        {"max_categories": 2},
        {"min_frequency": 11},
        {"min_frequency": 0.29},
        {"max_categories": 2, "min_frequency": 6},
        {"max_categories": 4, "min_frequency": 12},
    ],
)
@pytest.mark.parametrize("categories", ["auto", [["a", "b", "c", "d"]]])
def test_ohe_infrequent_two_levels(kwargs, categories):
    """Check that each parameter set groups 'a', 'c' and 'd' as infrequent."""
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OneHotEncoder(
        categories=categories,
        handle_unknown="infrequent_if_exist",
        sparse_output=False,
        **kwargs,
    )
    encoder.fit(X_train)
    assert_array_equal(encoder.infrequent_categories_, [["a", "c", "d"]])

    # 'e' was never seen; it is expected in the infrequent column as well.
    X_test = [["b"], ["a"], ["c"], ["d"], ["e"]]
    encoded = encoder.transform(X_test)
    assert_allclose(np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]), encoded)

    decoded = encoder.inverse_transform(encoded)
    expected_decoded = [["b"]] + [["infrequent_sklearn"]] * 4
    assert_array_equal(expected_decoded, decoded)

    names = encoder.get_feature_names_out()
    assert_array_equal(["x0_b", "x0_infrequent_sklearn"], names)
@pytest.mark.parametrize("drop", ["if_binary", "first", ["b"]])
def test_ohe_infrequent_two_levels_drop_frequent(drop):
    """With two levels, every ``drop`` variant removes the frequent 'b'."""
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OneHotEncoder(
        handle_unknown="infrequent_if_exist",
        sparse_output=False,
        max_categories=2,
        drop=drop,
    )
    encoder.fit(X_train)

    dropped_category = encoder.categories_[0][encoder.drop_idx_[0]]
    assert dropped_category == "b"

    encoded = encoder.transform(np.array([["b"], ["c"]]))
    assert_allclose([[0], [1]], encoded)

    names = encoder.get_feature_names_out()
    assert_array_equal(["x0_infrequent_sklearn"], names)

    decoded = encoder.inverse_transform(encoded)
    assert_array_equal([["b"], ["infrequent_sklearn"]], decoded)
@pytest.mark.parametrize("drop", [["a"], ["d"]])
def test_ohe_infrequent_two_levels_drop_infrequent_errors(drop):
    """With two levels, dropping an infrequent category raises on ``fit``."""
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OneHotEncoder(
        handle_unknown="infrequent_if_exist",
        sparse_output=False,
        max_categories=2,
        drop=drop,
    )

    expected_msg = (
        f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent"
    )
    with pytest.raises(ValueError, match=expected_msg):
        encoder.fit(X_train)
@pytest.mark.parametrize(
    "kwargs",
    [
        {"max_categories": 3},
        {"min_frequency": 6},
        {"min_frequency": 9},
        {"min_frequency": 0.24},
        {"min_frequency": 0.16},
        {"max_categories": 3, "min_frequency": 8},
        {"max_categories": 4, "min_frequency": 6},
    ],
)
def test_ohe_infrequent_three_levels(kwargs):
    """Check that each parameter set groups 'a' and 'd' as infrequent."""
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OneHotEncoder(
        handle_unknown="infrequent_if_exist", sparse_output=False, **kwargs
    )
    encoder.fit(X_train)
    assert_array_equal(encoder.infrequent_categories_, [["a", "d"]])

    # Output columns are: 'b', 'c', infrequent. Unknown 'e' goes to infrequent.
    X_test = [["b"], ["a"], ["c"], ["d"], ["e"]]
    encoded = encoder.transform(X_test)
    expected_encoded = np.array(
        [[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1], [0, 0, 1]]
    )
    assert_allclose(expected_encoded, encoded)

    decoded = encoder.inverse_transform(encoded)
    expected_decoded = [
        ["b"],
        ["infrequent_sklearn"],
        ["c"],
        ["infrequent_sklearn"],
        ["infrequent_sklearn"],
    ]
    assert_array_equal(expected_decoded, decoded)

    names = encoder.get_feature_names_out()
    assert_array_equal(["x0_b", "x0_c", "x0_infrequent_sklearn"], names)
@pytest.mark.parametrize("drop", ["first", ["b"]])
def test_ohe_infrequent_three_levels_drop_frequent(drop):
    """With three levels, dropping the frequent 'b' leaves two columns."""
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OneHotEncoder(
        handle_unknown="infrequent_if_exist",
        sparse_output=False,
        max_categories=3,
        drop=drop,
    )
    encoder.fit(X_train)

    X_test = np.array([["b"], ["c"], ["d"]])
    assert_allclose([[0, 0], [1, 0], [0, 1]], encoder.transform(X_test))

    # Refit with handle_unknown="ignore": unknown 'e' warns and encodes as zeros.
    encoder.set_params(handle_unknown="ignore")
    encoder.fit(X_train)
    with pytest.warns(UserWarning, match="Found unknown categories"):
        encoded = encoder.transform([["b"], ["e"]])

    assert_allclose([[0, 0], [0, 0]], encoded)
@pytest.mark.parametrize("drop", [["a"], ["d"]])
def test_ohe_infrequent_three_levels_drop_infrequent_errors(drop):
    """With three levels, dropping an infrequent category raises on ``fit``."""
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OneHotEncoder(
        handle_unknown="infrequent_if_exist",
        sparse_output=False,
        max_categories=3,
        drop=drop,
    )

    expected_msg = (
        f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent"
    )
    with pytest.raises(ValueError, match=expected_msg):
        encoder.fit(X_train)
def test_ohe_infrequent_handle_unknown_error():
    """Check infrequent grouping together with ``handle_unknown="error"``.

    Known categories (frequent or infrequent) are encoded normally, while a
    category never seen during ``fit`` raises a ``ValueError``.
    """
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OneHotEncoder(
        handle_unknown="error", sparse_output=False, max_categories=3
    )
    encoder.fit(X_train)
    assert_array_equal(encoder.infrequent_categories_, [["a", "d"]])

    # Every category below was present in the training data.
    known = [["b"], ["a"], ["c"], ["d"]]
    encoded = encoder.transform(known)
    assert_allclose(np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]]), encoded)

    # 'bad' was never seen, so transform must raise.
    unknown = [["bad"]]
    expected_msg = r"Found unknown categories \['bad'\] in column 0"
    with pytest.raises(ValueError, match=expected_msg):
        encoder.transform(unknown)
@pytest.mark.parametrize(
    "kwargs", [{"max_categories": 3, "min_frequency": 1}, {"min_frequency": 4}]
)
def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs):
    """Only 'a' is frequent; every other category is encoded as infrequent."""
    X_train = np.array([["a"] * 5 + ["e"] * 30], dtype=object).T
    encoder = OneHotEncoder(
        categories=[["c", "d", "a", "b"]],
        sparse_output=False,
        handle_unknown="infrequent_if_exist",
        **kwargs,
    )
    encoder.fit(X_train)

    encoded = encoder.transform([["a"], ["b"], ["c"], ["d"], ["e"]])
    assert_allclose(np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]), encoded)

    # Each drop variant removes 'a', leaving only the infrequent column.
    X_test = [["a"], ["c"]]
    for drop in ("first", "if_binary", ["a"]):
        encoder.set_params(drop=drop)
        encoder.fit(X_train)
        assert_allclose([[0], [1]], encoder.transform(X_test))
def test_ohe_infrequent_two_levels_user_cats():
    """User-provided category order is kept in ``infrequent_categories_``."""
    X_train = np.array(
        [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
    ).T
    encoder = OneHotEncoder(
        categories=[["c", "d", "a", "b"]],
        sparse_output=False,
        handle_unknown="infrequent_if_exist",
        max_categories=2,
    )
    encoder.fit(X_train)
    assert_array_equal(encoder.infrequent_categories_, [["c", "d", "a"]])

    X_test = [["b"], ["a"], ["c"], ["d"], ["e"]]
    encoded = encoder.transform(X_test)
    assert_allclose(np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]]), encoded)

    # `inverse_transform` reports infrequent categories with the
    # "infrequent_sklearn" placeholder.
    decoded = encoder.inverse_transform(encoded)
    expected_decoded = [["b"]] + [["infrequent_sklearn"]] * 4
    assert_array_equal(expected_decoded, decoded)
def test_ohe_infrequent_three_levels_user_cats():
    """User-provided category order drives the output column order.

    With categories given as c, d, b, a, 'c' is encoded in the first column
    and 'b' in the second one; 'd' and 'a' share the infrequent column.
    """
    X_train = np.array(
        [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
    ).T
    encoder = OneHotEncoder(
        categories=[["c", "d", "b", "a"]],
        sparse_output=False,
        handle_unknown="infrequent_if_exist",
        max_categories=3,
    )
    encoder.fit(X_train)
    assert_array_equal(encoder.infrequent_categories_, [["d", "a"]])

    X_test = [["b"], ["a"], ["c"], ["d"], ["e"]]
    encoded = encoder.transform(X_test)
    expected_encoded = np.array(
        [[0, 1, 0], [0, 0, 1], [1, 0, 0], [0, 0, 1], [0, 0, 1]]
    )
    assert_allclose(expected_encoded, encoded)

    # `inverse_transform` reports infrequent categories with the
    # "infrequent_sklearn" placeholder.
    decoded = encoder.inverse_transform(encoded)
    expected_decoded = [
        ["b"],
        ["infrequent_sklearn"],
        ["c"],
        ["infrequent_sklearn"],
        ["infrequent_sklearn"],
    ]
    assert_array_equal(expected_decoded, decoded)
def test_ohe_infrequent_mixed():
    """Only feature 0 has infrequent categories; feature 1 is binary."""
    # Feature 0: categories 1 and 2 are infrequent.
    # Feature 1: no category is infrequent.
    feature_0 = [0, 1, 3, 3, 3, 3, 2, 0, 3]
    feature_1 = [0, 0, 0, 0, 1, 1, 1, 1, 1]
    X = np.c_[feature_0, feature_1]

    encoder = OneHotEncoder(max_categories=3, drop="if_binary", sparse_output=False)
    encoder.fit(X)

    encoded = encoder.transform([[3, 0], [1, 1]])

    # Feature 1 is binary, so its category 0 is dropped and one column remains.
    assert_allclose(encoded, [[0, 1, 0, 0], [0, 0, 1, 1]])
def test_ohe_infrequent_multiple_categories():
    """Check infrequent-category handling on a three-feature matrix."""
    X = np.c_[
        [0, 1, 3, 3, 3, 3, 2, 0, 3],
        [0, 0, 5, 1, 1, 10, 5, 5, 0],
        [1, 0, 1, 0, 1, 0, 1, 0, 1],
    ]
    encoder = OneHotEncoder(
        categories="auto", max_categories=3, handle_unknown="infrequent_if_exist"
    )

    # Feature 0: 1 and 2 are infrequent.
    # Feature 1: 1 and 10 are infrequent.
    # Feature 2: nothing is infrequent.
    encoded_train = encoder.fit_transform(X).toarray()
    assert_array_equal(encoder.infrequent_categories_[0], [1, 2])
    assert_array_equal(encoder.infrequent_categories_[1], [1, 10])
    assert_array_equal(encoder.infrequent_categories_[2], None)

    # Infrequent categories share a single "infrequent_sklearn" output name
    # per feature.
    expected_names = [
        "x0_0",
        "x0_3",
        "x0_infrequent_sklearn",
        "x1_0",
        "x1_5",
        "x1_infrequent_sklearn",
        "x2_0",
        "x2_1",
    ]
    assert_array_equal(expected_names, encoder.get_feature_names_out())

    expected_train = [
        [1, 0, 0, 1, 0, 0, 0, 1],
        [0, 0, 1, 1, 0, 0, 1, 0],
        [0, 1, 0, 0, 1, 0, 0, 1],
        [0, 1, 0, 0, 0, 1, 1, 0],
        [0, 1, 0, 0, 0, 1, 0, 1],
        [0, 1, 0, 0, 0, 1, 1, 0],
        [0, 0, 1, 0, 1, 0, 0, 1],
        [1, 0, 0, 0, 1, 0, 1, 0],
        [0, 1, 0, 1, 0, 0, 0, 1],
    ]
    assert_allclose(expected_train, encoded_train)

    # Unknown values: feature 2 has no infrequent column, so its unknown
    # values are encoded as all zeros.
    X_test = [[3, 1, 2], [4, 0, 3]]
    encoded_test = encoder.transform(X_test)
    assert_allclose(
        [[0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0]],
        encoded_test.toarray(),
    )

    decoded = encoder.inverse_transform(encoded_test)
    expected_decoded = np.array(
        [[3, "infrequent_sklearn", None], ["infrequent_sklearn", 0, None]],
        dtype=object,
    )
    assert_array_equal(expected_decoded, decoded)

    # With handle_unknown="error" the same unknown values must raise.
    encoder = OneHotEncoder(
        categories="auto", max_categories=3, handle_unknown="error"
    )
    encoder.fit(X)
    with pytest.raises(ValueError, match="Found unknown categories"):
        encoder.transform(X_test)

    # Inputs made only of known (frequent or infrequent) categories still work.
    X_test = [[1, 1, 1], [3, 10, 0]]
    encoded_test = encoder.transform(X_test)
    assert_allclose(
        [[0, 0, 1, 0, 0, 1, 0, 1], [0, 1, 0, 0, 0, 1, 1, 0]],
        encoded_test.toarray(),
    )

    decoded = encoder.inverse_transform(encoded_test)
    expected_decoded = np.array(
        [["infrequent_sklearn", "infrequent_sklearn", 1], [3, "infrequent_sklearn", 0]],
        dtype=object,
    )
    assert_array_equal(expected_decoded, decoded)
def test_ohe_infrequent_multiple_categories_dtypes():
    """Check infrequent categories on a DataFrame mixing str and int columns."""
    pd = pytest.importorskip("pandas")
    column_order = ["str", "int"]
    X = pd.DataFrame(
        {
            "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"],
            "int": [5, 3, 0, 10, 10, 12, 0, 3, 5],
        },
        columns=column_order,
    )
    encoder = OneHotEncoder(
        categories="auto", max_categories=3, handle_unknown="infrequent_if_exist"
    )

    # "str": 'a', 'b' and 'c' all occur twice; 'a' and 'b' are the ones
    # expected to be infrequent.
    # "int": 0, 3, 5 and 10 occur twice and 12 once; 0, 3 and 12 are the ones
    # expected to be infrequent.
    encoded_train = encoder.fit_transform(X).toarray()
    assert_array_equal(encoder.infrequent_categories_[0], ["a", "b"])
    assert_array_equal(encoder.infrequent_categories_[1], [0, 3, 12])

    expected_train = [
        [0, 0, 1, 1, 0, 0],
        [0, 1, 0, 0, 0, 1],
        [1, 0, 0, 0, 0, 1],
        [0, 1, 0, 0, 1, 0],
        [0, 1, 0, 0, 1, 0],
        [0, 0, 1, 0, 0, 1],
        [1, 0, 0, 0, 0, 1],
        [0, 0, 1, 0, 0, 1],
        [0, 0, 1, 1, 0, 0],
    ]
    assert_allclose(expected_train, encoded_train)

    # Input containing an unknown value (14) in the "int" column.
    X_test = pd.DataFrame({"str": ["b", "f"], "int": [14, 12]}, columns=column_order)
    encoded_test = encoder.transform(X_test)
    assert_allclose(
        [[0, 0, 1, 0, 0, 1], [0, 1, 0, 0, 0, 1]], encoded_test.toarray()
    )

    decoded = encoder.inverse_transform(encoded_test)
    expected_decoded = np.array(
        [["infrequent_sklearn", "infrequent_sklearn"], ["f", "infrequent_sklearn"]],
        dtype=object,
    )
    assert_array_equal(expected_decoded, decoded)

    # Input made only of known (frequent or infrequent) categories.
    X_test = pd.DataFrame({"str": ["c", "b"], "int": [12, 5]}, columns=column_order)
    encoded_test = encoder.transform(X_test).toarray()
    assert_allclose([[1, 0, 0, 0, 0, 1], [0, 0, 1, 1, 0, 0]], encoded_test)

    decoded = encoder.inverse_transform(encoded_test)
    expected_decoded = np.array(
        [["c", "infrequent_sklearn"], ["infrequent_sklearn", 5]], dtype=object
    )
    assert_array_equal(expected_decoded, decoded)
@pytest.mark.parametrize("kwargs", [{"min_frequency": 21, "max_categories": 1}])
def test_ohe_infrequent_one_level_errors(kwargs):
    """All categories are infrequent: output is one column holding a 1."""
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 2]).T

    encoder = OneHotEncoder(
        handle_unknown="infrequent_if_exist", sparse_output=False, **kwargs
    )
    encoder.fit(X_train)

    encoded = encoder.transform([["a"]])
    assert_allclose(encoded, [[1]])
@pytest.mark.parametrize("kwargs", [{"min_frequency": 2, "max_categories": 3}])
def test_ohe_infrequent_user_cats_unknown_training_errors(kwargs):
    """None of the user-provided categories occurs in the training data.

    Both a listed category ('a') and the unlisted training value ('e') are
    expected in the single output column.
    """
    X_train = np.array([["e"] * 3], dtype=object).T

    encoder = OneHotEncoder(
        categories=[["c", "d", "a", "b"]],
        sparse_output=False,
        handle_unknown="infrequent_if_exist",
        **kwargs,
    )
    encoder.fit(X_train)

    encoded = encoder.transform([["a"], ["e"]])
    assert_allclose(encoded, [[1], [1]])
# TODO(1.4): Remove when `sparse` parameter is replaced by `sparse_output`
def test_one_hot_encoder_sparse_deprecated():
    """Fitting with the legacy ``sparse`` parameter emits a FutureWarning."""
    X = [["Male", 1], ["Female", 3], ["Female", 2]]
    encoder = OneHotEncoder(sparse=False)

    expected_msg = "`sparse` was renamed to `sparse_output`"
    with pytest.warns(FutureWarning, match=expected_msg):
        encoder.fit(X)
# deliberately omit 'OS' as an invalid combo
@pytest.mark.parametrize(
    "input_dtype, category_dtype", ["OO", "OU", "UO", "UU", "SO", "SU", "SS"]
)
@pytest.mark.parametrize("array_type", ["list", "array", "dataframe"])
def test_encoders_string_categories(input_dtype, category_dtype, array_type):
    """Check encoding with object, unicode and byte-string dtypes.

    Each two-letter parameter is unpacked into the dtype code of the input
    data and the dtype code of the predefined categories.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677
    """
    X_train = np.array([["b"], ["a"]], dtype=input_dtype)
    categories = [np.array(["b", "a"], dtype=category_dtype)]
    X_test = _convert_container(
        [["a"], ["a"], ["b"], ["a"]], array_type, dtype=input_dtype
    )

    # One-hot: 'b' is column 0 and 'a' is column 1, following `categories`.
    one_hot = OneHotEncoder(categories=categories, sparse_output=False)
    one_hot.fit(X_train)
    one_hot_encoded = one_hot.transform(X_test)
    assert_allclose(one_hot_encoded, np.array([[0, 1], [0, 1], [1, 0], [0, 1]]))

    # Ordinal: 'b' -> 0 and 'a' -> 1, following `categories`.
    ordinal = OrdinalEncoder(categories=categories)
    ordinal.fit(X_train)
    ordinal_encoded = ordinal.transform(X_test)
    assert_array_equal(ordinal_encoded, np.array([[1], [1], [0], [1]]))
def test_mixed_string_bytes_categoricals():
    """Bytes categories combined with unicode data must raise on ``fit``.

    Categories defined as bytes cannot easily be compared to string data.
    """
    unicode_data = np.array([["b"], ["a"]], dtype="U")
    bytes_categories = [np.array(["b", "a"], dtype="S")]
    encoder = OneHotEncoder(categories=bytes_categories, sparse_output=False)

    # The message contains regex metacharacters, hence re.escape.
    expected_msg = re.escape(
        "In column 0, the predefined categories have type 'bytes' which is incompatible"
        " with values of type 'str_'."
    )
    with pytest.raises(ValueError, match=expected_msg):
        encoder.fit(unicode_data)
@pytest.mark.parametrize("missing_value", [np.nan, None])
def test_ohe_missing_values_get_feature_names(missing_value):
    """Feature names include an entry for the missing value (object dtype)."""
    column = np.array([["a", "b", missing_value, "a", missing_value]], dtype=object).T
    encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
    encoder.fit(column)

    expected_names = ["x0_a", "x0_b", f"x0_{missing_value}"]
    assert_array_equal(encoder.get_feature_names_out(), expected_names)
def test_ohe_missing_value_support_pandas():
    """Check a DataFrame with mixed dtypes and a missing value per column."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame(
        {
            "col1": ["dog", "cat", None, "cat"],
            "col2": np.array([3, 0, 4, np.nan], dtype=float),
        },
        columns=["col1", "col2"],
    )

    encoded = check_categorical_onehot(frame)

    expected = np.array(
        [
            [0, 1, 0, 0, 1, 0, 0],
            [1, 0, 0, 1, 0, 0, 0],
            [0, 0, 1, 0, 0, 1, 0],
            [1, 0, 0, 0, 0, 0, 1],
        ]
    )
    assert_allclose(encoded, expected)
@pytest.mark.parametrize("handle_unknown", ["infrequent_if_exist", "ignore"])
@pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"])
def test_ohe_missing_value_support_pandas_categorical(pd_nan_type, handle_unknown):
    """Check a pandas categorical column that contains a missing value."""
    pd = pytest.importorskip("pandas")

    if pd_nan_type == "pd.NA":
        pd_missing_value = pd.NA
    else:
        pd_missing_value = np.nan

    values = pd.Series(["c", "a", pd_missing_value, "b", "a"], dtype="category")
    frame = pd.DataFrame({"col1": values})

    encoder = OneHotEncoder(sparse_output=False, handle_unknown=handle_unknown)
    encoded = encoder.fit_transform(frame)

    expected = np.array(
        [
            [0, 0, 1, 0],
            [1, 0, 0, 0],
            [0, 0, 0, 1],
            [0, 1, 0, 0],
            [1, 0, 0, 0],
        ]
    )
    assert_allclose(expected, encoded)

    # One feature; its categories are a, b, c followed by a nan entry.
    assert len(encoder.categories_) == 1
    feature_categories = encoder.categories_[0]
    assert_array_equal(feature_categories[:-1], ["a", "b", "c"])
    assert np.isnan(feature_categories[-1])
@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"])
def test_ohe_drop_first_handle_unknown_ignore_warns(handle_unknown):
    """Check ``drop='first'`` with unknown categories during ``transform``.

    Covers ``handle_unknown='ignore'`` and ``'infrequent_if_exist'``.
    """
    X = [["a", 0], ["b", 2], ["b", 1]]
    encoder = OneHotEncoder(
        drop="first", sparse_output=False, handle_unknown=handle_unknown
    )

    encoded_train = encoder.fit_transform(X)
    expected_train = np.array(
        [
            [0, 0, 0],
            [1, 0, 1],
            [1, 1, 0],
        ]
    )
    assert_allclose(encoded_train, expected_train)

    # Both values of the test row are unknown categories.
    unknown_row = [["c", 3]]
    all_zeros = np.array([[0, 0, 0]])
    warn_msg = (
        r"Found unknown categories in columns \[0, 1\] during "
        "transform. These unknown categories will be encoded as all "
        "zeros"
    )
    with pytest.warns(UserWarning, match=warn_msg):
        encoded_unknown = encoder.transform(unknown_row)
    assert_allclose(encoded_unknown, all_zeros)

    # An all-zero row is decoded as "a" and 0 here.
    decoded = encoder.inverse_transform(all_zeros)
    assert_array_equal(decoded, np.array([["a", 0]], dtype=object))
@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"])
def test_ohe_drop_if_binary_handle_unknown_ignore_warns(handle_unknown):
    """Check ``drop='if_binary'`` with unknown categories during ``transform``.

    Covers ``handle_unknown='ignore'`` and ``'infrequent_if_exist'``.
    """
    X = [["a", 0], ["b", 2], ["b", 1]]
    encoder = OneHotEncoder(
        drop="if_binary", sparse_output=False, handle_unknown=handle_unknown
    )

    encoded_train = encoder.fit_transform(X)
    expected_train = np.array(
        [
            [0, 1, 0, 0],
            [1, 0, 0, 1],
            [1, 0, 1, 0],
        ]
    )
    assert_allclose(encoded_train, expected_train)

    # Both values of the test row are unknown categories.
    unknown_row = [["c", 3]]
    all_zeros = np.array([[0, 0, 0, 0]])
    warn_msg = (
        r"Found unknown categories in columns \[0, 1\] during "
        "transform. These unknown categories will be encoded as all "
        "zeros"
    )
    with pytest.warns(UserWarning, match=warn_msg):
        encoded_unknown = encoder.transform(unknown_row)
    assert_allclose(encoded_unknown, all_zeros)

    # An all-zero row is decoded as "a" for feature 0 and None for feature 1.
    decoded = encoder.inverse_transform(all_zeros)
    assert_array_equal(decoded, np.array([["a", None]], dtype=object))
@pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"])
def test_ohe_drop_first_explicit_categories(handle_unknown):
    """Check ``drop='first'`` when ``categories`` are passed explicitly.

    Covers ``handle_unknown='ignore'`` and ``'infrequent_if_exist'``.
    """
    X = [["a", 0], ["b", 2], ["b", 1]]
    encoder = OneHotEncoder(
        drop="first",
        sparse_output=False,
        handle_unknown=handle_unknown,
        categories=[["b", "a"], [1, 2]],
    )
    encoder.fit(X)

    # "c" is unknown in column 0; 1 is a listed category of column 1.
    X_test = [["c", 1]]
    warn_msg = (
        r"Found unknown categories in columns \[0\] during transform. "
        r"These unknown categories will be encoded as all zeros"
    )
    with pytest.warns(UserWarning, match=warn_msg):
        encoded = encoder.transform(X_test)
    assert_allclose(encoded, np.array([[0, 0]]))
def test_ohe_more_informative_error_message():
    """Pandas output combined with ``sparse_output=True`` raises a clear error."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame({"a": [1, 2, 3], "b": ["z", "b", "b"]}, columns=["a", "b"])

    encoder = OneHotEncoder(sparse_output=True)
    encoder.set_output(transform="pandas")

    expected_msg = (
        "Pandas output does not support sparse data. Set "
        "sparse_output=False to output pandas DataFrames or disable pandas output"
    )

    # The error must be raised by fit_transform ...
    with pytest.raises(ValueError, match=expected_msg):
        encoder.fit_transform(frame)

    # ... and by transform after a plain fit.
    encoder.fit(frame)
    with pytest.raises(ValueError, match=expected_msg):
        encoder.transform(frame)
def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype():
    """Fitting data that contains nan must fail when ``dtype=np.int32``."""
    column = np.array([[np.nan, 3.0, 1.0, 3.0]]).T
    encoder = OrdinalEncoder(dtype=np.int32)

    expected_msg = (
        r"There are missing values in features \[0\]. For OrdinalEncoder "
        f"to encode missing values with dtype: {np.int32}"
    )
    with pytest.raises(ValueError, match=expected_msg):
        encoder.fit(column)
@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2])
def test_ordinal_encoder_passthrough_missing_values_float(encoded_missing_value):
    """Check nan handling on float data, including the inverse round trip."""
    column = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T
    encoder = OrdinalEncoder(encoded_missing_value=encoded_missing_value)
    encoder.fit(column)

    # nan is listed last among the learned categories.
    assert len(encoder.categories_) == 1
    assert_allclose(encoder.categories_[0], [1.0, 3.0, np.nan])

    encoded = encoder.transform(column)
    assert_allclose(encoded, [[encoded_missing_value], [1.0], [0.0], [1.0]])

    decoded = encoder.inverse_transform(encoded)
    assert_allclose(decoded, column)
@pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"])
@pytest.mark.parametrize("encoded_missing_value", [np.nan, -2])
def test_ordinal_encoder_missing_value_support_pandas_categorical(
    pd_nan_type, encoded_missing_value
):
    """Check a pandas categorical column that contains a missing value."""
    pd = pytest.importorskip("pandas")

    if pd_nan_type == "pd.NA":
        pd_missing_value = pd.NA
    else:
        pd_missing_value = np.nan

    values = pd.Series(["c", "a", pd_missing_value, "b", "a"], dtype="category")
    frame = pd.DataFrame({"col1": values})

    encoder = OrdinalEncoder(encoded_missing_value=encoded_missing_value)
    encoder.fit(frame)

    # One feature; its categories are a, b, c and the last entry is nan.
    assert len(encoder.categories_) == 1
    feature_categories = encoder.categories_[0]
    assert_array_equal(feature_categories[:3], ["a", "b", "c"])
    assert np.isnan(feature_categories[-1])

    encoded = encoder.transform(frame)
    assert_allclose(encoded, [[2.0], [0.0], [encoded_missing_value], [1.0], [0.0]])

    # Round trip: the row at index 2 decodes to nan, the others to their labels.
    decoded = encoder.inverse_transform(encoded)
    assert decoded.shape == (5, 1)
    assert_array_equal(decoded[:2, 0], ["c", "a"])
    assert_array_equal(decoded[3:, 0], ["b", "a"])
    assert np.isnan(decoded[2, 0])
# NOTE(review): the first two parameter sets are identical (both use np.nan)
# even though their ids differ ("None" vs "nan") -- confirm whether the first
# one was meant to use None.
@pytest.mark.parametrize(
    "X, X2, cats, cat_dtype",
    [
        (
            np.array([["a", np.nan]], dtype=object).T,
            np.array([["a", "b"]], dtype=object).T,
            [np.array(["a", np.nan, "d"], dtype=object)],
            np.object_,
        ),
        (
            np.array([["a", np.nan]], dtype=object).T,
            np.array([["a", "b"]], dtype=object).T,
            [np.array(["a", np.nan, "d"], dtype=object)],
            np.object_,
        ),
        (
            np.array([[2.0, np.nan]], dtype=np.float64).T,
            np.array([[3.0]], dtype=np.float64).T,
            [np.array([2.0, 4.0, np.nan])],
            np.float64,
        ),
    ],
    ids=[
        "object-None-missing-value",
        "object-nan-missing_value",
        "numeric-missing-value",
    ],
)
def test_ordinal_encoder_specified_categories_missing_passthrough(
    X, X2, cats, cat_dtype
):
    """Check user-specified categories that include a missing value."""
    encoder = OrdinalEncoder(categories=cats)
    encoded = encoder.fit_transform(X)
    assert_array_equal(encoded, np.array([[0.0], [np.nan]]))

    # The fitted categories must have the expected dtype.
    assert encoder.categories_[0].dtype == cat_dtype

    # With manually specified categories, a value outside of them must
    # already raise at fit time.
    encoder = OrdinalEncoder(categories=cats)
    with pytest.raises(ValueError, match="Found unknown categories"):
        encoder.fit(X2)
@pytest.mark.parametrize(
    "X, expected_X_trans, X_test",
    [
        (
            np.array([[1.0, np.nan, 3.0]]).T,
            np.array([[0.0, np.nan, 1.0]]).T,
            np.array([[4.0]]),
        ),
        (
            np.array([[1.0, 4.0, 3.0]]).T,
            np.array([[0.0, 2.0, 1.0]]).T,
            np.array([[np.nan]]),
        ),
        (
            np.array([["c", np.nan, "b"]], dtype=object).T,
            np.array([[1.0, np.nan, 0.0]]).T,
            np.array([["d"]], dtype=object),
        ),
        (
            np.array([["c", "a", "b"]], dtype=object).T,
            np.array([[2.0, 0.0, 1.0]]).T,
            np.array([[np.nan]], dtype=object),
        ),
    ],
)
def test_ordinal_encoder_handle_missing_and_unknown(X, expected_X_trans, X_test):
    """Check how missing values interact with ``handle_unknown``.

    In every case the single value in ``X_test`` was not seen during fit and
    must therefore be encoded as the configured ``unknown_value`` (-1).
    """
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

    encoded_train = encoder.fit_transform(X)
    assert_allclose(encoded_train, expected_X_trans)

    encoded_test = encoder.transform(X_test)
    assert_allclose(encoded_test, [[-1.0]])
def test_ordinal_encoder_sparse():
    """Sparse input to ``OrdinalEncoder`` must raise a proper ``TypeError``.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19878
    """
    X_dense = np.array([[3, 2, 1], [0, 1, 1]])
    X_sparse = sparse.csr_matrix(X_dense)
    encoder = OrdinalEncoder()
    err_msg = "A sparse matrix was passed, but dense data is required"

    # Both fitting entry points must reject sparse input.
    for fit_method in (encoder.fit, encoder.fit_transform):
        with pytest.raises(TypeError, match=err_msg):
            fit_method(X_sparse)

    # inverse_transform must reject sparse input as well.
    encoded_sparse = sparse.csr_matrix(encoder.fit_transform(X_dense))
    with pytest.raises(TypeError, match=err_msg):
        encoder.inverse_transform(encoded_sparse)
def test_ordinal_encoder_fit_with_unseen_category():
    """``fit`` accepts values outside ``categories`` only with
    ``handle_unknown="use_encoded_value"``.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    """
    # 2 and 5 are not part of the user-provided categories below.
    X = np.array([0, 0, 1, 0, 2, 5])[:, np.newaxis]
    user_categories = [[-1, 0, 1]]

    lenient = OrdinalEncoder(
        categories=user_categories,
        handle_unknown="use_encoded_value",
        unknown_value=-999,
    )
    lenient.fit(X)

    strict = OrdinalEncoder(categories=user_categories, handle_unknown="error")
    with pytest.raises(ValueError, match="Found unknown categories"):
        strict.fit(X)
@pytest.mark.parametrize(
    "X_train",
    [
        [["AA", "B"]],
        np.array([["AA", "B"]], dtype="O"),
        np.array([["AA", "B"]], dtype="U"),
    ],
)
@pytest.mark.parametrize(
    "X_test",
    [
        [["A", "B"]],
        np.array([["A", "B"]], dtype="O"),
        np.array([["A", "B"]], dtype="U"),
    ],
)
def test_ordinal_encoder_handle_unknown_string_dtypes(X_train, X_test):
    """Check that ``OrdinalEncoder`` handles unknown values of string dtypes.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    """
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-9)
    encoder.fit(X_train)

    # "A" was not seen in the first column (only "AA"); "B" is known.
    encoded = encoder.transform(X_test)
    assert_allclose(encoded, [[-9, 0]])
def test_ordinal_encoder_python_integer():
    """`OrdinalEncoder` must handle Python integers that may exceed 64 bits.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20721
    """
    huge_values = [
        44253463435747313673,
        9867966753463435747313673,
        44253462342215747313673,
        442534634357764313673,
    ]
    X = np.array(huge_values).reshape(-1, 1)

    fitted = OrdinalEncoder().fit(X)
    # The learned categories are expected to be the sorted input values.
    assert_array_equal(fitted.categories_, np.sort(X, axis=0).T)

    codes = fitted.transform(X)
    assert_array_equal(codes, [[0], [3], [2], [1]])
def test_ordinal_encoder_features_names_out_pandas():
    """Output feature names must equal the input DataFrame's column names."""
    pd = pytest.importorskip("pandas")

    column_names = ["b", "c", "a"]
    frame = pd.DataFrame([[1, 2, 3]], columns=column_names)

    encoder = OrdinalEncoder().fit(frame)
    assert_array_equal(column_names, encoder.get_feature_names_out())
def test_ordinal_encoder_unknown_missing_interaction():
    """Check how unknown-category and missing-value encodings coexist."""
    X_fit = np.array([["a"], ["b"], [np.nan]], dtype=object)
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=np.nan,
        encoded_missing_value=-3,
    )
    encoder.fit(X_fit)
    assert_allclose(encoder.transform(X_fit), [[0], [1], [-3]])

    # "c" was not seen during fit, so it is expected to be encoded as np.nan
    # (the unknown_value); np.nan is a missing value and is expected to be
    # encoded as -3 (the encoded_missing_value).
    X_new = np.array([["c"], [np.nan]], dtype=object)
    encoded_new = encoder.transform(X_new)
    assert_allclose(encoded_new, [[np.nan], [-3]])

    # Non-regression test for #24082
    decoded = encoder.inverse_transform(encoded_new)
    # An encoded np.nan stands for "unknown" here, so it decodes to None.
    assert decoded[0][0] is None
    # -3 is the encoded missing value, so it decodes back to nan.
    assert np.isnan(decoded[1][0])
@pytest.mark.parametrize("with_pandas", [True, False])
def test_ordinal_encoder_encoded_missing_value_error(with_pandas):
    """OrdinalEncoder must raise when encoded_missing_value collides with the
    code of a known category."""
    X = np.array([["a", "dog"], ["b", "cat"], ["c", np.nan]], dtype=object)

    # Only the second feature contains a missing value, so it is the only one
    # expected in the reported list: by name for a DataFrame, by index
    # otherwise.
    if with_pandas:
        pd = pytest.importorskip("pandas")
        X = pd.DataFrame(X, columns=["letter", "pet"])
        offending_features = r"\['pet'\]"
    else:
        offending_features = r"\[1\]"

    error_msg = (
        r"encoded_missing_value \(1\) is already used to encode a known category "
        r"in features: "
    ) + offending_features

    encoder = OrdinalEncoder(encoded_missing_value=1)
    with pytest.raises(ValueError, match=error_msg):
        encoder.fit(X)
@pytest.mark.parametrize(
    "X_train, X_test_trans_expected, X_roundtrip_expected",
    [
        (
            # No missing value at fit time: the inverse transform is expected
            # to treat an encoded nan as an unknown category (None).
            np.array([["a"], ["1"]], dtype=object),
            [[0], [np.nan], [np.nan]],
            np.asarray([["1"], [None], [None]], dtype=object),
        ),
        (
            # Missing value present at fit time: the inverse transform is
            # expected to treat an encoded nan as a missing value (nan).
            np.array([[np.nan], ["1"], ["a"]], dtype=object),
            [[0], [np.nan], [np.nan]],
            np.asarray([["1"], [np.nan], [np.nan]], dtype=object),
        ),
    ],
)
def test_ordinal_encoder_unknown_missing_interaction_both_nan(
    X_train, X_test_trans_expected, X_roundtrip_expected
):
    """Check transform/inverse_transform when both unknown_value and
    encoded_missing_value are nan.

    Non-regression test for #24082.
    """
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=np.nan,
        encoded_missing_value=np.nan,
    )
    encoder.fit(X_train)

    X_test = np.array([["1"], [np.nan], ["b"]])
    encoded = encoder.transform(X_test)

    # Only "1" has a regular code; the other two entries come out as nan.
    assert_allclose(encoded, X_test_trans_expected)

    decoded = encoder.inverse_transform(encoded)
    # Compare element by element: None needs an identity check and nan cannot
    # be compared with ==.
    for row, expected_val in enumerate(X_roundtrip_expected[:, 0]):
        val = decoded[row, 0]
        if expected_val is None:
            assert val is None
        elif is_scalar_nan(expected_val):
            assert np.isnan(val)
        else:
            assert val == expected_val
def test_one_hot_encoder_set_output():
    """Check OneHotEncoder's behaviour under the set_output API."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})

    # An encoder built with default arguments must refuse pandas output.
    sparse_encoder = OneHotEncoder()
    sparse_encoder.set_output(transform="pandas")
    expected_error = "Pandas output does not support sparse data"
    with pytest.raises(ValueError, match=expected_error):
        sparse_encoder.fit_transform(frame)

    # With dense output, the pandas result must carry the same values as the
    # default one, and its columns must match the feature names out.
    default_encoder = OneHotEncoder(sparse_output=False).set_output(
        transform="default"
    )
    pandas_encoder = OneHotEncoder(sparse_output=False).set_output(
        transform="pandas"
    )
    default_result = default_encoder.fit_transform(frame)
    pandas_result = pandas_encoder.fit_transform(frame)

    assert_allclose(pandas_result.to_numpy(), default_result)
    assert_array_equal(pandas_encoder.get_feature_names_out(), pandas_result.columns)
def test_ordinal_set_output():
    """Check OrdinalEncoder's behaviour under the set_output API."""
    pd = pytest.importorskip("pandas")
    frame = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})

    default_encoder = OrdinalEncoder().set_output(transform="default")
    pandas_encoder = OrdinalEncoder().set_output(transform="pandas")
    default_result = default_encoder.fit_transform(frame)
    pandas_result = pandas_encoder.fit_transform(frame)

    # Same values in both containers, and columns named after the features.
    assert_allclose(pandas_result.to_numpy(), default_result)
    assert_array_equal(pandas_encoder.get_feature_names_out(), pandas_result.columns)
def test_predefined_categories_dtype():
    """User-provided string categories must be stored with `object` dtype.

    Regression test for gh-25171.
    """
    categories = [["as", "mmas", "eas", "ras", "acs"], ["1", "2"]]
    encoder = OneHotEncoder(categories=categories)
    encoder.fit([["as", "1"]])

    fitted_categories = encoder.categories_
    assert len(categories) == len(fitted_categories)
    # Lengths match (checked above), so pairing the two sequences is safe.
    for given, fitted in zip(categories, fitted_categories):
        assert fitted.dtype == object
        assert_array_equal(given, fitted)
def test_ordinal_encoder_missing_unknown_encoding_max():
    """The missing or unknown code may be equal to the number of categories."""
    X = np.array([["dog"], ["cat"], [np.nan]], dtype=object)

    # Two non-missing categories ("cat", "dog"): 2 is accepted for missing.
    missing_encoder = OrdinalEncoder(encoded_missing_value=2)
    assert_allclose(missing_encoder.fit_transform(X), [[1], [0], [2]])

    # The same value 2 is accepted as the code for unknown categories.
    unknown_encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=2
    )
    unknown_encoder.fit(X)
    unseen = np.array([["snake"]])
    assert_allclose(unknown_encoder.transform(unseen), [[2]])
def test_drop_idx_infrequent_categories():
    """Check that drop_idx_ designates the right category when some categories
    are grouped as infrequent.

    Non-regression test for gh-25550.
    """
    # 'a' appears twice and 'b' to 'e' four times each; with min_frequency=4
    # the expected feature names show 'a' alone in the infrequent group.
    five_levels = ["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4

    # drop="first": 'b' is expected to be the dropped category.
    X = np.array([five_levels], dtype=object).T
    encoder = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first")
    encoder.fit(X)
    assert_array_equal(
        encoder.get_feature_names_out(),
        ["x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"],
    )
    assert encoder.categories_[0][encoder.drop_idx_[0]] == "b"

    # drop="if_binary": only 'c' is frequent here, and it is expected to be
    # dropped, leaving the infrequent column alone.
    X = np.array([["a"] * 2 + ["b"] * 2 + ["c"] * 10], dtype=object).T
    encoder = OneHotEncoder(min_frequency=4, sparse_output=False, drop="if_binary")
    encoder.fit(X)
    assert_array_equal(encoder.get_feature_names_out(), ["x0_infrequent_sklearn"])
    assert encoder.categories_[0][encoder.drop_idx_[0]] == "c"

    # Explicit drop=["d"] on the five-level data.
    X = np.array([five_levels], dtype=object).T
    encoder = OneHotEncoder(min_frequency=4, sparse_output=False, drop=["d"])
    encoder.fit(X)
    assert_array_equal(
        encoder.get_feature_names_out(),
        ["x0_b", "x0_c", "x0_e", "x0_infrequent_sklearn"],
    )
    assert encoder.categories_[0][encoder.drop_idx_[0]] == "d"

    # drop=None: nothing is dropped and drop_idx_ stays None.
    encoder = OneHotEncoder(min_frequency=4, sparse_output=False, drop=None)
    encoder.fit(X)
    assert_array_equal(
        encoder.get_feature_names_out(),
        ["x0_b", "x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"],
    )
    assert encoder.drop_idx_ is None
@pytest.mark.parametrize(
    "kwargs",
    [
        {"max_categories": 3},
        {"min_frequency": 6},
        {"min_frequency": 9},
        {"min_frequency": 0.24},
        {"min_frequency": 0.16},
        {"max_categories": 3, "min_frequency": 8},
        {"max_categories": 4, "min_frequency": 6},
    ],
)
def test_ordinal_encoder_infrequent_three_levels(kwargs):
    """Every parameter set must group 'a' and 'd' into the infrequent category."""
    # Counts: a=5, b=20, c=10, d=3.
    X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value", unknown_value=-1, **kwargs
    )
    encoder.fit(X_train)

    assert_array_equal(encoder.categories_, [["a", "b", "c", "d"]])
    assert_array_equal(encoder.infrequent_categories_, [["a", "d"]])

    # 'a' and 'd' share code 2; the unseen 'z' gets the unknown code -1.
    X_test = [["a"], ["b"], ["c"], ["d"], ["z"]]
    encoded = encoder.transform(X_test)
    assert_allclose(encoded, [[2], [0], [1], [2], [-1]])

    # Decoding maps code 2 to the infrequent placeholder and -1 to None.
    decoded = encoder.inverse_transform(encoded)
    assert_array_equal(
        decoded,
        [["infrequent_sklearn"], ["b"], ["c"], ["infrequent_sklearn"], [None]],
    )
def test_ordinal_encoder_infrequent_three_levels_user_cats():
    """The category order supplied by the user must be preserved.

    Here 'c' is expected to receive the first code and 'b' the second one,
    while 'd' and 'a' share the infrequent code.
    """
    # Counts: a=5, b=20, c=10, d=3.
    X_train = np.array(
        [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
    ).T
    encoder = OrdinalEncoder(
        categories=[["c", "d", "b", "a"]],
        max_categories=3,
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )
    encoder.fit(X_train)

    assert_array_equal(encoder.categories_, [["c", "d", "b", "a"]])
    assert_array_equal(encoder.infrequent_categories_, [["d", "a"]])

    X_test = [["a"], ["b"], ["c"], ["d"], ["z"]]
    encoded = encoder.transform(X_test)
    assert_allclose(encoded, [[2], [1], [0], [2], [-1]])

    decoded = encoder.inverse_transform(encoded)
    assert_array_equal(
        decoded,
        [["infrequent_sklearn"], ["b"], ["c"], ["infrequent_sklearn"], [None]],
    )
def test_ordinal_encoder_infrequent_mixed():
    """Feature 0 has infrequent categories while feature 1 has none."""
    feature_0 = [0, 1, 3, 3, 3, 3, 2, 0, 3]
    feature_1 = [0, 0, 0, 0, 1, 1, 1, 1, 1]
    X = np.column_stack((feature_0, feature_1))

    encoder = OrdinalEncoder(max_categories=3).fit(X)
    infrequent_0, infrequent_1 = encoder.infrequent_categories_
    assert_array_equal(infrequent_0, [1, 2])
    assert infrequent_1 is None

    encoded = encoder.transform([[3, 0], [1, 1]])
    assert_allclose(encoded, [[1, 0], [2, 1]])

    # The infrequent value 1 of feature 0 decodes to the placeholder string,
    # hence the object dtype of the expected array.
    decoded = encoder.inverse_transform(encoded)
    assert_array_equal(
        decoded, np.array([[3, 0], ["infrequent_sklearn", 1]], dtype=object)
    )
def test_ordinal_encoder_infrequent_multiple_categories_dtypes():
    """Infrequent grouping on a pandas DataFrame whose columns mix dtypes."""
    pd = pytest.importorskip("pandas")
    animal_dtype = pd.CategoricalDtype(["bird", "cat", "dog", "snake"])
    column_order = ["str", "int", "categorical"]

    train_frame = pd.DataFrame(
        {
            "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"],
            "int": [5, 3, 0, 10, 10, 12, 0, 3, 5],
            "categorical": pd.Series(
                ["dog"] * 4 + ["cat"] * 3 + ["snake", "bird"],
                dtype=animal_dtype,
            ),
        },
        columns=column_order,
    )
    encoder = OrdinalEncoder(max_categories=3).fit(train_frame)

    # "str": 'f' appears three times while 'a', 'b' and 'c' tie at two; the
    # tie is expected to leave 'a' and 'b' in the infrequent group.
    # "int": 0, 3, 5 and 10 appear twice and 12 once; 0, 3 and 12 are expected
    # to be infrequent.
    # "categorical": "snake" and "bird" appear once each and are infrequent.
    infrequent = encoder.infrequent_categories_
    assert_array_equal(infrequent[0], ["a", "b"])
    assert_array_equal(infrequent[1], [0, 3, 12])
    assert_array_equal(infrequent[2], ["bird", "snake"])

    test_frame = pd.DataFrame(
        {
            "str": ["a", "b", "f", "c"],
            "int": [12, 0, 10, 5],
            "categorical": pd.Series(
                ["cat", "snake", "bird", "dog"],
                dtype=animal_dtype,
            ),
        },
        columns=column_order,
    )
    encoded = encoder.transform(test_frame)
    assert_allclose(encoded, [[2, 2, 0], [2, 2, 2], [1, 1, 2], [0, 0, 1]])
def test_ordinal_encoder_infrequent_custom_mapping():
    """Check unknown_value and encoded_missing_value together with infrequent
    category grouping."""
    # Counts: a=5, b=20, c=10, d=3, plus a single missing value.
    X_train = np.array(
        [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]], dtype=object
    ).T
    encoder = OrdinalEncoder(
        handle_unknown="use_encoded_value",
        unknown_value=2,
        max_categories=2,
        encoded_missing_value=3,
    )
    encoder.fit(X_train)
    assert_array_equal(encoder.infrequent_categories_, [["a", "c", "d"]])

    # 'b' keeps code 0, the infrequent group shares code 1, the unseen 'e'
    # gets the unknown code 2 and the missing value gets 3.
    X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object)
    encoded = encoder.transform(X_test)
    assert_allclose(encoded, [[1], [0], [1], [1], [2], [3]])
@pytest.mark.parametrize(
    "kwargs",
    [
        {"max_categories": 6},
        {"min_frequency": 2},
    ],
)
def test_ordinal_encoder_all_frequent(kwargs):
    """When every category is frequent, the encoding matches the default one."""
    X_train = np.array(
        [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
    ).T
    unknown_handling = {"handle_unknown": "use_encoded_value", "unknown_value": -1}

    adjusted_encoder = OrdinalEncoder(**kwargs, **unknown_handling).fit(X_train)
    default_encoder = OrdinalEncoder(**unknown_handling).fit(X_train)

    X_test = [["a"], ["b"], ["c"], ["d"], ["e"]]
    adjusted_codes = adjusted_encoder.transform(X_test)
    default_codes = default_encoder.transform(X_test)
    assert_allclose(adjusted_codes, default_codes)
@pytest.mark.parametrize(
    "kwargs",
    [
        {"max_categories": 1},
        {"min_frequency": 100},
    ],
)
def test_ordinal_encoder_all_infrequent(kwargs):
    """If every category is infrequent, all of them share the code zero."""
    X_train = np.array(
        [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
    ).T
    unknown_handling = {"handle_unknown": "use_encoded_value", "unknown_value": -1}
    fitted = OrdinalEncoder(**kwargs, **unknown_handling).fit(X_train)

    # 'e' was not seen during fit and therefore gets the unknown code -1.
    X_test = [["a"], ["b"], ["c"], ["d"], ["e"]]
    assert_allclose(fitted.transform(X_test), [[0], [0], [0], [0], [-1]])
def test_ordinal_encoder_missing_appears_frequent():
    """Check the encoding when the missing value is the most common entry."""
    # Counts: nan=20, dog=10, cat=5, snake=1, deer=1.
    values = [np.nan] * 20 + ["dog"] * 10 + ["cat"] * 5 + ["snake"] + ["deer"]
    X = np.array([values], dtype=object).T
    encoder = OrdinalEncoder(max_categories=3).fit(X)

    # "cat" and "dog" keep their own codes, "snake" gets code 2 and the
    # missing value stays nan.
    X_test = np.array([["snake", "cat", "dog", np.nan]], dtype=object).T
    encoded = encoder.transform(X_test)
    assert_allclose(encoded, [[2], [0], [1], [np.nan]])
def test_ordinal_encoder_missing_appears_infrequent():
    """Check the encoding when the missing value occurs only once."""
    # Feature 0 has infrequent categories; feature 1 has none.
    animals = [np.nan] + ["dog"] * 10 + ["cat"] * 5 + ["snake"] + ["deer"]
    colors = ["red"] * 9 + ["green"] * 9
    X = np.array([animals, colors], dtype=object).T
    encoder = OrdinalEncoder(min_frequency=4).fit(X)

    X_test = np.array(
        [
            ["snake", "red"],
            ["deer", "green"],
            [np.nan, "green"],
            ["dog", "green"],
            ["cat", "red"],
        ],
        dtype=object,
    )
    encoded = encoder.transform(X_test)

    # "snake" and "deer" share code 2, while the missing value stays nan.
    assert_allclose(encoded, [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]])
|