test_encoders.py 76 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325
  1. import re
  2. import numpy as np
  3. import pytest
  4. from scipy import sparse
  5. from sklearn.exceptions import NotFittedError
  6. from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
  7. from sklearn.utils import is_scalar_nan
  8. from sklearn.utils._testing import (
  9. _convert_container,
  10. assert_allclose,
  11. assert_array_equal,
  12. )
  def test_one_hot_encoder_sparse_dense():
      """Sparse and dense `OneHotEncoder` outputs must hold the same encoding."""
      X = np.array([[3, 2, 1], [0, 1, 1]])
      expected = [[0.0, 1.0, 0.0, 1.0, 1.0], [1.0, 0.0, 1.0, 0.0, 1.0]]

      as_sparse = OneHotEncoder().fit_transform(X)
      as_dense = OneHotEncoder(sparse_output=False).fit_transform(X)

      # Both outputs share the same shape but differ in container type.
      assert as_sparse.shape == (2, 5)
      assert as_dense.shape == (2, 5)
      assert sparse.issparse(as_sparse)
      assert not sparse.issparse(as_dense)

      # The values match the reference encoding and each other.
      assert_array_equal(as_sparse.toarray(), expected)
      assert_array_equal(as_sparse.toarray(), as_dense)
  @pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"])
  def test_one_hot_encoder_handle_unknown(handle_unknown):
      """Unknown categories raise with "error" and encode as zeros otherwise."""
      X_train = np.array([[0, 2, 1], [1, 0, 3], [1, 0, 2]])
      X_unseen = np.array([[4, 1, 1]])

      # handle_unknown="error": categories not seen during fit are rejected
      # when transforming.
      strict = OneHotEncoder(handle_unknown="error")
      strict.fit(X_train)
      with pytest.raises(ValueError, match="Found unknown categories"):
          strict.transform(X_unseen)

      # Lenient modes: the unknown entries are encoded as all zeros.
      lenient = OneHotEncoder(handle_unknown=handle_unknown)
      lenient.fit(X_train)
      X_input = X_unseen.copy()
      encoded = lenient.transform(X_input).toarray()
      assert_array_equal(encoded, np.array([[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]]))

      # The array handed to `transform` must not have been modified in place.
      assert_allclose(X_unseen, X_input)
  def test_one_hot_encoder_not_fitted():
      """Calling `transform` on an unfitted encoder must raise `NotFittedError`."""
      X = np.array([["a"], ["b"]])
      enc = OneHotEncoder(categories=["a", "b"])
      # `pytest.raises(match=...)` treats its argument as a regular expression.
      # Escape the literal message so that each "." only matches a real dot,
      # as the other literal-message checks in this file already do.
      msg = re.escape(
          "This OneHotEncoder instance is not fitted yet. "
          "Call 'fit' with appropriate arguments before using this "
          "estimator."
      )
      with pytest.raises(NotFittedError, match=msg):
          enc.transform(X)
  @pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"])
  def test_one_hot_encoder_handle_unknown_strings(handle_unknown):
      """Unknown values are ignored when categories have a numpy string dtype.

      Non-regression test for issue #12470. It targets the case where the
      known category strings are larger than the unknown category strings.
      """
      X_train = np.array(["11111111", "22", "333", "4444"]).reshape((-1, 1))
      X_new = np.array(["55555", "22"]).reshape((-1, 1))

      encoder = OneHotEncoder(handle_unknown=handle_unknown)
      encoder.fit(X_train)

      X_input = X_new.copy()
      encoded = encoder.transform(X_input).toarray()
      # First row is unknown (all zeros); second row is the known "22".
      assert_array_equal(
          encoded, np.array([[0.0, 0.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0]])
      )

      # The array handed to `transform` must not have been modified in place.
      assert_array_equal(X_new, X_input)
  @pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
  @pytest.mark.parametrize("input_dtype", [np.int32, np.float32, np.float64])
  def test_one_hot_encoder_dtype(input_dtype, output_dtype):
      """Check the encoding for every input/output dtype combination."""
      X = np.asarray([[0, 1]], dtype=input_dtype).T
      X_expected = np.asarray([[1, 0], [0, 1]], dtype=output_dtype)

      # Default (sparse) output, via fit_transform and via fit + transform.
      sparse_enc = OneHotEncoder(categories="auto", dtype=output_dtype)
      assert_array_equal(sparse_enc.fit_transform(X).toarray(), X_expected)
      assert_array_equal(sparse_enc.fit(X).transform(X).toarray(), X_expected)

      # Dense output, same two code paths.
      dense_enc = OneHotEncoder(
          categories="auto", dtype=output_dtype, sparse_output=False
      )
      assert_array_equal(dense_enc.fit_transform(X), X_expected)
      assert_array_equal(dense_enc.fit(X).transform(X), X_expected)
  @pytest.mark.parametrize("output_dtype", [np.int32, np.float32, np.float64])
  def test_one_hot_encoder_dtype_pandas(output_dtype):
      """Check the encoding of a mixed-type DataFrame for each output dtype."""
      pd = pytest.importorskip("pandas")

      frame = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})
      X_expected = np.array([[1, 0, 1, 0], [0, 1, 0, 1]], dtype=output_dtype)

      # Default (sparse) output, via fit_transform and via fit + transform.
      sparse_enc = OneHotEncoder(dtype=output_dtype)
      assert_array_equal(sparse_enc.fit_transform(frame).toarray(), X_expected)
      assert_array_equal(sparse_enc.fit(frame).transform(frame).toarray(), X_expected)

      # Dense output, same two code paths.
      dense_enc = OneHotEncoder(dtype=output_dtype, sparse_output=False)
      assert_array_equal(dense_enc.fit_transform(frame), X_expected)
      assert_array_equal(dense_enc.fit(frame).transform(frame), X_expected)
  def test_one_hot_encoder_feature_names():
      """Check `get_feature_names_out` with default and custom input names."""
      X = [
          ["Male", 1, "girl", 2, 3],
          ["Female", 41, "girl", 1, 10],
          ["Male", 51, "boy", 12, 3],
          ["Male", 91, "girl", 21, 30],
      ]
      enc = OneHotEncoder()
      enc.fit(X)

      # Without input names the columns are prefixed "x0" ... "x4".
      default_expected = [
          "x0_Female",
          "x0_Male",
          "x1_1",
          "x1_41",
          "x1_51",
          "x1_91",
          "x2_boy",
          "x2_girl",
          "x3_1",
          "x3_2",
          "x3_12",
          "x3_21",
          "x4_3",
          "x4_10",
          "x4_30",
      ]
      assert_array_equal(default_expected, enc.get_feature_names_out())

      # With explicit input names those names become the prefixes.
      custom_expected = [
          "one_Female",
          "one_Male",
          "two_1",
          "two_41",
          "two_51",
          "two_91",
          "three_boy",
          "three_girl",
          "four_1",
          "four_2",
          "four_12",
          "four_21",
          "five_3",
          "five_10",
          "five_30",
      ]
      custom_names = enc.get_feature_names_out(["one", "two", "three", "four", "five"])
      assert_array_equal(custom_expected, custom_names)

      # Passing fewer names than fitted columns is rejected.
      with pytest.raises(ValueError, match="input_features should have length"):
          enc.get_feature_names_out(["one", "two"])
  def test_one_hot_encoder_feature_names_unicode():
      """Non-ASCII characters in categories and input names are preserved."""
      X = np.array([["c❤t1", "dat2"]], dtype=object).T
      enc = OneHotEncoder()
      enc.fit(X)

      default_names = enc.get_feature_names_out()
      assert_array_equal(["x0_c❤t1", "x0_dat2"], default_names)

      custom_names = enc.get_feature_names_out(input_features=["n👍me"])
      assert_array_equal(["n👍me_c❤t1", "n👍me_dat2"], custom_names)
  def test_one_hot_encoder_custom_feature_name_combiner():
      """Check the behaviour of `feature_name_combiner` as a callable."""

      def repr_combiner(feature, category):
          # `repr` keeps the string "None" distinguishable from the None object.
          return feature + "_" + repr(category)

      X = np.array([["None", None]], dtype=object).T
      enc = OneHotEncoder(feature_name_combiner=repr_combiner)
      enc.fit(X)

      assert_array_equal(["x0_'None'", "x0_None"], enc.get_feature_names_out())
      assert_array_equal(
          ["a_'None'", "a_None"], enc.get_feature_names_out(input_features=["a"])
      )

      def non_string_combiner(feature, category):
          # Deliberately invalid: a combiner should return a Python string.
          return 0

      enc = OneHotEncoder(feature_name_combiner=non_string_combiner).fit(X)
      err_msg = (
          "When `feature_name_combiner` is a callable, it should return a Python string."
      )
      with pytest.raises(TypeError, match=err_msg):
          enc.get_feature_names_out()
  def test_one_hot_encoder_set_params():
      """`set_params(categories=...)` takes effect before and after fitting."""
      X = np.array([[1, 2]]).T
      encoder = OneHotEncoder()

      # Unfitted estimator: the new categories are stored and used by the fit.
      encoder.set_params(categories=[[0, 1, 2, 3]])
      assert encoder.get_params()["categories"] == [[0, 1, 2, 3]]
      assert encoder.fit_transform(X).toarray().shape == (2, 4)

      # Already fitted estimator: refitting picks up the updated categories.
      encoder.set_params(categories=[[0, 1, 2, 3, 4]])
      assert encoder.fit_transform(X).toarray().shape == (2, 5)
  def check_categorical_onehot(X):
      """Encode `X` with sparse and dense encoders and check that they agree.

      Asserts that both encodings hold the same values and that the sparse
      result is in CSR format, then returns the sparse result as a dense array.
      """
      sparse_result = OneHotEncoder(categories="auto").fit_transform(X)
      dense_result = OneHotEncoder(
          categories="auto", sparse_output=False
      ).fit_transform(X)

      assert_allclose(sparse_result.toarray(), dense_result)
      assert sparse.issparse(sparse_result) and sparse_result.format == "csr"
      return sparse_result.toarray()
  @pytest.mark.parametrize(
      "X",
      [
          [["def", 1, 55], ["abc", 2, 55]],
          np.array([[10, 1, 55], [5, 2, 55]]),
          np.array([["b", "A", "cat"], ["a", "B", "cat"]], dtype=object),
          np.array([["b", 1, "cat"], ["a", np.nan, "cat"]], dtype=object),
          np.array([["b", 1, "cat"], ["a", float("nan"), "cat"]], dtype=object),
          np.array([[None, 1, "cat"], ["a", 2, "cat"]], dtype=object),
          np.array([[None, 1, None], ["a", np.nan, None]], dtype=object),
          np.array([[None, 1, None], ["a", float("nan"), None]], dtype=object),
      ],
      ids=[
          "mixed",
          "numeric",
          "object",
          "mixed-nan",
          "mixed-float-nan",
          "mixed-None",
          "mixed-None-nan",
          "mixed-None-float-nan",
      ],
  )
  def test_one_hot_encoder(X):
      """Check the encoding of one, two and all three columns of `X`."""
      X_arr = np.array(X)

      # First column only.
      first_col = check_categorical_onehot(X_arr[:, [0]])
      assert_allclose(first_col, [[0, 1], [1, 0]])

      # First two columns.
      first_two = check_categorical_onehot(X_arr[:, [0, 1]])
      assert_allclose(first_two, [[0, 1, 1, 0], [1, 0, 0, 1]])

      # Full input, passed as given; the third column has a single category.
      full = OneHotEncoder(categories="auto").fit_transform(X)
      assert_allclose(full.toarray(), [[0, 1, 1, 0, 1], [1, 0, 0, 1, 1]])
  @pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"])
  @pytest.mark.parametrize("sparse_", [False, True])
  @pytest.mark.parametrize("drop", [None, "first"])
  def test_one_hot_encoder_inverse(handle_unknown, sparse_, drop):
      """Check `inverse_transform` round-trips, unknown handling and bad shapes."""
      # Mixed string / numeric input round-trips as an object array.
      X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]]
      enc = OneHotEncoder(sparse_output=sparse_, drop=drop)
      encoded = enc.fit_transform(X)
      assert_array_equal(enc.inverse_transform(encoded), np.array(X, dtype=object))

      # Purely numeric input round-trips as a numeric array.
      X = [[2, 55], [1, 55], [3, 55]]
      enc = OneHotEncoder(sparse_output=sparse_, categories="auto", drop=drop)
      encoded = enc.fit_transform(X)
      assert_array_equal(enc.inverse_transform(encoded), np.array(X))

      if drop is None:
          # with unknown categories
          # drop is incompatible with handle_unknown=ignore
          X = [["abc", 2, 55], ["def", 1, 55], ["abc", 3, 55]]
          enc = OneHotEncoder(
              sparse_output=sparse_,
              handle_unknown=handle_unknown,
              categories=[["abc", "def"], [1, 2], [54, 55, 56]],
          )
          encoded = enc.fit_transform(X)
          expected = np.array(X, dtype=object)
          # The value 3 is not among the given categories, so it inverts to None.
          expected[2, 1] = None
          assert_array_equal(enc.inverse_transform(encoded), expected)

          # with an otherwise numerical output, still object if unknown
          X = [[2, 55], [1, 55], [3, 55]]
          enc = OneHotEncoder(
              sparse_output=sparse_,
              categories=[[1, 2], [54, 56]],
              handle_unknown=handle_unknown,
          )
          encoded = enc.fit_transform(X)
          expected = np.array(X, dtype=object)
          expected[2, 0] = None
          expected[:, 1] = None
          assert_array_equal(enc.inverse_transform(encoded), expected)

      # incorrect shape raises
      wrong_shape = np.array([[0, 1, 1], [1, 0, 1]])
      msg = re.escape("Shape of the passed X data is not correct")
      with pytest.raises(ValueError, match=msg):
          enc.inverse_transform(wrong_shape)
  @pytest.mark.parametrize("sparse_", [False, True])
  @pytest.mark.parametrize(
      "X, X_trans",
      [
          ([[2, 55], [1, 55], [2, 55]], [[0, 1, 1], [0, 0, 0], [0, 1, 1]]),
          (
              [["one", "a"], ["two", "a"], ["three", "b"], ["two", "a"]],
              [[0, 0, 0, 0, 0], [0, 0, 0, 0, 1], [0, 1, 0, 0, 0]],
          ),
      ],
  )
  def test_one_hot_encoder_inverse_transform_raise_error_with_unknown(
      X, X_trans, sparse_
  ):
      """Check that `inverse_transform` raises an error with unknown samples,
      no dropped feature, and `handle_unknown="error"`.

      Non-regression test for:
      https://github.com/scikit-learn/scikit-learn/issues/14934
      """
      encoder = OneHotEncoder(sparse_output=sparse_).fit(X)

      if sparse_:
          # Mimic the sparse output a one-hot encoder would have produced.
          X_trans = _convert_container(X_trans, "sparse")

      msg = (
          r"Samples \[(\d )*\d\] can not be inverted when drop=None and "
          r"handle_unknown='error' because they contain all zeros"
      )
      with pytest.raises(ValueError, match=msg):
          encoder.inverse_transform(X_trans)
  def test_one_hot_encoder_inverse_if_binary():
      """`inverse_transform` recovers the input when `drop="if_binary"`."""
      X = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object)
      encoder = OneHotEncoder(drop="if_binary", sparse_output=False)
      encoded = encoder.fit_transform(X)
      recovered = encoder.inverse_transform(encoded)
      assert_array_equal(recovered, X)
  @pytest.mark.parametrize("drop", ["if_binary", "first", None])
  @pytest.mark.parametrize("reset_drop", ["if_binary", "first", None])
  def test_one_hot_encoder_drop_reset(drop, reset_drop):
      """Changing `drop` after fitting, without refitting, must not raise.

      The fitted encoder is expected to keep producing the same transform,
      inverse transform and feature names as before the parameter change.
      """
      X = np.array([["Male", 1], ["Female", 3], ["Female", 2]], dtype=object)
      encoder = OneHotEncoder(drop=drop, sparse_output=False).fit(X)

      # Snapshot the fitted behaviour before touching the parameter.
      encoded_before = encoder.transform(X)
      names_before = encoder.get_feature_names_out()

      encoder.set_params(drop=reset_drop)

      assert_array_equal(encoder.inverse_transform(encoded_before), X)
      assert_allclose(encoder.transform(X), encoded_before)
      assert_array_equal(encoder.get_feature_names_out(), names_before)
  @pytest.mark.parametrize("method", ["fit", "fit_transform"])
  @pytest.mark.parametrize("X", [[1, 2], np.array([3.0, 4.0])])
  def test_X_is_not_1D(X, method):
      """1D list or array input is rejected by `fit` and `fit_transform`."""
      encoder = OneHotEncoder()
      fit_method = getattr(encoder, method)

      msg = "Expected 2D array, got 1D array instead"
      with pytest.raises(ValueError, match=msg):
          fit_method(X)
  @pytest.mark.parametrize("method", ["fit", "fit_transform"])
  def test_X_is_not_1D_pandas(method):
      """A pandas Series is rejected by `fit` and `fit_transform`."""
      pd = pytest.importorskip("pandas")

      series = pd.Series([6, 3, 4, 6])
      encoder = OneHotEncoder()
      fit_method = getattr(encoder, method)

      msg = "Expected 2D array, got 1D array instead"
      with pytest.raises(ValueError, match=msg):
          fit_method(series)
  @pytest.mark.parametrize(
      "X, cat_exp, cat_dtype",
      [
          ([["abc", 55], ["def", 55]], [["abc", "def"], [55]], np.object_),
          (np.array([[1, 2], [3, 2]]), [[1, 3], [2]], np.integer),
          (
              np.array([["A", "cat"], ["B", "cat"]], dtype=object),
              [["A", "B"], ["cat"]],
              np.object_,
          ),
          (np.array([["A", "cat"], ["B", "cat"]]), [["A", "B"], ["cat"]], np.str_),
          (np.array([[1, 2], [np.nan, 2]]), [[1, np.nan], [2]], np.float64),
          (
              np.array([["A", np.nan], [None, np.nan]], dtype=object),
              [["A", None], [np.nan]],
              np.object_,
          ),
          (
              np.array([["A", float("nan")], [None, float("nan")]], dtype=object),
              [["A", None], [float("nan")]],
              np.object_,
          ),
      ],
      ids=[
          "mixed",
          "numeric",
          "object",
          "string",
          "missing-float",
          "missing-np.nan-object",
          "missing-float-nan-object",
      ],
  )
  def test_one_hot_encoder_categories(X, cat_exp, cat_dtype):
      """Fitted `categories_` must not depend on the order of the samples."""
      for X_ordered in (X, X[::-1]):
          encoder = OneHotEncoder(categories="auto")
          encoder.fit(X_ordered)
          assert isinstance(encoder.categories_, list)

          for fitted, expected in zip(encoder.categories_, cat_exp):
              fitted_list = fitted.tolist()
              if not is_scalar_nan(expected[-1]):
                  assert fitted_list == expected
              else:
                  # A trailing NaN is checked on its own rather than through
                  # list equality; the remaining categories are compared as usual.
                  assert is_scalar_nan(fitted_list[-1])
                  assert fitted_list[:-1] == expected[:-1]
              assert np.issubdtype(fitted.dtype, cat_dtype)
  @pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"])
  @pytest.mark.parametrize(
      "X, X2, cats, cat_dtype",
      [
          (
              np.array([["a", "b"]], dtype=object).T,
              np.array([["a", "d"]], dtype=object).T,
              [["a", "b", "c"]],
              np.object_,
          ),
          (
              np.array([[1, 2]], dtype="int64").T,
              np.array([[1, 4]], dtype="int64").T,
              [[1, 2, 3]],
              np.int64,
          ),
          (
              np.array([["a", "b"]], dtype=object).T,
              np.array([["a", "d"]], dtype=object).T,
              [np.array(["a", "b", "c"])],
              np.object_,
          ),
          (
              np.array([[None, "a"]], dtype=object).T,
              np.array([[None, "b"]], dtype=object).T,
              [[None, "a", "z"]],
              object,
          ),
          (
              np.array([["a", "b"]], dtype=object).T,
              np.array([["a", np.nan]], dtype=object).T,
              [["a", "b", "z"]],
              object,
          ),
          (
              np.array([["a", None]], dtype=object).T,
              np.array([["a", np.nan]], dtype=object).T,
              [["a", None, "z"]],
              object,
          ),
          (
              np.array([["a", np.nan]], dtype=object).T,
              np.array([["a", None]], dtype=object).T,
              [["a", np.nan, "z"]],
              object,
          ),
      ],
      ids=[
          "object",
          "numeric",
          "object-string",
          "object-string-none",
          "object-string-nan",
          "object-None-and-nan",
          "object-nan-and-None",
      ],
  )
  def test_one_hot_encoder_specified_categories(X, X2, cats, cat_dtype, handle_unknown):
      """Check encoders built from a user-supplied `categories` list."""
      encoder = OneHotEncoder(categories=cats)
      encoded = encoder.fit_transform(X).toarray()
      assert_array_equal(encoded, np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]))

      # The parameter is kept as given and mirrored by the fitted attribute.
      assert list(encoder.categories[0]) == list(cats[0])
      assert encoder.categories_[0].tolist() == list(cats[0])
      # manually specified categories should have same dtype as
      # the data when coerced from lists
      assert encoder.categories_[0].dtype == cat_dtype

      # when specifying categories manually, unknown categories should already
      # raise when fitting
      strict = OneHotEncoder(categories=cats)
      with pytest.raises(ValueError, match="Found unknown categories"):
          strict.fit(X2)

      # With a lenient `handle_unknown`, the unknown row encodes as all zeros.
      lenient = OneHotEncoder(categories=cats, handle_unknown=handle_unknown)
      encoded_unknown = lenient.fit(X2).transform(X2).toarray()
      assert_array_equal(
          encoded_unknown, np.array([[1.0, 0.0, 0.0], [0.0, 0.0, 0.0]])
      )
  def test_one_hot_encoder_unsorted_categories():
      """Unsorted categories are kept for strings but rejected for numbers."""
      X = np.array([["a", "b"]], dtype=object).T
      encoder = OneHotEncoder(categories=[["b", "a", "c"]])
      expected = np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0]])

      # The output columns follow the user-given order "b", "a", "c".
      assert_array_equal(encoder.fit(X).transform(X).toarray(), expected)
      assert_array_equal(encoder.fit_transform(X).toarray(), expected)
      assert encoder.categories_[0].tolist() == ["b", "a", "c"]
      assert np.issubdtype(encoder.categories_[0].dtype, np.object_)

      msg = "Unsorted categories are not supported"

      # unsorted passed categories still raise for numerical values
      X = np.array([[1, 2]]).T
      encoder = OneHotEncoder(categories=[[2, 1, 3]])
      with pytest.raises(ValueError, match=msg):
          encoder.fit_transform(X)

      # np.nan must be the last category in categories[0] to be considered sorted
      X = np.array([[1, 2, np.nan]]).T
      encoder = OneHotEncoder(categories=[[1, np.nan, 2]])
      with pytest.raises(ValueError, match=msg):
          encoder.fit_transform(X)
  def test_one_hot_encoder_specified_categories_mixed_columns():
      """Specified categories work with a string and an integer column."""
      # multiple columns
      X = np.array([["a", "b"], [0, 2]], dtype=object).T
      encoder = OneHotEncoder(categories=[["a", "b", "c"], [0, 1, 2]])

      expected = np.array(
          [[1.0, 0.0, 0.0, 1.0, 0.0, 0.0], [0.0, 1.0, 0.0, 0.0, 0.0, 1.0]]
      )
      assert_array_equal(encoder.fit_transform(X).toarray(), expected)

      string_cats, integer_cats = encoder.categories_[0], encoder.categories_[1]
      assert string_cats.tolist() == ["a", "b", "c"]
      assert np.issubdtype(string_cats.dtype, np.object_)
      assert integer_cats.tolist() == [0, 1, 2]
      # integer categories but from object dtype data
      assert np.issubdtype(integer_cats.dtype, np.object_)
  def test_one_hot_encoder_pandas():
      """A DataFrame with a string and an integer column encodes as expected."""
      pd = pytest.importorskip("pandas")

      frame = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})
      encoded = check_categorical_onehot(frame)
      assert_allclose(encoded, [[1, 0, 1, 0], [0, 1, 0, 1]])
  @pytest.mark.parametrize(
      "drop, expected_names",
      [
          ("first", ["x0_c", "x2_b"]),
          ("if_binary", ["x0_c", "x1_2", "x2_b"]),
          (["c", 2, "b"], ["x0_b", "x2_a"]),
      ],
      ids=["first", "binary", "manual"],
  )
  def test_one_hot_encoder_feature_names_drop(drop, expected_names):
      """Feature names only list the categories that were not dropped."""
      X = [["c", 2, "a"], ["b", 2, "b"]]
      encoder = OneHotEncoder(drop=drop)
      encoder.fit(X)
      assert_array_equal(expected_names, encoder.get_feature_names_out())
  def test_one_hot_encoder_drop_equals_if_binary():
      """Check the output and `drop_idx_` produced by `drop="if_binary"`."""
      # Canonical case
      X = [[10, "yes"], [20, "no"], [30, "yes"]]
      encoder = OneHotEncoder(drop="if_binary", sparse_output=False)
      encoded = encoder.fit_transform(X)

      assert_array_equal(encoder.drop_idx_, np.array([None, 0]))
      assert_allclose(
          encoded,
          np.array([[1.0, 0.0, 0.0, 1.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 1.0, 1.0]]),
      )

      # with only one cat, the behaviour is equivalent to drop=None
      X = [["true", "a"], ["false", "a"], ["false", "a"]]
      encoder = OneHotEncoder(drop="if_binary", sparse_output=False)
      encoded = encoder.fit_transform(X)

      assert_array_equal(encoder.drop_idx_, np.array([0, None]))
      assert_allclose(encoded, np.array([[1.0, 1.0], [0.0, 1.0], [0.0, 1.0]]))
  @pytest.mark.parametrize(
      "X",
      [
          [["abc", 2, 55], ["def", 1, 55]],
          np.array([[10, 2, 55], [20, 1, 55]]),
          np.array([["a", "B", "cat"], ["b", "A", "cat"]], dtype=object),
      ],
      ids=["mixed", "numeric", "object"],
  )
  def test_ordinal_encoder(X):
      """Check the ordinal codes with the default dtype and with int64."""
      codes = np.array([[0, 1, 0], [1, 0, 0]], dtype="int64")

      # Default dtype: compared against the float64 version of the codes.
      default_output = OrdinalEncoder().fit_transform(X)
      assert_array_equal(default_output, codes.astype("float64"))

      # Explicit integer dtype.
      int_output = OrdinalEncoder(dtype="int64").fit_transform(X)
      assert_array_equal(int_output, codes)
  @pytest.mark.parametrize(
      "X, X2, cats, cat_dtype",
      [
          (
              np.array([["a", "b"]], dtype=object).T,
              np.array([["a", "d"]], dtype=object).T,
              [["a", "b", "c"]],
              np.object_,
          ),
          (
              np.array([[1, 2]], dtype="int64").T,
              np.array([[1, 4]], dtype="int64").T,
              [[1, 2, 3]],
              np.int64,
          ),
          (
              np.array([["a", "b"]], dtype=object).T,
              np.array([["a", "d"]], dtype=object).T,
              [np.array(["a", "b", "c"])],
              np.object_,
          ),
      ],
      ids=["object", "numeric", "object-string-cat"],
  )
  def test_ordinal_encoder_specified_categories(X, X2, cats, cat_dtype):
      """Check ordinal encoders built from a user-supplied `categories` list."""
      encoder = OrdinalEncoder(categories=cats)
      encoded = encoder.fit_transform(X)
      assert_array_equal(encoded, np.array([[0.0], [1.0]]))

      # The parameter is kept as given and mirrored by the fitted attribute.
      assert list(encoder.categories[0]) == list(cats[0])
      assert encoder.categories_[0].tolist() == list(cats[0])
      # manually specified categories should have same dtype as
      # the data when coerced from lists
      assert encoder.categories_[0].dtype == cat_dtype

      # when specifying categories manually, unknown categories should already
      # raise when fitting
      strict = OrdinalEncoder(categories=cats)
      with pytest.raises(ValueError, match="Found unknown categories"):
          strict.fit(X2)
  def test_ordinal_encoder_inverse():
      """`inverse_transform` round-trips and rejects a wrong column count."""
      X = [["abc", 2, 55], ["def", 1, 55]]
      encoder = OrdinalEncoder()
      encoded = encoder.fit_transform(X)
      assert_array_equal(encoder.inverse_transform(encoded), np.array(X, dtype=object))

      # incorrect shape raises
      wrong_shape = np.array([[0, 1, 1, 2], [1, 0, 1, 0]])
      msg = re.escape("Shape of the passed X data is not correct")
      with pytest.raises(ValueError, match=msg):
          encoder.inverse_transform(wrong_shape)
  588. def test_ordinal_encoder_handle_unknowns_string():
  589. enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-2)
  590. X_fit = np.array([["a", "x"], ["b", "y"], ["c", "z"]], dtype=object)
  591. X_trans = np.array([["c", "xy"], ["bla", "y"], ["a", "x"]], dtype=object)
  592. enc.fit(X_fit)
  593. X_trans_enc = enc.transform(X_trans)
  594. exp = np.array([[2, -2], [-2, 1], [0, 0]], dtype="int64")
  595. assert_array_equal(X_trans_enc, exp)
  596. X_trans_inv = enc.inverse_transform(X_trans_enc)
  597. inv_exp = np.array([["c", None], [None, "y"], ["a", "x"]], dtype=object)
  598. assert_array_equal(X_trans_inv, inv_exp)
  599. @pytest.mark.parametrize("dtype", [float, int])
  600. def test_ordinal_encoder_handle_unknowns_numeric(dtype):
  601. enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)
  602. X_fit = np.array([[1, 7], [2, 8], [3, 9]], dtype=dtype)
  603. X_trans = np.array([[3, 12], [23, 8], [1, 7]], dtype=dtype)
  604. enc.fit(X_fit)
  605. X_trans_enc = enc.transform(X_trans)
  606. exp = np.array([[2, -999], [-999, 1], [0, 0]], dtype="int64")
  607. assert_array_equal(X_trans_enc, exp)
  608. X_trans_inv = enc.inverse_transform(X_trans_enc)
  609. inv_exp = np.array([[3, None], [None, 8], [1, 7]], dtype=object)
  610. assert_array_equal(X_trans_inv, inv_exp)
  611. def test_ordinal_encoder_handle_unknowns_nan():
  612. # Make sure unknown_value=np.nan properly works
  613. enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
  614. X_fit = np.array([[1], [2], [3]])
  615. enc.fit(X_fit)
  616. X_trans = enc.transform([[1], [2], [4]])
  617. assert_array_equal(X_trans, [[0], [1], [np.nan]])
  618. def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype():
  619. # Make sure an error is raised when unknown_value=np.nan and the dtype
  620. # isn't a float dtype
  621. enc = OrdinalEncoder(
  622. handle_unknown="use_encoded_value", unknown_value=np.nan, dtype=int
  623. )
  624. X_fit = np.array([[1], [2], [3]])
  625. with pytest.raises(ValueError, match="dtype parameter should be a float dtype"):
  626. enc.fit(X_fit)
  627. def test_ordinal_encoder_raise_categories_shape():
  628. X = np.array([["Low", "Medium", "High", "Medium", "Low"]], dtype=object).T
  629. cats = ["Low", "Medium", "High"]
  630. enc = OrdinalEncoder(categories=cats)
  631. msg = "Shape mismatch: if categories is an array,"
  632. with pytest.raises(ValueError, match=msg):
  633. enc.fit(X)
  634. def test_encoder_dtypes():
  635. # check that dtypes are preserved when determining categories
  636. enc = OneHotEncoder(categories="auto")
  637. exp = np.array([[1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0]], dtype="float64")
  638. for X in [
  639. np.array([[1, 2], [3, 4]], dtype="int64"),
  640. np.array([[1, 2], [3, 4]], dtype="float64"),
  641. np.array([["a", "b"], ["c", "d"]]), # str dtype
  642. np.array([[b"a", b"b"], [b"c", b"d"]]), # bytes dtype
  643. np.array([[1, "a"], [3, "b"]], dtype="object"),
  644. ]:
  645. enc.fit(X)
  646. assert all([enc.categories_[i].dtype == X.dtype for i in range(2)])
  647. assert_array_equal(enc.transform(X).toarray(), exp)
  648. X = [[1, 2], [3, 4]]
  649. enc.fit(X)
  650. assert all([np.issubdtype(enc.categories_[i].dtype, np.integer) for i in range(2)])
  651. assert_array_equal(enc.transform(X).toarray(), exp)
  652. X = [[1, "a"], [3, "b"]]
  653. enc.fit(X)
  654. assert all([enc.categories_[i].dtype == "object" for i in range(2)])
  655. assert_array_equal(enc.transform(X).toarray(), exp)
  656. def test_encoder_dtypes_pandas():
  657. # check dtype (similar to test_categorical_encoder_dtypes for dataframes)
  658. pd = pytest.importorskip("pandas")
  659. enc = OneHotEncoder(categories="auto")
  660. exp = np.array(
  661. [[1.0, 0.0, 1.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 1.0, 0.0, 1.0]],
  662. dtype="float64",
  663. )
  664. X = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}, dtype="int64")
  665. enc.fit(X)
  666. assert all([enc.categories_[i].dtype == "int64" for i in range(2)])
  667. assert_array_equal(enc.transform(X).toarray(), exp)
  668. X = pd.DataFrame({"A": [1, 2], "B": ["a", "b"], "C": [3.0, 4.0]})
  669. X_type = [X["A"].dtype, X["B"].dtype, X["C"].dtype]
  670. enc.fit(X)
  671. assert all([enc.categories_[i].dtype == X_type[i] for i in range(3)])
  672. assert_array_equal(enc.transform(X).toarray(), exp)
  673. def test_one_hot_encoder_warning():
  674. enc = OneHotEncoder()
  675. X = [["Male", 1], ["Female", 3]]
  676. np.testing.assert_no_warnings(enc.fit_transform, X)
  677. @pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")])
  678. def test_one_hot_encoder_drop_manual(missing_value):
  679. cats_to_drop = ["def", 12, 3, 56, missing_value]
  680. enc = OneHotEncoder(drop=cats_to_drop)
  681. X = [
  682. ["abc", 12, 2, 55, "a"],
  683. ["def", 12, 1, 55, "a"],
  684. ["def", 12, 3, 56, missing_value],
  685. ]
  686. trans = enc.fit_transform(X).toarray()
  687. exp = [[1, 0, 1, 1, 1], [0, 1, 0, 1, 1], [0, 0, 0, 0, 0]]
  688. assert_array_equal(trans, exp)
  689. assert enc.drop is cats_to_drop
  690. dropped_cats = [
  691. cat[feature] for cat, feature in zip(enc.categories_, enc.drop_idx_)
  692. ]
  693. X_inv_trans = enc.inverse_transform(trans)
  694. X_array = np.array(X, dtype=object)
  695. # last value is np.nan
  696. if is_scalar_nan(cats_to_drop[-1]):
  697. assert_array_equal(dropped_cats[:-1], cats_to_drop[:-1])
  698. assert is_scalar_nan(dropped_cats[-1])
  699. assert is_scalar_nan(cats_to_drop[-1])
  700. # do not include the last column which includes missing values
  701. assert_array_equal(X_array[:, :-1], X_inv_trans[:, :-1])
  702. # check last column is the missing value
  703. assert_array_equal(X_array[-1, :-1], X_inv_trans[-1, :-1])
  704. assert is_scalar_nan(X_array[-1, -1])
  705. assert is_scalar_nan(X_inv_trans[-1, -1])
  706. else:
  707. assert_array_equal(dropped_cats, cats_to_drop)
  708. assert_array_equal(X_array, X_inv_trans)
  709. @pytest.mark.parametrize("drop", [["abc", 3], ["abc", 3, 41, "a"]])
  710. def test_invalid_drop_length(drop):
  711. enc = OneHotEncoder(drop=drop)
  712. err_msg = "`drop` should have length equal to the number"
  713. with pytest.raises(ValueError, match=err_msg):
  714. enc.fit([["abc", 2, 55], ["def", 1, 55], ["def", 3, 59]])
  715. @pytest.mark.parametrize("density", [True, False], ids=["sparse", "dense"])
  716. @pytest.mark.parametrize("drop", ["first", ["a", 2, "b"]], ids=["first", "manual"])
  717. def test_categories(density, drop):
  718. ohe_base = OneHotEncoder(sparse_output=density)
  719. ohe_test = OneHotEncoder(sparse_output=density, drop=drop)
  720. X = [["c", 1, "a"], ["a", 2, "b"]]
  721. ohe_base.fit(X)
  722. ohe_test.fit(X)
  723. assert_array_equal(ohe_base.categories_, ohe_test.categories_)
  724. if drop == "first":
  725. assert_array_equal(ohe_test.drop_idx_, 0)
  726. else:
  727. for drop_cat, drop_idx, cat_list in zip(
  728. drop, ohe_test.drop_idx_, ohe_test.categories_
  729. ):
  730. assert cat_list[int(drop_idx)] == drop_cat
  731. assert isinstance(ohe_test.drop_idx_, np.ndarray)
  732. assert ohe_test.drop_idx_.dtype == object
  733. @pytest.mark.parametrize("Encoder", [OneHotEncoder, OrdinalEncoder])
  734. def test_encoders_has_categorical_tags(Encoder):
  735. assert "categorical" in Encoder()._get_tags()["X_types"]
  736. @pytest.mark.parametrize(
  737. "kwargs",
  738. [
  739. {"max_categories": 2},
  740. {"min_frequency": 11},
  741. {"min_frequency": 0.29},
  742. {"max_categories": 2, "min_frequency": 6},
  743. {"max_categories": 4, "min_frequency": 12},
  744. ],
  745. )
  746. @pytest.mark.parametrize("categories", ["auto", [["a", "b", "c", "d"]]])
  747. def test_ohe_infrequent_two_levels(kwargs, categories):
  748. """Test that different parameters for combine 'a', 'c', and 'd' into
  749. the infrequent category works as expected."""
  750. X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
  751. ohe = OneHotEncoder(
  752. categories=categories,
  753. handle_unknown="infrequent_if_exist",
  754. sparse_output=False,
  755. **kwargs,
  756. ).fit(X_train)
  757. assert_array_equal(ohe.infrequent_categories_, [["a", "c", "d"]])
  758. X_test = [["b"], ["a"], ["c"], ["d"], ["e"]]
  759. expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]])
  760. X_trans = ohe.transform(X_test)
  761. assert_allclose(expected, X_trans)
  762. expected_inv = [[col] for col in ["b"] + ["infrequent_sklearn"] * 4]
  763. X_inv = ohe.inverse_transform(X_trans)
  764. assert_array_equal(expected_inv, X_inv)
  765. feature_names = ohe.get_feature_names_out()
  766. assert_array_equal(["x0_b", "x0_infrequent_sklearn"], feature_names)
  767. @pytest.mark.parametrize("drop", ["if_binary", "first", ["b"]])
  768. def test_ohe_infrequent_two_levels_drop_frequent(drop):
  769. """Test two levels and dropping the frequent category."""
  770. X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
  771. ohe = OneHotEncoder(
  772. handle_unknown="infrequent_if_exist",
  773. sparse_output=False,
  774. max_categories=2,
  775. drop=drop,
  776. ).fit(X_train)
  777. assert ohe.categories_[0][ohe.drop_idx_[0]] == "b"
  778. X_test = np.array([["b"], ["c"]])
  779. X_trans = ohe.transform(X_test)
  780. assert_allclose([[0], [1]], X_trans)
  781. feature_names = ohe.get_feature_names_out()
  782. assert_array_equal(["x0_infrequent_sklearn"], feature_names)
  783. X_inverse = ohe.inverse_transform(X_trans)
  784. assert_array_equal([["b"], ["infrequent_sklearn"]], X_inverse)
  785. @pytest.mark.parametrize("drop", [["a"], ["d"]])
  786. def test_ohe_infrequent_two_levels_drop_infrequent_errors(drop):
  787. """Test two levels and dropping any infrequent category removes the
  788. whole infrequent category."""
  789. X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
  790. ohe = OneHotEncoder(
  791. handle_unknown="infrequent_if_exist",
  792. sparse_output=False,
  793. max_categories=2,
  794. drop=drop,
  795. )
  796. msg = f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent"
  797. with pytest.raises(ValueError, match=msg):
  798. ohe.fit(X_train)
  799. @pytest.mark.parametrize(
  800. "kwargs",
  801. [
  802. {"max_categories": 3},
  803. {"min_frequency": 6},
  804. {"min_frequency": 9},
  805. {"min_frequency": 0.24},
  806. {"min_frequency": 0.16},
  807. {"max_categories": 3, "min_frequency": 8},
  808. {"max_categories": 4, "min_frequency": 6},
  809. ],
  810. )
  811. def test_ohe_infrequent_three_levels(kwargs):
  812. """Test that different parameters for combing 'a', and 'd' into
  813. the infrequent category works as expected."""
  814. X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
  815. ohe = OneHotEncoder(
  816. handle_unknown="infrequent_if_exist", sparse_output=False, **kwargs
  817. ).fit(X_train)
  818. assert_array_equal(ohe.infrequent_categories_, [["a", "d"]])
  819. X_test = [["b"], ["a"], ["c"], ["d"], ["e"]]
  820. expected = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1], [0, 0, 1]])
  821. X_trans = ohe.transform(X_test)
  822. assert_allclose(expected, X_trans)
  823. expected_inv = [
  824. ["b"],
  825. ["infrequent_sklearn"],
  826. ["c"],
  827. ["infrequent_sklearn"],
  828. ["infrequent_sklearn"],
  829. ]
  830. X_inv = ohe.inverse_transform(X_trans)
  831. assert_array_equal(expected_inv, X_inv)
  832. feature_names = ohe.get_feature_names_out()
  833. assert_array_equal(["x0_b", "x0_c", "x0_infrequent_sklearn"], feature_names)
  834. @pytest.mark.parametrize("drop", ["first", ["b"]])
  835. def test_ohe_infrequent_three_levels_drop_frequent(drop):
  836. """Test three levels and dropping the frequent category."""
  837. X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
  838. ohe = OneHotEncoder(
  839. handle_unknown="infrequent_if_exist",
  840. sparse_output=False,
  841. max_categories=3,
  842. drop=drop,
  843. ).fit(X_train)
  844. X_test = np.array([["b"], ["c"], ["d"]])
  845. assert_allclose([[0, 0], [1, 0], [0, 1]], ohe.transform(X_test))
  846. # Check handle_unknown="ignore"
  847. ohe.set_params(handle_unknown="ignore").fit(X_train)
  848. msg = "Found unknown categories"
  849. with pytest.warns(UserWarning, match=msg):
  850. X_trans = ohe.transform([["b"], ["e"]])
  851. assert_allclose([[0, 0], [0, 0]], X_trans)
  852. @pytest.mark.parametrize("drop", [["a"], ["d"]])
  853. def test_ohe_infrequent_three_levels_drop_infrequent_errors(drop):
  854. """Test three levels and dropping the infrequent category."""
  855. X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
  856. ohe = OneHotEncoder(
  857. handle_unknown="infrequent_if_exist",
  858. sparse_output=False,
  859. max_categories=3,
  860. drop=drop,
  861. )
  862. msg = f"Unable to drop category {drop[0]!r} from feature 0 because it is infrequent"
  863. with pytest.raises(ValueError, match=msg):
  864. ohe.fit(X_train)
  865. def test_ohe_infrequent_handle_unknown_error():
  866. """Test that different parameters for combining 'a', and 'd' into
  867. the infrequent category works as expected."""
  868. X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
  869. ohe = OneHotEncoder(
  870. handle_unknown="error", sparse_output=False, max_categories=3
  871. ).fit(X_train)
  872. assert_array_equal(ohe.infrequent_categories_, [["a", "d"]])
  873. # all categories are known
  874. X_test = [["b"], ["a"], ["c"], ["d"]]
  875. expected = np.array([[1, 0, 0], [0, 0, 1], [0, 1, 0], [0, 0, 1]])
  876. X_trans = ohe.transform(X_test)
  877. assert_allclose(expected, X_trans)
  878. # 'bad' is not known and will error
  879. X_test = [["bad"]]
  880. msg = r"Found unknown categories \['bad'\] in column 0"
  881. with pytest.raises(ValueError, match=msg):
  882. ohe.transform(X_test)
  883. @pytest.mark.parametrize(
  884. "kwargs", [{"max_categories": 3, "min_frequency": 1}, {"min_frequency": 4}]
  885. )
  886. def test_ohe_infrequent_two_levels_user_cats_one_frequent(kwargs):
  887. """'a' is the only frequent category, all other categories are infrequent."""
  888. X_train = np.array([["a"] * 5 + ["e"] * 30], dtype=object).T
  889. ohe = OneHotEncoder(
  890. categories=[["c", "d", "a", "b"]],
  891. sparse_output=False,
  892. handle_unknown="infrequent_if_exist",
  893. **kwargs,
  894. ).fit(X_train)
  895. X_test = [["a"], ["b"], ["c"], ["d"], ["e"]]
  896. expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]])
  897. X_trans = ohe.transform(X_test)
  898. assert_allclose(expected, X_trans)
  899. # 'a' is dropped
  900. drops = ["first", "if_binary", ["a"]]
  901. X_test = [["a"], ["c"]]
  902. for drop in drops:
  903. ohe.set_params(drop=drop).fit(X_train)
  904. assert_allclose([[0], [1]], ohe.transform(X_test))
  905. def test_ohe_infrequent_two_levels_user_cats():
  906. """Test that the order of the categories provided by a user is respected."""
  907. X_train = np.array(
  908. [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
  909. ).T
  910. ohe = OneHotEncoder(
  911. categories=[["c", "d", "a", "b"]],
  912. sparse_output=False,
  913. handle_unknown="infrequent_if_exist",
  914. max_categories=2,
  915. ).fit(X_train)
  916. assert_array_equal(ohe.infrequent_categories_, [["c", "d", "a"]])
  917. X_test = [["b"], ["a"], ["c"], ["d"], ["e"]]
  918. expected = np.array([[1, 0], [0, 1], [0, 1], [0, 1], [0, 1]])
  919. X_trans = ohe.transform(X_test)
  920. assert_allclose(expected, X_trans)
  921. # 'infrequent' is used to denote the infrequent categories for
  922. # `inverse_transform`
  923. expected_inv = [[col] for col in ["b"] + ["infrequent_sklearn"] * 4]
  924. X_inv = ohe.inverse_transform(X_trans)
  925. assert_array_equal(expected_inv, X_inv)
  926. def test_ohe_infrequent_three_levels_user_cats():
  927. """Test that the order of the categories provided by a user is respected.
  928. In this case 'c' is encoded as the first category and 'b' is encoded
  929. as the second one."""
  930. X_train = np.array(
  931. [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
  932. ).T
  933. ohe = OneHotEncoder(
  934. categories=[["c", "d", "b", "a"]],
  935. sparse_output=False,
  936. handle_unknown="infrequent_if_exist",
  937. max_categories=3,
  938. ).fit(X_train)
  939. assert_array_equal(ohe.infrequent_categories_, [["d", "a"]])
  940. X_test = [["b"], ["a"], ["c"], ["d"], ["e"]]
  941. expected = np.array([[0, 1, 0], [0, 0, 1], [1, 0, 0], [0, 0, 1], [0, 0, 1]])
  942. X_trans = ohe.transform(X_test)
  943. assert_allclose(expected, X_trans)
  944. # 'infrequent' is used to denote the infrequent categories for
  945. # `inverse_transform`
  946. expected_inv = [
  947. ["b"],
  948. ["infrequent_sklearn"],
  949. ["c"],
  950. ["infrequent_sklearn"],
  951. ["infrequent_sklearn"],
  952. ]
  953. X_inv = ohe.inverse_transform(X_trans)
  954. assert_array_equal(expected_inv, X_inv)
  955. def test_ohe_infrequent_mixed():
  956. """Test infrequent categories where feature 0 has infrequent categories,
  957. and feature 1 does not."""
  958. # X[:, 0] 1 and 2 are infrequent
  959. # X[:, 1] nothing is infrequent
  960. X = np.c_[[0, 1, 3, 3, 3, 3, 2, 0, 3], [0, 0, 0, 0, 1, 1, 1, 1, 1]]
  961. ohe = OneHotEncoder(max_categories=3, drop="if_binary", sparse_output=False)
  962. ohe.fit(X)
  963. X_test = [[3, 0], [1, 1]]
  964. X_trans = ohe.transform(X_test)
  965. # feature 1 is binary so it drops a category 0
  966. assert_allclose(X_trans, [[0, 1, 0, 0], [0, 0, 1, 1]])
  967. def test_ohe_infrequent_multiple_categories():
  968. """Test infrequent categories with feature matrix with 3 features."""
  969. X = np.c_[
  970. [0, 1, 3, 3, 3, 3, 2, 0, 3],
  971. [0, 0, 5, 1, 1, 10, 5, 5, 0],
  972. [1, 0, 1, 0, 1, 0, 1, 0, 1],
  973. ]
  974. ohe = OneHotEncoder(
  975. categories="auto", max_categories=3, handle_unknown="infrequent_if_exist"
  976. )
  977. # X[:, 0] 1 and 2 are infrequent
  978. # X[:, 1] 1 and 10 are infrequent
  979. # X[:, 2] nothing is infrequent
  980. X_trans = ohe.fit_transform(X).toarray()
  981. assert_array_equal(ohe.infrequent_categories_[0], [1, 2])
  982. assert_array_equal(ohe.infrequent_categories_[1], [1, 10])
  983. assert_array_equal(ohe.infrequent_categories_[2], None)
  984. # 'infrequent' is used to denote the infrequent categories
  985. # For the first column, 1 and 2 have the same frequency. In this case,
  986. # 1 will be chosen to be the feature name because is smaller lexiconically
  987. feature_names = ohe.get_feature_names_out()
  988. assert_array_equal(
  989. [
  990. "x0_0",
  991. "x0_3",
  992. "x0_infrequent_sklearn",
  993. "x1_0",
  994. "x1_5",
  995. "x1_infrequent_sklearn",
  996. "x2_0",
  997. "x2_1",
  998. ],
  999. feature_names,
  1000. )
  1001. expected = [
  1002. [1, 0, 0, 1, 0, 0, 0, 1],
  1003. [0, 0, 1, 1, 0, 0, 1, 0],
  1004. [0, 1, 0, 0, 1, 0, 0, 1],
  1005. [0, 1, 0, 0, 0, 1, 1, 0],
  1006. [0, 1, 0, 0, 0, 1, 0, 1],
  1007. [0, 1, 0, 0, 0, 1, 1, 0],
  1008. [0, 0, 1, 0, 1, 0, 0, 1],
  1009. [1, 0, 0, 0, 1, 0, 1, 0],
  1010. [0, 1, 0, 1, 0, 0, 0, 1],
  1011. ]
  1012. assert_allclose(expected, X_trans)
  1013. X_test = [[3, 1, 2], [4, 0, 3]]
  1014. X_test_trans = ohe.transform(X_test)
  1015. # X[:, 2] does not have an infrequent category, thus it is encoded as all
  1016. # zeros
  1017. expected = [[0, 1, 0, 0, 0, 1, 0, 0], [0, 0, 1, 1, 0, 0, 0, 0]]
  1018. assert_allclose(expected, X_test_trans.toarray())
  1019. X_inv = ohe.inverse_transform(X_test_trans)
  1020. expected_inv = np.array(
  1021. [[3, "infrequent_sklearn", None], ["infrequent_sklearn", 0, None]], dtype=object
  1022. )
  1023. assert_array_equal(expected_inv, X_inv)
  1024. # error for unknown categories
  1025. ohe = OneHotEncoder(
  1026. categories="auto", max_categories=3, handle_unknown="error"
  1027. ).fit(X)
  1028. with pytest.raises(ValueError, match="Found unknown categories"):
  1029. ohe.transform(X_test)
  1030. # only infrequent or known categories
  1031. X_test = [[1, 1, 1], [3, 10, 0]]
  1032. X_test_trans = ohe.transform(X_test)
  1033. expected = [[0, 0, 1, 0, 0, 1, 0, 1], [0, 1, 0, 0, 0, 1, 1, 0]]
  1034. assert_allclose(expected, X_test_trans.toarray())
  1035. X_inv = ohe.inverse_transform(X_test_trans)
  1036. expected_inv = np.array(
  1037. [["infrequent_sklearn", "infrequent_sklearn", 1], [3, "infrequent_sklearn", 0]],
  1038. dtype=object,
  1039. )
  1040. assert_array_equal(expected_inv, X_inv)
  1041. def test_ohe_infrequent_multiple_categories_dtypes():
  1042. """Test infrequent categories with a pandas dataframe with multiple dtypes."""
  1043. pd = pytest.importorskip("pandas")
  1044. X = pd.DataFrame(
  1045. {
  1046. "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"],
  1047. "int": [5, 3, 0, 10, 10, 12, 0, 3, 5],
  1048. },
  1049. columns=["str", "int"],
  1050. )
  1051. ohe = OneHotEncoder(
  1052. categories="auto", max_categories=3, handle_unknown="infrequent_if_exist"
  1053. )
  1054. # X[:, 0] 'a', 'b', 'c' have the same frequency. 'a' and 'b' will be
  1055. # considered infrequent because they are greater
  1056. # X[:, 1] 0, 3, 5, 10 has frequency 2 and 12 has frequency 1.
  1057. # 0, 3, 12 will be considered infrequent
  1058. X_trans = ohe.fit_transform(X).toarray()
  1059. assert_array_equal(ohe.infrequent_categories_[0], ["a", "b"])
  1060. assert_array_equal(ohe.infrequent_categories_[1], [0, 3, 12])
  1061. expected = [
  1062. [0, 0, 1, 1, 0, 0],
  1063. [0, 1, 0, 0, 0, 1],
  1064. [1, 0, 0, 0, 0, 1],
  1065. [0, 1, 0, 0, 1, 0],
  1066. [0, 1, 0, 0, 1, 0],
  1067. [0, 0, 1, 0, 0, 1],
  1068. [1, 0, 0, 0, 0, 1],
  1069. [0, 0, 1, 0, 0, 1],
  1070. [0, 0, 1, 1, 0, 0],
  1071. ]
  1072. assert_allclose(expected, X_trans)
  1073. X_test = pd.DataFrame({"str": ["b", "f"], "int": [14, 12]}, columns=["str", "int"])
  1074. expected = [[0, 0, 1, 0, 0, 1], [0, 1, 0, 0, 0, 1]]
  1075. X_test_trans = ohe.transform(X_test)
  1076. assert_allclose(expected, X_test_trans.toarray())
  1077. X_inv = ohe.inverse_transform(X_test_trans)
  1078. expected_inv = np.array(
  1079. [["infrequent_sklearn", "infrequent_sklearn"], ["f", "infrequent_sklearn"]],
  1080. dtype=object,
  1081. )
  1082. assert_array_equal(expected_inv, X_inv)
  1083. # only infrequent or known categories
  1084. X_test = pd.DataFrame({"str": ["c", "b"], "int": [12, 5]}, columns=["str", "int"])
  1085. X_test_trans = ohe.transform(X_test).toarray()
  1086. expected = [[1, 0, 0, 0, 0, 1], [0, 0, 1, 1, 0, 0]]
  1087. assert_allclose(expected, X_test_trans)
  1088. X_inv = ohe.inverse_transform(X_test_trans)
  1089. expected_inv = np.array(
  1090. [["c", "infrequent_sklearn"], ["infrequent_sklearn", 5]], dtype=object
  1091. )
  1092. assert_array_equal(expected_inv, X_inv)
  1093. @pytest.mark.parametrize("kwargs", [{"min_frequency": 21, "max_categories": 1}])
  1094. def test_ohe_infrequent_one_level_errors(kwargs):
  1095. """All user provided categories are infrequent."""
  1096. X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 2]).T
  1097. ohe = OneHotEncoder(
  1098. handle_unknown="infrequent_if_exist", sparse_output=False, **kwargs
  1099. )
  1100. ohe.fit(X_train)
  1101. X_trans = ohe.transform([["a"]])
  1102. assert_allclose(X_trans, [[1]])
  1103. @pytest.mark.parametrize("kwargs", [{"min_frequency": 2, "max_categories": 3}])
  1104. def test_ohe_infrequent_user_cats_unknown_training_errors(kwargs):
  1105. """All user provided categories are infrequent."""
  1106. X_train = np.array([["e"] * 3], dtype=object).T
  1107. ohe = OneHotEncoder(
  1108. categories=[["c", "d", "a", "b"]],
  1109. sparse_output=False,
  1110. handle_unknown="infrequent_if_exist",
  1111. **kwargs,
  1112. ).fit(X_train)
  1113. X_trans = ohe.transform([["a"], ["e"]])
  1114. assert_allclose(X_trans, [[1], [1]])
  1115. # TODO(1.4): Remove when `sparse` parameter is replaced by `sparse_output`
  1116. def test_one_hot_encoder_sparse_deprecated():
  1117. X = [["Male", 1], ["Female", 3], ["Female", 2]]
  1118. msg = "`sparse` was renamed to `sparse_output`"
  1119. with pytest.warns(FutureWarning, match=msg):
  1120. OneHotEncoder(sparse=False).fit(X)
  1121. # deliberately omit 'OS' as an invalid combo
  1122. @pytest.mark.parametrize(
  1123. "input_dtype, category_dtype", ["OO", "OU", "UO", "UU", "SO", "SU", "SS"]
  1124. )
  1125. @pytest.mark.parametrize("array_type", ["list", "array", "dataframe"])
  1126. def test_encoders_string_categories(input_dtype, category_dtype, array_type):
  1127. """Check that encoding work with object, unicode, and byte string dtypes.
  1128. Non-regression test for:
  1129. https://github.com/scikit-learn/scikit-learn/issues/15616
  1130. https://github.com/scikit-learn/scikit-learn/issues/15726
  1131. https://github.com/scikit-learn/scikit-learn/issues/19677
  1132. """
  1133. X = np.array([["b"], ["a"]], dtype=input_dtype)
  1134. categories = [np.array(["b", "a"], dtype=category_dtype)]
  1135. ohe = OneHotEncoder(categories=categories, sparse_output=False).fit(X)
  1136. X_test = _convert_container(
  1137. [["a"], ["a"], ["b"], ["a"]], array_type, dtype=input_dtype
  1138. )
  1139. X_trans = ohe.transform(X_test)
  1140. expected = np.array([[0, 1], [0, 1], [1, 0], [0, 1]])
  1141. assert_allclose(X_trans, expected)
  1142. oe = OrdinalEncoder(categories=categories).fit(X)
  1143. X_trans = oe.transform(X_test)
  1144. expected = np.array([[1], [1], [0], [1]])
  1145. assert_array_equal(X_trans, expected)
  1146. def test_mixed_string_bytes_categoricals():
  1147. """Check that this mixture of predefined categories and X raises an error.
  1148. Categories defined as bytes can not easily be compared to data that is
  1149. a string.
  1150. """
  1151. # data as unicode
  1152. X = np.array([["b"], ["a"]], dtype="U")
  1153. # predefined categories as bytes
  1154. categories = [np.array(["b", "a"], dtype="S")]
  1155. ohe = OneHotEncoder(categories=categories, sparse_output=False)
  1156. msg = re.escape(
  1157. "In column 0, the predefined categories have type 'bytes' which is incompatible"
  1158. " with values of type 'str_'."
  1159. )
  1160. with pytest.raises(ValueError, match=msg):
  1161. ohe.fit(X)
  1162. @pytest.mark.parametrize("missing_value", [np.nan, None])
  1163. def test_ohe_missing_values_get_feature_names(missing_value):
  1164. # encoder with missing values with object dtypes
  1165. X = np.array([["a", "b", missing_value, "a", missing_value]], dtype=object).T
  1166. ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore").fit(X)
  1167. names = ohe.get_feature_names_out()
  1168. assert_array_equal(names, ["x0_a", "x0_b", f"x0_{missing_value}"])
  1169. def test_ohe_missing_value_support_pandas():
  1170. # check support for pandas with mixed dtypes and missing values
  1171. pd = pytest.importorskip("pandas")
  1172. df = pd.DataFrame(
  1173. {
  1174. "col1": ["dog", "cat", None, "cat"],
  1175. "col2": np.array([3, 0, 4, np.nan], dtype=float),
  1176. },
  1177. columns=["col1", "col2"],
  1178. )
  1179. expected_df_trans = np.array(
  1180. [
  1181. [0, 1, 0, 0, 1, 0, 0],
  1182. [1, 0, 0, 1, 0, 0, 0],
  1183. [0, 0, 1, 0, 0, 1, 0],
  1184. [1, 0, 0, 0, 0, 0, 1],
  1185. ]
  1186. )
  1187. Xtr = check_categorical_onehot(df)
  1188. assert_allclose(Xtr, expected_df_trans)
  1189. @pytest.mark.parametrize("handle_unknown", ["infrequent_if_exist", "ignore"])
  1190. @pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"])
  1191. def test_ohe_missing_value_support_pandas_categorical(pd_nan_type, handle_unknown):
  1192. # checks pandas dataframe with categorical features
  1193. pd = pytest.importorskip("pandas")
  1194. pd_missing_value = pd.NA if pd_nan_type == "pd.NA" else np.nan
  1195. df = pd.DataFrame(
  1196. {
  1197. "col1": pd.Series(["c", "a", pd_missing_value, "b", "a"], dtype="category"),
  1198. }
  1199. )
  1200. expected_df_trans = np.array(
  1201. [
  1202. [0, 0, 1, 0],
  1203. [1, 0, 0, 0],
  1204. [0, 0, 0, 1],
  1205. [0, 1, 0, 0],
  1206. [1, 0, 0, 0],
  1207. ]
  1208. )
  1209. ohe = OneHotEncoder(sparse_output=False, handle_unknown=handle_unknown)
  1210. df_trans = ohe.fit_transform(df)
  1211. assert_allclose(expected_df_trans, df_trans)
  1212. assert len(ohe.categories_) == 1
  1213. assert_array_equal(ohe.categories_[0][:-1], ["a", "b", "c"])
  1214. assert np.isnan(ohe.categories_[0][-1])
  1215. @pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"])
  1216. def test_ohe_drop_first_handle_unknown_ignore_warns(handle_unknown):
  1217. """Check drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
  1218. during transform."""
  1219. X = [["a", 0], ["b", 2], ["b", 1]]
  1220. ohe = OneHotEncoder(
  1221. drop="first", sparse_output=False, handle_unknown=handle_unknown
  1222. )
  1223. X_trans = ohe.fit_transform(X)
  1224. X_expected = np.array(
  1225. [
  1226. [0, 0, 0],
  1227. [1, 0, 1],
  1228. [1, 1, 0],
  1229. ]
  1230. )
  1231. assert_allclose(X_trans, X_expected)
  1232. # Both categories are unknown
  1233. X_test = [["c", 3]]
  1234. X_expected = np.array([[0, 0, 0]])
  1235. warn_msg = (
  1236. r"Found unknown categories in columns \[0, 1\] during "
  1237. "transform. These unknown categories will be encoded as all "
  1238. "zeros"
  1239. )
  1240. with pytest.warns(UserWarning, match=warn_msg):
  1241. X_trans = ohe.transform(X_test)
  1242. assert_allclose(X_trans, X_expected)
  1243. # inverse_transform maps to None
  1244. X_inv = ohe.inverse_transform(X_expected)
  1245. assert_array_equal(X_inv, np.array([["a", 0]], dtype=object))
  1246. @pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"])
  1247. def test_ohe_drop_if_binary_handle_unknown_ignore_warns(handle_unknown):
  1248. """Check drop='if_binary' and handle_unknown='ignore' during transform."""
  1249. X = [["a", 0], ["b", 2], ["b", 1]]
  1250. ohe = OneHotEncoder(
  1251. drop="if_binary", sparse_output=False, handle_unknown=handle_unknown
  1252. )
  1253. X_trans = ohe.fit_transform(X)
  1254. X_expected = np.array(
  1255. [
  1256. [0, 1, 0, 0],
  1257. [1, 0, 0, 1],
  1258. [1, 0, 1, 0],
  1259. ]
  1260. )
  1261. assert_allclose(X_trans, X_expected)
  1262. # Both categories are unknown
  1263. X_test = [["c", 3]]
  1264. X_expected = np.array([[0, 0, 0, 0]])
  1265. warn_msg = (
  1266. r"Found unknown categories in columns \[0, 1\] during "
  1267. "transform. These unknown categories will be encoded as all "
  1268. "zeros"
  1269. )
  1270. with pytest.warns(UserWarning, match=warn_msg):
  1271. X_trans = ohe.transform(X_test)
  1272. assert_allclose(X_trans, X_expected)
  1273. # inverse_transform maps to None
  1274. X_inv = ohe.inverse_transform(X_expected)
  1275. assert_array_equal(X_inv, np.array([["a", None]], dtype=object))
  1276. @pytest.mark.parametrize("handle_unknown", ["ignore", "infrequent_if_exist"])
  1277. def test_ohe_drop_first_explicit_categories(handle_unknown):
  1278. """Check drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
  1279. during fit with categories passed in."""
  1280. X = [["a", 0], ["b", 2], ["b", 1]]
  1281. ohe = OneHotEncoder(
  1282. drop="first",
  1283. sparse_output=False,
  1284. handle_unknown=handle_unknown,
  1285. categories=[["b", "a"], [1, 2]],
  1286. )
  1287. ohe.fit(X)
  1288. X_test = [["c", 1]]
  1289. X_expected = np.array([[0, 0]])
  1290. warn_msg = (
  1291. r"Found unknown categories in columns \[0\] during transform. "
  1292. r"These unknown categories will be encoded as all zeros"
  1293. )
  1294. with pytest.warns(UserWarning, match=warn_msg):
  1295. X_trans = ohe.transform(X_test)
  1296. assert_allclose(X_trans, X_expected)
  1297. def test_ohe_more_informative_error_message():
  1298. """Raise informative error message when pandas output and sparse_output=True."""
  1299. pd = pytest.importorskip("pandas")
  1300. df = pd.DataFrame({"a": [1, 2, 3], "b": ["z", "b", "b"]}, columns=["a", "b"])
  1301. ohe = OneHotEncoder(sparse_output=True)
  1302. ohe.set_output(transform="pandas")
  1303. msg = (
  1304. "Pandas output does not support sparse data. Set "
  1305. "sparse_output=False to output pandas DataFrames or disable pandas output"
  1306. )
  1307. with pytest.raises(ValueError, match=msg):
  1308. ohe.fit_transform(df)
  1309. ohe.fit(df)
  1310. with pytest.raises(ValueError, match=msg):
  1311. ohe.transform(df)
  1312. def test_ordinal_encoder_passthrough_missing_values_float_errors_dtype():
  1313. """Test ordinal encoder with nan passthrough fails when dtype=np.int32."""
  1314. X = np.array([[np.nan, 3.0, 1.0, 3.0]]).T
  1315. oe = OrdinalEncoder(dtype=np.int32)
  1316. msg = (
  1317. r"There are missing values in features \[0\]. For OrdinalEncoder "
  1318. f"to encode missing values with dtype: {np.int32}"
  1319. )
  1320. with pytest.raises(ValueError, match=msg):
  1321. oe.fit(X)
  1322. @pytest.mark.parametrize("encoded_missing_value", [np.nan, -2])
  1323. def test_ordinal_encoder_passthrough_missing_values_float(encoded_missing_value):
  1324. """Test ordinal encoder with nan on float dtypes."""
  1325. X = np.array([[np.nan, 3.0, 1.0, 3.0]], dtype=np.float64).T
  1326. oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(X)
  1327. assert len(oe.categories_) == 1
  1328. assert_allclose(oe.categories_[0], [1.0, 3.0, np.nan])
  1329. X_trans = oe.transform(X)
  1330. assert_allclose(X_trans, [[encoded_missing_value], [1.0], [0.0], [1.0]])
  1331. X_inverse = oe.inverse_transform(X_trans)
  1332. assert_allclose(X_inverse, X)
  1333. @pytest.mark.parametrize("pd_nan_type", ["pd.NA", "np.nan"])
  1334. @pytest.mark.parametrize("encoded_missing_value", [np.nan, -2])
  1335. def test_ordinal_encoder_missing_value_support_pandas_categorical(
  1336. pd_nan_type, encoded_missing_value
  1337. ):
  1338. """Check ordinal encoder is compatible with pandas."""
  1339. # checks pandas dataframe with categorical features
  1340. pd = pytest.importorskip("pandas")
  1341. pd_missing_value = pd.NA if pd_nan_type == "pd.NA" else np.nan
  1342. df = pd.DataFrame(
  1343. {
  1344. "col1": pd.Series(["c", "a", pd_missing_value, "b", "a"], dtype="category"),
  1345. }
  1346. )
  1347. oe = OrdinalEncoder(encoded_missing_value=encoded_missing_value).fit(df)
  1348. assert len(oe.categories_) == 1
  1349. assert_array_equal(oe.categories_[0][:3], ["a", "b", "c"])
  1350. assert np.isnan(oe.categories_[0][-1])
  1351. df_trans = oe.transform(df)
  1352. assert_allclose(df_trans, [[2.0], [0.0], [encoded_missing_value], [1.0], [0.0]])
  1353. X_inverse = oe.inverse_transform(df_trans)
  1354. assert X_inverse.shape == (5, 1)
  1355. assert_array_equal(X_inverse[:2, 0], ["c", "a"])
  1356. assert_array_equal(X_inverse[3:, 0], ["b", "a"])
  1357. assert np.isnan(X_inverse[2, 0])
  1358. @pytest.mark.parametrize(
  1359. "X, X2, cats, cat_dtype",
  1360. [
  1361. (
  1362. (
  1363. np.array([["a", np.nan]], dtype=object).T,
  1364. np.array([["a", "b"]], dtype=object).T,
  1365. [np.array(["a", np.nan, "d"], dtype=object)],
  1366. np.object_,
  1367. )
  1368. ),
  1369. (
  1370. (
  1371. np.array([["a", np.nan]], dtype=object).T,
  1372. np.array([["a", "b"]], dtype=object).T,
  1373. [np.array(["a", np.nan, "d"], dtype=object)],
  1374. np.object_,
  1375. )
  1376. ),
  1377. (
  1378. (
  1379. np.array([[2.0, np.nan]], dtype=np.float64).T,
  1380. np.array([[3.0]], dtype=np.float64).T,
  1381. [np.array([2.0, 4.0, np.nan])],
  1382. np.float64,
  1383. )
  1384. ),
  1385. ],
  1386. ids=[
  1387. "object-None-missing-value",
  1388. "object-nan-missing_value",
  1389. "numeric-missing-value",
  1390. ],
  1391. )
  1392. def test_ordinal_encoder_specified_categories_missing_passthrough(
  1393. X, X2, cats, cat_dtype
  1394. ):
  1395. """Test ordinal encoder for specified categories."""
  1396. oe = OrdinalEncoder(categories=cats)
  1397. exp = np.array([[0.0], [np.nan]])
  1398. assert_array_equal(oe.fit_transform(X), exp)
  1399. # manually specified categories should have same dtype as
  1400. # the data when coerced from lists
  1401. assert oe.categories_[0].dtype == cat_dtype
  1402. # when specifying categories manually, unknown categories should already
  1403. # raise when fitting
  1404. oe = OrdinalEncoder(categories=cats)
  1405. with pytest.raises(ValueError, match="Found unknown categories"):
  1406. oe.fit(X2)
  1407. @pytest.mark.parametrize(
  1408. "X, expected_X_trans, X_test",
  1409. [
  1410. (
  1411. np.array([[1.0, np.nan, 3.0]]).T,
  1412. np.array([[0.0, np.nan, 1.0]]).T,
  1413. np.array([[4.0]]),
  1414. ),
  1415. (
  1416. np.array([[1.0, 4.0, 3.0]]).T,
  1417. np.array([[0.0, 2.0, 1.0]]).T,
  1418. np.array([[np.nan]]),
  1419. ),
  1420. (
  1421. np.array([["c", np.nan, "b"]], dtype=object).T,
  1422. np.array([[1.0, np.nan, 0.0]]).T,
  1423. np.array([["d"]], dtype=object),
  1424. ),
  1425. (
  1426. np.array([["c", "a", "b"]], dtype=object).T,
  1427. np.array([[2.0, 0.0, 1.0]]).T,
  1428. np.array([[np.nan]], dtype=object),
  1429. ),
  1430. ],
  1431. )
  1432. def test_ordinal_encoder_handle_missing_and_unknown(X, expected_X_trans, X_test):
  1433. """Test the interaction between missing values and handle_unknown"""
  1434. oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
  1435. X_trans = oe.fit_transform(X)
  1436. assert_allclose(X_trans, expected_X_trans)
  1437. assert_allclose(oe.transform(X_test), [[-1.0]])
  1438. def test_ordinal_encoder_sparse():
  1439. """Check that we raise proper error with sparse input in OrdinalEncoder.
  1440. Non-regression test for:
  1441. https://github.com/scikit-learn/scikit-learn/issues/19878
  1442. """
  1443. X = np.array([[3, 2, 1], [0, 1, 1]])
  1444. X_sparse = sparse.csr_matrix(X)
  1445. encoder = OrdinalEncoder()
  1446. err_msg = "A sparse matrix was passed, but dense data is required"
  1447. with pytest.raises(TypeError, match=err_msg):
  1448. encoder.fit(X_sparse)
  1449. with pytest.raises(TypeError, match=err_msg):
  1450. encoder.fit_transform(X_sparse)
  1451. X_trans = encoder.fit_transform(X)
  1452. X_trans_sparse = sparse.csr_matrix(X_trans)
  1453. with pytest.raises(TypeError, match=err_msg):
  1454. encoder.inverse_transform(X_trans_sparse)
  1455. def test_ordinal_encoder_fit_with_unseen_category():
  1456. """Check OrdinalEncoder.fit works with unseen category when
  1457. `handle_unknown="use_encoded_value"`.
  1458. Non-regression test for:
  1459. https://github.com/scikit-learn/scikit-learn/issues/19872
  1460. """
  1461. X = np.array([0, 0, 1, 0, 2, 5])[:, np.newaxis]
  1462. oe = OrdinalEncoder(
  1463. categories=[[-1, 0, 1]], handle_unknown="use_encoded_value", unknown_value=-999
  1464. )
  1465. oe.fit(X)
  1466. oe = OrdinalEncoder(categories=[[-1, 0, 1]], handle_unknown="error")
  1467. with pytest.raises(ValueError, match="Found unknown categories"):
  1468. oe.fit(X)
  1469. @pytest.mark.parametrize(
  1470. "X_train",
  1471. [
  1472. [["AA", "B"]],
  1473. np.array([["AA", "B"]], dtype="O"),
  1474. np.array([["AA", "B"]], dtype="U"),
  1475. ],
  1476. )
  1477. @pytest.mark.parametrize(
  1478. "X_test",
  1479. [
  1480. [["A", "B"]],
  1481. np.array([["A", "B"]], dtype="O"),
  1482. np.array([["A", "B"]], dtype="U"),
  1483. ],
  1484. )
  1485. def test_ordinal_encoder_handle_unknown_string_dtypes(X_train, X_test):
  1486. """Checks that `OrdinalEncoder` transforms string dtypes.
  1487. Non-regression test for:
  1488. https://github.com/scikit-learn/scikit-learn/issues/19872
  1489. """
  1490. enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-9)
  1491. enc.fit(X_train)
  1492. X_trans = enc.transform(X_test)
  1493. assert_allclose(X_trans, [[-9, 0]])
  1494. def test_ordinal_encoder_python_integer():
  1495. """Check that `OrdinalEncoder` accepts Python integers that are potentially
  1496. larger than 64 bits.
  1497. Non-regression test for:
  1498. https://github.com/scikit-learn/scikit-learn/issues/20721
  1499. """
  1500. X = np.array(
  1501. [
  1502. 44253463435747313673,
  1503. 9867966753463435747313673,
  1504. 44253462342215747313673,
  1505. 442534634357764313673,
  1506. ]
  1507. ).reshape(-1, 1)
  1508. encoder = OrdinalEncoder().fit(X)
  1509. assert_array_equal(encoder.categories_, np.sort(X, axis=0).T)
  1510. X_trans = encoder.transform(X)
  1511. assert_array_equal(X_trans, [[0], [3], [2], [1]])
  1512. def test_ordinal_encoder_features_names_out_pandas():
  1513. """Check feature names out is same as the input."""
  1514. pd = pytest.importorskip("pandas")
  1515. names = ["b", "c", "a"]
  1516. X = pd.DataFrame([[1, 2, 3]], columns=names)
  1517. enc = OrdinalEncoder().fit(X)
  1518. feature_names_out = enc.get_feature_names_out()
  1519. assert_array_equal(names, feature_names_out)
  1520. def test_ordinal_encoder_unknown_missing_interaction():
  1521. """Check interactions between encode_unknown and missing value encoding."""
  1522. X = np.array([["a"], ["b"], [np.nan]], dtype=object)
  1523. oe = OrdinalEncoder(
  1524. handle_unknown="use_encoded_value",
  1525. unknown_value=np.nan,
  1526. encoded_missing_value=-3,
  1527. ).fit(X)
  1528. X_trans = oe.transform(X)
  1529. assert_allclose(X_trans, [[0], [1], [-3]])
  1530. # "c" is unknown and is mapped to np.nan
  1531. # "None" is a missing value and is set to -3
  1532. X_test = np.array([["c"], [np.nan]], dtype=object)
  1533. X_test_trans = oe.transform(X_test)
  1534. assert_allclose(X_test_trans, [[np.nan], [-3]])
  1535. # Non-regression test for #24082
  1536. X_roundtrip = oe.inverse_transform(X_test_trans)
  1537. # np.nan is unknown so it maps to None
  1538. assert X_roundtrip[0][0] is None
  1539. # -3 is the encoded missing value so it maps back to nan
  1540. assert np.isnan(X_roundtrip[1][0])
  1541. @pytest.mark.parametrize("with_pandas", [True, False])
  1542. def test_ordinal_encoder_encoded_missing_value_error(with_pandas):
  1543. """Check OrdinalEncoder errors when encoded_missing_value is used by
  1544. an known category."""
  1545. X = np.array([["a", "dog"], ["b", "cat"], ["c", np.nan]], dtype=object)
  1546. # The 0-th feature has no missing values so it is not included in the list of
  1547. # features
  1548. error_msg = (
  1549. r"encoded_missing_value \(1\) is already used to encode a known category "
  1550. r"in features: "
  1551. )
  1552. if with_pandas:
  1553. pd = pytest.importorskip("pandas")
  1554. X = pd.DataFrame(X, columns=["letter", "pet"])
  1555. error_msg = error_msg + r"\['pet'\]"
  1556. else:
  1557. error_msg = error_msg + r"\[1\]"
  1558. oe = OrdinalEncoder(encoded_missing_value=1)
  1559. with pytest.raises(ValueError, match=error_msg):
  1560. oe.fit(X)
  1561. @pytest.mark.parametrize(
  1562. "X_train, X_test_trans_expected, X_roundtrip_expected",
  1563. [
  1564. (
  1565. # missing value is not in training set
  1566. # inverse transform will considering encoded nan as unknown
  1567. np.array([["a"], ["1"]], dtype=object),
  1568. [[0], [np.nan], [np.nan]],
  1569. np.asarray([["1"], [None], [None]], dtype=object),
  1570. ),
  1571. (
  1572. # missing value in training set,
  1573. # inverse transform will considering encoded nan as missing
  1574. np.array([[np.nan], ["1"], ["a"]], dtype=object),
  1575. [[0], [np.nan], [np.nan]],
  1576. np.asarray([["1"], [np.nan], [np.nan]], dtype=object),
  1577. ),
  1578. ],
  1579. )
  1580. def test_ordinal_encoder_unknown_missing_interaction_both_nan(
  1581. X_train, X_test_trans_expected, X_roundtrip_expected
  1582. ):
  1583. """Check transform when unknown_value and encoded_missing_value is nan.
  1584. Non-regression test for #24082.
  1585. """
  1586. oe = OrdinalEncoder(
  1587. handle_unknown="use_encoded_value",
  1588. unknown_value=np.nan,
  1589. encoded_missing_value=np.nan,
  1590. ).fit(X_train)
  1591. X_test = np.array([["1"], [np.nan], ["b"]])
  1592. X_test_trans = oe.transform(X_test)
  1593. # both nan and unknown are encoded as nan
  1594. assert_allclose(X_test_trans, X_test_trans_expected)
  1595. X_roundtrip = oe.inverse_transform(X_test_trans)
  1596. n_samples = X_roundtrip_expected.shape[0]
  1597. for i in range(n_samples):
  1598. expected_val = X_roundtrip_expected[i, 0]
  1599. val = X_roundtrip[i, 0]
  1600. if expected_val is None:
  1601. assert val is None
  1602. elif is_scalar_nan(expected_val):
  1603. assert np.isnan(val)
  1604. else:
  1605. assert val == expected_val
  1606. def test_one_hot_encoder_set_output():
  1607. """Check OneHotEncoder works with set_output."""
  1608. pd = pytest.importorskip("pandas")
  1609. X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})
  1610. ohe = OneHotEncoder()
  1611. ohe.set_output(transform="pandas")
  1612. match = "Pandas output does not support sparse data"
  1613. with pytest.raises(ValueError, match=match):
  1614. ohe.fit_transform(X_df)
  1615. ohe_default = OneHotEncoder(sparse_output=False).set_output(transform="default")
  1616. ohe_pandas = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
  1617. X_default = ohe_default.fit_transform(X_df)
  1618. X_pandas = ohe_pandas.fit_transform(X_df)
  1619. assert_allclose(X_pandas.to_numpy(), X_default)
  1620. assert_array_equal(ohe_pandas.get_feature_names_out(), X_pandas.columns)
  1621. def test_ordinal_set_output():
  1622. """Check OrdinalEncoder works with set_output."""
  1623. pd = pytest.importorskip("pandas")
  1624. X_df = pd.DataFrame({"A": ["a", "b"], "B": [1, 2]})
  1625. ord_default = OrdinalEncoder().set_output(transform="default")
  1626. ord_pandas = OrdinalEncoder().set_output(transform="pandas")
  1627. X_default = ord_default.fit_transform(X_df)
  1628. X_pandas = ord_pandas.fit_transform(X_df)
  1629. assert_allclose(X_pandas.to_numpy(), X_default)
  1630. assert_array_equal(ord_pandas.get_feature_names_out(), X_pandas.columns)
  1631. def test_predefined_categories_dtype():
  1632. """Check that the categories_ dtype is `object` for string categories
  1633. Regression test for gh-25171.
  1634. """
  1635. categories = [["as", "mmas", "eas", "ras", "acs"], ["1", "2"]]
  1636. enc = OneHotEncoder(categories=categories)
  1637. enc.fit([["as", "1"]])
  1638. assert len(categories) == len(enc.categories_)
  1639. for n, cat in enumerate(enc.categories_):
  1640. assert cat.dtype == object
  1641. assert_array_equal(categories[n], cat)
  1642. def test_ordinal_encoder_missing_unknown_encoding_max():
  1643. """Check missing value or unknown encoding can equal the cardinality."""
  1644. X = np.array([["dog"], ["cat"], [np.nan]], dtype=object)
  1645. X_trans = OrdinalEncoder(encoded_missing_value=2).fit_transform(X)
  1646. assert_allclose(X_trans, [[1], [0], [2]])
  1647. enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=2).fit(X)
  1648. X_test = np.array([["snake"]])
  1649. X_trans = enc.transform(X_test)
  1650. assert_allclose(X_trans, [[2]])
  1651. def test_drop_idx_infrequent_categories():
  1652. """Check drop_idx is defined correctly with infrequent categories.
  1653. Non-regression test for gh-25550.
  1654. """
  1655. X = np.array(
  1656. [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
  1657. ).T
  1658. ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="first").fit(X)
  1659. assert_array_equal(
  1660. ohe.get_feature_names_out(), ["x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"]
  1661. )
  1662. assert ohe.categories_[0][ohe.drop_idx_[0]] == "b"
  1663. X = np.array([["a"] * 2 + ["b"] * 2 + ["c"] * 10], dtype=object).T
  1664. ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop="if_binary").fit(X)
  1665. assert_array_equal(ohe.get_feature_names_out(), ["x0_infrequent_sklearn"])
  1666. assert ohe.categories_[0][ohe.drop_idx_[0]] == "c"
  1667. X = np.array(
  1668. [["a"] * 2 + ["b"] * 4 + ["c"] * 4 + ["d"] * 4 + ["e"] * 4], dtype=object
  1669. ).T
  1670. ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=["d"]).fit(X)
  1671. assert_array_equal(
  1672. ohe.get_feature_names_out(), ["x0_b", "x0_c", "x0_e", "x0_infrequent_sklearn"]
  1673. )
  1674. assert ohe.categories_[0][ohe.drop_idx_[0]] == "d"
  1675. ohe = OneHotEncoder(min_frequency=4, sparse_output=False, drop=None).fit(X)
  1676. assert_array_equal(
  1677. ohe.get_feature_names_out(),
  1678. ["x0_b", "x0_c", "x0_d", "x0_e", "x0_infrequent_sklearn"],
  1679. )
  1680. assert ohe.drop_idx_ is None
  1681. @pytest.mark.parametrize(
  1682. "kwargs",
  1683. [
  1684. {"max_categories": 3},
  1685. {"min_frequency": 6},
  1686. {"min_frequency": 9},
  1687. {"min_frequency": 0.24},
  1688. {"min_frequency": 0.16},
  1689. {"max_categories": 3, "min_frequency": 8},
  1690. {"max_categories": 4, "min_frequency": 6},
  1691. ],
  1692. )
  1693. def test_ordinal_encoder_infrequent_three_levels(kwargs):
  1694. """Test parameters for grouping 'a', and 'd' into the infrequent category."""
  1695. X_train = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3]).T
  1696. ordinal = OrdinalEncoder(
  1697. handle_unknown="use_encoded_value", unknown_value=-1, **kwargs
  1698. ).fit(X_train)
  1699. assert_array_equal(ordinal.categories_, [["a", "b", "c", "d"]])
  1700. assert_array_equal(ordinal.infrequent_categories_, [["a", "d"]])
  1701. X_test = [["a"], ["b"], ["c"], ["d"], ["z"]]
  1702. expected_trans = [[2], [0], [1], [2], [-1]]
  1703. X_trans = ordinal.transform(X_test)
  1704. assert_allclose(X_trans, expected_trans)
  1705. X_inverse = ordinal.inverse_transform(X_trans)
  1706. expected_inverse = [
  1707. ["infrequent_sklearn"],
  1708. ["b"],
  1709. ["c"],
  1710. ["infrequent_sklearn"],
  1711. [None],
  1712. ]
  1713. assert_array_equal(X_inverse, expected_inverse)
  1714. def test_ordinal_encoder_infrequent_three_levels_user_cats():
  1715. """Test that the order of the categories provided by a user is respected.
  1716. In this case 'c' is encoded as the first category and 'b' is encoded
  1717. as the second one.
  1718. """
  1719. X_train = np.array(
  1720. [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
  1721. ).T
  1722. ordinal = OrdinalEncoder(
  1723. categories=[["c", "d", "b", "a"]],
  1724. max_categories=3,
  1725. handle_unknown="use_encoded_value",
  1726. unknown_value=-1,
  1727. ).fit(X_train)
  1728. assert_array_equal(ordinal.categories_, [["c", "d", "b", "a"]])
  1729. assert_array_equal(ordinal.infrequent_categories_, [["d", "a"]])
  1730. X_test = [["a"], ["b"], ["c"], ["d"], ["z"]]
  1731. expected_trans = [[2], [1], [0], [2], [-1]]
  1732. X_trans = ordinal.transform(X_test)
  1733. assert_allclose(X_trans, expected_trans)
  1734. X_inverse = ordinal.inverse_transform(X_trans)
  1735. expected_inverse = [
  1736. ["infrequent_sklearn"],
  1737. ["b"],
  1738. ["c"],
  1739. ["infrequent_sklearn"],
  1740. [None],
  1741. ]
  1742. assert_array_equal(X_inverse, expected_inverse)
  1743. def test_ordinal_encoder_infrequent_mixed():
  1744. """Test when feature 0 has infrequent categories and feature 1 does not."""
  1745. X = np.column_stack(([0, 1, 3, 3, 3, 3, 2, 0, 3], [0, 0, 0, 0, 1, 1, 1, 1, 1]))
  1746. ordinal = OrdinalEncoder(max_categories=3).fit(X)
  1747. assert_array_equal(ordinal.infrequent_categories_[0], [1, 2])
  1748. assert ordinal.infrequent_categories_[1] is None
  1749. X_test = [[3, 0], [1, 1]]
  1750. expected_trans = [[1, 0], [2, 1]]
  1751. X_trans = ordinal.transform(X_test)
  1752. assert_allclose(X_trans, expected_trans)
  1753. X_inverse = ordinal.inverse_transform(X_trans)
  1754. expected_inverse = np.array([[3, 0], ["infrequent_sklearn", 1]], dtype=object)
  1755. assert_array_equal(X_inverse, expected_inverse)
  1756. def test_ordinal_encoder_infrequent_multiple_categories_dtypes():
  1757. """Test infrequent categories with a pandas DataFrame with multiple dtypes."""
  1758. pd = pytest.importorskip("pandas")
  1759. categorical_dtype = pd.CategoricalDtype(["bird", "cat", "dog", "snake"])
  1760. X = pd.DataFrame(
  1761. {
  1762. "str": ["a", "f", "c", "f", "f", "a", "c", "b", "b"],
  1763. "int": [5, 3, 0, 10, 10, 12, 0, 3, 5],
  1764. "categorical": pd.Series(
  1765. ["dog"] * 4 + ["cat"] * 3 + ["snake"] + ["bird"],
  1766. dtype=categorical_dtype,
  1767. ),
  1768. },
  1769. columns=["str", "int", "categorical"],
  1770. )
  1771. ordinal = OrdinalEncoder(max_categories=3).fit(X)
  1772. # X[:, 0] 'a', 'b', 'c' have the same frequency. 'a' and 'b' will be
  1773. # considered infrequent because they appear first when sorted
  1774. # X[:, 1] 0, 3, 5, 10 has frequency 2 and 12 has frequency 1.
  1775. # 0, 3, 12 will be considered infrequent because they appear first when
  1776. # sorted.
  1777. # X[:, 2] "snake" and "bird" or infrequent
  1778. assert_array_equal(ordinal.infrequent_categories_[0], ["a", "b"])
  1779. assert_array_equal(ordinal.infrequent_categories_[1], [0, 3, 12])
  1780. assert_array_equal(ordinal.infrequent_categories_[2], ["bird", "snake"])
  1781. X_test = pd.DataFrame(
  1782. {
  1783. "str": ["a", "b", "f", "c"],
  1784. "int": [12, 0, 10, 5],
  1785. "categorical": pd.Series(
  1786. ["cat"] + ["snake"] + ["bird"] + ["dog"],
  1787. dtype=categorical_dtype,
  1788. ),
  1789. },
  1790. columns=["str", "int", "categorical"],
  1791. )
  1792. expected_trans = [[2, 2, 0], [2, 2, 2], [1, 1, 2], [0, 0, 1]]
  1793. X_trans = ordinal.transform(X_test)
  1794. assert_allclose(X_trans, expected_trans)
  1795. def test_ordinal_encoder_infrequent_custom_mapping():
  1796. """Check behavior of unknown_value and encoded_missing_value with infrequent."""
  1797. X_train = np.array(
  1798. [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]], dtype=object
  1799. ).T
  1800. ordinal = OrdinalEncoder(
  1801. handle_unknown="use_encoded_value",
  1802. unknown_value=2,
  1803. max_categories=2,
  1804. encoded_missing_value=3,
  1805. ).fit(X_train)
  1806. assert_array_equal(ordinal.infrequent_categories_, [["a", "c", "d"]])
  1807. X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object)
  1808. expected_trans = [[1], [0], [1], [1], [2], [3]]
  1809. X_trans = ordinal.transform(X_test)
  1810. assert_allclose(X_trans, expected_trans)
  1811. @pytest.mark.parametrize(
  1812. "kwargs",
  1813. [
  1814. {"max_categories": 6},
  1815. {"min_frequency": 2},
  1816. ],
  1817. )
  1818. def test_ordinal_encoder_all_frequent(kwargs):
  1819. """All categories are considered frequent have same encoding as default encoder."""
  1820. X_train = np.array(
  1821. [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
  1822. ).T
  1823. adjusted_encoder = OrdinalEncoder(
  1824. **kwargs, handle_unknown="use_encoded_value", unknown_value=-1
  1825. ).fit(X_train)
  1826. default_encoder = OrdinalEncoder(
  1827. handle_unknown="use_encoded_value", unknown_value=-1
  1828. ).fit(X_train)
  1829. X_test = [["a"], ["b"], ["c"], ["d"], ["e"]]
  1830. assert_allclose(
  1831. adjusted_encoder.transform(X_test), default_encoder.transform(X_test)
  1832. )
  1833. @pytest.mark.parametrize(
  1834. "kwargs",
  1835. [
  1836. {"max_categories": 1},
  1837. {"min_frequency": 100},
  1838. ],
  1839. )
  1840. def test_ordinal_encoder_all_infrequent(kwargs):
  1841. """When all categories are infrequent, they are all encoded as zero."""
  1842. X_train = np.array(
  1843. [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object
  1844. ).T
  1845. encoder = OrdinalEncoder(
  1846. **kwargs, handle_unknown="use_encoded_value", unknown_value=-1
  1847. ).fit(X_train)
  1848. X_test = [["a"], ["b"], ["c"], ["d"], ["e"]]
  1849. assert_allclose(encoder.transform(X_test), [[0], [0], [0], [0], [-1]])
  1850. def test_ordinal_encoder_missing_appears_frequent():
  1851. """Check behavior when missing value appears frequently."""
  1852. X = np.array(
  1853. [[np.nan] * 20 + ["dog"] * 10 + ["cat"] * 5 + ["snake"] + ["deer"]],
  1854. dtype=object,
  1855. ).T
  1856. ordinal = OrdinalEncoder(max_categories=3).fit(X)
  1857. X_test = np.array([["snake", "cat", "dog", np.nan]], dtype=object).T
  1858. X_trans = ordinal.transform(X_test)
  1859. assert_allclose(X_trans, [[2], [0], [1], [np.nan]])
  1860. def test_ordinal_encoder_missing_appears_infrequent():
  1861. """Check behavior when missing value appears infrequently."""
  1862. # feature 0 has infrequent categories
  1863. # feature 1 has no infrequent categories
  1864. X = np.array(
  1865. [
  1866. [np.nan] + ["dog"] * 10 + ["cat"] * 5 + ["snake"] + ["deer"],
  1867. ["red"] * 9 + ["green"] * 9,
  1868. ],
  1869. dtype=object,
  1870. ).T
  1871. ordinal = OrdinalEncoder(min_frequency=4).fit(X)
  1872. X_test = np.array(
  1873. [
  1874. ["snake", "red"],
  1875. ["deer", "green"],
  1876. [np.nan, "green"],
  1877. ["dog", "green"],
  1878. ["cat", "red"],
  1879. ],
  1880. dtype=object,
  1881. )
  1882. X_trans = ordinal.transform(X_test)
  1883. assert_allclose(X_trans, [[2, 1], [2, 0], [np.nan, 0], [1, 0], [0, 1]])