_encoders.py 66 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687
  1. # Authors: Andreas Mueller <amueller@ais.uni-bonn.de>
  2. # Joris Van den Bossche <jorisvandenbossche@gmail.com>
  3. # License: BSD 3 clause
  4. import numbers
  5. import warnings
  6. from numbers import Integral
  7. import numpy as np
  8. from scipy import sparse
  9. from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context
  10. from ..utils import _safe_indexing, check_array, is_scalar_nan
  11. from ..utils._encode import _check_unknown, _encode, _get_counts, _unique
  12. from ..utils._mask import _get_mask
  13. from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions
  14. from ..utils._set_output import _get_output_config
  15. from ..utils.validation import _check_feature_names_in, check_is_fitted
  16. __all__ = ["OneHotEncoder", "OrdinalEncoder"]
  17. class _BaseEncoder(TransformerMixin, BaseEstimator):
  18. """
  19. Base class for encoders that includes the code to categorize and
  20. transform the input features.
  21. """
  22. def _check_X(self, X, force_all_finite=True):
  23. """
  24. Perform custom check_array:
  25. - convert list of strings to object dtype
  26. - check for missing values for object dtype data (check_array does
  27. not do that)
  28. - return list of features (arrays): this list of features is
  29. constructed feature by feature to preserve the data types
  30. of pandas DataFrame columns, as otherwise information is lost
  31. and cannot be used, e.g. for the `categories_` attribute.
  32. """
  33. if not (hasattr(X, "iloc") and getattr(X, "ndim", 0) == 2):
  34. # if not a dataframe, do normal check_array validation
  35. X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite)
  36. if not hasattr(X, "dtype") and np.issubdtype(X_temp.dtype, np.str_):
  37. X = check_array(X, dtype=object, force_all_finite=force_all_finite)
  38. else:
  39. X = X_temp
  40. needs_validation = False
  41. else:
  42. # pandas dataframe, do validation later column by column, in order
  43. # to keep the dtype information to be used in the encoder.
  44. needs_validation = force_all_finite
  45. n_samples, n_features = X.shape
  46. X_columns = []
  47. for i in range(n_features):
  48. Xi = _safe_indexing(X, indices=i, axis=1)
  49. Xi = check_array(
  50. Xi, ensure_2d=False, dtype=None, force_all_finite=needs_validation
  51. )
  52. X_columns.append(Xi)
  53. return X_columns, n_samples, n_features
  54. def _fit(
  55. self,
  56. X,
  57. handle_unknown="error",
  58. force_all_finite=True,
  59. return_counts=False,
  60. return_and_ignore_missing_for_infrequent=False,
  61. ):
  62. self._check_infrequent_enabled()
  63. self._check_n_features(X, reset=True)
  64. self._check_feature_names(X, reset=True)
  65. X_list, n_samples, n_features = self._check_X(
  66. X, force_all_finite=force_all_finite
  67. )
  68. self.n_features_in_ = n_features
  69. if self.categories != "auto":
  70. if len(self.categories) != n_features:
  71. raise ValueError(
  72. "Shape mismatch: if categories is an array,"
  73. " it has to be of shape (n_features,)."
  74. )
  75. self.categories_ = []
  76. category_counts = []
  77. compute_counts = return_counts or self._infrequent_enabled
  78. for i in range(n_features):
  79. Xi = X_list[i]
  80. if self.categories == "auto":
  81. result = _unique(Xi, return_counts=compute_counts)
  82. if compute_counts:
  83. cats, counts = result
  84. category_counts.append(counts)
  85. else:
  86. cats = result
  87. else:
  88. if np.issubdtype(Xi.dtype, np.str_):
  89. # Always convert string categories to objects to avoid
  90. # unexpected string truncation for longer category labels
  91. # passed in the constructor.
  92. Xi_dtype = object
  93. else:
  94. Xi_dtype = Xi.dtype
  95. cats = np.array(self.categories[i], dtype=Xi_dtype)
  96. if (
  97. cats.dtype == object
  98. and isinstance(cats[0], bytes)
  99. and Xi.dtype.kind != "S"
  100. ):
  101. msg = (
  102. f"In column {i}, the predefined categories have type 'bytes'"
  103. " which is incompatible with values of type"
  104. f" '{type(Xi[0]).__name__}'."
  105. )
  106. raise ValueError(msg)
  107. if Xi.dtype.kind not in "OUS":
  108. sorted_cats = np.sort(cats)
  109. error_msg = (
  110. "Unsorted categories are not supported for numerical categories"
  111. )
  112. # if there are nans, nan should be the last element
  113. stop_idx = -1 if np.isnan(sorted_cats[-1]) else None
  114. if np.any(sorted_cats[:stop_idx] != cats[:stop_idx]) or (
  115. np.isnan(sorted_cats[-1]) and not np.isnan(sorted_cats[-1])
  116. ):
  117. raise ValueError(error_msg)
  118. if handle_unknown == "error":
  119. diff = _check_unknown(Xi, cats)
  120. if diff:
  121. msg = (
  122. "Found unknown categories {0} in column {1}"
  123. " during fit".format(diff, i)
  124. )
  125. raise ValueError(msg)
  126. if compute_counts:
  127. category_counts.append(_get_counts(Xi, cats))
  128. self.categories_.append(cats)
  129. output = {"n_samples": n_samples}
  130. if return_counts:
  131. output["category_counts"] = category_counts
  132. missing_indices = {}
  133. if return_and_ignore_missing_for_infrequent:
  134. for feature_idx, categories_for_idx in enumerate(self.categories_):
  135. for category_idx, category in enumerate(categories_for_idx):
  136. if is_scalar_nan(category):
  137. missing_indices[feature_idx] = category_idx
  138. break
  139. output["missing_indices"] = missing_indices
  140. if self._infrequent_enabled:
  141. self._fit_infrequent_category_mapping(
  142. n_samples,
  143. category_counts,
  144. missing_indices,
  145. )
  146. return output
  147. def _transform(
  148. self,
  149. X,
  150. handle_unknown="error",
  151. force_all_finite=True,
  152. warn_on_unknown=False,
  153. ignore_category_indices=None,
  154. ):
  155. X_list, n_samples, n_features = self._check_X(
  156. X, force_all_finite=force_all_finite
  157. )
  158. self._check_feature_names(X, reset=False)
  159. self._check_n_features(X, reset=False)
  160. X_int = np.zeros((n_samples, n_features), dtype=int)
  161. X_mask = np.ones((n_samples, n_features), dtype=bool)
  162. columns_with_unknown = []
  163. for i in range(n_features):
  164. Xi = X_list[i]
  165. diff, valid_mask = _check_unknown(Xi, self.categories_[i], return_mask=True)
  166. if not np.all(valid_mask):
  167. if handle_unknown == "error":
  168. msg = (
  169. "Found unknown categories {0} in column {1}"
  170. " during transform".format(diff, i)
  171. )
  172. raise ValueError(msg)
  173. else:
  174. if warn_on_unknown:
  175. columns_with_unknown.append(i)
  176. # Set the problematic rows to an acceptable value and
  177. # continue `The rows are marked `X_mask` and will be
  178. # removed later.
  179. X_mask[:, i] = valid_mask
  180. # cast Xi into the largest string type necessary
  181. # to handle different lengths of numpy strings
  182. if (
  183. self.categories_[i].dtype.kind in ("U", "S")
  184. and self.categories_[i].itemsize > Xi.itemsize
  185. ):
  186. Xi = Xi.astype(self.categories_[i].dtype)
  187. elif self.categories_[i].dtype.kind == "O" and Xi.dtype.kind == "U":
  188. # categories are objects and Xi are numpy strings.
  189. # Cast Xi to an object dtype to prevent truncation
  190. # when setting invalid values.
  191. Xi = Xi.astype("O")
  192. else:
  193. Xi = Xi.copy()
  194. Xi[~valid_mask] = self.categories_[i][0]
  195. # We use check_unknown=False, since _check_unknown was
  196. # already called above.
  197. X_int[:, i] = _encode(Xi, uniques=self.categories_[i], check_unknown=False)
  198. if columns_with_unknown:
  199. warnings.warn(
  200. (
  201. "Found unknown categories in columns "
  202. f"{columns_with_unknown} during transform. These "
  203. "unknown categories will be encoded as all zeros"
  204. ),
  205. UserWarning,
  206. )
  207. self._map_infrequent_categories(X_int, X_mask, ignore_category_indices)
  208. return X_int, X_mask
  209. @property
  210. def infrequent_categories_(self):
  211. """Infrequent categories for each feature."""
  212. # raises an AttributeError if `_infrequent_indices` is not defined
  213. infrequent_indices = self._infrequent_indices
  214. return [
  215. None if indices is None else category[indices]
  216. for category, indices in zip(self.categories_, infrequent_indices)
  217. ]
  218. def _check_infrequent_enabled(self):
  219. """
  220. This functions checks whether _infrequent_enabled is True or False.
  221. This has to be called after parameter validation in the fit function.
  222. """
  223. max_categories = getattr(self, "max_categories", None)
  224. min_frequency = getattr(self, "min_frequency", None)
  225. self._infrequent_enabled = (
  226. max_categories is not None and max_categories >= 1
  227. ) or min_frequency is not None
  228. def _identify_infrequent(self, category_count, n_samples, col_idx):
  229. """Compute the infrequent indices.
  230. Parameters
  231. ----------
  232. category_count : ndarray of shape (n_cardinality,)
  233. Category counts.
  234. n_samples : int
  235. Number of samples.
  236. col_idx : int
  237. Index of the current category. Only used for the error message.
  238. Returns
  239. -------
  240. output : ndarray of shape (n_infrequent_categories,) or None
  241. If there are infrequent categories, indices of infrequent
  242. categories. Otherwise None.
  243. """
  244. if isinstance(self.min_frequency, numbers.Integral):
  245. infrequent_mask = category_count < self.min_frequency
  246. elif isinstance(self.min_frequency, numbers.Real):
  247. min_frequency_abs = n_samples * self.min_frequency
  248. infrequent_mask = category_count < min_frequency_abs
  249. else:
  250. infrequent_mask = np.zeros(category_count.shape[0], dtype=bool)
  251. n_current_features = category_count.size - infrequent_mask.sum() + 1
  252. if self.max_categories is not None and self.max_categories < n_current_features:
  253. # max_categories includes the one infrequent category
  254. frequent_category_count = self.max_categories - 1
  255. if frequent_category_count == 0:
  256. # All categories are infrequent
  257. infrequent_mask[:] = True
  258. else:
  259. # stable sort to preserve original count order
  260. smallest_levels = np.argsort(category_count, kind="mergesort")[
  261. :-frequent_category_count
  262. ]
  263. infrequent_mask[smallest_levels] = True
  264. output = np.flatnonzero(infrequent_mask)
  265. return output if output.size > 0 else None
  266. def _fit_infrequent_category_mapping(
  267. self, n_samples, category_counts, missing_indices
  268. ):
  269. """Fit infrequent categories.
  270. Defines the private attribute: `_default_to_infrequent_mappings`. For
  271. feature `i`, `_default_to_infrequent_mappings[i]` defines the mapping
  272. from the integer encoding returned by `super().transform()` into
  273. infrequent categories. If `_default_to_infrequent_mappings[i]` is None,
  274. there were no infrequent categories in the training set.
  275. For example if categories 0, 2 and 4 were frequent, while categories
  276. 1, 3, 5 were infrequent for feature 7, then these categories are mapped
  277. to a single output:
  278. `_default_to_infrequent_mappings[7] = array([0, 3, 1, 3, 2, 3])`
  279. Defines private attribute: `_infrequent_indices`. `_infrequent_indices[i]`
  280. is an array of indices such that
  281. `categories_[i][_infrequent_indices[i]]` are all the infrequent category
  282. labels. If the feature `i` has no infrequent categories
  283. `_infrequent_indices[i]` is None.
  284. .. versionadded:: 1.1
  285. Parameters
  286. ----------
  287. n_samples : int
  288. Number of samples in training set.
  289. category_counts: list of ndarray
  290. `category_counts[i]` is the category counts corresponding to
  291. `self.categories_[i]`.
  292. missing_indices : dict
  293. Dict mapping from feature_idx to category index with a missing value.
  294. """
  295. # Remove missing value from counts, so it is not considered as infrequent
  296. if missing_indices:
  297. category_counts_ = []
  298. for feature_idx, count in enumerate(category_counts):
  299. if feature_idx in missing_indices:
  300. category_counts_.append(
  301. np.delete(count, missing_indices[feature_idx])
  302. )
  303. else:
  304. category_counts_.append(count)
  305. else:
  306. category_counts_ = category_counts
  307. self._infrequent_indices = [
  308. self._identify_infrequent(category_count, n_samples, col_idx)
  309. for col_idx, category_count in enumerate(category_counts_)
  310. ]
  311. # compute mapping from default mapping to infrequent mapping
  312. self._default_to_infrequent_mappings = []
  313. for feature_idx, infreq_idx in enumerate(self._infrequent_indices):
  314. cats = self.categories_[feature_idx]
  315. # no infrequent categories
  316. if infreq_idx is None:
  317. self._default_to_infrequent_mappings.append(None)
  318. continue
  319. n_cats = len(cats)
  320. if feature_idx in missing_indices:
  321. # Missing index was removed from this category when computing
  322. # infrequent indices, thus we need to decrease the number of
  323. # total categories when considering the infrequent mapping.
  324. n_cats -= 1
  325. # infrequent indices exist
  326. mapping = np.empty(n_cats, dtype=np.int64)
  327. n_infrequent_cats = infreq_idx.size
  328. # infrequent categories are mapped to the last element.
  329. n_frequent_cats = n_cats - n_infrequent_cats
  330. mapping[infreq_idx] = n_frequent_cats
  331. frequent_indices = np.setdiff1d(np.arange(n_cats), infreq_idx)
  332. mapping[frequent_indices] = np.arange(n_frequent_cats)
  333. self._default_to_infrequent_mappings.append(mapping)
  334. def _map_infrequent_categories(self, X_int, X_mask, ignore_category_indices):
  335. """Map infrequent categories to integer representing the infrequent category.
  336. This modifies X_int in-place. Values that were invalid based on `X_mask`
  337. are mapped to the infrequent category if there was an infrequent
  338. category for that feature.
  339. Parameters
  340. ----------
  341. X_int: ndarray of shape (n_samples, n_features)
  342. Integer encoded categories.
  343. X_mask: ndarray of shape (n_samples, n_features)
  344. Bool mask for valid values in `X_int`.
  345. ignore_category_indices : dict
  346. Dictionary mapping from feature_idx to category index to ignore.
  347. Ignored indexes will not be grouped and the original ordinal encoding
  348. will remain.
  349. """
  350. if not self._infrequent_enabled:
  351. return
  352. ignore_category_indices = ignore_category_indices or {}
  353. for col_idx in range(X_int.shape[1]):
  354. infrequent_idx = self._infrequent_indices[col_idx]
  355. if infrequent_idx is None:
  356. continue
  357. X_int[~X_mask[:, col_idx], col_idx] = infrequent_idx[0]
  358. if self.handle_unknown == "infrequent_if_exist":
  359. # All the unknown values are now mapped to the
  360. # infrequent_idx[0], which makes the unknown values valid
  361. # This is needed in `transform` when the encoding is formed
  362. # using `X_mask`.
  363. X_mask[:, col_idx] = True
  364. # Remaps encoding in `X_int` where the infrequent categories are
  365. # grouped together.
  366. for i, mapping in enumerate(self._default_to_infrequent_mappings):
  367. if mapping is None:
  368. continue
  369. if i in ignore_category_indices:
  370. # Update rows that are **not** ignored
  371. rows_to_update = X_int[:, i] != ignore_category_indices[i]
  372. else:
  373. rows_to_update = slice(None)
  374. X_int[rows_to_update, i] = np.take(mapping, X_int[rows_to_update, i])
  375. def _more_tags(self):
  376. return {"X_types": ["2darray", "categorical"], "allow_nan": True}
  377. class OneHotEncoder(_BaseEncoder):
  378. """
  379. Encode categorical features as a one-hot numeric array.
  380. The input to this transformer should be an array-like of integers or
  381. strings, denoting the values taken on by categorical (discrete) features.
  382. The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
  383. encoding scheme. This creates a binary column for each category and
  384. returns a sparse matrix or dense array (depending on the ``sparse_output``
  385. parameter)
  386. By default, the encoder derives the categories based on the unique values
  387. in each feature. Alternatively, you can also specify the `categories`
  388. manually.
  389. This encoding is needed for feeding categorical data to many scikit-learn
  390. estimators, notably linear models and SVMs with the standard kernels.
  391. Note: a one-hot encoding of y labels should use a LabelBinarizer
  392. instead.
  393. Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
  394. For a comparison of different encoders, refer to:
  395. :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`.
  396. Parameters
  397. ----------
  398. categories : 'auto' or a list of array-like, default='auto'
  399. Categories (unique values) per feature:
  400. - 'auto' : Determine categories automatically from the training data.
  401. - list : ``categories[i]`` holds the categories expected in the ith
  402. column. The passed categories should not mix strings and numeric
  403. values within a single feature, and should be sorted in case of
  404. numeric values.
  405. The used categories can be found in the ``categories_`` attribute.
  406. .. versionadded:: 0.20
  407. drop : {'first', 'if_binary'} or an array-like of shape (n_features,), \
  408. default=None
  409. Specifies a methodology to use to drop one of the categories per
  410. feature. This is useful in situations where perfectly collinear
  411. features cause problems, such as when feeding the resulting data
  412. into an unregularized linear regression model.
  413. However, dropping one category breaks the symmetry of the original
  414. representation and can therefore induce a bias in downstream models,
  415. for instance for penalized linear classification or regression models.
  416. - None : retain all features (the default).
  417. - 'first' : drop the first category in each feature. If only one
  418. category is present, the feature will be dropped entirely.
  419. - 'if_binary' : drop the first category in each feature with two
  420. categories. Features with 1 or more than 2 categories are
  421. left intact.
  422. - array : ``drop[i]`` is the category in feature ``X[:, i]`` that
  423. should be dropped.
  424. When `max_categories` or `min_frequency` is configured to group
  425. infrequent categories, the dropping behavior is handled after the
  426. grouping.
  427. .. versionadded:: 0.21
  428. The parameter `drop` was added in 0.21.
  429. .. versionchanged:: 0.23
  430. The option `drop='if_binary'` was added in 0.23.
  431. .. versionchanged:: 1.1
  432. Support for dropping infrequent categories.
  433. sparse : bool, default=True
  434. Will return sparse matrix if set True else will return an array.
  435. .. deprecated:: 1.2
  436. `sparse` is deprecated in 1.2 and will be removed in 1.4. Use
  437. `sparse_output` instead.
  438. sparse_output : bool, default=True
  439. Will return sparse matrix if set True else will return an array.
  440. .. versionadded:: 1.2
  441. `sparse` was renamed to `sparse_output`
  442. dtype : number type, default=np.float64
  443. Desired dtype of output.
  444. handle_unknown : {'error', 'ignore', 'infrequent_if_exist'}, \
  445. default='error'
  446. Specifies the way unknown categories are handled during :meth:`transform`.
  447. - 'error' : Raise an error if an unknown category is present during transform.
  448. - 'ignore' : When an unknown category is encountered during
  449. transform, the resulting one-hot encoded columns for this feature
  450. will be all zeros. In the inverse transform, an unknown category
  451. will be denoted as None.
  452. - 'infrequent_if_exist' : When an unknown category is encountered
  453. during transform, the resulting one-hot encoded columns for this
  454. feature will map to the infrequent category if it exists. The
  455. infrequent category will be mapped to the last position in the
  456. encoding. During inverse transform, an unknown category will be
  457. mapped to the category denoted `'infrequent'` if it exists. If the
  458. `'infrequent'` category does not exist, then :meth:`transform` and
  459. :meth:`inverse_transform` will handle an unknown category as with
  460. `handle_unknown='ignore'`. Infrequent categories exist based on
  461. `min_frequency` and `max_categories`. Read more in the
  462. :ref:`User Guide <encoder_infrequent_categories>`.
  463. .. versionchanged:: 1.1
  464. `'infrequent_if_exist'` was added to automatically handle unknown
  465. categories and infrequent categories.
  466. min_frequency : int or float, default=None
  467. Specifies the minimum frequency below which a category will be
  468. considered infrequent.
  469. - If `int`, categories with a smaller cardinality will be considered
  470. infrequent.
  471. - If `float`, categories with a smaller cardinality than
  472. `min_frequency * n_samples` will be considered infrequent.
  473. .. versionadded:: 1.1
  474. Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
  475. max_categories : int, default=None
  476. Specifies an upper limit to the number of output features for each input
  477. feature when considering infrequent categories. If there are infrequent
  478. categories, `max_categories` includes the category representing the
  479. infrequent categories along with the frequent categories. If `None`,
  480. there is no limit to the number of output features.
  481. .. versionadded:: 1.1
  482. Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
  483. feature_name_combiner : "concat" or callable, default="concat"
  484. Callable with signature `def callable(input_feature, category)` that returns a
  485. string. This is used to create feature names to be returned by
  486. :meth:`get_feature_names_out`.
  487. `"concat"` concatenates encoded feature name and category with
  488. `feature + "_" + str(category)`.E.g. feature X with values 1, 6, 7 create
  489. feature names `X_1, X_6, X_7`.
  490. .. versionadded:: 1.3
  491. Attributes
  492. ----------
  493. categories_ : list of arrays
  494. The categories of each feature determined during fitting
  495. (in order of the features in X and corresponding with the output
  496. of ``transform``). This includes the category specified in ``drop``
  497. (if any).
  498. drop_idx_ : array of shape (n_features,)
  499. - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
  500. to be dropped for each feature.
  501. - ``drop_idx_[i] = None`` if no category is to be dropped from the
  502. feature with index ``i``, e.g. when `drop='if_binary'` and the
  503. feature isn't binary.
  504. - ``drop_idx_ = None`` if all the transformed features will be
  505. retained.
  506. If infrequent categories are enabled by setting `min_frequency` or
  507. `max_categories` to a non-default value and `drop_idx[i]` corresponds
  508. to a infrequent category, then the entire infrequent category is
  509. dropped.
  510. .. versionchanged:: 0.23
  511. Added the possibility to contain `None` values.
  512. infrequent_categories_ : list of ndarray
  513. Defined only if infrequent categories are enabled by setting
  514. `min_frequency` or `max_categories` to a non-default value.
  515. `infrequent_categories_[i]` are the infrequent categories for feature
  516. `i`. If the feature `i` has no infrequent categories
  517. `infrequent_categories_[i]` is None.
  518. .. versionadded:: 1.1
  519. n_features_in_ : int
  520. Number of features seen during :term:`fit`.
  521. .. versionadded:: 1.0
  522. feature_names_in_ : ndarray of shape (`n_features_in_`,)
  523. Names of features seen during :term:`fit`. Defined only when `X`
  524. has feature names that are all strings.
  525. .. versionadded:: 1.0
  526. feature_name_combiner : callable or None
  527. Callable with signature `def callable(input_feature, category)` that returns a
  528. string. This is used to create feature names to be returned by
  529. :meth:`get_feature_names_out`.
  530. .. versionadded:: 1.3
  531. See Also
  532. --------
  533. OrdinalEncoder : Performs an ordinal (integer)
  534. encoding of the categorical features.
  535. TargetEncoder : Encodes categorical features using the target.
  536. sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of
  537. dictionary items (also handles string-valued features).
  538. sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot
  539. encoding of dictionary items or strings.
  540. LabelBinarizer : Binarizes labels in a one-vs-all
  541. fashion.
  542. MultiLabelBinarizer : Transforms between iterable of
  543. iterables and a multilabel format, e.g. a (samples x classes) binary
  544. matrix indicating the presence of a class label.
  545. Examples
  546. --------
  547. Given a dataset with two features, we let the encoder find the unique
  548. values per feature and transform the data to a binary one-hot encoding.
  549. >>> from sklearn.preprocessing import OneHotEncoder
  550. One can discard categories not seen during `fit`:
  551. >>> enc = OneHotEncoder(handle_unknown='ignore')
  552. >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
  553. >>> enc.fit(X)
  554. OneHotEncoder(handle_unknown='ignore')
  555. >>> enc.categories_
  556. [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
  557. >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()
  558. array([[1., 0., 1., 0., 0.],
  559. [0., 1., 0., 0., 0.]])
  560. >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
  561. array([['Male', 1],
  562. [None, 2]], dtype=object)
  563. >>> enc.get_feature_names_out(['gender', 'group'])
  564. array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], ...)
  565. One can always drop the first column for each feature:
  566. >>> drop_enc = OneHotEncoder(drop='first').fit(X)
  567. >>> drop_enc.categories_
  568. [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
  569. >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray()
  570. array([[0., 0., 0.],
  571. [1., 1., 0.]])
Or drop a column for features having only 2 categories:
  573. >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X)
  574. >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray()
  575. array([[0., 1., 0., 0.],
  576. [1., 0., 1., 0.]])
  577. One can change the way feature names are created.
  578. >>> def custom_combiner(feature, category):
  579. ... return str(feature) + "_" + type(category).__name__ + "_" + str(category)
  580. >>> custom_fnames_enc = OneHotEncoder(feature_name_combiner=custom_combiner).fit(X)
  581. >>> custom_fnames_enc.get_feature_names_out()
  582. array(['x0_str_Female', 'x0_str_Male', 'x1_int_1', 'x1_int_2', 'x1_int_3'],
  583. dtype=object)
  584. Infrequent categories are enabled by setting `max_categories` or `min_frequency`.
  585. >>> import numpy as np
  586. >>> X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T
  587. >>> ohe = OneHotEncoder(max_categories=3, sparse_output=False).fit(X)
  588. >>> ohe.infrequent_categories_
  589. [array(['a', 'd'], dtype=object)]
  590. >>> ohe.transform([["a"], ["b"]])
  591. array([[0., 0., 1.],
  592. [1., 0., 0.]])
  593. """
# Declarative constraints used to validate the constructor parameters when
# `fit` runs (see the `_fit_context` decorator on `fit`).
_parameter_constraints: dict = {
    "categories": [StrOptions({"auto"}), list],
    "drop": [StrOptions({"first", "if_binary"}), "array-like", None],
    "dtype": "no_validation",  # validation delegated to numpy
    "handle_unknown": [StrOptions({"error", "ignore", "infrequent_if_exist"})],
    "max_categories": [Interval(Integral, 1, None, closed="left"), None],
    "min_frequency": [
        # an int is an absolute count; a float in (0, 1) is a fraction of
        # n_samples
        Interval(Integral, 1, None, closed="left"),
        Interval(RealNotInt, 0, 1, closed="neither"),
        None,
    ],
    "sparse": [Hidden(StrOptions({"deprecated"})), "boolean"],  # deprecated
    "sparse_output": ["boolean"],
    "feature_name_combiner": [StrOptions({"concat"}), callable],
}
def __init__(
    self,
    *,
    categories="auto",
    drop=None,
    sparse="deprecated",
    sparse_output=True,
    dtype=np.float64,
    handle_unknown="error",
    min_frequency=None,
    max_categories=None,
    feature_name_combiner="concat",
):
    # Per scikit-learn estimator convention, __init__ only stores the
    # parameters verbatim; all validation is deferred to `fit`.
    self.categories = categories
    # TODO(1.4): Remove self.sparse
    self.sparse = sparse
    self.sparse_output = sparse_output
    self.dtype = dtype
    self.handle_unknown = handle_unknown
    self.drop = drop
    self.min_frequency = min_frequency
    self.max_categories = max_categories
    self.feature_name_combiner = feature_name_combiner
  632. def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
  633. """Convert `drop_idx` into the index for infrequent categories.
  634. If there are no infrequent categories, then `drop_idx` is
  635. returned. This method is called in `_set_drop_idx` when the `drop`
  636. parameter is an array-like.
  637. """
  638. if not self._infrequent_enabled:
  639. return drop_idx
  640. default_to_infrequent = self._default_to_infrequent_mappings[feature_idx]
  641. if default_to_infrequent is None:
  642. return drop_idx
  643. # Raise error when explicitly dropping a category that is infrequent
  644. infrequent_indices = self._infrequent_indices[feature_idx]
  645. if infrequent_indices is not None and drop_idx in infrequent_indices:
  646. categories = self.categories_[feature_idx]
  647. raise ValueError(
  648. f"Unable to drop category {categories[drop_idx].item()!r} from"
  649. f" feature {feature_idx} because it is infrequent"
  650. )
  651. return default_to_infrequent[drop_idx]
  652. def _set_drop_idx(self):
  653. """Compute the drop indices associated with `self.categories_`.
  654. If `self.drop` is:
  655. - `None`, No categories have been dropped.
  656. - `'first'`, All zeros to drop the first category.
  657. - `'if_binary'`, All zeros if the category is binary and `None`
  658. otherwise.
  659. - array-like, The indices of the categories that match the
  660. categories in `self.drop`. If the dropped category is an infrequent
  661. category, then the index for the infrequent category is used. This
  662. means that the entire infrequent category is dropped.
  663. This methods defines a public `drop_idx_` and a private
  664. `_drop_idx_after_grouping`.
  665. - `drop_idx_`: Public facing API that references the drop category in
  666. `self.categories_`.
  667. - `_drop_idx_after_grouping`: Used internally to drop categories *after* the
  668. infrequent categories are grouped together.
  669. If there are no infrequent categories or drop is `None`, then
  670. `drop_idx_=_drop_idx_after_grouping`.
  671. """
  672. if self.drop is None:
  673. drop_idx_after_grouping = None
  674. elif isinstance(self.drop, str):
  675. if self.drop == "first":
  676. drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object)
  677. elif self.drop == "if_binary":
  678. n_features_out_no_drop = [len(cat) for cat in self.categories_]
  679. if self._infrequent_enabled:
  680. for i, infreq_idx in enumerate(self._infrequent_indices):
  681. if infreq_idx is None:
  682. continue
  683. n_features_out_no_drop[i] -= infreq_idx.size - 1
  684. drop_idx_after_grouping = np.array(
  685. [
  686. 0 if n_features_out == 2 else None
  687. for n_features_out in n_features_out_no_drop
  688. ],
  689. dtype=object,
  690. )
  691. else:
  692. drop_array = np.asarray(self.drop, dtype=object)
  693. droplen = len(drop_array)
  694. if droplen != len(self.categories_):
  695. msg = (
  696. "`drop` should have length equal to the number "
  697. "of features ({}), got {}"
  698. )
  699. raise ValueError(msg.format(len(self.categories_), droplen))
  700. missing_drops = []
  701. drop_indices = []
  702. for feature_idx, (drop_val, cat_list) in enumerate(
  703. zip(drop_array, self.categories_)
  704. ):
  705. if not is_scalar_nan(drop_val):
  706. drop_idx = np.where(cat_list == drop_val)[0]
  707. if drop_idx.size: # found drop idx
  708. drop_indices.append(
  709. self._map_drop_idx_to_infrequent(feature_idx, drop_idx[0])
  710. )
  711. else:
  712. missing_drops.append((feature_idx, drop_val))
  713. continue
  714. # drop_val is nan, find nan in categories manually
  715. for cat_idx, cat in enumerate(cat_list):
  716. if is_scalar_nan(cat):
  717. drop_indices.append(
  718. self._map_drop_idx_to_infrequent(feature_idx, cat_idx)
  719. )
  720. break
  721. else: # loop did not break thus drop is missing
  722. missing_drops.append((feature_idx, drop_val))
  723. if any(missing_drops):
  724. msg = (
  725. "The following categories were supposed to be "
  726. "dropped, but were not found in the training "
  727. "data.\n{}".format(
  728. "\n".join(
  729. [
  730. "Category: {}, Feature: {}".format(c, v)
  731. for c, v in missing_drops
  732. ]
  733. )
  734. )
  735. )
  736. raise ValueError(msg)
  737. drop_idx_after_grouping = np.array(drop_indices, dtype=object)
  738. # `_drop_idx_after_grouping` are the categories to drop *after* the infrequent
  739. # categories are grouped together. If needed, we remap `drop_idx` back
  740. # to the categories seen in `self.categories_`.
  741. self._drop_idx_after_grouping = drop_idx_after_grouping
  742. if not self._infrequent_enabled or drop_idx_after_grouping is None:
  743. self.drop_idx_ = self._drop_idx_after_grouping
  744. else:
  745. drop_idx_ = []
  746. for feature_idx, drop_idx in enumerate(drop_idx_after_grouping):
  747. default_to_infrequent = self._default_to_infrequent_mappings[
  748. feature_idx
  749. ]
  750. if drop_idx is None or default_to_infrequent is None:
  751. orig_drop_idx = drop_idx
  752. else:
  753. orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0]
  754. drop_idx_.append(orig_drop_idx)
  755. self.drop_idx_ = np.asarray(drop_idx_, dtype=object)
  756. def _compute_transformed_categories(self, i, remove_dropped=True):
  757. """Compute the transformed categories used for column `i`.
  758. 1. If there are infrequent categories, the category is named
  759. 'infrequent_sklearn'.
  760. 2. Dropped columns are removed when remove_dropped=True.
  761. """
  762. cats = self.categories_[i]
  763. if self._infrequent_enabled:
  764. infreq_map = self._default_to_infrequent_mappings[i]
  765. if infreq_map is not None:
  766. frequent_mask = infreq_map < infreq_map.max()
  767. infrequent_cat = "infrequent_sklearn"
  768. # infrequent category is always at the end
  769. cats = np.concatenate(
  770. (cats[frequent_mask], np.array([infrequent_cat], dtype=object))
  771. )
  772. if remove_dropped:
  773. cats = self._remove_dropped_categories(cats, i)
  774. return cats
  775. def _remove_dropped_categories(self, categories, i):
  776. """Remove dropped categories."""
  777. if (
  778. self._drop_idx_after_grouping is not None
  779. and self._drop_idx_after_grouping[i] is not None
  780. ):
  781. return np.delete(categories, self._drop_idx_after_grouping[i])
  782. return categories
  783. def _compute_n_features_outs(self):
  784. """Compute the n_features_out for each input feature."""
  785. output = [len(cats) for cats in self.categories_]
  786. if self._drop_idx_after_grouping is not None:
  787. for i, drop_idx in enumerate(self._drop_idx_after_grouping):
  788. if drop_idx is not None:
  789. output[i] -= 1
  790. if not self._infrequent_enabled:
  791. return output
  792. # infrequent is enabled, the number of features out are reduced
  793. # because the infrequent categories are grouped together
  794. for i, infreq_idx in enumerate(self._infrequent_indices):
  795. if infreq_idx is None:
  796. continue
  797. output[i] -= infreq_idx.size - 1
  798. return output
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
    """
    Fit OneHotEncoder to X.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to determine the categories of each feature.
    y : None
        Ignored. This parameter exists only for compatibility with
        :class:`~sklearn.pipeline.Pipeline`.

    Returns
    -------
    self
        Fitted encoder.
    """
    # TODO(1.4): remove this deprecation path together with `self.sparse`.
    if self.sparse != "deprecated":
        warnings.warn(
            (
                "`sparse` was renamed to `sparse_output` in version 1.2 and "
                "will be removed in 1.4. `sparse_output` is ignored unless you "
                "leave `sparse` to its default value."
            ),
            FutureWarning,
        )
        # NOTE(review): this overwrites the public `sparse_output` parameter
        # in place so the rest of the class only consults `sparse_output`.
        self.sparse_output = self.sparse
    # Learn `categories_` (and infrequent-category bookkeeping).
    # NaNs are accepted as missing values.
    self._fit(
        X,
        handle_unknown=self.handle_unknown,
        force_all_finite="allow-nan",
    )
    # Precompute drop indices and the per-feature output widths used by
    # `transform`, `inverse_transform` and `get_feature_names_out`.
    self._set_drop_idx()
    self._n_features_outs = self._compute_n_features_outs()
    return self
def transform(self, X):
    """
    Transform X using one-hot encoding.

    If there are infrequent categories for a feature, the infrequent
    categories will be grouped into a single category.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to encode.

    Returns
    -------
    X_out : {ndarray, sparse matrix} of shape \
            (n_samples, n_encoded_features)
        Transformed input. If `sparse_output=True`, a sparse matrix will be
        returned.
    """
    check_is_fitted(self)
    transform_output = _get_output_config("transform", estimator=self)["dense"]
    if transform_output == "pandas" and self.sparse_output:
        raise ValueError(
            "Pandas output does not support sparse data. Set sparse_output=False to"
            " output pandas DataFrames or disable pandas output via"
            ' `ohe.set_output(transform="default").'
        )
    # validation of X happens in _check_X called by _transform
    # When `drop` is active, an ignored unknown category yields the same
    # all-zero row as the dropped category, so `_transform` warns about the
    # ambiguity.
    warn_on_unknown = self.drop is not None and self.handle_unknown in {
        "ignore",
        "infrequent_if_exist",
    }
    # X_int holds the per-feature integer codes; X_mask flags which cells
    # hold a known (encodable) category.
    X_int, X_mask = self._transform(
        X,
        handle_unknown=self.handle_unknown,
        force_all_finite="allow-nan",
        warn_on_unknown=warn_on_unknown,
    )
    n_samples, n_features = X_int.shape
    if self._drop_idx_after_grouping is not None:
        to_drop = self._drop_idx_after_grouping.copy()
        # We remove all the dropped categories from mask, and decrement all
        # categories that occur after them to avoid an empty column.
        keep_cells = X_int != to_drop
        for i, cats in enumerate(self.categories_):
            # drop='if_binary' but feature isn't binary
            if to_drop[i] is None:
                # set to cardinality to not drop from X_int
                to_drop[i] = len(cats)
        to_drop = to_drop.reshape(1, -1)
        X_int[X_int > to_drop] -= 1
        X_mask &= keep_cells
    # Build the CSR matrix directly: every kept cell becomes a single 1 in
    # the block of output columns assigned to its feature.
    mask = X_mask.ravel()
    feature_indices = np.cumsum([0] + self._n_features_outs)
    indices = (X_int + feature_indices[:-1]).ravel()[mask]
    indptr = np.empty(n_samples + 1, dtype=int)
    indptr[0] = 0
    # Row lengths = number of kept cells per sample, then cumulative sum to
    # get CSR row pointers.
    np.sum(X_mask, axis=1, out=indptr[1:], dtype=indptr.dtype)
    np.cumsum(indptr[1:], out=indptr[1:])
    data = np.ones(indptr[-1])
    out = sparse.csr_matrix(
        (data, indices, indptr),
        shape=(n_samples, feature_indices[-1]),
        dtype=self.dtype,
    )
    if not self.sparse_output:
        return out.toarray()
    else:
        return out
def inverse_transform(self, X):
    """
    Convert the data back to the original representation.

    When unknown categories are encountered (all zeros in the
    one-hot encoding), ``None`` is used to represent this category. If the
    feature with the unknown category has a dropped category, the dropped
    category will be its inverse.

    For a given input feature, if there is an infrequent category,
    'infrequent_sklearn' will be used to represent the infrequent category.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape \
            (n_samples, n_encoded_features)
        The transformed data.

    Returns
    -------
    X_tr : ndarray of shape (n_samples, n_features)
        Inverse transformed array.
    """
    check_is_fitted(self)
    X = check_array(X, accept_sparse="csr")
    n_samples, _ = X.shape
    n_features = len(self.categories_)
    n_features_out = np.sum(self._n_features_outs)
    # validate shape of passed X
    msg = (
        "Shape of the passed X data is not correct. Expected {0} columns, got {1}."
    )
    if X.shape[1] != n_features_out:
        raise ValueError(msg.format(n_features_out, X.shape[1]))
    # Categories in output order with infrequent categories grouped;
    # dropped categories are retained so indices line up with
    # `_drop_idx_after_grouping`.
    transformed_features = [
        self._compute_transformed_categories(i, remove_dropped=False)
        for i, _ in enumerate(self.categories_)
    ]
    # create resulting array of appropriate dtype
    dt = np.result_type(*[cat.dtype for cat in transformed_features])
    X_tr = np.empty((n_samples, n_features), dtype=dt)
    j = 0  # running column offset into X for the current feature's block
    found_unknown = {}  # feature index -> boolean mask of unknown rows
    if self._infrequent_enabled:
        infrequent_indices = self._infrequent_indices
    else:
        infrequent_indices = [None] * n_features
    for i in range(n_features):
        cats_wo_dropped = self._remove_dropped_categories(
            transformed_features[i], i
        )
        n_categories = cats_wo_dropped.shape[0]
        # Only happens if there was a column with a unique
        # category. In this case we just fill the column with this
        # unique category value.
        if n_categories == 0:
            X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]]
            j += n_categories
            continue
        sub = X[:, j : j + n_categories]
        # for sparse X argmax returns 2D matrix, ensure 1D array
        labels = np.asarray(sub.argmax(axis=1)).flatten()
        X_tr[:, i] = cats_wo_dropped[labels]
        if self.handle_unknown == "ignore" or (
            self.handle_unknown == "infrequent_if_exist"
            and infrequent_indices[i] is None
        ):
            unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
            # ignored unknown categories: we have a row of all zero
            if unknown.any():
                # if categories were dropped then unknown categories will
                # be mapped to the dropped category
                if (
                    self._drop_idx_after_grouping is None
                    or self._drop_idx_after_grouping[i] is None
                ):
                    found_unknown[i] = unknown
                else:
                    X_tr[unknown, i] = self.categories_[i][
                        self._drop_idx_after_grouping[i]
                    ]
        else:
            dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
            if dropped.any():
                if self._drop_idx_after_grouping is None:
                    all_zero_samples = np.flatnonzero(dropped)
                    raise ValueError(
                        f"Samples {all_zero_samples} can not be inverted "
                        "when drop=None and handle_unknown='error' "
                        "because they contain all zeros"
                    )
                # we can safely assume that all of the nulls in each column
                # are the dropped value
                drop_idx = self._drop_idx_after_grouping[i]
                X_tr[dropped, i] = transformed_features[i][drop_idx]
        j += n_categories
    # if ignored are found: potentially need to upcast result to
    # insert None values
    if found_unknown:
        if X_tr.dtype != object:
            X_tr = X_tr.astype(object)
        for idx, mask in found_unknown.items():
            X_tr[mask, idx] = None
    return X_tr
  999. def get_feature_names_out(self, input_features=None):
  1000. """Get output feature names for transformation.
  1001. Parameters
  1002. ----------
  1003. input_features : array-like of str or None, default=None
  1004. Input features.
  1005. - If `input_features` is `None`, then `feature_names_in_` is
  1006. used as feature names in. If `feature_names_in_` is not defined,
  1007. then the following input feature names are generated:
  1008. `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
  1009. - If `input_features` is an array-like, then `input_features` must
  1010. match `feature_names_in_` if `feature_names_in_` is defined.
  1011. Returns
  1012. -------
  1013. feature_names_out : ndarray of str objects
  1014. Transformed feature names.
  1015. """
  1016. check_is_fitted(self)
  1017. input_features = _check_feature_names_in(self, input_features)
  1018. cats = [
  1019. self._compute_transformed_categories(i)
  1020. for i, _ in enumerate(self.categories_)
  1021. ]
  1022. name_combiner = self._check_get_feature_name_combiner()
  1023. feature_names = []
  1024. for i in range(len(cats)):
  1025. names = [name_combiner(input_features[i], t) for t in cats[i]]
  1026. feature_names.extend(names)
  1027. return np.array(feature_names, dtype=object)
  1028. def _check_get_feature_name_combiner(self):
  1029. if self.feature_name_combiner == "concat":
  1030. return lambda feature, category: feature + "_" + str(category)
  1031. else: # callable
  1032. dry_run_combiner = self.feature_name_combiner("feature", "category")
  1033. if not isinstance(dry_run_combiner, str):
  1034. raise TypeError(
  1035. "When `feature_name_combiner` is a callable, it should return a "
  1036. f"Python string. Got {type(dry_run_combiner)} instead."
  1037. )
  1038. return self.feature_name_combiner
  1039. class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
  1040. """
  1041. Encode categorical features as an integer array.
  1042. The input to this transformer should be an array-like of integers or
  1043. strings, denoting the values taken on by categorical (discrete) features.
  1044. The features are converted to ordinal integers. This results in
  1045. a single column of integers (0 to n_categories - 1) per feature.
  1046. Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
  1047. For a comparison of different encoders, refer to:
  1048. :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`.
  1049. .. versionadded:: 0.20
  1050. Parameters
  1051. ----------
  1052. categories : 'auto' or a list of array-like, default='auto'
  1053. Categories (unique values) per feature:
  1054. - 'auto' : Determine categories automatically from the training data.
  1055. - list : ``categories[i]`` holds the categories expected in the ith
  1056. column. The passed categories should not mix strings and numeric
  1057. values, and should be sorted in case of numeric values.
  1058. The used categories can be found in the ``categories_`` attribute.
  1059. dtype : number type, default=np.float64
  1060. Desired dtype of output.
  1061. handle_unknown : {'error', 'use_encoded_value'}, default='error'
  1062. When set to 'error' an error will be raised in case an unknown
  1063. categorical feature is present during transform. When set to
  1064. 'use_encoded_value', the encoded value of unknown categories will be
  1065. set to the value given for the parameter `unknown_value`. In
  1066. :meth:`inverse_transform`, an unknown category will be denoted as None.
  1067. .. versionadded:: 0.24
  1068. unknown_value : int or np.nan, default=None
  1069. When the parameter handle_unknown is set to 'use_encoded_value', this
  1070. parameter is required and will set the encoded value of unknown
  1071. categories. It has to be distinct from the values used to encode any of
  1072. the categories in `fit`. If set to np.nan, the `dtype` parameter must
  1073. be a float dtype.
  1074. .. versionadded:: 0.24
  1075. encoded_missing_value : int or np.nan, default=np.nan
  1076. Encoded value of missing categories. If set to `np.nan`, then the `dtype`
  1077. parameter must be a float dtype.
  1078. .. versionadded:: 1.1
  1079. min_frequency : int or float, default=None
  1080. Specifies the minimum frequency below which a category will be
  1081. considered infrequent.
  1082. - If `int`, categories with a smaller cardinality will be considered
  1083. infrequent.
  1084. - If `float`, categories with a smaller cardinality than
  1085. `min_frequency * n_samples` will be considered infrequent.
  1086. .. versionadded:: 1.3
  1087. Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
  1088. max_categories : int, default=None
  1089. Specifies an upper limit to the number of output categories for each input
  1090. feature when considering infrequent categories. If there are infrequent
  1091. categories, `max_categories` includes the category representing the
  1092. infrequent categories along with the frequent categories. If `None`,
  1093. there is no limit to the number of output features.
`max_categories` does **not** take into account missing or unknown
  1095. categories. Setting `unknown_value` or `encoded_missing_value` to an
  1096. integer will increase the number of unique integer codes by one each.
  1097. This can result in up to `max_categories + 2` integer codes.
  1098. .. versionadded:: 1.3
  1099. Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
  1100. Attributes
  1101. ----------
  1102. categories_ : list of arrays
  1103. The categories of each feature determined during ``fit`` (in order of
  1104. the features in X and corresponding with the output of ``transform``).
  1105. This does not include categories that weren't seen during ``fit``.
  1106. n_features_in_ : int
  1107. Number of features seen during :term:`fit`.
  1108. .. versionadded:: 1.0
  1109. feature_names_in_ : ndarray of shape (`n_features_in_`,)
  1110. Names of features seen during :term:`fit`. Defined only when `X`
  1111. has feature names that are all strings.
  1112. .. versionadded:: 1.0
  1113. infrequent_categories_ : list of ndarray
  1114. Defined only if infrequent categories are enabled by setting
  1115. `min_frequency` or `max_categories` to a non-default value.
  1116. `infrequent_categories_[i]` are the infrequent categories for feature
  1117. `i`. If the feature `i` has no infrequent categories
  1118. `infrequent_categories_[i]` is None.
  1119. .. versionadded:: 1.3
  1120. See Also
  1121. --------
  1122. OneHotEncoder : Performs a one-hot encoding of categorical features. This encoding
  1123. is suitable for low to medium cardinality categorical variables, both in
  1124. supervised and unsupervised settings.
  1125. TargetEncoder : Encodes categorical features using supervised signal
  1126. in a classification or regression pipeline. This encoding is typically
  1127. suitable for high cardinality categorical variables.
  1128. LabelEncoder : Encodes target labels with values between 0 and
  1129. ``n_classes-1``.
  1130. Notes
  1131. -----
  1132. With a high proportion of `nan` values, inferring categories becomes slow with
  1133. Python versions before 3.10. The handling of `nan` values was improved
  1134. from Python 3.10 onwards, (c.f.
  1135. `bpo-43475 <https://github.com/python/cpython/issues/87641>`_).
  1136. Examples
  1137. --------
  1138. Given a dataset with two features, we let the encoder find the unique
  1139. values per feature and transform the data to an ordinal encoding.
  1140. >>> from sklearn.preprocessing import OrdinalEncoder
  1141. >>> enc = OrdinalEncoder()
  1142. >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
  1143. >>> enc.fit(X)
  1144. OrdinalEncoder()
  1145. >>> enc.categories_
  1146. [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
  1147. >>> enc.transform([['Female', 3], ['Male', 1]])
  1148. array([[0., 2.],
  1149. [1., 0.]])
  1150. >>> enc.inverse_transform([[1, 0], [0, 1]])
  1151. array([['Male', 1],
  1152. ['Female', 2]], dtype=object)
  1153. By default, :class:`OrdinalEncoder` is lenient towards missing values by
  1154. propagating them.
  1155. >>> import numpy as np
  1156. >>> X = [['Male', 1], ['Female', 3], ['Female', np.nan]]
  1157. >>> enc.fit_transform(X)
  1158. array([[ 1., 0.],
  1159. [ 0., 1.],
  1160. [ 0., nan]])
  1161. You can use the parameter `encoded_missing_value` to encode missing values.
  1162. >>> enc.set_params(encoded_missing_value=-1).fit_transform(X)
  1163. array([[ 1., 0.],
  1164. [ 0., 1.],
  1165. [ 0., -1.]])
  1166. Infrequent categories are enabled by setting `max_categories` or `min_frequency`.
  1167. In the following example, "a" and "d" are considered infrequent and grouped
  1168. together into a single category, "b" and "c" are their own categories, unknown
  1169. values are encoded as 3 and missing values are encoded as 4.
  1170. >>> X_train = np.array(
  1171. ... [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]],
  1172. ... dtype=object).T
  1173. >>> enc = OrdinalEncoder(
  1174. ... handle_unknown="use_encoded_value", unknown_value=3,
  1175. ... max_categories=3, encoded_missing_value=4)
  1176. >>> _ = enc.fit(X_train)
  1177. >>> X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object)
  1178. >>> enc.transform(X_test)
  1179. array([[2.],
  1180. [0.],
  1181. [1.],
  1182. [2.],
  1183. [3.],
  1184. [4.]])
  1185. """
# Declarative constraints used to validate the constructor parameters when
# `fit` runs (see the `_fit_context` decorator on `fit`).
_parameter_constraints: dict = {
    "categories": [StrOptions({"auto"}), list],
    "dtype": "no_validation",  # validation delegated to numpy
    "encoded_missing_value": [Integral, type(np.nan)],
    "handle_unknown": [StrOptions({"error", "use_encoded_value"})],
    "unknown_value": [Integral, type(np.nan), None],
    "max_categories": [Interval(Integral, 1, None, closed="left"), None],
    "min_frequency": [
        # an int is an absolute count; a float in (0, 1) is a fraction of
        # n_samples
        Interval(Integral, 1, None, closed="left"),
        Interval(RealNotInt, 0, 1, closed="neither"),
        None,
    ],
}
def __init__(
    self,
    *,
    categories="auto",
    dtype=np.float64,
    handle_unknown="error",
    unknown_value=None,
    encoded_missing_value=np.nan,
    min_frequency=None,
    max_categories=None,
):
    # Per scikit-learn estimator convention, __init__ only stores the
    # parameters verbatim; all validation is deferred to `fit`.
    self.categories = categories
    self.dtype = dtype
    self.handle_unknown = handle_unknown
    self.unknown_value = unknown_value
    self.encoded_missing_value = encoded_missing_value
    self.min_frequency = min_frequency
    self.max_categories = max_categories
@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
    """
    Fit the OrdinalEncoder to X.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to determine the categories of each feature.
    y : None
        Ignored. This parameter exists only for compatibility with
        :class:`~sklearn.pipeline.Pipeline`.

    Returns
    -------
    self : object
        Fitted encoder.
    """
    # Validate the `unknown_value` / `handle_unknown` combination before
    # doing any expensive work.
    if self.handle_unknown == "use_encoded_value":
        if is_scalar_nan(self.unknown_value):
            if np.dtype(self.dtype).kind != "f":
                raise ValueError(
                    "When unknown_value is np.nan, the dtype "
                    "parameter should be "
                    f"a float dtype. Got {self.dtype}."
                )
        elif not isinstance(self.unknown_value, numbers.Integral):
            raise TypeError(
                "unknown_value should be an integer or "
                "np.nan when "
                "handle_unknown is 'use_encoded_value', "
                f"got {self.unknown_value}."
            )
    elif self.unknown_value is not None:
        raise TypeError(
            "unknown_value should only be set when "
            "handle_unknown is 'use_encoded_value', "
            f"got {self.unknown_value}."
        )
    # `_fit` will only raise an error when `self.handle_unknown="error"`
    fit_results = self._fit(
        X,
        handle_unknown=self.handle_unknown,
        force_all_finite="allow-nan",
        return_and_ignore_missing_for_infrequent=True,
    )
    self._missing_indices = fit_results["missing_indices"]
    cardinalities = [len(categories) for categories in self.categories_]
    if self._infrequent_enabled:
        # Cardinality decreases because the infrequent categories are grouped
        # together
        for feature_idx, infrequent in enumerate(self.infrequent_categories_):
            if infrequent is not None:
                cardinalities[feature_idx] -= len(infrequent)
    # stores the missing indices per category
    # NOTE(review): this re-derivation overwrites the `_missing_indices`
    # taken from `fit_results` a few lines above. Confirm the two always
    # agree (in particular when infrequent categories are enabled) and
    # consider removing one of the two computations.
    self._missing_indices = {}
    for cat_idx, categories_for_idx in enumerate(self.categories_):
        for i, cat in enumerate(categories_for_idx):
            if is_scalar_nan(cat):
                self._missing_indices[cat_idx] = i
                # missing values are not considered part of the cardinality
                # when considering unknown categories or encoded_missing_value
                cardinalities[cat_idx] -= 1
                continue
    # `unknown_value` must not collide with any code assigned to a seen
    # category (codes are 0..cardinality-1 per feature).
    if self.handle_unknown == "use_encoded_value":
        for cardinality in cardinalities:
            if 0 <= self.unknown_value < cardinality:
                raise ValueError(
                    "The used value for unknown_value "
                    f"{self.unknown_value} is one of the "
                    "values already used for encoding the "
                    "seen categories."
                )
    if self._missing_indices:
        # NaN as the missing sentinel requires a float output dtype.
        if np.dtype(self.dtype).kind != "f" and is_scalar_nan(
            self.encoded_missing_value
        ):
            raise ValueError(
                "There are missing values in features "
                f"{list(self._missing_indices)}. For OrdinalEncoder to "
                f"encode missing values with dtype: {self.dtype}, set "
                "encoded_missing_value to a non-nan value, or "
                "set dtype to a float"
            )
        if not is_scalar_nan(self.encoded_missing_value):
            # Features are invalid when they contain a missing category
            # and encoded_missing_value was already used to encode a
            # known category
            invalid_features = [
                cat_idx
                for cat_idx, cardinality in enumerate(cardinalities)
                if cat_idx in self._missing_indices
                and 0 <= self.encoded_missing_value < cardinality
            ]
            if invalid_features:
                # Use feature names if they are available
                if hasattr(self, "feature_names_in_"):
                    invalid_features = self.feature_names_in_[invalid_features]
                raise ValueError(
                    f"encoded_missing_value ({self.encoded_missing_value}) "
                    "is already used to encode a known category in features: "
                    f"{invalid_features}"
                )
    return self
  1319. def transform(self, X):
  1320. """
  1321. Transform X to ordinal codes.
  1322. Parameters
  1323. ----------
  1324. X : array-like of shape (n_samples, n_features)
  1325. The data to encode.
  1326. Returns
  1327. -------
  1328. X_out : ndarray of shape (n_samples, n_features)
  1329. Transformed input.
  1330. """
  1331. X_int, X_mask = self._transform(
  1332. X,
  1333. handle_unknown=self.handle_unknown,
  1334. force_all_finite="allow-nan",
  1335. ignore_category_indices=self._missing_indices,
  1336. )
  1337. X_trans = X_int.astype(self.dtype, copy=False)
  1338. for cat_idx, missing_idx in self._missing_indices.items():
  1339. X_missing_mask = X_int[:, cat_idx] == missing_idx
  1340. X_trans[X_missing_mask, cat_idx] = self.encoded_missing_value
  1341. # create separate category for unknown values
  1342. if self.handle_unknown == "use_encoded_value":
  1343. X_trans[~X_mask] = self.unknown_value
  1344. return X_trans
  1345. def inverse_transform(self, X):
  1346. """
  1347. Convert the data back to the original representation.
  1348. Parameters
  1349. ----------
  1350. X : array-like of shape (n_samples, n_encoded_features)
  1351. The transformed data.
  1352. Returns
  1353. -------
  1354. X_tr : ndarray of shape (n_samples, n_features)
  1355. Inverse transformed array.
  1356. """
  1357. check_is_fitted(self)
  1358. X = check_array(X, force_all_finite="allow-nan")
  1359. n_samples, _ = X.shape
  1360. n_features = len(self.categories_)
  1361. # validate shape of passed X
  1362. msg = (
  1363. "Shape of the passed X data is not correct. Expected {0} columns, got {1}."
  1364. )
  1365. if X.shape[1] != n_features:
  1366. raise ValueError(msg.format(n_features, X.shape[1]))
  1367. # create resulting array of appropriate dtype
  1368. dt = np.result_type(*[cat.dtype for cat in self.categories_])
  1369. X_tr = np.empty((n_samples, n_features), dtype=dt)
  1370. found_unknown = {}
  1371. infrequent_masks = {}
  1372. infrequent_indices = getattr(self, "_infrequent_indices", None)
  1373. for i in range(n_features):
  1374. labels = X[:, i]
  1375. # replace values of X[:, i] that were nan with actual indices
  1376. if i in self._missing_indices:
  1377. X_i_mask = _get_mask(labels, self.encoded_missing_value)
  1378. labels[X_i_mask] = self._missing_indices[i]
  1379. rows_to_update = slice(None)
  1380. categories = self.categories_[i]
  1381. if infrequent_indices is not None and infrequent_indices[i] is not None:
  1382. # Compute mask for frequent categories
  1383. infrequent_encoding_value = len(categories) - len(infrequent_indices[i])
  1384. infrequent_masks[i] = labels == infrequent_encoding_value
  1385. rows_to_update = ~infrequent_masks[i]
  1386. # Remap categories to be only frequent categories. The infrequent
  1387. # categories will be mapped to "infrequent_sklearn" later
  1388. frequent_categories_mask = np.ones_like(categories, dtype=bool)
  1389. frequent_categories_mask[infrequent_indices[i]] = False
  1390. categories = categories[frequent_categories_mask]
  1391. if self.handle_unknown == "use_encoded_value":
  1392. unknown_labels = _get_mask(labels, self.unknown_value)
  1393. found_unknown[i] = unknown_labels
  1394. known_labels = ~unknown_labels
  1395. if isinstance(rows_to_update, np.ndarray):
  1396. rows_to_update &= known_labels
  1397. else:
  1398. rows_to_update = known_labels
  1399. labels_int = labels[rows_to_update].astype("int64", copy=False)
  1400. X_tr[rows_to_update, i] = categories[labels_int]
  1401. if found_unknown or infrequent_masks:
  1402. X_tr = X_tr.astype(object, copy=False)
  1403. # insert None values for unknown values
  1404. if found_unknown:
  1405. for idx, mask in found_unknown.items():
  1406. X_tr[mask, idx] = None
  1407. if infrequent_masks:
  1408. for idx, mask in infrequent_masks.items():
  1409. X_tr[mask, idx] = "infrequent_sklearn"
  1410. return X_tr