- # Authors: Andreas Mueller <amueller@ais.uni-bonn.de>
- # Joris Van den Bossche <jorisvandenbossche@gmail.com>
- # License: BSD 3 clause
- import numbers
- import warnings
- from numbers import Integral
- import numpy as np
- from scipy import sparse
- from ..base import BaseEstimator, OneToOneFeatureMixin, TransformerMixin, _fit_context
- from ..utils import _safe_indexing, check_array, is_scalar_nan
- from ..utils._encode import _check_unknown, _encode, _get_counts, _unique
- from ..utils._mask import _get_mask
- from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions
- from ..utils._set_output import _get_output_config
- from ..utils.validation import _check_feature_names_in, check_is_fitted
- __all__ = ["OneHotEncoder", "OrdinalEncoder"]
- class _BaseEncoder(TransformerMixin, BaseEstimator):
- """
- Base class for encoders that includes the code to categorize and
- transform the input features.
- """
- def _check_X(self, X, force_all_finite=True):
- """
- Perform custom check_array:
- - convert list of strings to object dtype
- - check for missing values for object dtype data (check_array does
- not do that)
- - return list of features (arrays): this list of features is
- constructed feature by feature to preserve the data types
- of pandas DataFrame columns, as otherwise information is lost
- and cannot be used, e.g. for the `categories_` attribute.
- """
- if not (hasattr(X, "iloc") and getattr(X, "ndim", 0) == 2):
- # if not a dataframe, do normal check_array validation
- X_temp = check_array(X, dtype=None, force_all_finite=force_all_finite)
- if not hasattr(X, "dtype") and np.issubdtype(X_temp.dtype, np.str_):
- X = check_array(X, dtype=object, force_all_finite=force_all_finite)
- else:
- X = X_temp
- needs_validation = False
- else:
- # pandas dataframe, do validation later column by column, in order
- # to keep the dtype information to be used in the encoder.
- needs_validation = force_all_finite
- n_samples, n_features = X.shape
- X_columns = []
- for i in range(n_features):
- Xi = _safe_indexing(X, indices=i, axis=1)
- Xi = check_array(
- Xi, ensure_2d=False, dtype=None, force_all_finite=needs_validation
- )
- X_columns.append(Xi)
- return X_columns, n_samples, n_features
- def _fit(
- self,
- X,
- handle_unknown="error",
- force_all_finite=True,
- return_counts=False,
- return_and_ignore_missing_for_infrequent=False,
- ):
- self._check_infrequent_enabled()
- self._check_n_features(X, reset=True)
- self._check_feature_names(X, reset=True)
- X_list, n_samples, n_features = self._check_X(
- X, force_all_finite=force_all_finite
- )
- self.n_features_in_ = n_features
- if self.categories != "auto":
- if len(self.categories) != n_features:
- raise ValueError(
- "Shape mismatch: if categories is an array,"
- " it has to be of shape (n_features,)."
- )
- self.categories_ = []
- category_counts = []
- compute_counts = return_counts or self._infrequent_enabled
- for i in range(n_features):
- Xi = X_list[i]
- if self.categories == "auto":
- result = _unique(Xi, return_counts=compute_counts)
- if compute_counts:
- cats, counts = result
- category_counts.append(counts)
- else:
- cats = result
- else:
- if np.issubdtype(Xi.dtype, np.str_):
- # Always convert string categories to objects to avoid
- # unexpected string truncation for longer category labels
- # passed in the constructor.
- Xi_dtype = object
- else:
- Xi_dtype = Xi.dtype
- cats = np.array(self.categories[i], dtype=Xi_dtype)
- if (
- cats.dtype == object
- and isinstance(cats[0], bytes)
- and Xi.dtype.kind != "S"
- ):
- msg = (
- f"In column {i}, the predefined categories have type 'bytes'"
- " which is incompatible with values of type"
- f" '{type(Xi[0]).__name__}'."
- )
- raise ValueError(msg)
- if Xi.dtype.kind not in "OUS":
- sorted_cats = np.sort(cats)
- error_msg = (
- "Unsorted categories are not supported for numerical categories"
- )
- # if there are nans, nan should be the last element
- stop_idx = -1 if np.isnan(sorted_cats[-1]) else None
- if np.any(sorted_cats[:stop_idx] != cats[:stop_idx]) or (
- np.isnan(sorted_cats[-1]) and not np.isnan(cats[-1])
- ):
- raise ValueError(error_msg)
- if handle_unknown == "error":
- diff = _check_unknown(Xi, cats)
- if diff:
- msg = (
- "Found unknown categories {0} in column {1}"
- " during fit".format(diff, i)
- )
- raise ValueError(msg)
- if compute_counts:
- category_counts.append(_get_counts(Xi, cats))
- self.categories_.append(cats)
- output = {"n_samples": n_samples}
- if return_counts:
- output["category_counts"] = category_counts
- missing_indices = {}
- if return_and_ignore_missing_for_infrequent:
- for feature_idx, categories_for_idx in enumerate(self.categories_):
- for category_idx, category in enumerate(categories_for_idx):
- if is_scalar_nan(category):
- missing_indices[feature_idx] = category_idx
- break
- output["missing_indices"] = missing_indices
- if self._infrequent_enabled:
- self._fit_infrequent_category_mapping(
- n_samples,
- category_counts,
- missing_indices,
- )
- return output
- def _transform(
- self,
- X,
- handle_unknown="error",
- force_all_finite=True,
- warn_on_unknown=False,
- ignore_category_indices=None,
- ):
- X_list, n_samples, n_features = self._check_X(
- X, force_all_finite=force_all_finite
- )
- self._check_feature_names(X, reset=False)
- self._check_n_features(X, reset=False)
- X_int = np.zeros((n_samples, n_features), dtype=int)
- X_mask = np.ones((n_samples, n_features), dtype=bool)
- columns_with_unknown = []
- for i in range(n_features):
- Xi = X_list[i]
- diff, valid_mask = _check_unknown(Xi, self.categories_[i], return_mask=True)
- if not np.all(valid_mask):
- if handle_unknown == "error":
- msg = (
- "Found unknown categories {0} in column {1}"
- " during transform".format(diff, i)
- )
- raise ValueError(msg)
- else:
- if warn_on_unknown:
- columns_with_unknown.append(i)
- # Set the problematic rows to an acceptable value and
- # continue. The rows are marked in `X_mask` and will be
- # removed later.
- X_mask[:, i] = valid_mask
- # cast Xi into the largest string type necessary
- # to handle different lengths of numpy strings
- if (
- self.categories_[i].dtype.kind in ("U", "S")
- and self.categories_[i].itemsize > Xi.itemsize
- ):
- Xi = Xi.astype(self.categories_[i].dtype)
- elif self.categories_[i].dtype.kind == "O" and Xi.dtype.kind == "U":
- # categories are objects and Xi are numpy strings.
- # Cast Xi to an object dtype to prevent truncation
- # when setting invalid values.
- Xi = Xi.astype("O")
- else:
- Xi = Xi.copy()
- Xi[~valid_mask] = self.categories_[i][0]
- # We use check_unknown=False, since _check_unknown was
- # already called above.
- X_int[:, i] = _encode(Xi, uniques=self.categories_[i], check_unknown=False)
- if columns_with_unknown:
- warnings.warn(
- (
- "Found unknown categories in columns "
- f"{columns_with_unknown} during transform. These "
- "unknown categories will be encoded as all zeros"
- ),
- UserWarning,
- )
- self._map_infrequent_categories(X_int, X_mask, ignore_category_indices)
- return X_int, X_mask
- @property
- def infrequent_categories_(self):
- """Infrequent categories for each feature."""
- # raises an AttributeError if `_infrequent_indices` is not defined
- infrequent_indices = self._infrequent_indices
- return [
- None if indices is None else category[indices]
- for category, indices in zip(self.categories_, infrequent_indices)
- ]
- def _check_infrequent_enabled(self):
- """
- This function checks whether _infrequent_enabled is True or False.
- This has to be called after parameter validation in the fit function.
- """
- max_categories = getattr(self, "max_categories", None)
- min_frequency = getattr(self, "min_frequency", None)
- self._infrequent_enabled = (
- max_categories is not None and max_categories >= 1
- ) or min_frequency is not None
- def _identify_infrequent(self, category_count, n_samples, col_idx):
- """Compute the infrequent indices.
- Parameters
- ----------
- category_count : ndarray of shape (n_cardinality,)
- Category counts.
- n_samples : int
- Number of samples.
- col_idx : int
- Index of the current feature (column). Only used for the error message.
- Returns
- -------
- output : ndarray of shape (n_infrequent_categories,) or None
- If there are infrequent categories, indices of infrequent
- categories. Otherwise None.
- """
- if isinstance(self.min_frequency, numbers.Integral):
- infrequent_mask = category_count < self.min_frequency
- elif isinstance(self.min_frequency, numbers.Real):
- min_frequency_abs = n_samples * self.min_frequency
- infrequent_mask = category_count < min_frequency_abs
- else:
- infrequent_mask = np.zeros(category_count.shape[0], dtype=bool)
- n_current_features = category_count.size - infrequent_mask.sum() + 1
- if self.max_categories is not None and self.max_categories < n_current_features:
- # max_categories includes the one infrequent category
- frequent_category_count = self.max_categories - 1
- if frequent_category_count == 0:
- # All categories are infrequent
- infrequent_mask[:] = True
- else:
- # stable sort to preserve original count order
- smallest_levels = np.argsort(category_count, kind="mergesort")[
- :-frequent_category_count
- ]
- infrequent_mask[smallest_levels] = True
- output = np.flatnonzero(infrequent_mask)
- return output if output.size > 0 else None
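- # Worked example (illustrative values, not executed): with
- # category_count = [5, 20, 10, 3], n_samples = 38 and
- # min_frequency = 6, counts 5 and 3 fall below the threshold, so the
- # method returns array([0, 3]); max_categories can only grow this
- # set further by marking the smallest remaining counts as infrequent.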
- def _fit_infrequent_category_mapping(
- self, n_samples, category_counts, missing_indices
- ):
- """Fit infrequent categories.
- Defines the private attribute: `_default_to_infrequent_mappings`. For
- feature `i`, `_default_to_infrequent_mappings[i]` defines the mapping
- from the integer encoding returned by `super().transform()` into
- infrequent categories. If `_default_to_infrequent_mappings[i]` is None,
- there were no infrequent categories in the training set.
- For example if categories 0, 2 and 4 were frequent, while categories
- 1, 3, 5 were infrequent for feature 7, then these categories are mapped
- to a single output:
- `_default_to_infrequent_mappings[7] = array([0, 3, 1, 3, 2, 3])`
- Defines private attribute: `_infrequent_indices`. `_infrequent_indices[i]`
- is an array of indices such that
- `categories_[i][_infrequent_indices[i]]` are all the infrequent category
- labels. If the feature `i` has no infrequent categories
- `_infrequent_indices[i]` is None.
- .. versionadded:: 1.1
- Parameters
- ----------
- n_samples : int
- Number of samples in training set.
- category_counts : list of ndarray
- `category_counts[i]` is the category counts corresponding to
- `self.categories_[i]`.
- missing_indices : dict
- Dict mapping from feature_idx to category index with a missing value.
- """
- # Remove missing value from counts, so it is not considered as infrequent
- if missing_indices:
- category_counts_ = []
- for feature_idx, count in enumerate(category_counts):
- if feature_idx in missing_indices:
- category_counts_.append(
- np.delete(count, missing_indices[feature_idx])
- )
- else:
- category_counts_.append(count)
- else:
- category_counts_ = category_counts
- self._infrequent_indices = [
- self._identify_infrequent(category_count, n_samples, col_idx)
- for col_idx, category_count in enumerate(category_counts_)
- ]
- # compute mapping from default mapping to infrequent mapping
- self._default_to_infrequent_mappings = []
- for feature_idx, infreq_idx in enumerate(self._infrequent_indices):
- cats = self.categories_[feature_idx]
- # no infrequent categories
- if infreq_idx is None:
- self._default_to_infrequent_mappings.append(None)
- continue
- n_cats = len(cats)
- if feature_idx in missing_indices:
- # Missing index was removed from this category when computing
- # infrequent indices, thus we need to decrease the number of
- # total categories when considering the infrequent mapping.
- n_cats -= 1
- # infrequent indices exist
- mapping = np.empty(n_cats, dtype=np.int64)
- n_infrequent_cats = infreq_idx.size
- # infrequent categories are mapped to the last element.
- n_frequent_cats = n_cats - n_infrequent_cats
- mapping[infreq_idx] = n_frequent_cats
- frequent_indices = np.setdiff1d(np.arange(n_cats), infreq_idx)
- mapping[frequent_indices] = np.arange(n_frequent_cats)
- self._default_to_infrequent_mappings.append(mapping)
- def _map_infrequent_categories(self, X_int, X_mask, ignore_category_indices):
- """Map infrequent categories to integer representing the infrequent category.
- This modifies X_int in-place. Values that were invalid based on `X_mask`
- are mapped to the infrequent category if there was an infrequent
- category for that feature.
- Parameters
- ----------
- X_int : ndarray of shape (n_samples, n_features)
- Integer encoded categories.
- X_mask : ndarray of shape (n_samples, n_features)
- Bool mask for valid values in `X_int`.
- ignore_category_indices : dict
- Dictionary mapping from feature_idx to category index to ignore.
- Ignored indexes will not be grouped and the original ordinal encoding
- will remain.
- """
- if not self._infrequent_enabled:
- return
- ignore_category_indices = ignore_category_indices or {}
- for col_idx in range(X_int.shape[1]):
- infrequent_idx = self._infrequent_indices[col_idx]
- if infrequent_idx is None:
- continue
- X_int[~X_mask[:, col_idx], col_idx] = infrequent_idx[0]
- if self.handle_unknown == "infrequent_if_exist":
- # All the unknown values are now mapped to the
- # infrequent_idx[0], which makes the unknown values valid
- # This is needed in `transform` when the encoding is formed
- # using `X_mask`.
- X_mask[:, col_idx] = True
- # Remaps encoding in `X_int` where the infrequent categories are
- # grouped together.
- for i, mapping in enumerate(self._default_to_infrequent_mappings):
- if mapping is None:
- continue
- if i in ignore_category_indices:
- # Update rows that are **not** ignored
- rows_to_update = X_int[:, i] != ignore_category_indices[i]
- else:
- rows_to_update = slice(None)
- X_int[rows_to_update, i] = np.take(mapping, X_int[rows_to_update, i])
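- # For example (illustrative, not executed): with
- # _default_to_infrequent_mappings[i] = [2, 0, 1, 2] (the categories
- # at indices 0 and 3 are infrequent), np.take remaps default codes
- # [0, 1, 2, 3] to [2, 0, 1, 2], so both infrequent categories share
- # the last grouped code.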
- def _more_tags(self):
- return {"X_types": ["2darray", "categorical"], "allow_nan": True}
- class OneHotEncoder(_BaseEncoder):
- """
- Encode categorical features as a one-hot numeric array.
- The input to this transformer should be an array-like of integers or
- strings, denoting the values taken on by categorical (discrete) features.
- The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
- encoding scheme. This creates a binary column for each category and
- returns a sparse matrix or dense array (depending on the ``sparse_output``
- parameter).
- By default, the encoder derives the categories based on the unique values
- in each feature. Alternatively, you can also specify the `categories`
- manually.
- This encoding is needed for feeding categorical data to many scikit-learn
- estimators, notably linear models and SVMs with the standard kernels.
- Note: a one-hot encoding of y labels should use a LabelBinarizer
- instead.
- Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
- For a comparison of different encoders, refer to:
- :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`.
- Parameters
- ----------
- categories : 'auto' or a list of array-like, default='auto'
- Categories (unique values) per feature:
- - 'auto' : Determine categories automatically from the training data.
- - list : ``categories[i]`` holds the categories expected in the ith
- column. The passed categories should not mix strings and numeric
- values within a single feature, and should be sorted in case of
- numeric values.
- The used categories can be found in the ``categories_`` attribute.
- .. versionadded:: 0.20
- drop : {'first', 'if_binary'} or an array-like of shape (n_features,), \
- default=None
- Specifies a methodology to use to drop one of the categories per
- feature. This is useful in situations where perfectly collinear
- features cause problems, such as when feeding the resulting data
- into an unregularized linear regression model.
- However, dropping one category breaks the symmetry of the original
- representation and can therefore induce a bias in downstream models,
- for instance for penalized linear classification or regression models.
- - None : retain all features (the default).
- - 'first' : drop the first category in each feature. If only one
- category is present, the feature will be dropped entirely.
- - 'if_binary' : drop the first category in each feature with two
- categories. Features with 1 or more than 2 categories are
- left intact.
- - array : ``drop[i]`` is the category in feature ``X[:, i]`` that
- should be dropped.
- When `max_categories` or `min_frequency` is configured to group
- infrequent categories, the dropping behavior is handled after the
- grouping.
- .. versionadded:: 0.21
- The parameter `drop` was added in 0.21.
- .. versionchanged:: 0.23
- The option `drop='if_binary'` was added in 0.23.
- .. versionchanged:: 1.1
- Support for dropping infrequent categories.
- sparse : bool, default=True
- Will return a sparse matrix if set True, else will return an array.
- .. deprecated:: 1.2
- `sparse` is deprecated in 1.2 and will be removed in 1.4. Use
- `sparse_output` instead.
- sparse_output : bool, default=True
- Will return a sparse matrix if set True, else will return an array.
- .. versionadded:: 1.2
- `sparse` was renamed to `sparse_output`
- dtype : number type, default=np.float64
- Desired dtype of output.
- handle_unknown : {'error', 'ignore', 'infrequent_if_exist'}, \
- default='error'
- Specifies the way unknown categories are handled during :meth:`transform`.
- - 'error' : Raise an error if an unknown category is present during transform.
- - 'ignore' : When an unknown category is encountered during
- transform, the resulting one-hot encoded columns for this feature
- will be all zeros. In the inverse transform, an unknown category
- will be denoted as None.
- - 'infrequent_if_exist' : When an unknown category is encountered
- during transform, the resulting one-hot encoded columns for this
- feature will map to the infrequent category if it exists. The
- infrequent category will be mapped to the last position in the
- encoding. During inverse transform, an unknown category will be
- mapped to the category denoted `'infrequent'` if it exists. If the
- `'infrequent'` category does not exist, then :meth:`transform` and
- :meth:`inverse_transform` will handle an unknown category as with
- `handle_unknown='ignore'`. Infrequent categories exist based on
- `min_frequency` and `max_categories`. Read more in the
- :ref:`User Guide <encoder_infrequent_categories>`.
- .. versionchanged:: 1.1
- `'infrequent_if_exist'` was added to automatically handle unknown
- categories and infrequent categories.
- min_frequency : int or float, default=None
- Specifies the minimum frequency below which a category will be
- considered infrequent.
- - If `int`, categories with a smaller cardinality will be considered
- infrequent.
- - If `float`, categories with a smaller cardinality than
- `min_frequency * n_samples` will be considered infrequent.
- .. versionadded:: 1.1
- Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
- max_categories : int, default=None
- Specifies an upper limit to the number of output features for each input
- feature when considering infrequent categories. If there are infrequent
- categories, `max_categories` includes the category representing the
- infrequent categories along with the frequent categories. If `None`,
- there is no limit to the number of output features.
- .. versionadded:: 1.1
- Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
- feature_name_combiner : "concat" or callable, default="concat"
- Callable with signature `def callable(input_feature, category)` that returns a
- string. This is used to create feature names to be returned by
- :meth:`get_feature_names_out`.
- `"concat"` concatenates encoded feature name and category with
- `feature + "_" + str(category)`.E.g. feature X with values 1, 6, 7 create
- feature names `X_1, X_6, X_7`.
- .. versionadded:: 1.3
- Attributes
- ----------
- categories_ : list of arrays
- The categories of each feature determined during fitting
- (in order of the features in X and corresponding with the output
- of ``transform``). This includes the category specified in ``drop``
- (if any).
- drop_idx_ : array of shape (n_features,)
- - ``drop_idx_[i]`` is the index in ``categories_[i]`` of the category
- to be dropped for each feature.
- - ``drop_idx_[i] = None`` if no category is to be dropped from the
- feature with index ``i``, e.g. when `drop='if_binary'` and the
- feature isn't binary.
- - ``drop_idx_ = None`` if all the transformed features will be
- retained.
- If infrequent categories are enabled by setting `min_frequency` or
- `max_categories` to a non-default value and `drop_idx_[i]` corresponds
- to an infrequent category, then the entire infrequent category is
- dropped.
- .. versionchanged:: 0.23
- Added the possibility to contain `None` values.
- infrequent_categories_ : list of ndarray
- Defined only if infrequent categories are enabled by setting
- `min_frequency` or `max_categories` to a non-default value.
- `infrequent_categories_[i]` are the infrequent categories for feature
- `i`. If the feature `i` has no infrequent categories
- `infrequent_categories_[i]` is None.
- .. versionadded:: 1.1
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 1.0
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- feature_name_combiner : callable or None
- Callable with signature `def callable(input_feature, category)` that returns a
- string. This is used to create feature names to be returned by
- :meth:`get_feature_names_out`.
- .. versionadded:: 1.3
- See Also
- --------
- OrdinalEncoder : Performs an ordinal (integer)
- encoding of the categorical features.
- TargetEncoder : Encodes categorical features using the target.
- sklearn.feature_extraction.DictVectorizer : Performs a one-hot encoding of
- dictionary items (also handles string-valued features).
- sklearn.feature_extraction.FeatureHasher : Performs an approximate one-hot
- encoding of dictionary items or strings.
- LabelBinarizer : Binarizes labels in a one-vs-all
- fashion.
- MultiLabelBinarizer : Transforms between iterable of
- iterables and a multilabel format, e.g. a (samples x classes) binary
- matrix indicating the presence of a class label.
- Examples
- --------
- Given a dataset with two features, we let the encoder find the unique
- values per feature and transform the data to a binary one-hot encoding.
- >>> from sklearn.preprocessing import OneHotEncoder
- One can discard categories not seen during `fit`:
- >>> enc = OneHotEncoder(handle_unknown='ignore')
- >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
- >>> enc.fit(X)
- OneHotEncoder(handle_unknown='ignore')
- >>> enc.categories_
- [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
- >>> enc.transform([['Female', 1], ['Male', 4]]).toarray()
- array([[1., 0., 1., 0., 0.],
- [0., 1., 0., 0., 0.]])
- >>> enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])
- array([['Male', 1],
- [None, 2]], dtype=object)
- >>> enc.get_feature_names_out(['gender', 'group'])
- array(['gender_Female', 'gender_Male', 'group_1', 'group_2', 'group_3'], ...)
- One can always drop the first column for each feature:
- >>> drop_enc = OneHotEncoder(drop='first').fit(X)
- >>> drop_enc.categories_
- [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
- >>> drop_enc.transform([['Female', 1], ['Male', 2]]).toarray()
- array([[0., 0., 0.],
- [1., 1., 0.]])
- Or drop a column for feature only having 2 categories:
- >>> drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X)
- >>> drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray()
- array([[0., 1., 0., 0.],
- [1., 0., 1., 0.]])
- One can change the way feature names are created.
- >>> def custom_combiner(feature, category):
- ... return str(feature) + "_" + type(category).__name__ + "_" + str(category)
- >>> custom_fnames_enc = OneHotEncoder(feature_name_combiner=custom_combiner).fit(X)
- >>> custom_fnames_enc.get_feature_names_out()
- array(['x0_str_Female', 'x0_str_Male', 'x1_int_1', 'x1_int_2', 'x1_int_3'],
- dtype=object)
- Infrequent categories are enabled by setting `max_categories` or `min_frequency`.
- >>> import numpy as np
- >>> X = np.array([["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3], dtype=object).T
- >>> ohe = OneHotEncoder(max_categories=3, sparse_output=False).fit(X)
- >>> ohe.infrequent_categories_
- [array(['a', 'd'], dtype=object)]
- >>> ohe.transform([["a"], ["b"]])
- array([[0., 0., 1.],
- [1., 0., 0.]])
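- With `handle_unknown='infrequent_if_exist'`, unknown categories met at
- transform time are also routed to the infrequent column (a minimal
- sketch reusing `X` from above):
- >>> ohe = OneHotEncoder(max_categories=3, sparse_output=False,
- ...                     handle_unknown='infrequent_if_exist').fit(X)
- >>> ohe.transform([["z"]])
- array([[0., 0., 1.]])
- >>> ohe.get_feature_names_out()
- array(['x0_b', 'x0_c', 'x0_infrequent_sklearn'], dtype=object)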
- """
- _parameter_constraints: dict = {
- "categories": [StrOptions({"auto"}), list],
- "drop": [StrOptions({"first", "if_binary"}), "array-like", None],
- "dtype": "no_validation", # validation delegated to numpy
- "handle_unknown": [StrOptions({"error", "ignore", "infrequent_if_exist"})],
- "max_categories": [Interval(Integral, 1, None, closed="left"), None],
- "min_frequency": [
- Interval(Integral, 1, None, closed="left"),
- Interval(RealNotInt, 0, 1, closed="neither"),
- None,
- ],
- "sparse": [Hidden(StrOptions({"deprecated"})), "boolean"], # deprecated
- "sparse_output": ["boolean"],
- "feature_name_combiner": [StrOptions({"concat"}), callable],
- }
- def __init__(
- self,
- *,
- categories="auto",
- drop=None,
- sparse="deprecated",
- sparse_output=True,
- dtype=np.float64,
- handle_unknown="error",
- min_frequency=None,
- max_categories=None,
- feature_name_combiner="concat",
- ):
- self.categories = categories
- # TODO(1.4): Remove self.sparse
- self.sparse = sparse
- self.sparse_output = sparse_output
- self.dtype = dtype
- self.handle_unknown = handle_unknown
- self.drop = drop
- self.min_frequency = min_frequency
- self.max_categories = max_categories
- self.feature_name_combiner = feature_name_combiner
- def _map_drop_idx_to_infrequent(self, feature_idx, drop_idx):
- """Convert `drop_idx` into the index for infrequent categories.
- If there are no infrequent categories, then `drop_idx` is
- returned. This method is called in `_set_drop_idx` when the `drop`
- parameter is an array-like.
- """
- if not self._infrequent_enabled:
- return drop_idx
- default_to_infrequent = self._default_to_infrequent_mappings[feature_idx]
- if default_to_infrequent is None:
- return drop_idx
- # Raise error when explicitly dropping a category that is infrequent
- infrequent_indices = self._infrequent_indices[feature_idx]
- if infrequent_indices is not None and drop_idx in infrequent_indices:
- categories = self.categories_[feature_idx]
- raise ValueError(
- f"Unable to drop category {categories[drop_idx].item()!r} from"
- f" feature {feature_idx} because it is infrequent"
- )
- return default_to_infrequent[drop_idx]
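- # For instance (illustrative, not executed): with
- # _default_to_infrequent_mappings[feature_idx] = [2, 0, 1, 2],
- # drop_idx=1 returns the grouped index 0, while drop_idx=0 (an
- # infrequent category) raises the ValueError above.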
- def _set_drop_idx(self):
- """Compute the drop indices associated with `self.categories_`.
- If `self.drop` is:
- - `None`, No categories have been dropped.
- - `'first'`, All zeros to drop the first category.
- - `'if_binary'`, All zeros if the category is binary and `None`
- otherwise.
- - array-like, The indices of the categories that match the
- categories in `self.drop`. If the dropped category is an infrequent
- category, then the index for the infrequent category is used. This
- means that the entire infrequent category is dropped.
- This method defines a public `drop_idx_` and a private
- `_drop_idx_after_grouping`.
- - `drop_idx_`: Public facing API that references the drop category in
- `self.categories_`.
- - `_drop_idx_after_grouping`: Used internally to drop categories *after* the
- infrequent categories are grouped together.
- If there are no infrequent categories or drop is `None`, then
- `drop_idx_=_drop_idx_after_grouping`.
- """
- if self.drop is None:
- drop_idx_after_grouping = None
- elif isinstance(self.drop, str):
- if self.drop == "first":
- drop_idx_after_grouping = np.zeros(len(self.categories_), dtype=object)
- elif self.drop == "if_binary":
- n_features_out_no_drop = [len(cat) for cat in self.categories_]
- if self._infrequent_enabled:
- for i, infreq_idx in enumerate(self._infrequent_indices):
- if infreq_idx is None:
- continue
- n_features_out_no_drop[i] -= infreq_idx.size - 1
- drop_idx_after_grouping = np.array(
- [
- 0 if n_features_out == 2 else None
- for n_features_out in n_features_out_no_drop
- ],
- dtype=object,
- )
- else:
- drop_array = np.asarray(self.drop, dtype=object)
- droplen = len(drop_array)
- if droplen != len(self.categories_):
- msg = (
- "`drop` should have length equal to the number "
- "of features ({}), got {}"
- )
- raise ValueError(msg.format(len(self.categories_), droplen))
- missing_drops = []
- drop_indices = []
- for feature_idx, (drop_val, cat_list) in enumerate(
- zip(drop_array, self.categories_)
- ):
- if not is_scalar_nan(drop_val):
- drop_idx = np.where(cat_list == drop_val)[0]
- if drop_idx.size: # found drop idx
- drop_indices.append(
- self._map_drop_idx_to_infrequent(feature_idx, drop_idx[0])
- )
- else:
- missing_drops.append((feature_idx, drop_val))
- continue
- # drop_val is nan, find nan in categories manually
- for cat_idx, cat in enumerate(cat_list):
- if is_scalar_nan(cat):
- drop_indices.append(
- self._map_drop_idx_to_infrequent(feature_idx, cat_idx)
- )
- break
- else: # loop did not break thus drop is missing
- missing_drops.append((feature_idx, drop_val))
- if missing_drops:
- msg = (
- "The following categories were supposed to be "
- "dropped, but were not found in the training "
- "data.\n{}".format(
- "\n".join(
- [
- "Category: {}, Feature: {}".format(c, v)
- for c, v in missing_drops
- ]
- )
- )
- )
- raise ValueError(msg)
- drop_idx_after_grouping = np.array(drop_indices, dtype=object)
- # `_drop_idx_after_grouping` are the categories to drop *after* the infrequent
- # categories are grouped together. If needed, we remap `drop_idx` back
- # to the categories seen in `self.categories_`.
- self._drop_idx_after_grouping = drop_idx_after_grouping
- if not self._infrequent_enabled or drop_idx_after_grouping is None:
- self.drop_idx_ = self._drop_idx_after_grouping
- else:
- drop_idx_ = []
- for feature_idx, drop_idx in enumerate(drop_idx_after_grouping):
- default_to_infrequent = self._default_to_infrequent_mappings[
- feature_idx
- ]
- if drop_idx is None or default_to_infrequent is None:
- orig_drop_idx = drop_idx
- else:
- orig_drop_idx = np.flatnonzero(default_to_infrequent == drop_idx)[0]
- drop_idx_.append(orig_drop_idx)
- self.drop_idx_ = np.asarray(drop_idx_, dtype=object)
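- # Sketch of the remapping (illustrative, not executed): with
- # categories_ = [array(['a', 'b', 'c', 'd'])], infrequent indices
- # [array([0, 3])] and drop='first', the first *grouped* column is
- # dropped, so _drop_idx_after_grouping == array([0]), while drop_idx_
- # is mapped back through [2, 0, 1, 2] to array([1]), the index of 'b'
- # in categories_.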
- def _compute_transformed_categories(self, i, remove_dropped=True):
- """Compute the transformed categories used for column `i`.
- 1. If there are infrequent categories, the category is named
- 'infrequent_sklearn'.
- 2. Dropped columns are removed when remove_dropped=True.
- """
- cats = self.categories_[i]
- if self._infrequent_enabled:
- infreq_map = self._default_to_infrequent_mappings[i]
- if infreq_map is not None:
- frequent_mask = infreq_map < infreq_map.max()
- infrequent_cat = "infrequent_sklearn"
- # infrequent category is always at the end
- cats = np.concatenate(
- (cats[frequent_mask], np.array([infrequent_cat], dtype=object))
- )
- if remove_dropped:
- cats = self._remove_dropped_categories(cats, i)
- return cats
- def _remove_dropped_categories(self, categories, i):
- """Remove dropped categories."""
- if (
- self._drop_idx_after_grouping is not None
- and self._drop_idx_after_grouping[i] is not None
- ):
- return np.delete(categories, self._drop_idx_after_grouping[i])
- return categories
- def _compute_n_features_outs(self):
- """Compute the n_features_out for each input feature."""
- output = [len(cats) for cats in self.categories_]
- if self._drop_idx_after_grouping is not None:
- for i, drop_idx in enumerate(self._drop_idx_after_grouping):
- if drop_idx is not None:
- output[i] -= 1
- if not self._infrequent_enabled:
- return output
- # infrequent is enabled; the number of features out is reduced
- # because the infrequent categories are grouped together
- for i, infreq_idx in enumerate(self._infrequent_indices):
- if infreq_idx is None:
- continue
- output[i] -= infreq_idx.size - 1
- return output
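- # e.g. (illustrative): a feature with 5 categories, 3 of them grouped
- # as infrequent, contributes 5 - (3 - 1) = 3 output columns, and one
- # fewer again if a category is dropped.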
- @_fit_context(prefer_skip_nested_validation=True)
- def fit(self, X, y=None):
- """
- Fit OneHotEncoder to X.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- The data to determine the categories of each feature.
- y : None
- Ignored. This parameter exists only for compatibility with
- :class:`~sklearn.pipeline.Pipeline`.
- Returns
- -------
- self
- Fitted encoder.
- """
- if self.sparse != "deprecated":
- warnings.warn(
- (
- "`sparse` was renamed to `sparse_output` in version 1.2 and "
- "will be removed in 1.4. `sparse_output` is ignored unless you "
- "leave `sparse` to its default value."
- ),
- FutureWarning,
- )
- self.sparse_output = self.sparse
- self._fit(
- X,
- handle_unknown=self.handle_unknown,
- force_all_finite="allow-nan",
- )
- self._set_drop_idx()
- self._n_features_outs = self._compute_n_features_outs()
- return self
- def transform(self, X):
- """
- Transform X using one-hot encoding.
- If there are infrequent categories for a feature, the infrequent
- categories will be grouped into a single category.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- The data to encode.
- Returns
- -------
- X_out : {ndarray, sparse matrix} of shape \
- (n_samples, n_encoded_features)
- Transformed input. If `sparse_output=True`, a sparse matrix will be
- returned.
- """
- check_is_fitted(self)
- transform_output = _get_output_config("transform", estimator=self)["dense"]
- if transform_output == "pandas" and self.sparse_output:
- raise ValueError(
- "Pandas output does not support sparse data. Set sparse_output=False to"
- " output pandas DataFrames or disable pandas output via"
- ' `ohe.set_output(transform="default")`.'
- )
- # validation of X happens in _check_X called by _transform
- warn_on_unknown = self.drop is not None and self.handle_unknown in {
- "ignore",
- "infrequent_if_exist",
- }
- X_int, X_mask = self._transform(
- X,
- handle_unknown=self.handle_unknown,
- force_all_finite="allow-nan",
- warn_on_unknown=warn_on_unknown,
- )
- n_samples, n_features = X_int.shape
- if self._drop_idx_after_grouping is not None:
- to_drop = self._drop_idx_after_grouping.copy()
- # We remove all the dropped categories from mask, and decrement all
- # categories that occur after them to avoid an empty column.
- keep_cells = X_int != to_drop
- for i, cats in enumerate(self.categories_):
- # drop='if_binary' but feature isn't binary
- if to_drop[i] is None:
- # set to cardinality to not drop from X_int
- to_drop[i] = len(cats)
- to_drop = to_drop.reshape(1, -1)
- X_int[X_int > to_drop] -= 1
- X_mask &= keep_cells
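- # e.g. with to_drop == 1 for a 3-category feature, codes [0, 1, 2]
- # become [0, <masked>, 1]: the dropped cell is removed via the mask
- # and codes above it shift down by one.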
- mask = X_mask.ravel()
- feature_indices = np.cumsum([0] + self._n_features_outs)
- indices = (X_int + feature_indices[:-1]).ravel()[mask]
- indptr = np.empty(n_samples + 1, dtype=int)
- indptr[0] = 0
- np.sum(X_mask, axis=1, out=indptr[1:], dtype=indptr.dtype)
- np.cumsum(indptr[1:], out=indptr[1:])
- data = np.ones(indptr[-1])
- out = sparse.csr_matrix(
- (data, indices, indptr),
- shape=(n_samples, feature_indices[-1]),
- dtype=self.dtype,
- )
- if not self.sparse_output:
- return out.toarray()
- else:
- return out
- def inverse_transform(self, X):
- """
- Convert the data back to the original representation.
- When unknown categories are encountered (all zeros in the
- one-hot encoding), ``None`` is used to represent this category. If the
- feature with the unknown category has a dropped category, the dropped
- category will be its inverse.
- For a given input feature, if there is an infrequent category,
- 'infrequent_sklearn' will be used to represent the infrequent category.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape \
- (n_samples, n_encoded_features)
- The transformed data.
- Returns
- -------
- X_tr : ndarray of shape (n_samples, n_features)
- Inverse transformed array.
- """
- check_is_fitted(self)
- X = check_array(X, accept_sparse="csr")
- n_samples, _ = X.shape
- n_features = len(self.categories_)
- n_features_out = np.sum(self._n_features_outs)
- # validate shape of passed X
- msg = (
- "Shape of the passed X data is not correct. Expected {0} columns, got {1}."
- )
- if X.shape[1] != n_features_out:
- raise ValueError(msg.format(n_features_out, X.shape[1]))
- transformed_features = [
- self._compute_transformed_categories(i, remove_dropped=False)
- for i, _ in enumerate(self.categories_)
- ]
- # create resulting array of appropriate dtype
- dt = np.result_type(*[cat.dtype for cat in transformed_features])
- X_tr = np.empty((n_samples, n_features), dtype=dt)
- j = 0
- found_unknown = {}
- if self._infrequent_enabled:
- infrequent_indices = self._infrequent_indices
- else:
- infrequent_indices = [None] * n_features
- for i in range(n_features):
- cats_wo_dropped = self._remove_dropped_categories(
- transformed_features[i], i
- )
- n_categories = cats_wo_dropped.shape[0]
- # Only happens if there was a column with a unique
- # category. In this case we just fill the column with this
- # unique category value.
- if n_categories == 0:
- X_tr[:, i] = self.categories_[i][self._drop_idx_after_grouping[i]]
- j += n_categories
- continue
- sub = X[:, j : j + n_categories]
- # for sparse X argmax returns 2D matrix, ensure 1D array
- labels = np.asarray(sub.argmax(axis=1)).flatten()
- X_tr[:, i] = cats_wo_dropped[labels]
- if self.handle_unknown == "ignore" or (
- self.handle_unknown == "infrequent_if_exist"
- and infrequent_indices[i] is None
- ):
- unknown = np.asarray(sub.sum(axis=1) == 0).flatten()
- # ignored unknown categories: we have a row of all zero
- if unknown.any():
- # if categories were dropped then unknown categories will
- # be mapped to the dropped category
- if (
- self._drop_idx_after_grouping is None
- or self._drop_idx_after_grouping[i] is None
- ):
- found_unknown[i] = unknown
- else:
- X_tr[unknown, i] = self.categories_[i][
- self._drop_idx_after_grouping[i]
- ]
- else:
- dropped = np.asarray(sub.sum(axis=1) == 0).flatten()
- if dropped.any():
- if self._drop_idx_after_grouping is None:
- all_zero_samples = np.flatnonzero(dropped)
- raise ValueError(
- f"Samples {all_zero_samples} can not be inverted "
- "when drop=None and handle_unknown='error' "
- "because they contain all zeros"
- )
- # we can safely assume that all of the nulls in each column
- # are the dropped value
- drop_idx = self._drop_idx_after_grouping[i]
- X_tr[dropped, i] = transformed_features[i][drop_idx]
- j += n_categories
- # if ignored are found: potentially need to upcast result to
- # insert None values
- if found_unknown:
- if X_tr.dtype != object:
- X_tr = X_tr.astype(object)
- for idx, mask in found_unknown.items():
- X_tr[mask, idx] = None
- return X_tr
- def get_feature_names_out(self, input_features=None):
- """Get output feature names for transformation.
- Parameters
- ----------
- input_features : array-like of str or None, default=None
- Input features.
- - If `input_features` is `None`, then `feature_names_in_` is
- used as feature names in. If `feature_names_in_` is not defined,
- then the following input feature names are generated:
- `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
- - If `input_features` is an array-like, then `input_features` must
- match `feature_names_in_` if `feature_names_in_` is defined.
- Returns
- -------
- feature_names_out : ndarray of str objects
- Transformed feature names.
- """
- check_is_fitted(self)
- input_features = _check_feature_names_in(self, input_features)
- cats = [
- self._compute_transformed_categories(i)
- for i, _ in enumerate(self.categories_)
- ]
- name_combiner = self._check_get_feature_name_combiner()
- feature_names = []
- for i in range(len(cats)):
- names = [name_combiner(input_features[i], t) for t in cats[i]]
- feature_names.extend(names)
- return np.array(feature_names, dtype=object)
- def _check_get_feature_name_combiner(self):
- if self.feature_name_combiner == "concat":
- return lambda feature, category: feature + "_" + str(category)
- else: # callable
- dry_run_combiner = self.feature_name_combiner("feature", "category")
- if not isinstance(dry_run_combiner, str):
- raise TypeError(
- "When `feature_name_combiner` is a callable, it should return a "
- f"Python string. Got {type(dry_run_combiner)} instead."
- )
- return self.feature_name_combiner
- class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
- """
- Encode categorical features as an integer array.
- The input to this transformer should be an array-like of integers or
- strings, denoting the values taken on by categorical (discrete) features.
- The features are converted to ordinal integers. This results in
- a single column of integers (0 to n_categories - 1) per feature.
- Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
- For a comparison of different encoders, refer to:
- :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`.
- .. versionadded:: 0.20
- Parameters
- ----------
- categories : 'auto' or a list of array-like, default='auto'
- Categories (unique values) per feature:
- - 'auto' : Determine categories automatically from the training data.
- - list : ``categories[i]`` holds the categories expected in the ith
- column. The passed categories should not mix strings and numeric
- values, and should be sorted in case of numeric values.
- The used categories can be found in the ``categories_`` attribute.
- dtype : number type, default=np.float64
- Desired dtype of output.
- handle_unknown : {'error', 'use_encoded_value'}, default='error'
- When set to 'error' an error will be raised in case an unknown
- categorical feature is present during transform. When set to
- 'use_encoded_value', the encoded value of unknown categories will be
- set to the value given for the parameter `unknown_value`. In
- :meth:`inverse_transform`, an unknown category will be denoted as None.
- .. versionadded:: 0.24
- unknown_value : int or np.nan, default=None
- When the parameter handle_unknown is set to 'use_encoded_value', this
- parameter is required and will set the encoded value of unknown
- categories. It has to be distinct from the values used to encode any of
- the categories in `fit`. If set to np.nan, the `dtype` parameter must
- be a float dtype.
- .. versionadded:: 0.24
- encoded_missing_value : int or np.nan, default=np.nan
- Encoded value of missing categories. If set to `np.nan`, then the `dtype`
- parameter must be a float dtype.
- .. versionadded:: 1.1
- min_frequency : int or float, default=None
- Specifies the minimum frequency below which a category will be
- considered infrequent.
- - If `int`, categories with a smaller cardinality will be considered
- infrequent.
- - If `float`, categories with a smaller cardinality than
- `min_frequency * n_samples` will be considered infrequent.
- .. versionadded:: 1.3
- Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
- max_categories : int, default=None
- Specifies an upper limit to the number of output categories for each input
- feature when considering infrequent categories. If there are infrequent
- categories, `max_categories` includes the category representing the
- infrequent categories along with the frequent categories. If `None`,
- there is no limit to the number of output categories.
- `max_categories` does **not** take into account missing or unknown
- categories. Setting `unknown_value` or `encoded_missing_value` to an
- integer will increase the number of unique integer codes by one each.
- This can result in up to `max_categories + 2` integer codes.
- .. versionadded:: 1.3
- Read more in the :ref:`User Guide <encoder_infrequent_categories>`.
- Attributes
- ----------
- categories_ : list of arrays
- The categories of each feature determined during ``fit`` (in order of
- the features in X and corresponding with the output of ``transform``).
- This does not include categories that weren't seen during ``fit``.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 1.0
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- infrequent_categories_ : list of ndarray
- Defined only if infrequent categories are enabled by setting
- `min_frequency` or `max_categories` to a non-default value.
- `infrequent_categories_[i]` are the infrequent categories for feature
- `i`. If the feature `i` has no infrequent categories
- `infrequent_categories_[i]` is None.
- .. versionadded:: 1.3
- See Also
- --------
- OneHotEncoder : Performs a one-hot encoding of categorical features. This encoding
- is suitable for low to medium cardinality categorical variables, both in
- supervised and unsupervised settings.
- TargetEncoder : Encodes categorical features using supervised signal
- in a classification or regression pipeline. This encoding is typically
- suitable for high cardinality categorical variables.
- LabelEncoder : Encodes target labels with values between 0 and
- ``n_classes-1``.
- Notes
- -----
- With a high proportion of `nan` values, inferring categories becomes slow with
- Python versions before 3.10. The handling of `nan` values was improved
- from Python 3.10 onwards (cf.
- `bpo-43475 <https://github.com/python/cpython/issues/87641>`_).
- Examples
- --------
- Given a dataset with two features, we let the encoder find the unique
- values per feature and transform the data to an ordinal encoding.
- >>> from sklearn.preprocessing import OrdinalEncoder
- >>> enc = OrdinalEncoder()
- >>> X = [['Male', 1], ['Female', 3], ['Female', 2]]
- >>> enc.fit(X)
- OrdinalEncoder()
- >>> enc.categories_
- [array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]
- >>> enc.transform([['Female', 3], ['Male', 1]])
- array([[0., 2.],
- [1., 0.]])
- >>> enc.inverse_transform([[1, 0], [0, 1]])
- array([['Male', 1],
- ['Female', 2]], dtype=object)
- By default, :class:`OrdinalEncoder` is lenient towards missing values by
- propagating them.
- >>> import numpy as np
- >>> X = [['Male', 1], ['Female', 3], ['Female', np.nan]]
- >>> enc.fit_transform(X)
- array([[ 1., 0.],
- [ 0., 1.],
- [ 0., nan]])
- You can use the parameter `encoded_missing_value` to encode missing values.
- >>> enc.set_params(encoded_missing_value=-1).fit_transform(X)
- array([[ 1., 0.],
- [ 0., 1.],
- [ 0., -1.]])
- Infrequent categories are enabled by setting `max_categories` or `min_frequency`.
- In the following example, "a" and "d" are considered infrequent and grouped
- together into a single category, "b" and "c" are their own categories, unknown
- values are encoded as 3 and missing values are encoded as 4.
- >>> X_train = np.array(
- ... [["a"] * 5 + ["b"] * 20 + ["c"] * 10 + ["d"] * 3 + [np.nan]],
- ... dtype=object).T
- >>> enc = OrdinalEncoder(
- ... handle_unknown="use_encoded_value", unknown_value=3,
- ... max_categories=3, encoded_missing_value=4)
- >>> _ = enc.fit(X_train)
- >>> X_test = np.array([["a"], ["b"], ["c"], ["d"], ["e"], [np.nan]], dtype=object)
- >>> enc.transform(X_test)
- array([[2.],
- [0.],
- [1.],
- [2.],
- [3.],
- [4.]])
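- The grouped categories can be inspected via `infrequent_categories_`
- (continuing the fitted encoder above):
- >>> enc.infrequent_categories_
- [array(['a', 'd'], dtype=object)]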
- """
- _parameter_constraints: dict = {
- "categories": [StrOptions({"auto"}), list],
- "dtype": "no_validation", # validation delegated to numpy
- "encoded_missing_value": [Integral, type(np.nan)],
- "handle_unknown": [StrOptions({"error", "use_encoded_value"})],
- "unknown_value": [Integral, type(np.nan), None],
- "max_categories": [Interval(Integral, 1, None, closed="left"), None],
- "min_frequency": [
- Interval(Integral, 1, None, closed="left"),
- Interval(RealNotInt, 0, 1, closed="neither"),
- None,
- ],
- }
- def __init__(
- self,
- *,
- categories="auto",
- dtype=np.float64,
- handle_unknown="error",
- unknown_value=None,
- encoded_missing_value=np.nan,
- min_frequency=None,
- max_categories=None,
- ):
- self.categories = categories
- self.dtype = dtype
- self.handle_unknown = handle_unknown
- self.unknown_value = unknown_value
- self.encoded_missing_value = encoded_missing_value
- self.min_frequency = min_frequency
- self.max_categories = max_categories
- @_fit_context(prefer_skip_nested_validation=True)
- def fit(self, X, y=None):
- """
- Fit the OrdinalEncoder to X.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- The data to determine the categories of each feature.
- y : None
- Ignored. This parameter exists only for compatibility with
- :class:`~sklearn.pipeline.Pipeline`.
- Returns
- -------
- self : object
- Fitted encoder.
- """
- if self.handle_unknown == "use_encoded_value":
- if is_scalar_nan(self.unknown_value):
- if np.dtype(self.dtype).kind != "f":
- raise ValueError(
- "When unknown_value is np.nan, the dtype "
- "parameter should be "
- f"a float dtype. Got {self.dtype}."
- )
- elif not isinstance(self.unknown_value, numbers.Integral):
- raise TypeError(
- "unknown_value should be an integer or "
- "np.nan when "
- "handle_unknown is 'use_encoded_value', "
- f"got {self.unknown_value}."
- )
- elif self.unknown_value is not None:
- raise TypeError(
- "unknown_value should only be set when "
- "handle_unknown is 'use_encoded_value', "
- f"got {self.unknown_value}."
- )
- # `_fit` will only raise an error when `self.handle_unknown="error"`
- fit_results = self._fit(
- X,
- handle_unknown=self.handle_unknown,
- force_all_finite="allow-nan",
- return_and_ignore_missing_for_infrequent=True,
- )
- self._missing_indices = fit_results["missing_indices"]
- cardinalities = [len(categories) for categories in self.categories_]
- if self._infrequent_enabled:
- # Cardinality decreases because the infrequent categories are grouped
- # together
- for feature_idx, infrequent in enumerate(self.infrequent_categories_):
- if infrequent is not None:
- cardinalities[feature_idx] -= len(infrequent)
- # `self._missing_indices` was already populated from `fit_results`
- # above. Missing values are not considered part of the cardinality
- # when considering unknown categories or encoded_missing_value.
- for cat_idx in self._missing_indices:
- cardinalities[cat_idx] -= 1
- if self.handle_unknown == "use_encoded_value":
- for cardinality in cardinalities:
- if 0 <= self.unknown_value < cardinality:
- raise ValueError(
- "The used value for unknown_value "
- f"{self.unknown_value} is one of the "
- "values already used for encoding the "
- "seen categories."
- )
- if self._missing_indices:
- if np.dtype(self.dtype).kind != "f" and is_scalar_nan(
- self.encoded_missing_value
- ):
- raise ValueError(
- "There are missing values in features "
- f"{list(self._missing_indices)}. For OrdinalEncoder to "
- f"encode missing values with dtype: {self.dtype}, set "
- "encoded_missing_value to a non-nan value, or "
- "set dtype to a float"
- )
- if not is_scalar_nan(self.encoded_missing_value):
- # Features are invalid when they contain a missing category
- # and encoded_missing_value was already used to encode a
- # known category
- invalid_features = [
- cat_idx
- for cat_idx, cardinality in enumerate(cardinalities)
- if cat_idx in self._missing_indices
- and 0 <= self.encoded_missing_value < cardinality
- ]
- if invalid_features:
- # Use feature names if they are available
- if hasattr(self, "feature_names_in_"):
- invalid_features = self.feature_names_in_[invalid_features]
- raise ValueError(
- f"encoded_missing_value ({self.encoded_missing_value}) "
- "is already used to encode a known category in features: "
- f"{invalid_features}"
- )
- return self
- def transform(self, X):
- """
- Transform X to ordinal codes.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- The data to encode.
- Returns
- -------
- X_out : ndarray of shape (n_samples, n_features)
- Transformed input.
- """
- X_int, X_mask = self._transform(
- X,
- handle_unknown=self.handle_unknown,
- force_all_finite="allow-nan",
- ignore_category_indices=self._missing_indices,
- )
- X_trans = X_int.astype(self.dtype, copy=False)
- for cat_idx, missing_idx in self._missing_indices.items():
- X_missing_mask = X_int[:, cat_idx] == missing_idx
- X_trans[X_missing_mask, cat_idx] = self.encoded_missing_value
- # create separate category for unknown values
- if self.handle_unknown == "use_encoded_value":
- X_trans[~X_mask] = self.unknown_value
- return X_trans
- def inverse_transform(self, X):
- """
- Convert the data back to the original representation.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_encoded_features)
- The transformed data.
- Returns
- -------
- X_tr : ndarray of shape (n_samples, n_features)
- Inverse transformed array.
- """
- check_is_fitted(self)
- X = check_array(X, force_all_finite="allow-nan")
- n_samples, _ = X.shape
- n_features = len(self.categories_)
- # validate shape of passed X
- msg = (
- "Shape of the passed X data is not correct. Expected {0} columns, got {1}."
- )
- if X.shape[1] != n_features:
- raise ValueError(msg.format(n_features, X.shape[1]))
- # create resulting array of appropriate dtype
- dt = np.result_type(*[cat.dtype for cat in self.categories_])
- X_tr = np.empty((n_samples, n_features), dtype=dt)
- found_unknown = {}
- infrequent_masks = {}
- infrequent_indices = getattr(self, "_infrequent_indices", None)
- for i in range(n_features):
- labels = X[:, i].copy()  # copy: `labels` may be mutated below
- # replace values encoded as missing with their actual category indices
- if i in self._missing_indices:
- X_i_mask = _get_mask(labels, self.encoded_missing_value)
- labels[X_i_mask] = self._missing_indices[i]
- rows_to_update = slice(None)
- categories = self.categories_[i]
- if infrequent_indices is not None and infrequent_indices[i] is not None:
- # Compute mask for frequent categories
- infrequent_encoding_value = len(categories) - len(infrequent_indices[i])
- infrequent_masks[i] = labels == infrequent_encoding_value
- rows_to_update = ~infrequent_masks[i]
- # Remap categories to be only frequent categories. The infrequent
- # categories will be mapped to "infrequent_sklearn" later
- frequent_categories_mask = np.ones_like(categories, dtype=bool)
- frequent_categories_mask[infrequent_indices[i]] = False
- categories = categories[frequent_categories_mask]
- if self.handle_unknown == "use_encoded_value":
- unknown_labels = _get_mask(labels, self.unknown_value)
- found_unknown[i] = unknown_labels
- known_labels = ~unknown_labels
- if isinstance(rows_to_update, np.ndarray):
- rows_to_update &= known_labels
- else:
- rows_to_update = known_labels
- labels_int = labels[rows_to_update].astype("int64", copy=False)
- X_tr[rows_to_update, i] = categories[labels_int]
- if found_unknown or infrequent_masks:
- X_tr = X_tr.astype(object, copy=False)
- # insert None values for unknown values
- if found_unknown:
- for idx, mask in found_unknown.items():
- X_tr[mask, idx] = None
- if infrequent_masks:
- for idx, mask in infrequent_masks.items():
- X_tr[mask, idx] = "infrequent_sklearn"
- return X_tr
|