_target_encoder.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361
  1. from numbers import Integral, Real
  2. import numpy as np
  3. from ..base import OneToOneFeatureMixin, _fit_context
  4. from ..utils._param_validation import Interval, StrOptions
  5. from ..utils.multiclass import type_of_target
  6. from ..utils.validation import _check_y, check_consistent_length
  7. from ._encoders import _BaseEncoder
  8. from ._target_encoder_fast import _fit_encoding_fast, _fit_encoding_fast_auto_smooth
  9. class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
  10. """Target Encoder for regression and classification targets.
  11. Each category is encoded based on a shrunk estimate of the average target
  12. values for observations belonging to the category. The encoding scheme mixes
  13. the global target mean with the target mean conditioned on the value of the
  14. category. [MIC]_
  15. :class:`TargetEncoder` considers missing values, such as `np.nan` or `None`,
  16. as another category and encodes them like any other category. Categories
  17. that are not seen during :meth:`fit` are encoded with the target mean, i.e.
  18. `target_mean_`.
  19. For a demo on the importance of the `TargetEncoder` internal cross-fitting,
  20. see
  21. ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder_cross_val.py`.
  22. For a comparison of different encoders, refer to
  23. :ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. Read
  24. more in the :ref:`User Guide <target_encoder>`.
  25. .. note::
  26. `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
  27. :term:`cross fitting` scheme is used in `fit_transform` for encoding.
  28. See the :ref:`User Guide <target_encoder>` for details.
  29. .. versionadded:: 1.3
  30. Parameters
  31. ----------
  32. categories : "auto" or list of shape (n_features,) of array-like, default="auto"
  33. Categories (unique values) per feature:
  34. - `"auto"` : Determine categories automatically from the training data.
  35. - list : `categories[i]` holds the categories expected in the i-th column. The
  36. passed categories should not mix strings and numeric values within a single
  37. feature, and should be sorted in case of numeric values.
  38. The used categories are stored in the `categories_` fitted attribute.
  39. target_type : {"auto", "continuous", "binary"}, default="auto"
  40. Type of target.
  41. - `"auto"` : Type of target is inferred with
  42. :func:`~sklearn.utils.multiclass.type_of_target`.
  43. - `"continuous"` : Continuous target
  44. - `"binary"` : Binary target
  45. .. note::
  46. The type of target inferred with `"auto"` may not be the desired target
  47. type used for modeling. For example, if the target consisted of integers
  48. between 0 and 100, then :func:`~sklearn.utils.multiclass.type_of_target`
  49. will infer the target as `"multiclass"`. In this case, setting
  50. `target_type="continuous"` will specify the target as a regression
  51. problem. The `target_type_` attribute gives the target type used by the
  52. encoder.
  53. smooth : "auto" or float, default="auto"
  54. The amount of mixing of the target mean conditioned on the value of the
  55. category with the global target mean. A larger `smooth` value will put
  56. more weight on the global target mean.
  57. If `"auto"`, then `smooth` is set to an empirical Bayes estimate.
  58. cv : int, default=5
  59. Determines the number of folds in the :term:`cross fitting` strategy used in
  60. :meth:`fit_transform`. For classification targets, `StratifiedKFold` is used
  61. and for continuous targets, `KFold` is used.
  62. shuffle : bool, default=True
  63. Whether to shuffle the data in :meth:`fit_transform` before splitting into
  64. folds. Note that the samples within each split will not be shuffled.
  65. random_state : int, RandomState instance or None, default=None
  66. When `shuffle` is True, `random_state` affects the ordering of the
  67. indices, which controls the randomness of each fold. Otherwise, this
  68. parameter has no effect.
  69. Pass an int for reproducible output across multiple function calls.
  70. See :term:`Glossary <random_state>`.
  71. Attributes
  72. ----------
  73. encodings_ : list of shape (n_features,) of ndarray
  74. Encodings learnt on all of `X`.
  75. For feature `i`, `encodings_[i]` are the encodings matching the
  76. categories listed in `categories_[i]`.
  77. categories_ : list of shape (n_features,) of ndarray
  78. The categories of each feature determined during fitting or specified
  79. in `categories`
  80. (in order of the features in `X` and corresponding with the output
  81. of :meth:`transform`).
  82. target_type_ : str
  83. Type of target.
  84. target_mean_ : float
  85. The overall mean of the target. This value is only used in :meth:`transform`
  86. to encode categories.
  87. n_features_in_ : int
  88. Number of features seen during :term:`fit`.
  89. feature_names_in_ : ndarray of shape (`n_features_in_`,)
  90. Names of features seen during :term:`fit`. Defined only when `X`
  91. has feature names that are all strings.
  92. See Also
  93. --------
  94. OrdinalEncoder : Performs an ordinal (integer) encoding of the categorical features.
  95. Contrary to TargetEncoder, this encoding is not supervised. Treating the
  96. resulting encoding as a numerical features therefore lead arbitrarily
  97. ordered values and therefore typically lead to lower predictive performance
  98. when used as preprocessing for a classifier or regressor.
  99. OneHotEncoder : Performs a one-hot encoding of categorical features. This
  100. unsupervised encoding is better suited for low cardinality categorical
  101. variables as it generate one new feature per unique category.
  102. References
  103. ----------
  104. .. [MIC] :doi:`Micci-Barreca, Daniele. "A preprocessing scheme for high-cardinality
  105. categorical attributes in classification and prediction problems"
  106. SIGKDD Explor. Newsl. 3, 1 (July 2001), 27–32. <10.1145/507533.507538>`
  107. Examples
  108. --------
  109. With `smooth="auto"`, the smoothing parameter is set to an empirical Bayes estimate:
  110. >>> import numpy as np
  111. >>> from sklearn.preprocessing import TargetEncoder
  112. >>> X = np.array([["dog"] * 20 + ["cat"] * 30 + ["snake"] * 38], dtype=object).T
  113. >>> y = [90.3] * 5 + [80.1] * 15 + [20.4] * 5 + [20.1] * 25 + [21.2] * 8 + [49] * 30
  114. >>> enc_auto = TargetEncoder(smooth="auto")
  115. >>> X_trans = enc_auto.fit_transform(X, y)
  116. >>> # A high `smooth` parameter puts more weight on global mean on the categorical
  117. >>> # encodings:
  118. >>> enc_high_smooth = TargetEncoder(smooth=5000.0).fit(X, y)
  119. >>> enc_high_smooth.target_mean_
  120. 44...
  121. >>> enc_high_smooth.encodings_
  122. [array([44..., 44..., 44...])]
  123. >>> # On the other hand, a low `smooth` parameter puts more weight on target
  124. >>> # conditioned on the value of the categorical:
  125. >>> enc_low_smooth = TargetEncoder(smooth=1.0).fit(X, y)
  126. >>> enc_low_smooth.encodings_
  127. [array([20..., 80..., 43...])]
  128. """
  129. _parameter_constraints: dict = {
  130. "categories": [StrOptions({"auto"}), list],
  131. "target_type": [StrOptions({"auto", "continuous", "binary"})],
  132. "smooth": [StrOptions({"auto"}), Interval(Real, 0, None, closed="left")],
  133. "cv": [Interval(Integral, 2, None, closed="left")],
  134. "shuffle": ["boolean"],
  135. "random_state": ["random_state"],
  136. }
  137. def __init__(
  138. self,
  139. categories="auto",
  140. target_type="auto",
  141. smooth="auto",
  142. cv=5,
  143. shuffle=True,
  144. random_state=None,
  145. ):
  146. self.categories = categories
  147. self.smooth = smooth
  148. self.target_type = target_type
  149. self.cv = cv
  150. self.shuffle = shuffle
  151. self.random_state = random_state
  152. @_fit_context(prefer_skip_nested_validation=True)
  153. def fit(self, X, y):
  154. """Fit the :class:`TargetEncoder` to X and y.
  155. Parameters
  156. ----------
  157. X : array-like of shape (n_samples, n_features)
  158. The data to determine the categories of each feature.
  159. y : array-like of shape (n_samples,)
  160. The target data used to encode the categories.
  161. Returns
  162. -------
  163. self : object
  164. Fitted encoder.
  165. """
  166. self._fit_encodings_all(X, y)
  167. return self
  168. @_fit_context(prefer_skip_nested_validation=True)
  169. def fit_transform(self, X, y):
  170. """Fit :class:`TargetEncoder` and transform X with the target encoding.
  171. .. note::
  172. `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
  173. :term:`cross fitting` scheme is used in `fit_transform` for encoding.
  174. See the :ref:`User Guide <target_encoder>`. for details.
  175. Parameters
  176. ----------
  177. X : array-like of shape (n_samples, n_features)
  178. The data to determine the categories of each feature.
  179. y : array-like of shape (n_samples,)
  180. The target data used to encode the categories.
  181. Returns
  182. -------
  183. X_trans : ndarray of shape (n_samples, n_features)
  184. Transformed input.
  185. """
  186. from ..model_selection import KFold, StratifiedKFold # avoid circular import
  187. X_ordinal, X_known_mask, y, n_categories = self._fit_encodings_all(X, y)
  188. # The cv splitter is voluntarily restricted to *KFold to enforce non
  189. # overlapping validation folds, otherwise the fit_transform output will
  190. # not be well-specified.
  191. if self.target_type_ == "continuous":
  192. cv = KFold(self.cv, shuffle=self.shuffle, random_state=self.random_state)
  193. else:
  194. cv = StratifiedKFold(
  195. self.cv, shuffle=self.shuffle, random_state=self.random_state
  196. )
  197. X_out = np.empty_like(X_ordinal, dtype=np.float64)
  198. X_unknown_mask = ~X_known_mask
  199. for train_idx, test_idx in cv.split(X, y):
  200. X_train, y_train = X_ordinal[train_idx, :], y[train_idx]
  201. y_mean = np.mean(y_train)
  202. if self.smooth == "auto":
  203. y_variance = np.var(y_train)
  204. encodings = _fit_encoding_fast_auto_smooth(
  205. X_train, y_train, n_categories, y_mean, y_variance
  206. )
  207. else:
  208. encodings = _fit_encoding_fast(
  209. X_train, y_train, n_categories, self.smooth, y_mean
  210. )
  211. self._transform_X_ordinal(
  212. X_out, X_ordinal, X_unknown_mask, test_idx, encodings, y_mean
  213. )
  214. return X_out
  215. def transform(self, X):
  216. """Transform X with the target encoding.
  217. .. note::
  218. `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
  219. :term:`cross fitting` scheme is used in `fit_transform` for encoding.
  220. See the :ref:`User Guide <target_encoder>`. for details.
  221. Parameters
  222. ----------
  223. X : array-like of shape (n_samples, n_features)
  224. The data to determine the categories of each feature.
  225. Returns
  226. -------
  227. X_trans : ndarray of shape (n_samples, n_features)
  228. Transformed input.
  229. """
  230. X_ordinal, X_known_mask = self._transform(
  231. X, handle_unknown="ignore", force_all_finite="allow-nan"
  232. )
  233. X_out = np.empty_like(X_ordinal, dtype=np.float64)
  234. self._transform_X_ordinal(
  235. X_out,
  236. X_ordinal,
  237. ~X_known_mask,
  238. slice(None),
  239. self.encodings_,
  240. self.target_mean_,
  241. )
  242. return X_out
  243. def _fit_encodings_all(self, X, y):
  244. """Fit a target encoding with all the data."""
  245. from ..preprocessing import LabelEncoder # avoid circular import
  246. check_consistent_length(X, y)
  247. self._fit(X, handle_unknown="ignore", force_all_finite="allow-nan")
  248. if self.target_type == "auto":
  249. accepted_target_types = ("binary", "continuous")
  250. inferred_type_of_target = type_of_target(y, input_name="y")
  251. if inferred_type_of_target not in accepted_target_types:
  252. raise ValueError(
  253. "Unknown label type: Target type was inferred to be "
  254. f"{inferred_type_of_target!r}. Only {accepted_target_types} are "
  255. "supported."
  256. )
  257. self.target_type_ = inferred_type_of_target
  258. else:
  259. self.target_type_ = self.target_type
  260. if self.target_type_ == "binary":
  261. y = LabelEncoder().fit_transform(y)
  262. else: # continuous
  263. y = _check_y(y, y_numeric=True, estimator=self)
  264. self.target_mean_ = np.mean(y)
  265. X_ordinal, X_known_mask = self._transform(
  266. X, handle_unknown="ignore", force_all_finite="allow-nan"
  267. )
  268. n_categories = np.fromiter(
  269. (len(category_for_feature) for category_for_feature in self.categories_),
  270. dtype=np.int64,
  271. count=len(self.categories_),
  272. )
  273. if self.smooth == "auto":
  274. y_variance = np.var(y)
  275. self.encodings_ = _fit_encoding_fast_auto_smooth(
  276. X_ordinal, y, n_categories, self.target_mean_, y_variance
  277. )
  278. else:
  279. self.encodings_ = _fit_encoding_fast(
  280. X_ordinal, y, n_categories, self.smooth, self.target_mean_
  281. )
  282. return X_ordinal, X_known_mask, y, n_categories
  283. @staticmethod
  284. def _transform_X_ordinal(
  285. X_out, X_ordinal, X_unknown_mask, indices, encodings, y_mean
  286. ):
  287. """Transform X_ordinal using encodings."""
  288. for f_idx, encoding in enumerate(encodings):
  289. X_out[indices, f_idx] = encoding[X_ordinal[indices, f_idx]]
  290. X_out[X_unknown_mask[:, f_idx], f_idx] = y_mean
  291. def _more_tags(self):
  292. return {
  293. "requires_y": True,
  294. # TargetEncoder is a special case where a transformer uses `y` but
  295. # only accept binary classification and regression targets. For the
  296. # purpose of common tests we use `binary_only` tag to eliminate the
  297. # multiclass tests. TODO: remove this special case when multiclass
  298. # support is added to TargetEncoder. xref:
  299. # https://github.com/scikit-learn/scikit-learn/pull/26674
  300. "binary_only": True,
  301. }