# Author: Henry Lin <hlin117@gmail.com>
#         Tom Dupré la Tour
# License: BSD

import warnings
from numbers import Integral

import numpy as np

from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import _safe_indexing
from ..utils._param_validation import Hidden, Interval, Options, StrOptions
from ..utils.stats import _weighted_percentile
from ..utils.validation import (
    _check_feature_names_in,
    _check_sample_weight,
    check_array,
    check_is_fitted,
    check_random_state,
)
from ._encoders import OneHotEncoder


class KBinsDiscretizer(TransformerMixin, BaseEstimator):
    """
    Bin continuous data into intervals.

    Read more in the :ref:`User Guide <preprocessing_discretization>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    n_bins : int or array-like of shape (n_features,), default=5
        The number of bins to produce. Raises ValueError if ``n_bins < 2``.

    encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
        Method used to encode the transformed result.

        - 'onehot': Encode the transformed result with one-hot encoding
          and return a sparse matrix. Ignored features are always
          stacked to the right.
        - 'onehot-dense': Encode the transformed result with one-hot encoding
          and return a dense array. Ignored features are always
          stacked to the right.
        - 'ordinal': Return the bin identifier encoded as an integer value.

    strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins.

        - 'uniform': All bins in each feature have identical widths.
        - 'quantile': All bins in each feature have the same number of points.
        - 'kmeans': Values in each bin have the same nearest center of a 1D
          k-means cluster.

        For an example of the different strategies see:
        :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`.

    dtype : {np.float32, np.float64}, default=None
        The desired data-type for the output. If None, output dtype is
        consistent with input dtype. Only np.float32 and np.float64 are
        supported.

        .. versionadded:: 0.24
    subsample : int or None, default='warn'
        Maximum number of samples used to fit the model, for computational
        efficiency. Defaults to 200_000 when `strategy='quantile'` and to `None`
        when `strategy='uniform'` or `strategy='kmeans'`.
        `subsample=None` means that all the training samples are used when
        computing the quantiles that determine the binning thresholds.
        Since quantile computation relies on sorting each column of `X` and
        that sorting has an `n log(n)` time complexity, it is recommended
        to use subsampling on datasets with a very large number of samples.
        .. versionchanged:: 1.3
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="quantile"`.

        .. versionchanged:: 1.5
            The default value of `subsample` changed from `None` to `200_000` when
            `strategy="uniform"` or `strategy="kmeans"`.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for subsampling.
        Pass an int for reproducible results across multiple function calls.
        See the `subsample` parameter for more details.
        See :term:`Glossary <random_state>`.

        .. versionadded:: 1.1

    Attributes
    ----------
    bin_edges_ : ndarray of ndarray of shape (n_features,)
        The edges of each bin. Contains arrays of varying shapes
        ``(n_bins_[i] + 1,)``. Ignored features will have empty arrays.
    n_bins_ : ndarray of shape (n_features,), dtype=np.int_
        Number of bins per feature. Bins whose width are too small
        (i.e., <= 1e-8) are removed with a warning.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Binarizer : Class used to bin values as ``0`` or
        ``1`` based on a parameter ``threshold``.

    Notes
    -----
    For a visualization of discretization on different datasets refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`.
    On the effect of discretization on linear models see:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`.

    In bin edges for feature ``i``, the first and last values are used only for
    ``inverse_transform``. During transform, bin edges are extended to::

      np.concatenate([-np.inf, bin_edges_[i][1:-1], np.inf])

    You can combine ``KBinsDiscretizer`` with
    :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
    part of the features.

    ``KBinsDiscretizer`` might produce constant features (e.g., when
    ``encode = 'onehot'`` and certain bins do not contain any data).
    These features can be removed with feature selection algorithms
    (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).

    Examples
    --------
    >>> from sklearn.preprocessing import KBinsDiscretizer
    >>> X = [[-2, 1, -4,   -1],
    ...      [-1, 2, -3, -0.5],
    ...      [ 0, 3, -2,  0.5],
    ...      [ 1, 4, -1,    2]]
    >>> est = KBinsDiscretizer(
    ...     n_bins=3, encode='ordinal', strategy='uniform', subsample=None
    ... )
    >>> est.fit(X)
    KBinsDiscretizer(...)
    >>> Xt = est.transform(X)
    >>> Xt  # doctest: +SKIP
    array([[ 0., 0., 0., 0.],
           [ 1., 1., 1., 0.],
           [ 2., 2., 2., 1.],
           [ 2., 2., 2., 2.]])

    Sometimes it may be useful to convert the data back into the original
    feature space. The ``inverse_transform`` function converts the binned
    data into the original feature space. Each value will be equal to the mean
    of the two bin edges.

    >>> est.bin_edges_[0]
    array([-2., -1.,  0.,  1.])
    >>> est.inverse_transform(Xt)
    array([[-1.5,  1.5, -3.5, -0.5],
           [-0.5,  2.5, -2.5, -0.5],
           [ 0.5,  3.5, -1.5,  0.5],
           [ 0.5,  3.5, -1.5,  1.5]])
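
    With ``encode='onehot-dense'``, the same bins expand into one indicator
    column per bin. As an illustrative sketch (marked ``doctest: +SKIP``
    since exact output formatting may vary):

    >>> est_dense = KBinsDiscretizer(
    ...     n_bins=3, encode='onehot-dense', strategy='uniform', subsample=None
    ... )
    >>> est_dense.fit_transform(X)  # doctest: +SKIP
    array([[1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0.],
           [0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0.],
           [0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 1., 0.],
           [0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0., 1.]])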
  136. """
    _parameter_constraints: dict = {
        "n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"],
        "encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})],
        "strategy": [StrOptions({"uniform", "quantile", "kmeans"})],
        "dtype": [Options(type, {np.float64, np.float32}), None],
        "subsample": [
            Interval(Integral, 1, None, closed="left"),
            None,
            Hidden(StrOptions({"warn"})),
        ],
        "random_state": ["random_state"],
    }
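
    # The hidden "warn" value for `subsample` is not part of the public API;
    # it is a deprecation sentinel that triggers the FutureWarning in `fit`.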

    def __init__(
        self,
        n_bins=5,
        *,
        encode="onehot",
        strategy="quantile",
        dtype=None,
        subsample="warn",
        random_state=None,
    ):
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.dtype = dtype
        self.subsample = subsample
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """
        Fit the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        sample_weight : ndarray of shape (n_samples,)
            Contains weight values to be associated with each sample.
            Cannot be used when `strategy` is set to `"uniform"`.

            .. versionadded:: 1.3

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X = self._validate_data(X, dtype="numeric")

        if self.dtype in (np.float64, np.float32):
            output_dtype = self.dtype
        else:  # self.dtype is None
            output_dtype = X.dtype

        n_samples, n_features = X.shape

        if sample_weight is not None and self.strategy == "uniform":
            raise ValueError(
                "`sample_weight` was provided but it cannot be "
                "used with strategy='uniform'. Got strategy="
                f"{self.strategy!r} instead."
            )

        if self.strategy in ("uniform", "kmeans") and self.subsample == "warn":
            warnings.warn(
                (
                    "In version 1.5 onwards, subsample=200_000 "
                    "will be used by default. Set subsample explicitly to "
                    "silence this warning in the meantime. Set "
                    "subsample=None to disable subsampling explicitly."
                ),
                FutureWarning,
            )
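        # Resolve the deprecation sentinel: 'quantile' keeps its
        # 200_000-sample default while the other strategies still
        # use all samples.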
        subsample = self.subsample
        if subsample == "warn":
            subsample = 200000 if self.strategy == "quantile" else None

        if subsample is not None and n_samples > subsample:
            rng = check_random_state(self.random_state)
            subsample_idx = rng.choice(n_samples, size=subsample, replace=False)
            X = _safe_indexing(X, subsample_idx)

        n_features = X.shape[1]
        n_bins = self._validate_n_bins(n_features)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        bin_edges = np.zeros(n_features, dtype=object)
        for jj in range(n_features):
            column = X[:, jj]
            col_min, col_max = column.min(), column.max()

            if col_min == col_max:
                warnings.warn(
                    "Feature %d is constant and will be replaced with 0." % jj
                )
                n_bins[jj] = 1
                bin_edges[jj] = np.array([-np.inf, np.inf])
                continue

            if self.strategy == "uniform":
                bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)

            elif self.strategy == "quantile":
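                # Place edges at evenly spaced quantiles of the column; honor
                # sample_weight via the internal weighted percentile helper.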
                quantiles = np.linspace(0, 100, n_bins[jj] + 1)
                if sample_weight is None:
                    bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
                else:
                    bin_edges[jj] = np.asarray(
                        [
                            _weighted_percentile(column, sample_weight, q)
                            for q in quantiles
                        ],
                        dtype=np.float64,
                    )

            elif self.strategy == "kmeans":
                from ..cluster import KMeans  # fixes import loops

                # Deterministic initialization with uniform spacing
                uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
                init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5

                # 1D k-means procedure
                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
                centers = km.fit(
                    column[:, None], sample_weight=sample_weight
                ).cluster_centers_[:, 0]
                # Must sort, centers may be unsorted even with sorted init
                centers.sort()
                bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
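                # Interior edges are midpoints between adjacent centers; the
                # data range supplies the two outer edges.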
                bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]

            # Remove bins whose width are too small (i.e., <= 1e-8)
            if self.strategy in ("quantile", "kmeans"):
                mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
                bin_edges[jj] = bin_edges[jj][mask]
                if len(bin_edges[jj]) - 1 != n_bins[jj]:
                    warnings.warn(
                        "Bins whose width are too small (i.e., <= "
                        "1e-8) in feature %d are removed. Consider "
                        "decreasing the number of bins." % jj
                    )
                    n_bins[jj] = len(bin_edges[jj]) - 1

        self.bin_edges_ = bin_edges
        self.n_bins_ = n_bins

        if "onehot" in self.encode:
            self._encoder = OneHotEncoder(
                categories=[np.arange(i) for i in self.n_bins_],
                sparse_output=self.encode == "onehot",
                dtype=output_dtype,
            )
            # Fit the OneHotEncoder with toy datasets
            # so that it's ready for use after the KBinsDiscretizer is fitted
            self._encoder.fit(np.zeros((1, len(self.n_bins_))))

        return self

    def _validate_n_bins(self, n_features):
        """Returns n_bins_, the number of bins per feature."""
        orig_bins = self.n_bins
        if isinstance(orig_bins, Integral):
            return np.full(n_features, orig_bins, dtype=int)

        n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)

        if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
            raise ValueError("n_bins must be a scalar or array of shape (n_features,).")
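        # The `n_bins != orig_bins` comparison flags entries that were
        # truncated by the integer cast above, i.e. non-integral values.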
        bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)

        violating_indices = np.where(bad_nbins_value)[0]
        if violating_indices.shape[0] > 0:
            indices = ", ".join(str(i) for i in violating_indices)
            raise ValueError(
                "{} received an invalid number "
                "of bins at indices {}. Number of bins "
                "must be at least 2, and must be an int.".format(
                    KBinsDiscretizer.__name__, indices
                )
            )
        return n_bins

    def transform(self, X):
        """
        Discretize the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        Returns
        -------
        Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
            Data in the binned space. Will be a sparse matrix if
            `self.encode='onehot'` and ndarray otherwise.
        """
        check_is_fitted(self)

        # check input and attribute dtypes
        dtype = (np.float64, np.float32) if self.dtype is None else self.dtype
        Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False)

        bin_edges = self.bin_edges_
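        # Only the interior edges are used here, so values below (above) the
        # fitted range fall into the first (last) bin; see the class Notes.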
  318. for jj in range(Xt.shape[1]):
  319. Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right")
  320. if self.encode == "ordinal":
  321. return Xt
  322. dtype_init = None
  323. if "onehot" in self.encode:
  324. dtype_init = self._encoder.dtype
  325. self._encoder.dtype = Xt.dtype
  326. try:
  327. Xt_enc = self._encoder.transform(Xt)
  328. finally:
  329. # revert the initial dtype to avoid modifying self.
  330. self._encoder.dtype = dtype_init
  331. return Xt_enc

    def inverse_transform(self, Xt):
        """
        Transform discretized data back to original feature space.

        Note that this function does not regenerate the original data
        due to discretization rounding.

        Parameters
        ----------
        Xt : array-like of shape (n_samples, n_features)
            Transformed data in the binned space.

        Returns
        -------
        Xinv : ndarray, dtype={np.float32, np.float64}
            Data in the original feature space.
        """
        check_is_fitted(self)

        if "onehot" in self.encode:
            Xt = self._encoder.inverse_transform(Xt)

        Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32))
        n_features = self.n_bins_.shape[0]
        if Xinv.shape[1] != n_features:
            raise ValueError(
                "Incorrect number of features. Expecting {}, received {}.".format(
                    n_features, Xinv.shape[1]
                )
            )
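        # Replace each ordinal bin id with the midpoint of its bin's edges.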
        for jj in range(n_features):
            bin_edges = self.bin_edges_[jj]
            bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5
            Xinv[:, jj] = bin_centers[np.int_(Xinv[:, jj])]

        return Xinv

    def get_feature_names_out(self, input_features=None):
        """Get output feature names.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self, "n_features_in_")
        input_features = _check_feature_names_in(self, input_features)
        if hasattr(self, "_encoder"):
            return self._encoder.get_feature_names_out(input_features)

        # ordinal encoding
        return input_features