# Author: Henry Lin <hlin117@gmail.com>
#         Tom Dupré la Tour
# License: BSD

import warnings
from numbers import Integral

import numpy as np

from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import _safe_indexing
from ..utils._param_validation import Hidden, Interval, Options, StrOptions
from ..utils.stats import _weighted_percentile
from ..utils.validation import (
    _check_feature_names_in,
    _check_sample_weight,
    check_array,
    check_is_fitted,
    check_random_state,
)
from ._encoders import OneHotEncoder


class KBinsDiscretizer(TransformerMixin, BaseEstimator):
    """
    Bin continuous data into intervals.

    Read more in the :ref:`User Guide <preprocessing_discretization>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    n_bins : int or array-like of shape (n_features,), default=5
        The number of bins to produce. Raises ValueError if ``n_bins < 2``.

    encode : {'onehot', 'onehot-dense', 'ordinal'}, default='onehot'
        Method used to encode the transformed result.

        - 'onehot': Encode the transformed result with one-hot encoding
          and return a sparse matrix. Ignored features are always
          stacked to the right.
        - 'onehot-dense': Encode the transformed result with one-hot encoding
          and return a dense array. Ignored features are always
          stacked to the right.
        - 'ordinal': Return the bin identifier encoded as an integer value.

    strategy : {'uniform', 'quantile', 'kmeans'}, default='quantile'
        Strategy used to define the widths of the bins.

        - 'uniform': All bins in each feature have identical widths.
        - 'quantile': All bins in each feature have the same number of points.
        - 'kmeans': Values in each bin have the same nearest center of a 1D
          k-means cluster.

        For an example of the different strategies see:
        :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`.

    dtype : {np.float32, np.float64}, default=None
        The desired data-type for the output. If None, output dtype is
        consistent with input dtype. Only np.float32 and np.float64 are
        supported.

        .. versionadded:: 0.24

    subsample : int or None, default='warn'
        Maximum number of samples, used to fit the model, for computational
        efficiency. Defaults to 200_000 when `strategy='quantile'` and to `None`
        when `strategy='uniform'` or `strategy='kmeans'`.
        `subsample=None` means that all the training samples are used when
        computing the quantiles that determine the binning thresholds.
        Since quantile computation relies on sorting each column of `X`, and
        sorting has an `n log(n)` time complexity, it is recommended to use
        subsampling on datasets with a very large number of samples.

        .. versionchanged:: 1.3
            The default value of `subsample` changed from `None` to `200_000`
            when `strategy="quantile"`.

        .. versionchanged:: 1.5
            The default value of `subsample` changed from `None` to `200_000`
            when `strategy="uniform"` or `strategy="kmeans"`.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for subsampling.
        Pass an int for reproducible results across multiple function calls.
        See the `subsample` parameter for more details.
        See :term:`Glossary <random_state>`.

        .. versionadded:: 1.1

    Attributes
    ----------
    bin_edges_ : ndarray of ndarray of shape (n_features,)
        The edges of each bin. Contains arrays of varying shapes
        ``(n_bins_[i] + 1,)``. Ignored features will have empty arrays.

    n_bins_ : ndarray of shape (n_features,), dtype=np.int_
        Number of bins per feature. Bins whose width is too small
        (i.e., <= 1e-8) are removed with a warning.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Binarizer : Class used to bin values as ``0`` or
        ``1`` based on a parameter ``threshold``.

    Notes
    -----
    For a visualization of discretization on different datasets refer to
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`.
    On the effect of discretization on linear models see:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`.

    In bin edges for feature ``i``, the first and last values are used only for
    ``inverse_transform``. During transform, bin edges are extended to::

      np.concatenate([[-np.inf], bin_edges_[i][1:-1], [np.inf]])
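
    For instance, with edges ``[-2., -1., 0., 1.]`` only the inner edges
    ``[-1., 0.]`` take part in binning, and a value equal to an inner edge
    is assigned to the bin on its right. A small sketch of the rule used by
    ``transform``::

      np.searchsorted([-1., 0.], [-2., -1., 0.5], side='right')
      # -> array([0, 1, 2]): bin indices for the values -2., -1. and 0.5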

    You can combine ``KBinsDiscretizer`` with
    :class:`~sklearn.compose.ColumnTransformer` if you only want to preprocess
    part of the features.

    ``KBinsDiscretizer`` might produce constant features (e.g., when
    ``encode = 'onehot'`` and certain bins do not contain any data).
    These features can be removed with feature selection algorithms
    (e.g., :class:`~sklearn.feature_selection.VarianceThreshold`).

    Examples
    --------
    >>> from sklearn.preprocessing import KBinsDiscretizer
    >>> X = [[-2, 1, -4,   -1],
    ...      [-1, 2, -3, -0.5],
    ...      [ 0, 3, -2,  0.5],
    ...      [ 1, 4, -1,    2]]
    >>> est = KBinsDiscretizer(
    ...     n_bins=3, encode='ordinal', strategy='uniform', subsample=None
    ... )
    >>> est.fit(X)
    KBinsDiscretizer(...)
    >>> Xt = est.transform(X)
    >>> Xt  # doctest: +SKIP
    array([[ 0., 0., 0., 0.],
           [ 1., 1., 1., 0.],
           [ 2., 2., 2., 1.],
           [ 2., 2., 2., 2.]])

    Sometimes it may be useful to convert the data back into the original
    feature space. The ``inverse_transform`` function converts the binned
    data into the original feature space. Each value will be equal to the mean
    of the two bin edges.

    >>> est.bin_edges_[0]
    array([-2., -1., 0., 1.])
    >>> est.inverse_transform(Xt)
    array([[-1.5,  1.5, -3.5, -0.5],
           [-0.5,  2.5, -2.5, -0.5],
           [ 0.5,  3.5, -1.5,  0.5],
           [ 0.5,  3.5, -1.5,  1.5]])
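
    One-hot encodings expand each feature into one column per bin. A brief
    sketch with a dense output, reusing `X` from above:

    >>> est_dense = KBinsDiscretizer(
    ...     n_bins=3, encode='onehot-dense', strategy='uniform', subsample=None
    ... ).fit(X)
    >>> est_dense.transform([[-2, 1, -4, -1]])
    array([[1., 0., 0., 1., 0., 0., 1., 0., 0., 1., 0., 0.]])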
- """
- _parameter_constraints: dict = {
- "n_bins": [Interval(Integral, 2, None, closed="left"), "array-like"],
- "encode": [StrOptions({"onehot", "onehot-dense", "ordinal"})],
- "strategy": [StrOptions({"uniform", "quantile", "kmeans"})],
- "dtype": [Options(type, {np.float64, np.float32}), None],
- "subsample": [
- Interval(Integral, 1, None, closed="left"),
- None,
- Hidden(StrOptions({"warn"})),
- ],
- "random_state": ["random_state"],
- }

    def __init__(
        self,
        n_bins=5,
        *,
        encode="onehot",
        strategy="quantile",
        dtype=None,
        subsample="warn",
        random_state=None,
    ):
        self.n_bins = n_bins
        self.encode = encode
        self.strategy = strategy
        self.dtype = dtype
        self.subsample = subsample
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None, sample_weight=None):
        """
        Fit the estimator.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        y : None
            Ignored. This parameter exists only for compatibility with
            :class:`~sklearn.pipeline.Pipeline`.

        sample_weight : ndarray of shape (n_samples,), default=None
            Contains weight values to be associated with each sample.
            Only supported when `strategy` is `"quantile"` or `"kmeans"`;
            raises an error when `strategy="uniform"`.

            .. versionadded:: 1.3

        Returns
        -------
        self : object
            Returns the instance itself.
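
        Examples
        --------
        A minimal sketch (the edge values below assume NumPy's default
        linear quantile interpolation):

        >>> from sklearn.preprocessing import KBinsDiscretizer
        >>> X = [[0.0], [0.4], [0.6], [1.0]]
        >>> KBinsDiscretizer(
        ...     n_bins=2, encode='ordinal', subsample=None
        ... ).fit(X).bin_edges_
        array([array([0. , 0.5, 1. ])], dtype=object)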
- """
- X = self._validate_data(X, dtype="numeric")
- if self.dtype in (np.float64, np.float32):
- output_dtype = self.dtype
- else: # self.dtype is None
- output_dtype = X.dtype
- n_samples, n_features = X.shape
- if sample_weight is not None and self.strategy == "uniform":
- raise ValueError(
- "`sample_weight` was provided but it cannot be "
- "used with strategy='uniform'. Got strategy="
- f"{self.strategy!r} instead."
- )

        if self.strategy in ("uniform", "kmeans") and self.subsample == "warn":
            warnings.warn(
                (
                    "In version 1.5 onwards, subsample=200_000 "
                    "will be used by default. Set subsample explicitly to "
                    "silence this warning in the meantime. Set "
                    "subsample=None to disable subsampling explicitly."
                ),
                FutureWarning,
            )

        subsample = self.subsample
        if subsample == "warn":
            subsample = 200000 if self.strategy == "quantile" else None
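
        # Draw a uniform random subsample of the rows when the dataset is
        # larger than the subsampling budget; the bin edges for every
        # strategy are then computed on that subset only.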
        if subsample is not None and n_samples > subsample:
            rng = check_random_state(self.random_state)
            subsample_idx = rng.choice(n_samples, size=subsample, replace=False)
            X = _safe_indexing(X, subsample_idx)

        n_features = X.shape[1]
        n_bins = self._validate_n_bins(n_features)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
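
        # Compute the edges column by column. The per-feature edge arrays can
        # end up with different lengths (small bins get dropped below), hence
        # an object array rather than a 2D float array.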
        bin_edges = np.zeros(n_features, dtype=object)
        for jj in range(n_features):
            column = X[:, jj]
            col_min, col_max = column.min(), column.max()

            if col_min == col_max:
                warnings.warn(
                    "Feature %d is constant and will be replaced with 0." % jj
                )
                n_bins[jj] = 1
                bin_edges[jj] = np.array([-np.inf, np.inf])
                continue

            if self.strategy == "uniform":
                bin_edges[jj] = np.linspace(col_min, col_max, n_bins[jj] + 1)

            elif self.strategy == "quantile":
                quantiles = np.linspace(0, 100, n_bins[jj] + 1)
                if sample_weight is None:
                    bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
                else:
                    # Weighted percentiles are computed one quantile at a
                    # time, hence the explicit loop.
                    bin_edges[jj] = np.asarray(
                        [
                            _weighted_percentile(column, sample_weight, q)
                            for q in quantiles
                        ],
                        dtype=np.float64,
                    )

            elif self.strategy == "kmeans":
                from ..cluster import KMeans  # fixes import loops

                # Deterministic initialization with uniform spacing
                uniform_edges = np.linspace(col_min, col_max, n_bins[jj] + 1)
                init = (uniform_edges[1:] + uniform_edges[:-1])[:, None] * 0.5

                # 1D k-means procedure
                km = KMeans(n_clusters=n_bins[jj], init=init, n_init=1)
                centers = km.fit(
                    column[:, None], sample_weight=sample_weight
                ).cluster_centers_[:, 0]
                # Must sort, centers may be unsorted even with sorted init
                centers.sort()
                bin_edges[jj] = (centers[1:] + centers[:-1]) * 0.5
                bin_edges[jj] = np.r_[col_min, bin_edges[jj], col_max]

            # Remove bins whose width is too small (i.e., <= 1e-8)
            if self.strategy in ("quantile", "kmeans"):
                mask = np.ediff1d(bin_edges[jj], to_begin=np.inf) > 1e-8
                bin_edges[jj] = bin_edges[jj][mask]
                if len(bin_edges[jj]) - 1 != n_bins[jj]:
                    warnings.warn(
                        "Bins whose width is too small (i.e., <= "
                        "1e-8) in feature %d are removed. Consider "
                        "decreasing the number of bins." % jj
                    )
                    n_bins[jj] = len(bin_edges[jj]) - 1

        self.bin_edges_ = bin_edges
        self.n_bins_ = n_bins

        if "onehot" in self.encode:
            self._encoder = OneHotEncoder(
                categories=[np.arange(i) for i in self.n_bins_],
                sparse_output=self.encode == "onehot",
                dtype=output_dtype,
            )
            # Fit the OneHotEncoder with a toy dataset
            # so that it's ready for use after the KBinsDiscretizer is fitted
            self._encoder.fit(np.zeros((1, len(self.n_bins_))))

        return self

    def _validate_n_bins(self, n_features):
        """Returns n_bins_, the number of bins per feature."""
        orig_bins = self.n_bins
        if isinstance(orig_bins, Integral):
            return np.full(n_features, orig_bins, dtype=int)

        n_bins = check_array(orig_bins, dtype=int, copy=True, ensure_2d=False)

        if n_bins.ndim > 1 or n_bins.shape[0] != n_features:
            raise ValueError(
                "n_bins must be a scalar or array of shape (n_features,)."
            )

        bad_nbins_value = (n_bins < 2) | (n_bins != orig_bins)

        violating_indices = np.where(bad_nbins_value)[0]
        if violating_indices.shape[0] > 0:
            indices = ", ".join(str(i) for i in violating_indices)
            raise ValueError(
                "{} received an invalid number "
                "of bins at indices {}. Number of bins "
                "must be at least 2, and must be an int.".format(
                    KBinsDiscretizer.__name__, indices
                )
            )
        return n_bins

    def transform(self, X):
        """
        Discretize the data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Data to be discretized.

        Returns
        -------
        Xt : {ndarray, sparse matrix}, dtype={np.float32, np.float64}
            Data in the binned space. Will be a sparse matrix if
            `self.encode='onehot'` and ndarray otherwise.
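
        Examples
        --------
        A minimal sketch with two uniform bins per feature:

        >>> from sklearn.preprocessing import KBinsDiscretizer
        >>> X = [[0.0], [0.4], [0.6], [1.0]]
        >>> est = KBinsDiscretizer(
        ...     n_bins=2, encode='ordinal', strategy='uniform', subsample=None
        ... ).fit(X)
        >>> est.transform(X)
        array([[0.],
               [0.],
               [1.],
               [1.]])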
- """
- check_is_fitted(self)
- # check input and attribute dtypes
- dtype = (np.float64, np.float32) if self.dtype is None else self.dtype
- Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False)
- bin_edges = self.bin_edges_
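        # Map each value to its bin index: searchsorted against the inner
        # edges counts, for each value, how many inner edges lie at or below
        # it, which is exactly the index of its bin.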
        for jj in range(Xt.shape[1]):
            Xt[:, jj] = np.searchsorted(bin_edges[jj][1:-1], Xt[:, jj], side="right")

        if self.encode == "ordinal":
            return Xt

        dtype_init = None
        if "onehot" in self.encode:
            dtype_init = self._encoder.dtype
            self._encoder.dtype = Xt.dtype
        try:
            Xt_enc = self._encoder.transform(Xt)
        finally:
            # revert the initial dtype to avoid modifying self.
            self._encoder.dtype = dtype_init
        return Xt_enc

    def inverse_transform(self, Xt):
        """
        Transform discretized data back to original feature space.

        Note that this function does not regenerate the original data
        due to discretization rounding.

        Parameters
        ----------
        Xt : array-like of shape (n_samples, n_features)
            Transformed data in the binned space.

        Returns
        -------
        Xinv : ndarray, dtype={np.float32, np.float64}
            Data in the original feature space.
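
        Examples
        --------
        A minimal sketch; each bin identifier maps back to the center of
        its bin:

        >>> from sklearn.preprocessing import KBinsDiscretizer
        >>> X = [[0.0], [0.4], [0.6], [1.0]]
        >>> est = KBinsDiscretizer(
        ...     n_bins=2, encode='ordinal', strategy='uniform', subsample=None
        ... ).fit(X)
        >>> est.inverse_transform([[0.], [1.]])
        array([[0.25],
               [0.75]])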
- """
- check_is_fitted(self)
- if "onehot" in self.encode:
- Xt = self._encoder.inverse_transform(Xt)
- Xinv = check_array(Xt, copy=True, dtype=(np.float64, np.float32))
- n_features = self.n_bins_.shape[0]
- if Xinv.shape[1] != n_features:
- raise ValueError(
- "Incorrect number of features. Expecting {}, received {}.".format(
- n_features, Xinv.shape[1]
- )
- )
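
        # Replace each bin identifier with the midpoint of its bin's edges.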
        for jj in range(n_features):
            bin_edges = self.bin_edges_[jj]
            bin_centers = (bin_edges[1:] + bin_edges[:-1]) * 0.5
            Xinv[:, jj] = bin_centers[np.int_(Xinv[:, jj])]

        return Xinv

    def get_feature_names_out(self, input_features=None):
        """Get output feature names.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
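
        Examples
        --------
        A minimal sketch; with a one-hot encoding, one name is generated per
        bin of each input feature:

        >>> from sklearn.preprocessing import KBinsDiscretizer
        >>> X = [[0.0], [0.4], [0.6], [1.0]]
        >>> est = KBinsDiscretizer(n_bins=2, encode='onehot', subsample=None)
        >>> est.fit(X).get_feature_names_out()
        array(['x0_0', 'x0_1'], dtype=object)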
- """
- check_is_fitted(self, "n_features_in_")
- input_features = _check_feature_names_in(self, input_features)
- if hasattr(self, "_encoder"):
- return self._encoder.get_feature_names_out(input_features)
- # ordinal encoding
- return input_features