| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136 |
- # Author: Lars Buitinck
- # License: 3-clause BSD
- from numbers import Real
- import numpy as np
- from ..base import BaseEstimator, _fit_context
- from ..utils._param_validation import Interval
- from ..utils.sparsefuncs import mean_variance_axis, min_max_axis
- from ..utils.validation import check_is_fitted
- from ._base import SelectorMixin
class VarianceThreshold(SelectorMixin, BaseEstimator):
    """Feature selector that drops every low-variance feature.

    Only the features (X) are inspected, never the desired outputs (y), so
    this selector is suitable for unsupervised learning.

    Read more in the :ref:`User Guide <variance_threshold>`.

    Parameters
    ----------
    threshold : float, default=0
        Features whose training-set variance does not exceed this threshold
        are removed. With the default, every feature with non-zero variance
        is kept, i.e. only features that take the same value in all samples
        are removed.

    Attributes
    ----------
    variances_ : array, shape (n_features,)
        Variances of individual features.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    SelectFromModel: Meta-transformer for selecting features based on
        importance weights.
    SelectPercentile : Select features according to a percentile of the highest
        scores.
    SequentialFeatureSelector : Transformer that performs Sequential Feature
        Selection.

    Notes
    -----
    Allows NaN in the input.
    Raises ValueError if no feature in X meets the variance threshold.

    Examples
    --------
    The following dataset has integer features, two of which are the same
    in every sample. These are removed with the default setting for threshold::

        >>> from sklearn.feature_selection import VarianceThreshold
        >>> X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
        >>> selector = VarianceThreshold()
        >>> selector.fit_transform(X)
        array([[2, 0],
               [1, 4],
               [1, 1]])
    """

    # `threshold` must be a real number in [0, +inf).
    _parameter_constraints: dict = {
        "threshold": [Interval(Real, 0, None, closed="left")]
    }

    def __init__(self, threshold=0.0):
        self.threshold = threshold

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Compute the per-feature empirical variances of X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            Data from which to compute variances, where `n_samples` is
            the number of samples and `n_features` is the number of features.

        y : any, default=None
            Ignored. This parameter exists only for compatibility with
            sklearn.pipeline.Pipeline.

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        ValueError
            If no feature has a finite variance strictly above `threshold`.
        """
        X = self._validate_data(
            X,
            accept_sparse=("csr", "csc"),
            dtype=np.float64,
            force_all_finite="allow-nan",
        )

        sparse_input = hasattr(X, "toarray")
        zero_threshold = self.threshold == 0

        # Step 1: raw per-column variances.
        if sparse_input:
            _, self.variances_ = mean_variance_axis(X, axis=0)
        else:
            self.variances_ = np.nanvar(X, axis=0)

        # Step 2: with a zero threshold, a constant column may still get a
        # tiny non-zero variance from floating-point round-off. Its
        # peak-to-peak range is exactly zero, so keep the smaller of the two
        # values (NaN entries are skipped by nanmin).
        if zero_threshold:
            if sparse_input:
                col_mins, col_maxes = min_max_axis(X, axis=0)
                spread = col_maxes - col_mins
            else:
                spread = np.ptp(X, axis=0)
            stacked = np.array([self.variances_, spread])
            self.variances_ = np.nanmin(stacked, axis=0)

        # Step 3: fail if not a single feature would survive the selection.
        survivors = np.isfinite(self.variances_) & (self.variances_ > self.threshold)
        if not np.any(survivors):
            single_sample_note = (
                " (X contains only one sample)" if X.shape[0] == 1 else ""
            )
            template = (
                "No feature in X meets the variance threshold {0:.5f}"
                + single_sample_note
            )
            raise ValueError(template.format(self.threshold))

        return self

    def _get_support_mask(self):
        """Return the boolean mask of features whose variance exceeds the threshold."""
        check_is_fitted(self)
        keep = self.variances_ > self.threshold
        return keep

    def _more_tags(self):
        """Return estimator tags declaring that NaN input is allowed."""
        return dict(allow_nan=True)
|