_variance_threshold.py 4.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. # Author: Lars Buitinck
  2. # License: 3-clause BSD
  3. from numbers import Real
  4. import numpy as np
  5. from ..base import BaseEstimator, _fit_context
  6. from ..utils._param_validation import Interval
  7. from ..utils.sparsefuncs import mean_variance_axis, min_max_axis
  8. from ..utils.validation import check_is_fitted
  9. from ._base import SelectorMixin
  class VarianceThreshold(SelectorMixin, BaseEstimator):
      """Feature selector that drops every feature whose variance is too low.

      Only the features (X) are inspected, never the desired outputs (y), so
      this selector can be used for unsupervised learning.

      Read more in the :ref:`User Guide <variance_threshold>`.

      Parameters
      ----------
      threshold : float, default=0.0
          Variance cut-off. A feature is kept only when its training-set
          variance is strictly greater than this value. The default keeps all
          features with non-zero variance, i.e. it removes the features that
          have the same value in all samples.

      Attributes
      ----------
      variances_ : array, shape (n_features,)
          Variances of individual features. When `threshold` is 0, each entry
          is the smaller of the feature's variance and its peak-to-peak range.

      n_features_in_ : int
          Number of features seen during :term:`fit`.

          .. versionadded:: 0.24

      feature_names_in_ : ndarray of shape (`n_features_in_`,)
          Names of features seen during :term:`fit`. Defined only when `X`
          has feature names that are all strings.

          .. versionadded:: 1.0

      See Also
      --------
      SelectFromModel: Meta-transformer for selecting features based on
          importance weights.
      SelectPercentile : Select features according to a percentile of the
          highest scores.
      SequentialFeatureSelector : Transformer that performs Sequential Feature
          Selection.

      Notes
      -----
      Allows NaN in the input.
      Raises ValueError if no feature in X meets the variance threshold.

      Examples
      --------
      The following dataset has integer features, two of which are the same
      in every sample. These are removed with the default setting for threshold::

          >>> from sklearn.feature_selection import VarianceThreshold
          >>> X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
          >>> selector = VarianceThreshold()
          >>> selector.fit_transform(X)
          array([[2, 0],
                 [1, 4],
                 [1, 1]])
      """

      # `threshold` must be a real number in the half-open interval [0, +inf).
      _parameter_constraints: dict = {
          "threshold": [Interval(Real, 0, None, closed="left")],
      }

      def __init__(self, threshold=0.0):
          self.threshold = threshold

      @_fit_context(prefer_skip_nested_validation=True)
      def fit(self, X, y=None):
          """Compute the per-feature empirical variances of X.

          Parameters
          ----------
          X : {array-like, sparse matrix}, shape (n_samples, n_features)
              Data from which to compute variances, where `n_samples` is
              the number of samples and `n_features` is the number of features.

          y : any, default=None
              Ignored. This parameter exists only for compatibility with
              sklearn.pipeline.Pipeline.

          Returns
          -------
          self : object
              Returns the instance itself.

          Raises
          ------
          ValueError
              If every feature has a non-finite variance or a variance that
              does not exceed `threshold`.
          """
          X = self._validate_data(
              X,
              accept_sparse=("csr", "csc"),
              dtype=np.float64,
              force_all_finite="allow-nan",
          )

          is_sparse = hasattr(X, "toarray")
          zero_threshold = self.threshold == 0

          # Step 1: raw variances, with a dedicated routine for sparse input.
          if is_sparse:
              _, self.variances_ = mean_variance_axis(X, axis=0)
          else:
              self.variances_ = np.nanvar(X, axis=0)

          # Step 2: with a zero threshold, a constant column may still get a
          # tiny non-zero variance from floating-point round-off. Its
          # peak-to-peak range is exactly zero, so take the smaller of the two.
          if zero_threshold:
              if is_sparse:
                  col_min, col_max = min_max_axis(X, axis=0)
                  spread = col_max - col_min
              else:
                  spread = np.ptp(X, axis=0)
              stacked = np.array([self.variances_, spread])
              self.variances_ = np.nanmin(stacked, axis=0)

          # Step 3: succeed as soon as at least one feature survives.
          not_finite = ~np.isfinite(self.variances_)
          too_small = self.variances_ <= self.threshold
          if not np.all(not_finite | too_small):
              return self

          message = "No feature in X meets the variance threshold {0:.5f}"
          if X.shape[0] == 1:
              message += " (X contains only one sample)"
          raise ValueError(message.format(self.threshold))

      def _get_support_mask(self):
          """Return the boolean mask of features whose variance exceeds the threshold."""
          check_is_fitted(self)
          keep = self.variances_ > self.threshold
          return keep

      def _more_tags(self):
          """Return the extra tags of this estimator: `allow_nan` is True."""
          tags = {"allow_nan": True}
          return tags