_sequential.py
  1. """
  2. Sequential feature selection
  3. """
  4. from numbers import Integral, Real
  5. import numpy as np
  6. from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier
  7. from ..metrics import get_scorer_names
  8. from ..model_selection import check_cv, cross_val_score
  9. from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions
  10. from ..utils._tags import _safe_tags
  11. from ..utils.validation import check_is_fitted
  12. from ._base import SelectorMixin


class SequentialFeatureSelector(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
    """Transformer that performs Sequential Feature Selection.

    This Sequential Feature Selector adds (forward selection) or
    removes (backward selection) features to form a feature subset in a
    greedy fashion. At each stage, this estimator chooses the best feature to
    add or remove based on the cross-validation score of an estimator. In
    the case of unsupervised learning, this Sequential Feature Selector
    looks only at the features (X), not the desired outputs (y).

    Read more in the :ref:`User Guide <sequential_feature_selection>`.

    .. versionadded:: 0.24

    Parameters
    ----------
    estimator : estimator instance
        An unfitted estimator.

    n_features_to_select : "auto", int or float, default="auto"
        If `"auto"`, the behaviour depends on the `tol` parameter:

        - if `tol` is not `None`, then features are selected while the score
          change does not exceed `tol`.
        - otherwise, half of the features are selected.

        If integer, the parameter is the absolute number of features to select.
        If float between 0 and 1, it is the fraction of features to select.

        .. versionadded:: 1.1
           The option `"auto"` was added in version 1.1.

        .. versionchanged:: 1.3
           The default changed from `"warn"` to `"auto"` in 1.3.

    tol : float, default=None
        If the score is not increased by at least `tol` between two
        consecutive feature additions or removals, stop adding or removing.

        `tol` can be negative when removing features using `direction="backward"`.
        This can be useful to reduce the number of features at the cost of a
        small decrease in the score.

        `tol` is enabled only when `n_features_to_select` is `"auto"`.

        .. versionadded:: 1.1

    direction : {'forward', 'backward'}, default='forward'
        Whether to perform forward selection or backward selection.

    scoring : str or callable, default=None
        A single str (see :ref:`scoring_parameter`) or a callable
        (see :ref:`scoring`) to evaluate the predictions on the test set.

        NOTE that when using a custom scorer, it should return a single
        value.

        If None, the estimator's score method is used.

    cv : int, cross-validation generator or an iterable, default=5
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross validation,
        - integer, to specify the number of folds in a `(Stratified)KFold`,
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass,
        :class:`~sklearn.model_selection.StratifiedKFold` is used. In all other
        cases, :class:`~sklearn.model_selection.KFold` is used. These splitters
        are instantiated with `shuffle=False` so the splits will be the same
        across calls.

        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    n_jobs : int, default=None
        Number of jobs to run in parallel. When evaluating a new feature to
        add or remove, the cross-validation procedure is parallel over the
        folds.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying estimator exposes such an attribute when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_features_to_select_ : int
        The number of features that were selected.

    support_ : ndarray of shape (n_features,), dtype=bool
        The mask of selected features.

    See Also
    --------
    GenericUnivariateSelect : Univariate feature selector with configurable
        strategy.
    RFE : Recursive feature elimination based on importance weights.
    RFECV : Recursive feature elimination based on importance weights, with
        automatic selection of the number of features.
    SelectFromModel : Feature selection based on thresholds of importance
        weights.

    Examples
    --------
    >>> from sklearn.feature_selection import SequentialFeatureSelector
    >>> from sklearn.neighbors import KNeighborsClassifier
    >>> from sklearn.datasets import load_iris
    >>> X, y = load_iris(return_X_y=True)
    >>> knn = KNeighborsClassifier(n_neighbors=3)
    >>> sfs = SequentialFeatureSelector(knn, n_features_to_select=3)
    >>> sfs.fit(X, y)
    SequentialFeatureSelector(estimator=KNeighborsClassifier(n_neighbors=3),
                              n_features_to_select=3)
    >>> sfs.get_support()
    array([ True, False,  True,  True])
    >>> sfs.transform(X).shape
    (150, 3)
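
    Backward selection works the same way; the sketch below keeps two
    features (which two are kept depends on the data and the estimator,
    but the output shape does not):

    >>> sfs_back = SequentialFeatureSelector(
    ...     knn, n_features_to_select=2, direction="backward")
    >>> sfs_back.fit(X, y).transform(X).shape
    (150, 2)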
  114. """

    _parameter_constraints: dict = {
        "estimator": [HasMethods(["fit"])],
        "n_features_to_select": [
            StrOptions({"auto"}),
            Interval(RealNotInt, 0, 1, closed="right"),
            Interval(Integral, 0, None, closed="neither"),
        ],
        "tol": [None, Interval(Real, None, None, closed="neither")],
        "direction": [StrOptions({"forward", "backward"})],
        "scoring": [None, StrOptions(set(get_scorer_names())), callable],
        "cv": ["cv_object"],
        "n_jobs": [None, Integral],
    }
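
    # Illustrative note: these constraints are enforced before `fit` runs, via
    # the `_fit_context` decorator below. For example, `n_features_to_select=1.5`
    # is rejected because a float must lie in the interval (0, 1].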

    def __init__(
        self,
        estimator,
        *,
        n_features_to_select="auto",
        tol=None,
        direction="forward",
        scoring=None,
        cv=5,
        n_jobs=None,
    ):
        self.estimator = estimator
        self.n_features_to_select = n_features_to_select
        self.tol = tol
        self.direction = direction
        self.scoring = scoring
        self.cv = cv
        self.n_jobs = n_jobs

    @_fit_context(
        # SequentialFeatureSelector.estimator is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y=None):
        """Learn the features to select from X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples and
            `n_features` is the number of predictors.

        y : array-like of shape (n_samples,), default=None
            Target values. This parameter may be ignored for
            unsupervised learning.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        tags = self._get_tags()
        X = self._validate_data(
            X,
            accept_sparse="csc",
            ensure_min_features=2,
            force_all_finite=not tags.get("allow_nan", True),
        )
        n_features = X.shape[1]

        if self.n_features_to_select == "auto":
            if self.tol is not None:
                # With auto feature selection, `n_features_to_select_` will be
                # updated to `support_.sum()` after features are selected.
                self.n_features_to_select_ = n_features - 1
            else:
                self.n_features_to_select_ = n_features // 2
        elif isinstance(self.n_features_to_select, Integral):
            if self.n_features_to_select >= n_features:
                raise ValueError("n_features_to_select must be < n_features.")
            self.n_features_to_select_ = self.n_features_to_select
        elif isinstance(self.n_features_to_select, Real):
            self.n_features_to_select_ = int(n_features * self.n_features_to_select)
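            # Illustrative arithmetic: with n_features=4 and
            # n_features_to_select=0.5, int(4 * 0.5) == 2 features are kept.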

        if self.tol is not None and self.tol < 0 and self.direction == "forward":
            raise ValueError("tol must be positive when doing forward selection")

        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))

        cloned_estimator = clone(self.estimator)

        # the current mask corresponds to the set of features:
        # - that we have already *selected* if we do forward selection
        # - that we have already *excluded* if we do backward selection
        current_mask = np.zeros(shape=n_features, dtype=bool)
        n_iterations = (
            self.n_features_to_select_
            if self.n_features_to_select == "auto" or self.direction == "forward"
            else n_features - self.n_features_to_select_
        )
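
        # Illustrative walk-through: going from 10 features down to 3 with
        # direction="backward" removes 10 - 3 = 7 features, so n_iterations
        # is 7; forward selection targeting 3 features instead runs 3
        # iterations, adding one feature per pass. With "auto" and a `tol`,
        # up to n_features - 1 passes are allowed and the `tol` check below
        # stops the loop early.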

        old_score = -np.inf
        is_auto_select = self.tol is not None and self.n_features_to_select == "auto"
        for _ in range(n_iterations):
            new_feature_idx, new_score = self._get_best_new_feature_score(
                cloned_estimator, X, y, cv, current_mask
            )
            if is_auto_select and ((new_score - old_score) < self.tol):
                break

            old_score = new_score
            current_mask[new_feature_idx] = True

        if self.direction == "backward":
            current_mask = ~current_mask

        self.support_ = current_mask
        self.n_features_to_select_ = self.support_.sum()

        return self

    def _get_best_new_feature_score(self, estimator, X, y, cv, current_mask):
        # Return the best new feature and its score to add to the current_mask,
        # i.e. the best feature to add when doing forward selection or to
        # remove when doing backward selection. In `fit`, when
        # `n_features_to_select` is "auto", the feature is only kept if its
        # score improves on the previous score by more than `tol`.
        candidate_feature_indices = np.flatnonzero(~current_mask)
        scores = {}
        for feature_idx in candidate_feature_indices:
            candidate_mask = current_mask.copy()
            candidate_mask[feature_idx] = True
            if self.direction == "backward":
                candidate_mask = ~candidate_mask
            X_new = X[:, candidate_mask]
            scores[feature_idx] = cross_val_score(
                estimator,
                X_new,
                y,
                cv=cv,
                scoring=self.scoring,
                n_jobs=self.n_jobs,
            ).mean()
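
        # Illustrative example: in forward mode with current_mask equal to
        # [True, False, False], the candidates are features 1 and 2; each is
        # scored together with the already selected feature 0, and the
        # best-scoring candidate is returned below.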
        new_feature_idx = max(scores, key=lambda feature_idx: scores[feature_idx])
        return new_feature_idx, scores[new_feature_idx]

    def _get_support_mask(self):
        check_is_fitted(self)
        return self.support_

    def _more_tags(self):
        return {
            "allow_nan": _safe_tags(self.estimator, key="allow_nan"),
        }