# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#          Vincent Michel <vincent.michel@inria.fr>
#          Gilles Louppe <g.louppe@gmail.com>
#
# License: BSD 3 clause

"""Recursive feature elimination for feature ranking"""

from numbers import Integral

import numpy as np
from joblib import effective_n_jobs

from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier
from ..metrics import check_scoring
from ..model_selection import check_cv
from ..model_selection._validation import _score
from ..utils._param_validation import HasMethods, Interval, RealNotInt
from ..utils._tags import _safe_tags
from ..utils.metaestimators import _safe_split, available_if
from ..utils.parallel import Parallel, delayed
from ..utils.validation import check_is_fitted

from ._base import SelectorMixin, _get_feature_importances


def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
    """Return the score for a fit across one fold."""
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    return rfe._fit(
        X_train,
        y_train,
        lambda estimator, features: _score(
            estimator, X_test[:, features], y_test, scorer
        ),
    ).scores_


def _estimator_has(attr):
    """Check if we can delegate a method to the underlying estimator.

    First, we check the fitted `estimator_` if available, otherwise we
    check the unfitted `estimator`.
    """
    return lambda self: (
        hasattr(self.estimator_, attr)
        if hasattr(self, "estimator_")
        else hasattr(self.estimator, attr)
    )
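
# `_estimator_has` is used with `available_if` below so that methods such as
# `predict` or `predict_proba` are only exposed on RFE/RFECV when the wrapped
# estimator actually implements them.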


class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
    """Feature ranking with recursive feature elimination.

    Given an external estimator that assigns weights to features (e.g., the
    coefficients of a linear model), the goal of recursive feature elimination
    (RFE) is to select features by recursively considering smaller and smaller
    sets of features. First, the estimator is trained on the initial set of
    features and the importance of each feature is obtained either through
    any specific attribute or callable.
    Then, the least important features are pruned from the current set of
    features. That procedure is recursively repeated on the pruned set until
    the desired number of features to select is eventually reached.

    Read more in the :ref:`User Guide <rfe>`.

    Parameters
    ----------
    estimator : ``Estimator`` instance
        A supervised learning estimator with a ``fit`` method that provides
        information about feature importance
        (e.g. `coef_`, `feature_importances_`).

    n_features_to_select : int or float, default=None
        The number of features to select. If `None`, half of the features are
        selected. If integer, the parameter is the absolute number of features
        to select. If float between 0 and 1, it is the fraction of features to
        select.

        .. versionchanged:: 0.24
           Added float values for fractions.

    step : int or float, default=1
        If greater than or equal to 1, then ``step`` corresponds to the
        (integer) number of features to remove at each iteration.
        If within (0.0, 1.0), then ``step`` corresponds to the percentage
        (rounded down) of features to remove at each iteration.

    verbose : int, default=0
        Controls verbosity of output.

    importance_getter : str or callable, default='auto'
        If 'auto', uses the feature importance either through a `coef_`
        or `feature_importances_` attribute of the estimator.

        Also accepts a string that specifies an attribute name/path
        for extracting feature importance (implemented with `attrgetter`).
        For example, give `regressor_.coef_` in case of
        :class:`~sklearn.compose.TransformedTargetRegressor` or
        `named_steps.clf.feature_importances_` in case of a
        :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.

        If `callable`, overrides the default feature importance getter.
        The callable is passed the fitted estimator and it should
        return importance for each feature.

        .. versionadded:: 0.24

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        The class labels. Only available when `estimator` is a classifier.

    estimator_ : ``Estimator`` instance
        The fitted estimator used to select features.

    n_features_ : int
        The number of selected features.

    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying estimator exposes such an attribute when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    ranking_ : ndarray of shape (n_features,)
        The feature ranking, such that ``ranking_[i]`` corresponds to the
        ranking position of the i-th feature. Selected (i.e., estimated
        best) features are assigned rank 1.

    support_ : ndarray of shape (n_features,)
        The mask of selected features.

    See Also
    --------
    RFECV : Recursive feature elimination with built-in cross-validated
        selection of the best number of features.
    SelectFromModel : Feature selection based on thresholds of importance
        weights.
    SequentialFeatureSelector : Sequential cross-validation based feature
        selection. Does not rely on importance weights.

    Notes
    -----
    Allows NaN/Inf in the input if the underlying estimator does as well.

    References
    ----------
    .. [1] Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., "Gene selection
           for cancer classification using support vector machines",
           Mach. Learn., 46(1-3), 389--422, 2002.

    Examples
    --------
    The following example shows how to retrieve the 5 most informative
    features in the Friedman #1 dataset.

    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.feature_selection import RFE
    >>> from sklearn.svm import SVR
    >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    >>> estimator = SVR(kernel="linear")
    >>> selector = RFE(estimator, n_features_to_select=5, step=1)
    >>> selector = selector.fit(X, y)
    >>> selector.support_
    array([ True,  True,  True,  True,  True, False, False, False, False,
           False])
    >>> selector.ranking_
    array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])
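
    The reduced dataset can then be obtained with ``transform``:

    >>> selector.transform(X).shape
    (50, 5)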
- """

    _parameter_constraints: dict = {
        "estimator": [HasMethods(["fit"])],
        "n_features_to_select": [
            None,
            Interval(RealNotInt, 0, 1, closed="right"),
            Interval(Integral, 0, None, closed="neither"),
        ],
        "step": [
            Interval(Integral, 0, None, closed="neither"),
            Interval(RealNotInt, 0, 1, closed="neither"),
        ],
        "verbose": ["verbose"],
        "importance_getter": [str, callable],
    }
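    # These constraints are enforced by the `_fit_context` decorator when
    # `fit` is called; e.g. `step=0` is rejected before any computation runs.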

    def __init__(
        self,
        estimator,
        *,
        n_features_to_select=None,
        step=1,
        verbose=0,
        importance_getter="auto",
    ):
        self.estimator = estimator
        self.n_features_to_select = n_features_to_select
        self.step = step
        self.importance_getter = importance_getter
        self.verbose = verbose

    @property
    def _estimator_type(self):
        return self.estimator._estimator_type

    @property
    def classes_(self):
        """Class labels available when `estimator` is a classifier.

        Returns
        -------
        ndarray of shape (n_classes,)
        """
        return self.estimator_.classes_

    @_fit_context(
        # RFE.estimator is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y, **fit_params):
        """Fit the RFE model and then the underlying estimator on the selected features.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples.

        y : array-like of shape (n_samples,)
            The target values.

        **fit_params : dict
            Additional parameters passed to the `fit` method of the underlying
            estimator.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        return self._fit(X, y, **fit_params)

    def _fit(self, X, y, step_score=None, **fit_params):
        # Parameter step_score controls the calculation of self.scores_.
        # step_score is not exposed to users and is only used when
        # implementing RFECV: self.scores_ will not be calculated when
        # calling _fit through fit.
        tags = self._get_tags()
        X, y = self._validate_data(
            X,
            y,
            accept_sparse="csc",
            ensure_min_features=2,
            force_all_finite=not tags.get("allow_nan", True),
            multi_output=True,
        )

        # Initialization
        n_features = X.shape[1]

        if self.n_features_to_select is None:
            n_features_to_select = n_features // 2
        elif isinstance(self.n_features_to_select, Integral):  # int
            n_features_to_select = self.n_features_to_select
        else:  # float
            n_features_to_select = int(n_features * self.n_features_to_select)

        if 0.0 < self.step < 1.0:
            step = int(max(1, self.step * n_features))
        else:
            step = int(self.step)
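
        # Illustrative arithmetic: with n_features=10, a float
        # n_features_to_select=0.3 keeps int(10 * 0.3) == 3 features, while a
        # float step=0.3 removes int(max(1, 0.3 * 10)) == 3 features per
        # iteration (never fewer than 1).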

        support_ = np.ones(n_features, dtype=bool)
        ranking_ = np.ones(n_features, dtype=int)

        if step_score:
            self.scores_ = []

        # Elimination
        while np.sum(support_) > n_features_to_select:
            # Remaining features
            features = np.arange(n_features)[support_]

            # Rank the remaining features
            estimator = clone(self.estimator)
            if self.verbose > 0:
                print("Fitting estimator with %d features." % np.sum(support_))

            estimator.fit(X[:, features], y, **fit_params)

            # Get importance and rank them
            importances = _get_feature_importances(
                estimator,
                self.importance_getter,
                transform_func="square",
            )
            ranks = np.argsort(importances)

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            # Eliminate the worst features
            threshold = min(step, np.sum(support_) - n_features_to_select)

            # Compute step score on the previous selection iteration
            # because 'estimator' must use features
            # that have not been eliminated yet
            if step_score:
                self.scores_.append(step_score(estimator, features))
            support_[features[ranks][:threshold]] = False
            ranking_[np.logical_not(support_)] += 1
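            # Note that the ranks of all eliminated features are bumped again
            # on every later iteration, so the earliest-dropped features end
            # up with the largest values in ranking_ while surviving features
            # keep rank 1.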

        # Set final attributes
        features = np.arange(n_features)[support_]
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X[:, features], y, **fit_params)

        # Compute step score when only n_features_to_select features left
        if step_score:
            self.scores_.append(step_score(self.estimator_, features))
        self.n_features_ = support_.sum()
        self.support_ = support_
        self.ranking_ = ranking_

        return self
- @available_if(_estimator_has("predict"))
- def predict(self, X):
- """Reduce X to the selected features and predict using the estimator.
- Parameters
- ----------
- X : array of shape [n_samples, n_features]
- The input samples.
- Returns
- -------
- y : array of shape [n_samples]
- The predicted target values.
- """
- check_is_fitted(self)
- return self.estimator_.predict(self.transform(X))
- @available_if(_estimator_has("score"))
- def score(self, X, y, **fit_params):
- """Reduce X to the selected features and return the score of the estimator.
- Parameters
- ----------
- X : array of shape [n_samples, n_features]
- The input samples.
- y : array of shape [n_samples]
- The target values.
- **fit_params : dict
- Parameters to pass to the `score` method of the underlying
- estimator.
- .. versionadded:: 1.0
- Returns
- -------
- score : float
- Score of the underlying base estimator computed with the selected
- features returned by `rfe.transform(X)` and `y`.
- """
- check_is_fitted(self)
- return self.estimator_.score(self.transform(X), y, **fit_params)

    def _get_support_mask(self):
        check_is_fitted(self)
        return self.support_
- @available_if(_estimator_has("decision_function"))
- def decision_function(self, X):
- """Compute the decision function of ``X``.
- Parameters
- ----------
- X : {array-like or sparse matrix} of shape (n_samples, n_features)
- The input samples. Internally, it will be converted to
- ``dtype=np.float32`` and if a sparse matrix is provided
- to a sparse ``csr_matrix``.
- Returns
- -------
- score : array, shape = [n_samples, n_classes] or [n_samples]
- The decision function of the input samples. The order of the
- classes corresponds to that in the attribute :term:`classes_`.
- Regression and binary classification produce an array of shape
- [n_samples].
- """
- check_is_fitted(self)
- return self.estimator_.decision_function(self.transform(X))
- @available_if(_estimator_has("predict_proba"))
- def predict_proba(self, X):
- """Predict class probabilities for X.
- Parameters
- ----------
- X : {array-like or sparse matrix} of shape (n_samples, n_features)
- The input samples. Internally, it will be converted to
- ``dtype=np.float32`` and if a sparse matrix is provided
- to a sparse ``csr_matrix``.
- Returns
- -------
- p : array of shape (n_samples, n_classes)
- The class probabilities of the input samples. The order of the
- classes corresponds to that in the attribute :term:`classes_`.
- """
- check_is_fitted(self)
- return self.estimator_.predict_proba(self.transform(X))
- @available_if(_estimator_has("predict_log_proba"))
- def predict_log_proba(self, X):
- """Predict class log-probabilities for X.
- Parameters
- ----------
- X : array of shape [n_samples, n_features]
- The input samples.
- Returns
- -------
- p : array of shape (n_samples, n_classes)
- The class log-probabilities of the input samples. The order of the
- classes corresponds to that in the attribute :term:`classes_`.
- """
- check_is_fitted(self)
- return self.estimator_.predict_log_proba(self.transform(X))

    def _more_tags(self):
        return {
            "poor_score": True,
            "allow_nan": _safe_tags(self.estimator, key="allow_nan"),
            "requires_y": True,
        }


class RFECV(RFE):
    """Recursive feature elimination with cross-validation to select features.

    See glossary entry for :term:`cross-validation estimator`.

    Read more in the :ref:`User Guide <rfe>`.

    Parameters
    ----------
    estimator : ``Estimator`` instance
        A supervised learning estimator with a ``fit`` method that provides
        information about feature importance either through a ``coef_``
        attribute or through a ``feature_importances_`` attribute.

    step : int or float, default=1
        If greater than or equal to 1, then ``step`` corresponds to the
        (integer) number of features to remove at each iteration.
        If within (0.0, 1.0), then ``step`` corresponds to the percentage
        (rounded down) of features to remove at each iteration.
        Note that the last iteration may remove fewer than ``step`` features in
        order to reach ``min_features_to_select``.

    min_features_to_select : int, default=1
        The minimum number of features to be selected. This number of features
        will always be scored, even if the difference between the original
        feature count and ``min_features_to_select`` isn't divisible by
        ``step``.

        .. versionadded:: 0.20

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross-validation,
        - integer, to specify the number of folds,
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if the estimator is a classifier and ``y`` is
        either binary or multiclass,
        :class:`~sklearn.model_selection.StratifiedKFold` is used. In all
        other cases, :class:`~sklearn.model_selection.KFold` is used.

        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        .. versionchanged:: 0.22
            ``cv`` default value of None changed from 3-fold to 5-fold.

    scoring : str, callable or None, default=None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    verbose : int, default=0
        Controls verbosity of output.

    n_jobs : int or None, default=None
        Number of cores to run in parallel while fitting across folds.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

        .. versionadded:: 0.18

    importance_getter : str or callable, default='auto'
        If 'auto', uses the feature importance either through a `coef_`
        or `feature_importances_` attribute of the estimator.

        Also accepts a string that specifies an attribute name/path
        for extracting feature importance.
        For example, give `regressor_.coef_` in case of
        :class:`~sklearn.compose.TransformedTargetRegressor` or
        `named_steps.clf.feature_importances_` in case of a
        :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.

        If `callable`, overrides the default feature importance getter.
        The callable is passed the fitted estimator and it should
        return importance for each feature.

        .. versionadded:: 0.24

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        The class labels. Only available when `estimator` is a classifier.

    estimator_ : ``Estimator`` instance
        The fitted estimator used to select features.

    cv_results_ : dict of ndarrays
        A dict with keys:

        split(k)_test_score : ndarray of shape (n_subsets_of_features,)
            The cross-validation scores across the (k)th fold.

        mean_test_score : ndarray of shape (n_subsets_of_features,)
            Mean of scores over the folds.

        std_test_score : ndarray of shape (n_subsets_of_features,)
            Standard deviation of scores over the folds.

        .. versionadded:: 1.0

    n_features_ : int
        The number of selected features with cross-validation.

    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying estimator exposes such an attribute when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    ranking_ : ndarray of shape (n_features,)
        The feature ranking, such that `ranking_[i]` corresponds to the
        ranking position of the i-th feature. Selected (i.e., estimated
        best) features are assigned rank 1.

    support_ : ndarray of shape (n_features,)
        The mask of selected features.

    See Also
    --------
    RFE : Recursive feature elimination.

    Notes
    -----
    The size of all values in ``cv_results_`` is equal to
    ``ceil((n_features - min_features_to_select) / step) + 1``,
    where step is the number of features removed at each iteration.

    Allows NaN/Inf in the input if the underlying estimator does as well.

    References
    ----------
    .. [1] Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., "Gene selection
           for cancer classification using support vector machines",
           Mach. Learn., 46(1-3), 389--422, 2002.

    Examples
    --------
    The following example shows how to retrieve the 5 informative features,
    whose number is not known a priori, in the Friedman #1 dataset.

    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.feature_selection import RFECV
    >>> from sklearn.svm import SVR
    >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    >>> estimator = SVR(kernel="linear")
    >>> selector = RFECV(estimator, step=1, cv=5)
    >>> selector = selector.fit(X, y)
    >>> selector.support_
    array([ True,  True,  True,  True,  True, False, False, False, False,
           False])
    >>> selector.ranking_
    array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])
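
    The number of feature subsets scored during cross-validation follows the
    formula in the Notes section; here ``ceil((10 - 1) / 1) + 1 == 10``:

    >>> selector.cv_results_["mean_test_score"].shape
    (10,)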
- """

    _parameter_constraints: dict = {
        **RFE._parameter_constraints,
        "min_features_to_select": [Interval(Integral, 0, None, closed="neither")],
        "cv": ["cv_object"],
        "scoring": [None, str, callable],
        "n_jobs": [None, Integral],
    }
    _parameter_constraints.pop("n_features_to_select")
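    # RFECV tunes the number of selected features itself, so the parent's
    # `n_features_to_select` constraint does not apply here.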

    def __init__(
        self,
        estimator,
        *,
        step=1,
        min_features_to_select=1,
        cv=None,
        scoring=None,
        verbose=0,
        n_jobs=None,
        importance_getter="auto",
    ):
        self.estimator = estimator
        self.step = step
        self.importance_getter = importance_getter
        self.cv = cv
        self.scoring = scoring
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.min_features_to_select = min_features_to_select

    @_fit_context(
        # RFECV.estimator is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y, groups=None):
        """Fit the RFE model and automatically tune the number of selected features.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the total number of features.

        y : array-like of shape (n_samples,)
            Target values (integers for classification, real numbers for
            regression).

        groups : array-like of shape (n_samples,) or None, default=None
            Group labels for the samples used while splitting the dataset into
            train/test set. Only used in conjunction with a "Group" :term:`cv`
            instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).

            .. versionadded:: 0.20

        Returns
        -------
        self : object
            Fitted estimator.
        """
        tags = self._get_tags()
        X, y = self._validate_data(
            X,
            y,
            accept_sparse="csr",
            ensure_min_features=2,
            force_all_finite=not tags.get("allow_nan", True),
            multi_output=True,
        )

        # Initialization
        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        n_features = X.shape[1]

        if 0.0 < self.step < 1.0:
            step = int(max(1, self.step * n_features))
        else:
            step = int(self.step)

        # Build an RFE object, which will evaluate and score each possible
        # feature count, down to self.min_features_to_select
        rfe = RFE(
            estimator=self.estimator,
            n_features_to_select=self.min_features_to_select,
            importance_getter=self.importance_getter,
            step=self.step,
            verbose=self.verbose,
        )

        # Determine the number of subsets of features by fitting across
        # the train folds and choosing the "features_to_select" parameter
        # that gives the least averaged error across all folds.

        # Note that joblib raises a non-picklable error for bound methods
        # even if n_jobs is set to 1 with the default multiprocessing
        # backend.
        # This branching ensures that user code that sets n_jobs to 1 and
        # provides bound methods as scorers is not broken by the addition
        # of the n_jobs parameter in version 0.18.
        if effective_n_jobs(self.n_jobs) == 1:
            parallel, func = list, _rfe_single_fit
        else:
            parallel = Parallel(n_jobs=self.n_jobs)
            func = delayed(_rfe_single_fit)

        scores = parallel(
            func(rfe, self.estimator, X, y, train, test, scorer)
            for train, test in cv.split(X, y, groups)
        )

        scores = np.array(scores)
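        # scores has shape (n_folds, n_subsets_of_features): column 0 holds
        # the scores obtained with the full feature set and the last column
        # the scores with min_features_to_select features.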
        scores_sum = np.sum(scores, axis=0)
        scores_sum_rev = scores_sum[::-1]
        argmax_idx = len(scores_sum) - np.argmax(scores_sum_rev) - 1
        n_features_to_select = max(
            n_features - (argmax_idx * step), self.min_features_to_select
        )
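        # Note: the argmax above is taken over the reversed scores so that
        # ties are broken in favor of the candidate with the largest original
        # index, i.e. the fewest features among equally good scores.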

        # Re-execute an elimination with best_k over the whole set
        rfe = RFE(
            estimator=self.estimator,
            n_features_to_select=n_features_to_select,
            step=self.step,
            importance_getter=self.importance_getter,
            verbose=self.verbose,
        )

        rfe.fit(X, y)

        # Set final attributes
        self.support_ = rfe.support_
        self.n_features_ = rfe.n_features_
        self.ranking_ = rfe.ranking_
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(self._transform(X), y)

        # Reverse the per-fold scores so that index 0 of cv_results_
        # corresponds to min_features_to_select features, consistent with
        # the ordering of the former grid_scores_ attribute.
        scores_rev = scores[:, ::-1]
        self.cv_results_ = {}
        self.cv_results_["mean_test_score"] = np.mean(scores_rev, axis=0)
        self.cv_results_["std_test_score"] = np.std(scores_rev, axis=0)
        for i in range(scores.shape[0]):
            self.cv_results_[f"split{i}_test_score"] = scores_rev[i]
        return self