# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#          Vincent Michel <vincent.michel@inria.fr>
#          Gilles Louppe <g.louppe@gmail.com>
#
# License: BSD 3 clause

"""Recursive feature elimination for feature ranking"""

from numbers import Integral

import numpy as np
from joblib import effective_n_jobs

from ..base import (
    BaseEstimator,
    MetaEstimatorMixin,
    _fit_context,
    clone,
    is_classifier,
)
from ..metrics import check_scoring
from ..model_selection import check_cv
from ..model_selection._validation import _score
from ..utils._param_validation import HasMethods, Interval, RealNotInt
from ..utils._tags import _safe_tags
from ..utils.metaestimators import _safe_split, available_if
from ..utils.parallel import Parallel, delayed
from ..utils.validation import check_is_fitted
from ._base import SelectorMixin, _get_feature_importances


def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
    """
    Return the score for a fit across one fold.
    """
    X_train, y_train = _safe_split(estimator, X, y, train)
    X_test, y_test = _safe_split(estimator, X, y, test, train)
    return rfe._fit(
        X_train,
        y_train,
        lambda estimator, features: _score(
            estimator, X_test[:, features], y_test, scorer
        ),
    ).scores_
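

# A minimal sketch of how the helper above behaves (illustrative numbers,
# assuming X has 20 features and the default step=1): each fold yields one
# score per candidate feature count, from the full set down to
# n_features_to_select.
#
#     rfe = RFE(estimator, n_features_to_select=5)
#     fold_scores = _rfe_single_fit(rfe, estimator, X, y, train, test, scorer)
#     # len(fold_scores) == 16; fold_scores[0] used all 20 features,
#     # fold_scores[-1] the final 5.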


def _estimator_has(attr):
    """Check if we can delegate a method to the underlying estimator.

    First, we check the fitted `estimator_` if available, otherwise we
    check the unfitted `estimator`.
    """
    return lambda self: (
        hasattr(self.estimator_, attr)
        if hasattr(self, "estimator_")
        else hasattr(self.estimator, attr)
    )
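

# A minimal sketch of the delegation pattern above: combined with
# `available_if`, it exposes a method on the meta-estimator only when the
# wrapped estimator provides it. For example (estimators chosen only for
# illustration):
#
#     from sklearn.linear_model import LinearRegression, LogisticRegression
#     hasattr(RFE(LogisticRegression()), "predict_proba")  # True
#     hasattr(RFE(LinearRegression()), "predict_proba")    # False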


class RFE(SelectorMixin, MetaEstimatorMixin, BaseEstimator):
    """Feature ranking with recursive feature elimination.

    Given an external estimator that assigns weights to features (e.g., the
    coefficients of a linear model), the goal of recursive feature elimination
    (RFE) is to select features by recursively considering smaller and smaller
    sets of features. First, the estimator is trained on the initial set of
    features and the importance of each feature is obtained either through
    any specific attribute or callable.
    Then, the least important features are pruned from the current set of
    features. That procedure is recursively repeated on the pruned set until
    the desired number of features to select is eventually reached.

    Read more in the :ref:`User Guide <rfe>`.

    Parameters
    ----------
    estimator : ``Estimator`` instance
        A supervised learning estimator with a ``fit`` method that provides
        information about feature importance
        (e.g. `coef_`, `feature_importances_`).

    n_features_to_select : int or float, default=None
        The number of features to select. If `None`, half of the features are
        selected. If integer, the parameter is the absolute number of features
        to select. If float between 0 and 1, it is the fraction of features to
        select.

        .. versionchanged:: 0.24
           Added float values for fractions.

    step : int or float, default=1
        If greater than or equal to 1, then ``step`` corresponds to the
        (integer) number of features to remove at each iteration.
        If within (0.0, 1.0), then ``step`` corresponds to the percentage
        (rounded down) of features to remove at each iteration.

    verbose : int, default=0
        Controls verbosity of output.

    importance_getter : str or callable, default='auto'
        If 'auto', uses the feature importance either through a `coef_`
        or `feature_importances_` attributes of estimator.

        Also accepts a string that specifies an attribute name/path
        for extracting feature importance (implemented with `attrgetter`).
        For example, give `regressor_.coef_` in case of
        :class:`~sklearn.compose.TransformedTargetRegressor` or
        `named_steps.clf.feature_importances_` in case of
        :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.

        If `callable`, overrides the default feature importance getter.
        The callable is passed with the fitted estimator and it should
        return importance for each feature.

        .. versionadded:: 0.24

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        The classes labels. Only available when `estimator` is a classifier.

    estimator_ : ``Estimator`` instance
        The fitted estimator used to select features.

    n_features_ : int
        The number of selected features.

    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying estimator exposes such an attribute when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    ranking_ : ndarray of shape (n_features,)
        The feature ranking, such that ``ranking_[i]`` corresponds to the
        ranking position of the i-th feature. Selected (i.e., estimated
        best) features are assigned rank 1.

    support_ : ndarray of shape (n_features,)
        The mask of selected features.

    See Also
    --------
    RFECV : Recursive feature elimination with built-in cross-validated
        selection of the best number of features.
    SelectFromModel : Feature selection based on thresholds of importance
        weights.
    SequentialFeatureSelector : Sequential cross-validation based feature
        selection. Does not rely on importance weights.

    Notes
    -----
    Allows NaN/Inf in the input if the underlying estimator does as well.

    References
    ----------
    .. [1] Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., "Gene selection
           for cancer classification using support vector machines",
           Mach. Learn., 46(1-3), 389--422, 2002.

    Examples
    --------
    The following example shows how to retrieve the 5 most informative
    features in the Friedman #1 dataset.

    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.feature_selection import RFE
    >>> from sklearn.svm import SVR
    >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    >>> estimator = SVR(kernel="linear")
    >>> selector = RFE(estimator, n_features_to_select=5, step=1)
    >>> selector = selector.fit(X, y)
    >>> selector.support_
    array([ True,  True,  True,  True,  True, False, False, False, False,
           False])
    >>> selector.ranking_
    array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])
    """

    _parameter_constraints: dict = {
        "estimator": [HasMethods(["fit"])],
        "n_features_to_select": [
            None,
            Interval(RealNotInt, 0, 1, closed="right"),
            Interval(Integral, 0, None, closed="neither"),
        ],
        "step": [
            Interval(Integral, 0, None, closed="neither"),
            Interval(RealNotInt, 0, 1, closed="neither"),
        ],
        "verbose": ["verbose"],
        "importance_getter": [str, callable],
    }

    def __init__(
        self,
        estimator,
        *,
        n_features_to_select=None,
        step=1,
        verbose=0,
        importance_getter="auto",
    ):
        self.estimator = estimator
        self.n_features_to_select = n_features_to_select
        self.step = step
        self.importance_getter = importance_getter
        self.verbose = verbose

    @property
    def _estimator_type(self):
        return self.estimator._estimator_type

    @property
    def classes_(self):
        """Classes labels available when `estimator` is a classifier.

        Returns
        -------
        ndarray of shape (n_classes,)
        """
        return self.estimator_.classes_

    @_fit_context(
        # RFE.estimator is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y, **fit_params):
        """Fit the RFE model and then the underlying estimator on the selected features.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples.

        y : array-like of shape (n_samples,)
            The target values.

        **fit_params : dict
            Additional parameters passed to the `fit` method of the underlying
            estimator.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        return self._fit(X, y, **fit_params)

    def _fit(self, X, y, step_score=None, **fit_params):
        # Parameter step_score controls the calculation of self.scores_
        # step_score is not exposed to users
        # and is used when implementing RFECV
        # self.scores_ will not be calculated when calling _fit through fit
        tags = self._get_tags()
        X, y = self._validate_data(
            X,
            y,
            accept_sparse="csc",
            ensure_min_features=2,
            force_all_finite=not tags.get("allow_nan", True),
            multi_output=True,
        )

        # Initialization
        n_features = X.shape[1]
        if self.n_features_to_select is None:
            n_features_to_select = n_features // 2
        elif isinstance(self.n_features_to_select, Integral):  # int
            n_features_to_select = self.n_features_to_select
        else:  # float
            n_features_to_select = int(n_features * self.n_features_to_select)

        if 0.0 < self.step < 1.0:
            step = int(max(1, self.step * n_features))
        else:
            step = int(self.step)

        support_ = np.ones(n_features, dtype=bool)
        ranking_ = np.ones(n_features, dtype=int)

        if step_score:
            self.scores_ = []

        # Elimination
        while np.sum(support_) > n_features_to_select:
            # Remaining features
            features = np.arange(n_features)[support_]

            # Rank the remaining features
            estimator = clone(self.estimator)
            if self.verbose > 0:
                print("Fitting estimator with %d features." % np.sum(support_))

            estimator.fit(X[:, features], y, **fit_params)

            # Get importance and rank them
            importances = _get_feature_importances(
                estimator,
                self.importance_getter,
                transform_func="square",
            )
            ranks = np.argsort(importances)

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            # Eliminate the worst features
            threshold = min(step, np.sum(support_) - n_features_to_select)

            # Compute step score on the previous selection iteration
            # because 'estimator' must use features
            # that have not been eliminated yet
            if step_score:
                self.scores_.append(step_score(estimator, features))
            support_[features[ranks][:threshold]] = False
            ranking_[np.logical_not(support_)] += 1

        # Set final attributes
        features = np.arange(n_features)[support_]
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X[:, features], y, **fit_params)

        # Compute step score when only n_features_to_select features left
        if step_score:
            self.scores_.append(step_score(self.estimator_, features))
        self.n_features_ = support_.sum()
        self.support_ = support_
        self.ranking_ = ranking_

        return self
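
    # Worked example of the elimination schedule in `_fit` (illustrative
    # numbers): with n_features=10, n_features_to_select=3 and step=4, the
    # loop removes min(4, 10 - 3) = 4 features, then min(4, 6 - 3) = 3, so the
    # final iteration removes fewer than `step` features and lands exactly
    # on 3 selected features.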

    @available_if(_estimator_has("predict"))
    def predict(self, X):
        """Reduce X to the selected features and predict using the estimator.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The input samples.

        Returns
        -------
        y : array of shape [n_samples]
            The predicted target values.
        """
        check_is_fitted(self)
        return self.estimator_.predict(self.transform(X))

    @available_if(_estimator_has("score"))
    def score(self, X, y, **fit_params):
        """Reduce X to the selected features and return the score of the estimator.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The input samples.

        y : array of shape [n_samples]
            The target values.

        **fit_params : dict
            Parameters to pass to the `score` method of the underlying
            estimator.

            .. versionadded:: 1.0

        Returns
        -------
        score : float
            Score of the underlying base estimator computed with the selected
            features returned by `rfe.transform(X)` and `y`.
        """
        check_is_fitted(self)
        return self.estimator_.score(self.transform(X), y, **fit_params)

    def _get_support_mask(self):
        check_is_fitted(self)
        return self.support_

    @available_if(_estimator_has("decision_function"))
    def decision_function(self, X):
        """Compute the decision function of ``X``.

        Parameters
        ----------
        X : {array-like or sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        Returns
        -------
        score : array, shape = [n_samples, n_classes] or [n_samples]
            The decision function of the input samples. The order of the
            classes corresponds to that in the attribute :term:`classes_`.
            Regression and binary classification produce an array of
            shape [n_samples].
        """
        check_is_fitted(self)
        return self.estimator_.decision_function(self.transform(X))

    @available_if(_estimator_has("predict_proba"))
    def predict_proba(self, X):
        """Predict class probabilities for X.

        Parameters
        ----------
        X : {array-like or sparse matrix} of shape (n_samples, n_features)
            The input samples. Internally, it will be converted to
            ``dtype=np.float32`` and if a sparse matrix is provided
            to a sparse ``csr_matrix``.

        Returns
        -------
        p : array of shape (n_samples, n_classes)
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute :term:`classes_`.
        """
        check_is_fitted(self)
        return self.estimator_.predict_proba(self.transform(X))

    @available_if(_estimator_has("predict_log_proba"))
    def predict_log_proba(self, X):
        """Predict class log-probabilities for X.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The input samples.

        Returns
        -------
        p : array of shape (n_samples, n_classes)
            The class log-probabilities of the input samples. The order of the
            classes corresponds to that in the attribute :term:`classes_`.
        """
        check_is_fitted(self)
        return self.estimator_.predict_log_proba(self.transform(X))

    def _more_tags(self):
        return {
            "poor_score": True,
            "allow_nan": _safe_tags(self.estimator, key="allow_nan"),
            "requires_y": True,
        }
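

# A small sketch of the tag propagation in `_more_tags` above (estimator
# chosen only for illustration): wrapping an estimator whose `allow_nan` tag
# is True, e.g. HistGradientBoostingClassifier, makes RFE accept NaN inputs
# as well.
#
#     from sklearn.ensemble import HistGradientBoostingClassifier
#     _safe_tags(RFE(HistGradientBoostingClassifier()), key="allow_nan")  # True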


class RFECV(RFE):
    """Recursive feature elimination with cross-validation to select features.

    See glossary entry for :term:`cross-validation estimator`.

    Read more in the :ref:`User Guide <rfe>`.

    Parameters
    ----------
    estimator : ``Estimator`` instance
        A supervised learning estimator with a ``fit`` method that provides
        information about feature importance either through a ``coef_``
        attribute or through a ``feature_importances_`` attribute.

    step : int or float, default=1
        If greater than or equal to 1, then ``step`` corresponds to the
        (integer) number of features to remove at each iteration.
        If within (0.0, 1.0), then ``step`` corresponds to the percentage
        (rounded down) of features to remove at each iteration.
        Note that the last iteration may remove fewer than ``step`` features
        in order to reach ``min_features_to_select``.

    min_features_to_select : int, default=1
        The minimum number of features to be selected. This number of features
        will always be scored, even if the difference between the original
        feature count and ``min_features_to_select`` isn't divisible by
        ``step``.

        .. versionadded:: 0.20

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the default 5-fold cross-validation,
        - integer, to specify the number of folds,
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if the estimator is a classifier and ``y``
        is either binary or multiclass,
        :class:`~sklearn.model_selection.StratifiedKFold` is used. In all
        other cases, :class:`~sklearn.model_selection.KFold` is used.

        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        .. versionchanged:: 0.22
            ``cv`` default value of None changed from 3-fold to 5-fold.

    scoring : str, callable or None, default=None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    verbose : int, default=0
        Controls verbosity of output.

    n_jobs : int or None, default=None
        Number of cores to run in parallel while fitting across folds.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

        .. versionadded:: 0.18

    importance_getter : str or callable, default='auto'
        If 'auto', uses the feature importance either through a `coef_`
        or `feature_importances_` attributes of estimator.

        Also accepts a string that specifies an attribute name/path
        for extracting feature importance.
        For example, give `regressor_.coef_` in case of
        :class:`~sklearn.compose.TransformedTargetRegressor` or
        `named_steps.clf.feature_importances_` in case of
        :class:`~sklearn.pipeline.Pipeline` with its last step named `clf`.

        If `callable`, overrides the default feature importance getter.
        The callable is passed with the fitted estimator and it should
        return importance for each feature.

        .. versionadded:: 0.24

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        The classes labels. Only available when `estimator` is a classifier.

    estimator_ : ``Estimator`` instance
        The fitted estimator used to select features.

    cv_results_ : dict of ndarrays
        A dict with keys:

        split(k)_test_score : ndarray of shape (n_subsets_of_features,)
            The cross-validation scores across (k)th fold.

        mean_test_score : ndarray of shape (n_subsets_of_features,)
            Mean of scores over the folds.

        std_test_score : ndarray of shape (n_subsets_of_features,)
            Standard deviation of scores over the folds.

        .. versionadded:: 1.0

    n_features_ : int
        The number of selected features with cross-validation.

    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying estimator exposes such an attribute when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    ranking_ : ndarray of shape (n_features,)
        The feature ranking, such that `ranking_[i]` corresponds to the
        ranking position of the i-th feature. Selected (i.e., estimated
        best) features are assigned rank 1.

    support_ : ndarray of shape (n_features,)
        The mask of selected features.

    See Also
    --------
    RFE : Recursive feature elimination.

    Notes
    -----
    The size of all values in ``cv_results_`` is equal to
    ``ceil((n_features - min_features_to_select) / step) + 1``,
    where step is the number of features removed at each iteration.

    Allows NaN/Inf in the input if the underlying estimator does as well.

    References
    ----------
    .. [1] Guyon, I., Weston, J., Barnhill, S., & Vapnik, V., "Gene selection
           for cancer classification using support vector machines",
           Mach. Learn., 46(1-3), 389--422, 2002.

    Examples
    --------
    The following example shows how to retrieve the 5 informative features,
    which are not known a priori, in the Friedman #1 dataset.

    >>> from sklearn.datasets import make_friedman1
    >>> from sklearn.feature_selection import RFECV
    >>> from sklearn.svm import SVR
    >>> X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    >>> estimator = SVR(kernel="linear")
    >>> selector = RFECV(estimator, step=1, cv=5)
    >>> selector = selector.fit(X, y)
    >>> selector.support_
    array([ True,  True,  True,  True,  True, False, False, False, False,
           False])
    >>> selector.ranking_
    array([1, 1, 1, 1, 1, 6, 4, 3, 2, 5])
    """

    _parameter_constraints: dict = {
        **RFE._parameter_constraints,
        "min_features_to_select": [Interval(Integral, 0, None, closed="neither")],
        "cv": ["cv_object"],
        "scoring": [None, str, callable],
        "n_jobs": [None, Integral],
    }
    _parameter_constraints.pop("n_features_to_select")

    def __init__(
        self,
        estimator,
        *,
        step=1,
        min_features_to_select=1,
        cv=None,
        scoring=None,
        verbose=0,
        n_jobs=None,
        importance_getter="auto",
    ):
        self.estimator = estimator
        self.step = step
        self.importance_getter = importance_getter
        self.cv = cv
        self.scoring = scoring
        self.verbose = verbose
        self.n_jobs = n_jobs
        self.min_features_to_select = min_features_to_select

    @_fit_context(
        # RFECV.estimator is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y, groups=None):
        """Fit the RFE model and automatically tune the number of selected features.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the total number of features.

        y : array-like of shape (n_samples,)
            Target values (integers for classification, real numbers for
            regression).

        groups : array-like of shape (n_samples,) or None, default=None
            Group labels for the samples used while splitting the dataset into
            train/test set. Only used in conjunction with a "Group" :term:`cv`
            instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).

            .. versionadded:: 0.20

        Returns
        -------
        self : object
            Fitted estimator.
        """
        tags = self._get_tags()
        X, y = self._validate_data(
            X,
            y,
            accept_sparse="csr",
            ensure_min_features=2,
            force_all_finite=not tags.get("allow_nan", True),
            multi_output=True,
        )

        # Initialization
        cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)
        n_features = X.shape[1]

        if 0.0 < self.step < 1.0:
            step = int(max(1, self.step * n_features))
        else:
            step = int(self.step)

        # Build an RFE object, which will evaluate and score each possible
        # feature count, down to self.min_features_to_select
        rfe = RFE(
            estimator=self.estimator,
            n_features_to_select=self.min_features_to_select,
            importance_getter=self.importance_getter,
            step=self.step,
            verbose=self.verbose,
        )

        # Determine the number of subsets of features by fitting across
        # the train folds and choosing the "features_to_select" parameter
        # that gives the least averaged error across all folds.

        # Note that joblib raises a non-picklable error for bound methods
        # even if n_jobs is set to 1 with the default multiprocessing
        # backend.
        # This branching is done to make sure that user code that sets
        # n_jobs to 1 and provides bound methods as scorers is not broken
        # by the addition of the n_jobs parameter in version 0.18.
        if effective_n_jobs(self.n_jobs) == 1:
            parallel, func = list, _rfe_single_fit
        else:
            parallel = Parallel(n_jobs=self.n_jobs)
            func = delayed(_rfe_single_fit)

        scores = parallel(
            func(rfe, self.estimator, X, y, train, test, scorer)
            for train, test in cv.split(X, y, groups)
        )

        scores = np.array(scores)
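
        # Illustrative shapes (assuming, say, 5 CV folds and 8 candidate
        # feature counts): `scores` is (5, 8), where row i holds fold i's
        # scores and column j the scores after j elimination steps, starting
        # from the full feature set.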
        scores_sum = np.sum(scores, axis=0)
        scores_sum_rev = scores_sum[::-1]
        argmax_idx = len(scores_sum) - np.argmax(scores_sum_rev) - 1
        n_features_to_select = max(
            n_features - (argmax_idx * step), self.min_features_to_select
        )
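
        # Worked example of the tie-breaking above (illustrative numbers): for
        # scores_sum = [0.8, 0.9, 0.9, 0.7], the reversed argmax selects the
        # *last* occurrence of the maximum, giving argmax_idx = 2, i.e. the
        # candidate reached after two elimination steps; ties therefore favor
        # fewer features.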
        # Re-execute an elimination with best_k over the whole set
        rfe = RFE(
            estimator=self.estimator,
            n_features_to_select=n_features_to_select,
            step=self.step,
            importance_getter=self.importance_getter,
            verbose=self.verbose,
        )

        rfe.fit(X, y)

        # Set final attributes
        self.support_ = rfe.support_
        self.n_features_ = rfe.n_features_
        self.ranking_ = rfe.ranking_
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(self._transform(X), y)

        # Reverse the score arrays so that cv_results_ is ordered by
        # increasing number of features, consistent with the ordering
        # used before
        scores_rev = scores[:, ::-1]
        self.cv_results_ = {}
        self.cv_results_["mean_test_score"] = np.mean(scores_rev, axis=0)
        self.cv_results_["std_test_score"] = np.std(scores_rev, axis=0)
        for i in range(scores.shape[0]):
            self.cv_results_[f"split{i}_test_score"] = scores_rev[i]

        return self