  1. """
  2. Testing Recursive feature elimination
  3. """
  4. from operator import attrgetter
  5. import numpy as np
  6. import pytest
  7. from numpy.testing import assert_allclose, assert_array_almost_equal, assert_array_equal
  8. from scipy import sparse
  9. from sklearn.base import BaseEstimator, ClassifierMixin
  10. from sklearn.compose import TransformedTargetRegressor
  11. from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression
  12. from sklearn.datasets import load_iris, make_friedman1
  13. from sklearn.ensemble import RandomForestClassifier
  14. from sklearn.feature_selection import RFE, RFECV
  15. from sklearn.linear_model import LogisticRegression
  16. from sklearn.metrics import get_scorer, make_scorer, zero_one_loss
  17. from sklearn.model_selection import GroupKFold, cross_val_score
  18. from sklearn.pipeline import make_pipeline
  19. from sklearn.preprocessing import StandardScaler
  20. from sklearn.svm import SVC, SVR, LinearSVR
  21. from sklearn.utils import check_random_state
  22. from sklearn.utils._testing import ignore_warnings


class MockClassifier:
    """
    Dummy classifier to test recursive feature elimination
    """

    def __init__(self, foo_param=0):
        self.foo_param = foo_param

    def fit(self, X, y):
        assert len(X) == len(y)
        self.coef_ = np.ones(X.shape[1], dtype=np.float64)
        return self

    def predict(self, T):
        return T.shape[0]

    predict_proba = predict
    decision_function = predict
    transform = predict

    def score(self, X=None, y=None):
        return 0.0

    def get_params(self, deep=True):
        return {"foo_param": self.foo_param}

    def set_params(self, **params):
        return self

    def _more_tags(self):
        return {"allow_nan": True}
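
# Note: MockClassifier assigns a constant coef_, so RFE has no real signal to
# rank on; tests built on it exercise the selection mechanics (shapes,
# cv_results_ sizes, NaN handling) rather than selection quality.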


def test_rfe_features_importance():
    generator = check_random_state(0)
    iris = load_iris()
    # Add some irrelevant features. Random seed is set to make sure that
    # irrelevant features are always irrelevant.
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    clf = RandomForestClassifier(n_estimators=20, random_state=generator, max_depth=2)
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    assert len(rfe.ranking_) == X.shape[1]

    clf_svc = SVC(kernel="linear")
    rfe_svc = RFE(estimator=clf_svc, n_features_to_select=4, step=0.1)
    rfe_svc.fit(X, y)

    # Check if the supports are equal
    assert_array_equal(rfe.get_support(), rfe_svc.get_support())


def test_rfe():
    generator = check_random_state(0)
    iris = load_iris()
    # Add some irrelevant features. Random seed is set to make sure that
    # irrelevant features are always irrelevant.
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    X_sparse = sparse.csr_matrix(X)
    y = iris.target

    # dense model
    clf = SVC(kernel="linear")
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    X_r = rfe.transform(X)
    clf.fit(X_r, y)
    assert len(rfe.ranking_) == X.shape[1]

    # sparse model
    clf_sparse = SVC(kernel="linear")
    rfe_sparse = RFE(estimator=clf_sparse, n_features_to_select=4, step=0.1)
    rfe_sparse.fit(X_sparse, y)
    X_r_sparse = rfe_sparse.transform(X_sparse)

    assert X_r.shape == iris.data.shape
    assert_array_almost_equal(X_r[:10], iris.data[:10])

    assert_array_almost_equal(rfe.predict(X), clf.predict(iris.data))
    assert rfe.score(X, y) == clf.score(iris.data, iris.target)
    assert_array_almost_equal(X_r, X_r_sparse.toarray())


def test_RFE_fit_score_params():
    # Make sure RFE passes the metadata down to fit and score methods of the
    # underlying estimator
    class TestEstimator(BaseEstimator, ClassifierMixin):
        def fit(self, X, y, prop=None):
            if prop is None:
                raise ValueError("fit: prop cannot be None")
            self.svc_ = SVC(kernel="linear").fit(X, y)
            self.coef_ = self.svc_.coef_
            return self

        def score(self, X, y, prop=None):
            if prop is None:
                raise ValueError("score: prop cannot be None")
            return self.svc_.score(X, y)

    X, y = load_iris(return_X_y=True)
    with pytest.raises(ValueError, match="fit: prop cannot be None"):
        RFE(estimator=TestEstimator()).fit(X, y)
    with pytest.raises(ValueError, match="score: prop cannot be None"):
        RFE(estimator=TestEstimator()).fit(X, y, prop="foo").score(X, y)
    RFE(estimator=TestEstimator()).fit(X, y, prop="foo").score(X, y, prop="foo")


def test_rfe_percent_n_features():
    # test that a fractional n_features_to_select gives the same result as
    # the equivalent integer value
    generator = check_random_state(0)
    iris = load_iris()
    # Add some irrelevant features. Random seed is set to make sure that
    # irrelevant features are always irrelevant.
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    # there are 10 features in the data. We select 40%, i.e. 4 features.
    clf = SVC(kernel="linear")
    rfe_num = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe_num.fit(X, y)

    rfe_perc = RFE(estimator=clf, n_features_to_select=0.4, step=0.1)
    rfe_perc.fit(X, y)

    assert_array_equal(rfe_perc.ranking_, rfe_num.ranking_)
    assert_array_equal(rfe_perc.support_, rfe_num.support_)


def test_rfe_mockclassifier():
    generator = check_random_state(0)
    iris = load_iris()
    # Add some irrelevant features. Random seed is set to make sure that
    # irrelevant features are always irrelevant.
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    # dense model
    clf = MockClassifier()
    rfe = RFE(estimator=clf, n_features_to_select=4, step=0.1)
    rfe.fit(X, y)
    X_r = rfe.transform(X)
    clf.fit(X_r, y)
    assert len(rfe.ranking_) == X.shape[1]
    assert X_r.shape == iris.data.shape


def test_rfecv():
    generator = check_random_state(0)
    iris = load_iris()
    # Add some irrelevant features. Random seed is set to make sure that
    # irrelevant features are always irrelevant.
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)  # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    for key in rfecv.cv_results_.keys():
        assert len(rfecv.cv_results_[key]) == X.shape[1]
    assert len(rfecv.ranking_) == X.shape[1]
    X_r = rfecv.transform(X)

    # All the noisy variables were filtered out
    assert_array_equal(X_r, iris.data)
    # same in sparse
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=1)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Test using a customized loss function
    scoring = make_scorer(zero_one_loss, greater_is_better=False)
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scoring)
    ignore_warnings(rfecv.fit)(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test using a scorer
    scorer = get_scorer("accuracy")
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=scorer)
    rfecv.fit(X, y)
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    # Test fix on cv_results_
    def test_scorer(estimator, X, y):
        return 1.0

    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, scoring=test_scorer)
    rfecv.fit(X, y)

    # In the event of cross validation score ties, the expected behavior of
    # RFECV is to return the FEWEST features that maximize the CV score.
    # Because test_scorer always returns 1.0 in this example, RFECV should
    # reduce the dimensionality to a single feature (i.e. n_features_ = 1)
    assert rfecv.n_features_ == 1

    # Same as the first two tests, but with step=2
    rfecv = RFECV(estimator=SVC(kernel="linear"), step=2)
    rfecv.fit(X, y)
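    # With 10 features, step=2 and the default min_features_to_select=1,
    # RFE evaluates ceil((10 - 1) / 2) + 1 = 6 feature subsets, so each
    # cv_results_ entry holds 6 values.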
    for key in rfecv.cv_results_.keys():
        assert len(rfecv.cv_results_[key]) == 6
    assert len(rfecv.ranking_) == X.shape[1]
    X_r = rfecv.transform(X)
    assert_array_equal(X_r, iris.data)

    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=2)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)

    # Verifying that steps < 1 don't blow up.
    rfecv_sparse = RFECV(estimator=SVC(kernel="linear"), step=0.2)
    X_sparse = sparse.csr_matrix(X)
    rfecv_sparse.fit(X_sparse, y)
    X_r_sparse = rfecv_sparse.transform(X_sparse)
    assert_array_equal(X_r_sparse.toarray(), iris.data)


def test_rfecv_mockclassifier():
    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)  # regression test: list should be supported

    # Test using the score function
    rfecv = RFECV(estimator=MockClassifier(), step=1)
    rfecv.fit(X, y)
    # non-regression test for missing worst feature:
    for key in rfecv.cv_results_.keys():
        assert len(rfecv.cv_results_[key]) == X.shape[1]
    assert len(rfecv.ranking_) == X.shape[1]


def test_rfecv_verbose_output():
    # Check verbose=1 is producing an output.
    import sys
    from io import StringIO

    sys.stdout = StringIO()

    generator = check_random_state(0)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)

    rfecv = RFECV(estimator=SVC(kernel="linear"), step=1, verbose=1)
    rfecv.fit(X, y)

    verbose_output = sys.stdout
    verbose_output.seek(0)
    assert len(verbose_output.readline()) > 0


def test_rfecv_cv_results_size(global_random_seed):
    generator = check_random_state(global_random_seed)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = list(iris.target)  # regression test: list should be supported

    # Non-regression test for varying combinations of step and
    # min_features_to_select.
    for step, min_features_to_select in [[2, 1], [2, 2], [3, 3]]:
        rfecv = RFECV(
            estimator=MockClassifier(),
            step=step,
            min_features_to_select=min_features_to_select,
        )
        rfecv.fit(X, y)
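        # One score per evaluated feature subset, e.g. step=2 and
        # min_features_to_select=1 on 10 features gives
        # ceil((10 - 1) / 2) + 1 = 6 scores.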
        score_len = np.ceil((X.shape[1] - min_features_to_select) / step) + 1

        for key in rfecv.cv_results_.keys():
            assert len(rfecv.cv_results_[key]) == score_len

        assert len(rfecv.ranking_) == X.shape[1]
        assert rfecv.n_features_ >= min_features_to_select


def test_rfe_estimator_tags():
    rfe = RFE(SVC(kernel="linear"))
    assert rfe._estimator_type == "classifier"
    # make sure that cross-validation is stratified
    iris = load_iris()
    score = cross_val_score(rfe, iris.data, iris.target)
    assert score.min() > 0.7


def test_rfe_min_step(global_random_seed):
    n_features = 10
    X, y = make_friedman1(
        n_samples=50, n_features=n_features, random_state=global_random_seed
    )
    n_samples, n_features = X.shape
    estimator = SVR(kernel="linear")

    # Test when floor(step * n_features) <= 0
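    # (floor(0.01 * 10) = 0, so RFE should fall back to eliminating at least
    # one feature per iteration)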
    selector = RFE(estimator, step=0.01)
    sel = selector.fit(X, y)
    assert sel.support_.sum() == n_features // 2

    # Test when step is between (0, 1) and floor(step * n_features) > 0
    selector = RFE(estimator, step=0.20)
    sel = selector.fit(X, y)
    assert sel.support_.sum() == n_features // 2

    # Test when step is an integer
    selector = RFE(estimator, step=5)
    sel = selector.fit(X, y)
    assert sel.support_.sum() == n_features // 2


def test_number_of_subsets_of_features(global_random_seed):
    # In RFE, 'number_of_subsets_of_features'
    # = the number of iterations in '_fit'
    # = max(ranking_)
    # = 1 + (n_features + step - n_features_to_select - 1) // step
    # After optimization #4534, this number
    # = 1 + np.ceil((n_features - n_features_to_select) / float(step))
    # This test case is to test their equivalence, refer to #4534 and #3824
    def formula1(n_features, n_features_to_select, step):
        return 1 + ((n_features + step - n_features_to_select - 1) // step)

    def formula2(n_features, n_features_to_select, step):
        return 1 + np.ceil((n_features - n_features_to_select) / float(step))
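    # Worked example: n_features=11, n_features_to_select=3, step=2 gives
    # formula1 = 1 + (11 + 2 - 3 - 1) // 2 = 5 and
    # formula2 = 1 + ceil((11 - 3) / 2) = 5.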

    # RFE
    # Case 1, n_features - n_features_to_select is divisible by step
    # Case 2, n_features - n_features_to_select is not divisible by step
    n_features_list = [11, 11]
    n_features_to_select_list = [3, 3]
    step_list = [2, 3]
    for n_features, n_features_to_select, step in zip(
        n_features_list, n_features_to_select_list, step_list
    ):
        generator = check_random_state(global_random_seed)
        X = generator.normal(size=(100, n_features))
        y = generator.rand(100).round()
        rfe = RFE(
            estimator=SVC(kernel="linear"),
            n_features_to_select=n_features_to_select,
            step=step,
        )
        rfe.fit(X, y)
        # this number also equals the maximum of ranking_
        assert np.max(rfe.ranking_) == formula1(n_features, n_features_to_select, step)
        assert np.max(rfe.ranking_) == formula2(n_features, n_features_to_select, step)

    # In RFECV, 'fit' calls 'RFE._fit'
    # 'number_of_subsets_of_features' of RFE
    # = the size of each score in 'cv_results_' of RFECV
    # = the number of iterations of the for loop before optimization #4534

    # RFECV, n_features_to_select = 1
    # Case 1, n_features - 1 is divisible by step
    # Case 2, n_features - 1 is not divisible by step
    n_features_to_select = 1
    n_features_list = [11, 10]
    step_list = [2, 2]
    for n_features, step in zip(n_features_list, step_list):
        generator = check_random_state(global_random_seed)
        X = generator.normal(size=(100, n_features))
        y = generator.rand(100).round()
        rfecv = RFECV(estimator=SVC(kernel="linear"), step=step)
        rfecv.fit(X, y)

        for key in rfecv.cv_results_.keys():
            assert len(rfecv.cv_results_[key]) == formula1(
                n_features, n_features_to_select, step
            )
            assert len(rfecv.cv_results_[key]) == formula2(
                n_features, n_features_to_select, step
            )


def test_rfe_cv_n_jobs(global_random_seed):
    generator = check_random_state(global_random_seed)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    rfecv = RFECV(estimator=SVC(kernel="linear"))
    rfecv.fit(X, y)
    rfecv_ranking = rfecv.ranking_
    rfecv_cv_results_ = rfecv.cv_results_

    rfecv.set_params(n_jobs=2)
    rfecv.fit(X, y)
    assert_array_almost_equal(rfecv.ranking_, rfecv_ranking)

    assert rfecv_cv_results_.keys() == rfecv.cv_results_.keys()
    for key in rfecv_cv_results_.keys():
        assert rfecv_cv_results_[key] == pytest.approx(rfecv.cv_results_[key])


def test_rfe_cv_groups():
    generator = check_random_state(0)
    iris = load_iris()
    number_groups = 4
    groups = np.floor(np.linspace(0, number_groups, len(iris.target)))
    X = iris.data
    y = (iris.target > 0).astype(int)

    est_groups = RFECV(
        estimator=RandomForestClassifier(random_state=generator),
        step=1,
        scoring="accuracy",
        cv=GroupKFold(n_splits=2),
    )
    est_groups.fit(X, y, groups=groups)
    assert est_groups.n_features_ > 0


@pytest.mark.parametrize(
    "importance_getter", [attrgetter("regressor_.coef_"), "regressor_.coef_"]
)
@pytest.mark.parametrize("selector, expected_n_features", [(RFE, 5), (RFECV, 4)])
def test_rfe_wrapped_estimator(importance_getter, selector, expected_n_features):
    # Non-regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/15312
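    # importance_getter accepts either a callable or a dotted attribute path;
    # both variants above resolve coef_ on the regressor wrapped inside
    # TransformedTargetRegressor.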
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    estimator = LinearSVR(dual="auto", random_state=0)
    log_estimator = TransformedTargetRegressor(
        regressor=estimator, func=np.log, inverse_func=np.exp
    )
    selector = selector(log_estimator, importance_getter=importance_getter)
    sel = selector.fit(X, y)
    assert sel.support_.sum() == expected_n_features


@pytest.mark.parametrize(
    "importance_getter, err_type",
    [
        ("auto", ValueError),
        ("random", AttributeError),
        (lambda x: x.importance, AttributeError),
    ],
)
@pytest.mark.parametrize("Selector", [RFE, RFECV])
def test_rfe_importance_getter_validation(importance_getter, err_type, Selector):
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=42)
    estimator = LinearSVR(dual="auto")
    log_estimator = TransformedTargetRegressor(
        regressor=estimator, func=np.log, inverse_func=np.exp
    )

    with pytest.raises(err_type):
        model = Selector(log_estimator, importance_getter=importance_getter)
        model.fit(X, y)


@pytest.mark.parametrize("cv", [None, 5])
def test_rfe_allow_nan_inf_in_x(cv):
    iris = load_iris()
    X = iris.data
    y = iris.target

    # add nan and inf value to X
    X[0][0] = np.nan
    X[0][1] = np.inf

    clf = MockClassifier()
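    # MockClassifier declares allow_nan=True via _more_tags, which relaxes
    # RFE's input validation enough for the NaN/inf values to reach fit.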
    if cv is not None:
        rfe = RFECV(estimator=clf, cv=cv)
    else:
        rfe = RFE(estimator=clf)
    rfe.fit(X, y)
    rfe.transform(X)


def test_w_pipeline_2d_coef_():
    pipeline = make_pipeline(StandardScaler(), LogisticRegression())

    data, y = load_iris(return_X_y=True)
    sfm = RFE(
        pipeline,
        n_features_to_select=2,
        importance_getter="named_steps.logisticregression.coef_",
    )
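    # The dotted path digs the 2-D coef_ (shape (n_classes, n_features)) out
    # of the pipeline's final step; RFE aggregates it across classes into a
    # single importance per feature before ranking.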
    sfm.fit(data, y)
    assert sfm.transform(data).shape[1] == 2


def test_rfecv_std_and_mean(global_random_seed):
    generator = check_random_state(global_random_seed)
    iris = load_iris()
    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
    y = iris.target

    rfecv = RFECV(estimator=SVC(kernel="linear"))
    rfecv.fit(X, y)
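    # cv_results_ contains one split{i}_test_score array per CV fold plus the
    # aggregated mean_test_score and std_test_score, hence the "- 2" below.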
    n_split_keys = len(rfecv.cv_results_) - 2
    split_keys = [f"split{i}_test_score" for i in range(n_split_keys)]

    cv_scores = np.asarray([rfecv.cv_results_[key] for key in split_keys])
    expected_mean = np.mean(cv_scores, axis=0)
    expected_std = np.std(cv_scores, axis=0)

    assert_allclose(rfecv.cv_results_["mean_test_score"], expected_mean)
    assert_allclose(rfecv.cv_results_["std_test_score"], expected_std)


@pytest.mark.parametrize("ClsRFE", [RFE, RFECV])
def test_multioutput(ClsRFE):
    X = np.random.normal(size=(10, 3))
    y = np.random.randint(2, size=(10, 2))
    clf = RandomForestClassifier(n_estimators=5)
    rfe_test = ClsRFE(clf)
    rfe_test.fit(X, y)


@pytest.mark.parametrize("ClsRFE", [RFE, RFECV])
@pytest.mark.parametrize("PLSEstimator", [CCA, PLSCanonical, PLSRegression])
def test_rfe_pls(ClsRFE, PLSEstimator):
    """Check the behaviour of RFE with PLS estimators.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12410
    """
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    estimator = PLSEstimator(n_components=1)
    selector = ClsRFE(estimator, step=1).fit(X, y)
    assert selector.score(X, y) > 0.5