test_feature_select.py 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985
  1. """
  2. TODO: cross-check the F-value against statsmodels
  3. """
  4. import itertools
  5. import warnings
  6. import numpy as np
  7. import pytest
  8. from numpy.testing import assert_allclose
  9. from scipy import sparse, stats
  10. from sklearn.datasets import load_iris, make_classification, make_regression
  11. from sklearn.feature_selection import (
  12. GenericUnivariateSelect,
  13. SelectFdr,
  14. SelectFpr,
  15. SelectFwe,
  16. SelectKBest,
  17. SelectPercentile,
  18. chi2,
  19. f_classif,
  20. f_oneway,
  21. f_regression,
  22. mutual_info_classif,
  23. mutual_info_regression,
  24. r_regression,
  25. )
  26. from sklearn.utils import safe_mask
  27. from sklearn.utils._testing import (
  28. _convert_container,
  29. assert_almost_equal,
  30. assert_array_almost_equal,
  31. assert_array_equal,
  32. ignore_warnings,
  33. )
  34. ##############################################################################
  35. # Test the score functions
  36. def test_f_oneway_vs_scipy_stats():
  37. # Test that our f_oneway gives the same result as scipy.stats
  38. rng = np.random.RandomState(0)
  39. X1 = rng.randn(10, 3)
  40. X2 = 1 + rng.randn(10, 3)
  41. f, pv = stats.f_oneway(X1, X2)
  42. f2, pv2 = f_oneway(X1, X2)
  43. assert np.allclose(f, f2)
  44. assert np.allclose(pv, pv2)
  45. def test_f_oneway_ints():
  46. # Smoke test f_oneway on integers: that it does raise casting errors
  47. # with recent numpys
  48. rng = np.random.RandomState(0)
  49. X = rng.randint(10, size=(10, 10))
  50. y = np.arange(10)
  51. fint, pint = f_oneway(X, y)
  52. # test that is gives the same result as with float
  53. f, p = f_oneway(X.astype(float), y)
  54. assert_array_almost_equal(f, fint, decimal=4)
  55. assert_array_almost_equal(p, pint, decimal=4)
  56. def test_f_classif():
  57. # Test whether the F test yields meaningful results
  58. # on a simple simulated classification problem
  59. X, y = make_classification(
  60. n_samples=200,
  61. n_features=20,
  62. n_informative=3,
  63. n_redundant=2,
  64. n_repeated=0,
  65. n_classes=8,
  66. n_clusters_per_class=1,
  67. flip_y=0.0,
  68. class_sep=10,
  69. shuffle=False,
  70. random_state=0,
  71. )
  72. F, pv = f_classif(X, y)
  73. F_sparse, pv_sparse = f_classif(sparse.csr_matrix(X), y)
  74. assert (F > 0).all()
  75. assert (pv > 0).all()
  76. assert (pv < 1).all()
  77. assert (pv[:5] < 0.05).all()
  78. assert (pv[5:] > 1.0e-4).all()
  79. assert_array_almost_equal(F_sparse, F)
  80. assert_array_almost_equal(pv_sparse, pv)
  81. @pytest.mark.parametrize("center", [True, False])
  82. def test_r_regression(center):
  83. X, y = make_regression(
  84. n_samples=2000, n_features=20, n_informative=5, shuffle=False, random_state=0
  85. )
  86. corr_coeffs = r_regression(X, y, center=center)
  87. assert (-1 < corr_coeffs).all()
  88. assert (corr_coeffs < 1).all()
  89. sparse_X = _convert_container(X, "sparse")
  90. sparse_corr_coeffs = r_regression(sparse_X, y, center=center)
  91. assert_allclose(sparse_corr_coeffs, corr_coeffs)
  92. # Testing against numpy for reference
  93. Z = np.hstack((X, y[:, np.newaxis]))
  94. correlation_matrix = np.corrcoef(Z, rowvar=False)
  95. np_corr_coeffs = correlation_matrix[:-1, -1]
  96. assert_array_almost_equal(np_corr_coeffs, corr_coeffs, decimal=3)
  97. def test_f_regression():
  98. # Test whether the F test yields meaningful results
  99. # on a simple simulated regression problem
  100. X, y = make_regression(
  101. n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0
  102. )
  103. F, pv = f_regression(X, y)
  104. assert (F > 0).all()
  105. assert (pv > 0).all()
  106. assert (pv < 1).all()
  107. assert (pv[:5] < 0.05).all()
  108. assert (pv[5:] > 1.0e-4).all()
  109. # with centering, compare with sparse
  110. F, pv = f_regression(X, y, center=True)
  111. F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=True)
  112. assert_allclose(F_sparse, F)
  113. assert_allclose(pv_sparse, pv)
  114. # again without centering, compare with sparse
  115. F, pv = f_regression(X, y, center=False)
  116. F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=False)
  117. assert_allclose(F_sparse, F)
  118. assert_allclose(pv_sparse, pv)
  119. def test_f_regression_input_dtype():
  120. # Test whether f_regression returns the same value
  121. # for any numeric data_type
  122. rng = np.random.RandomState(0)
  123. X = rng.rand(10, 20)
  124. y = np.arange(10).astype(int)
  125. F1, pv1 = f_regression(X, y)
  126. F2, pv2 = f_regression(X, y.astype(float))
  127. assert_allclose(F1, F2, 5)
  128. assert_allclose(pv1, pv2, 5)
  129. def test_f_regression_center():
  130. # Test whether f_regression preserves dof according to 'center' argument
  131. # We use two centered variates so we have a simple relationship between
  132. # F-score with variates centering and F-score without variates centering.
  133. # Create toy example
  134. X = np.arange(-5, 6).reshape(-1, 1) # X has zero mean
  135. n_samples = X.size
  136. Y = np.ones(n_samples)
  137. Y[::2] *= -1.0
  138. Y[0] = 0.0 # have Y mean being null
  139. F1, _ = f_regression(X, Y, center=True)
  140. F2, _ = f_regression(X, Y, center=False)
  141. assert_allclose(F1 * (n_samples - 1.0) / (n_samples - 2.0), F2)
  142. assert_almost_equal(F2[0], 0.232558139) # value from statsmodels OLS
  143. @pytest.mark.parametrize(
  144. "X, y, expected_corr_coef, force_finite",
  145. [
  146. (
  147. # A feature in X is constant - forcing finite
  148. np.array([[2, 1], [2, 0], [2, 10], [2, 4]]),
  149. np.array([0, 1, 1, 0]),
  150. np.array([0.0, 0.32075]),
  151. True,
  152. ),
  153. (
  154. # The target y is constant - forcing finite
  155. np.array([[5, 1], [3, 0], [2, 10], [8, 4]]),
  156. np.array([0, 0, 0, 0]),
  157. np.array([0.0, 0.0]),
  158. True,
  159. ),
  160. (
  161. # A feature in X is constant - not forcing finite
  162. np.array([[2, 1], [2, 0], [2, 10], [2, 4]]),
  163. np.array([0, 1, 1, 0]),
  164. np.array([np.nan, 0.32075]),
  165. False,
  166. ),
  167. (
  168. # The target y is constant - not forcing finite
  169. np.array([[5, 1], [3, 0], [2, 10], [8, 4]]),
  170. np.array([0, 0, 0, 0]),
  171. np.array([np.nan, np.nan]),
  172. False,
  173. ),
  174. ],
  175. )
  176. def test_r_regression_force_finite(X, y, expected_corr_coef, force_finite):
  177. """Check the behaviour of `force_finite` for some corner cases with `r_regression`.
  178. Non-regression test for:
  179. https://github.com/scikit-learn/scikit-learn/issues/15672
  180. """
  181. with warnings.catch_warnings():
  182. warnings.simplefilter("error", RuntimeWarning)
  183. corr_coef = r_regression(X, y, force_finite=force_finite)
  184. np.testing.assert_array_almost_equal(corr_coef, expected_corr_coef)
  185. @pytest.mark.parametrize(
  186. "X, y, expected_f_statistic, expected_p_values, force_finite",
  187. [
  188. (
  189. # A feature in X is constant - forcing finite
  190. np.array([[2, 1], [2, 0], [2, 10], [2, 4]]),
  191. np.array([0, 1, 1, 0]),
  192. np.array([0.0, 0.2293578]),
  193. np.array([1.0, 0.67924985]),
  194. True,
  195. ),
  196. (
  197. # The target y is constant - forcing finite
  198. np.array([[5, 1], [3, 0], [2, 10], [8, 4]]),
  199. np.array([0, 0, 0, 0]),
  200. np.array([0.0, 0.0]),
  201. np.array([1.0, 1.0]),
  202. True,
  203. ),
  204. (
  205. # Feature in X correlated with y - forcing finite
  206. np.array([[0, 1], [1, 0], [2, 10], [3, 4]]),
  207. np.array([0, 1, 2, 3]),
  208. np.array([np.finfo(np.float64).max, 0.845433]),
  209. np.array([0.0, 0.454913]),
  210. True,
  211. ),
  212. (
  213. # Feature in X anti-correlated with y - forcing finite
  214. np.array([[3, 1], [2, 0], [1, 10], [0, 4]]),
  215. np.array([0, 1, 2, 3]),
  216. np.array([np.finfo(np.float64).max, 0.845433]),
  217. np.array([0.0, 0.454913]),
  218. True,
  219. ),
  220. (
  221. # A feature in X is constant - not forcing finite
  222. np.array([[2, 1], [2, 0], [2, 10], [2, 4]]),
  223. np.array([0, 1, 1, 0]),
  224. np.array([np.nan, 0.2293578]),
  225. np.array([np.nan, 0.67924985]),
  226. False,
  227. ),
  228. (
  229. # The target y is constant - not forcing finite
  230. np.array([[5, 1], [3, 0], [2, 10], [8, 4]]),
  231. np.array([0, 0, 0, 0]),
  232. np.array([np.nan, np.nan]),
  233. np.array([np.nan, np.nan]),
  234. False,
  235. ),
  236. (
  237. # Feature in X correlated with y - not forcing finite
  238. np.array([[0, 1], [1, 0], [2, 10], [3, 4]]),
  239. np.array([0, 1, 2, 3]),
  240. np.array([np.inf, 0.845433]),
  241. np.array([0.0, 0.454913]),
  242. False,
  243. ),
  244. (
  245. # Feature in X anti-correlated with y - not forcing finite
  246. np.array([[3, 1], [2, 0], [1, 10], [0, 4]]),
  247. np.array([0, 1, 2, 3]),
  248. np.array([np.inf, 0.845433]),
  249. np.array([0.0, 0.454913]),
  250. False,
  251. ),
  252. ],
  253. )
  254. def test_f_regression_corner_case(
  255. X, y, expected_f_statistic, expected_p_values, force_finite
  256. ):
  257. """Check the behaviour of `force_finite` for some corner cases with `f_regression`.
  258. Non-regression test for:
  259. https://github.com/scikit-learn/scikit-learn/issues/15672
  260. """
  261. with warnings.catch_warnings():
  262. warnings.simplefilter("error", RuntimeWarning)
  263. f_statistic, p_values = f_regression(X, y, force_finite=force_finite)
  264. np.testing.assert_array_almost_equal(f_statistic, expected_f_statistic)
  265. np.testing.assert_array_almost_equal(p_values, expected_p_values)
  266. def test_f_classif_multi_class():
  267. # Test whether the F test yields meaningful results
  268. # on a simple simulated classification problem
  269. X, y = make_classification(
  270. n_samples=200,
  271. n_features=20,
  272. n_informative=3,
  273. n_redundant=2,
  274. n_repeated=0,
  275. n_classes=8,
  276. n_clusters_per_class=1,
  277. flip_y=0.0,
  278. class_sep=10,
  279. shuffle=False,
  280. random_state=0,
  281. )
  282. F, pv = f_classif(X, y)
  283. assert (F > 0).all()
  284. assert (pv > 0).all()
  285. assert (pv < 1).all()
  286. assert (pv[:5] < 0.05).all()
  287. assert (pv[5:] > 1.0e-4).all()
  288. def test_select_percentile_classif():
  289. # Test whether the relative univariate feature selection
  290. # gets the correct items in a simple classification problem
  291. # with the percentile heuristic
  292. X, y = make_classification(
  293. n_samples=200,
  294. n_features=20,
  295. n_informative=3,
  296. n_redundant=2,
  297. n_repeated=0,
  298. n_classes=8,
  299. n_clusters_per_class=1,
  300. flip_y=0.0,
  301. class_sep=10,
  302. shuffle=False,
  303. random_state=0,
  304. )
  305. univariate_filter = SelectPercentile(f_classif, percentile=25)
  306. X_r = univariate_filter.fit(X, y).transform(X)
  307. X_r2 = (
  308. GenericUnivariateSelect(f_classif, mode="percentile", param=25)
  309. .fit(X, y)
  310. .transform(X)
  311. )
  312. assert_array_equal(X_r, X_r2)
  313. support = univariate_filter.get_support()
  314. gtruth = np.zeros(20)
  315. gtruth[:5] = 1
  316. assert_array_equal(support, gtruth)
  317. def test_select_percentile_classif_sparse():
  318. # Test whether the relative univariate feature selection
  319. # gets the correct items in a simple classification problem
  320. # with the percentile heuristic
  321. X, y = make_classification(
  322. n_samples=200,
  323. n_features=20,
  324. n_informative=3,
  325. n_redundant=2,
  326. n_repeated=0,
  327. n_classes=8,
  328. n_clusters_per_class=1,
  329. flip_y=0.0,
  330. class_sep=10,
  331. shuffle=False,
  332. random_state=0,
  333. )
  334. X = sparse.csr_matrix(X)
  335. univariate_filter = SelectPercentile(f_classif, percentile=25)
  336. X_r = univariate_filter.fit(X, y).transform(X)
  337. X_r2 = (
  338. GenericUnivariateSelect(f_classif, mode="percentile", param=25)
  339. .fit(X, y)
  340. .transform(X)
  341. )
  342. assert_array_equal(X_r.toarray(), X_r2.toarray())
  343. support = univariate_filter.get_support()
  344. gtruth = np.zeros(20)
  345. gtruth[:5] = 1
  346. assert_array_equal(support, gtruth)
  347. X_r2inv = univariate_filter.inverse_transform(X_r2)
  348. assert sparse.issparse(X_r2inv)
  349. support_mask = safe_mask(X_r2inv, support)
  350. assert X_r2inv.shape == X.shape
  351. assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
  352. # Check other columns are empty
  353. assert X_r2inv.getnnz() == X_r.getnnz()
  354. ##############################################################################
  355. # Test univariate selection in classification settings
  356. def test_select_kbest_classif():
  357. # Test whether the relative univariate feature selection
  358. # gets the correct items in a simple classification problem
  359. # with the k best heuristic
  360. X, y = make_classification(
  361. n_samples=200,
  362. n_features=20,
  363. n_informative=3,
  364. n_redundant=2,
  365. n_repeated=0,
  366. n_classes=8,
  367. n_clusters_per_class=1,
  368. flip_y=0.0,
  369. class_sep=10,
  370. shuffle=False,
  371. random_state=0,
  372. )
  373. univariate_filter = SelectKBest(f_classif, k=5)
  374. X_r = univariate_filter.fit(X, y).transform(X)
  375. X_r2 = (
  376. GenericUnivariateSelect(f_classif, mode="k_best", param=5)
  377. .fit(X, y)
  378. .transform(X)
  379. )
  380. assert_array_equal(X_r, X_r2)
  381. support = univariate_filter.get_support()
  382. gtruth = np.zeros(20)
  383. gtruth[:5] = 1
  384. assert_array_equal(support, gtruth)
  385. def test_select_kbest_all():
  386. # Test whether k="all" correctly returns all features.
  387. X, y = make_classification(
  388. n_samples=20, n_features=10, shuffle=False, random_state=0
  389. )
  390. univariate_filter = SelectKBest(f_classif, k="all")
  391. X_r = univariate_filter.fit(X, y).transform(X)
  392. assert_array_equal(X, X_r)
  393. # Non-regression test for:
  394. # https://github.com/scikit-learn/scikit-learn/issues/24949
  395. X_r2 = (
  396. GenericUnivariateSelect(f_classif, mode="k_best", param="all")
  397. .fit(X, y)
  398. .transform(X)
  399. )
  400. assert_array_equal(X_r, X_r2)
  401. @pytest.mark.parametrize("dtype_in", [np.float32, np.float64])
  402. def test_select_kbest_zero(dtype_in):
  403. # Test whether k=0 correctly returns no features.
  404. X, y = make_classification(
  405. n_samples=20, n_features=10, shuffle=False, random_state=0
  406. )
  407. X = X.astype(dtype_in)
  408. univariate_filter = SelectKBest(f_classif, k=0)
  409. univariate_filter.fit(X, y)
  410. support = univariate_filter.get_support()
  411. gtruth = np.zeros(10, dtype=bool)
  412. assert_array_equal(support, gtruth)
  413. with pytest.warns(UserWarning, match="No features were selected"):
  414. X_selected = univariate_filter.transform(X)
  415. assert X_selected.shape == (20, 0)
  416. assert X_selected.dtype == dtype_in
  417. def test_select_heuristics_classif():
  418. # Test whether the relative univariate feature selection
  419. # gets the correct items in a simple classification problem
  420. # with the fdr, fwe and fpr heuristics
  421. X, y = make_classification(
  422. n_samples=200,
  423. n_features=20,
  424. n_informative=3,
  425. n_redundant=2,
  426. n_repeated=0,
  427. n_classes=8,
  428. n_clusters_per_class=1,
  429. flip_y=0.0,
  430. class_sep=10,
  431. shuffle=False,
  432. random_state=0,
  433. )
  434. univariate_filter = SelectFwe(f_classif, alpha=0.01)
  435. X_r = univariate_filter.fit(X, y).transform(X)
  436. gtruth = np.zeros(20)
  437. gtruth[:5] = 1
  438. for mode in ["fdr", "fpr", "fwe"]:
  439. X_r2 = (
  440. GenericUnivariateSelect(f_classif, mode=mode, param=0.01)
  441. .fit(X, y)
  442. .transform(X)
  443. )
  444. assert_array_equal(X_r, X_r2)
  445. support = univariate_filter.get_support()
  446. assert_allclose(support, gtruth)
  447. ##############################################################################
  448. # Test univariate selection in regression settings
  449. def assert_best_scores_kept(score_filter):
  450. scores = score_filter.scores_
  451. support = score_filter.get_support()
  452. assert_allclose(np.sort(scores[support]), np.sort(scores)[-support.sum() :])
  453. def test_select_percentile_regression():
  454. # Test whether the relative univariate feature selection
  455. # gets the correct items in a simple regression problem
  456. # with the percentile heuristic
  457. X, y = make_regression(
  458. n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0
  459. )
  460. univariate_filter = SelectPercentile(f_regression, percentile=25)
  461. X_r = univariate_filter.fit(X, y).transform(X)
  462. assert_best_scores_kept(univariate_filter)
  463. X_r2 = (
  464. GenericUnivariateSelect(f_regression, mode="percentile", param=25)
  465. .fit(X, y)
  466. .transform(X)
  467. )
  468. assert_array_equal(X_r, X_r2)
  469. support = univariate_filter.get_support()
  470. gtruth = np.zeros(20)
  471. gtruth[:5] = 1
  472. assert_array_equal(support, gtruth)
  473. X_2 = X.copy()
  474. X_2[:, np.logical_not(support)] = 0
  475. assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))
  476. # Check inverse_transform respects dtype
  477. assert_array_equal(
  478. X_2.astype(bool), univariate_filter.inverse_transform(X_r.astype(bool))
  479. )
  480. def test_select_percentile_regression_full():
  481. # Test whether the relative univariate feature selection
  482. # selects all features when '100%' is asked.
  483. X, y = make_regression(
  484. n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0
  485. )
  486. univariate_filter = SelectPercentile(f_regression, percentile=100)
  487. X_r = univariate_filter.fit(X, y).transform(X)
  488. assert_best_scores_kept(univariate_filter)
  489. X_r2 = (
  490. GenericUnivariateSelect(f_regression, mode="percentile", param=100)
  491. .fit(X, y)
  492. .transform(X)
  493. )
  494. assert_array_equal(X_r, X_r2)
  495. support = univariate_filter.get_support()
  496. gtruth = np.ones(20)
  497. assert_array_equal(support, gtruth)
  498. def test_select_kbest_regression():
  499. # Test whether the relative univariate feature selection
  500. # gets the correct items in a simple regression problem
  501. # with the k best heuristic
  502. X, y = make_regression(
  503. n_samples=200,
  504. n_features=20,
  505. n_informative=5,
  506. shuffle=False,
  507. random_state=0,
  508. noise=10,
  509. )
  510. univariate_filter = SelectKBest(f_regression, k=5)
  511. X_r = univariate_filter.fit(X, y).transform(X)
  512. assert_best_scores_kept(univariate_filter)
  513. X_r2 = (
  514. GenericUnivariateSelect(f_regression, mode="k_best", param=5)
  515. .fit(X, y)
  516. .transform(X)
  517. )
  518. assert_array_equal(X_r, X_r2)
  519. support = univariate_filter.get_support()
  520. gtruth = np.zeros(20)
  521. gtruth[:5] = 1
  522. assert_array_equal(support, gtruth)
  523. def test_select_heuristics_regression():
  524. # Test whether the relative univariate feature selection
  525. # gets the correct items in a simple regression problem
  526. # with the fpr, fdr or fwe heuristics
  527. X, y = make_regression(
  528. n_samples=200,
  529. n_features=20,
  530. n_informative=5,
  531. shuffle=False,
  532. random_state=0,
  533. noise=10,
  534. )
  535. univariate_filter = SelectFpr(f_regression, alpha=0.01)
  536. X_r = univariate_filter.fit(X, y).transform(X)
  537. gtruth = np.zeros(20)
  538. gtruth[:5] = 1
  539. for mode in ["fdr", "fpr", "fwe"]:
  540. X_r2 = (
  541. GenericUnivariateSelect(f_regression, mode=mode, param=0.01)
  542. .fit(X, y)
  543. .transform(X)
  544. )
  545. assert_array_equal(X_r, X_r2)
  546. support = univariate_filter.get_support()
  547. assert_array_equal(support[:5], np.ones((5,), dtype=bool))
  548. assert np.sum(support[5:] == 1) < 3
  549. def test_boundary_case_ch2():
  550. # Test boundary case, and always aim to select 1 feature.
  551. X = np.array([[10, 20], [20, 20], [20, 30]])
  552. y = np.array([[1], [0], [0]])
  553. scores, pvalues = chi2(X, y)
  554. assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
  555. assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))
  556. filter_fdr = SelectFdr(chi2, alpha=0.1)
  557. filter_fdr.fit(X, y)
  558. support_fdr = filter_fdr.get_support()
  559. assert_array_equal(support_fdr, np.array([True, False]))
  560. filter_kbest = SelectKBest(chi2, k=1)
  561. filter_kbest.fit(X, y)
  562. support_kbest = filter_kbest.get_support()
  563. assert_array_equal(support_kbest, np.array([True, False]))
  564. filter_percentile = SelectPercentile(chi2, percentile=50)
  565. filter_percentile.fit(X, y)
  566. support_percentile = filter_percentile.get_support()
  567. assert_array_equal(support_percentile, np.array([True, False]))
  568. filter_fpr = SelectFpr(chi2, alpha=0.1)
  569. filter_fpr.fit(X, y)
  570. support_fpr = filter_fpr.get_support()
  571. assert_array_equal(support_fpr, np.array([True, False]))
  572. filter_fwe = SelectFwe(chi2, alpha=0.1)
  573. filter_fwe.fit(X, y)
  574. support_fwe = filter_fwe.get_support()
  575. assert_array_equal(support_fwe, np.array([True, False]))
  576. @pytest.mark.parametrize("alpha", [0.001, 0.01, 0.1])
  577. @pytest.mark.parametrize("n_informative", [1, 5, 10])
  578. def test_select_fdr_regression(alpha, n_informative):
  579. # Test that fdr heuristic actually has low FDR.
  580. def single_fdr(alpha, n_informative, random_state):
  581. X, y = make_regression(
  582. n_samples=150,
  583. n_features=20,
  584. n_informative=n_informative,
  585. shuffle=False,
  586. random_state=random_state,
  587. noise=10,
  588. )
  589. with warnings.catch_warnings(record=True):
  590. # Warnings can be raised when no features are selected
  591. # (low alpha or very noisy data)
  592. univariate_filter = SelectFdr(f_regression, alpha=alpha)
  593. X_r = univariate_filter.fit(X, y).transform(X)
  594. X_r2 = (
  595. GenericUnivariateSelect(f_regression, mode="fdr", param=alpha)
  596. .fit(X, y)
  597. .transform(X)
  598. )
  599. assert_array_equal(X_r, X_r2)
  600. support = univariate_filter.get_support()
  601. num_false_positives = np.sum(support[n_informative:] == 1)
  602. num_true_positives = np.sum(support[:n_informative] == 1)
  603. if num_false_positives == 0:
  604. return 0.0
  605. false_discovery_rate = num_false_positives / (
  606. num_true_positives + num_false_positives
  607. )
  608. return false_discovery_rate
  609. # As per Benjamini-Hochberg, the expected false discovery rate
  610. # should be lower than alpha:
  611. # FDR = E(FP / (TP + FP)) <= alpha
  612. false_discovery_rate = np.mean(
  613. [single_fdr(alpha, n_informative, random_state) for random_state in range(100)]
  614. )
  615. assert alpha >= false_discovery_rate
  616. # Make sure that the empirical false discovery rate increases
  617. # with alpha:
  618. if false_discovery_rate != 0:
  619. assert false_discovery_rate > alpha / 10
  620. def test_select_fwe_regression():
  621. # Test whether the relative univariate feature selection
  622. # gets the correct items in a simple regression problem
  623. # with the fwe heuristic
  624. X, y = make_regression(
  625. n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0
  626. )
  627. univariate_filter = SelectFwe(f_regression, alpha=0.01)
  628. X_r = univariate_filter.fit(X, y).transform(X)
  629. X_r2 = (
  630. GenericUnivariateSelect(f_regression, mode="fwe", param=0.01)
  631. .fit(X, y)
  632. .transform(X)
  633. )
  634. assert_array_equal(X_r, X_r2)
  635. support = univariate_filter.get_support()
  636. gtruth = np.zeros(20)
  637. gtruth[:5] = 1
  638. assert_array_equal(support[:5], np.ones((5,), dtype=bool))
  639. assert np.sum(support[5:] == 1) < 2
  640. def test_selectkbest_tiebreaking():
  641. # Test whether SelectKBest actually selects k features in case of ties.
  642. # Prior to 0.11, SelectKBest would return more features than requested.
  643. Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]]
  644. y = [1]
  645. dummy_score = lambda X, y: (X[0], X[0])
  646. for X in Xs:
  647. sel = SelectKBest(dummy_score, k=1)
  648. X1 = ignore_warnings(sel.fit_transform)([X], y)
  649. assert X1.shape[1] == 1
  650. assert_best_scores_kept(sel)
  651. sel = SelectKBest(dummy_score, k=2)
  652. X2 = ignore_warnings(sel.fit_transform)([X], y)
  653. assert X2.shape[1] == 2
  654. assert_best_scores_kept(sel)
  655. def test_selectpercentile_tiebreaking():
  656. # Test if SelectPercentile selects the right n_features in case of ties.
  657. Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]]
  658. y = [1]
  659. dummy_score = lambda X, y: (X[0], X[0])
  660. for X in Xs:
  661. sel = SelectPercentile(dummy_score, percentile=34)
  662. X1 = ignore_warnings(sel.fit_transform)([X], y)
  663. assert X1.shape[1] == 1
  664. assert_best_scores_kept(sel)
  665. sel = SelectPercentile(dummy_score, percentile=67)
  666. X2 = ignore_warnings(sel.fit_transform)([X], y)
  667. assert X2.shape[1] == 2
  668. assert_best_scores_kept(sel)
  669. def test_tied_pvalues():
  670. # Test whether k-best and percentiles work with tied pvalues from chi2.
  671. # chi2 will return the same p-values for the following features, but it
  672. # will return different scores.
  673. X0 = np.array([[10000, 9999, 9998], [1, 1, 1]])
  674. y = [0, 1]
  675. for perm in itertools.permutations((0, 1, 2)):
  676. X = X0[:, perm]
  677. Xt = SelectKBest(chi2, k=2).fit_transform(X, y)
  678. assert Xt.shape == (2, 2)
  679. assert 9998 not in Xt
  680. Xt = SelectPercentile(chi2, percentile=67).fit_transform(X, y)
  681. assert Xt.shape == (2, 2)
  682. assert 9998 not in Xt
  683. def test_scorefunc_multilabel():
  684. # Test whether k-best and percentiles works with multilabels with chi2.
  685. X = np.array([[10000, 9999, 0], [100, 9999, 0], [1000, 99, 0]])
  686. y = [[1, 1], [0, 1], [1, 0]]
  687. Xt = SelectKBest(chi2, k=2).fit_transform(X, y)
  688. assert Xt.shape == (3, 2)
  689. assert 0 not in Xt
  690. Xt = SelectPercentile(chi2, percentile=67).fit_transform(X, y)
  691. assert Xt.shape == (3, 2)
  692. assert 0 not in Xt
  693. def test_tied_scores():
  694. # Test for stable sorting in k-best with tied scores.
  695. X_train = np.array([[0, 0, 0], [1, 1, 1]])
  696. y_train = [0, 1]
  697. for n_features in [1, 2, 3]:
  698. sel = SelectKBest(chi2, k=n_features).fit(X_train, y_train)
  699. X_test = sel.transform([[0, 1, 2]])
  700. assert_array_equal(X_test[0], np.arange(3)[-n_features:])
  701. def test_nans():
  702. # Assert that SelectKBest and SelectPercentile can handle NaNs.
  703. # First feature has zero variance to confuse f_classif (ANOVA) and
  704. # make it return a NaN.
  705. X = [[0, 1, 0], [0, -1, -1], [0, 0.5, 0.5]]
  706. y = [1, 0, 1]
  707. for select in (
  708. SelectKBest(f_classif, k=2),
  709. SelectPercentile(f_classif, percentile=67),
  710. ):
  711. ignore_warnings(select.fit)(X, y)
  712. assert_array_equal(select.get_support(indices=True), np.array([1, 2]))
  713. def test_invalid_k():
  714. X = [[0, 1, 0], [0, -1, -1], [0, 0.5, 0.5]]
  715. y = [1, 0, 1]
  716. with pytest.raises(ValueError):
  717. SelectKBest(k=4).fit(X, y)
  718. with pytest.raises(ValueError):
  719. GenericUnivariateSelect(mode="k_best", param=4).fit(X, y)
  720. def test_f_classif_constant_feature():
  721. # Test that f_classif warns if a feature is constant throughout.
  722. X, y = make_classification(n_samples=10, n_features=5)
  723. X[:, 0] = 2.0
  724. with pytest.warns(UserWarning):
  725. f_classif(X, y)
  726. def test_no_feature_selected():
  727. rng = np.random.RandomState(0)
  728. # Generate random uncorrelated data: a strict univariate test should
  729. # rejects all the features
  730. X = rng.rand(40, 10)
  731. y = rng.randint(0, 4, size=40)
  732. strict_selectors = [
  733. SelectFwe(alpha=0.01).fit(X, y),
  734. SelectFdr(alpha=0.01).fit(X, y),
  735. SelectFpr(alpha=0.01).fit(X, y),
  736. SelectPercentile(percentile=0).fit(X, y),
  737. SelectKBest(k=0).fit(X, y),
  738. ]
  739. for selector in strict_selectors:
  740. assert_array_equal(selector.get_support(), np.zeros(10))
  741. with pytest.warns(UserWarning, match="No features were selected"):
  742. X_selected = selector.transform(X)
  743. assert X_selected.shape == (40, 0)
  744. def test_mutual_info_classif():
  745. X, y = make_classification(
  746. n_samples=100,
  747. n_features=5,
  748. n_informative=1,
  749. n_redundant=1,
  750. n_repeated=0,
  751. n_classes=2,
  752. n_clusters_per_class=1,
  753. flip_y=0.0,
  754. class_sep=10,
  755. shuffle=False,
  756. random_state=0,
  757. )
  758. # Test in KBest mode.
  759. univariate_filter = SelectKBest(mutual_info_classif, k=2)
  760. X_r = univariate_filter.fit(X, y).transform(X)
  761. X_r2 = (
  762. GenericUnivariateSelect(mutual_info_classif, mode="k_best", param=2)
  763. .fit(X, y)
  764. .transform(X)
  765. )
  766. assert_array_equal(X_r, X_r2)
  767. support = univariate_filter.get_support()
  768. gtruth = np.zeros(5)
  769. gtruth[:2] = 1
  770. assert_array_equal(support, gtruth)
  771. # Test in Percentile mode.
  772. univariate_filter = SelectPercentile(mutual_info_classif, percentile=40)
  773. X_r = univariate_filter.fit(X, y).transform(X)
  774. X_r2 = (
  775. GenericUnivariateSelect(mutual_info_classif, mode="percentile", param=40)
  776. .fit(X, y)
  777. .transform(X)
  778. )
  779. assert_array_equal(X_r, X_r2)
  780. support = univariate_filter.get_support()
  781. gtruth = np.zeros(5)
  782. gtruth[:2] = 1
  783. assert_array_equal(support, gtruth)
  784. def test_mutual_info_regression():
  785. X, y = make_regression(
  786. n_samples=100,
  787. n_features=10,
  788. n_informative=2,
  789. shuffle=False,
  790. random_state=0,
  791. noise=10,
  792. )
  793. # Test in KBest mode.
  794. univariate_filter = SelectKBest(mutual_info_regression, k=2)
  795. X_r = univariate_filter.fit(X, y).transform(X)
  796. assert_best_scores_kept(univariate_filter)
  797. X_r2 = (
  798. GenericUnivariateSelect(mutual_info_regression, mode="k_best", param=2)
  799. .fit(X, y)
  800. .transform(X)
  801. )
  802. assert_array_equal(X_r, X_r2)
  803. support = univariate_filter.get_support()
  804. gtruth = np.zeros(10)
  805. gtruth[:2] = 1
  806. assert_array_equal(support, gtruth)
  807. # Test in Percentile mode.
  808. univariate_filter = SelectPercentile(mutual_info_regression, percentile=20)
  809. X_r = univariate_filter.fit(X, y).transform(X)
  810. X_r2 = (
  811. GenericUnivariateSelect(mutual_info_regression, mode="percentile", param=20)
  812. .fit(X, y)
  813. .transform(X)
  814. )
  815. assert_array_equal(X_r, X_r2)
  816. support = univariate_filter.get_support()
  817. gtruth = np.zeros(10)
  818. gtruth[:2] = 1
  819. assert_array_equal(support, gtruth)
  820. def test_dataframe_output_dtypes():
  821. """Check that the output datafarme dtypes are the same as the input.
  822. Non-regression test for gh-24860.
  823. """
  824. pd = pytest.importorskip("pandas")
  825. X, y = load_iris(return_X_y=True, as_frame=True)
  826. X = X.astype(
  827. {
  828. "petal length (cm)": np.float32,
  829. "petal width (cm)": np.float64,
  830. }
  831. )
  832. X["petal_width_binned"] = pd.cut(X["petal width (cm)"], bins=10)
  833. column_order = X.columns
  834. def selector(X, y):
  835. ranking = {
  836. "sepal length (cm)": 1,
  837. "sepal width (cm)": 2,
  838. "petal length (cm)": 3,
  839. "petal width (cm)": 4,
  840. "petal_width_binned": 5,
  841. }
  842. return np.asarray([ranking[name] for name in column_order])
  843. univariate_filter = SelectKBest(selector, k=3).set_output(transform="pandas")
  844. output = univariate_filter.fit_transform(X, y)
  845. assert_array_equal(
  846. output.columns, ["petal length (cm)", "petal width (cm)", "petal_width_binned"]
  847. )
  848. for name, dtype in output.dtypes.items():
  849. assert dtype == X.dtypes[name]