test_sequential.py

import numpy as np
import pytest
import scipy
from numpy.testing import assert_array_equal
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs, make_classification, make_regression
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


def test_bad_n_features_to_select():
    n_features = 5
    X, y = make_regression(n_features=n_features)
    sfs = SequentialFeatureSelector(LinearRegression(), n_features_to_select=n_features)
    with pytest.raises(ValueError, match="n_features_to_select must be < n_features"):
        sfs.fit(X, y)


@pytest.mark.parametrize("direction", ("forward", "backward"))
@pytest.mark.parametrize("n_features_to_select", (1, 5, 9, "auto"))
def test_n_features_to_select(direction, n_features_to_select):
    # Make sure n_features_to_select is respected
    n_features = 10
    X, y = make_regression(n_features=n_features, random_state=0)
    sfs = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select=n_features_to_select,
        direction=direction,
        cv=2,
    )
    sfs.fit(X, y)

    if n_features_to_select == "auto":
        n_features_to_select = n_features // 2

    assert sfs.get_support(indices=True).shape[0] == n_features_to_select
    assert sfs.n_features_to_select_ == n_features_to_select
    assert sfs.transform(X).shape[1] == n_features_to_select


@pytest.mark.parametrize("direction", ("forward", "backward"))
def test_n_features_to_select_auto(direction):
    """Check the behaviour of `n_features_to_select="auto"` with different
    values for the parameter `tol`.
    """
    n_features = 10
    tol = 1e-3
    X, y = make_regression(n_features=n_features, random_state=0)
    sfs = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select="auto",
        tol=tol,
        direction=direction,
        cv=2,
    )
    sfs.fit(X, y)

    max_features_to_select = n_features - 1

    assert sfs.get_support(indices=True).shape[0] <= max_features_to_select
    assert sfs.n_features_to_select_ <= max_features_to_select
    assert sfs.transform(X).shape[1] <= max_features_to_select
    assert sfs.get_support(indices=True).shape[0] == sfs.n_features_to_select_
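

# The test above relies on the "auto" stopping rule: the greedy search halts
# once the best candidate fails to improve the CV score by more than `tol`.
# The helper below is a minimal sketch of that rule for the forward direction
# (a simplified illustration, not the library's actual implementation).
def _demo_auto_stopping(tol=1e-3):
    X, y = make_regression(n_features=10, random_state=0)
    selected, remaining = [], list(range(X.shape[1]))
    current_score = -np.inf
    while remaining:
        # Score every candidate feature added to the current selection.
        scores = {
            f: cross_val_score(LinearRegression(), X[:, selected + [f]], y, cv=2).mean()
            for f in remaining
        }
        best_feature, best_score = max(scores.items(), key=lambda kv: kv[1])
        if best_score - current_score <= tol:
            break  # improvement within tol: stop, as "auto" does
        selected.append(best_feature)
        remaining.remove(best_feature)
        current_score = best_score
    return selected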


@pytest.mark.parametrize("direction", ("forward", "backward"))
def test_n_features_to_select_stopping_criterion(direction):
    """Check the behaviour of the stopping criterion for feature selection
    depending on the values of `n_features_to_select` and `tol`.

    When `direction` is `'forward'`, select a new feature at random among
    those not currently selected in `selector.support_`, and build a new
    version of the data that includes all the features in
    `selector.support_` plus this newly selected feature. Then check that
    the cross-validation score of the model trained on this new dataset
    variant is lower than that of the model trained on the
    forward-selected features, or at least does not improve by more than
    the `tol` margin.

    When `direction` is `'backward'`, instead of adding a new feature to
    `selector.support_`, try to remove one of the selected features at
    random, and check that the cross-validation score either decreases or
    does not improve by more than the `tol` margin.
    """
    X, y = make_regression(n_features=50, n_informative=10, random_state=0)
    tol = 1e-3

    sfs = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select="auto",
        tol=tol,
        direction=direction,
        cv=2,
    )
    sfs.fit(X, y)
    selected_X = sfs.transform(X)

    rng = np.random.RandomState(0)

    added_candidates = list(set(range(X.shape[1])) - set(sfs.get_support(indices=True)))
    added_X = np.hstack(
        [
            selected_X,
            (X[:, rng.choice(added_candidates)])[:, np.newaxis],
        ]
    )

    removed_candidate = rng.choice(list(range(sfs.n_features_to_select_)))
    removed_X = np.delete(selected_X, removed_candidate, axis=1)

    plain_cv_score = cross_val_score(LinearRegression(), X, y, cv=2).mean()
    sfs_cv_score = cross_val_score(LinearRegression(), selected_X, y, cv=2).mean()
    added_cv_score = cross_val_score(LinearRegression(), added_X, y, cv=2).mean()
    removed_cv_score = cross_val_score(LinearRegression(), removed_X, y, cv=2).mean()

    assert sfs_cv_score >= plain_cv_score

    if direction == "forward":
        assert (sfs_cv_score - added_cv_score) <= tol
        assert (sfs_cv_score - removed_cv_score) >= tol
    else:
        assert (added_cv_score - sfs_cv_score) <= tol
        assert (removed_cv_score - sfs_cv_score) <= tol
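

# Numeric illustration of the tol margin checked above (hypothetical scores,
# not taken from the test): a candidate change is worthwhile only if it
# improves the mean CV score by strictly more than tol.
def _demo_tol_margin():
    current_score, candidate_score, tol = 0.9500, 0.9505, 1e-3
    improves_enough = (candidate_score - current_score) > tol
    assert not improves_enough  # a gain of 5e-4 is within the 1e-3 margin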


@pytest.mark.parametrize("direction", ("forward", "backward"))
@pytest.mark.parametrize(
    "n_features_to_select, expected",
    (
        (0.1, 1),
        (1.0, 10),
        (0.5, 5),
    ),
)
def test_n_features_to_select_float(direction, n_features_to_select, expected):
    # Test passing a float as n_features_to_select
    X, y = make_regression(n_features=10)
    sfs = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select=n_features_to_select,
        direction=direction,
        cv=2,
    )
    sfs.fit(X, y)

    assert sfs.n_features_to_select_ == expected
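

# Sketch of how a float n_features_to_select maps to a feature count in the
# parametrization above: a fraction of n_features, truncated to an int. This
# mirrors the expected values rather than quoting the library's internals.
def _demo_float_to_count(n_features=10):
    for fraction, expected in ((0.1, 1), (0.5, 5), (1.0, 10)):
        assert int(n_features * fraction) == expected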


@pytest.mark.parametrize("seed", range(10))
@pytest.mark.parametrize("direction", ("forward", "backward"))
@pytest.mark.parametrize(
    "n_features_to_select, expected_selected_features",
    [
        (2, [0, 2]),  # f1 is dropped since it has no predictive power
        (1, [2]),  # f2 is more predictive than f0 so it's kept
    ],
)
def test_sanity(seed, direction, n_features_to_select, expected_selected_features):
    # Basic sanity check: 3 features, only f0 and f2 are correlated with the
    # target, f2 having a stronger correlation than f0. We expect f1 to be
    # dropped, and f2 to always be selected.
    rng = np.random.RandomState(seed)
    n_samples = 100
    X = rng.randn(n_samples, 3)
    y = 3 * X[:, 0] - 10 * X[:, 2]

    sfs = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select=n_features_to_select,
        direction=direction,
        cv=2,
    )
    sfs.fit(X, y)
    assert_array_equal(sfs.get_support(indices=True), expected_selected_features)


def test_sparse_support():
    # Make sure sparse data is supported
    X, y = make_regression(n_features=10)
    X = scipy.sparse.csr_matrix(X)
    sfs = SequentialFeatureSelector(LinearRegression(), n_features_to_select="auto", cv=2)
    sfs.fit(X, y)
    sfs.transform(X)


def test_nan_support():
    # Make sure nans are OK if the underlying estimator supports nans
    rng = np.random.RandomState(0)
    n_samples, n_features = 40, 4
    X, y = make_regression(n_samples, n_features, random_state=0)
    nan_mask = rng.randint(0, 2, size=(n_samples, n_features), dtype=bool)
    X[nan_mask] = np.nan
    sfs = SequentialFeatureSelector(
        HistGradientBoostingRegressor(), n_features_to_select="auto", cv=2
    )
    sfs.fit(X, y)
    sfs.transform(X)

    with pytest.raises(ValueError, match="Input X contains NaN"):
        # LinearRegression does not support nans
        SequentialFeatureSelector(
            LinearRegression(), n_features_to_select="auto", cv=2
        ).fit(X, y)


def test_pipeline_support():
    # Make sure that pipelines can be passed into SFS and that SFS can be
    # passed into a pipeline
    n_samples, n_features = 50, 3
    X, y = make_regression(n_samples, n_features, random_state=0)

    # pipeline in SFS
    pipe = make_pipeline(StandardScaler(), LinearRegression())
    sfs = SequentialFeatureSelector(pipe, n_features_to_select="auto", cv=2)
    sfs.fit(X, y)
    sfs.transform(X)

    # SFS in pipeline
    sfs = SequentialFeatureSelector(LinearRegression(), n_features_to_select="auto", cv=2)
    pipe = make_pipeline(StandardScaler(), sfs)
    pipe.fit(X, y)
    pipe.transform(X)


@pytest.mark.parametrize("n_features_to_select", (2, 3))
def test_unsupervised_model_fit(n_features_to_select):
    # Make sure that models without classification labels are not being
    # validated
    X, y = make_blobs(n_features=4)
    sfs = SequentialFeatureSelector(
        KMeans(n_init=1),
        n_features_to_select=n_features_to_select,
    )
    sfs.fit(X)
    assert sfs.transform(X).shape[1] == n_features_to_select


@pytest.mark.parametrize("y", ("no_validation", 1j, 99.9, np.nan, 3))
def test_no_y_validation_model_fit(y):
    # Make sure that other non-conventional y labels are not accepted
    X, clusters = make_blobs(n_features=6)
    sfs = SequentialFeatureSelector(
        KMeans(),
        n_features_to_select=3,
    )
    with pytest.raises((TypeError, ValueError)):
        sfs.fit(X, y)


def test_forward_neg_tol_error():
    """Check that we raise an error when tol < 0 and direction='forward'."""
    X, y = make_regression(n_features=10, random_state=0)
    sfs = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select="auto",
        direction="forward",
        tol=-1e-3,
    )
    with pytest.raises(ValueError, match="tol must be positive"):
        sfs.fit(X, y)


def test_backward_neg_tol():
    """Check that SequentialFeatureSelector works with a negative tol.

    Non-regression test for #25525.
    """
    X, y = make_regression(n_features=10, random_state=0)
    lr = LinearRegression()
    initial_score = lr.fit(X, y).score(X, y)

    sfs = SequentialFeatureSelector(
        lr,
        n_features_to_select="auto",
        direction="backward",
        tol=-1e-3,
    )
    Xr = sfs.fit_transform(X, y)
    new_score = lr.fit(Xr, y).score(Xr, y)

    assert 0 < sfs.get_support().sum() < X.shape[1]
    assert new_score < initial_score
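

# Why new_score can drop below initial_score above: with a negative tol in
# the backward direction, a removal is accepted as long as the score does not
# fall by more than |tol|, so the selector keeps shrinking the feature set
# even when each removal slightly hurts the CV score. A minimal sketch of
# that acceptance rule (hypothetical numbers):
def _demo_backward_neg_tol_rule():
    current_score, score_after_removal, tol = 0.9500, 0.9495, -1e-3
    # The drop of 5e-4 stays above the tol floor of -1e-3, so removal proceeds.
    assert (score_after_removal - current_score) > tol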


def test_cv_generator_support():
    """Check that no exception is raised when cv is a generator.

    Non-regression test for #25957.
    """
    X, y = make_classification(random_state=0)
    groups = np.zeros_like(y, dtype=int)
    groups[y.size // 2 :] = 1

    cv = LeaveOneGroupOut()
    splits = cv.split(X, y, groups=groups)

    knc = KNeighborsClassifier(n_neighbors=5)
    sfs = SequentialFeatureSelector(knc, n_features_to_select=5, cv=splits)
    sfs.fit(X, y)
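

# Usage note (a sketch, not part of the original suite): a generator of
# splits is consumed after a single iteration, so materializing it with
# list() is the safe pattern when the same splits must be reused, e.g. to
# fit two selectors on identical folds.
def _demo_reusable_splits():
    X, y = make_classification(random_state=0)
    groups = np.zeros_like(y, dtype=int)
    groups[y.size // 2 :] = 1
    splits = list(LeaveOneGroupOut().split(X, y, groups=groups))
    knc = KNeighborsClassifier(n_neighbors=5)
    # Both fits reuse the same materialized folds; a raw generator would be
    # exhausted after the first call.
    SequentialFeatureSelector(knc, n_features_to_select=5, cv=splits).fit(X, y)
    SequentialFeatureSelector(knc, n_features_to_select=5, cv=splits).fit(X, y)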