"""Univariate features selection."""

# Authors: V. Michel, B. Thirion, G. Varoquaux, A. Gramfort, E. Duchesnay.
#          L. Buitinck, A. Joly
# License: BSD 3 clause


import warnings
from numbers import Integral, Real

import numpy as np
from scipy import special, stats
from scipy.sparse import issparse

from ..base import BaseEstimator, _fit_context
from ..preprocessing import LabelBinarizer
from ..utils import as_float_array, check_array, check_X_y, safe_mask, safe_sqr
from ..utils._param_validation import Interval, StrOptions, validate_params
from ..utils.extmath import row_norms, safe_sparse_dot
from ..utils.validation import check_is_fitted

from ._base import SelectorMixin


def _clean_nans(scores):
    """
    Fixes Issue #1240: NaNs can't be properly compared, so change them to the
    smallest value of the scores' dtype. -inf seems to be unreliable.
    """
    # XXX where should this function be called? fit? scoring functions
    # themselves?
    scores = as_float_array(scores, copy=True)
    scores[np.isnan(scores)] = np.finfo(scores.dtype).min
    return scores


######################################################################
# Scoring functions


# The following function is a rewriting of scipy.stats.f_oneway
# Contrary to the scipy.stats.f_oneway implementation it does not
# copy the data while keeping the inputs unchanged.
def f_oneway(*args):
    """Perform a 1-way ANOVA.

    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    *args : {array-like, sparse matrix}
        Sample1, sample2... The sample measurements should be given as
        arguments.

    Returns
    -------
    f_statistic : ndarray of shape (n_features,)
        The computed F-value of the test for each feature.

    p_value : ndarray of shape (n_features,)
        The associated p-value from the F-distribution for each feature.

    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in order
    for the associated p-value to be valid.

    1. The samples are independent.
    2. Each sample is from a normally distributed population.
    3. The population standard deviations of the groups are all equal. This
       property is known as homoscedasticity.

    If these assumptions are not true for a given set of data, it may still be
    possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_) although
    with some loss of power.

    The algorithm is from Heiman [2], pp. 394-7.

    See ``scipy.stats.f_oneway``, which should give the same results while
    being less efficient.

    References
    ----------
    .. [1] Lowry, Richard. "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://vassarstats.net/textbook

    .. [2] Heiman, G.W. Research Methods in Statistics. 2002.
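
    Examples
    --------
    A minimal illustrative call on two small hand-made groups of
    measurements; the numbers are arbitrary, and the direct import of this
    private module below is only for the sketch:

    >>> import numpy as np
    >>> from sklearn.feature_selection._univariate_selection import f_oneway
    >>> group_1 = np.array([[1.0, 2.0], [2.0, 4.0], [3.0, 6.0]])
    >>> group_2 = np.array([[2.0, 3.0], [4.0, 5.0], [6.0, 7.0]])
    >>> f_statistic, p_value = f_oneway(group_1, group_2)
    >>> f_statistic.shape, p_value.shape
    ((2,), (2,))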
    """
    n_classes = len(args)
    args = [as_float_array(a) for a in args]
    n_samples_per_class = np.array([a.shape[0] for a in args])
    n_samples = np.sum(n_samples_per_class)
    ss_alldata = sum(safe_sqr(a).sum(axis=0) for a in args)
    sums_args = [np.asarray(a.sum(axis=0)) for a in args]
    square_of_sums_alldata = sum(sums_args) ** 2
    square_of_sums_args = [s**2 for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.0
    for k, _ in enumerate(args):
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    constant_features_idx = np.where(msw == 0.0)[0]
    if np.nonzero(msb)[0].size != msb.size and constant_features_idx.size:
        warnings.warn("Features %s are constant." % constant_features_idx, UserWarning)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = special.fdtrc(dfbn, dfwn, f)
    return f, prob


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "y": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def f_classif(X, y):
    """Compute the ANOVA F-value for the provided sample.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The set of regressors that will be tested sequentially.

    y : array-like of shape (n_samples,)
        The target vector.

    Returns
    -------
    f_statistic : ndarray of shape (n_features,)
        F-statistic for each feature.

    p_values : ndarray of shape (n_features,)
        P-values associated with the F-statistic.

    See Also
    --------
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    f_regression : F-value between label/feature for regression tasks.
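
    Examples
    --------
    A minimal illustrative call on a synthetic classification task; the
    dataset parameters below are arbitrary choices for this sketch:

    >>> from sklearn.datasets import make_classification
    >>> from sklearn.feature_selection import f_classif
    >>> X, y = make_classification(n_samples=100, n_features=10, random_state=42)
    >>> f_statistic, p_values = f_classif(X, y)
    >>> f_statistic.shape, p_values.shape
    ((10,), (10,))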
    """
    X, y = check_X_y(X, y, accept_sparse=["csr", "csc", "coo"])
    args = [X[safe_mask(X, y == k)] for k in np.unique(y)]
    return f_oneway(*args)


def _chisquare(f_obs, f_exp):
    """Fast replacement for scipy.stats.chisquare.

    Version from https://github.com/scipy/scipy/pull/2525 with additional
    optimizations.
    """
    f_obs = np.asarray(f_obs, dtype=np.float64)

    k = len(f_obs)
    # Reuse f_obs for chi-squared statistics
    chisq = f_obs
    chisq -= f_exp
    chisq **= 2
    with np.errstate(invalid="ignore"):
        chisq /= f_exp
    chisq = chisq.sum(axis=0)
    return chisq, special.chdtrc(k - 1, chisq)


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "y": ["array-like"],
    },
    prefer_skip_nested_validation=True,
)
def chi2(X, y):
    """Compute chi-squared stats between each non-negative feature and class.

    This score can be used to select the `n_features` features with the
    highest values for the test chi-squared statistic from X, which must
    contain only **non-negative features** such as booleans or frequencies
    (e.g., term counts in document classification), relative to the classes.

    Recall that the chi-square test measures dependence between stochastic
    variables, so using this function "weeds out" the features that are the
    most likely to be independent of class and therefore irrelevant for
    classification.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Sample vectors.

    y : array-like of shape (n_samples,)
        Target vector (class labels).

    Returns
    -------
    chi2 : ndarray of shape (n_features,)
        Chi2 statistics for each feature.

    p_values : ndarray of shape (n_features,)
        P-values for each feature.

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    f_regression : F-value between label/feature for regression tasks.

    Notes
    -----
    Complexity of this algorithm is O(n_classes * n_features).
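
    Examples
    --------
    A minimal illustrative call on a tiny count matrix; the non-negative
    "term counts" below are arbitrary values made up for this sketch:

    >>> import numpy as np
    >>> from sklearn.feature_selection import chi2
    >>> X = np.array([[1, 1, 3],
    ...               [0, 1, 5],
    ...               [5, 4, 1],
    ...               [6, 6, 2],
    ...               [1, 4, 0],
    ...               [0, 0, 0]])
    >>> y = np.array([1, 1, 0, 0, 2, 2])
    >>> chi2_stats, p_values = chi2(X, y)
    >>> chi2_stats.shape, p_values.shape
    ((3,), (3,))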
    """
    # XXX: we might want to do some of the following in logspace instead for
    # numerical stability.
    # Converting X to float allows getting better performance for the
    # safe_sparse_dot call made below.
    X = check_array(X, accept_sparse="csr", dtype=(np.float64, np.float32))
    if np.any((X.data if issparse(X) else X) < 0):
        raise ValueError("Input X must be non-negative.")

    # Use a sparse representation for Y by default to reduce memory usage when
    # y has many unique classes.
    Y = LabelBinarizer(sparse_output=True).fit_transform(y)
    if Y.shape[1] == 1:
        Y = Y.toarray()
        Y = np.append(1 - Y, Y, axis=1)

    observed = safe_sparse_dot(Y.T, X)  # n_classes * n_features

    if issparse(observed):
        # convert back to a dense array before calling _chisquare
        # XXX: could _chisquare be reimplemented to accept sparse matrices for
        # cases where both n_classes and n_features are large (and X is
        # sparse)?
        observed = observed.toarray()

    feature_count = X.sum(axis=0).reshape(1, -1)
    class_prob = Y.mean(axis=0).reshape(1, -1)
    # Expected counts under the independence hypothesis:
    # expected[i, j] = P(class i) * total count of feature j.
    expected = np.dot(class_prob.T, feature_count)

    return _chisquare(observed, expected)


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "y": ["array-like"],
        "center": ["boolean"],
        "force_finite": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def r_regression(X, y, *, center=True, force_finite=True):
    """Compute Pearson's r for each feature and the target.

    Pearson's r is also known as the Pearson correlation coefficient.

    Linear model for testing the individual effect of each of many regressors.
    This is a scoring function to be used in a feature selection procedure,
    not a free-standing feature selection procedure.

    The cross correlation between each regressor and the target is computed
    as::

        E[(X[:, i] - mean(X[:, i])) * (y - mean(y))] / (std(X[:, i]) * std(y))

    For more on usage see the :ref:`User Guide <univariate_feature_selection>`.

    .. versionadded:: 1.0

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data matrix.

    y : array-like of shape (n_samples,)
        The target vector.

    center : bool, default=True
        Whether or not to center the data matrix `X` and the target vector `y`.
        By default, `X` and `y` will be centered.

    force_finite : bool, default=True
        Whether or not to force the Pearson's R correlation to be finite.
        In the particular case where some features in `X` or the target `y`
        are constant, the Pearson's R correlation is not defined. When
        `force_finite=False`, a correlation of `np.nan` is returned to
        acknowledge this case. When `force_finite=True`, this value will be
        forced to a minimal correlation of `0.0`.

        .. versionadded:: 1.1

    Returns
    -------
    correlation_coefficient : ndarray of shape (n_features,)
        Pearson's R correlation coefficients of features.

    See Also
    --------
    f_regression: Univariate linear regression tests returning f-statistic
        and p-values.
    mutual_info_regression: Mutual information for a continuous target.
    f_classif: ANOVA F-value between label/feature for classification tasks.
    chi2: Chi-squared stats of non-negative features for classification tasks.
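
    Examples
    --------
    A minimal illustrative call on a synthetic regression task; the dataset
    parameters below are arbitrary choices for this sketch:

    >>> from sklearn.datasets import make_regression
    >>> from sklearn.feature_selection import r_regression
    >>> X, y = make_regression(n_samples=50, n_features=3, n_informative=2,
    ...                        random_state=42)
    >>> r_regression(X, y).shape
    (3,)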
    """
    X, y = check_X_y(X, y, accept_sparse=["csr", "csc", "coo"], dtype=np.float64)
    n_samples = X.shape[0]

    # Compute centered values
    # Note that E[(x - mean(x))*(y - mean(y))] = E[x*(y - mean(y))], so we
    # need not center X
    if center:
        y = y - np.mean(y)
        if issparse(X):
            X_means = X.mean(axis=0).getA1()
        else:
            X_means = X.mean(axis=0)
        # Compute the scaled standard deviations via moments
        X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)
    else:
        X_norms = row_norms(X.T)

    correlation_coefficient = safe_sparse_dot(y, X)
    with np.errstate(divide="ignore", invalid="ignore"):
        correlation_coefficient /= X_norms
        correlation_coefficient /= np.linalg.norm(y)

    if force_finite and not np.isfinite(correlation_coefficient).all():
        # case where the target or some features are constant
        # the correlation coefficient(s) is/are set to the minimum (i.e. 0.0)
        nan_mask = np.isnan(correlation_coefficient)
        correlation_coefficient[nan_mask] = 0.0
    return correlation_coefficient


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "y": ["array-like"],
        "center": ["boolean"],
        "force_finite": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def f_regression(X, y, *, center=True, force_finite=True):
    """Univariate linear regression tests returning F-statistic and p-values.

    Quick linear model for testing the effect of a single regressor,
    sequentially for many regressors.

    This is done in 2 steps:

    1. The cross correlation between each regressor and the target is computed
       using :func:`r_regression` as::

           E[(X[:, i] - mean(X[:, i])) * (y - mean(y))] / (std(X[:, i]) * std(y))

    2. It is converted to an F score and then to a p-value.

    :func:`f_regression` is derived from :func:`r_regression` and will rank
    features in the same order if all the features are positively correlated
    with the target.

    Note however that contrary to :func:`f_regression`, :func:`r_regression`
    values lie in [-1, 1] and can thus be negative. :func:`f_regression` is
    therefore recommended as a feature selection criterion to identify
    potentially predictive features for a downstream classifier, irrespective
    of the sign of the association with the target variable.

    Furthermore :func:`f_regression` returns p-values while
    :func:`r_regression` does not.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data matrix.

    y : array-like of shape (n_samples,)
        The target vector.

    center : bool, default=True
        Whether or not to center the data matrix `X` and the target vector `y`.
        By default, `X` and `y` will be centered.

    force_finite : bool, default=True
        Whether or not to force the F-statistics and associated p-values to
        be finite. There are two cases where the F-statistic is expected to
        not be finite:

        - when the target `y` or some features in `X` are constant. In this
          case, the Pearson's R correlation is not defined, leading to
          `np.nan` values in the F-statistic and p-value. When
          `force_finite=True`, the F-statistic is set to `0.0` and the
          associated p-value is set to `1.0`.
        - when a feature in `X` is perfectly correlated (or
          anti-correlated) with the target `y`. In this case, the F-statistic
          is expected to be `np.inf`. When `force_finite=True`, the F-statistic
          is set to `np.finfo(dtype).max` and the associated p-value is set to
          `0.0`.

        .. versionadded:: 1.1

    Returns
    -------
    f_statistic : ndarray of shape (n_features,)
        F-statistic for each feature.

    p_values : ndarray of shape (n_features,)
        P-values associated with the F-statistic.

    See Also
    --------
    r_regression: Pearson's R between label/feature for regression tasks.
    f_classif: ANOVA F-value between label/feature for classification tasks.
    chi2: Chi-squared stats of non-negative features for classification tasks.
    SelectKBest: Select features based on the k highest scores.
    SelectFpr: Select features based on a false positive rate test.
    SelectFdr: Select features based on an estimated false discovery rate.
    SelectFwe: Select features based on family-wise error rate.
    SelectPercentile: Select features based on percentile of the highest
        scores.
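
    Examples
    --------
    A minimal illustrative call on a synthetic regression task; the dataset
    parameters below are arbitrary choices for this sketch:

    >>> from sklearn.datasets import make_regression
    >>> from sklearn.feature_selection import f_regression
    >>> X, y = make_regression(n_samples=50, n_features=3, n_informative=2,
    ...                        random_state=42)
    >>> f_statistic, p_values = f_regression(X, y)
    >>> f_statistic.shape, p_values.shape
    ((3,), (3,))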
    """
    correlation_coefficient = r_regression(
        X, y, center=center, force_finite=force_finite
    )
    deg_of_freedom = y.size - (2 if center else 1)

    corr_coef_squared = correlation_coefficient**2

    with np.errstate(divide="ignore", invalid="ignore"):
        f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom
        p_values = stats.f.sf(f_statistic, 1, deg_of_freedom)

    if force_finite and not np.isfinite(f_statistic).all():
        # case where there is a perfect (anti-)correlation
        # f-statistics can be set to the maximum and p-values to zero
        mask_inf = np.isinf(f_statistic)
        f_statistic[mask_inf] = np.finfo(f_statistic.dtype).max
        # case where the target or some features are constant
        # f-statistics would be minimum and thus p-values large
        mask_nan = np.isnan(f_statistic)
        f_statistic[mask_nan] = 0.0
        p_values[mask_nan] = 1.0
    return f_statistic, p_values


######################################################################
# Base classes


class _BaseFilter(SelectorMixin, BaseEstimator):
    """Initialize the univariate feature selection.

    Parameters
    ----------
    score_func : callable
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues) or a single array with scores.
    """

    _parameter_constraints: dict = {"score_func": [callable]}

    def __init__(self, score_func):
        self.score_func = score_func

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Run score function on (X, y) and get the appropriate features.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.

        y : array-like of shape (n_samples,)
            The target values (class labels in classification, real numbers in
            regression).

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        X, y = self._validate_data(
            X, y, accept_sparse=["csr", "csc"], multi_output=True
        )

        self._check_params(X, y)
        score_func_ret = self.score_func(X, y)
        if isinstance(score_func_ret, (list, tuple)):
            self.scores_, self.pvalues_ = score_func_ret
            self.pvalues_ = np.asarray(self.pvalues_)
        else:
            self.scores_ = score_func_ret
            self.pvalues_ = None

        self.scores_ = np.asarray(self.scores_)

        return self

    def _check_params(self, X, y):
        pass

    def _more_tags(self):
        return {"requires_y": True}


######################################################################
# Specific filters
######################################################################
class SelectPercentile(_BaseFilter):
    """Select features according to a percentile of the highest scores.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues) or a single array with scores.
        Default is f_classif (see below "See Also"). The default function only
        works with classification tasks.

        .. versionadded:: 0.18

    percentile : int, default=10
        Percent of features to keep.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores, None if `score_func` returned only scores.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    mutual_info_classif : Mutual information for a discrete target.
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    f_regression : F-value between label/feature for regression tasks.
    mutual_info_regression : Mutual information for a continuous target.
    SelectKBest : Select features based on the k highest scores.
    SelectFpr : Select features based on a false positive rate test.
    SelectFdr : Select features based on an estimated false discovery rate.
    SelectFwe : Select features based on family-wise error rate.
    GenericUnivariateSelect : Univariate feature selector with configurable
        mode.

    Notes
    -----
    Ties between features with equal scores will be broken in an unspecified
    way.

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.feature_selection import SelectPercentile, chi2
    >>> X, y = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> X_new = SelectPercentile(chi2, percentile=10).fit_transform(X, y)
    >>> X_new.shape
    (1797, 7)
    """

    _parameter_constraints: dict = {
        **_BaseFilter._parameter_constraints,
        "percentile": [Interval(Real, 0, 100, closed="both")],
    }

    def __init__(self, score_func=f_classif, *, percentile=10):
        super().__init__(score_func=score_func)
        self.percentile = percentile

    def _get_support_mask(self):
        check_is_fitted(self)

        # Cater for NaNs
        if self.percentile == 100:
            return np.ones(len(self.scores_), dtype=bool)
        elif self.percentile == 0:
            return np.zeros(len(self.scores_), dtype=bool)

        scores = _clean_nans(self.scores_)
        threshold = np.percentile(scores, 100 - self.percentile)
        mask = scores > threshold
        ties = np.where(scores == threshold)[0]
        if len(ties):
            max_feats = int(len(scores) * self.percentile / 100)
            kept_ties = ties[: max_feats - mask.sum()]
            mask[kept_ties] = True
        return mask


class SelectKBest(_BaseFilter):
    """Select features according to the k highest scores.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues) or a single array with scores.
        Default is f_classif (see below "See Also"). The default function only
        works with classification tasks.

        .. versionadded:: 0.18

    k : int or "all", default=10
        Number of top features to select.
        The "all" option bypasses selection, for use in a parameter search.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores, None if `score_func` returned only scores.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif: ANOVA F-value between label/feature for classification tasks.
    mutual_info_classif: Mutual information for a discrete target.
    chi2: Chi-squared stats of non-negative features for classification tasks.
    f_regression: F-value between label/feature for regression tasks.
    mutual_info_regression: Mutual information for a continuous target.
    SelectPercentile: Select features based on percentile of the highest
        scores.
    SelectFpr : Select features based on a false positive rate test.
    SelectFdr : Select features based on an estimated false discovery rate.
    SelectFwe : Select features based on family-wise error rate.
    GenericUnivariateSelect : Univariate feature selector with configurable
        mode.

    Notes
    -----
    Ties between features with equal scores will be broken in an unspecified
    way.

    Examples
    --------
    >>> from sklearn.datasets import load_digits
    >>> from sklearn.feature_selection import SelectKBest, chi2
    >>> X, y = load_digits(return_X_y=True)
    >>> X.shape
    (1797, 64)
    >>> X_new = SelectKBest(chi2, k=20).fit_transform(X, y)
    >>> X_new.shape
    (1797, 20)
    """

    _parameter_constraints: dict = {
        **_BaseFilter._parameter_constraints,
        "k": [StrOptions({"all"}), Interval(Integral, 0, None, closed="left")],
    }

    def __init__(self, score_func=f_classif, *, k=10):
        super().__init__(score_func=score_func)
        self.k = k

    def _check_params(self, X, y):
        if not isinstance(self.k, str) and self.k > X.shape[1]:
            raise ValueError(
                f"k should be <= n_features = {X.shape[1]}; "
                f"got {self.k}. Use k='all' to return all features."
            )

    def _get_support_mask(self):
        check_is_fitted(self)

        if self.k == "all":
            return np.ones(self.scores_.shape, dtype=bool)
        elif self.k == 0:
            return np.zeros(self.scores_.shape, dtype=bool)
        else:
            scores = _clean_nans(self.scores_)
            mask = np.zeros(scores.shape, dtype=bool)

            # Request a stable sort. Mergesort takes more memory (~40MB per
            # megafeature on x86-64).
            mask[np.argsort(scores, kind="mergesort")[-self.k :]] = 1
            return mask


class SelectFpr(_BaseFilter):
    """Filter: Select the p-values below alpha based on an FPR test.

    FPR test stands for False Positive Rate test. It controls the total
    number of false detections.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues).
        Default is f_classif (see below "See Also"). The default function only
        works with classification tasks.

    alpha : float, default=5e-2
        Features with p-values less than `alpha` are selected.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    mutual_info_classif : Mutual information for a discrete target.
    f_regression : F-value between label/feature for regression tasks.
    mutual_info_regression : Mutual information for a continuous target.
    SelectPercentile : Select features based on percentile of the highest
        scores.
    SelectKBest : Select features based on the k highest scores.
    SelectFdr : Select features based on an estimated false discovery rate.
    SelectFwe : Select features based on family-wise error rate.
    GenericUnivariateSelect : Univariate feature selector with configurable
        mode.

    Examples
    --------
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.feature_selection import SelectFpr, chi2
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> X.shape
    (569, 30)
    >>> X_new = SelectFpr(chi2, alpha=0.01).fit_transform(X, y)
    >>> X_new.shape
    (569, 16)
    """

    _parameter_constraints: dict = {
        **_BaseFilter._parameter_constraints,
        "alpha": [Interval(Real, 0, 1, closed="both")],
    }

    def __init__(self, score_func=f_classif, *, alpha=5e-2):
        super().__init__(score_func=score_func)
        self.alpha = alpha

    def _get_support_mask(self):
        check_is_fitted(self)

        return self.pvalues_ < self.alpha


class SelectFdr(_BaseFilter):
    """Filter: Select the p-values for an estimated false discovery rate.

    This uses the Benjamini-Hochberg procedure. ``alpha`` is an upper bound
    on the expected false discovery rate.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues).
        Default is f_classif (see below "See Also"). The default function only
        works with classification tasks.

    alpha : float, default=5e-2
        The highest uncorrected p-value for features to keep.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    mutual_info_classif : Mutual information for a discrete target.
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    f_regression : F-value between label/feature for regression tasks.
    mutual_info_regression : Mutual information for a continuous target.
    SelectPercentile : Select features based on percentile of the highest
        scores.
    SelectKBest : Select features based on the k highest scores.
    SelectFpr : Select features based on a false positive rate test.
    SelectFwe : Select features based on family-wise error rate.
    GenericUnivariateSelect : Univariate feature selector with configurable
        mode.

    References
    ----------
    https://en.wikipedia.org/wiki/False_discovery_rate

    Examples
    --------
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.feature_selection import SelectFdr, chi2
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> X.shape
    (569, 30)
    >>> X_new = SelectFdr(chi2, alpha=0.01).fit_transform(X, y)
    >>> X_new.shape
    (569, 16)
    """

    _parameter_constraints: dict = {
        **_BaseFilter._parameter_constraints,
        "alpha": [Interval(Real, 0, 1, closed="both")],
    }

    def __init__(self, score_func=f_classif, *, alpha=5e-2):
        super().__init__(score_func=score_func)
        self.alpha = alpha

    def _get_support_mask(self):
        check_is_fitted(self)

        # Benjamini-Hochberg: compare the sorted p-values with their critical
        # values alpha * rank / n_features, then keep every feature whose
        # p-value is not larger than the largest p-value that passes.
        n_features = len(self.pvalues_)
        sv = np.sort(self.pvalues_)
        selected = sv[
            sv <= float(self.alpha) / n_features * np.arange(1, n_features + 1)
        ]
        if selected.size == 0:
            return np.zeros_like(self.pvalues_, dtype=bool)
        return self.pvalues_ <= selected.max()


class SelectFwe(_BaseFilter):
    """Filter: Select the p-values corresponding to Family-wise error rate.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues).
        Default is f_classif (see below "See Also"). The default function only
        works with classification tasks.

    alpha : float, default=5e-2
        The highest uncorrected p-value for features to keep.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    f_regression : F-value between label/feature for regression tasks.
    SelectPercentile : Select features based on percentile of the highest
        scores.
    SelectKBest : Select features based on the k highest scores.
    SelectFpr : Select features based on a false positive rate test.
    SelectFdr : Select features based on an estimated false discovery rate.
    GenericUnivariateSelect : Univariate feature selector with configurable
        mode.

    Examples
    --------
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.feature_selection import SelectFwe, chi2
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> X.shape
    (569, 30)
    >>> X_new = SelectFwe(chi2, alpha=0.01).fit_transform(X, y)
    >>> X_new.shape
    (569, 15)
    """

    _parameter_constraints: dict = {
        **_BaseFilter._parameter_constraints,
        "alpha": [Interval(Real, 0, 1, closed="both")],
    }

    def __init__(self, score_func=f_classif, *, alpha=5e-2):
        super().__init__(score_func=score_func)
        self.alpha = alpha

    def _get_support_mask(self):
        check_is_fitted(self)

        # Bonferroni correction: compare each p-value with alpha / n_features.
        return self.pvalues_ < self.alpha / len(self.pvalues_)


######################################################################
# Generic filter
######################################################################

# TODO this class should fit on either p-values or scores,
# depending on the mode.
class GenericUnivariateSelect(_BaseFilter):
    """Univariate feature selector with configurable strategy.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    score_func : callable, default=f_classif
        Function taking two arrays X and y, and returning a pair of arrays
        (scores, pvalues). For modes 'percentile' or 'k_best' it can return
        a single array of scores.

    mode : {'percentile', 'k_best', 'fpr', 'fdr', 'fwe'}, default='percentile'
        Feature selection mode.

    param : "all", float or int, default=1e-5
        Parameter of the corresponding mode.

    Attributes
    ----------
    scores_ : array-like of shape (n_features,)
        Scores of features.

    pvalues_ : array-like of shape (n_features,)
        p-values of feature scores, None if `score_func` returned scores only.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    f_classif : ANOVA F-value between label/feature for classification tasks.
    mutual_info_classif : Mutual information for a discrete target.
    chi2 : Chi-squared stats of non-negative features for classification tasks.
    f_regression : F-value between label/feature for regression tasks.
    mutual_info_regression : Mutual information for a continuous target.
    SelectPercentile : Select features based on percentile of the highest
        scores.
    SelectKBest : Select features based on the k highest scores.
    SelectFpr : Select features based on a false positive rate test.
    SelectFdr : Select features based on an estimated false discovery rate.
    SelectFwe : Select features based on family-wise error rate.

    Examples
    --------
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.feature_selection import GenericUnivariateSelect, chi2
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> X.shape
    (569, 30)
    >>> transformer = GenericUnivariateSelect(chi2, mode='k_best', param=20)
    >>> X_new = transformer.fit_transform(X, y)
    >>> X_new.shape
    (569, 20)
    """

    _selection_modes: dict = {
        "percentile": SelectPercentile,
        "k_best": SelectKBest,
        "fpr": SelectFpr,
        "fdr": SelectFdr,
        "fwe": SelectFwe,
    }

    _parameter_constraints: dict = {
        **_BaseFilter._parameter_constraints,
        "mode": [StrOptions(set(_selection_modes.keys()))],
        "param": [Interval(Real, 0, None, closed="left"), StrOptions({"all"})],
    }

    def __init__(self, score_func=f_classif, *, mode="percentile", param=1e-5):
        super().__init__(score_func=score_func)
        self.mode = mode
        self.param = param

    def _make_selector(self):
        selector = self._selection_modes[self.mode](score_func=self.score_func)

        # Now perform some acrobatics to set the right named parameter in
        # the selector
        possible_params = selector._get_param_names()
        possible_params.remove("score_func")
        selector.set_params(**{possible_params[0]: self.param})

        return selector

    def _more_tags(self):
        return {"preserves_dtype": [np.float64, np.float32]}

    def _check_params(self, X, y):
        self._make_selector()._check_params(X, y)

    def _get_support_mask(self):
        check_is_fitted(self)

        selector = self._make_selector()
        selector.pvalues_ = self.pvalues_
        selector.scores_ = self.scores_
        return selector._get_support_mask()