- """Univariate features selection."""
- # Authors: V. Michel, B. Thirion, G. Varoquaux, A. Gramfort, E. Duchesnay.
- # L. Buitinck, A. Joly
- # License: BSD 3 clause
- import warnings
- from numbers import Integral, Real
- import numpy as np
- from scipy import special, stats
- from scipy.sparse import issparse
- from ..base import BaseEstimator, _fit_context
- from ..preprocessing import LabelBinarizer
- from ..utils import as_float_array, check_array, check_X_y, safe_mask, safe_sqr
- from ..utils._param_validation import Interval, StrOptions, validate_params
- from ..utils.extmath import row_norms, safe_sparse_dot
- from ..utils.validation import check_is_fitted
- from ._base import SelectorMixin
- def _clean_nans(scores):
- """
- Fixes Issue #1240: NaNs can't be properly compared, so change them to the
- smallest value of the scores dtype. -inf seems to be unreliable.
- """
- # XXX where should this function be called? fit? scoring functions
- # themselves?
- scores = as_float_array(scores, copy=True)
- scores[np.isnan(scores)] = np.finfo(scores.dtype).min
- return scores
- ######################################################################
- # Scoring functions
- # The following function is a rewrite of scipy.stats.f_oneway.
- # Unlike the scipy.stats.f_oneway implementation, it avoids copying the
- # data while still leaving the inputs unchanged.
- def f_oneway(*args):
- """Perform a 1-way ANOVA.
- The one-way ANOVA tests the null hypothesis that 2 or more groups have
- the same population mean. The test is applied to samples from two or
- more groups, possibly with differing sizes.
- Read more in the :ref:`User Guide <univariate_feature_selection>`.
- Parameters
- ----------
- *args : {array-like, sparse matrix}
- Sample1, sample2... The sample measurements should be given as
- arguments.
- Returns
- -------
- f_statistic : float
- The computed F-value of the test.
- p_value : float
- The associated p-value from the F-distribution.
- Notes
- -----
- The ANOVA test has important assumptions that must be satisfied in order
- for the associated p-value to be valid.
- 1. The samples are independent
- 2. Each sample is from a normally distributed population
- 3. The population standard deviations of the groups are all equal. This
- property is known as homoscedasticity.
- If these assumptions are not true for a given set of data, it may still
- be possible to use the Kruskal-Wallis H-test (:func:`scipy.stats.kruskal`),
- although with some loss of power.
- The algorithm is from Heiman[2], pp.394-7.
- See ``scipy.stats.f_oneway``, which should give the same results while
- being less efficient.
- References
- ----------
- .. [1] Lowry, Richard. "Concepts and Applications of Inferential
- Statistics". Chapter 14.
- http://vassarstats.net/textbook
- .. [2] Heiman, G.W. Research Methods in Statistics. 2002.
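- Examples
- --------
- A minimal usage sketch on random per-group samples, assuming ``f_oneway``
- is importable from ``sklearn.feature_selection``; only the output shapes
- are shown, since the exact statistics depend on the data:
- >>> import numpy as np
- >>> from sklearn.feature_selection import f_oneway
- >>> rng = np.random.RandomState(0)
- >>> sample1, sample2 = rng.randn(30, 4), rng.randn(40, 4) + 1
- >>> f_statistic, p_values = f_oneway(sample1, sample2)
- >>> f_statistic.shape
- (4,)
- >>> p_values.shape
- (4,)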
- """
- n_classes = len(args)
- args = [as_float_array(a) for a in args]
- n_samples_per_class = np.array([a.shape[0] for a in args])
- n_samples = np.sum(n_samples_per_class)
- ss_alldata = sum(safe_sqr(a).sum(axis=0) for a in args)
- sums_args = [np.asarray(a.sum(axis=0)) for a in args]
- square_of_sums_alldata = sum(sums_args) ** 2
- square_of_sums_args = [s**2 for s in sums_args]
- sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
- ssbn = 0.0
- for k, _ in enumerate(args):
- ssbn += square_of_sums_args[k] / n_samples_per_class[k]
- ssbn -= square_of_sums_alldata / float(n_samples)
- sswn = sstot - ssbn
- dfbn = n_classes - 1
- dfwn = n_samples - n_classes
- msb = ssbn / float(dfbn)
- msw = sswn / float(dfwn)
- constant_features_idx = np.where(msw == 0.0)[0]
- if np.nonzero(msb)[0].size != msb.size and constant_features_idx.size:
- warnings.warn("Features %s are constant." % constant_features_idx, UserWarning)
- f = msb / msw
- # flatten matrix to vector in sparse case
- f = np.asarray(f).ravel()
- prob = special.fdtrc(dfbn, dfwn, f)
- return f, prob
- @validate_params(
- {
- "X": ["array-like", "sparse matrix"],
- "y": ["array-like"],
- },
- prefer_skip_nested_validation=True,
- )
- def f_classif(X, y):
- """Compute the ANOVA F-value for the provided sample.
- Read more in the :ref:`User Guide <univariate_feature_selection>`.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The set of regressors that will be tested sequentially.
- y : array-like of shape (n_samples,)
- The target vector.
- Returns
- -------
- f_statistic : ndarray of shape (n_features,)
- F-statistic for each feature.
- p_values : ndarray of shape (n_features,)
- P-values associated with the F-statistic.
- See Also
- --------
- chi2 : Chi-squared stats of non-negative features for classification tasks.
- f_regression : F-value between label/feature for regression tasks.
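- Examples
- --------
- A minimal usage sketch on a small synthetic classification problem; only
- the output shapes are shown, since the exact values depend on the data:
- >>> from sklearn.datasets import make_classification
- >>> from sklearn.feature_selection import f_classif
- >>> X, y = make_classification(n_samples=80, n_features=10, random_state=42)
- >>> f_statistic, p_values = f_classif(X, y)
- >>> f_statistic.shape
- (10,)
- >>> p_values.shape
- (10,)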
- """
- X, y = check_X_y(X, y, accept_sparse=["csr", "csc", "coo"])
- args = [X[safe_mask(X, y == k)] for k in np.unique(y)]
- return f_oneway(*args)
- def _chisquare(f_obs, f_exp):
- """Fast replacement for scipy.stats.chisquare.
- Version from https://github.com/scipy/scipy/pull/2525 with additional
- optimizations.
- """
- f_obs = np.asarray(f_obs, dtype=np.float64)
- k = len(f_obs)
- # Reuse f_obs for chi-squared statistics
- chisq = f_obs
- chisq -= f_exp
- chisq **= 2
- with np.errstate(invalid="ignore"):
- chisq /= f_exp
- chisq = chisq.sum(axis=0)
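- # chdtrc is the chi-squared survival function; with one row of observed
- # counts per class, the statistic has k - 1 degrees of freedom.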
- return chisq, special.chdtrc(k - 1, chisq)
- @validate_params(
- {
- "X": ["array-like", "sparse matrix"],
- "y": ["array-like"],
- },
- prefer_skip_nested_validation=True,
- )
- def chi2(X, y):
- """Compute chi-squared stats between each non-negative feature and class.
- This score can be used to select the `n_features` features with the
- highest values for the test chi-squared statistic from X, which must
- contain only **non-negative features** such as booleans or frequencies
- (e.g., term counts in document classification), relative to the classes.
- Recall that the chi-square test measures dependence between stochastic
- variables, so using this function "weeds out" the features that are the
- most likely to be independent of class and therefore irrelevant for
- classification.
- Read more in the :ref:`User Guide <univariate_feature_selection>`.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Sample vectors.
- y : array-like of shape (n_samples,)
- Target vector (class labels).
- Returns
- -------
- chi2 : ndarray of shape (n_features,)
- Chi2 statistics for each feature.
- p_values : ndarray of shape (n_features,)
- P-values for each feature.
- See Also
- --------
- f_classif : ANOVA F-value between label/feature for classification tasks.
- f_regression : F-value between label/feature for regression tasks.
- Notes
- -----
- Complexity of this algorithm is O(n_classes * n_features).
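- Examples
- --------
- A minimal usage sketch on a tiny non-negative count matrix; only the
- output shapes are shown, since the exact values depend on the data:
- >>> import numpy as np
- >>> from sklearn.feature_selection import chi2
- >>> X = np.array([[1, 1, 3], [0, 1, 5], [5, 4, 1], [6, 6, 2]])
- >>> y = np.array([0, 0, 1, 1])
- >>> chi2_stats, p_values = chi2(X, y)
- >>> chi2_stats.shape
- (3,)
- >>> p_values.shape
- (3,)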
- """
- # XXX: we might want to do some of the following in logspace instead for
- # numerical stability.
- # Converting X to float allows getting better performance for the
- # safe_sparse_dot call made below.
- X = check_array(X, accept_sparse="csr", dtype=(np.float64, np.float32))
- if np.any((X.data if issparse(X) else X) < 0):
- raise ValueError("Input X must be non-negative.")
- # Use a sparse representation for Y by default to reduce memory usage when
- # y has many unique classes.
- Y = LabelBinarizer(sparse_output=True).fit_transform(y)
- if Y.shape[1] == 1:
- Y = Y.toarray()
- Y = np.append(1 - Y, Y, axis=1)
- observed = safe_sparse_dot(Y.T, X) # n_classes * n_features
- if issparse(observed):
- # convert back to a dense array before calling _chisquare
- # XXX: could _chisquare be reimplemented to accept sparse matrices for
- # cases where both n_classes and n_features are large (and X is
- # sparse)?
- observed = observed.toarray()
- feature_count = X.sum(axis=0).reshape(1, -1)
- class_prob = Y.mean(axis=0).reshape(1, -1)
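- # Under the null hypothesis of independence, the expected count for each
- # (class, feature) cell is the class probability times the total feature count.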
- expected = np.dot(class_prob.T, feature_count)
- return _chisquare(observed, expected)
- @validate_params(
- {
- "X": ["array-like", "sparse matrix"],
- "y": ["array-like"],
- "center": ["boolean"],
- "force_finite": ["boolean"],
- },
- prefer_skip_nested_validation=True,
- )
- def r_regression(X, y, *, center=True, force_finite=True):
- """Compute Pearson's r for each features and the target.
- Pearson's r is also known as the Pearson correlation coefficient.
- Linear model for testing the individual effect of each of many regressors.
- This is a scoring function to be used in a feature selection procedure, not
- a free-standing feature selection procedure.
- The cross correlation between each regressor and the target is computed
- as::
- E[(X[:, i] - mean(X[:, i])) * (y - mean(y))] / (std(X[:, i]) * std(y))
- For more on usage see the :ref:`User Guide <univariate_feature_selection>`.
- .. versionadded:: 1.0
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The data matrix.
- y : array-like of shape (n_samples,)
- The target vector.
- center : bool, default=True
- Whether or not to center the data matrix `X` and the target vector `y`.
- By default, `X` and `y` will be centered.
- force_finite : bool, default=True
- Whether or not to force the Pearson's R correlation to be finite.
- In the particular case where some features in `X` or the target `y`
- are constant, the Pearson's R correlation is not defined. When
- `force_finite=False`, a correlation of `np.nan` is returned to
- acknowledge this case. When `force_finite=True`, this value will be
- forced to a minimal correlation of `0.0`.
- .. versionadded:: 1.1
- Returns
- -------
- correlation_coefficient : ndarray of shape (n_features,)
- Pearson's R correlation coefficients of features.
- See Also
- --------
- f_regression: Univariate linear regression tests returning f-statistic
- and p-values.
- mutual_info_regression: Mutual information for a continuous target.
- f_classif: ANOVA F-value between label/feature for classification tasks.
- chi2: Chi-squared stats of non-negative features for classification tasks.
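- Examples
- --------
- A minimal usage sketch on a small synthetic regression problem; only the
- output shape is shown, since the exact coefficients depend on the data:
- >>> from sklearn.datasets import make_regression
- >>> from sklearn.feature_selection import r_regression
- >>> X, y = make_regression(n_samples=50, n_features=3, random_state=0)
- >>> r_regression(X, y).shape
- (3,)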
- """
- X, y = check_X_y(X, y, accept_sparse=["csr", "csc", "coo"], dtype=np.float64)
- n_samples = X.shape[0]
- # Compute centered values
- # Note that E[(x - mean(x))*(y - mean(y))] = E[x*(y - mean(y))], so we
- # need not center X
- if center:
- y = y - np.mean(y)
- if issparse(X):
- X_means = X.mean(axis=0).getA1()
- else:
- X_means = X.mean(axis=0)
- # Compute the scaled standard deviations via moments
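- # sum(x**2) - n * mean(x)**2 equals sum((x - mean(x))**2), so the centered
- # norm of each feature is obtained without building a centered (dense) copy of X.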
- X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means**2)
- else:
- X_norms = row_norms(X.T)
- correlation_coefficient = safe_sparse_dot(y, X)
- with np.errstate(divide="ignore", invalid="ignore"):
- correlation_coefficient /= X_norms
- correlation_coefficient /= np.linalg.norm(y)
- if force_finite and not np.isfinite(correlation_coefficient).all():
- # case where the target or some features are constant
- # the correlation coefficient(s) is/are set to the minimum (i.e. 0.0)
- nan_mask = np.isnan(correlation_coefficient)
- correlation_coefficient[nan_mask] = 0.0
- return correlation_coefficient
- @validate_params(
- {
- "X": ["array-like", "sparse matrix"],
- "y": ["array-like"],
- "center": ["boolean"],
- "force_finite": ["boolean"],
- },
- prefer_skip_nested_validation=True,
- )
- def f_regression(X, y, *, center=True, force_finite=True):
- """Univariate linear regression tests returning F-statistic and p-values.
- Quick linear model for testing the effect of a single regressor,
- sequentially for many regressors.
- This is done in 2 steps:
- 1. The cross correlation between each regressor and the target is computed
- using :func:`r_regression` as::
- E[(X[:, i] - mean(X[:, i])) * (y - mean(y))] / (std(X[:, i]) * std(y))
- 2. It is converted to an F score and then to a p-value.
- :func:`f_regression` is derived from :func:`r_regression` and will rank
- features in the same order if all the features are positively correlated
- with the target.
- Note, however, that contrary to :func:`f_regression`, :func:`r_regression`
- values lie in [-1, 1] and can thus be negative. :func:`f_regression` is
- therefore recommended as a feature selection criterion to identify
- potentially predictive features for a downstream classifier, irrespective of
- the sign of the association with the target variable.
- Furthermore :func:`f_regression` returns p-values while
- :func:`r_regression` does not.
- Read more in the :ref:`User Guide <univariate_feature_selection>`.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The data matrix.
- y : array-like of shape (n_samples,)
- The target vector.
- center : bool, default=True
- Whether or not to center the data matrix `X` and the target vector `y`.
- By default, `X` and `y` will be centered.
- force_finite : bool, default=True
- Whether or not to force the F-statistics and associated p-values to
- be finite. There are two cases where the F-statistic is expected to not
- be finite:
- - when the target `y` or some features in `X` are constant. In this
- case, the Pearson's R correlation is not defined, leading to `np.nan`
- values in the F-statistic and p-value. When
- `force_finite=True`, the F-statistic is set to `0.0` and the
- associated p-value is set to `1.0`.
- - when a feature in `X` is perfectly correlated (or
- anti-correlated) with the target `y`. In this case, the F-statistic
- is expected to be `np.inf`. When `force_finite=True`, the F-statistic
- is set to `np.finfo(dtype).max` and the associated p-value is set to
- `0.0`.
- .. versionadded:: 1.1
- Returns
- -------
- f_statistic : ndarray of shape (n_features,)
- F-statistic for each feature.
- p_values : ndarray of shape (n_features,)
- P-values associated with the F-statistic.
- See Also
- --------
- r_regression: Pearson's R between label/feature for regression tasks.
- f_classif: ANOVA F-value between label/feature for classification tasks.
- chi2: Chi-squared stats of non-negative features for classification tasks.
- SelectKBest: Select features based on the k highest scores.
- SelectFpr: Select features based on a false positive rate test.
- SelectFdr: Select features based on an estimated false discovery rate.
- SelectFwe: Select features based on family-wise error rate.
- SelectPercentile: Select features based on percentile of the highest
- scores.
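- Examples
- --------
- A minimal usage sketch on a small synthetic regression problem; only the
- output shapes are shown, since the exact statistics depend on the data:
- >>> from sklearn.datasets import make_regression
- >>> from sklearn.feature_selection import f_regression
- >>> X, y = make_regression(n_samples=50, n_features=3, random_state=0)
- >>> f_statistic, p_values = f_regression(X, y)
- >>> f_statistic.shape
- (3,)
- >>> p_values.shape
- (3,)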
- """
- correlation_coefficient = r_regression(
- X, y, center=center, force_finite=force_finite
- )
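- # With centering, the implicit model estimates an intercept and a slope
- # (two parameters); without centering, only the slope is estimated.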
- deg_of_freedom = y.size - (2 if center else 1)
- corr_coef_squared = correlation_coefficient**2
- with np.errstate(divide="ignore", invalid="ignore"):
- f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom
- p_values = stats.f.sf(f_statistic, 1, deg_of_freedom)
- if force_finite and not np.isfinite(f_statistic).all():
- # case where there is a perfect (anti-)correlation
- # f-statistics can be set to the maximum and p-values to zero
- mask_inf = np.isinf(f_statistic)
- f_statistic[mask_inf] = np.finfo(f_statistic.dtype).max
- # case where the target or some features are constant
- # f-statistics would be minimum and thus p-values large
- mask_nan = np.isnan(f_statistic)
- f_statistic[mask_nan] = 0.0
- p_values[mask_nan] = 1.0
- return f_statistic, p_values
- ######################################################################
- # Base classes
- class _BaseFilter(SelectorMixin, BaseEstimator):
- """Initialize the univariate feature selection.
- Parameters
- ----------
- score_func : callable
- Function taking two arrays X and y, and returning a pair of arrays
- (scores, pvalues) or a single array with scores.
- """
- _parameter_constraints: dict = {"score_func": [callable]}
- def __init__(self, score_func):
- self.score_func = score_func
- @_fit_context(prefer_skip_nested_validation=True)
- def fit(self, X, y):
- """Run score function on (X, y) and get the appropriate features.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- The training input samples.
- y : array-like of shape (n_samples,)
- The target values (class labels in classification, real numbers in
- regression).
- Returns
- -------
- self : object
- Returns the instance itself.
- """
- X, y = self._validate_data(
- X, y, accept_sparse=["csr", "csc"], multi_output=True
- )
- self._check_params(X, y)
- score_func_ret = self.score_func(X, y)
- if isinstance(score_func_ret, (list, tuple)):
- self.scores_, self.pvalues_ = score_func_ret
- self.pvalues_ = np.asarray(self.pvalues_)
- else:
- self.scores_ = score_func_ret
- self.pvalues_ = None
- self.scores_ = np.asarray(self.scores_)
- return self
- def _check_params(self, X, y):
- pass
- def _more_tags(self):
- return {"requires_y": True}
- ######################################################################
- # Specific filters
- ######################################################################
- class SelectPercentile(_BaseFilter):
- """Select features according to a percentile of the highest scores.
- Read more in the :ref:`User Guide <univariate_feature_selection>`.
- Parameters
- ----------
- score_func : callable, default=f_classif
- Function taking two arrays X and y, and returning a pair of arrays
- (scores, pvalues) or a single array with scores.
- Default is f_classif (see below "See Also"). The default function only
- works with classification tasks.
- .. versionadded:: 0.18
- percentile : int, default=10
- Percent of features to keep.
- Attributes
- ----------
- scores_ : array-like of shape (n_features,)
- Scores of features.
- pvalues_ : array-like of shape (n_features,)
- p-values of feature scores, None if `score_func` returned only scores.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- See Also
- --------
- f_classif : ANOVA F-value between label/feature for classification tasks.
- mutual_info_classif : Mutual information for a discrete target.
- chi2 : Chi-squared stats of non-negative features for classification tasks.
- f_regression : F-value between label/feature for regression tasks.
- mutual_info_regression : Mutual information for a continuous target.
- SelectKBest : Select features based on the k highest scores.
- SelectFpr : Select features based on a false positive rate test.
- SelectFdr : Select features based on an estimated false discovery rate.
- SelectFwe : Select features based on family-wise error rate.
- GenericUnivariateSelect : Univariate feature selector with configurable
- mode.
- Notes
- -----
- Ties between features with equal scores will be broken in an unspecified
- way.
- Examples
- --------
- >>> from sklearn.datasets import load_digits
- >>> from sklearn.feature_selection import SelectPercentile, chi2
- >>> X, y = load_digits(return_X_y=True)
- >>> X.shape
- (1797, 64)
- >>> X_new = SelectPercentile(chi2, percentile=10).fit_transform(X, y)
- >>> X_new.shape
- (1797, 7)
- """
- _parameter_constraints: dict = {
- **_BaseFilter._parameter_constraints,
- "percentile": [Interval(Real, 0, 100, closed="both")],
- }
- def __init__(self, score_func=f_classif, *, percentile=10):
- super().__init__(score_func=score_func)
- self.percentile = percentile
- def _get_support_mask(self):
- check_is_fitted(self)
- # Cater for NaNs
- if self.percentile == 100:
- return np.ones(len(self.scores_), dtype=bool)
- elif self.percentile == 0:
- return np.zeros(len(self.scores_), dtype=bool)
- scores = _clean_nans(self.scores_)
- threshold = np.percentile(scores, 100 - self.percentile)
- mask = scores > threshold
- ties = np.where(scores == threshold)[0]
- if len(ties):
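- # Keep only enough of the tied features to reach, but not exceed, the
- # number of features implied by the requested percentile.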
- max_feats = int(len(scores) * self.percentile / 100)
- kept_ties = ties[: max_feats - mask.sum()]
- mask[kept_ties] = True
- return mask
- class SelectKBest(_BaseFilter):
- """Select features according to the k highest scores.
- Read more in the :ref:`User Guide <univariate_feature_selection>`.
- Parameters
- ----------
- score_func : callable, default=f_classif
- Function taking two arrays X and y, and returning a pair of arrays
- (scores, pvalues) or a single array with scores.
- Default is f_classif (see below "See Also"). The default function only
- works with classification tasks.
- .. versionadded:: 0.18
- k : int or "all", default=10
- Number of top features to select.
- The "all" option bypasses selection, for use in a parameter search.
- Attributes
- ----------
- scores_ : array-like of shape (n_features,)
- Scores of features.
- pvalues_ : array-like of shape (n_features,)
- p-values of feature scores, None if `score_func` returned only scores.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- See Also
- --------
- f_classif: ANOVA F-value between label/feature for classification tasks.
- mutual_info_classif: Mutual information for a discrete target.
- chi2: Chi-squared stats of non-negative features for classification tasks.
- f_regression: F-value between label/feature for regression tasks.
- mutual_info_regression: Mutual information for a continuous target.
- SelectPercentile: Select features based on percentile of the highest
- scores.
- SelectFpr : Select features based on a false positive rate test.
- SelectFdr : Select features based on an estimated false discovery rate.
- SelectFwe : Select features based on family-wise error rate.
- GenericUnivariateSelect : Univariate feature selector with configurable
- mode.
- Notes
- -----
- Ties between features with equal scores will be broken in an unspecified
- way.
- Examples
- --------
- >>> from sklearn.datasets import load_digits
- >>> from sklearn.feature_selection import SelectKBest, chi2
- >>> X, y = load_digits(return_X_y=True)
- >>> X.shape
- (1797, 64)
- >>> X_new = SelectKBest(chi2, k=20).fit_transform(X, y)
- >>> X_new.shape
- (1797, 20)
- """
- _parameter_constraints: dict = {
- **_BaseFilter._parameter_constraints,
- "k": [StrOptions({"all"}), Interval(Integral, 0, None, closed="left")],
- }
- def __init__(self, score_func=f_classif, *, k=10):
- super().__init__(score_func=score_func)
- self.k = k
- def _check_params(self, X, y):
- if not isinstance(self.k, str) and self.k > X.shape[1]:
- raise ValueError(
- f"k should be <= n_features = {X.shape[1]}; "
- f"got {self.k}. Use k='all' to return all features."
- )
- def _get_support_mask(self):
- check_is_fitted(self)
- if self.k == "all":
- return np.ones(self.scores_.shape, dtype=bool)
- elif self.k == 0:
- return np.zeros(self.scores_.shape, dtype=bool)
- else:
- scores = _clean_nans(self.scores_)
- mask = np.zeros(scores.shape, dtype=bool)
- # Request a stable sort. Mergesort takes more memory (~40MB per
- # megafeature on x86-64).
- mask[np.argsort(scores, kind="mergesort")[-self.k :]] = 1
- return mask
- class SelectFpr(_BaseFilter):
- """Filter: Select the pvalues below alpha based on a FPR test.
- FPR test stands for False Positive Rate test. It controls the total
- amount of false detections.
- Read more in the :ref:`User Guide <univariate_feature_selection>`.
- Parameters
- ----------
- score_func : callable, default=f_classif
- Function taking two arrays X and y, and returning a pair of arrays
- (scores, pvalues).
- Default is f_classif (see below "See Also"). The default function only
- works with classification tasks.
- alpha : float, default=5e-2
- Features with p-values less than `alpha` are selected.
- Attributes
- ----------
- scores_ : array-like of shape (n_features,)
- Scores of features.
- pvalues_ : array-like of shape (n_features,)
- p-values of feature scores.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- See Also
- --------
- f_classif : ANOVA F-value between label/feature for classification tasks.
- chi2 : Chi-squared stats of non-negative features for classification tasks.
- mutual_info_classif: Mutual information for a discrete target.
- f_regression : F-value between label/feature for regression tasks.
- mutual_info_regression : Mutual information for a continuous target.
- SelectPercentile : Select features based on percentile of the highest
- scores.
- SelectKBest : Select features based on the k highest scores.
- SelectFdr : Select features based on an estimated false discovery rate.
- SelectFwe : Select features based on family-wise error rate.
- GenericUnivariateSelect : Univariate feature selector with configurable
- mode.
- Examples
- --------
- >>> from sklearn.datasets import load_breast_cancer
- >>> from sklearn.feature_selection import SelectFpr, chi2
- >>> X, y = load_breast_cancer(return_X_y=True)
- >>> X.shape
- (569, 30)
- >>> X_new = SelectFpr(chi2, alpha=0.01).fit_transform(X, y)
- >>> X_new.shape
- (569, 16)
- """
- _parameter_constraints: dict = {
- **_BaseFilter._parameter_constraints,
- "alpha": [Interval(Real, 0, 1, closed="both")],
- }
- def __init__(self, score_func=f_classif, *, alpha=5e-2):
- super().__init__(score_func=score_func)
- self.alpha = alpha
- def _get_support_mask(self):
- check_is_fitted(self)
- return self.pvalues_ < self.alpha
- class SelectFdr(_BaseFilter):
- """Filter: Select the p-values for an estimated false discovery rate.
- This uses the Benjamini-Hochberg procedure. ``alpha`` is an upper bound
- on the expected false discovery rate.
- Read more in the :ref:`User Guide <univariate_feature_selection>`.
- Parameters
- ----------
- score_func : callable, default=f_classif
- Function taking two arrays X and y, and returning a pair of arrays
- (scores, pvalues).
- Default is f_classif (see below "See Also"). The default function only
- works with classification tasks.
- alpha : float, default=5e-2
- The highest uncorrected p-value for features to keep.
- Attributes
- ----------
- scores_ : array-like of shape (n_features,)
- Scores of features.
- pvalues_ : array-like of shape (n_features,)
- p-values of feature scores.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- See Also
- --------
- f_classif : ANOVA F-value between label/feature for classification tasks.
- mutual_info_classif : Mutual information for a discrete target.
- chi2 : Chi-squared stats of non-negative features for classification tasks.
- f_regression : F-value between label/feature for regression tasks.
- mutual_info_regression : Mutual information for a continuous target.
- SelectPercentile : Select features based on percentile of the highest
- scores.
- SelectKBest : Select features based on the k highest scores.
- SelectFpr : Select features based on a false positive rate test.
- SelectFwe : Select features based on family-wise error rate.
- GenericUnivariateSelect : Univariate feature selector with configurable
- mode.
- References
- ----------
- https://en.wikipedia.org/wiki/False_discovery_rate
- Examples
- --------
- >>> from sklearn.datasets import load_breast_cancer
- >>> from sklearn.feature_selection import SelectFdr, chi2
- >>> X, y = load_breast_cancer(return_X_y=True)
- >>> X.shape
- (569, 30)
- >>> X_new = SelectFdr(chi2, alpha=0.01).fit_transform(X, y)
- >>> X_new.shape
- (569, 16)
- """
- _parameter_constraints: dict = {
- **_BaseFilter._parameter_constraints,
- "alpha": [Interval(Real, 0, 1, closed="both")],
- }
- def __init__(self, score_func=f_classif, *, alpha=5e-2):
- super().__init__(score_func=score_func)
- self.alpha = alpha
- def _get_support_mask(self):
- check_is_fitted(self)
- n_features = len(self.pvalues_)
- sv = np.sort(self.pvalues_)
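- # Benjamini-Hochberg: compare the sorted p-values to the critical values
- # alpha * i / n_features and keep every feature whose p-value does not
- # exceed the largest sorted p-value lying under that line.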
- selected = sv[
- sv <= float(self.alpha) / n_features * np.arange(1, n_features + 1)
- ]
- if selected.size == 0:
- return np.zeros_like(self.pvalues_, dtype=bool)
- return self.pvalues_ <= selected.max()
- class SelectFwe(_BaseFilter):
- """Filter: Select the p-values corresponding to Family-wise error rate.
- Read more in the :ref:`User Guide <univariate_feature_selection>`.
- Parameters
- ----------
- score_func : callable, default=f_classif
- Function taking two arrays X and y, and returning a pair of arrays
- (scores, pvalues).
- Default is f_classif (see below "See Also"). The default function only
- works with classification tasks.
- alpha : float, default=5e-2
- The highest uncorrected p-value for features to keep.
- Attributes
- ----------
- scores_ : array-like of shape (n_features,)
- Scores of features.
- pvalues_ : array-like of shape (n_features,)
- p-values of feature scores.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- See Also
- --------
- f_classif : ANOVA F-value between label/feature for classification tasks.
- chi2 : Chi-squared stats of non-negative features for classification tasks.
- f_regression : F-value between label/feature for regression tasks.
- SelectPercentile : Select features based on percentile of the highest
- scores.
- SelectKBest : Select features based on the k highest scores.
- SelectFpr : Select features based on a false positive rate test.
- SelectFdr : Select features based on an estimated false discovery rate.
- GenericUnivariateSelect : Univariate feature selector with configurable
- mode.
- Examples
- --------
- >>> from sklearn.datasets import load_breast_cancer
- >>> from sklearn.feature_selection import SelectFwe, chi2
- >>> X, y = load_breast_cancer(return_X_y=True)
- >>> X.shape
- (569, 30)
- >>> X_new = SelectFwe(chi2, alpha=0.01).fit_transform(X, y)
- >>> X_new.shape
- (569, 15)
- """
- _parameter_constraints: dict = {
- **_BaseFilter._parameter_constraints,
- "alpha": [Interval(Real, 0, 1, closed="both")],
- }
- def __init__(self, score_func=f_classif, *, alpha=5e-2):
- super().__init__(score_func=score_func)
- self.alpha = alpha
- def _get_support_mask(self):
- check_is_fitted(self)
- return self.pvalues_ < self.alpha / len(self.pvalues_)
- ######################################################################
- # Generic filter
- ######################################################################
- # TODO this class should fit on either p-values or scores,
- # depending on the mode.
- class GenericUnivariateSelect(_BaseFilter):
- """Univariate feature selector with configurable strategy.
- Read more in the :ref:`User Guide <univariate_feature_selection>`.
- Parameters
- ----------
- score_func : callable, default=f_classif
- Function taking two arrays X and y, and returning a pair of arrays
- (scores, pvalues). For modes 'percentile' or 'k_best' it can return
- a single array of scores.
- mode : {'percentile', 'k_best', 'fpr', 'fdr', 'fwe'}, default='percentile'
- Feature selection mode.
- param : "all", float or int, default=1e-5
- Parameter of the corresponding mode.
- Attributes
- ----------
- scores_ : array-like of shape (n_features,)
- Scores of features.
- pvalues_ : array-like of shape (n_features,)
- p-values of feature scores, None if `score_func` returned scores only.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- See Also
- --------
- f_classif : ANOVA F-value between label/feature for classification tasks.
- mutual_info_classif : Mutual information for a discrete target.
- chi2 : Chi-squared stats of non-negative features for classification tasks.
- f_regression : F-value between label/feature for regression tasks.
- mutual_info_regression : Mutual information for a continuous target.
- SelectPercentile : Select features based on percentile of the highest
- scores.
- SelectKBest : Select features based on the k highest scores.
- SelectFpr : Select features based on a false positive rate test.
- SelectFdr : Select features based on an estimated false discovery rate.
- SelectFwe : Select features based on family-wise error rate.
- Examples
- --------
- >>> from sklearn.datasets import load_breast_cancer
- >>> from sklearn.feature_selection import GenericUnivariateSelect, chi2
- >>> X, y = load_breast_cancer(return_X_y=True)
- >>> X.shape
- (569, 30)
- >>> transformer = GenericUnivariateSelect(chi2, mode='k_best', param=20)
- >>> X_new = transformer.fit_transform(X, y)
- >>> X_new.shape
- (569, 20)
- """
- _selection_modes: dict = {
- "percentile": SelectPercentile,
- "k_best": SelectKBest,
- "fpr": SelectFpr,
- "fdr": SelectFdr,
- "fwe": SelectFwe,
- }
- _parameter_constraints: dict = {
- **_BaseFilter._parameter_constraints,
- "mode": [StrOptions(set(_selection_modes.keys()))],
- "param": [Interval(Real, 0, None, closed="left"), StrOptions({"all"})],
- }
- def __init__(self, score_func=f_classif, *, mode="percentile", param=1e-5):
- super().__init__(score_func=score_func)
- self.mode = mode
- self.param = param
- def _make_selector(self):
- selector = self._selection_modes[self.mode](score_func=self.score_func)
- # Now perform some acrobatics to set the right named parameter in
- # the selector
- possible_params = selector._get_param_names()
- possible_params.remove("score_func")
- selector.set_params(**{possible_params[0]: self.param})
- return selector
- def _more_tags(self):
- return {"preserves_dtype": [np.float64, np.float32]}
- def _check_params(self, X, y):
- self._make_selector()._check_params(X, y)
- def _get_support_mask(self):
- check_is_fitted(self)
- selector = self._make_selector()
- selector.pvalues_ = self.pvalues_
- selector.scores_ = self.scores_
- return selector._get_support_mask()