- """
- Todo: cross-check the F-value with stats model
- """
import itertools
import warnings

import numpy as np
import pytest
from numpy.testing import assert_allclose
from scipy import sparse, stats

from sklearn.datasets import load_iris, make_classification, make_regression
from sklearn.feature_selection import (
    GenericUnivariateSelect,
    SelectFdr,
    SelectFpr,
    SelectFwe,
    SelectKBest,
    SelectPercentile,
    chi2,
    f_classif,
    f_oneway,
    f_regression,
    mutual_info_classif,
    mutual_info_regression,
    r_regression,
)
from sklearn.utils import safe_mask
from sklearn.utils._testing import (
    _convert_container,
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
    ignore_warnings,
)

##############################################################################
# Test the score functions


def test_f_oneway_vs_scipy_stats():
    # Test that our f_oneway gives the same result as scipy.stats
    rng = np.random.RandomState(0)
    X1 = rng.randn(10, 3)
    X2 = 1 + rng.randn(10, 3)
    f, pv = stats.f_oneway(X1, X2)
    f2, pv2 = f_oneway(X1, X2)
    assert np.allclose(f, f2)
    assert np.allclose(pv, pv2)

def test_f_oneway_ints():
    # Smoke test f_oneway on integers: check that it does not raise casting
    # errors with recent numpy versions
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(10, 10))
    y = np.arange(10)
    fint, pint = f_oneway(X, y)

    # test that it gives the same result as with float
    f, p = f_oneway(X.astype(float), y)
    assert_array_almost_equal(f, fint, decimal=4)
    assert_array_almost_equal(p, pint, decimal=4)

def test_f_classif():
    # Test whether the F test yields meaningful results
    # on a simple simulated classification problem
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    F, pv = f_classif(X, y)
    F_sparse, pv_sparse = f_classif(sparse.csr_matrix(X), y)
    assert (F > 0).all()
    assert (pv > 0).all()
    assert (pv < 1).all()
    assert (pv[:5] < 0.05).all()
    assert (pv[5:] > 1.0e-4).all()
    assert_array_almost_equal(F_sparse, F)
    assert_array_almost_equal(pv_sparse, pv)


@pytest.mark.parametrize("center", [True, False])
def test_r_regression(center):
    X, y = make_regression(
        n_samples=2000, n_features=20, n_informative=5, shuffle=False, random_state=0
    )
    corr_coeffs = r_regression(X, y, center=center)
    assert (-1 < corr_coeffs).all()
    assert (corr_coeffs < 1).all()

    sparse_X = _convert_container(X, "sparse")
    sparse_corr_coeffs = r_regression(sparse_X, y, center=center)
    assert_allclose(sparse_corr_coeffs, corr_coeffs)

    # Testing against numpy for reference
    Z = np.hstack((X, y[:, np.newaxis]))
    correlation_matrix = np.corrcoef(Z, rowvar=False)
    np_corr_coeffs = correlation_matrix[:-1, -1]
    assert_array_almost_equal(np_corr_coeffs, corr_coeffs, decimal=3)

def test_f_regression():
    # Test whether the F test yields meaningful results
    # on a simple simulated regression problem
    X, y = make_regression(
        n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0
    )

    F, pv = f_regression(X, y)
    assert (F > 0).all()
    assert (pv > 0).all()
    assert (pv < 1).all()
    assert (pv[:5] < 0.05).all()
    assert (pv[5:] > 1.0e-4).all()

    # with centering, compare with sparse
    F, pv = f_regression(X, y, center=True)
    F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=True)
    assert_allclose(F_sparse, F)
    assert_allclose(pv_sparse, pv)

    # again without centering, compare with sparse
    F, pv = f_regression(X, y, center=False)
    F_sparse, pv_sparse = f_regression(sparse.csr_matrix(X), y, center=False)
    assert_allclose(F_sparse, F)
    assert_allclose(pv_sparse, pv)

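# Hedged sketch, not part of the original suite: f_regression is documented to
# derive its statistic from r_regression correlations via
# F = r**2 / (1 - r**2) * dof; the dof formula below (n_samples - 2 with
# centering, n_samples - 1 without) is an assumption based on that documented
# relationship.
def test_f_regression_matches_r_regression_sketch():
    X, y = make_regression(
        n_samples=50, n_features=3, n_informative=2, shuffle=False, random_state=0
    )
    for center in (True, False):
        corr = r_regression(X, y, center=center)
        dof = X.shape[0] - (2 if center else 1)
        expected_f = corr**2 / (1 - corr**2) * dof
        f_stat, _ = f_regression(X, y, center=center)
        assert_allclose(f_stat, expected_f)
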
def test_f_regression_input_dtype():
    # Test whether f_regression returns the same value
    # for any numeric dtype
    rng = np.random.RandomState(0)
    X = rng.rand(10, 20)
    y = np.arange(10).astype(int)

    F1, pv1 = f_regression(X, y)
    F2, pv2 = f_regression(X, y.astype(float))
    assert_allclose(F1, F2, rtol=1e-5)
    assert_allclose(pv1, pv2, rtol=1e-5)

def test_f_regression_center():
    # Test whether f_regression preserves dof according to 'center' argument
    # We use two centered variates so we have a simple relationship between
    # F-score with variates centering and F-score without variates centering.
    # Create toy example
    X = np.arange(-5, 6).reshape(-1, 1)  # X has zero mean
    n_samples = X.size
    Y = np.ones(n_samples)
    Y[::2] *= -1.0
    Y[0] = 0.0  # make the mean of Y zero

    F1, _ = f_regression(X, Y, center=True)
    F2, _ = f_regression(X, Y, center=False)
    assert_allclose(F1 * (n_samples - 1.0) / (n_samples - 2.0), F2)
    assert_almost_equal(F2[0], 0.232558139)  # value from statsmodels OLS

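# Hedged sketch of how the statsmodels reference value above could be
# reproduced; not part of the original suite. It is skipped when statsmodels
# is not installed, and the equivalence between `center=False` and a
# no-intercept OLS fit is an assumption about statsmodels' F statistic.
def test_f_regression_center_vs_statsmodels_sketch():
    sm = pytest.importorskip("statsmodels.api")
    X = np.arange(-5, 6).reshape(-1, 1).astype(float)
    Y = np.ones(X.size)
    Y[::2] *= -1.0
    Y[0] = 0.0

    # Without an added constant, statsmodels fits through the origin, which
    # should correspond to f_regression(center=False).
    ols_fit = sm.OLS(Y, X).fit()
    F2, _ = f_regression(X, Y, center=False)
    assert_allclose(F2[0], ols_fit.fvalue, rtol=1e-6)
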
@pytest.mark.parametrize(
    "X, y, expected_corr_coef, force_finite",
    [
        (
            # A feature in X is constant - forcing finite
            np.array([[2, 1], [2, 0], [2, 10], [2, 4]]),
            np.array([0, 1, 1, 0]),
            np.array([0.0, 0.32075]),
            True,
        ),
        (
            # The target y is constant - forcing finite
            np.array([[5, 1], [3, 0], [2, 10], [8, 4]]),
            np.array([0, 0, 0, 0]),
            np.array([0.0, 0.0]),
            True,
        ),
        (
            # A feature in X is constant - not forcing finite
            np.array([[2, 1], [2, 0], [2, 10], [2, 4]]),
            np.array([0, 1, 1, 0]),
            np.array([np.nan, 0.32075]),
            False,
        ),
        (
            # The target y is constant - not forcing finite
            np.array([[5, 1], [3, 0], [2, 10], [8, 4]]),
            np.array([0, 0, 0, 0]),
            np.array([np.nan, np.nan]),
            False,
        ),
    ],
)
def test_r_regression_force_finite(X, y, expected_corr_coef, force_finite):
    """Check the behaviour of `force_finite` for some corner cases with `r_regression`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15672
    """
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        corr_coef = r_regression(X, y, force_finite=force_finite)
    np.testing.assert_array_almost_equal(corr_coef, expected_corr_coef)


@pytest.mark.parametrize(
    "X, y, expected_f_statistic, expected_p_values, force_finite",
    [
        (
            # A feature in X is constant - forcing finite
            np.array([[2, 1], [2, 0], [2, 10], [2, 4]]),
            np.array([0, 1, 1, 0]),
            np.array([0.0, 0.2293578]),
            np.array([1.0, 0.67924985]),
            True,
        ),
        (
            # The target y is constant - forcing finite
            np.array([[5, 1], [3, 0], [2, 10], [8, 4]]),
            np.array([0, 0, 0, 0]),
            np.array([0.0, 0.0]),
            np.array([1.0, 1.0]),
            True,
        ),
        (
            # Feature in X correlated with y - forcing finite
            np.array([[0, 1], [1, 0], [2, 10], [3, 4]]),
            np.array([0, 1, 2, 3]),
            np.array([np.finfo(np.float64).max, 0.845433]),
            np.array([0.0, 0.454913]),
            True,
        ),
        (
            # Feature in X anti-correlated with y - forcing finite
            np.array([[3, 1], [2, 0], [1, 10], [0, 4]]),
            np.array([0, 1, 2, 3]),
            np.array([np.finfo(np.float64).max, 0.845433]),
            np.array([0.0, 0.454913]),
            True,
        ),
        (
            # A feature in X is constant - not forcing finite
            np.array([[2, 1], [2, 0], [2, 10], [2, 4]]),
            np.array([0, 1, 1, 0]),
            np.array([np.nan, 0.2293578]),
            np.array([np.nan, 0.67924985]),
            False,
        ),
        (
            # The target y is constant - not forcing finite
            np.array([[5, 1], [3, 0], [2, 10], [8, 4]]),
            np.array([0, 0, 0, 0]),
            np.array([np.nan, np.nan]),
            np.array([np.nan, np.nan]),
            False,
        ),
        (
            # Feature in X correlated with y - not forcing finite
            np.array([[0, 1], [1, 0], [2, 10], [3, 4]]),
            np.array([0, 1, 2, 3]),
            np.array([np.inf, 0.845433]),
            np.array([0.0, 0.454913]),
            False,
        ),
        (
            # Feature in X anti-correlated with y - not forcing finite
            np.array([[3, 1], [2, 0], [1, 10], [0, 4]]),
            np.array([0, 1, 2, 3]),
            np.array([np.inf, 0.845433]),
            np.array([0.0, 0.454913]),
            False,
        ),
    ],
)
def test_f_regression_corner_case(
    X, y, expected_f_statistic, expected_p_values, force_finite
):
    """Check the behaviour of `force_finite` for some corner cases with `f_regression`.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15672
    """
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        f_statistic, p_values = f_regression(X, y, force_finite=force_finite)
    np.testing.assert_array_almost_equal(f_statistic, expected_f_statistic)
    np.testing.assert_array_almost_equal(p_values, expected_p_values)

def test_f_classif_multi_class():
    # Test whether the F test yields meaningful results
    # on a simple simulated classification problem
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    F, pv = f_classif(X, y)
    assert (F > 0).all()
    assert (pv > 0).all()
    assert (pv < 1).all()
    assert (pv[:5] < 0.05).all()
    assert (pv[5:] > 1.0e-4).all()


def test_select_percentile_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(f_classif, mode="percentile", param=25)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)


def test_select_percentile_classif_sparse():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )
    X = sparse.csr_matrix(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(f_classif, mode="percentile", param=25)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

    X_r2inv = univariate_filter.inverse_transform(X_r2)
    assert sparse.issparse(X_r2inv)
    support_mask = safe_mask(X_r2inv, support)
    assert X_r2inv.shape == X.shape
    assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
    # Check other columns are empty
    assert X_r2inv.getnnz() == X_r.getnnz()

##############################################################################
# Test univariate selection in classification settings


def test_select_kbest_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the k best heuristic
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(f_classif, mode="k_best", param=5)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)


def test_select_kbest_all():
    # Test whether k="all" correctly returns all features.
    X, y = make_classification(
        n_samples=20, n_features=10, shuffle=False, random_state=0
    )

    univariate_filter = SelectKBest(f_classif, k="all")
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_array_equal(X, X_r)

    # Non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/24949
    X_r2 = (
        GenericUnivariateSelect(f_classif, mode="k_best", param="all")
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)


@pytest.mark.parametrize("dtype_in", [np.float32, np.float64])
def test_select_kbest_zero(dtype_in):
    # Test whether k=0 correctly returns no features.
    X, y = make_classification(
        n_samples=20, n_features=10, shuffle=False, random_state=0
    )
    X = X.astype(dtype_in)
    univariate_filter = SelectKBest(f_classif, k=0)
    univariate_filter.fit(X, y)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10, dtype=bool)
    assert_array_equal(support, gtruth)
    with pytest.warns(UserWarning, match="No features were selected"):
        X_selected = univariate_filter.transform(X)
    assert X_selected.shape == (20, 0)
    assert X_selected.dtype == dtype_in

def test_select_heuristics_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the fdr, fwe and fpr heuristics
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    for mode in ["fdr", "fpr", "fwe"]:
        X_r2 = (
            GenericUnivariateSelect(f_classif, mode=mode, param=0.01)
            .fit(X, y)
            .transform(X)
        )
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_allclose(support, gtruth)


##############################################################################
# Test univariate selection in regression settings


def assert_best_scores_kept(score_filter):
    scores = score_filter.scores_
    support = score_filter.get_support()
    assert_allclose(np.sort(scores[support]), np.sort(scores)[-support.sum() :])

def test_select_percentile_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the percentile heuristic
    X, y = make_regression(
        n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0
    )

    univariate_filter = SelectPercentile(f_regression, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = (
        GenericUnivariateSelect(f_regression, mode="percentile", param=25)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
    X_2 = X.copy()
    X_2[:, np.logical_not(support)] = 0
    assert_array_equal(X_2, univariate_filter.inverse_transform(X_r))

    # Check inverse_transform respects dtype
    assert_array_equal(
        X_2.astype(bool), univariate_filter.inverse_transform(X_r.astype(bool))
    )

def test_select_percentile_regression_full():
    # Test whether the relative univariate feature selection
    # keeps all features when percentile=100 is requested.
    X, y = make_regression(
        n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0
    )

    univariate_filter = SelectPercentile(f_regression, percentile=100)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = (
        GenericUnivariateSelect(f_regression, mode="percentile", param=100)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.ones(20)
    assert_array_equal(support, gtruth)

def test_select_kbest_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the k best heuristic
    X, y = make_regression(
        n_samples=200,
        n_features=20,
        n_informative=5,
        shuffle=False,
        random_state=0,
        noise=10,
    )

    univariate_filter = SelectKBest(f_regression, k=5)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = (
        GenericUnivariateSelect(f_regression, mode="k_best", param=5)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)


def test_select_heuristics_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fpr, fdr or fwe heuristics
    X, y = make_regression(
        n_samples=200,
        n_features=20,
        n_informative=5,
        shuffle=False,
        random_state=0,
        noise=10,
    )

    univariate_filter = SelectFpr(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    for mode in ["fdr", "fpr", "fwe"]:
        X_r2 = (
            GenericUnivariateSelect(f_regression, mode=mode, param=0.01)
            .fit(X, y)
            .transform(X)
        )
        assert_array_equal(X_r, X_r2)
        support = univariate_filter.get_support()
        assert_array_equal(support[:5], np.ones((5,), dtype=bool))
        assert np.sum(support[5:] == 1) < 3

def test_boundary_case_chi2():
    # Test a boundary case where every selection heuristic should keep
    # exactly one feature.
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([[1], [0], [0]])
    scores, pvalues = chi2(X, y)
    assert_array_almost_equal(scores, np.array([4.0, 0.71428571]))
    assert_array_almost_equal(pvalues, np.array([0.04550026, 0.39802472]))

    filter_fdr = SelectFdr(chi2, alpha=0.1)
    filter_fdr.fit(X, y)
    support_fdr = filter_fdr.get_support()
    assert_array_equal(support_fdr, np.array([True, False]))

    filter_kbest = SelectKBest(chi2, k=1)
    filter_kbest.fit(X, y)
    support_kbest = filter_kbest.get_support()
    assert_array_equal(support_kbest, np.array([True, False]))

    filter_percentile = SelectPercentile(chi2, percentile=50)
    filter_percentile.fit(X, y)
    support_percentile = filter_percentile.get_support()
    assert_array_equal(support_percentile, np.array([True, False]))

    filter_fpr = SelectFpr(chi2, alpha=0.1)
    filter_fpr.fit(X, y)
    support_fpr = filter_fpr.get_support()
    assert_array_equal(support_fpr, np.array([True, False]))

    filter_fwe = SelectFwe(chi2, alpha=0.1)
    filter_fwe.fit(X, y)
    support_fwe = filter_fwe.get_support()
    assert_array_equal(support_fwe, np.array([True, False]))

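# Hedged sketch, not part of the original suite: chi2 reduces to a per-feature
# contingency test of class-wise feature sums against expectations derived
# from the class frequencies. Recomputing the statistic by hand on the tiny
# example above is an assumption about chi2's internals, but it reproduces the
# expected scores and p-values asserted in the previous test.
def test_chi2_manual_recomputation_sketch():
    X = np.array([[10, 20], [20, 20], [20, 30]])
    y = np.array([1, 0, 0])

    # Observed class-wise feature sums and their expected values under the
    # empirical class probabilities [2/3, 1/3].
    observed = np.array([X[y == c].sum(axis=0) for c in [0, 1]])
    class_prob = np.array([2 / 3, 1 / 3])
    expected = np.outer(class_prob, X.sum(axis=0))

    manual_scores = ((observed - expected) ** 2 / expected).sum(axis=0)
    manual_pvalues = stats.chi2.sf(manual_scores, df=1)  # df = n_classes - 1

    scores, pvalues = chi2(X, y)
    assert_allclose(scores, manual_scores)
    assert_allclose(pvalues, manual_pvalues)
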
- @pytest.mark.parametrize("alpha", [0.001, 0.01, 0.1])
- @pytest.mark.parametrize("n_informative", [1, 5, 10])
- def test_select_fdr_regression(alpha, n_informative):
- # Test that fdr heuristic actually has low FDR.
- def single_fdr(alpha, n_informative, random_state):
- X, y = make_regression(
- n_samples=150,
- n_features=20,
- n_informative=n_informative,
- shuffle=False,
- random_state=random_state,
- noise=10,
- )
- with warnings.catch_warnings(record=True):
- # Warnings can be raised when no features are selected
- # (low alpha or very noisy data)
- univariate_filter = SelectFdr(f_regression, alpha=alpha)
- X_r = univariate_filter.fit(X, y).transform(X)
- X_r2 = (
- GenericUnivariateSelect(f_regression, mode="fdr", param=alpha)
- .fit(X, y)
- .transform(X)
- )
- assert_array_equal(X_r, X_r2)
- support = univariate_filter.get_support()
- num_false_positives = np.sum(support[n_informative:] == 1)
- num_true_positives = np.sum(support[:n_informative] == 1)
- if num_false_positives == 0:
- return 0.0
- false_discovery_rate = num_false_positives / (
- num_true_positives + num_false_positives
- )
- return false_discovery_rate
- # As per Benjamini-Hochberg, the expected false discovery rate
- # should be lower than alpha:
- # FDR = E(FP / (TP + FP)) <= alpha
- false_discovery_rate = np.mean(
- [single_fdr(alpha, n_informative, random_state) for random_state in range(100)]
- )
- assert alpha >= false_discovery_rate
- # Make sure that the empirical false discovery rate increases
- # with alpha:
- if false_discovery_rate != 0:
- assert false_discovery_rate > alpha / 10
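# Hedged sketch, not part of the original suite: SelectFdr is documented to
# implement the Benjamini-Hochberg step-up procedure, so a hand-rolled version
# of that rule should agree with get_support(); the manual threshold below is
# an assumption based on that documented behaviour.
def test_select_fdr_matches_manual_bh_sketch():
    X, y = make_regression(
        n_samples=150,
        n_features=20,
        n_informative=5,
        shuffle=False,
        random_state=0,
        noise=10,
    )
    alpha = 0.1
    _, p_values = f_regression(X, y)

    # Benjamini-Hochberg: keep all p-values no larger than the largest sorted
    # p-value p_(i) satisfying p_(i) <= alpha * i / m.
    m = len(p_values)
    sorted_p = np.sort(p_values)
    below = sorted_p[sorted_p <= alpha * np.arange(1, m + 1) / m]
    if below.size:
        expected_support = p_values <= below.max()
    else:
        expected_support = np.zeros(m, dtype=bool)

    support = SelectFdr(f_regression, alpha=alpha).fit(X, y).get_support()
    assert_array_equal(support, expected_support)
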
def test_select_fwe_regression():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple regression problem
    # with the fwe heuristic
    X, y = make_regression(
        n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0
    )

    univariate_filter = SelectFwe(f_regression, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(f_regression, mode="fwe", param=0.01)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    assert_array_equal(support[:5], np.ones((5,), dtype=bool))
    assert np.sum(support[5:] == 1) < 2

def test_selectkbest_tiebreaking():
    # Test whether SelectKBest actually selects k features in case of ties.
    # Prior to 0.11, SelectKBest would return more features than requested.
    Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]]
    y = [1]
    dummy_score = lambda X, y: (X[0], X[0])
    for X in Xs:
        sel = SelectKBest(dummy_score, k=1)
        X1 = ignore_warnings(sel.fit_transform)([X], y)
        assert X1.shape[1] == 1
        assert_best_scores_kept(sel)

        sel = SelectKBest(dummy_score, k=2)
        X2 = ignore_warnings(sel.fit_transform)([X], y)
        assert X2.shape[1] == 2
        assert_best_scores_kept(sel)


def test_selectpercentile_tiebreaking():
    # Test if SelectPercentile selects the right n_features in case of ties.
    Xs = [[0, 1, 1], [0, 0, 1], [1, 0, 0], [1, 1, 0]]
    y = [1]
    dummy_score = lambda X, y: (X[0], X[0])
    for X in Xs:
        sel = SelectPercentile(dummy_score, percentile=34)
        X1 = ignore_warnings(sel.fit_transform)([X], y)
        assert X1.shape[1] == 1
        assert_best_scores_kept(sel)

        sel = SelectPercentile(dummy_score, percentile=67)
        X2 = ignore_warnings(sel.fit_transform)([X], y)
        assert X2.shape[1] == 2
        assert_best_scores_kept(sel)


def test_tied_pvalues():
    # Test whether k-best and percentiles work with tied pvalues from chi2.
    # chi2 will return the same p-values for the following features, but it
    # will return different scores.
    X0 = np.array([[10000, 9999, 9998], [1, 1, 1]])
    y = [0, 1]

    for perm in itertools.permutations((0, 1, 2)):
        X = X0[:, perm]
        Xt = SelectKBest(chi2, k=2).fit_transform(X, y)
        assert Xt.shape == (2, 2)
        assert 9998 not in Xt

        Xt = SelectPercentile(chi2, percentile=67).fit_transform(X, y)
        assert Xt.shape == (2, 2)
        assert 9998 not in Xt

def test_scorefunc_multilabel():
    # Test whether k-best and percentile selection work with multilabel
    # targets using chi2.
    X = np.array([[10000, 9999, 0], [100, 9999, 0], [1000, 99, 0]])
    y = [[1, 1], [0, 1], [1, 0]]

    Xt = SelectKBest(chi2, k=2).fit_transform(X, y)
    assert Xt.shape == (3, 2)
    assert 0 not in Xt

    Xt = SelectPercentile(chi2, percentile=67).fit_transform(X, y)
    assert Xt.shape == (3, 2)
    assert 0 not in Xt

def test_tied_scores():
    # Test for stable sorting in k-best with tied scores.
    X_train = np.array([[0, 0, 0], [1, 1, 1]])
    y_train = [0, 1]

    for n_features in [1, 2, 3]:
        sel = SelectKBest(chi2, k=n_features).fit(X_train, y_train)
        X_test = sel.transform([[0, 1, 2]])
        assert_array_equal(X_test[0], np.arange(3)[-n_features:])


def test_nans():
    # Assert that SelectKBest and SelectPercentile can handle NaNs.
    # First feature has zero variance to confuse f_classif (ANOVA) and
    # make it return a NaN.
    X = [[0, 1, 0], [0, -1, -1], [0, 0.5, 0.5]]
    y = [1, 0, 1]

    for select in (
        SelectKBest(f_classif, k=2),
        SelectPercentile(f_classif, percentile=67),
    ):
        ignore_warnings(select.fit)(X, y)
        assert_array_equal(select.get_support(indices=True), np.array([1, 2]))


def test_invalid_k():
    X = [[0, 1, 0], [0, -1, -1], [0, 0.5, 0.5]]
    y = [1, 0, 1]

    with pytest.raises(ValueError):
        SelectKBest(k=4).fit(X, y)
    with pytest.raises(ValueError):
        GenericUnivariateSelect(mode="k_best", param=4).fit(X, y)


def test_f_classif_constant_feature():
    # Test that f_classif warns if a feature is constant throughout.
    X, y = make_classification(n_samples=10, n_features=5)
    X[:, 0] = 2.0
    with pytest.warns(UserWarning):
        f_classif(X, y)

def test_no_feature_selected():
    rng = np.random.RandomState(0)

    # Generate random uncorrelated data: a strict univariate test should
    # reject all the features
    X = rng.rand(40, 10)
    y = rng.randint(0, 4, size=40)
    strict_selectors = [
        SelectFwe(alpha=0.01).fit(X, y),
        SelectFdr(alpha=0.01).fit(X, y),
        SelectFpr(alpha=0.01).fit(X, y),
        SelectPercentile(percentile=0).fit(X, y),
        SelectKBest(k=0).fit(X, y),
    ]
    for selector in strict_selectors:
        assert_array_equal(selector.get_support(), np.zeros(10))
        with pytest.warns(UserWarning, match="No features were selected"):
            X_selected = selector.transform(X)
        assert X_selected.shape == (40, 0)

def test_mutual_info_classif():
    X, y = make_classification(
        n_samples=100,
        n_features=5,
        n_informative=1,
        n_redundant=1,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_classif, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(mutual_info_classif, mode="k_best", param=2)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_classif, percentile=40)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(mutual_info_classif, mode="percentile", param=40)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(5)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)


def test_mutual_info_regression():
    X, y = make_regression(
        n_samples=100,
        n_features=10,
        n_informative=2,
        shuffle=False,
        random_state=0,
        noise=10,
    )

    # Test in KBest mode.
    univariate_filter = SelectKBest(mutual_info_regression, k=2)
    X_r = univariate_filter.fit(X, y).transform(X)
    assert_best_scores_kept(univariate_filter)
    X_r2 = (
        GenericUnivariateSelect(mutual_info_regression, mode="k_best", param=2)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

    # Test in Percentile mode.
    univariate_filter = SelectPercentile(mutual_info_regression, percentile=20)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = (
        GenericUnivariateSelect(mutual_info_regression, mode="percentile", param=20)
        .fit(X, y)
        .transform(X)
    )
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(10)
    gtruth[:2] = 1
    assert_array_equal(support, gtruth)

def test_dataframe_output_dtypes():
    """Check that the output dataframe dtypes are the same as the input.

    Non-regression test for gh-24860.
    """
    pd = pytest.importorskip("pandas")

    X, y = load_iris(return_X_y=True, as_frame=True)
    X = X.astype(
        {
            "petal length (cm)": np.float32,
            "petal width (cm)": np.float64,
        }
    )
    X["petal_width_binned"] = pd.cut(X["petal width (cm)"], bins=10)

    column_order = X.columns

    def selector(X, y):
        ranking = {
            "sepal length (cm)": 1,
            "sepal width (cm)": 2,
            "petal length (cm)": 3,
            "petal width (cm)": 4,
            "petal_width_binned": 5,
        }
        return np.asarray([ranking[name] for name in column_order])

    univariate_filter = SelectKBest(selector, k=3).set_output(transform="pandas")
    output = univariate_filter.fit_transform(X, y)
    assert_array_equal(
        output.columns, ["petal length (cm)", "petal width (cm)", "petal_width_binned"]
    )
    for name, dtype in output.dtypes.items():
        assert dtype == X.dtypes[name]
|