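"""Tests for the SelectFromModel meta-transformer."""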

import re
import warnings
from unittest.mock import Mock

import numpy as np
import pytest

from sklearn import datasets
from sklearn.base import BaseEstimator
from sklearn.cross_decomposition import CCA, PLSCanonical, PLSRegression
from sklearn.datasets import make_friedman1
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.exceptions import NotFittedError
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import (
    ElasticNet,
    ElasticNetCV,
    Lasso,
    LassoCV,
    LogisticRegression,
    PassiveAggressiveClassifier,
    SGDClassifier,
)
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.utils._testing import (
    MinimalClassifier,
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
    skip_if_32bit,
)
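

# Helper estimators with hard-coded `allow_nan` tags, used to check that
# SelectFromModel inherits the tag from the wrapped estimator.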
class NaNTag(BaseEstimator):
    def _more_tags(self):
        return {"allow_nan": True}


class NoNaNTag(BaseEstimator):
    def _more_tags(self):
        return {"allow_nan": False}


class NaNTagRandomForest(RandomForestClassifier):
    def _more_tags(self):
        return {"allow_nan": True}


iris = datasets.load_iris()
data, y = iris.data, iris.target
rng = np.random.RandomState(0)


def test_invalid_input():
    # An unparsable threshold string should raise a ValueError at transform time.
    clf = SGDClassifier(
        alpha=0.1, max_iter=10, shuffle=True, random_state=None, tol=None
    )
    for threshold in ["gobbledigook", ".5 * gobbledigook"]:
        model = SelectFromModel(clf, threshold=threshold)
        model.fit(data, y)
        with pytest.raises(ValueError):
            model.transform(data)


def test_input_estimator_unchanged():
    # Test that SelectFromModel fits on a clone of the estimator.
    est = RandomForestClassifier()
    transformer = SelectFromModel(estimator=est)
    transformer.fit(data, y)
    assert transformer.estimator is est


@pytest.mark.parametrize(
    "max_features, err_type, err_msg",
    [
        (
            data.shape[1] + 1,
            ValueError,
            "max_features ==",
        ),
        (
            lambda X: 1.5,
            TypeError,
            "max_features must be an instance of int, not float.",
        ),
        (
            lambda X: data.shape[1] + 1,
            ValueError,
            "max_features ==",
        ),
        (
            lambda X: -1,
            ValueError,
            "max_features ==",
        ),
    ],
)
def test_max_features_error(max_features, err_type, err_msg):
    """Check that invalid `max_features` values raise the expected errors."""
    err_msg = re.escape(err_msg)
    clf = RandomForestClassifier(n_estimators=5, random_state=0)
    transformer = SelectFromModel(
        estimator=clf, max_features=max_features, threshold=-np.inf
    )
    with pytest.raises(err_type, match=err_msg):
        transformer.fit(data, y)
- @pytest.mark.parametrize("max_features", [0, 2, data.shape[1], None])
- def test_inferred_max_features_integer(max_features):
- """Check max_features_ and output shape for integer max_features."""
- clf = RandomForestClassifier(n_estimators=5, random_state=0)
- transformer = SelectFromModel(
- estimator=clf, max_features=max_features, threshold=-np.inf
- )
- X_trans = transformer.fit_transform(data, y)
- if max_features is not None:
- assert transformer.max_features_ == max_features
- assert X_trans.shape[1] == transformer.max_features_
- else:
- assert not hasattr(transformer, "max_features_")
- assert X_trans.shape[1] == data.shape[1]


@pytest.mark.parametrize(
    "max_features",
    [lambda X: 1, lambda X: X.shape[1], lambda X: min(X.shape[1], 10000)],
)
def test_inferred_max_features_callable(max_features):
    """Check max_features_ and output shape for callable max_features."""
    clf = RandomForestClassifier(n_estimators=5, random_state=0)
    transformer = SelectFromModel(
        estimator=clf, max_features=max_features, threshold=-np.inf
    )
    X_trans = transformer.fit_transform(data, y)
    assert transformer.max_features_ == max_features(data)
    assert X_trans.shape[1] == transformer.max_features_
- @pytest.mark.parametrize("max_features", [lambda X: round(len(X[0]) / 2), 2])
- def test_max_features_array_like(max_features):
- X = [
- [0.87, -1.34, 0.31],
- [-2.79, -0.02, -0.85],
- [-1.34, -0.48, -2.55],
- [1.92, 1.48, 0.65],
- ]
- y = [0, 1, 0, 1]
- clf = RandomForestClassifier(n_estimators=5, random_state=0)
- transformer = SelectFromModel(
- estimator=clf, max_features=max_features, threshold=-np.inf
- )
- X_trans = transformer.fit_transform(X, y)
- assert X_trans.shape[1] == transformer.max_features_


@pytest.mark.parametrize(
    "max_features",
    [lambda X: min(X.shape[1], 10000), lambda X: X.shape[1], lambda X: 1],
)
def test_max_features_callable_data(max_features):
    """Tests that the callable passed to `fit` is called on X."""
    clf = RandomForestClassifier(n_estimators=50, random_state=0)
    m = Mock(side_effect=max_features)
    transformer = SelectFromModel(estimator=clf, max_features=m, threshold=-np.inf)
    transformer.fit_transform(data, y)
    m.assert_called_with(data)
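

# Stub estimator exposing fixed, user-supplied feature importances, so that
# the selection order in the tests below is fully deterministic.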
class FixedImportanceEstimator(BaseEstimator):
    def __init__(self, importances):
        self.importances = importances

    def fit(self, X, y=None):
        self.feature_importances_ = np.array(self.importances)


def test_max_features():
    # Test max_features parameter using various values
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )
    max_features = X.shape[1]
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer1 = SelectFromModel(estimator=est, threshold=-np.inf)
    transformer2 = SelectFromModel(
        estimator=est, max_features=max_features, threshold=-np.inf
    )
    X_new1 = transformer1.fit_transform(X, y)
    X_new2 = transformer2.fit_transform(X, y)
    assert_allclose(X_new1, X_new2)

    # Test max_features against actual model.
    transformer1 = SelectFromModel(estimator=Lasso(alpha=0.025, random_state=42))
    X_new1 = transformer1.fit_transform(X, y)
    scores1 = np.abs(transformer1.estimator_.coef_)
    candidate_indices1 = np.argsort(-scores1, kind="mergesort")

    for n_features in range(1, X_new1.shape[1] + 1):
        transformer2 = SelectFromModel(
            estimator=Lasso(alpha=0.025, random_state=42),
            max_features=n_features,
            threshold=-np.inf,
        )
        X_new2 = transformer2.fit_transform(X, y)
        scores2 = np.abs(transformer2.estimator_.coef_)
        candidate_indices2 = np.argsort(-scores2, kind="mergesort")
        assert_allclose(
            X[:, candidate_indices1[:n_features]],
            X[:, candidate_indices2[:n_features]],
        )
    assert_allclose(transformer1.estimator_.coef_, transformer2.estimator_.coef_)


def test_max_features_tiebreak():
    # Check that max_features breaks ties among equally important features
    # by keeping the earlier ones (stable sort).
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )
    max_features = X.shape[1]

    feature_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])
    for n_features in range(1, max_features + 1):
        transformer = SelectFromModel(
            FixedImportanceEstimator(feature_importances),
            max_features=n_features,
            threshold=-np.inf,
        )
        X_new = transformer.fit_transform(X, y)
        selected_feature_indices = np.where(transformer._get_support_mask())[0]
        assert_array_equal(selected_feature_indices, np.arange(n_features))
        assert X_new.shape[1] == n_features


def test_threshold_and_max_features():
    # When both are given, the stricter of `threshold` and `max_features`
    # determines the number of selected features.
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer1 = SelectFromModel(estimator=est, max_features=3, threshold=-np.inf)
    X_new1 = transformer1.fit_transform(X, y)

    transformer2 = SelectFromModel(estimator=est, threshold=0.04)
    X_new2 = transformer2.fit_transform(X, y)

    transformer3 = SelectFromModel(estimator=est, max_features=3, threshold=0.04)
    X_new3 = transformer3.fit_transform(X, y)
    assert X_new3.shape[1] == min(X_new1.shape[1], X_new2.shape[1])
    selected_indices = transformer3.transform(np.arange(X.shape[1])[np.newaxis, :])
    assert_allclose(X_new3, X[:, selected_indices[0]])


@skip_if_32bit
def test_feature_importances():
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        transformer = SelectFromModel(estimator=est, threshold=threshold)
        transformer.fit(X, y)
        assert hasattr(transformer.estimator_, "feature_importances_")

        X_new = transformer.transform(X)
        assert X_new.shape[1] < X.shape[1]
        importances = transformer.estimator_.feature_importances_

        feature_mask = np.abs(importances) > func(importances)
        assert_array_almost_equal(X_new, X[:, feature_mask])


def test_sample_weight():
    # Ensure sample weights are passed to underlying estimator
    X, y = datasets.make_classification(
        n_samples=100,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )

    # Check with sample weights
    sample_weight = np.ones(y.shape)
    sample_weight[y == 1] *= 100

    est = LogisticRegression(random_state=0, fit_intercept=False)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(X, y, sample_weight=None)
    mask = transformer._get_support_mask()
    transformer.fit(X, y, sample_weight=sample_weight)
    weighted_mask = transformer._get_support_mask()
    assert not np.all(weighted_mask == mask)
    transformer.fit(X, y, sample_weight=3 * sample_weight)
    reweighted_mask = transformer._get_support_mask()
    assert np.all(weighted_mask == reweighted_mask)


@pytest.mark.parametrize(
    "estimator",
    [
        Lasso(alpha=0.1, random_state=42),
        LassoCV(random_state=42),
        ElasticNet(l1_ratio=1, random_state=42),
        ElasticNetCV(l1_ratio=[1], random_state=42),
    ],
)
def test_coef_default_threshold(estimator):
    X, y = datasets.make_classification(
        n_samples=100,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
    )

    # For the Lasso and related models, the threshold defaults to 1e-5
    transformer = SelectFromModel(estimator=estimator)
    transformer.fit(X, y)
    X_new = transformer.transform(X)
    mask = np.abs(transformer.estimator_.coef_) > 1e-5
    assert_array_almost_equal(X_new, X[:, mask])


@skip_if_32bit
def test_2d_coef():
    X, y = datasets.make_classification(
        n_samples=1000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=0,
        n_classes=4,
    )

    est = LogisticRegression()
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        for order in [1, 2, np.inf]:
            # Fit SelectFromModel on a multi-class problem
            transformer = SelectFromModel(
                estimator=LogisticRegression(), threshold=threshold, norm_order=order
            )
            transformer.fit(X, y)
            assert hasattr(transformer.estimator_, "coef_")
            X_new = transformer.transform(X)
            assert X_new.shape[1] < X.shape[1]

            # Manually check that the norm is correctly performed
            est.fit(X, y)
            importances = np.linalg.norm(est.coef_, axis=0, ord=order)
            feature_mask = importances > func(importances)
            assert_array_almost_equal(X_new, X[:, feature_mask])


def test_partial_fit():
    est = PassiveAggressiveClassifier(
        random_state=0, shuffle=False, max_iter=5, tol=None
    )
    transformer = SelectFromModel(estimator=est)
    transformer.partial_fit(data, y, classes=np.unique(y))
    old_model = transformer.estimator_
    transformer.partial_fit(data, y, classes=np.unique(y))
    new_model = transformer.estimator_
    assert old_model is new_model

    X_transform = transformer.transform(data)
    transformer.fit(np.vstack((data, data)), np.concatenate((y, y)))
    assert_array_almost_equal(X_transform, transformer.transform(data))

    # check that if est doesn't have partial_fit, neither does SelectFromModel
    transformer = SelectFromModel(estimator=RandomForestClassifier())
    assert not hasattr(transformer, "partial_fit")


def test_calling_fit_reinitializes():
    # Refitting after set_params must train a fresh estimator with the
    # updated parameters.
    est = LinearSVC(dual="auto", random_state=0)
    transformer = SelectFromModel(estimator=est)
    transformer.fit(data, y)
    transformer.set_params(estimator__C=100)
    transformer.fit(data, y)
    assert transformer.estimator_.C == 100


def test_prefit():
    # Test all possible combinations of the prefit parameter.

    # Passing a prefit parameter with the selected model
    # and fitting an unfitted model with prefit=False should give same results.
    clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True, random_state=0, tol=None)
    model = SelectFromModel(clf)
    model.fit(data, y)
    X_transform = model.transform(data)
    clf.fit(data, y)
    model = SelectFromModel(clf, prefit=True)
    assert_array_almost_equal(model.transform(data), X_transform)
    model.fit(data, y)
    assert model.estimator_ is not clf

    # Check that the model is rewritten if prefit=False and a fitted model is
    # passed
    model = SelectFromModel(clf, prefit=False)
    model.fit(data, y)
    assert_array_almost_equal(model.transform(data), X_transform)

    # Check that passing an unfitted estimator with `prefit=True` raises a
    # `NotFittedError`
    clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True, random_state=0, tol=None)
    model = SelectFromModel(clf, prefit=True)
    err_msg = "When `prefit=True`, `estimator` is expected to be a fitted estimator."
    with pytest.raises(NotFittedError, match=err_msg):
        model.fit(data, y)
    with pytest.raises(NotFittedError, match=err_msg):
        model.partial_fit(data, y)
    with pytest.raises(NotFittedError, match=err_msg):
        model.transform(data)

    # Check that the internal parameters of prefitted model are not changed
    # when calling `fit` or `partial_fit` with `prefit=True`
    clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True, tol=None).fit(data, y)
    model = SelectFromModel(clf, prefit=True)
    model.fit(data, y)
    assert_allclose(model.estimator_.coef_, clf.coef_)
    model.partial_fit(data, y)
    assert_allclose(model.estimator_.coef_, clf.coef_)


def test_prefit_max_features():
    """Check the interaction between `prefit` and `max_features`."""
    # case 1: an error should be raised at `transform` if `fit` was not called to
    # validate the attributes
    estimator = RandomForestClassifier(n_estimators=5, random_state=0)
    estimator.fit(data, y)
    model = SelectFromModel(estimator, prefit=True, max_features=lambda X: X.shape[1])

    err_msg = (
        "When `prefit=True` and `max_features` is a callable, call `fit` "
        "before calling `transform`."
    )
    with pytest.raises(NotFittedError, match=err_msg):
        model.transform(data)

    # case 2: `max_features` is not validated and different from an integer
    # FIXME: we cannot validate the upper bound of the attribute at transform
    # and we should force calling `fit` if we intend to force the attribute
    # to have such an upper bound.
    max_features = 2.5
    model.set_params(max_features=max_features)
    with pytest.raises(ValueError, match="`max_features` must be an integer"):
        model.transform(data)


def test_prefit_get_feature_names_out():
    """Check the interaction between prefit and the feature names."""
    clf = RandomForestClassifier(n_estimators=2, random_state=0)
    clf.fit(data, y)
    model = SelectFromModel(clf, prefit=True, max_features=1)

    name = type(model).__name__
    err_msg = (
        f"This {name} instance is not fitted yet. Call 'fit' with "
        "appropriate arguments before using this estimator."
    )
    with pytest.raises(NotFittedError, match=err_msg):
        model.get_feature_names_out()

    model.fit(data, y)
    feature_names = model.get_feature_names_out()
    assert feature_names == ["x3"]


def test_threshold_string():
    est = RandomForestClassifier(n_estimators=50, random_state=0)
    model = SelectFromModel(est, threshold="0.5*mean")
    model.fit(data, y)
    X_transform = model.transform(data)

    # Calculate the threshold from the estimator directly.
    est.fit(data, y)
    threshold = 0.5 * np.mean(est.feature_importances_)
    mask = est.feature_importances_ > threshold
    assert_array_almost_equal(X_transform, data[:, mask])


def test_threshold_without_refitting():
    # Test that the threshold can be set without refitting the model.
    clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True, random_state=0, tol=None)
    model = SelectFromModel(clf, threshold="0.1 * mean")
    model.fit(data, y)
    X_transform = model.transform(data)

    # Set a higher threshold to filter out more features.
    model.threshold = "1.0 * mean"
    assert X_transform.shape[1] > model.transform(data).shape[1]


def test_fit_accepts_nan_inf():
    # Test that fit doesn't check for np.inf and np.nan values.
    clf = HistGradientBoostingClassifier(random_state=0)
    model = SelectFromModel(estimator=clf)

    nan_data = data.copy()
    nan_data[0] = np.nan
    nan_data[1] = np.inf

    model.fit(nan_data, y)


def test_transform_accepts_nan_inf():
    # Test that transform doesn't check for np.inf and np.nan values.
    clf = NaNTagRandomForest(n_estimators=100, random_state=0)
    nan_data = data.copy()

    model = SelectFromModel(estimator=clf)
    model.fit(nan_data, y)

    nan_data[0] = np.nan
    nan_data[1] = np.inf

    model.transform(nan_data)


def test_allow_nan_tag_comes_from_estimator():
    allow_nan_est = NaNTag()
    model = SelectFromModel(estimator=allow_nan_est)
    assert model._get_tags()["allow_nan"] is True

    no_nan_est = NoNaNTag()
    model = SelectFromModel(estimator=no_nan_est)
    assert model._get_tags()["allow_nan"] is False
def _pca_importances(pca_estimator):
    return np.abs(pca_estimator.explained_variance_)


@pytest.mark.parametrize(
    "estimator, importance_getter",
    [
        (
            make_pipeline(PCA(random_state=0), LogisticRegression()),
            "named_steps.logisticregression.coef_",
        ),
        (PCA(random_state=0), _pca_importances),
    ],
)
def test_importance_getter(estimator, importance_getter):
    selector = SelectFromModel(
        estimator, threshold="mean", importance_getter=importance_getter
    )
    selector.fit(data, y)
    assert selector.transform(data).shape[1] == 1
- @pytest.mark.parametrize("PLSEstimator", [CCA, PLSCanonical, PLSRegression])
- def test_select_from_model_pls(PLSEstimator):
- """Check the behaviour of SelectFromModel with PLS estimators.
- Non-regression test for:
- https://github.com/scikit-learn/scikit-learn/issues/12410
- """
- X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
- estimator = PLSEstimator(n_components=1)
- model = make_pipeline(SelectFromModel(estimator), estimator).fit(X, y)
- assert model.score(X, y) > 0.5


def test_estimator_does_not_support_feature_names():
    """SelectFromModel works with estimators that do not support feature_names_in_.

    Non-regression test for #21949.
    """
    pytest.importorskip("pandas")
    X, y = datasets.load_iris(as_frame=True, return_X_y=True)
    all_feature_names = set(X.columns)

    def importance_getter(estimator):
        return np.arange(X.shape[1])

    selector = SelectFromModel(
        MinimalClassifier(), importance_getter=importance_getter
    ).fit(X, y)

    # selector learns the feature names itself
    assert_array_equal(selector.feature_names_in_, X.columns)
    feature_names_out = set(selector.get_feature_names_out())
    assert feature_names_out < all_feature_names

    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        selector.transform(X.iloc[1:3])


@pytest.mark.parametrize(
    "error, err_msg, max_features",
    (
        [ValueError, "max_features == 10, must be <= 4", 10],
        [ValueError, "max_features == 5, must be <= 4", lambda x: x.shape[1] + 1],
    ),
)
def test_partial_fit_validate_max_features(error, err_msg, max_features):
    """Test that partial_fit from SelectFromModel validates `max_features`."""
    X, y = datasets.make_classification(
        n_samples=100,
        n_features=4,
        random_state=0,
    )

    with pytest.raises(error, match=err_msg):
        SelectFromModel(
            estimator=SGDClassifier(), max_features=max_features
        ).partial_fit(X, y, classes=[0, 1])
- @pytest.mark.parametrize("as_frame", [True, False])
- def test_partial_fit_validate_feature_names(as_frame):
- """Test that partial_fit from SelectFromModel validates `feature_names_in_`."""
- pytest.importorskip("pandas")
- X, y = datasets.load_iris(as_frame=as_frame, return_X_y=True)
- selector = SelectFromModel(estimator=SGDClassifier(), max_features=4).partial_fit(
- X, y, classes=[0, 1, 2]
- )
- if as_frame:
- assert_array_equal(selector.feature_names_in_, X.columns)
- else:
- assert not hasattr(selector, "feature_names_in_")