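"""Common behavior tests for the heterogeneous ensembles: StackingClassifier,
StackingRegressor, VotingClassifier and VotingRegressor."""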
import numpy as np
import pytest

from sklearn.base import ClassifierMixin, clone, is_classifier
from sklearn.datasets import (
    load_diabetes,
    load_iris,
    make_classification,
    make_regression,
)
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    StackingClassifier,
    StackingRegressor,
    VotingClassifier,
    VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR

X, y = load_iris(return_X_y=True)
X_r, y_r = load_diabetes(return_X_y=True)
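# The iris and diabetes datasets above are shared by the missing-values test at
# the bottom of this module; the other tests generate small synthetic datasets
# directly inside their parametrizations.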


@pytest.mark.parametrize(
    "X, y, estimator",
    [
        (
            *make_classification(n_samples=10),
            StackingClassifier(
                estimators=[
                    ("lr", LogisticRegression()),
                    ("svm", LinearSVC(dual="auto")),
                    ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
                ],
                cv=2,
            ),
        ),
        (
            *make_classification(n_samples=10),
            VotingClassifier(
                estimators=[
                    ("lr", LogisticRegression()),
                    ("svm", LinearSVC(dual="auto")),
                    ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
                ]
            ),
        ),
        (
            *make_regression(n_samples=10),
            StackingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", LinearSVR(dual="auto")),
                    ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
                ],
                cv=2,
            ),
        ),
        (
            *make_regression(n_samples=10),
            VotingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", LinearSVR(dual="auto")),
                    ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
                ]
            ),
        ),
    ],
    ids=[
        "stacking-classifier",
        "voting-classifier",
        "stacking-regressor",
        "voting-regressor",
    ],
)
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
    # check that the behavior of `estimators`, `estimators_`,
    # `named_estimators`, `named_estimators_` is consistent across all
    # ensemble classes and when using `set_params()`.

    # before fit
    assert "svm" in estimator.named_estimators
    assert estimator.named_estimators.svm is estimator.estimators[1][1]
    assert estimator.named_estimators.svm is estimator.named_estimators["svm"]

    # check fitted attributes
    estimator.fit(X, y)
    assert len(estimator.named_estimators) == 3
    assert len(estimator.named_estimators_) == 3
    assert sorted(list(estimator.named_estimators_.keys())) == sorted(
        ["lr", "svm", "rf"]
    )

    # check that set_params() does not add a new attribute
    estimator_new_params = clone(estimator)
    svm_estimator = SVC() if is_classifier(estimator) else SVR()
    estimator_new_params.set_params(svm=svm_estimator).fit(X, y)
    assert not hasattr(estimator_new_params, "svm")
    assert (
        estimator_new_params.named_estimators.lr.get_params()
        == estimator.named_estimators.lr.get_params()
    )
    assert (
        estimator_new_params.named_estimators.rf.get_params()
        == estimator.named_estimators.rf.get_params()
    )

    # check the behavior when setting and dropping an estimator
    estimator_dropped = clone(estimator)
    estimator_dropped.set_params(svm="drop")
    estimator_dropped.fit(X, y)
    assert len(estimator_dropped.named_estimators) == 3
    assert estimator_dropped.named_estimators.svm == "drop"
    assert len(estimator_dropped.named_estimators_) == 3
    assert sorted(list(estimator_dropped.named_estimators_.keys())) == sorted(
        ["lr", "svm", "rf"]
    )
    for sub_est in estimator_dropped.named_estimators_.values():
        # check that no fitted estimator of the dropped ("svm") type remains
        assert not isinstance(sub_est, type(estimator.named_estimators.svm))

    # check that we can set the parameters of the underlying estimators
    estimator.set_params(svm__C=10.0)
    estimator.set_params(rf__max_depth=5)
    assert (
        estimator.get_params()["svm__C"]
        == estimator.get_params()["svm"].get_params()["C"]
    )
    assert (
        estimator.get_params()["rf__max_depth"]
        == estimator.get_params()["rf"].get_params()["max_depth"]
    )


@pytest.mark.parametrize(
    "Ensemble",
    [VotingClassifier, StackingRegressor, VotingRegressor],
)
def test_ensemble_heterogeneous_estimators_type(Ensemble):
    # check that the ensemble fails during validation if the underlying
    # estimators are not of the expected type (i.e. classifier or regressor).
    # StackingClassifier can have an underlying regressor, so it is not checked.
    if issubclass(Ensemble, ClassifierMixin):
        X, y = make_classification(n_samples=10)
        estimators = [("lr", LinearRegression())]
        ensemble_type = "classifier"
    else:
        X, y = make_regression(n_samples=10)
        estimators = [("lr", LogisticRegression())]
        ensemble_type = "regressor"
    ensemble = Ensemble(estimators=estimators)

    err_msg = "should be a {}".format(ensemble_type)
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)


@pytest.mark.parametrize(
    "X, y, Ensemble",
    [
        (*make_classification(n_samples=10), StackingClassifier),
        (*make_classification(n_samples=10), VotingClassifier),
        (*make_regression(n_samples=10), StackingRegressor),
        (*make_regression(n_samples=10), VotingRegressor),
    ],
)
def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble):
    # raise an error when the name contains dunder
    if issubclass(Ensemble, ClassifierMixin):
        estimators = [("lr__", LogisticRegression())]
    else:
        estimators = [("lr__", LinearRegression())]
    ensemble = Ensemble(estimators=estimators)
    err_msg = r"Estimator names must not contain __: got \['lr__'\]"
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)

    # raise an error when the name is not unique
    if issubclass(Ensemble, ClassifierMixin):
        estimators = [("lr", LogisticRegression()), ("lr", LogisticRegression())]
    else:
        estimators = [("lr", LinearRegression()), ("lr", LinearRegression())]
    ensemble = Ensemble(estimators=estimators)
    err_msg = r"Names provided are not unique: \['lr', 'lr'\]"
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)

    # raise an error when the name conflicts with the constructor parameters
    if issubclass(Ensemble, ClassifierMixin):
        estimators = [("estimators", LogisticRegression())]
    else:
        estimators = [("estimators", LinearRegression())]
    ensemble = Ensemble(estimators=estimators)
    err_msg = "Estimator names conflict with constructor arguments"
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)


@pytest.mark.parametrize(
    "X, y, estimator",
    [
        (
            *make_classification(n_samples=10),
            StackingClassifier(estimators=[("lr", LogisticRegression())]),
        ),
        (
            *make_classification(n_samples=10),
            VotingClassifier(estimators=[("lr", LogisticRegression())]),
        ),
        (
            *make_regression(n_samples=10),
            StackingRegressor(estimators=[("lr", LinearRegression())]),
        ),
        (
            *make_regression(n_samples=10),
            VotingRegressor(estimators=[("lr", LinearRegression())]),
        ),
    ],
    ids=[
        "stacking-classifier",
        "voting-classifier",
        "stacking-regressor",
        "voting-regressor",
    ],
)
def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
    # check that we raise a consistent error when all estimators are dropped
    estimator.set_params(lr="drop")
    with pytest.raises(ValueError, match="All estimators are dropped."):
        estimator.fit(X, y)


@pytest.mark.parametrize(
    "Ensemble, Estimator, X, y",
    [
        (StackingClassifier, LogisticRegression, X, y),
        (StackingRegressor, LinearRegression, X_r, y_r),
        (VotingClassifier, LogisticRegression, X, y),
        (VotingRegressor, LinearRegression, X_r, y_r),
    ],
)
# FIXME: we should move this test to `estimator_checks` once we are able
# to construct meta-estimator instances
def test_heterogeneous_ensemble_support_missing_values(Ensemble, Estimator, X, y):
    # check that the Voting and Stacking predictors delegate the missing-values
    # validation to the underlying estimators.
    X = X.copy()
    mask = np.random.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool)
    X[mask] = np.nan
    pipe = make_pipeline(SimpleImputer(), Estimator())
    ensemble = Ensemble(estimators=[("pipe1", pipe), ("pipe2", pipe)])
    ensemble.fit(X, y).score(X, y)
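
# These checks are collected and run by pytest; a typical invocation from the
# repository root would look like the following (the module path is
# illustrative and depends on where this file lives in the source tree):
#
#   pytest sklearn/ensemble/tests/test_common.py -v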