test_stacking.py 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856
  1. """Test the stacking classifier and regressor."""
  2. # Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
  3. # License: BSD 3 clause
  4. from unittest.mock import Mock
  5. import numpy as np
  6. import pytest
  7. import scipy.sparse as sparse
  8. from numpy.testing import assert_array_equal
  9. from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, clone
  10. from sklearn.datasets import (
  11. load_breast_cancer,
  12. load_diabetes,
  13. load_iris,
  14. make_classification,
  15. make_multilabel_classification,
  16. make_regression,
  17. )
  18. from sklearn.dummy import DummyClassifier, DummyRegressor
  19. from sklearn.ensemble import (
  20. RandomForestClassifier,
  21. RandomForestRegressor,
  22. StackingClassifier,
  23. StackingRegressor,
  24. )
  25. from sklearn.exceptions import ConvergenceWarning, NotFittedError
  26. from sklearn.linear_model import (
  27. LinearRegression,
  28. LogisticRegression,
  29. Ridge,
  30. RidgeClassifier,
  31. )
  32. from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
  33. from sklearn.neighbors import KNeighborsClassifier
  34. from sklearn.neural_network import MLPClassifier
  35. from sklearn.preprocessing import scale
  36. from sklearn.svm import SVC, LinearSVC, LinearSVR
  37. from sklearn.utils._mocking import CheckingClassifier
  38. from sklearn.utils._testing import (
  39. assert_allclose,
  40. assert_allclose_dense_sparse,
  41. ignore_warnings,
  42. )
  43. diabetes = load_diabetes()
  44. X_diabetes, y_diabetes = diabetes.data, diabetes.target
  45. iris = load_iris()
  46. X_iris, y_iris = iris.data, iris.target
  47. X_multilabel, y_multilabel = make_multilabel_classification(
  48. n_classes=3, random_state=42
  49. )
  50. X_binary, y_binary = make_classification(n_classes=2, random_state=42)
  51. @pytest.mark.parametrize(
  52. "cv", [3, StratifiedKFold(n_splits=3, shuffle=True, random_state=42)]
  53. )
  54. @pytest.mark.parametrize(
  55. "final_estimator", [None, RandomForestClassifier(random_state=42)]
  56. )
  57. @pytest.mark.parametrize("passthrough", [False, True])
  58. def test_stacking_classifier_iris(cv, final_estimator, passthrough):
  59. # prescale the data to avoid convergence warning without using a pipeline
  60. # for later assert
  61. X_train, X_test, y_train, y_test = train_test_split(
  62. scale(X_iris), y_iris, stratify=y_iris, random_state=42
  63. )
  64. estimators = [("lr", LogisticRegression()), ("svc", LinearSVC(dual="auto"))]
  65. clf = StackingClassifier(
  66. estimators=estimators,
  67. final_estimator=final_estimator,
  68. cv=cv,
  69. passthrough=passthrough,
  70. )
  71. clf.fit(X_train, y_train)
  72. clf.predict(X_test)
  73. clf.predict_proba(X_test)
  74. assert clf.score(X_test, y_test) > 0.8
  75. X_trans = clf.transform(X_test)
  76. expected_column_count = 10 if passthrough else 6
  77. assert X_trans.shape[1] == expected_column_count
  78. if passthrough:
  79. assert_allclose(X_test, X_trans[:, -4:])
  80. clf.set_params(lr="drop")
  81. clf.fit(X_train, y_train)
  82. clf.predict(X_test)
  83. clf.predict_proba(X_test)
  84. if final_estimator is None:
  85. # LogisticRegression has decision_function method
  86. clf.decision_function(X_test)
  87. X_trans = clf.transform(X_test)
  88. expected_column_count_drop = 7 if passthrough else 3
  89. assert X_trans.shape[1] == expected_column_count_drop
  90. if passthrough:
  91. assert_allclose(X_test, X_trans[:, -4:])
  92. def test_stacking_classifier_drop_column_binary_classification():
  93. # check that a column is dropped in binary classification
  94. X, y = load_breast_cancer(return_X_y=True)
  95. X_train, X_test, y_train, _ = train_test_split(
  96. scale(X), y, stratify=y, random_state=42
  97. )
  98. # both classifiers implement 'predict_proba' and will both drop one column
  99. estimators = [
  100. ("lr", LogisticRegression()),
  101. ("rf", RandomForestClassifier(random_state=42)),
  102. ]
  103. clf = StackingClassifier(estimators=estimators, cv=3)
  104. clf.fit(X_train, y_train)
  105. X_trans = clf.transform(X_test)
  106. assert X_trans.shape[1] == 2
  107. # LinearSVC does not implement 'predict_proba' and will not drop one column
  108. estimators = [("lr", LogisticRegression()), ("svc", LinearSVC(dual="auto"))]
  109. clf.set_params(estimators=estimators)
  110. clf.fit(X_train, y_train)
  111. X_trans = clf.transform(X_test)
  112. assert X_trans.shape[1] == 2
  113. def test_stacking_classifier_drop_estimator():
  114. # prescale the data to avoid convergence warning without using a pipeline
  115. # for later assert
  116. X_train, X_test, y_train, _ = train_test_split(
  117. scale(X_iris), y_iris, stratify=y_iris, random_state=42
  118. )
  119. estimators = [("lr", "drop"), ("svc", LinearSVC(dual="auto", random_state=0))]
  120. rf = RandomForestClassifier(n_estimators=10, random_state=42)
  121. clf = StackingClassifier(
  122. estimators=[("svc", LinearSVC(dual="auto", random_state=0))],
  123. final_estimator=rf,
  124. cv=5,
  125. )
  126. clf_drop = StackingClassifier(estimators=estimators, final_estimator=rf, cv=5)
  127. clf.fit(X_train, y_train)
  128. clf_drop.fit(X_train, y_train)
  129. assert_allclose(clf.predict(X_test), clf_drop.predict(X_test))
  130. assert_allclose(clf.predict_proba(X_test), clf_drop.predict_proba(X_test))
  131. assert_allclose(clf.transform(X_test), clf_drop.transform(X_test))
  132. def test_stacking_regressor_drop_estimator():
  133. # prescale the data to avoid convergence warning without using a pipeline
  134. # for later assert
  135. X_train, X_test, y_train, _ = train_test_split(
  136. scale(X_diabetes), y_diabetes, random_state=42
  137. )
  138. estimators = [("lr", "drop"), ("svr", LinearSVR(dual="auto", random_state=0))]
  139. rf = RandomForestRegressor(n_estimators=10, random_state=42)
  140. reg = StackingRegressor(
  141. estimators=[("svr", LinearSVR(dual="auto", random_state=0))],
  142. final_estimator=rf,
  143. cv=5,
  144. )
  145. reg_drop = StackingRegressor(estimators=estimators, final_estimator=rf, cv=5)
  146. reg.fit(X_train, y_train)
  147. reg_drop.fit(X_train, y_train)
  148. assert_allclose(reg.predict(X_test), reg_drop.predict(X_test))
  149. assert_allclose(reg.transform(X_test), reg_drop.transform(X_test))
  150. @pytest.mark.parametrize("cv", [3, KFold(n_splits=3, shuffle=True, random_state=42)])
  151. @pytest.mark.parametrize(
  152. "final_estimator, predict_params",
  153. [
  154. (None, {}),
  155. (RandomForestRegressor(random_state=42), {}),
  156. (DummyRegressor(), {"return_std": True}),
  157. ],
  158. )
  159. @pytest.mark.parametrize("passthrough", [False, True])
  160. def test_stacking_regressor_diabetes(cv, final_estimator, predict_params, passthrough):
  161. # prescale the data to avoid convergence warning without using a pipeline
  162. # for later assert
  163. X_train, X_test, y_train, _ = train_test_split(
  164. scale(X_diabetes), y_diabetes, random_state=42
  165. )
  166. estimators = [("lr", LinearRegression()), ("svr", LinearSVR(dual="auto"))]
  167. reg = StackingRegressor(
  168. estimators=estimators,
  169. final_estimator=final_estimator,
  170. cv=cv,
  171. passthrough=passthrough,
  172. )
  173. reg.fit(X_train, y_train)
  174. result = reg.predict(X_test, **predict_params)
  175. expected_result_length = 2 if predict_params else 1
  176. if predict_params:
  177. assert len(result) == expected_result_length
  178. X_trans = reg.transform(X_test)
  179. expected_column_count = 12 if passthrough else 2
  180. assert X_trans.shape[1] == expected_column_count
  181. if passthrough:
  182. assert_allclose(X_test, X_trans[:, -10:])
  183. reg.set_params(lr="drop")
  184. reg.fit(X_train, y_train)
  185. reg.predict(X_test)
  186. X_trans = reg.transform(X_test)
  187. expected_column_count_drop = 11 if passthrough else 1
  188. assert X_trans.shape[1] == expected_column_count_drop
  189. if passthrough:
  190. assert_allclose(X_test, X_trans[:, -10:])
  191. @pytest.mark.parametrize("fmt", ["csc", "csr", "coo"])
  192. def test_stacking_regressor_sparse_passthrough(fmt):
  193. # Check passthrough behavior on a sparse X matrix
  194. X_train, X_test, y_train, _ = train_test_split(
  195. sparse.coo_matrix(scale(X_diabetes)).asformat(fmt), y_diabetes, random_state=42
  196. )
  197. estimators = [("lr", LinearRegression()), ("svr", LinearSVR(dual="auto"))]
  198. rf = RandomForestRegressor(n_estimators=10, random_state=42)
  199. clf = StackingRegressor(
  200. estimators=estimators, final_estimator=rf, cv=5, passthrough=True
  201. )
  202. clf.fit(X_train, y_train)
  203. X_trans = clf.transform(X_test)
  204. assert_allclose_dense_sparse(X_test, X_trans[:, -10:])
  205. assert sparse.issparse(X_trans)
  206. assert X_test.format == X_trans.format
  207. @pytest.mark.parametrize("fmt", ["csc", "csr", "coo"])
  208. def test_stacking_classifier_sparse_passthrough(fmt):
  209. # Check passthrough behavior on a sparse X matrix
  210. X_train, X_test, y_train, _ = train_test_split(
  211. sparse.coo_matrix(scale(X_iris)).asformat(fmt), y_iris, random_state=42
  212. )
  213. estimators = [("lr", LogisticRegression()), ("svc", LinearSVC(dual="auto"))]
  214. rf = RandomForestClassifier(n_estimators=10, random_state=42)
  215. clf = StackingClassifier(
  216. estimators=estimators, final_estimator=rf, cv=5, passthrough=True
  217. )
  218. clf.fit(X_train, y_train)
  219. X_trans = clf.transform(X_test)
  220. assert_allclose_dense_sparse(X_test, X_trans[:, -4:])
  221. assert sparse.issparse(X_trans)
  222. assert X_test.format == X_trans.format
  223. def test_stacking_classifier_drop_binary_prob():
  224. # check that classifier will drop one of the probability column for
  225. # binary classification problem
  226. # Select only the 2 first classes
  227. X_, y_ = scale(X_iris[:100]), y_iris[:100]
  228. estimators = [("lr", LogisticRegression()), ("rf", RandomForestClassifier())]
  229. clf = StackingClassifier(estimators=estimators)
  230. clf.fit(X_, y_)
  231. X_meta = clf.transform(X_)
  232. assert X_meta.shape[1] == 2
  233. class NoWeightRegressor(RegressorMixin, BaseEstimator):
  234. def fit(self, X, y):
  235. self.reg = DummyRegressor()
  236. return self.reg.fit(X, y)
  237. def predict(self, X):
  238. return np.ones(X.shape[0])
  239. class NoWeightClassifier(ClassifierMixin, BaseEstimator):
  240. def fit(self, X, y):
  241. self.clf = DummyClassifier(strategy="stratified")
  242. return self.clf.fit(X, y)
  243. @pytest.mark.parametrize(
  244. "y, params, type_err, msg_err",
  245. [
  246. (y_iris, {"estimators": []}, ValueError, "Invalid 'estimators' attribute,"),
  247. (
  248. y_iris,
  249. {
  250. "estimators": [
  251. ("lr", LogisticRegression()),
  252. ("svm", SVC(max_iter=50_000)),
  253. ],
  254. "stack_method": "predict_proba",
  255. },
  256. ValueError,
  257. "does not implement the method predict_proba",
  258. ),
  259. (
  260. y_iris,
  261. {
  262. "estimators": [
  263. ("lr", LogisticRegression()),
  264. ("cor", NoWeightClassifier()),
  265. ]
  266. },
  267. TypeError,
  268. "does not support sample weight",
  269. ),
  270. (
  271. y_iris,
  272. {
  273. "estimators": [
  274. ("lr", LogisticRegression()),
  275. ("cor", LinearSVC(dual="auto", max_iter=50_000)),
  276. ],
  277. "final_estimator": NoWeightClassifier(),
  278. },
  279. TypeError,
  280. "does not support sample weight",
  281. ),
  282. ],
  283. )
  284. def test_stacking_classifier_error(y, params, type_err, msg_err):
  285. with pytest.raises(type_err, match=msg_err):
  286. clf = StackingClassifier(**params, cv=3)
  287. clf.fit(scale(X_iris), y, sample_weight=np.ones(X_iris.shape[0]))
  288. @pytest.mark.parametrize(
  289. "y, params, type_err, msg_err",
  290. [
  291. (y_diabetes, {"estimators": []}, ValueError, "Invalid 'estimators' attribute,"),
  292. (
  293. y_diabetes,
  294. {"estimators": [("lr", LinearRegression()), ("cor", NoWeightRegressor())]},
  295. TypeError,
  296. "does not support sample weight",
  297. ),
  298. (
  299. y_diabetes,
  300. {
  301. "estimators": [
  302. ("lr", LinearRegression()),
  303. ("cor", LinearSVR(dual="auto")),
  304. ],
  305. "final_estimator": NoWeightRegressor(),
  306. },
  307. TypeError,
  308. "does not support sample weight",
  309. ),
  310. ],
  311. )
  312. def test_stacking_regressor_error(y, params, type_err, msg_err):
  313. with pytest.raises(type_err, match=msg_err):
  314. reg = StackingRegressor(**params, cv=3)
  315. reg.fit(scale(X_diabetes), y, sample_weight=np.ones(X_diabetes.shape[0]))
  316. @pytest.mark.parametrize(
  317. "estimator, X, y",
  318. [
  319. (
  320. StackingClassifier(
  321. estimators=[
  322. ("lr", LogisticRegression(random_state=0)),
  323. ("svm", LinearSVC(dual="auto", random_state=0)),
  324. ]
  325. ),
  326. X_iris[:100],
  327. y_iris[:100],
  328. ), # keep only classes 0 and 1
  329. (
  330. StackingRegressor(
  331. estimators=[
  332. ("lr", LinearRegression()),
  333. ("svm", LinearSVR(dual="auto", random_state=0)),
  334. ]
  335. ),
  336. X_diabetes,
  337. y_diabetes,
  338. ),
  339. ],
  340. ids=["StackingClassifier", "StackingRegressor"],
  341. )
  342. def test_stacking_randomness(estimator, X, y):
  343. # checking that fixing the random state of the CV will lead to the same
  344. # results
  345. estimator_full = clone(estimator)
  346. estimator_full.set_params(
  347. cv=KFold(shuffle=True, random_state=np.random.RandomState(0))
  348. )
  349. estimator_drop = clone(estimator)
  350. estimator_drop.set_params(lr="drop")
  351. estimator_drop.set_params(
  352. cv=KFold(shuffle=True, random_state=np.random.RandomState(0))
  353. )
  354. assert_allclose(
  355. estimator_full.fit(X, y).transform(X)[:, 1:],
  356. estimator_drop.fit(X, y).transform(X),
  357. )
  358. def test_stacking_classifier_stratify_default():
  359. # check that we stratify the classes for the default CV
  360. clf = StackingClassifier(
  361. estimators=[
  362. ("lr", LogisticRegression(max_iter=10_000)),
  363. ("svm", LinearSVC(dual="auto", max_iter=10_000)),
  364. ]
  365. )
  366. # since iris is not shuffled, a simple k-fold would not contain the
  367. # 3 classes during training
  368. clf.fit(X_iris, y_iris)
  369. @pytest.mark.parametrize(
  370. "stacker, X, y",
  371. [
  372. (
  373. StackingClassifier(
  374. estimators=[
  375. ("lr", LogisticRegression()),
  376. ("svm", LinearSVC(dual="auto", random_state=42)),
  377. ],
  378. final_estimator=LogisticRegression(),
  379. cv=KFold(shuffle=True, random_state=42),
  380. ),
  381. *load_breast_cancer(return_X_y=True),
  382. ),
  383. (
  384. StackingRegressor(
  385. estimators=[
  386. ("lr", LinearRegression()),
  387. ("svm", LinearSVR(dual="auto", random_state=42)),
  388. ],
  389. final_estimator=LinearRegression(),
  390. cv=KFold(shuffle=True, random_state=42),
  391. ),
  392. X_diabetes,
  393. y_diabetes,
  394. ),
  395. ],
  396. ids=["StackingClassifier", "StackingRegressor"],
  397. )
  398. def test_stacking_with_sample_weight(stacker, X, y):
  399. # check that sample weights has an influence on the fitting
  400. # note: ConvergenceWarning are catch since we are not worrying about the
  401. # convergence here
  402. n_half_samples = len(y) // 2
  403. total_sample_weight = np.array(
  404. [0.1] * n_half_samples + [0.9] * (len(y) - n_half_samples)
  405. )
  406. X_train, X_test, y_train, _, sample_weight_train, _ = train_test_split(
  407. X, y, total_sample_weight, random_state=42
  408. )
  409. with ignore_warnings(category=ConvergenceWarning):
  410. stacker.fit(X_train, y_train)
  411. y_pred_no_weight = stacker.predict(X_test)
  412. with ignore_warnings(category=ConvergenceWarning):
  413. stacker.fit(X_train, y_train, sample_weight=np.ones(y_train.shape))
  414. y_pred_unit_weight = stacker.predict(X_test)
  415. assert_allclose(y_pred_no_weight, y_pred_unit_weight)
  416. with ignore_warnings(category=ConvergenceWarning):
  417. stacker.fit(X_train, y_train, sample_weight=sample_weight_train)
  418. y_pred_biased = stacker.predict(X_test)
  419. assert np.abs(y_pred_no_weight - y_pred_biased).sum() > 0
  420. def test_stacking_classifier_sample_weight_fit_param():
  421. # check sample_weight is passed to all invocations of fit
  422. stacker = StackingClassifier(
  423. estimators=[("lr", CheckingClassifier(expected_sample_weight=True))],
  424. final_estimator=CheckingClassifier(expected_sample_weight=True),
  425. )
  426. stacker.fit(X_iris, y_iris, sample_weight=np.ones(X_iris.shape[0]))
  427. @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
  428. @pytest.mark.parametrize(
  429. "stacker, X, y",
  430. [
  431. (
  432. StackingClassifier(
  433. estimators=[
  434. ("lr", LogisticRegression()),
  435. ("svm", LinearSVC(dual="auto", random_state=42)),
  436. ],
  437. final_estimator=LogisticRegression(),
  438. ),
  439. *load_breast_cancer(return_X_y=True),
  440. ),
  441. (
  442. StackingRegressor(
  443. estimators=[
  444. ("lr", LinearRegression()),
  445. ("svm", LinearSVR(dual="auto", random_state=42)),
  446. ],
  447. final_estimator=LinearRegression(),
  448. ),
  449. X_diabetes,
  450. y_diabetes,
  451. ),
  452. ],
  453. ids=["StackingClassifier", "StackingRegressor"],
  454. )
  455. def test_stacking_cv_influence(stacker, X, y):
  456. # check that the stacking affects the fit of the final estimator but not
  457. # the fit of the base estimators
  458. # note: ConvergenceWarning are catch since we are not worrying about the
  459. # convergence here
  460. stacker_cv_3 = clone(stacker)
  461. stacker_cv_5 = clone(stacker)
  462. stacker_cv_3.set_params(cv=3)
  463. stacker_cv_5.set_params(cv=5)
  464. stacker_cv_3.fit(X, y)
  465. stacker_cv_5.fit(X, y)
  466. # the base estimators should be identical
  467. for est_cv_3, est_cv_5 in zip(stacker_cv_3.estimators_, stacker_cv_5.estimators_):
  468. assert_allclose(est_cv_3.coef_, est_cv_5.coef_)
  469. # the final estimator should be different
  470. with pytest.raises(AssertionError, match="Not equal"):
  471. assert_allclose(
  472. stacker_cv_3.final_estimator_.coef_, stacker_cv_5.final_estimator_.coef_
  473. )
  474. @pytest.mark.parametrize(
  475. "Stacker, Estimator, stack_method, final_estimator, X, y",
  476. [
  477. (
  478. StackingClassifier,
  479. DummyClassifier,
  480. "predict_proba",
  481. LogisticRegression(random_state=42),
  482. X_iris,
  483. y_iris,
  484. ),
  485. (
  486. StackingRegressor,
  487. DummyRegressor,
  488. "predict",
  489. LinearRegression(),
  490. X_diabetes,
  491. y_diabetes,
  492. ),
  493. ],
  494. )
  495. def test_stacking_prefit(Stacker, Estimator, stack_method, final_estimator, X, y):
  496. """Check the behaviour of stacking when `cv='prefit'`"""
  497. X_train1, X_train2, y_train1, y_train2 = train_test_split(
  498. X, y, random_state=42, test_size=0.5
  499. )
  500. estimators = [
  501. ("d0", Estimator().fit(X_train1, y_train1)),
  502. ("d1", Estimator().fit(X_train1, y_train1)),
  503. ]
  504. # mock out fit and stack_method to be asserted later
  505. for _, estimator in estimators:
  506. estimator.fit = Mock(name="fit")
  507. stack_func = getattr(estimator, stack_method)
  508. predict_method_mocked = Mock(side_effect=stack_func)
  509. # Mocking a method will not provide a `__name__` while Python methods
  510. # do and we are using it in `_get_response_method`.
  511. predict_method_mocked.__name__ = stack_method
  512. setattr(estimator, stack_method, predict_method_mocked)
  513. stacker = Stacker(
  514. estimators=estimators, cv="prefit", final_estimator=final_estimator
  515. )
  516. stacker.fit(X_train2, y_train2)
  517. assert stacker.estimators_ == [estimator for _, estimator in estimators]
  518. # fit was not called again
  519. assert all(estimator.fit.call_count == 0 for estimator in stacker.estimators_)
  520. # stack method is called with the proper inputs
  521. for estimator in stacker.estimators_:
  522. stack_func_mock = getattr(estimator, stack_method)
  523. stack_func_mock.assert_called_with(X_train2)
  524. @pytest.mark.parametrize(
  525. "stacker, X, y",
  526. [
  527. (
  528. StackingClassifier(
  529. estimators=[("lr", LogisticRegression()), ("svm", SVC())],
  530. cv="prefit",
  531. ),
  532. X_iris,
  533. y_iris,
  534. ),
  535. (
  536. StackingRegressor(
  537. estimators=[
  538. ("lr", LinearRegression()),
  539. ("svm", LinearSVR(dual="auto")),
  540. ],
  541. cv="prefit",
  542. ),
  543. X_diabetes,
  544. y_diabetes,
  545. ),
  546. ],
  547. )
  548. def test_stacking_prefit_error(stacker, X, y):
  549. # check that NotFittedError is raised
  550. # if base estimators are not fitted when cv="prefit"
  551. with pytest.raises(NotFittedError):
  552. stacker.fit(X, y)
  553. @pytest.mark.parametrize(
  554. "make_dataset, Stacking, Estimator",
  555. [
  556. (make_classification, StackingClassifier, LogisticRegression),
  557. (make_regression, StackingRegressor, LinearRegression),
  558. ],
  559. )
  560. def test_stacking_without_n_features_in(make_dataset, Stacking, Estimator):
  561. # Stacking supports estimators without `n_features_in_`. Regression test
  562. # for #17353
  563. class MyEstimator(Estimator):
  564. """Estimator without n_features_in_"""
  565. def fit(self, X, y):
  566. super().fit(X, y)
  567. del self.n_features_in_
  568. X, y = make_dataset(random_state=0, n_samples=100)
  569. stacker = Stacking(estimators=[("lr", MyEstimator())])
  570. msg = f"{Stacking.__name__} object has no attribute n_features_in_"
  571. with pytest.raises(AttributeError, match=msg):
  572. stacker.n_features_in_
  573. # Does not raise
  574. stacker.fit(X, y)
  575. msg = "'MyEstimator' object has no attribute 'n_features_in_'"
  576. with pytest.raises(AttributeError, match=msg):
  577. stacker.n_features_in_
  578. @pytest.mark.parametrize(
  579. "estimator",
  580. [
  581. # output a 2D array of the probability of the positive class for each output
  582. MLPClassifier(random_state=42),
  583. # output a list of 2D array containing the probability of each class
  584. # for each output
  585. RandomForestClassifier(random_state=42),
  586. ],
  587. ids=["MLPClassifier", "RandomForestClassifier"],
  588. )
  589. def test_stacking_classifier_multilabel_predict_proba(estimator):
  590. """Check the behaviour for the multilabel classification case and the
  591. `predict_proba` stacking method.
  592. Estimators are not consistent with the output arrays and we need to ensure that
  593. we handle all cases.
  594. """
  595. X_train, X_test, y_train, y_test = train_test_split(
  596. X_multilabel, y_multilabel, stratify=y_multilabel, random_state=42
  597. )
  598. n_outputs = 3
  599. estimators = [("est", estimator)]
  600. stacker = StackingClassifier(
  601. estimators=estimators,
  602. final_estimator=KNeighborsClassifier(),
  603. stack_method="predict_proba",
  604. ).fit(X_train, y_train)
  605. X_trans = stacker.transform(X_test)
  606. assert X_trans.shape == (X_test.shape[0], n_outputs)
  607. # we should not have any collinear classes and thus nothing should sum to 1
  608. assert not any(np.isclose(X_trans.sum(axis=1), 1.0))
  609. y_pred = stacker.predict(X_test)
  610. assert y_pred.shape == y_test.shape
  611. def test_stacking_classifier_multilabel_decision_function():
  612. """Check the behaviour for the multilabel classification case and the
  613. `decision_function` stacking method. Only `RidgeClassifier` supports this
  614. case.
  615. """
  616. X_train, X_test, y_train, y_test = train_test_split(
  617. X_multilabel, y_multilabel, stratify=y_multilabel, random_state=42
  618. )
  619. n_outputs = 3
  620. estimators = [("est", RidgeClassifier())]
  621. stacker = StackingClassifier(
  622. estimators=estimators,
  623. final_estimator=KNeighborsClassifier(),
  624. stack_method="decision_function",
  625. ).fit(X_train, y_train)
  626. X_trans = stacker.transform(X_test)
  627. assert X_trans.shape == (X_test.shape[0], n_outputs)
  628. y_pred = stacker.predict(X_test)
  629. assert y_pred.shape == y_test.shape
  630. @pytest.mark.parametrize("stack_method", ["auto", "predict"])
  631. @pytest.mark.parametrize("passthrough", [False, True])
  632. def test_stacking_classifier_multilabel_auto_predict(stack_method, passthrough):
  633. """Check the behaviour for the multilabel classification case for stack methods
  634. supported for all estimators or automatically picked up.
  635. """
  636. X_train, X_test, y_train, y_test = train_test_split(
  637. X_multilabel, y_multilabel, stratify=y_multilabel, random_state=42
  638. )
  639. y_train_before_fit = y_train.copy()
  640. n_outputs = 3
  641. estimators = [
  642. ("mlp", MLPClassifier(random_state=42)),
  643. ("rf", RandomForestClassifier(random_state=42)),
  644. ("ridge", RidgeClassifier()),
  645. ]
  646. final_estimator = KNeighborsClassifier()
  647. clf = StackingClassifier(
  648. estimators=estimators,
  649. final_estimator=final_estimator,
  650. passthrough=passthrough,
  651. stack_method=stack_method,
  652. ).fit(X_train, y_train)
  653. # make sure we don't change `y_train` inplace
  654. assert_array_equal(y_train_before_fit, y_train)
  655. y_pred = clf.predict(X_test)
  656. assert y_pred.shape == y_test.shape
  657. if stack_method == "auto":
  658. expected_stack_methods = ["predict_proba", "predict_proba", "decision_function"]
  659. else:
  660. expected_stack_methods = ["predict"] * len(estimators)
  661. assert clf.stack_method_ == expected_stack_methods
  662. n_features_X_trans = n_outputs * len(estimators)
  663. if passthrough:
  664. n_features_X_trans += X_train.shape[1]
  665. X_trans = clf.transform(X_test)
  666. assert X_trans.shape == (X_test.shape[0], n_features_X_trans)
  667. assert_array_equal(clf.classes_, [np.array([0, 1])] * n_outputs)
  668. @pytest.mark.parametrize(
  669. "stacker, feature_names, X, y, expected_names",
  670. [
  671. (
  672. StackingClassifier(
  673. estimators=[
  674. ("lr", LogisticRegression(random_state=0)),
  675. ("svm", LinearSVC(dual="auto", random_state=0)),
  676. ]
  677. ),
  678. iris.feature_names,
  679. X_iris,
  680. y_iris,
  681. [
  682. "stackingclassifier_lr0",
  683. "stackingclassifier_lr1",
  684. "stackingclassifier_lr2",
  685. "stackingclassifier_svm0",
  686. "stackingclassifier_svm1",
  687. "stackingclassifier_svm2",
  688. ],
  689. ),
  690. (
  691. StackingClassifier(
  692. estimators=[
  693. ("lr", LogisticRegression(random_state=0)),
  694. ("other", "drop"),
  695. ("svm", LinearSVC(dual="auto", random_state=0)),
  696. ]
  697. ),
  698. iris.feature_names,
  699. X_iris[:100],
  700. y_iris[:100], # keep only classes 0 and 1
  701. [
  702. "stackingclassifier_lr",
  703. "stackingclassifier_svm",
  704. ],
  705. ),
  706. (
  707. StackingRegressor(
  708. estimators=[
  709. ("lr", LinearRegression()),
  710. ("svm", LinearSVR(dual="auto", random_state=0)),
  711. ]
  712. ),
  713. diabetes.feature_names,
  714. X_diabetes,
  715. y_diabetes,
  716. [
  717. "stackingregressor_lr",
  718. "stackingregressor_svm",
  719. ],
  720. ),
  721. ],
  722. ids=[
  723. "StackingClassifier_multiclass",
  724. "StackingClassifier_binary",
  725. "StackingRegressor",
  726. ],
  727. )
  728. @pytest.mark.parametrize("passthrough", [True, False])
  729. def test_get_feature_names_out(
  730. stacker, feature_names, X, y, expected_names, passthrough
  731. ):
  732. """Check get_feature_names_out works for stacking."""
  733. stacker.set_params(passthrough=passthrough)
  734. stacker.fit(scale(X), y)
  735. if passthrough:
  736. expected_names = np.concatenate((expected_names, feature_names))
  737. names_out = stacker.get_feature_names_out(feature_names)
  738. assert_array_equal(names_out, expected_names)
  739. def test_stacking_classifier_base_regressor():
  740. """Check that a regressor can be used as the first layer in `StackingClassifier`."""
  741. X_train, X_test, y_train, y_test = train_test_split(
  742. scale(X_iris), y_iris, stratify=y_iris, random_state=42
  743. )
  744. clf = StackingClassifier(estimators=[("ridge", Ridge())])
  745. clf.fit(X_train, y_train)
  746. clf.predict(X_test)
  747. clf.predict_proba(X_test)
  748. assert clf.score(X_test, y_test) > 0.8