  1. """
  2. Testing for the bagging ensemble module (sklearn.ensemble.bagging).
  3. """
  4. # Author: Gilles Louppe
  5. # License: BSD 3 clause
  6. from itertools import cycle, product
  7. import joblib
  8. import numpy as np
  9. import pytest
  10. from scipy.sparse import csc_matrix, csr_matrix
  11. from sklearn.base import BaseEstimator
  12. from sklearn.datasets import load_diabetes, load_iris, make_hastie_10_2
  13. from sklearn.dummy import DummyClassifier, DummyRegressor
  14. from sklearn.ensemble import (
  15. BaggingClassifier,
  16. BaggingRegressor,
  17. HistGradientBoostingClassifier,
  18. HistGradientBoostingRegressor,
  19. )
  20. from sklearn.feature_selection import SelectKBest
  21. from sklearn.linear_model import LogisticRegression, Perceptron
  22. from sklearn.model_selection import GridSearchCV, ParameterGrid, train_test_split
  23. from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
  24. from sklearn.pipeline import make_pipeline
  25. from sklearn.preprocessing import FunctionTransformer, scale
  26. from sklearn.random_projection import SparseRandomProjection
  27. from sklearn.svm import SVC, SVR
  28. from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
  29. from sklearn.utils import check_random_state
  30. from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal
  31. rng = check_random_state(0)
  32. # also load the iris dataset
  33. # and randomly permute it
  34. iris = load_iris()
  35. perm = rng.permutation(iris.target.size)
  36. iris.data = iris.data[perm]
  37. iris.target = iris.target[perm]
  38. # also load the diabetes dataset
  39. # and randomly permute it
  40. diabetes = load_diabetes()
  41. perm = rng.permutation(diabetes.target.size)
  42. diabetes.data = diabetes.data[perm]
  43. diabetes.target = diabetes.target[perm]
  44. def test_classification():
  45. # Check classification for various parameter settings.
  46. rng = check_random_state(0)
  47. X_train, X_test, y_train, y_test = train_test_split(
  48. iris.data, iris.target, random_state=rng
  49. )
  50. grid = ParameterGrid(
  51. {
  52. "max_samples": [0.5, 1.0],
  53. "max_features": [1, 4],
  54. "bootstrap": [True, False],
  55. "bootstrap_features": [True, False],
  56. }
  57. )
  58. estimators = [
  59. None,
  60. DummyClassifier(),
  61. Perceptron(max_iter=20),
  62. DecisionTreeClassifier(max_depth=2),
  63. KNeighborsClassifier(),
  64. SVC(),
  65. ]
  66. # Try different parameter settings with different base classifiers without
  67. # doing the full cartesian product to keep the test durations low.
  68. for params, estimator in zip(grid, cycle(estimators)):
  69. BaggingClassifier(
  70. estimator=estimator,
  71. random_state=rng,
  72. n_estimators=2,
  73. **params,
  74. ).fit(X_train, y_train).predict(X_test)
  75. @pytest.mark.parametrize(
  76. "sparse_format, params, method",
  77. product(
  78. [csc_matrix, csr_matrix],
  79. [
  80. {
  81. "max_samples": 0.5,
  82. "max_features": 2,
  83. "bootstrap": True,
  84. "bootstrap_features": True,
  85. },
  86. {
  87. "max_samples": 1.0,
  88. "max_features": 4,
  89. "bootstrap": True,
  90. "bootstrap_features": True,
  91. },
  92. {"max_features": 2, "bootstrap": False, "bootstrap_features": True},
  93. {"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False},
  94. ],
  95. ["predict", "predict_proba", "predict_log_proba", "decision_function"],
  96. ),
  97. )
  98. def test_sparse_classification(sparse_format, params, method):
  99. # Check classification for various parameter settings on sparse input.
  100. class CustomSVC(SVC):
  101. """SVC variant that records the nature of the training set"""
  102. def fit(self, X, y):
  103. super().fit(X, y)
  104. self.data_type_ = type(X)
  105. return self
  106. rng = check_random_state(0)
  107. X_train, X_test, y_train, y_test = train_test_split(
  108. scale(iris.data), iris.target, random_state=rng
  109. )
  110. X_train_sparse = sparse_format(X_train)
  111. X_test_sparse = sparse_format(X_test)
  112. # Trained on sparse format
  113. sparse_classifier = BaggingClassifier(
  114. estimator=CustomSVC(kernel="linear", decision_function_shape="ovr"),
  115. random_state=1,
  116. **params,
  117. ).fit(X_train_sparse, y_train)
  118. sparse_results = getattr(sparse_classifier, method)(X_test_sparse)
  119. # Trained on dense format
  120. dense_classifier = BaggingClassifier(
  121. estimator=CustomSVC(kernel="linear", decision_function_shape="ovr"),
  122. random_state=1,
  123. **params,
  124. ).fit(X_train, y_train)
  125. dense_results = getattr(dense_classifier, method)(X_test)
  126. assert_array_almost_equal(sparse_results, dense_results)
  127. sparse_type = type(X_train_sparse)
  128. types = [i.data_type_ for i in sparse_classifier.estimators_]
  129. assert all([t == sparse_type for t in types])
  130. def test_regression():
  131. # Check regression for various parameter settings.
  132. rng = check_random_state(0)
  133. X_train, X_test, y_train, y_test = train_test_split(
  134. diabetes.data[:50], diabetes.target[:50], random_state=rng
  135. )
  136. grid = ParameterGrid(
  137. {
  138. "max_samples": [0.5, 1.0],
  139. "max_features": [0.5, 1.0],
  140. "bootstrap": [True, False],
  141. "bootstrap_features": [True, False],
  142. }
  143. )
  144. for estimator in [
  145. None,
  146. DummyRegressor(),
  147. DecisionTreeRegressor(),
  148. KNeighborsRegressor(),
  149. SVR(),
  150. ]:
  151. for params in grid:
  152. BaggingRegressor(estimator=estimator, random_state=rng, **params).fit(
  153. X_train, y_train
  154. ).predict(X_test)
  155. def test_sparse_regression():
  156. # Check regression for various parameter settings on sparse input.
  157. rng = check_random_state(0)
  158. X_train, X_test, y_train, y_test = train_test_split(
  159. diabetes.data[:50], diabetes.target[:50], random_state=rng
  160. )
    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set"""

        def fit(self, X, y):
            super().fit(X, y)
            self.data_type_ = type(X)
            return self

    parameter_sets = [
        {
            "max_samples": 0.5,
            "max_features": 2,
            "bootstrap": True,
            "bootstrap_features": True,
        },
        {
            "max_samples": 1.0,
            "max_features": 4,
            "bootstrap": True,
            "bootstrap_features": True,
        },
        {"max_features": 2, "bootstrap": False, "bootstrap_features": True},
        {"max_samples": 0.5, "bootstrap": True, "bootstrap_features": False},
    ]

    for sparse_format in [csc_matrix, csr_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)
        for params in parameter_sets:
            # Trained on sparse format
            sparse_classifier = BaggingRegressor(
                estimator=CustomSVR(), random_state=1, **params
            ).fit(X_train_sparse, y_train)
            sparse_results = sparse_classifier.predict(X_test_sparse)

            # Trained on dense format
            dense_results = (
                BaggingRegressor(estimator=CustomSVR(), random_state=1, **params)
                .fit(X_train, y_train)
                .predict(X_test)
            )

            sparse_type = type(X_train_sparse)
            types = [i.data_type_ for i in sparse_classifier.estimators_]

            assert_array_almost_equal(sparse_results, dense_results)
            assert all([t == sparse_type for t in types])
            assert_array_almost_equal(sparse_results, dense_results)
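

# Minimal estimator used by test_bootstrap_samples below: it only records the
# size and a joblib hash of the data it was fitted on.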
class DummySizeEstimator(BaseEstimator):
    def fit(self, X, y):
        self.training_size_ = X.shape[0]
        self.training_hash_ = joblib.hash(X)

    def predict(self, X):
        return np.ones(X.shape[0])


def test_bootstrap_samples():
    # Test that bootstrapping samples generates non-perfect base estimators.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, random_state=rng
    )
    estimator = DecisionTreeRegressor().fit(X_train, y_train)

    # without bootstrap, all trees are perfect on the training set
    ensemble = BaggingRegressor(
        estimator=DecisionTreeRegressor(),
        max_samples=1.0,
        bootstrap=False,
        random_state=rng,
    ).fit(X_train, y_train)
    assert estimator.score(X_train, y_train) == ensemble.score(X_train, y_train)

    # with bootstrap, trees are no longer perfect on the training set
    ensemble = BaggingRegressor(
        estimator=DecisionTreeRegressor(),
        max_samples=1.0,
        bootstrap=True,
        random_state=rng,
    ).fit(X_train, y_train)
    assert estimator.score(X_train, y_train) > ensemble.score(X_train, y_train)

    # check that each sampling corresponds to a complete bootstrap resample.
    # the size of each bootstrap should be the same as the input data but
    # the data should be different (checked using the hash of the data).
    ensemble = BaggingRegressor(estimator=DummySizeEstimator(), bootstrap=True).fit(
        X_train, y_train
    )
    training_hash = []
    for estimator in ensemble.estimators_:
        assert estimator.training_size_ == X_train.shape[0]
        training_hash.append(estimator.training_hash_)
    assert len(set(training_hash)) == len(training_hash)


def test_bootstrap_features():
    # Test that bootstrapping features may generate duplicate features.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, random_state=rng
    )
    ensemble = BaggingRegressor(
        estimator=DecisionTreeRegressor(),
        max_features=1.0,
        bootstrap_features=False,
        random_state=rng,
    ).fit(X_train, y_train)
    for features in ensemble.estimators_features_:
        assert diabetes.data.shape[1] == np.unique(features).shape[0]

    ensemble = BaggingRegressor(
        estimator=DecisionTreeRegressor(),
        max_features=1.0,
        bootstrap_features=True,
        random_state=rng,
    ).fit(X_train, y_train)
    for features in ensemble.estimators_features_:
        assert diabetes.data.shape[1] > np.unique(features).shape[0]


def test_probability():
    # Predict probabilities.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng
    )
    with np.errstate(divide="ignore", invalid="ignore"):
        # Normal case
        ensemble = BaggingClassifier(
            estimator=DecisionTreeClassifier(), random_state=rng
        ).fit(X_train, y_train)
        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test))
        )
        assert_array_almost_equal(
            ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test))
        )

        # Degenerate case, where some classes are missing
        ensemble = BaggingClassifier(
            estimator=LogisticRegression(), random_state=rng, max_samples=5
        ).fit(X_train, y_train)
        assert_array_almost_equal(
            np.sum(ensemble.predict_proba(X_test), axis=1), np.ones(len(X_test))
        )
        assert_array_almost_equal(
            ensemble.predict_proba(X_test), np.exp(ensemble.predict_log_proba(X_test))
        )


def test_oob_score_classification():
    # Check that the oob prediction is a good estimate of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng
    )
    for estimator in [DecisionTreeClassifier(), SVC()]:
        clf = BaggingClassifier(
            estimator=estimator,
            n_estimators=100,
            bootstrap=True,
            oob_score=True,
            random_state=rng,
        ).fit(X_train, y_train)
        test_score = clf.score(X_test, y_test)
        assert abs(test_score - clf.oob_score_) < 0.1

        # Test with few estimators
        warn_msg = (
            "Some inputs do not have OOB scores. This probably means too few "
            "estimators were used to compute any reliable oob estimates."
        )
        with pytest.warns(UserWarning, match=warn_msg):
            clf = BaggingClassifier(
                estimator=estimator,
                n_estimators=1,
                bootstrap=True,
                oob_score=True,
                random_state=rng,
            )
            clf.fit(X_train, y_train)


def test_oob_score_regression():
    # Check that the oob prediction is a good estimate of the generalization
    # error.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, random_state=rng
    )
    clf = BaggingRegressor(
        estimator=DecisionTreeRegressor(),
        n_estimators=50,
        bootstrap=True,
        oob_score=True,
        random_state=rng,
    ).fit(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    assert abs(test_score - clf.oob_score_) < 0.1

    # Test with few estimators
    warn_msg = (
        "Some inputs do not have OOB scores. This probably means too few "
        "estimators were used to compute any reliable oob estimates."
    )
    with pytest.warns(UserWarning, match=warn_msg):
        regr = BaggingRegressor(
            estimator=DecisionTreeRegressor(),
            n_estimators=1,
            bootstrap=True,
            oob_score=True,
            random_state=rng,
        )
        regr.fit(X_train, y_train)


def test_single_estimator():
    # Check singleton ensembles.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, random_state=rng
    )
    clf1 = BaggingRegressor(
        estimator=KNeighborsRegressor(),
        n_estimators=1,
        bootstrap=False,
        bootstrap_features=False,
        random_state=rng,
    ).fit(X_train, y_train)
    clf2 = KNeighborsRegressor().fit(X_train, y_train)
    assert_array_almost_equal(clf1.predict(X_test), clf2.predict(X_test))


def test_error():
    # Check that decision_function is not exposed when the base estimator does
    # not support it.
    X, y = iris.data, iris.target
    base = DecisionTreeClassifier()
    assert not hasattr(BaggingClassifier(base).fit(X, y), "decision_function")


def test_parallel_classification():
    # Check parallel classification.
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=0
    )
    ensemble = BaggingClassifier(
        DecisionTreeClassifier(), n_jobs=3, random_state=0
    ).fit(X_train, y_train)

    # predict_proba
    y1 = ensemble.predict_proba(X_test)
    ensemble.set_params(n_jobs=1)
    y2 = ensemble.predict_proba(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = BaggingClassifier(
        DecisionTreeClassifier(), n_jobs=1, random_state=0
    ).fit(X_train, y_train)
    y3 = ensemble.predict_proba(X_test)
    assert_array_almost_equal(y1, y3)

    # decision_function
    ensemble = BaggingClassifier(
        SVC(decision_function_shape="ovr"), n_jobs=3, random_state=0
    ).fit(X_train, y_train)
    decisions1 = ensemble.decision_function(X_test)
    ensemble.set_params(n_jobs=1)
    decisions2 = ensemble.decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions2)

    ensemble = BaggingClassifier(
        SVC(decision_function_shape="ovr"), n_jobs=1, random_state=0
    ).fit(X_train, y_train)
    decisions3 = ensemble.decision_function(X_test)
    assert_array_almost_equal(decisions1, decisions3)


def test_parallel_regression():
    # Check parallel regression.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, random_state=rng
    )
    ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit(
        X_train, y_train
    )
    ensemble.set_params(n_jobs=1)
    y1 = ensemble.predict(X_test)
    ensemble.set_params(n_jobs=2)
    y2 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y2)

    ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=1, random_state=0).fit(
        X_train, y_train
    )
    y3 = ensemble.predict(X_test)
    assert_array_almost_equal(y1, y3)


def test_gridsearch():
    # Check that bagging ensembles can be grid-searched.
    # Transform iris into a binary classification task
    X, y = iris.data, iris.target
    y[y == 2] = 1

    # Grid search with scoring based on decision_function
    parameters = {"n_estimators": (1, 2), "estimator__C": (1, 2)}

    GridSearchCV(BaggingClassifier(SVC()), parameters, scoring="roc_auc").fit(X, y)


def test_estimator():
    # Check estimator and its default values.
    rng = check_random_state(0)

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, random_state=rng
    )

    ensemble = BaggingClassifier(None, n_jobs=3, random_state=0).fit(X_train, y_train)
    assert isinstance(ensemble.estimator_, DecisionTreeClassifier)

    ensemble = BaggingClassifier(
        DecisionTreeClassifier(), n_jobs=3, random_state=0
    ).fit(X_train, y_train)
    assert isinstance(ensemble.estimator_, DecisionTreeClassifier)

    ensemble = BaggingClassifier(Perceptron(), n_jobs=3, random_state=0).fit(
        X_train, y_train
    )
    assert isinstance(ensemble.estimator_, Perceptron)

    # Regression
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, random_state=rng
    )

    ensemble = BaggingRegressor(None, n_jobs=3, random_state=0).fit(X_train, y_train)
    assert isinstance(ensemble.estimator_, DecisionTreeRegressor)

    ensemble = BaggingRegressor(DecisionTreeRegressor(), n_jobs=3, random_state=0).fit(
        X_train, y_train
    )
    assert isinstance(ensemble.estimator_, DecisionTreeRegressor)

    ensemble = BaggingRegressor(SVR(), n_jobs=3, random_state=0).fit(X_train, y_train)
    assert isinstance(ensemble.estimator_, SVR)


def test_bagging_with_pipeline():
    estimator = BaggingClassifier(
        make_pipeline(SelectKBest(k=1), DecisionTreeClassifier()), max_features=2
    )
    estimator.fit(iris.data, iris.target)
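    # the bagging meta-estimator is expected to seed its cloned sub-estimators,
    # so the random_state of the tree inside the first fitted pipeline should
    # now be an integer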
    assert isinstance(estimator[0].steps[-1][1].random_state, int)
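

# Minimal classifier used below: it always predicts the first class and its fit
# method does not accept a sample_weight argument.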
class DummyZeroEstimator(BaseEstimator):
    def fit(self, X, y):
        self.classes_ = np.unique(y)
        return self

    def predict(self, X):
        return self.classes_[np.zeros(X.shape[0], dtype=int)]


def test_bagging_sample_weight_unsupported_but_passed():
    estimator = BaggingClassifier(DummyZeroEstimator())
    rng = check_random_state(0)

    estimator.fit(iris.data, iris.target).predict(iris.data)
    with pytest.raises(ValueError):
        estimator.fit(
            iris.data,
            iris.target,
            sample_weight=rng.randint(10, size=(iris.data.shape[0])),
        )


def test_warm_start(random_state=42):
    # Test if fitting incrementally with warm start gives a forest of the
    # right size and the same results as a normal fit.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)

    clf_ws = None
    for n_estimators in [5, 10]:
        if clf_ws is None:
            clf_ws = BaggingClassifier(
                n_estimators=n_estimators, random_state=random_state, warm_start=True
            )
        else:
            clf_ws.set_params(n_estimators=n_estimators)
        clf_ws.fit(X, y)
        assert len(clf_ws) == n_estimators

    clf_no_ws = BaggingClassifier(
        n_estimators=10, random_state=random_state, warm_start=False
    )
    clf_no_ws.fit(X, y)

    assert set([tree.random_state for tree in clf_ws]) == set(
        [tree.random_state for tree in clf_no_ws]
    )


def test_warm_start_smaller_n_estimators():
    # Test that a warm-started second fit with a smaller n_estimators raises an error.
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True)
    clf.fit(X, y)
    clf.set_params(n_estimators=4)
    with pytest.raises(ValueError):
        clf.fit(X, y)


def test_warm_start_equal_n_estimators():
    # Test that nothing happens when fitting without increasing n_estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)
    clf = BaggingClassifier(n_estimators=5, warm_start=True, random_state=83)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # modify X to nonsense values; this should not change anything
    X_train += 1.0
    warn_msg = "Warm-start fitting without increasing n_estimators does not"
    with pytest.warns(UserWarning, match=warn_msg):
        clf.fit(X_train, y_train)

    assert_array_equal(y_pred, clf.predict(X_test))


def test_warm_start_equivalence():
    # warm started classifier with 5+5 estimators should be equivalent to
    # one classifier with 10 estimators
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=43)

    clf_ws = BaggingClassifier(n_estimators=5, warm_start=True, random_state=3141)
    clf_ws.fit(X_train, y_train)
    clf_ws.set_params(n_estimators=10)
    clf_ws.fit(X_train, y_train)
    y1 = clf_ws.predict(X_test)

    clf = BaggingClassifier(n_estimators=10, warm_start=False, random_state=3141)
    clf.fit(X_train, y_train)
    y2 = clf.predict(X_test)

    assert_array_almost_equal(y1, y2)


def test_warm_start_with_oob_score_fails():
    # Check using oob_score and warm_start simultaneously fails
    X, y = make_hastie_10_2(n_samples=20, random_state=1)
    clf = BaggingClassifier(n_estimators=5, warm_start=True, oob_score=True)
    with pytest.raises(ValueError):
        clf.fit(X, y)


def test_oob_score_removed_on_warm_start():
    X, y = make_hastie_10_2(n_samples=100, random_state=1)

    clf = BaggingClassifier(n_estimators=5, oob_score=True)
    clf.fit(X, y)

    clf.set_params(warm_start=True, oob_score=False, n_estimators=10)
    clf.fit(X, y)

    with pytest.raises(AttributeError):
        getattr(clf, "oob_score_")


def test_oob_score_consistency():
    # Make sure OOB scores are identical when random_state, estimator, and
    # training data are fixed and fitting is done twice
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(
        KNeighborsClassifier(),
        max_samples=0.5,
        max_features=0.5,
        oob_score=True,
        random_state=1,
    )
    assert bagging.fit(X, y).oob_score_ == bagging.fit(X, y).oob_score_


def test_estimators_samples():
    # Check that format of estimators_samples_ is correct and that results
    # generated at fit time can be identically reproduced at a later time
    # using data saved in object attributes.
    X, y = make_hastie_10_2(n_samples=200, random_state=1)
    bagging = BaggingClassifier(
        LogisticRegression(),
        max_samples=0.5,
        max_features=0.5,
        random_state=1,
        bootstrap=False,
    )
    bagging.fit(X, y)

    # Get relevant attributes
    estimators_samples = bagging.estimators_samples_
    estimators_features = bagging.estimators_features_
    estimators = bagging.estimators_

    # Test for correct formatting
    assert len(estimators_samples) == len(estimators)
    assert len(estimators_samples[0]) == len(X) // 2
    assert estimators_samples[0].dtype.kind == "i"

    # Re-fit single estimator to test for consistent sampling
    estimator_index = 0
    estimator_samples = estimators_samples[estimator_index]
    estimator_features = estimators_features[estimator_index]
    estimator = estimators[estimator_index]

    X_train = (X[estimator_samples])[:, estimator_features]
    y_train = y[estimator_samples]

    orig_coefs = estimator.coef_
    estimator.fit(X_train, y_train)
    new_coefs = estimator.coef_

    assert_array_almost_equal(orig_coefs, new_coefs)


def test_estimators_samples_deterministic():
    # This test is a regression test to check that with a random step
    # (e.g. SparseRandomProjection) and a given random state, the results
    # generated at fit time can be identically reproduced at a later time using
    # data saved in object attributes. Check issue #9524 for full discussion.
    iris = load_iris()
    X, y = iris.data, iris.target

    base_pipeline = make_pipeline(
        SparseRandomProjection(n_components=2), LogisticRegression()
    )
    clf = BaggingClassifier(estimator=base_pipeline, max_samples=0.5, random_state=0)
    clf.fit(X, y)
    pipeline_estimator_coef = clf.estimators_[0].steps[-1][1].coef_.copy()

    estimator = clf.estimators_[0]
    estimator_sample = clf.estimators_samples_[0]
    estimator_feature = clf.estimators_features_[0]

    X_train = (X[estimator_sample])[:, estimator_feature]
    y_train = y[estimator_sample]

    estimator.fit(X_train, y_train)
    assert_array_equal(estimator.steps[-1][1].coef_, pipeline_estimator_coef)


def test_max_samples_consistency():
    # Make sure validated max_samples and original max_samples are identical
    # when a valid integer max_samples is supplied by the user
    max_samples = 100
    X, y = make_hastie_10_2(n_samples=2 * max_samples, random_state=1)
    bagging = BaggingClassifier(
        KNeighborsClassifier(),
        max_samples=max_samples,
        max_features=0.5,
        random_state=1,
    )
    bagging.fit(X, y)
    assert bagging._max_samples == max_samples


def test_set_oob_score_label_encoding():
    # Make sure the oob_score doesn't change when the labels change
    # See: https://github.com/scikit-learn/scikit-learn/issues/8933
    random_state = 5
    X = [[-1], [0], [1]] * 5
    Y1 = ["A", "B", "C"] * 5
    Y2 = [-1, 0, 1] * 5
    Y3 = [0, 1, 2] * 5
    x1 = (
        BaggingClassifier(oob_score=True, random_state=random_state)
        .fit(X, Y1)
        .oob_score_
    )
    x2 = (
        BaggingClassifier(oob_score=True, random_state=random_state)
        .fit(X, Y2)
        .oob_score_
    )
    x3 = (
        BaggingClassifier(oob_score=True, random_state=random_state)
        .fit(X, Y3)
        .oob_score_
    )
    assert [x1, x2] == [x3, x3]
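

# Helper for the missing-input tests below: cast to float and map NaN/inf
# entries to 0 so that the wrapped estimator can be fitted.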
def replace(X):
    X = X.astype("float", copy=True)
    X[~np.isfinite(X)] = 0
    return X


def test_bagging_regressor_with_missing_inputs():
    # Check that BaggingRegressor can accept X with missing/infinite data
    X = np.array(
        [
            [1, 3, 5],
            [2, None, 6],
            [2, np.nan, 6],
            [2, np.inf, 6],
            [2, -np.inf, 6],
        ]
    )
    y_values = [
        np.array([2, 3, 3, 3, 3]),
        np.array(
            [
                [2, 1, 9],
                [3, 6, 8],
                [3, 6, 8],
                [3, 6, 8],
                [3, 6, 8],
            ]
        ),
    ]
    for y in y_values:
        regressor = DecisionTreeRegressor()
        pipeline = make_pipeline(FunctionTransformer(replace), regressor)
        pipeline.fit(X, y).predict(X)
        bagging_regressor = BaggingRegressor(pipeline)
        y_hat = bagging_regressor.fit(X, y).predict(X)
        assert y.shape == y_hat.shape

    # Verify that exceptions can be raised by wrapper regressor
    regressor = DecisionTreeRegressor()
    pipeline = make_pipeline(regressor)
    with pytest.raises(ValueError):
        pipeline.fit(X, y)
    bagging_regressor = BaggingRegressor(pipeline)
    with pytest.raises(ValueError):
        bagging_regressor.fit(X, y)


def test_bagging_classifier_with_missing_inputs():
    # Check that BaggingClassifier can accept X with missing/infinite data
    X = np.array(
        [
            [1, 3, 5],
            [2, None, 6],
            [2, np.nan, 6],
            [2, np.inf, 6],
            [2, -np.inf, 6],
        ]
    )
    y = np.array([3, 6, 6, 6, 6])
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(FunctionTransformer(replace), classifier)
    pipeline.fit(X, y).predict(X)
    bagging_classifier = BaggingClassifier(pipeline)
    bagging_classifier.fit(X, y)
    y_hat = bagging_classifier.predict(X)
    assert y.shape == y_hat.shape
    bagging_classifier.predict_log_proba(X)
    bagging_classifier.predict_proba(X)

    # Verify that exceptions can be raised by wrapper classifier
    classifier = DecisionTreeClassifier()
    pipeline = make_pipeline(classifier)
    with pytest.raises(ValueError):
        pipeline.fit(X, y)
    bagging_classifier = BaggingClassifier(pipeline)
    with pytest.raises(ValueError):
        bagging_classifier.fit(X, y)


def test_bagging_small_max_features():
    # Check that Bagging estimator can accept low fractional max_features
    X = np.array([[1, 2], [3, 4]])
    y = np.array([1, 0])

    bagging = BaggingClassifier(LogisticRegression(), max_features=0.3, random_state=1)
    bagging.fit(X, y)


def test_bagging_get_estimators_indices():
    # Check that Bagging estimator can generate sample indices properly
    # Non-regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/16436

    rng = np.random.RandomState(0)
    X = rng.randn(13, 4)
    y = np.arange(13)

    class MyEstimator(DecisionTreeRegressor):
        """An estimator which stores y indices information at fit."""

        def fit(self, X, y):
            self._sample_indices = y

    clf = BaggingRegressor(estimator=MyEstimator(), n_estimators=1, random_state=0)
    clf.fit(X, y)

    assert_array_equal(clf.estimators_[0]._sample_indices, clf.estimators_samples_[0])


# TODO(1.4): remove in 1.4
@pytest.mark.parametrize(
    "Bagging, Estimator",
    [
        (BaggingClassifier, DecisionTreeClassifier),
        (BaggingRegressor, DecisionTreeRegressor),
    ],
)
def test_base_estimator_argument_deprecated(Bagging, Estimator):
    X = np.array([[1, 2], [3, 4]])
    y = np.array([1, 0])
    model = Bagging(base_estimator=Estimator(), n_estimators=10)

    warn_msg = (
        "`base_estimator` was renamed to `estimator` in version 1.2 and "
        "will be removed in 1.4."
    )
    with pytest.warns(FutureWarning, match=warn_msg):
        model.fit(X, y)


# TODO(1.4): remove in 1.4
@pytest.mark.parametrize(
    "Bagging",
    [BaggingClassifier, BaggingRegressor],
)
def test_base_estimator_property_deprecated(Bagging):
    X = np.array([[1, 2], [3, 4]])
    y = np.array([1, 0])
    model = Bagging()
    model.fit(X, y)

    warn_msg = (
        "Attribute `base_estimator_` was deprecated in version 1.2 and "
        "will be removed in 1.4. Use `estimator_` instead."
    )
    with pytest.warns(FutureWarning, match=warn_msg):
        model.base_estimator_


# TODO(1.4): remove
def test_deprecated_base_estimator_has_decision_function():
    """Check that `BaggingClassifier` delegates to a classifier with
    `decision_function`."""
    iris = load_iris()
    X, y = iris.data, iris.target
    clf = BaggingClassifier(base_estimator=SVC())
    assert hasattr(clf, "decision_function")
    warn_msg = (
        "`base_estimator` was renamed to `estimator` in version 1.2 and "
        "will be removed in 1.4."
    )
    with pytest.warns(FutureWarning, match=warn_msg):
        y_decision = clf.fit(X, y).decision_function(X)
    assert y_decision.shape == (150, 3)


@pytest.mark.parametrize(
    "bagging, expected_allow_nan",
    [
        (BaggingClassifier(HistGradientBoostingClassifier(max_iter=1)), True),
        (BaggingRegressor(HistGradientBoostingRegressor(max_iter=1)), True),
        (BaggingClassifier(LogisticRegression()), False),
        (BaggingRegressor(SVR()), False),
    ],
)
def test_bagging_allow_nan_tag(bagging, expected_allow_nan):
    """Check that bagging inherits allow_nan tag."""
    assert bagging._get_tags()["allow_nan"] == expected_allow_nan