  1. """Testing for the boost module (sklearn.ensemble.boost)."""
  2. import re
  3. import numpy as np
  4. import pytest
  5. from scipy.sparse import coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix
  6. from sklearn import datasets
  7. from sklearn.base import BaseEstimator, clone
  8. from sklearn.dummy import DummyClassifier, DummyRegressor
  9. from sklearn.ensemble import AdaBoostClassifier, AdaBoostRegressor
  10. from sklearn.ensemble._weight_boosting import _samme_proba
  11. from sklearn.linear_model import LinearRegression
  12. from sklearn.model_selection import GridSearchCV, train_test_split
  13. from sklearn.svm import SVC, SVR
  14. from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
  15. from sklearn.utils import shuffle
  16. from sklearn.utils._mocking import NoSampleWeightWrapper
  17. from sklearn.utils._testing import (
  18. assert_allclose,
  19. assert_array_almost_equal,
  20. assert_array_equal,
  21. assert_array_less,
  22. )
  23. # Common random state
  24. rng = np.random.RandomState(0)
  25. # Toy sample
  26. X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
  27. y_class = ["foo", "foo", "foo", 1, 1, 1] # test string class labels
  28. y_regr = [-1, -1, -1, 1, 1, 1]
  29. T = [[-1, -1], [2, 2], [3, 2]]
  30. y_t_class = ["foo", 1, 1]
  31. y_t_regr = [-1, 1, 1]
  32. # Load the iris dataset and randomly permute it
  33. iris = datasets.load_iris()
  34. perm = rng.permutation(iris.target.size)
  35. iris.data, iris.target = shuffle(iris.data, iris.target, random_state=rng)
  36. # Load the diabetes dataset and randomly permute it
  37. diabetes = datasets.load_diabetes()
  38. diabetes.data, diabetes.target = shuffle(
  39. diabetes.data, diabetes.target, random_state=rng
  40. )


def test_samme_proba():
    # Test the `_samme_proba` helper function.

    # Define some example (bad) `predict_proba` output.
    probs = np.array(
        [[1, 1e-6, 0], [0.19, 0.6, 0.2], [-999, 0.51, 0.5], [1e-6, 1, 1e-9]]
    )
    probs /= np.abs(probs.sum(axis=1))[:, np.newaxis]

    # `_samme_proba` calls estimator.predict_proba.
    # Make a mock object so I can control what gets returned.
    class MockEstimator:
        def predict_proba(self, X):
            assert_array_equal(X.shape, probs.shape)
            return probs

    mock = MockEstimator()

    samme_proba = _samme_proba(mock, 3, np.ones_like(probs))

    assert_array_equal(samme_proba.shape, probs.shape)
    assert np.isfinite(samme_proba).all()

    # Make sure that the correct elements come out as smallest --
    # `_samme_proba` should preserve the ordering in each example.
    assert_array_equal(np.argmin(samme_proba, axis=1), [2, 0, 0, 2])
    assert_array_equal(np.argmax(samme_proba, axis=1), [0, 1, 1, 1])
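

# For context: a minimal sketch of the SAMME.R mapping exercised above,
# following the formulation in Zhu et al. (2009). This is an illustrative
# re-derivation, not the library's implementation; the function name and the
# clipping choice are assumptions. Note the transform is monotone within each
# row, which is why the argmin/argmax assertions above hold.
def _samme_proba_sketch(estimator, n_classes, X):
    proba = estimator.predict_proba(X)
    # Clip away non-positive entries so the log below stays finite.
    proba = np.clip(proba, np.finfo(proba.dtype).eps, None)
    log_proba = np.log(proba)
    # Symmetrize the log-probabilities (subtract the per-sample mean so each
    # row sums to zero), then scale by (K - 1).
    return (n_classes - 1) * (log_proba - log_proba.mean(axis=1)[:, np.newaxis])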


def test_oneclass_adaboost_proba():
    # Test predict_proba robustness for one class label input.
    # In response to issue #7501
    # https://github.com/scikit-learn/scikit-learn/issues/7501
    y_t = np.ones(len(X))
    clf = AdaBoostClassifier().fit(X, y_t)
    assert_array_almost_equal(clf.predict_proba(X), np.ones((len(X), 1)))


@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_classification_toy(algorithm):
    # Check classification on a toy dataset.
    clf = AdaBoostClassifier(algorithm=algorithm, random_state=0)
    clf.fit(X, y_class)
    assert_array_equal(clf.predict(T), y_t_class)
    assert_array_equal(np.unique(np.asarray(y_t_class)), clf.classes_)
    assert clf.predict_proba(T).shape == (len(T), 2)
    assert clf.decision_function(T).shape == (len(T),)


def test_regression_toy():
    # Check regression on a toy dataset.
    clf = AdaBoostRegressor(random_state=0)
    clf.fit(X, y_regr)
    assert_array_equal(clf.predict(T), y_t_regr)


def test_iris():
    # Check consistency on dataset iris.
    classes = np.unique(iris.target)
    clf_samme = prob_samme = None

    for alg in ["SAMME", "SAMME.R"]:
        clf = AdaBoostClassifier(algorithm=alg)
        clf.fit(iris.data, iris.target)

        assert_array_equal(classes, clf.classes_)
        proba = clf.predict_proba(iris.data)
        if alg == "SAMME":
            clf_samme = clf
            prob_samme = proba
        assert proba.shape[1] == len(classes)
        assert clf.decision_function(iris.data).shape[1] == len(classes)

        score = clf.score(iris.data, iris.target)
        assert score > 0.9, "Failed with algorithm %s and score = %f" % (alg, score)

        # Check we used multiple estimators
        assert len(clf.estimators_) > 1
        # Check for distinct random states (see issue #7408)
        assert len(set(est.random_state for est in clf.estimators_)) == len(
            clf.estimators_
        )

    # Somewhat hacky regression test: prior to
    # ae7adc880d624615a34bafdb1d75ef67051b8200,
    # predict_proba returned SAMME.R values for SAMME.
    clf_samme.algorithm = "SAMME.R"
    assert_array_less(0, np.abs(clf_samme.predict_proba(iris.data) - prob_samme))


@pytest.mark.parametrize("loss", ["linear", "square", "exponential"])
def test_diabetes(loss):
    # Check consistency on dataset diabetes.
    reg = AdaBoostRegressor(loss=loss, random_state=0)
    reg.fit(diabetes.data, diabetes.target)
    score = reg.score(diabetes.data, diabetes.target)
    assert score > 0.55

    # Check we used multiple estimators
    assert len(reg.estimators_) > 1
    # Check for distinct random states (see issue #7408)
    assert len(set(est.random_state for est in reg.estimators_)) == len(reg.estimators_)


@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_staged_predict(algorithm):
    # Check staged predictions.
    rng = np.random.RandomState(0)
    iris_weights = rng.randint(10, size=iris.target.shape)
    diabetes_weights = rng.randint(10, size=diabetes.target.shape)

    clf = AdaBoostClassifier(algorithm=algorithm, n_estimators=10)
    clf.fit(iris.data, iris.target, sample_weight=iris_weights)

    predictions = clf.predict(iris.data)
    staged_predictions = [p for p in clf.staged_predict(iris.data)]
    proba = clf.predict_proba(iris.data)
    staged_probas = [p for p in clf.staged_predict_proba(iris.data)]
    score = clf.score(iris.data, iris.target, sample_weight=iris_weights)
    staged_scores = [
        s for s in clf.staged_score(iris.data, iris.target, sample_weight=iris_weights)
    ]

    assert len(staged_predictions) == 10
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert len(staged_probas) == 10
    assert_array_almost_equal(proba, staged_probas[-1])
    assert len(staged_scores) == 10
    assert_array_almost_equal(score, staged_scores[-1])

    # AdaBoost regression
    clf = AdaBoostRegressor(n_estimators=10, random_state=0)
    clf.fit(diabetes.data, diabetes.target, sample_weight=diabetes_weights)

    predictions = clf.predict(diabetes.data)
    staged_predictions = [p for p in clf.staged_predict(diabetes.data)]
    score = clf.score(diabetes.data, diabetes.target, sample_weight=diabetes_weights)
    staged_scores = [
        s
        for s in clf.staged_score(
            diabetes.data, diabetes.target, sample_weight=diabetes_weights
        )
    ]

    assert len(staged_predictions) == 10
    assert_array_almost_equal(predictions, staged_predictions[-1])
    assert len(staged_scores) == 10
    assert_array_almost_equal(score, staged_scores[-1])


def test_gridsearch():
    # Check that base trees can be grid-searched.
    # AdaBoost classification
    boost = AdaBoostClassifier(estimator=DecisionTreeClassifier())
    parameters = {
        "n_estimators": (1, 2),
        "estimator__max_depth": (1, 2),
        "algorithm": ("SAMME", "SAMME.R"),
    }
    clf = GridSearchCV(boost, parameters)
    clf.fit(iris.data, iris.target)

    # AdaBoost regression
    boost = AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=0)
    parameters = {"n_estimators": (1, 2), "estimator__max_depth": (1, 2)}
    clf = GridSearchCV(boost, parameters)
    clf.fit(diabetes.data, diabetes.target)


def test_pickle():
    # Check picklability.
    import pickle

    # AdaBoost classifier
    for alg in ["SAMME", "SAMME.R"]:
        obj = AdaBoostClassifier(algorithm=alg)
        obj.fit(iris.data, iris.target)
        score = obj.score(iris.data, iris.target)
        s = pickle.dumps(obj)

        obj2 = pickle.loads(s)
        assert type(obj2) == obj.__class__
        score2 = obj2.score(iris.data, iris.target)
        assert score == score2

    # AdaBoost regressor
    obj = AdaBoostRegressor(random_state=0)
    obj.fit(diabetes.data, diabetes.target)
    score = obj.score(diabetes.data, diabetes.target)
    s = pickle.dumps(obj)

    obj2 = pickle.loads(s)
    assert type(obj2) == obj.__class__
    score2 = obj2.score(diabetes.data, diabetes.target)
    assert score == score2


def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(
        n_samples=2000,
        n_features=10,
        n_informative=3,
        n_redundant=0,
        n_repeated=0,
        shuffle=False,
        random_state=1,
    )

    for alg in ["SAMME", "SAMME.R"]:
        clf = AdaBoostClassifier(algorithm=alg)

        clf.fit(X, y)
        importances = clf.feature_importances_

        assert importances.shape[0] == 10
        assert (importances[:3, np.newaxis] >= importances[3:]).all()


def test_adaboost_classifier_sample_weight_error():
    # Test that it gives proper exception on incorrect sample weight.
    clf = AdaBoostClassifier()
    msg = re.escape("sample_weight.shape == (1,), expected (6,)")
    with pytest.raises(ValueError, match=msg):
        clf.fit(X, y_class, sample_weight=np.asarray([-1]))


def test_estimator():
    # Test different estimators.
    from sklearn.ensemble import RandomForestClassifier

    # XXX doesn't work with y_class because RF doesn't support classes_
    # Shouldn't AdaBoost run a LabelBinarizer?
    clf = AdaBoostClassifier(RandomForestClassifier())
    clf.fit(X, y_regr)

    clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
    clf.fit(X, y_class)

    from sklearn.ensemble import RandomForestRegressor

    clf = AdaBoostRegressor(RandomForestRegressor(), random_state=0)
    clf.fit(X, y_regr)

    clf = AdaBoostRegressor(SVR(), random_state=0)
    clf.fit(X, y_regr)

    # Check that an empty discrete ensemble fails in fit, not predict.
    X_fail = [[1, 1], [1, 1], [1, 1], [1, 1]]
    y_fail = ["foo", "bar", 1, 2]
    clf = AdaBoostClassifier(SVC(), algorithm="SAMME")
    with pytest.raises(ValueError, match="worse than random"):
        clf.fit(X_fail, y_fail)


def test_sample_weights_infinite():
    msg = "Sample weights have reached infinite values"
    clf = AdaBoostClassifier(n_estimators=30, learning_rate=23.0, algorithm="SAMME")
    with pytest.warns(UserWarning, match=msg):
        clf.fit(iris.data, iris.target)


def test_sparse_classification():
    # Check classification with sparse input.

    class CustomSVC(SVC):
        """SVC variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit that carries the data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_multilabel_classification(
        n_classes=1, n_samples=15, n_features=5, random_state=42
    )
    # Flatten y to a 1d array
    y = np.ravel(y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix, dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_classifier = AdaBoostClassifier(
            estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME",
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_classifier = AdaBoostClassifier(
            estimator=CustomSVC(probability=True),
            random_state=1,
            algorithm="SAMME",
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_classifier.predict(X_test_sparse)
        dense_results = dense_classifier.predict(X_test)
        assert_array_equal(sparse_results, dense_results)

        # decision_function
        sparse_results = sparse_classifier.decision_function(X_test_sparse)
        dense_results = dense_classifier.decision_function(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # predict_log_proba
        sparse_results = sparse_classifier.predict_log_proba(X_test_sparse)
        dense_results = dense_classifier.predict_log_proba(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # predict_proba
        sparse_results = sparse_classifier.predict_proba(X_test_sparse)
        dense_results = dense_classifier.predict_proba(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # score
        sparse_results = sparse_classifier.score(X_test_sparse, y_test)
        dense_results = dense_classifier.score(X_test, y_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # staged_decision_function
        sparse_results = sparse_classifier.staged_decision_function(X_test_sparse)
        dense_results = dense_classifier.staged_decision_function(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sparse_res, dense_res)

        # staged_predict
        sparse_results = sparse_classifier.staged_predict(X_test_sparse)
        dense_results = dense_classifier.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # staged_predict_proba
        sparse_results = sparse_classifier.staged_predict_proba(X_test_sparse)
        dense_results = dense_classifier.staged_predict_proba(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sparse_res, dense_res)

        # staged_score
        sparse_results = sparse_classifier.staged_score(X_test_sparse, y_test)
        dense_results = dense_classifier.staged_score(X_test, y_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_equal(sparse_res, dense_res)

        # Verify that the sparsity of the data is maintained during training
        types = [i.data_type_ for i in sparse_classifier.estimators_]
        assert all(t in (csc_matrix, csr_matrix) for t in types)


def test_sparse_regression():
    # Check regression with sparse input.

    class CustomSVR(SVR):
        """SVR variant that records the nature of the training set."""

        def fit(self, X, y, sample_weight=None):
            """Modification on fit that carries the data type for later verification."""
            super().fit(X, y, sample_weight=sample_weight)
            self.data_type_ = type(X)
            return self

    X, y = datasets.make_regression(
        n_samples=15, n_features=50, n_targets=1, random_state=42
    )

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    for sparse_format in [csc_matrix, csr_matrix, lil_matrix, coo_matrix, dok_matrix]:
        X_train_sparse = sparse_format(X_train)
        X_test_sparse = sparse_format(X_test)

        # Trained on sparse format
        sparse_regressor = AdaBoostRegressor(
            estimator=CustomSVR(), random_state=1
        ).fit(X_train_sparse, y_train)

        # Trained on dense format
        dense_regressor = AdaBoostRegressor(
            estimator=CustomSVR(), random_state=1
        ).fit(X_train, y_train)

        # predict
        sparse_results = sparse_regressor.predict(X_test_sparse)
        dense_results = dense_regressor.predict(X_test)
        assert_array_almost_equal(sparse_results, dense_results)

        # staged_predict
        sparse_results = sparse_regressor.staged_predict(X_test_sparse)
        dense_results = dense_regressor.staged_predict(X_test)
        for sparse_res, dense_res in zip(sparse_results, dense_results):
            assert_array_almost_equal(sparse_res, dense_res)

        # Verify that the sparsity of the data is maintained during training
        types = [i.data_type_ for i in sparse_regressor.estimators_]
        assert all(t in (csc_matrix, csr_matrix) for t in types)


def test_sample_weight_adaboost_regressor():
    """
    AdaBoostRegressor should work without sample_weight support in the base
    estimator: the random weighted sampling is done internally in the _boost
    method of AdaBoostRegressor (see the sketch after this test).
    """

    class DummyEstimator(BaseEstimator):
        def fit(self, X, y):
            pass

        def predict(self, X):
            return np.zeros(X.shape[0])

    boost = AdaBoostRegressor(DummyEstimator(), n_estimators=3)
    boost.fit(X, y_regr)
    assert len(boost.estimator_weights_) == len(boost.estimator_errors_)
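

# For context: a minimal sketch of how a booster can emulate sample weights
# through weighted resampling when the base estimator's `fit` does not accept
# them, as described in the docstring above. Illustrative only; the function
# name and signature are assumptions, not the library's implementation.
def _fit_with_resampling(estimator, X, y, sample_weight, rng):
    # Draw a bootstrap sample where each row's selection probability is
    # proportional to its boosting weight, then fit on the unweighted sample.
    n_samples = X.shape[0]
    idx = rng.choice(n_samples, size=n_samples, p=sample_weight / sample_weight.sum())
    return estimator.fit(X[idx], y[idx])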


def test_multidimensional_X():
    """
    Check that the AdaBoost estimators can work with an n-dimensional
    data matrix.
    """
    rng = np.random.RandomState(0)

    X = rng.randn(51, 3, 3)
    yc = rng.choice([0, 1], 51)
    yr = rng.randn(51)

    boost = AdaBoostClassifier(DummyClassifier(strategy="most_frequent"))
    boost.fit(X, yc)
    boost.predict(X)
    boost.predict_proba(X)

    boost = AdaBoostRegressor(DummyRegressor())
    boost.fit(X, yr)
    boost.predict(X)


@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboostclassifier_without_sample_weight(algorithm):
    X, y = iris.data, iris.target
    estimator = NoSampleWeightWrapper(DummyClassifier())
    clf = AdaBoostClassifier(estimator=estimator, algorithm=algorithm)
    err_msg = "{} doesn't support sample_weight".format(estimator.__class__.__name__)
    with pytest.raises(ValueError, match=err_msg):
        clf.fit(X, y)


def test_adaboostregressor_sample_weight():
    # Check that passing sample weights influences the error computed for a
    # weak learner.
    rng = np.random.RandomState(42)
    X = np.linspace(0, 100, num=1000)
    y = (0.8 * X + 0.2) + (rng.rand(X.shape[0]) * 0.0001)
    X = X.reshape(-1, 1)

    # add an arbitrary outlier
    X[-1] *= 10
    y[-1] = 10000

    # random_state=0 ensures that the underlying bootstrap will use the outlier
    regr_no_outlier = AdaBoostRegressor(
        estimator=LinearRegression(), n_estimators=1, random_state=0
    )
    regr_with_weight = clone(regr_no_outlier)
    regr_with_outlier = clone(regr_no_outlier)

    # fit 3 models:
    # - a model containing the outlier
    # - a model without the outlier
    # - a model containing the outlier but with a null sample-weight
    regr_with_outlier.fit(X, y)
    regr_no_outlier.fit(X[:-1], y[:-1])
    sample_weight = np.ones_like(y)
    sample_weight[-1] = 0
    regr_with_weight.fit(X, y, sample_weight=sample_weight)

    score_with_outlier = regr_with_outlier.score(X[:-1], y[:-1])
    score_no_outlier = regr_no_outlier.score(X[:-1], y[:-1])
    score_with_weight = regr_with_weight.score(X[:-1], y[:-1])

    assert score_with_outlier < score_no_outlier
    assert score_with_outlier < score_with_weight
    assert score_no_outlier == pytest.approx(score_with_weight)


@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboost_consistent_predict(algorithm):
    # check that predict_proba and predict give consistent results
    # regression test for:
    # https://github.com/scikit-learn/scikit-learn/issues/14084
    X_train, X_test, y_train, y_test = train_test_split(
        *datasets.load_digits(return_X_y=True), random_state=42
    )
    model = AdaBoostClassifier(algorithm=algorithm, random_state=42)
    model.fit(X_train, y_train)

    assert_array_equal(
        np.argmax(model.predict_proba(X_test), axis=1), model.predict(X_test)
    )


@pytest.mark.parametrize(
    "model, X, y",
    [
        (AdaBoostClassifier(), iris.data, iris.target),
        (AdaBoostRegressor(), diabetes.data, diabetes.target),
    ],
)
def test_adaboost_negative_weight_error(model, X, y):
    sample_weight = np.ones_like(y)
    sample_weight[-1] = -10

    err_msg = "Negative values in data passed to `sample_weight`"
    with pytest.raises(ValueError, match=err_msg):
        model.fit(X, y, sample_weight=sample_weight)


def test_adaboost_numerically_stable_feature_importance_with_small_weights():
    """Check that we don't create NaN feature importances with numerically
    unstable inputs.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20320
    """
    rng = np.random.RandomState(42)
    X = rng.normal(size=(1000, 10))
    y = rng.choice([0, 1], size=1000)
    sample_weight = np.ones_like(y) * 1e-263
    tree = DecisionTreeClassifier(max_depth=10, random_state=12)
    ada_model = AdaBoostClassifier(estimator=tree, n_estimators=20, random_state=12)
    ada_model.fit(X, y, sample_weight=sample_weight)
    assert np.isnan(ada_model.feature_importances_).sum() == 0
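

# For context: the ensemble's feature importances are, in essence, the fitted
# trees' importances averaged with the boosting weights, so tiny sample
# weights make the normalization numerically delicate. A minimal sketch of
# that aggregation (illustrative; not the library's exact code):
def _ensemble_importances_sketch(estimators, estimator_weights):
    norm = estimator_weights.sum()
    weighted = sum(
        w * est.feature_importances_
        for w, est in zip(estimator_weights, estimators)
    )
    # NaNs would appear here if `norm` underflowed to zero.
    return weighted / norm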


# TODO(1.4): remove in 1.4
@pytest.mark.parametrize(
    "AdaBoost, Estimator",
    [
        (AdaBoostClassifier, DecisionTreeClassifier),
        (AdaBoostRegressor, DecisionTreeRegressor),
    ],
)
def test_base_estimator_argument_deprecated(AdaBoost, Estimator):
    X = np.array([[1, 2], [3, 4]])
    y = np.array([1, 0])
    model = AdaBoost(base_estimator=Estimator())

    warn_msg = (
        "`base_estimator` was renamed to `estimator` in version 1.2 and "
        "will be removed in 1.4."
    )
    with pytest.warns(FutureWarning, match=warn_msg):
        model.fit(X, y)


# TODO(1.4): remove in 1.4
@pytest.mark.parametrize(
    "AdaBoost",
    [
        AdaBoostClassifier,
        AdaBoostRegressor,
    ],
)
def test_base_estimator_argument_deprecated_none(AdaBoost):
    X = np.array([[1, 2], [3, 4]])
    y = np.array([1, 0])
    model = AdaBoost(base_estimator=None)

    warn_msg = (
        "`base_estimator` was renamed to `estimator` in version 1.2 and "
        "will be removed in 1.4."
    )
    with pytest.warns(FutureWarning, match=warn_msg):
        model.fit(X, y)


# TODO(1.4): remove in 1.4
@pytest.mark.parametrize(
    "AdaBoost",
    [AdaBoostClassifier, AdaBoostRegressor],
)
def test_base_estimator_property_deprecated(AdaBoost):
    X = np.array([[1, 2], [3, 4]])
    y = np.array([1, 0])
    model = AdaBoost()
    model.fit(X, y)

    warn_msg = (
        "Attribute `base_estimator_` was deprecated in version 1.2 and "
        "will be removed in 1.4. Use `estimator_` instead."
    )
    with pytest.warns(FutureWarning, match=warn_msg):
        model.base_estimator_


# TODO(1.4): remove in 1.4
def test_deprecated_base_estimator_parameters_can_be_set():
    """Check that setting base_estimator parameters works.

    During the deprecation cycle setting "base_estimator__*" params should
    work.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/25470
    """
    # This implicitly sets "estimator", it is how old code (pre v1.2) would
    # have instantiated AdaBoostClassifier and back then it would set
    # "base_estimator".
    clf = AdaBoostClassifier(DecisionTreeClassifier())

    with pytest.warns(FutureWarning, match="Parameter 'base_estimator' of"):
        clf.set_params(base_estimator__max_depth=2)


@pytest.mark.parametrize("algorithm", ["SAMME", "SAMME.R"])
def test_adaboost_decision_function(algorithm, global_random_seed):
    """Check that the decision function respects the symmetric constraint for
    weak learners.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/26520
    """
    n_classes = 3
    X, y = datasets.make_classification(
        n_classes=n_classes, n_clusters_per_class=1, random_state=global_random_seed
    )
    clf = AdaBoostClassifier(
        n_estimators=1, random_state=global_random_seed, algorithm=algorithm
    ).fit(X, y)

    y_score = clf.decision_function(X)
    assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)

    if algorithm == "SAMME":
        # With a single learner, we expect the decision function values to be
        # in {1, -1 / (n_classes - 1)}.
        assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)}

    # We can assert the same for staged_decision_function since we have a
    # single learner
    for y_score in clf.staged_decision_function(X):
        assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)

        if algorithm == "SAMME":
            # With a single learner, we expect the decision function values to
            # be in {1, -1 / (n_classes - 1)}.
            assert set(np.unique(y_score)) == {1, -1 / (n_classes - 1)}

    clf.set_params(n_estimators=5).fit(X, y)

    y_score = clf.decision_function(X)
    assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)

    for y_score in clf.staged_decision_function(X):
        assert_allclose(y_score.sum(axis=1), 0, atol=1e-8)
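

# For context: the zero-sum property tested above follows from the symmetric
# class coding used by SAMME, where the true class is coded as 1 and every
# other class as -1 / (K - 1), so each row sums to 1 - (K - 1) / (K - 1) = 0.
# A minimal sketch of that coding (illustrative; names are assumptions):
def _symmetric_class_coding(y, classes):
    n_classes = len(classes)
    # One row per sample: 1 in the true-class column, -1/(K-1) elsewhere.
    coding = np.full((len(y), n_classes), -1.0 / (n_classes - 1))
    coding[np.arange(len(y)), np.searchsorted(classes, y)] = 1.0
    return coding  # each row sums to zero by construction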