  1. """
  2. Testing for the gradient boosting module (sklearn.ensemble.gradient_boosting).
  3. """
  4. import re
  5. import warnings
  6. import numpy as np
  7. import pytest
  8. from numpy.testing import assert_allclose
  9. from scipy.sparse import coo_matrix, csc_matrix, csr_matrix
  10. from scipy.special import expit
  11. from sklearn import datasets
  12. from sklearn.base import clone
  13. from sklearn.datasets import make_classification, make_regression
  14. from sklearn.dummy import DummyClassifier, DummyRegressor
  15. from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
  16. from sklearn.ensemble._gradient_boosting import predict_stages
  17. from sklearn.exceptions import DataConversionWarning, NotFittedError
  18. from sklearn.linear_model import LinearRegression
  19. from sklearn.metrics import mean_squared_error
  20. from sklearn.model_selection import train_test_split
  21. from sklearn.pipeline import make_pipeline
  22. from sklearn.preprocessing import scale
  23. from sklearn.svm import NuSVR
  24. from sklearn.utils import check_random_state, tosequence
  25. from sklearn.utils._mocking import NoSampleWeightWrapper
  26. from sklearn.utils._param_validation import InvalidParameterError
  27. from sklearn.utils._testing import (
  28. assert_array_almost_equal,
  29. assert_array_equal,
  30. skip_if_32bit,
  31. )
  32. GRADIENT_BOOSTING_ESTIMATORS = [GradientBoostingClassifier, GradientBoostingRegressor]
  33. # toy sample
  34. X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
  35. y = [-1, -1, -1, 1, 1, 1]
  36. T = [[-1, -1], [2, 2], [3, 2]]
  37. true_result = [-1, 1, 1]
  38. # also make regression dataset
  39. X_reg, y_reg = make_regression(
  40. n_samples=100, n_features=4, n_informative=8, noise=10, random_state=7
  41. )
  42. y_reg = scale(y_reg)
  43. rng = np.random.RandomState(0)
  44. # also load the iris dataset
  45. # and randomly permute it
  46. iris = datasets.load_iris()
  47. perm = rng.permutation(iris.target.size)
  48. iris.data = iris.data[perm]
  49. iris.target = iris.target[perm]
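
# Editor's note on the fixtures above: the toy problem is linearly separable
# (negative class in the lower-left quadrant, positive class in the upper-right),
# so even a handful of shallow trees should classify T perfectly, and iris is
# permuted so that its class labels are not grouped in contiguous blocks.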


@pytest.mark.parametrize("loss", ("log_loss", "exponential"))
def test_classification_toy(loss, global_random_seed):
    # Check classification on a toy dataset.
    clf = GradientBoostingClassifier(
        loss=loss, n_estimators=10, random_state=global_random_seed
    )

    with pytest.raises(ValueError):
        clf.predict(T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)
    assert 10 == len(clf.estimators_)

    log_loss_decrease = clf.train_score_[:-1] - clf.train_score_[1:]
    assert np.any(log_loss_decrease >= 0.0)

    leaves = clf.apply(X)
    assert leaves.shape == (6, 10, 1)


@pytest.mark.parametrize("loss", ("log_loss", "exponential"))
def test_classification_synthetic(loss, global_random_seed):
    # Test GradientBoostingClassifier on the synthetic dataset used by
    # Hastie et al. in ESLII - Figure 10.9
    # Note that Figure 10.9 reuses the dataset generated for figure 10.2
    # and should have 2_000 train data points and 10_000 test data points.
    # Here we intentionally use a smaller variant to make the test run faster,
    # but the conclusions are still the same, despite the smaller datasets.
    X, y = datasets.make_hastie_10_2(n_samples=2000, random_state=global_random_seed)

    split_idx = 500
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]

    # Increasing the number of trees should decrease the test error
    common_params = {
        "max_depth": 1,
        "learning_rate": 1.0,
        "loss": loss,
        "random_state": global_random_seed,
    }
    gbrt_10_stumps = GradientBoostingClassifier(n_estimators=10, **common_params)
    gbrt_10_stumps.fit(X_train, y_train)

    gbrt_50_stumps = GradientBoostingClassifier(n_estimators=50, **common_params)
    gbrt_50_stumps.fit(X_train, y_train)

    assert gbrt_10_stumps.score(X_test, y_test) < gbrt_50_stumps.score(X_test, y_test)

    # Decision stumps are better suited for this dataset with a large number of
    # estimators.
    common_params = {
        "n_estimators": 200,
        "learning_rate": 1.0,
        "loss": loss,
        "random_state": global_random_seed,
    }
    gbrt_stumps = GradientBoostingClassifier(max_depth=1, **common_params)
    gbrt_stumps.fit(X_train, y_train)

    gbrt_10_nodes = GradientBoostingClassifier(max_leaf_nodes=10, **common_params)
    gbrt_10_nodes.fit(X_train, y_train)

    assert gbrt_stumps.score(X_test, y_test) > gbrt_10_nodes.score(X_test, y_test)


@pytest.mark.parametrize("loss", ("squared_error", "absolute_error", "huber"))
@pytest.mark.parametrize("subsample", (1.0, 0.5))
def test_regression_dataset(loss, subsample, global_random_seed):
    # Check consistency on the regression dataset with least squares
    # and least absolute deviation.
    ones = np.ones(len(y_reg))
    last_y_pred = None
    for sample_weight in [None, ones, 2 * ones]:
        # learning_rate, max_depth and n_estimators were adjusted to get a model
        # that is accurate enough to reach a low MSE on the training set while
        # keeping the resources used to execute this test low enough.
        reg = GradientBoostingRegressor(
            n_estimators=30,
            loss=loss,
            max_depth=4,
            subsample=subsample,
            min_samples_split=2,
            random_state=global_random_seed,
            learning_rate=0.5,
        )
        reg.fit(X_reg, y_reg, sample_weight=sample_weight)
        leaves = reg.apply(X_reg)
        assert leaves.shape == (100, 30)

        y_pred = reg.predict(X_reg)
        mse = mean_squared_error(y_reg, y_pred)
        assert mse < 0.05

        if last_y_pred is not None:
            # FIXME: We temporarily bypass this test. This is due to the fact
            # that GBRT with and without `sample_weight` do not use the same
            # implementation of the median during the initialization with the
            # `DummyRegressor`. In the future, we should make sure that both
            # implementations are the same. See PR #17377 for more.
            # assert_allclose(last_y_pred, y_pred)
            pass

        last_y_pred = y_pred


@pytest.mark.parametrize("subsample", (1.0, 0.5))
@pytest.mark.parametrize("sample_weight", (None, 1))
def test_iris(subsample, sample_weight, global_random_seed):
    if sample_weight == 1:
        sample_weight = np.ones(len(iris.target))
    # Check consistency on the iris dataset.
    clf = GradientBoostingClassifier(
        n_estimators=100,
        loss="log_loss",
        random_state=global_random_seed,
        subsample=subsample,
    )
    clf.fit(iris.data, iris.target, sample_weight=sample_weight)
    score = clf.score(iris.data, iris.target)
    assert score > 0.9

    leaves = clf.apply(iris.data)
    assert leaves.shape == (150, 100, 3)


def test_regression_synthetic(global_random_seed):
    # Test on the synthetic regression datasets used in Leo Breiman,
    # "Bagging Predictors", Machine Learning 24(2): 123-140 (1996).
    random_state = check_random_state(global_random_seed)
    regression_params = {
        "n_estimators": 100,
        "max_depth": 4,
        "min_samples_split": 2,
        "learning_rate": 0.1,
        "loss": "squared_error",
        "random_state": global_random_seed,
    }

    # Friedman1
    X, y = datasets.make_friedman1(
        n_samples=1200, random_state=random_state, noise=1.0
    )
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 6.5

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 2500.0

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 0.025


@pytest.mark.parametrize(
    "GradientBoosting, X, y",
    [
        (GradientBoostingRegressor, X_reg, y_reg),
        (GradientBoostingClassifier, iris.data, iris.target),
    ],
)
def test_feature_importances(GradientBoosting, X, y):
    # smoke test to check that the gradient boosting estimators expose a
    # feature_importances_ attribute
    gbdt = GradientBoosting()
    assert not hasattr(gbdt, "feature_importances_")
    gbdt.fit(X, y)
    assert hasattr(gbdt, "feature_importances_")


def test_probability_log(global_random_seed):
    # Predict probabilities.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=global_random_seed)

    with pytest.raises(ValueError):
        clf.predict_proba(T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # check if probabilities are in [0, 1].
    y_proba = clf.predict_proba(T)
    assert np.all(y_proba >= 0.0)
    assert np.all(y_proba <= 1.0)

    # derive predictions from probabilities
    y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0)
    assert_array_equal(y_pred, true_result)


def test_single_class_with_sample_weight():
    sample_weight = [0, 0, 0, 1, 1, 1]
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    msg = (
        "y contains 1 class after sample_weight trimmed classes with "
        "zero weights, while a minimum of 2 classes are required."
    )
    with pytest.raises(ValueError, match=msg):
        clf.fit(X, y, sample_weight=sample_weight)


def test_check_inputs_predict_stages():
    # check that predict_stages raises an error if the type of X is not
    # supported
    x, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    x_sparse_csc = csc_matrix(x)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(x, y)
    score = np.zeros(y.shape).reshape(-1, 1)
    err_msg = "When X is a sparse matrix, a CSR format is expected"
    with pytest.raises(ValueError, match=err_msg):
        predict_stages(clf.estimators_, x_sparse_csc, clf.learning_rate, score)
    x_fortran = np.asfortranarray(x)
    with pytest.raises(ValueError, match="X should be C-ordered np.ndarray"):
        predict_stages(clf.estimators_, x_fortran, clf.learning_rate, score)


def test_max_feature_regression(global_random_seed):
    # Test to make sure random state is set properly.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=global_random_seed)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    gbrt = GradientBoostingClassifier(
        n_estimators=100,
        min_samples_split=5,
        max_depth=2,
        learning_rate=0.1,
        max_features=2,
        random_state=global_random_seed,
    )
    gbrt.fit(X_train, y_train)
    log_loss = gbrt._loss(y_test, gbrt.decision_function(X_test))
    assert log_loss < 0.5, "GB failed with deviance %.4f" % log_loss


def test_feature_importance_regression(
    fetch_california_housing_fxt, global_random_seed
):
    """Test that Gini importance is calculated correctly.

    This test follows the example from [1]_ (pg. 373).

    .. [1] Friedman, J., Hastie, T., & Tibshirani, R. (2001). The elements
       of statistical learning. New York: Springer series in statistics.
    """
    california = fetch_california_housing_fxt()
    X, y = california.data, california.target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=global_random_seed
    )

    reg = GradientBoostingRegressor(
        loss="huber",
        learning_rate=0.1,
        max_leaf_nodes=6,
        n_estimators=100,
        random_state=global_random_seed,
    )
    reg.fit(X_train, y_train)
    sorted_idx = np.argsort(reg.feature_importances_)[::-1]
    sorted_features = [california.feature_names[s] for s in sorted_idx]

    # The most important feature is the median income by far.
    assert sorted_features[0] == "MedInc"

    # The three subsequent features are the following. Their relative ordering
    # might change a bit depending on the randomness of the trees and the
    # train / test split.
    assert set(sorted_features[1:4]) == {"Longitude", "AveOccup", "Latitude"}


def test_max_features():
    # Test if max features is set properly for floats and str.
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    _, n_features = X.shape

    X_train = X[:2000]
    y_train = y[:2000]

    gbrt = GradientBoostingClassifier(n_estimators=1, max_features=None)
    gbrt.fit(X_train, y_train)
    assert gbrt.max_features_ == n_features

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features=None)
    gbrt.fit(X_train, y_train)
    assert gbrt.max_features_ == n_features

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features=0.3)
    gbrt.fit(X_train, y_train)
    assert gbrt.max_features_ == int(n_features * 0.3)

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features="sqrt")
    gbrt.fit(X_train, y_train)
    assert gbrt.max_features_ == int(np.sqrt(n_features))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features="log2")
    gbrt.fit(X_train, y_train)
    assert gbrt.max_features_ == int(np.log2(n_features))

    gbrt = GradientBoostingRegressor(n_estimators=1, max_features=0.01 / X.shape[1])
    gbrt.fit(X_train, y_train)
    assert gbrt.max_features_ == 1


def test_staged_predict():
    # Test whether the staged predictions eventually give
    # the same prediction as ``predict``.
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test = X[200:]
    clf = GradientBoostingRegressor()

    # test raise ValueError if not fitted
    with pytest.raises(ValueError):
        np.fromiter(clf.staged_predict(X_test), dtype=np.float64)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # test if prediction for last stage equals ``predict``
    for y in clf.staged_predict(X_test):
        assert y.shape == y_pred.shape

    assert_array_almost_equal(y_pred, y)


def test_staged_predict_proba():
    # Test whether staged predict proba eventually gives
    # the same prediction.
    X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingClassifier(n_estimators=20)

    # test raise NotFittedError if not fitted
    with pytest.raises(NotFittedError):
        np.fromiter(clf.staged_predict_proba(X_test), dtype=np.float64)

    clf.fit(X_train, y_train)

    # test if prediction for last stage equals ``predict``
    for y_pred in clf.staged_predict(X_test):
        assert y_test.shape == y_pred.shape

    assert_array_equal(clf.predict(X_test), y_pred)

    # test if prediction for last stage equals ``predict_proba``
    for staged_proba in clf.staged_predict_proba(X_test):
        assert y_test.shape[0] == staged_proba.shape[0]
        assert 2 == staged_proba.shape[1]

    assert_array_almost_equal(clf.predict_proba(X_test), staged_proba)
  346. @pytest.mark.parametrize("Estimator", GRADIENT_BOOSTING_ESTIMATORS)
  347. def test_staged_functions_defensive(Estimator, global_random_seed):
  348. # test that staged_functions make defensive copies
  349. rng = np.random.RandomState(global_random_seed)
  350. X = rng.uniform(size=(10, 3))
  351. y = (4 * X[:, 0]).astype(int) + 1 # don't predict zeros
  352. estimator = Estimator()
  353. estimator.fit(X, y)
  354. for func in ["predict", "decision_function", "predict_proba"]:
  355. staged_func = getattr(estimator, "staged_" + func, None)
  356. if staged_func is None:
  357. # regressor has no staged_predict_proba
  358. continue
  359. with warnings.catch_warnings(record=True):
  360. staged_result = list(staged_func(X))
  361. staged_result[1][:] = 0
  362. assert np.all(staged_result[0] != 0)
  363. def test_serialization():
  364. # Check model serialization.
  365. clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
  366. clf.fit(X, y)
  367. assert_array_equal(clf.predict(T), true_result)
  368. assert 100 == len(clf.estimators_)
  369. try:
  370. import cPickle as pickle
  371. except ImportError:
  372. import pickle
  373. serialized_clf = pickle.dumps(clf, protocol=pickle.HIGHEST_PROTOCOL)
  374. clf = None
  375. clf = pickle.loads(serialized_clf)
  376. assert_array_equal(clf.predict(T), true_result)
  377. assert 100 == len(clf.estimators_)
  378. def test_degenerate_targets():
  379. # Check if we can fit even though all targets are equal.
  380. clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
  381. # classifier should raise exception
  382. with pytest.raises(ValueError):
  383. clf.fit(X, np.ones(len(X)))
  384. clf = GradientBoostingRegressor(n_estimators=100, random_state=1)
  385. clf.fit(X, np.ones(len(X)))
  386. clf.predict([rng.rand(2)])
  387. assert_array_equal(np.ones((1,), dtype=np.float64), clf.predict([rng.rand(2)]))
  388. def test_quantile_loss(global_random_seed):
  389. # Check if quantile loss with alpha=0.5 equals absolute_error.
  390. clf_quantile = GradientBoostingRegressor(
  391. n_estimators=100,
  392. loss="quantile",
  393. max_depth=4,
  394. alpha=0.5,
  395. random_state=global_random_seed,
  396. )
  397. clf_quantile.fit(X_reg, y_reg)
  398. y_quantile = clf_quantile.predict(X_reg)
  399. clf_ae = GradientBoostingRegressor(
  400. n_estimators=100,
  401. loss="absolute_error",
  402. max_depth=4,
  403. random_state=global_random_seed,
  404. )
  405. clf_ae.fit(X_reg, y_reg)
  406. y_ae = clf_ae.predict(X_reg)
  407. assert_allclose(y_quantile, y_ae)
  408. def test_symbol_labels():
  409. # Test with non-integer class labels.
  410. clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
  411. symbol_y = tosequence(map(str, y))
  412. clf.fit(X, symbol_y)
  413. assert_array_equal(clf.predict(T), tosequence(map(str, true_result)))
  414. assert 100 == len(clf.estimators_)
  415. def test_float_class_labels():
  416. # Test with float class labels.
  417. clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
  418. float_y = np.asarray(y, dtype=np.float32)
  419. clf.fit(X, float_y)
  420. assert_array_equal(clf.predict(T), np.asarray(true_result, dtype=np.float32))
  421. assert 100 == len(clf.estimators_)


def test_shape_y():
    # Test with a column-vector y that needs to be converted to a 1d array.
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)

    y_ = np.asarray(y, dtype=np.int32)
    y_ = y_[:, np.newaxis]

    # This will raise a DataConversionWarning that we want to
    # "always" raise, otherwise the warning gets ignored in the
    # later tests, and the tests that check for this warning fail
    warn_msg = (
        "A column-vector y was passed when a 1d array was expected. "
        "Please change the shape of y to \\(n_samples, \\), for "
        "example using ravel()."
    )
    with pytest.warns(DataConversionWarning, match=warn_msg):
        clf.fit(X, y_)
    assert_array_equal(clf.predict(T), true_result)
    assert 100 == len(clf.estimators_)


def test_mem_layout():
    # Test with different memory layouts of X and y
    X_ = np.asfortranarray(X)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X_, y)
    assert_array_equal(clf.predict(T), true_result)
    assert 100 == len(clf.estimators_)

    X_ = np.ascontiguousarray(X)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X_, y)
    assert_array_equal(clf.predict(T), true_result)
    assert 100 == len(clf.estimators_)

    y_ = np.asarray(y, dtype=np.int32)
    y_ = np.ascontiguousarray(y_)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X, y_)
    assert_array_equal(clf.predict(T), true_result)
    assert 100 == len(clf.estimators_)

    y_ = np.asarray(y, dtype=np.int32)
    y_ = np.asfortranarray(y_)
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1)
    clf.fit(X, y_)
    assert_array_equal(clf.predict(T), true_result)
    assert 100 == len(clf.estimators_)


@pytest.mark.parametrize("GradientBoostingEstimator", GRADIENT_BOOSTING_ESTIMATORS)
def test_oob_improvement(GradientBoostingEstimator):
    # Test if oob improvement has correct shape and regression test.
    estimator = GradientBoostingEstimator(
        n_estimators=100, random_state=1, subsample=0.5
    )
    estimator.fit(X, y)
    assert estimator.oob_improvement_.shape[0] == 100
    # hard-coded regression test - change if modification in OOB computation
    assert_array_almost_equal(
        estimator.oob_improvement_[:5],
        np.array([0.19, 0.15, 0.12, -0.11, 0.11]),
        decimal=2,
    )


@pytest.mark.parametrize("GradientBoostingEstimator", GRADIENT_BOOSTING_ESTIMATORS)
def test_oob_scores(GradientBoostingEstimator):
    # Test if oob scores have correct shape and regression test.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    estimator = GradientBoostingEstimator(
        n_estimators=100, random_state=1, subsample=0.5
    )
    estimator.fit(X, y)
    assert estimator.oob_scores_.shape[0] == 100
    assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_)

    estimator = GradientBoostingEstimator(
        n_estimators=100,
        random_state=1,
        subsample=0.5,
        n_iter_no_change=5,
    )
    estimator.fit(X, y)
    assert estimator.oob_scores_.shape[0] < 100
    assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_)


@pytest.mark.parametrize(
    "GradientBoostingEstimator, oob_attribute",
    [
        (GradientBoostingClassifier, "oob_improvement_"),
        (GradientBoostingClassifier, "oob_scores_"),
        (GradientBoostingClassifier, "oob_score_"),
        (GradientBoostingRegressor, "oob_improvement_"),
        (GradientBoostingRegressor, "oob_scores_"),
        (GradientBoostingRegressor, "oob_score_"),
    ],
)
def test_oob_attributes_error(GradientBoostingEstimator, oob_attribute):
    """
    Check that we raise an AttributeError when the OOB statistics were not computed.
    """
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    estimator = GradientBoostingEstimator(
        n_estimators=100,
        random_state=1,
        subsample=1.0,
    )
    estimator.fit(X, y)
    with pytest.raises(AttributeError):
        getattr(estimator, oob_attribute)


def test_oob_multiclass_iris():
    # Check OOB improvement on a multi-class dataset.
    estimator = GradientBoostingClassifier(
        n_estimators=100, loss="log_loss", random_state=1, subsample=0.5
    )
    estimator.fit(iris.data, iris.target)
    score = estimator.score(iris.data, iris.target)
    assert score > 0.9
    assert estimator.oob_improvement_.shape[0] == estimator.n_estimators
    assert estimator.oob_scores_.shape[0] == estimator.n_estimators
    assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_)

    estimator = GradientBoostingClassifier(
        n_estimators=100,
        loss="log_loss",
        random_state=1,
        subsample=0.5,
        n_iter_no_change=5,
    )
    estimator.fit(iris.data, iris.target)
    score = estimator.score(iris.data, iris.target)
    assert estimator.oob_improvement_.shape[0] < estimator.n_estimators
    assert estimator.oob_scores_.shape[0] < estimator.n_estimators
    assert estimator.oob_scores_[-1] == pytest.approx(estimator.oob_score_)

    # hard-coded regression test - change if modification in OOB computation
    # FIXME: the following snippet does not yield the same results on 32 bits
    # assert_array_almost_equal(estimator.oob_improvement_[:5],
    #                           np.array([12.68, 10.45, 8.18, 6.43, 5.13]),
    #                           decimal=2)


def test_verbose_output():
    # Check verbose=1 does not cause error.
    import sys
    from io import StringIO

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    clf = GradientBoostingClassifier(
        n_estimators=100, random_state=1, verbose=1, subsample=0.8
    )
    clf.fit(X, y)
    verbose_output = sys.stdout
    sys.stdout = old_stdout

    # check output
    verbose_output.seek(0)
    header = verbose_output.readline().rstrip()
    # with OOB
    true_header = " ".join(["%10s"] + ["%16s"] * 3) % (
        "Iter",
        "Train Loss",
        "OOB Improve",
        "Remaining Time",
    )
    assert true_header == header

    n_lines = sum(1 for l in verbose_output.readlines())
    # one for 1-10 and then 9 for 20-100
    assert 10 + 9 == n_lines


def test_more_verbose_output():
    # Check verbose=2 does not cause error.
    import sys
    from io import StringIO

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1, verbose=2)
    clf.fit(X, y)
    verbose_output = sys.stdout
    sys.stdout = old_stdout

    # check output
    verbose_output.seek(0)
    header = verbose_output.readline().rstrip()
    # no OOB
    true_header = " ".join(["%10s"] + ["%16s"] * 2) % (
        "Iter",
        "Train Loss",
        "Remaining Time",
    )
    assert true_header == header

    n_lines = sum(1 for l in verbose_output.readlines())
    # 100 lines for n_estimators==100
    assert 100 == n_lines


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start(Cls, global_random_seed):
    # Test if warm start equals fit.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=global_random_seed)
    est = Cls(n_estimators=200, max_depth=1, random_state=global_random_seed)
    est.fit(X, y)

    est_ws = Cls(
        n_estimators=100, max_depth=1, warm_start=True, random_state=global_random_seed
    )
    est_ws.fit(X, y)
    est_ws.set_params(n_estimators=200)
    est_ws.fit(X, y)

    if Cls is GradientBoostingRegressor:
        assert_allclose(est_ws.predict(X), est.predict(X))
    else:
        # Random state is preserved and hence predict_proba must also be
        # the same
        assert_array_equal(est_ws.predict(X), est.predict(X))
        assert_allclose(est_ws.predict_proba(X), est.predict_proba(X))


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_n_estimators(Cls, global_random_seed):
    # Test if warm start equals fit - set n_estimators.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=global_random_seed)
    est = Cls(n_estimators=300, max_depth=1, random_state=global_random_seed)
    est.fit(X, y)

    est_ws = Cls(
        n_estimators=100, max_depth=1, warm_start=True, random_state=global_random_seed
    )
    est_ws.fit(X, y)
    est_ws.set_params(n_estimators=300)
    est_ws.fit(X, y)

    assert_allclose(est_ws.predict(X), est.predict(X))


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_max_depth(Cls):
    # Test if possible to fit trees of different depth in ensemble.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est.fit(X, y)
    est.set_params(n_estimators=110, max_depth=2)
    est.fit(X, y)

    # last 10 trees have different depth
    assert est.estimators_[0, 0].max_depth == 1
    for i in range(1, 11):
        assert est.estimators_[-i, 0].max_depth == 2


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_clear(Cls):
    # Test if fit clears state.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1)
    est.fit(X, y)

    est_2 = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est_2.fit(X, y)  # inits state
    est_2.set_params(warm_start=False)
    est_2.fit(X, y)  # clears old state and equals est

    assert_array_almost_equal(est_2.predict(X), est.predict(X))


@pytest.mark.parametrize("GradientBoosting", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_state_oob_scores(GradientBoosting):
    """
    Check that the states of the OOB scores are cleared when used with `warm_start`.
    """
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    n_estimators = 100
    estimator = GradientBoosting(
        n_estimators=n_estimators,
        max_depth=1,
        subsample=0.5,
        warm_start=True,
        random_state=1,
    )
    estimator.fit(X, y)
    oob_scores, oob_score = estimator.oob_scores_, estimator.oob_score_
    assert len(oob_scores) == n_estimators
    assert oob_scores[-1] == pytest.approx(oob_score)

    n_more_estimators = 200
    estimator.set_params(n_estimators=n_more_estimators).fit(X, y)
    assert len(estimator.oob_scores_) == n_more_estimators
    assert_allclose(estimator.oob_scores_[:n_estimators], oob_scores)

    estimator.set_params(n_estimators=n_estimators, warm_start=False).fit(X, y)
    assert estimator.oob_scores_ is not oob_scores
    assert estimator.oob_score_ is not oob_score
    assert_allclose(estimator.oob_scores_, oob_scores)
    assert estimator.oob_score_ == pytest.approx(oob_score)
    assert oob_scores[-1] == pytest.approx(oob_score)


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_smaller_n_estimators(Cls):
    # Test if warm start with smaller n_estimators raises an error
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est.fit(X, y)
    est.set_params(n_estimators=99)
    with pytest.raises(ValueError):
        est.fit(X, y)


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_equal_n_estimators(Cls):
    # Test if warm start with equal n_estimators does nothing
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1)
    est.fit(X, y)

    est2 = clone(est)
    est2.set_params(n_estimators=est.n_estimators, warm_start=True)
    est2.fit(X, y)

    assert_array_almost_equal(est2.predict(X), est.predict(X))


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_oob_switch(Cls):
    # Test if oob can be turned on during warm start.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=100, max_depth=1, warm_start=True)
    est.fit(X, y)
    est.set_params(n_estimators=110, subsample=0.5)
    est.fit(X, y)

    assert_array_equal(est.oob_improvement_[:100], np.zeros(100))
    assert_array_equal(est.oob_scores_[:100], np.zeros(100))

    # the last 10 are not zeros
    assert (est.oob_improvement_[-10:] != 0.0).all()
    assert (est.oob_scores_[-10:] != 0.0).all()

    assert est.oob_scores_[-1] == pytest.approx(est.oob_score_)


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_oob(Cls):
    # Test if warm start OOB equals fit.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    est = Cls(n_estimators=200, max_depth=1, subsample=0.5, random_state=1)
    est.fit(X, y)

    est_ws = Cls(
        n_estimators=100, max_depth=1, subsample=0.5, random_state=1, warm_start=True
    )
    est_ws.fit(X, y)
    est_ws.set_params(n_estimators=200)
    est_ws.fit(X, y)

    assert_array_almost_equal(est_ws.oob_improvement_[:100], est.oob_improvement_[:100])
    assert_array_almost_equal(est_ws.oob_scores_[:100], est.oob_scores_[:100])
    assert est.oob_scores_[-1] == pytest.approx(est.oob_score_)
    assert est_ws.oob_scores_[-1] == pytest.approx(est_ws.oob_score_)


@pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
def test_warm_start_sparse(Cls):
    # Test that all sparse matrix types are supported
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    sparse_matrix_type = [csr_matrix, csc_matrix, coo_matrix]
    est_dense = Cls(
        n_estimators=100, max_depth=1, subsample=0.5, random_state=1, warm_start=True
    )
    est_dense.fit(X, y)
    est_dense.predict(X)
    est_dense.set_params(n_estimators=200)
    est_dense.fit(X, y)
    y_pred_dense = est_dense.predict(X)

    for sparse_constructor in sparse_matrix_type:
        X_sparse = sparse_constructor(X)

        est_sparse = Cls(
            n_estimators=100,
            max_depth=1,
            subsample=0.5,
            random_state=1,
            warm_start=True,
        )
        est_sparse.fit(X_sparse, y)
        est_sparse.predict(X)
        est_sparse.set_params(n_estimators=200)
        est_sparse.fit(X_sparse, y)
        y_pred_sparse = est_sparse.predict(X)

        assert_array_almost_equal(
            est_dense.oob_improvement_[:100], est_sparse.oob_improvement_[:100]
        )
        assert est_dense.oob_scores_[-1] == pytest.approx(est_dense.oob_score_)
        assert_array_almost_equal(
            est_dense.oob_scores_[:100], est_sparse.oob_scores_[:100]
        )
        assert est_sparse.oob_scores_[-1] == pytest.approx(est_sparse.oob_score_)
        assert_array_almost_equal(y_pred_dense, y_pred_sparse)
  765. @pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
  766. def test_warm_start_fortran(Cls, global_random_seed):
  767. # Test that feeding a X in Fortran-ordered is giving the same results as
  768. # in C-ordered
  769. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=global_random_seed)
  770. est_c = Cls(n_estimators=1, random_state=global_random_seed, warm_start=True)
  771. est_fortran = Cls(n_estimators=1, random_state=global_random_seed, warm_start=True)
  772. est_c.fit(X, y)
  773. est_c.set_params(n_estimators=11)
  774. est_c.fit(X, y)
  775. X_fortran = np.asfortranarray(X)
  776. est_fortran.fit(X_fortran, y)
  777. est_fortran.set_params(n_estimators=11)
  778. est_fortran.fit(X_fortran, y)
  779. assert_allclose(est_c.predict(X), est_fortran.predict(X))
  780. def early_stopping_monitor(i, est, locals):
  781. """Returns True on the 10th iteration."""
  782. if i == 9:
  783. return True
  784. else:
  785. return False
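
# `fit` invokes the monitor callback as ``monitor(i, self, locals())`` after each
# boosting iteration; a truthy return value stops training early, which the test
# below relies on to truncate the ensemble at 10 estimators.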
  786. @pytest.mark.parametrize("Cls", GRADIENT_BOOSTING_ESTIMATORS)
  787. def test_monitor_early_stopping(Cls):
  788. # Test if monitor return value works.
  789. X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
  790. est = Cls(n_estimators=20, max_depth=1, random_state=1, subsample=0.5)
  791. est.fit(X, y, monitor=early_stopping_monitor)
  792. assert est.n_estimators == 20 # this is not altered
  793. assert est.estimators_.shape[0] == 10
  794. assert est.train_score_.shape[0] == 10
  795. assert est.oob_improvement_.shape[0] == 10
  796. assert est.oob_scores_.shape[0] == 10
  797. assert est.oob_scores_[-1] == pytest.approx(est.oob_score_)
  798. # try refit
  799. est.set_params(n_estimators=30)
  800. est.fit(X, y)
  801. assert est.n_estimators == 30
  802. assert est.estimators_.shape[0] == 30
  803. assert est.train_score_.shape[0] == 30
  804. assert est.oob_improvement_.shape[0] == 30
  805. assert est.oob_scores_.shape[0] == 30
  806. assert est.oob_scores_[-1] == pytest.approx(est.oob_score_)
  807. est = Cls(
  808. n_estimators=20, max_depth=1, random_state=1, subsample=0.5, warm_start=True
  809. )
  810. est.fit(X, y, monitor=early_stopping_monitor)
  811. assert est.n_estimators == 20
  812. assert est.estimators_.shape[0] == 10
  813. assert est.train_score_.shape[0] == 10
  814. assert est.oob_improvement_.shape[0] == 10
  815. assert est.oob_scores_.shape[0] == 10
  816. assert est.oob_scores_[-1] == pytest.approx(est.oob_score_)
  817. # try refit
  818. est.set_params(n_estimators=30, warm_start=False)
  819. est.fit(X, y)
  820. assert est.n_estimators == 30
  821. assert est.train_score_.shape[0] == 30
  822. assert est.estimators_.shape[0] == 30
  823. assert est.oob_improvement_.shape[0] == 30
  824. assert est.oob_scores_.shape[0] == 30
  825. assert est.oob_scores_[-1] == pytest.approx(est.oob_score_)


def test_complete_classification():
    # Test greedy trees with max_depth + 1 leaves.
    from sklearn.tree._tree import TREE_LEAF

    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    k = 4

    est = GradientBoostingClassifier(
        n_estimators=20, max_depth=None, random_state=1, max_leaf_nodes=k + 1
    )
    est.fit(X, y)

    tree = est.estimators_[0, 0].tree_
    assert tree.max_depth == k
    assert tree.children_left[tree.children_left == TREE_LEAF].shape[0] == k + 1


def test_complete_regression():
    # Test greedy trees with max_depth + 1 leaves.
    from sklearn.tree._tree import TREE_LEAF

    k = 4

    est = GradientBoostingRegressor(
        n_estimators=20, max_depth=None, random_state=1, max_leaf_nodes=k + 1
    )
    est.fit(X_reg, y_reg)

    tree = est.estimators_[-1, 0].tree_
    assert tree.children_left[tree.children_left == TREE_LEAF].shape[0] == k + 1


def test_zero_estimator_reg(global_random_seed):
    # Test if init='zero' works for regression by checking that it is better
    # than a simple baseline.
    baseline = DummyRegressor(strategy="mean").fit(X_reg, y_reg)
    mse_baseline = mean_squared_error(baseline.predict(X_reg), y_reg)

    est = GradientBoostingRegressor(
        n_estimators=5,
        max_depth=1,
        random_state=global_random_seed,
        init="zero",
        learning_rate=0.5,
    )
    est.fit(X_reg, y_reg)
    y_pred = est.predict(X_reg)
    mse_gbdt = mean_squared_error(y_reg, y_pred)
    assert mse_gbdt < mse_baseline


def test_zero_estimator_clf(global_random_seed):
    # Test if init='zero' works for classification.
    X = iris.data
    y = np.array(iris.target)

    est = GradientBoostingClassifier(
        n_estimators=20, max_depth=1, random_state=global_random_seed, init="zero"
    )
    est.fit(X, y)

    assert est.score(X, y) > 0.96

    # binary clf
    mask = y != 0
    y[mask] = 1
    y[~mask] = 0

    est = GradientBoostingClassifier(
        n_estimators=20, max_depth=1, random_state=global_random_seed, init="zero"
    )
    est.fit(X, y)
    assert est.score(X, y) > 0.96


@pytest.mark.parametrize("GBEstimator", GRADIENT_BOOSTING_ESTIMATORS)
def test_max_leaf_nodes_max_depth(GBEstimator):
    # Test precedence of max_leaf_nodes over max_depth.
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    k = 4

    est = GBEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y)
    tree = est.estimators_[0, 0].tree_
    assert tree.max_depth == 1

    est = GBEstimator(max_depth=1).fit(X, y)
    tree = est.estimators_[0, 0].tree_
    assert tree.max_depth == 1


@pytest.mark.parametrize("GBEstimator", GRADIENT_BOOSTING_ESTIMATORS)
def test_min_impurity_decrease(GBEstimator):
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    est = GBEstimator(min_impurity_decrease=0.1)
    est.fit(X, y)
    for tree in est.estimators_.flat:
        # Simply check if the parameter is passed on correctly. Tree tests
        # will suffice for the actual working of this param
        assert tree.min_impurity_decrease == 0.1


def test_warm_start_wo_nestimators_change():
    # Test if warm_start does nothing if n_estimators is not changed.
    # Regression test for #3513.
    clf = GradientBoostingClassifier(n_estimators=10, warm_start=True)
    clf.fit([[0, 1], [2, 3]], [0, 1])
    assert clf.estimators_.shape[0] == 10
    clf.fit([[0, 1], [2, 3]], [0, 1])
    assert clf.estimators_.shape[0] == 10


def test_probability_exponential(global_random_seed):
    # Predict probabilities.
    clf = GradientBoostingClassifier(
        loss="exponential", n_estimators=100, random_state=global_random_seed
    )

    with pytest.raises(ValueError):
        clf.predict_proba(T)

    clf.fit(X, y)
    assert_array_equal(clf.predict(T), true_result)

    # check if probabilities are in [0, 1].
    y_proba = clf.predict_proba(T)
    assert np.all(y_proba >= 0.0)
    assert np.all(y_proba <= 1.0)
    score = clf.decision_function(T).ravel()
    assert_allclose(y_proba[:, 1], expit(2 * score))

    # derive predictions from probabilities
    y_pred = clf.classes_.take(y_proba.argmax(axis=1), axis=0)
    assert_array_equal(y_pred, true_result)


def test_non_uniform_weights_toy_edge_case_reg():
    X = [[1, 0], [1, 0], [1, 0], [0, 1]]
    y = [0, 0, 1, 0]
    # ignore the first 2 training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1]
    for loss in ("huber", "squared_error", "absolute_error", "quantile"):
        gb = GradientBoostingRegressor(learning_rate=1.0, n_estimators=2, loss=loss)
        gb.fit(X, y, sample_weight=sample_weight)
        assert gb.predict([[1, 0]])[0] > 0.5


def test_non_uniform_weights_toy_edge_case_clf():
    X = [[1, 0], [1, 0], [1, 0], [0, 1]]
    y = [0, 0, 1, 0]
    # ignore the first 2 training samples by setting their weight to 0
    sample_weight = [0, 0, 1, 1]
    for loss in ("log_loss", "exponential"):
        gb = GradientBoostingClassifier(n_estimators=5, loss=loss)
        gb.fit(X, y, sample_weight=sample_weight)
        assert_array_equal(gb.predict([[1, 0]]), [1])


@skip_if_32bit
@pytest.mark.parametrize(
    "EstimatorClass", (GradientBoostingClassifier, GradientBoostingRegressor)
)
@pytest.mark.parametrize("sparse_matrix", (csr_matrix, csc_matrix, coo_matrix))
def test_sparse_input(EstimatorClass, sparse_matrix):
    y, X = datasets.make_multilabel_classification(
        random_state=0, n_samples=50, n_features=1, n_classes=20
    )
    y = y[:, 0]
    X_sparse = sparse_matrix(X)

    dense = EstimatorClass(
        n_estimators=10, random_state=0, max_depth=2, min_impurity_decrease=1e-7
    ).fit(X, y)
    sparse = EstimatorClass(
        n_estimators=10, random_state=0, max_depth=2, min_impurity_decrease=1e-7
    ).fit(X_sparse, y)

    assert_array_almost_equal(sparse.apply(X), dense.apply(X))
    assert_array_almost_equal(sparse.predict(X), dense.predict(X))
    assert_array_almost_equal(sparse.feature_importances_, dense.feature_importances_)

    assert_array_almost_equal(sparse.predict(X_sparse), dense.predict(X))
    assert_array_almost_equal(dense.predict(X_sparse), sparse.predict(X))

    if issubclass(EstimatorClass, GradientBoostingClassifier):
        assert_array_almost_equal(sparse.predict_proba(X), dense.predict_proba(X))
        assert_array_almost_equal(
            sparse.predict_log_proba(X), dense.predict_log_proba(X)
        )
        assert_array_almost_equal(
            sparse.decision_function(X_sparse), sparse.decision_function(X)
        )
        assert_array_almost_equal(
            dense.decision_function(X_sparse), sparse.decision_function(X)
        )
        for res_sparse, res in zip(
            sparse.staged_decision_function(X_sparse),
            sparse.staged_decision_function(X),
        ):
            assert_array_almost_equal(res_sparse, res)


@pytest.mark.parametrize(
    "GradientBoostingEstimator", [GradientBoostingClassifier, GradientBoostingRegressor]
)
def test_gradient_boosting_early_stopping(GradientBoostingEstimator):
    # Check that early stopping works as expected, i.e. empirically check that
    # the number of trained estimators increases when the tolerance decreases.
    X, y = make_classification(n_samples=1000, random_state=0)
    n_estimators = 1000

    gb_large_tol = GradientBoostingEstimator(
        n_estimators=n_estimators,
        n_iter_no_change=10,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
        tol=1e-1,
    )

    gb_small_tol = GradientBoostingEstimator(
        n_estimators=n_estimators,
        n_iter_no_change=10,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
        tol=1e-3,
    )

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    gb_large_tol.fit(X_train, y_train)
    gb_small_tol.fit(X_train, y_train)

    assert gb_large_tol.n_estimators_ < gb_small_tol.n_estimators_ < n_estimators

    assert gb_large_tol.score(X_test, y_test) > 0.7
    assert gb_small_tol.score(X_test, y_test) > 0.7


def test_gradient_boosting_without_early_stopping():
    # When early stopping is not used, the number of trained estimators
    # must be the one specified.
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(
        n_estimators=50, learning_rate=0.1, max_depth=3, random_state=42
    )
    gbc.fit(X, y)
    gbr = GradientBoostingRegressor(
        n_estimators=30, learning_rate=0.1, max_depth=3, random_state=42
    )
    gbr.fit(X, y)

    # The number of trained estimators must be the one specified.
    assert gbc.n_estimators_ == 50
    assert gbr.n_estimators_ == 30


def test_gradient_boosting_validation_fraction():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(
        n_estimators=100,
        n_iter_no_change=10,
        validation_fraction=0.1,
        learning_rate=0.1,
        max_depth=3,
        random_state=42,
    )
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(
        n_estimators=100,
        n_iter_no_change=10,
        learning_rate=0.1,
        max_depth=3,
        validation_fraction=0.1,
        random_state=42,
    )
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Check if validation_fraction has an effect
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # Check that n_estimators_ increases monotonically with n_iter_no_change
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_


def test_early_stopping_stratified():
    # Make sure the data splitting for early stopping is stratified
    X = [[1, 2], [2, 3], [3, 4], [4, 5]]
    y = [0, 0, 0, 1]

    gbc = GradientBoostingClassifier(n_iter_no_change=5)
    with pytest.raises(
        ValueError, match="The least populated class in y has only 1 member"
    ):
        gbc.fit(X, y)


def _make_multiclass():
    return make_classification(n_classes=3, n_clusters_per_class=1)


@pytest.mark.parametrize(
    "gb, dataset_maker, init_estimator",
    [
        (GradientBoostingClassifier, make_classification, DummyClassifier),
        (GradientBoostingClassifier, _make_multiclass, DummyClassifier),
        (GradientBoostingRegressor, make_regression, DummyRegressor),
    ],
    ids=["binary classification", "multiclass classification", "regression"],
)
def test_gradient_boosting_with_init(
    gb, dataset_maker, init_estimator, global_random_seed
):
    # Check that the gradient boosting estimators work when init is a sklearn
    # estimator.
    # Check that an error is raised if trying to fit with sample weight but
    # the initial estimator does not support sample weights.
    X, y = dataset_maker()
    sample_weight = np.random.RandomState(global_random_seed).rand(100)

    # init supports sample weights
    init_est = init_estimator()
    gb(init=init_est).fit(X, y, sample_weight=sample_weight)

    # init does not support sample weights
    init_est = NoSampleWeightWrapper(init_estimator())
    gb(init=init_est).fit(X, y)  # ok, no sample weights
    with pytest.raises(ValueError, match="estimator.*does not support sample weights"):
        gb(init=init_est).fit(X, y, sample_weight=sample_weight)


def test_gradient_boosting_with_init_pipeline():
    # Check that the init estimator can be a pipeline (see issue #13466)
    X, y = make_regression(random_state=0)
    init = make_pipeline(LinearRegression())
    gb = GradientBoostingRegressor(init=init)
    gb.fit(X, y)  # pipeline without sample_weight works fine

    with pytest.raises(
        ValueError,
        match="The initial estimator Pipeline does not support sample weights",
    ):
        gb.fit(X, y, sample_weight=np.ones(X.shape[0]))

    # Passing sample_weight to a pipeline raises a ValueError. This test makes
    # sure we make the distinction between a ValueError raised by a pipeline that
    # was passed sample_weight, and an InvalidParameterError raised by a regular
    # estimator whose input checking failed.
    invalid_nu = 1.5
    err_msg = (
        "The 'nu' parameter of NuSVR must be a float in the"
        f" range (0.0, 1.0]. Got {invalid_nu} instead."
    )
    with pytest.raises(InvalidParameterError, match=re.escape(err_msg)):
        # Note that NuSVR properly supports sample_weight
        init = NuSVR(gamma="auto", nu=invalid_nu)
        gb = GradientBoostingRegressor(init=init)
        gb.fit(X, y, sample_weight=np.ones(X.shape[0]))


def test_early_stopping_n_classes():
    # when doing early stopping (_, _, y_train, _ = train_test_split(X, y))
    # there might be classes in y that are missing in y_train. As the init
    # estimator will be trained on y_train, we need to raise an error if this
    # happens.
    X = [[1]] * 10
    y = [0, 0] + [1] * 8  # only 2 negative class samples over 10 samples
    gb = GradientBoostingClassifier(
        n_iter_no_change=5, random_state=0, validation_fraction=0.8
    )
    with pytest.raises(
        ValueError, match="The training data after the early stopping split"
    ):
        gb.fit(X, y)

    # No error if we let the training data be big enough
    gb = GradientBoostingClassifier(
        n_iter_no_change=5, random_state=0, validation_fraction=0.4
    )
    gb.fit(X, y)


def test_gbr_degenerate_feature_importances():
    # growing an ensemble of single node trees. See #13620
    X = np.zeros((10, 10))
    y = np.ones((10,))
    gbr = GradientBoostingRegressor().fit(X, y)
    assert_array_equal(gbr.feature_importances_, np.zeros(10, dtype=np.float64))