"""Bagging meta-estimator."""

# Author: Gilles Louppe <g.louppe@gmail.com>
# License: BSD 3 clause

import itertools
import numbers
from abc import ABCMeta, abstractmethod
from functools import partial
from numbers import Integral
from warnings import warn

import numpy as np

from ..base import ClassifierMixin, RegressorMixin, _fit_context
from ..metrics import accuracy_score, r2_score
from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
from ..utils import check_random_state, column_or_1d, indices_to_mask
from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions
from ..utils._tags import _safe_tags
from ..utils.metaestimators import available_if
from ..utils.multiclass import check_classification_targets
from ..utils.parallel import Parallel, delayed
from ..utils.random import sample_without_replacement
from ..utils.validation import _check_sample_weight, check_is_fitted, has_fit_parameter
from ._base import BaseEnsemble, _partition_estimators

__all__ = ["BaggingClassifier", "BaggingRegressor"]

MAX_INT = np.iinfo(np.int32).max


def _generate_indices(random_state, bootstrap, n_population, n_samples):
    """Draw randomly sampled indices."""
    # Draw sample indices
    if bootstrap:
        indices = random_state.randint(0, n_population, n_samples)
    else:
        indices = sample_without_replacement(
            n_population, n_samples, random_state=random_state
        )

    return indices
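
# Illustrative sketch (an assumption for exposition, not executed here): with
# bootstrap=True the helper uses `randint`, so indices may repeat; with
# bootstrap=False it delegates to `sample_without_replacement`, so indices are
# unique. For a seeded generator:
#
#     rng = np.random.RandomState(0)
#     with_rep = _generate_indices(rng, True, n_population=5, n_samples=5)
#     no_rep = _generate_indices(rng, False, n_population=5, n_samples=3)
#     assert len(np.unique(no_rep)) == 3  # all distinct without replacement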


def _generate_bagging_indices(
    random_state,
    bootstrap_features,
    bootstrap_samples,
    n_features,
    n_samples,
    max_features,
    max_samples,
):
    """Randomly draw feature and sample indices."""
    # Get valid random state
    random_state = check_random_state(random_state)

    # Draw indices
    feature_indices = _generate_indices(
        random_state, bootstrap_features, n_features, max_features
    )
    sample_indices = _generate_indices(
        random_state, bootstrap_samples, n_samples, max_samples
    )

    return feature_indices, sample_indices
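
# Note: the two `_generate_indices` calls above consume the RNG stream in a
# fixed order (features first, then samples), which is what lets
# `BaseBagging._get_estimators_indices` replay a stored per-estimator seed and
# recover exactly the indices used at fit time. A minimal sketch, assuming an
# integer seed:
#
#     feats, rows = _generate_bagging_indices(
#         42, False, True, n_features=10, n_samples=100,
#         max_features=5, max_samples=100,
#     )
#     # feats.shape == (5,) without replacement; rows.shape == (100,),
#     # possibly with repeats since bootstrap_samples=True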


def _parallel_build_estimators(
    n_estimators,
    ensemble,
    X,
    y,
    sample_weight,
    seeds,
    total_n_estimators,
    verbose,
    check_input,
):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_features = ensemble._max_features
    max_samples = ensemble._max_samples
    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.estimator_, "sample_weight")
    has_check_input = has_fit_parameter(ensemble.estimator_, "check_input")
    requires_feature_indexing = bootstrap_features or max_features != n_features

    if not support_sample_weight and sample_weight is not None:
        raise ValueError("The base estimator doesn't support sample weight")

    # Build estimators
    estimators = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print(
                "Building estimator %d of %d for this parallel run (total %d)..."
                % (i + 1, n_estimators, total_n_estimators)
            )

        random_state = seeds[i]
        estimator = ensemble._make_estimator(append=False, random_state=random_state)

        if has_check_input:
            estimator_fit = partial(estimator.fit, check_input=check_input)
        else:
            estimator_fit = estimator.fit

        # Draw random feature, sample indices
        features, indices = _generate_bagging_indices(
            random_state,
            bootstrap_features,
            bootstrap,
            n_features,
            n_samples,
            max_features,
            max_samples,
        )

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                sample_counts = np.bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                not_indices_mask = ~indices_to_mask(indices, n_samples)
                curr_sample_weight[not_indices_mask] = 0

            X_ = X[:, features] if requires_feature_indexing else X
            estimator_fit(X_, y, sample_weight=curr_sample_weight)
        else:
            X_ = X[indices][:, features] if requires_feature_indexing else X[indices]
            estimator_fit(X_, y[indices])

        estimators.append(estimator)
        estimators_features.append(features)

    return estimators, estimators_features
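
# Sketch of the sample-weight trick used above: for estimators where an
# integer weight k is equivalent to repeating a row k times (e.g. decision
# trees), reweighting by the bootstrap counts matches fitting on the
# resampled rows while keeping X intact:
#
#     sample_counts = np.bincount(indices, minlength=n_samples)
#     # estimator.fit(X, y, sample_weight=base_weight * sample_counts)
#     # is then equivalent to estimator.fit(X[indices], y[indices])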


def _parallel_predict_proba(estimators, estimators_features, X, n_classes):
    """Private function used to compute (proba-)predictions within a job."""
    n_samples = X.shape[0]
    proba = np.zeros((n_samples, n_classes))

    for estimator, features in zip(estimators, estimators_features):
        if hasattr(estimator, "predict_proba"):
            proba_estimator = estimator.predict_proba(X[:, features])

            if n_classes == len(estimator.classes_):
                proba += proba_estimator
            else:
                proba[:, estimator.classes_] += proba_estimator[
                    :, range(len(estimator.classes_))
                ]
        else:
            # Resort to voting
            predictions = estimator.predict(X[:, features])

            for i in range(n_samples):
                proba[i, predictions[i]] += 1

    return proba
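
# Note on the class-subset branch above: a member fitted on a bootstrap sample
# may have seen only some classes. Its `classes_` (encoded integer labels at
# this point) act as column indices that scatter its probabilities into the
# full (n_samples, n_classes) array; e.g. a member with classes_ == [0, 2]
# fills columns 0 and 2 of `proba` and leaves column 1 untouched.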


def _parallel_predict_log_proba(estimators, estimators_features, X, n_classes):
    """Private function used to compute log probabilities within a job."""
    n_samples = X.shape[0]
    log_proba = np.empty((n_samples, n_classes))
    log_proba.fill(-np.inf)
    all_classes = np.arange(n_classes, dtype=int)

    for estimator, features in zip(estimators, estimators_features):
        log_proba_estimator = estimator.predict_log_proba(X[:, features])

        if n_classes == len(estimator.classes_):
            log_proba = np.logaddexp(log_proba, log_proba_estimator)
        else:
            log_proba[:, estimator.classes_] = np.logaddexp(
                log_proba[:, estimator.classes_],
                log_proba_estimator[:, range(len(estimator.classes_))],
            )
            missing = np.setdiff1d(all_classes, estimator.classes_)
            log_proba[:, missing] = np.logaddexp(log_proba[:, missing], -np.inf)

    return log_proba


def _parallel_decision_function(estimators, estimators_features, X):
    """Private function used to compute decisions within a job."""
    return sum(
        estimator.decision_function(X[:, features])
        for estimator, features in zip(estimators, estimators_features)
    )


def _parallel_predict_regression(estimators, estimators_features, X):
    """Private function used to compute predictions within a job."""
    return sum(
        estimator.predict(X[:, features])
        for estimator, features in zip(estimators, estimators_features)
    )


def _estimator_has(attr):
    """Check if we can delegate a method to the underlying estimator.

    First, we check the first fitted estimator if available, otherwise we
    check the estimator attribute.
    """

    def check(self):
        if hasattr(self, "estimators_"):
            return hasattr(self.estimators_[0], attr)
        elif self.estimator is not None:
            return hasattr(self.estimator, attr)
        else:  # TODO(1.4): Remove when the base_estimator deprecation cycle ends
            return hasattr(self.base_estimator, attr)

    return check
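
# Minimal usage sketch: `_estimator_has` returns a callable for
# `available_if`, so a delegated method only exists when the underlying
# estimator provides it, as in `BaggingClassifier.decision_function` below:
#
#     @available_if(_estimator_has("decision_function"))
#     def decision_function(self, X): ...
#
# hasattr(bagging, "decision_function") is then True for an SVC base
# estimator but False for a plain DecisionTreeClassifier.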


class BaseBagging(BaseEnsemble, metaclass=ABCMeta):
    """Base class for Bagging meta-estimator.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """

    _parameter_constraints: dict = {
        "estimator": [HasMethods(["fit", "predict"]), None],
        "n_estimators": [Interval(Integral, 1, None, closed="left")],
        "max_samples": [
            Interval(Integral, 1, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="right"),
        ],
        "max_features": [
            Interval(Integral, 1, None, closed="left"),
            Interval(RealNotInt, 0, 1, closed="right"),
        ],
        "bootstrap": ["boolean"],
        "bootstrap_features": ["boolean"],
        "oob_score": ["boolean"],
        "warm_start": ["boolean"],
        "n_jobs": [None, Integral],
        "random_state": ["random_state"],
        "verbose": ["verbose"],
        "base_estimator": [
            HasMethods(["fit", "predict"]),
            StrOptions({"deprecated"}),
            None,
        ],
    }

    @abstractmethod
    def __init__(
        self,
        estimator=None,
        n_estimators=10,
        *,
        max_samples=1.0,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        oob_score=False,
        warm_start=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
        base_estimator="deprecated",
    ):
        super().__init__(
            estimator=estimator,
            n_estimators=n_estimators,
            base_estimator=base_estimator,
        )
        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.bootstrap_features = bootstrap_features
        self.oob_score = oob_score
        self.warm_start = warm_start
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    @_fit_context(
        # BaseBagging.estimator is not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y, sample_weight=None):
        """Build a Bagging ensemble of estimators from the training set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        y : array-like of shape (n_samples,)
            The target values (class labels in classification, real numbers in
            regression).

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if the base estimator supports
            sample weighting.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # Convert data (X is required to be 2d and indexable)
        X, y = self._validate_data(
            X,
            y,
            accept_sparse=["csr", "csc"],
            dtype=None,
            force_all_finite=False,
            multi_output=True,
        )
        return self._fit(X, y, self.max_samples, sample_weight=sample_weight)

    def _parallel_args(self):
        return {}

    def _fit(
        self,
        X,
        y,
        max_samples=None,
        max_depth=None,
        sample_weight=None,
        check_input=True,
    ):
        """Build a Bagging ensemble of estimators from the training
        set (X, y).

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        y : array-like of shape (n_samples,)
            The target values (class labels in classification, real numbers in
            regression).

        max_samples : int or float, default=None
            Argument to use instead of self.max_samples.

        max_depth : int, default=None
            Override value used when constructing base estimator. Only
            supported if the base estimator has a max_depth parameter.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if the base estimator supports
            sample weighting.

        check_input : bool, default=True
            Override value used when fitting base estimator. Only supported
            if the base estimator has a check_input parameter for fit function.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        random_state = check_random_state(self.random_state)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=None)

        # Remap output
        n_samples = X.shape[0]
        self._n_samples = n_samples
        y = self._validate_y(y)

        # Check parameters
        self._validate_estimator()

        if max_depth is not None:
            self.estimator_.max_depth = max_depth

        # Validate max_samples
        if max_samples is None:
            max_samples = self.max_samples
        elif not isinstance(max_samples, numbers.Integral):
            max_samples = int(max_samples * X.shape[0])

        if max_samples > X.shape[0]:
            raise ValueError("max_samples must be <= n_samples")

        # Store validated integer row sampling value
        self._max_samples = max_samples

        # Validate max_features
        if isinstance(self.max_features, numbers.Integral):
            max_features = self.max_features
        elif isinstance(self.max_features, float):
            max_features = int(self.max_features * self.n_features_in_)

        if max_features > self.n_features_in_:
            raise ValueError("max_features must be <= n_features")

        max_features = max(1, int(max_features))

        # Store validated integer feature sampling value
        self._max_features = max_features

        # Other checks
        if not self.bootstrap and self.oob_score:
            raise ValueError("Out of bag estimation only available if bootstrap=True")

        if self.warm_start and self.oob_score:
            raise ValueError("Out of bag estimate only available if warm_start=False")

        if hasattr(self, "oob_score_") and self.warm_start:
            del self.oob_score_

        if not self.warm_start or not hasattr(self, "estimators_"):
            # Free allocated memory, if any
            self.estimators_ = []
            self.estimators_features_ = []

        n_more_estimators = self.n_estimators - len(self.estimators_)

        if n_more_estimators < 0:
            raise ValueError(
                "n_estimators=%d must be larger or equal to "
                "len(estimators_)=%d when warm_start==True"
                % (self.n_estimators, len(self.estimators_))
            )

        elif n_more_estimators == 0:
            warn(
                "Warm-start fitting without increasing n_estimators does not "
                "fit new trees."
            )
            return self

        # Parallel loop
        n_jobs, n_estimators, starts = _partition_estimators(
            n_more_estimators, self.n_jobs
        )
        total_n_estimators = sum(n_estimators)

        # Advance random state to state after training
        # the first n_estimators
        if self.warm_start and len(self.estimators_) > 0:
            random_state.randint(MAX_INT, size=len(self.estimators_))

        seeds = random_state.randint(MAX_INT, size=n_more_estimators)
        self._seeds = seeds

        all_results = Parallel(
            n_jobs=n_jobs, verbose=self.verbose, **self._parallel_args()
        )(
            delayed(_parallel_build_estimators)(
                n_estimators[i],
                self,
                X,
                y,
                sample_weight,
                seeds[starts[i] : starts[i + 1]],
                total_n_estimators,
                verbose=self.verbose,
                check_input=check_input,
            )
            for i in range(n_jobs)
        )

        # Reduce
        self.estimators_ += list(
            itertools.chain.from_iterable(t[0] for t in all_results)
        )
        self.estimators_features_ += list(
            itertools.chain.from_iterable(t[1] for t in all_results)
        )

        if self.oob_score:
            self._set_oob_score(X, y)

        return self
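
    # Warm-start reproducibility sketch (a property of the seed handling
    # above, shown under the assumption of a fixed integer random_state):
    # because `_fit` first draws and discards one seed per already-fitted
    # estimator, growing the ensemble in two steps draws the same seeds as a
    # single larger fit:
    #
    #     a = BaggingClassifier(n_estimators=10, random_state=0).fit(X, y)
    #     b = BaggingClassifier(n_estimators=5, warm_start=True,
    #                           random_state=0).fit(X, y)
    #     b.n_estimators = 10
    #     b.fit(X, y)  # estimators 6-10 use the same seeds as in `a`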

    @abstractmethod
    def _set_oob_score(self, X, y):
        """Calculate out of bag predictions and score."""

    def _validate_y(self, y):
        if len(y.shape) == 1 or y.shape[1] == 1:
            return column_or_1d(y, warn=True)

        return y

    def _get_estimators_indices(self):
        # Get drawn indices along both sample and feature axes
        for seed in self._seeds:
            # Operations accessing random_state must be performed identically
            # to those in `_parallel_build_estimators()`
            feature_indices, sample_indices = _generate_bagging_indices(
                seed,
                self.bootstrap_features,
                self.bootstrap,
                self.n_features_in_,
                self._n_samples,
                self._max_features,
                self._max_samples,
            )

            yield feature_indices, sample_indices

    @property
    def estimators_samples_(self):
        """
        The subset of drawn samples for each base estimator.

        Returns a dynamically generated list of indices identifying
        the samples used for fitting each member of the ensemble, i.e.,
        the in-bag samples.

        Note: the list is re-created at each call to the property in order
        to reduce the object memory footprint by not storing the sampling
        data. Thus fetching the property may be slower than expected.
        """
        return [sample_indices for _, sample_indices in self._get_estimators_indices()]
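
    # Sketch: the out-of-bag rows for each member can be recovered from this
    # property; assuming a fitted instance `bag`:
    #
    #     oob_masks = [
    #         ~indices_to_mask(in_bag, bag._n_samples)
    #         for in_bag in bag.estimators_samples_
    #     ]
    #
    # which is exactly the mask construction used by `_set_oob_score` in the
    # subclasses below.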


class BaggingClassifier(ClassifierMixin, BaseBagging):
    """A Bagging classifier.

    A Bagging classifier is an ensemble meta-estimator that fits base
    classifiers each on random subsets of the original dataset and then
    aggregates their individual predictions (either by voting or by averaging)
    to form a final prediction. Such a meta-estimator can typically be used as
    a way to reduce the variance of a black-box estimator (e.g., a decision
    tree), by introducing randomization into its construction procedure and
    then making an ensemble out of it.

    This algorithm encompasses several works from the literature. When random
    subsets of the dataset are drawn as random subsets of the samples, then
    this algorithm is known as Pasting [1]_. If samples are drawn with
    replacement, then the method is known as Bagging [2]_. When random subsets
    of the dataset are drawn as random subsets of the features, then the method
    is known as Random Subspaces [3]_. Finally, when base estimators are built
    on subsets of both samples and features, then the method is known as
    Random Patches [4]_.

    Read more in the :ref:`User Guide <bagging>`.

    .. versionadded:: 0.15

    Parameters
    ----------
    estimator : object, default=None
        The base estimator to fit on random subsets of the dataset.
        If None, then the base estimator is a
        :class:`~sklearn.tree.DecisionTreeClassifier`.

        .. versionadded:: 1.2
           `base_estimator` was renamed to `estimator`.

    n_estimators : int, default=10
        The number of base estimators in the ensemble.

    max_samples : int or float, default=1.0
        The number of samples to draw from X to train each base estimator (with
        replacement by default, see `bootstrap` for more details).

        - If int, then draw `max_samples` samples.
        - If float, then draw `max_samples * X.shape[0]` samples.

    max_features : int or float, default=1.0
        The number of features to draw from X to train each base estimator (
        without replacement by default, see `bootstrap_features` for more
        details).

        - If int, then draw `max_features` features.
        - If float, then draw `max(1, int(max_features * n_features_in_))` features.

    bootstrap : bool, default=True
        Whether samples are drawn with replacement. If False, sampling
        without replacement is performed.

    bootstrap_features : bool, default=False
        Whether features are drawn with replacement.

    oob_score : bool, default=False
        Whether to use out-of-bag samples to estimate
        the generalization error. Only available if bootstrap=True.

    warm_start : bool, default=False
        When set to True, reuse the solution of the previous call to fit
        and add more estimators to the ensemble, otherwise, just fit
        a whole new ensemble. See :term:`the Glossary <warm_start>`.

        .. versionadded:: 0.17
           *warm_start* constructor parameter.

    n_jobs : int, default=None
        The number of jobs to run in parallel for both :meth:`fit` and
        :meth:`predict`. ``None`` means 1 unless in a
        :obj:`joblib.parallel_backend` context. ``-1`` means using all
        processors. See :term:`Glossary <n_jobs>` for more details.

    random_state : int, RandomState instance or None, default=None
        Controls the random resampling of the original dataset
        (sample wise and feature wise).
        If the base estimator accepts a `random_state` parameter, a different
        seed is generated for each instance in the ensemble.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    verbose : int, default=0
        Controls the verbosity when fitting and predicting.

    base_estimator : object, default="deprecated"
        Use `estimator` instead.

        .. deprecated:: 1.2
            `base_estimator` is deprecated and will be removed in 1.4.
            Use `estimator` instead.

    Attributes
    ----------
    estimator_ : estimator
        The base estimator from which the ensemble is grown.

        .. versionadded:: 1.2
           `base_estimator_` was renamed to `estimator_`.

    base_estimator_ : estimator
        The base estimator from which the ensemble is grown.

        .. deprecated:: 1.2
            `base_estimator_` is deprecated and will be removed in 1.4.
            Use `estimator_` instead.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    estimators_ : list of estimators
        The collection of fitted base estimators.

    estimators_samples_ : list of arrays
        The subset of drawn samples (i.e., the in-bag samples) for each base
        estimator. Each subset is defined by an array of the indices selected.

    estimators_features_ : list of arrays
        The subset of drawn features for each base estimator.

    classes_ : ndarray of shape (n_classes,)
        The classes labels.

    n_classes_ : int or list
        The number of classes.

    oob_score_ : float
        Score of the training dataset obtained using an out-of-bag estimate.
        This attribute exists only when ``oob_score`` is True.

    oob_decision_function_ : ndarray of shape (n_samples, n_classes)
        Decision function computed with out-of-bag estimate on the training
        set. If n_estimators is small it might be possible that a data point
        was never left out during the bootstrap. In this case,
        `oob_decision_function_` might contain NaN. This attribute exists
        only when ``oob_score`` is True.

    See Also
    --------
    BaggingRegressor : A Bagging regressor.

    References
    ----------
    .. [1] L. Breiman, "Pasting small votes for classification in large
           databases and on-line", Machine Learning, 36(1), 85-103, 1999.

    .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140,
           1996.

    .. [3] T. Ho, "The random subspace method for constructing decision
           forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844,
           1998.

    .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine
           Learning and Knowledge Discovery in Databases, 346-361, 2012.

    Examples
    --------
    >>> from sklearn.svm import SVC
    >>> from sklearn.ensemble import BaggingClassifier
    >>> from sklearn.datasets import make_classification
    >>> X, y = make_classification(n_samples=100, n_features=4,
    ...                            n_informative=2, n_redundant=0,
    ...                            random_state=0, shuffle=False)
    >>> clf = BaggingClassifier(estimator=SVC(),
    ...                         n_estimators=10, random_state=0).fit(X, y)
    >>> clf.predict([[0, 0, 0, 0]])
    array([1])
    """

    def __init__(
        self,
        estimator=None,
        n_estimators=10,
        *,
        max_samples=1.0,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        oob_score=False,
        warm_start=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
        base_estimator="deprecated",
    ):
        super().__init__(
            estimator=estimator,
            n_estimators=n_estimators,
            max_samples=max_samples,
            max_features=max_features,
            bootstrap=bootstrap,
            bootstrap_features=bootstrap_features,
            oob_score=oob_score,
            warm_start=warm_start,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            base_estimator=base_estimator,
        )

    def _validate_estimator(self):
        """Check the estimator and set the estimator_ attribute."""
        super()._validate_estimator(default=DecisionTreeClassifier())

    def _set_oob_score(self, X, y):
        n_samples = y.shape[0]
        n_classes_ = self.n_classes_

        predictions = np.zeros((n_samples, n_classes_))

        for estimator, samples, features in zip(
            self.estimators_, self.estimators_samples_, self.estimators_features_
        ):
            # Create mask for OOB samples
            mask = ~indices_to_mask(samples, n_samples)

            if hasattr(estimator, "predict_proba"):
                predictions[mask, :] += estimator.predict_proba(
                    (X[mask, :])[:, features]
                )
            else:
                p = estimator.predict((X[mask, :])[:, features])
                j = 0

                for i in range(n_samples):
                    if mask[i]:
                        predictions[i, p[j]] += 1
                        j += 1

        if (predictions.sum(axis=1) == 0).any():
            warn(
                "Some inputs do not have OOB scores. "
                "This probably means too few estimators were used "
                "to compute any reliable oob estimates."
            )

        oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
        oob_score = accuracy_score(y, np.argmax(predictions, axis=1))

        self.oob_decision_function_ = oob_decision_function
        self.oob_score_ = oob_score

    def _validate_y(self, y):
        y = column_or_1d(y, warn=True)
        check_classification_targets(y)
        self.classes_, y = np.unique(y, return_inverse=True)
        self.n_classes_ = len(self.classes_)

        return y

    def predict(self, X):
        """Predict class for X.

        The predicted class of an input sample is computed as the class with
        the highest mean predicted probability. If base estimators do not
        implement a ``predict_proba`` method, then it resorts to voting.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            The predicted classes.
        """
        predicted_probability = self.predict_proba(X)
        return self.classes_.take((np.argmax(predicted_probability, axis=1)), axis=0)

    def predict_proba(self, X):
        """Predict class probabilities for X.

        The predicted class probabilities of an input sample are computed as
        the mean predicted class probabilities of the base estimators in the
        ensemble. If base estimators do not implement a ``predict_proba``
        method, then it resorts to voting and the predicted class probabilities
        of an input sample represent the proportion of estimators predicting
        each class.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        Returns
        -------
        p : ndarray of shape (n_samples, n_classes)
            The class probabilities of the input samples. The order of the
            classes corresponds to that in the attribute :term:`classes_`.
        """
        check_is_fitted(self)
        # Check data
        X = self._validate_data(
            X,
            accept_sparse=["csr", "csc"],
            dtype=None,
            force_all_finite=False,
            reset=False,
        )

        # Parallel loop
        n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs)

        all_proba = Parallel(
            n_jobs=n_jobs, verbose=self.verbose, **self._parallel_args()
        )(
            delayed(_parallel_predict_proba)(
                self.estimators_[starts[i] : starts[i + 1]],
                self.estimators_features_[starts[i] : starts[i + 1]],
                X,
                self.n_classes_,
            )
            for i in range(n_jobs)
        )

        # Reduce
        proba = sum(all_proba) / self.n_estimators

        return proba
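
    # Each job returns a *sum* of member probabilities, so one division by
    # n_estimators turns the grand total into the ensemble mean; e.g. with 10
    # estimators over 2 jobs, proba == (sum_job1 + sum_job2) / 10.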

    def predict_log_proba(self, X):
        """Predict class log-probabilities for X.

        The predicted class log-probabilities of an input sample are computed
        as the log of the mean predicted class probabilities of the base
        estimators in the ensemble.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        Returns
        -------
        p : ndarray of shape (n_samples, n_classes)
            The class log-probabilities of the input samples. The order of the
            classes corresponds to that in the attribute :term:`classes_`.
        """
        check_is_fitted(self)
        if hasattr(self.estimator_, "predict_log_proba"):
            # Check data
            X = self._validate_data(
                X,
                accept_sparse=["csr", "csc"],
                dtype=None,
                force_all_finite=False,
                reset=False,
            )

            # Parallel loop
            n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs)

            all_log_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
                delayed(_parallel_predict_log_proba)(
                    self.estimators_[starts[i] : starts[i + 1]],
                    self.estimators_features_[starts[i] : starts[i + 1]],
                    X,
                    self.n_classes_,
                )
                for i in range(n_jobs)
            )

            # Reduce
            log_proba = all_log_proba[0]

            for j in range(1, len(all_log_proba)):
                log_proba = np.logaddexp(log_proba, all_log_proba[j])

            log_proba -= np.log(self.n_estimators)
        else:
            log_proba = np.log(self.predict_proba(X))

        return log_proba
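
    # The reduction above averages in probability space without leaving the
    # log domain: logaddexp accumulates log(sum_k p_k) across jobs, and
    # subtracting log(n_estimators) converts the sum into a mean, so
    # log_proba == log(mean_k p_k) with no explicit exponentiation.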

    @available_if(_estimator_has("decision_function"))
    def decision_function(self, X):
        """Average of the decision functions of the base classifiers.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        Returns
        -------
        score : ndarray of shape (n_samples, k)
            The decision function of the input samples. The columns correspond
            to the classes in sorted order, as they appear in the attribute
            ``classes_``. Regression and binary classification are special
            cases with ``k == 1``, otherwise ``k==n_classes``.
        """
        check_is_fitted(self)

        # Check data
        X = self._validate_data(
            X,
            accept_sparse=["csr", "csc"],
            dtype=None,
            force_all_finite=False,
            reset=False,
        )

        # Parallel loop
        n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs)

        all_decisions = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_decision_function)(
                self.estimators_[starts[i] : starts[i + 1]],
                self.estimators_features_[starts[i] : starts[i + 1]],
                X,
            )
            for i in range(n_jobs)
        )

        # Reduce
        decisions = sum(all_decisions) / self.n_estimators

        return decisions

    def _more_tags(self):
        if self.estimator is None:
            estimator = DecisionTreeClassifier()
        else:
            estimator = self.estimator

        return {"allow_nan": _safe_tags(estimator, "allow_nan")}


class BaggingRegressor(RegressorMixin, BaseBagging):
    """A Bagging regressor.

    A Bagging regressor is an ensemble meta-estimator that fits base
    regressors each on random subsets of the original dataset and then
    aggregates their individual predictions by averaging to form a final
    prediction. Such a meta-estimator can typically be used as a way to reduce
    the variance of a black-box estimator (e.g., a decision tree), by
    introducing randomization into its construction procedure and then making
    an ensemble out of it.

    This algorithm encompasses several works from the literature. When random
    subsets of the dataset are drawn as random subsets of the samples, then
    this algorithm is known as Pasting [1]_. If samples are drawn with
    replacement, then the method is known as Bagging [2]_. When random subsets
    of the dataset are drawn as random subsets of the features, then the method
    is known as Random Subspaces [3]_. Finally, when base estimators are built
    on subsets of both samples and features, then the method is known as
    Random Patches [4]_.

    Read more in the :ref:`User Guide <bagging>`.

    .. versionadded:: 0.15

    Parameters
    ----------
    estimator : object, default=None
        The base estimator to fit on random subsets of the dataset.
        If None, then the base estimator is a
        :class:`~sklearn.tree.DecisionTreeRegressor`.

        .. versionadded:: 1.2
           `base_estimator` was renamed to `estimator`.

    n_estimators : int, default=10
        The number of base estimators in the ensemble.

    max_samples : int or float, default=1.0
        The number of samples to draw from X to train each base estimator (with
        replacement by default, see `bootstrap` for more details).

        - If int, then draw `max_samples` samples.
        - If float, then draw `max_samples * X.shape[0]` samples.

    max_features : int or float, default=1.0
        The number of features to draw from X to train each base estimator (
        without replacement by default, see `bootstrap_features` for more
        details).

        - If int, then draw `max_features` features.
        - If float, then draw `max(1, int(max_features * n_features_in_))` features.

    bootstrap : bool, default=True
        Whether samples are drawn with replacement. If False, sampling
        without replacement is performed.

    bootstrap_features : bool, default=False
        Whether features are drawn with replacement.

    oob_score : bool, default=False
        Whether to use out-of-bag samples to estimate
        the generalization error. Only available if bootstrap=True.

    warm_start : bool, default=False
        When set to True, reuse the solution of the previous call to fit
        and add more estimators to the ensemble, otherwise, just fit
        a whole new ensemble. See :term:`the Glossary <warm_start>`.

    n_jobs : int, default=None
        The number of jobs to run in parallel for both :meth:`fit` and
        :meth:`predict`. ``None`` means 1 unless in a
        :obj:`joblib.parallel_backend` context. ``-1`` means using all
        processors. See :term:`Glossary <n_jobs>` for more details.

    random_state : int, RandomState instance or None, default=None
        Controls the random resampling of the original dataset
        (sample wise and feature wise).
        If the base estimator accepts a `random_state` parameter, a different
        seed is generated for each instance in the ensemble.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    verbose : int, default=0
        Controls the verbosity when fitting and predicting.

    base_estimator : object, default="deprecated"
        Use `estimator` instead.

        .. deprecated:: 1.2
            `base_estimator` is deprecated and will be removed in 1.4.
            Use `estimator` instead.

    Attributes
    ----------
    estimator_ : estimator
        The base estimator from which the ensemble is grown.

        .. versionadded:: 1.2
           `base_estimator_` was renamed to `estimator_`.

    base_estimator_ : estimator
        The base estimator from which the ensemble is grown.

        .. deprecated:: 1.2
            `base_estimator_` is deprecated and will be removed in 1.4.
            Use `estimator_` instead.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    estimators_ : list of estimators
        The collection of fitted sub-estimators.

    estimators_samples_ : list of arrays
        The subset of drawn samples (i.e., the in-bag samples) for each base
        estimator. Each subset is defined by an array of the indices selected.

    estimators_features_ : list of arrays
        The subset of drawn features for each base estimator.

    oob_score_ : float
        Score of the training dataset obtained using an out-of-bag estimate.
        This attribute exists only when ``oob_score`` is True.

    oob_prediction_ : ndarray of shape (n_samples,)
        Prediction computed with out-of-bag estimate on the training
        set. If n_estimators is small it might be possible that a data point
        was never left out during the bootstrap. In this case,
        `oob_prediction_` might contain NaN. This attribute exists only
        when ``oob_score`` is True.

    See Also
    --------
    BaggingClassifier : A Bagging classifier.

    References
    ----------
    .. [1] L. Breiman, "Pasting small votes for classification in large
           databases and on-line", Machine Learning, 36(1), 85-103, 1999.

    .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140,
           1996.

    .. [3] T. Ho, "The random subspace method for constructing decision
           forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844,
           1998.

    .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine
           Learning and Knowledge Discovery in Databases, 346-361, 2012.

    Examples
    --------
    >>> from sklearn.svm import SVR
    >>> from sklearn.ensemble import BaggingRegressor
    >>> from sklearn.datasets import make_regression
    >>> X, y = make_regression(n_samples=100, n_features=4,
    ...                        n_informative=2, n_targets=1,
    ...                        random_state=0, shuffle=False)
    >>> regr = BaggingRegressor(estimator=SVR(),
    ...                         n_estimators=10, random_state=0).fit(X, y)
    >>> regr.predict([[0, 0, 0, 0]])
    array([-2.8720...])
    """

    def __init__(
        self,
        estimator=None,
        n_estimators=10,
        *,
        max_samples=1.0,
        max_features=1.0,
        bootstrap=True,
        bootstrap_features=False,
        oob_score=False,
        warm_start=False,
        n_jobs=None,
        random_state=None,
        verbose=0,
        base_estimator="deprecated",
    ):
        super().__init__(
            estimator=estimator,
            n_estimators=n_estimators,
            max_samples=max_samples,
            max_features=max_features,
            bootstrap=bootstrap,
            bootstrap_features=bootstrap_features,
            oob_score=oob_score,
            warm_start=warm_start,
            n_jobs=n_jobs,
            random_state=random_state,
            verbose=verbose,
            base_estimator=base_estimator,
        )

    def predict(self, X):
        """Predict regression target for X.

        The predicted regression target of an input sample is computed as the
        mean predicted regression targets of the estimators in the ensemble.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only if
            they are supported by the base estimator.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            The predicted values.
        """
        check_is_fitted(self)
        # Check data
        X = self._validate_data(
            X,
            accept_sparse=["csr", "csc"],
            dtype=None,
            force_all_finite=False,
            reset=False,
        )

        # Parallel loop
        n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs)

        all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
            delayed(_parallel_predict_regression)(
                self.estimators_[starts[i] : starts[i + 1]],
                self.estimators_features_[starts[i] : starts[i + 1]],
                X,
            )
            for i in range(n_jobs)
        )

        # Reduce
        y_hat = sum(all_y_hat) / self.n_estimators

        return y_hat
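
    # e.g. a two-member ensemble predicting [1.0, 3.0] and [2.0, 5.0] returns
    # their mean [1.5, 4.0]; summing within each job and dividing once keeps
    # memory at one accumulator per job instead of one array per estimator.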

    def _validate_estimator(self):
        """Check the estimator and set the estimator_ attribute."""
        super()._validate_estimator(default=DecisionTreeRegressor())

    def _set_oob_score(self, X, y):
        n_samples = y.shape[0]

        predictions = np.zeros((n_samples,))
        n_predictions = np.zeros((n_samples,))

        for estimator, samples, features in zip(
            self.estimators_, self.estimators_samples_, self.estimators_features_
        ):
            # Create mask for OOB samples
            mask = ~indices_to_mask(samples, n_samples)

            predictions[mask] += estimator.predict((X[mask, :])[:, features])
            n_predictions[mask] += 1

        if (n_predictions == 0).any():
            warn(
                "Some inputs do not have OOB scores. "
                "This probably means too few estimators were used "
                "to compute any reliable oob estimates."
            )
            n_predictions[n_predictions == 0] = 1

        predictions /= n_predictions

        self.oob_prediction_ = predictions
        self.oob_score_ = r2_score(y, predictions)

    def _more_tags(self):
        if self.estimator is None:
            estimator = DecisionTreeRegressor()
        else:
            estimator = self.estimator

        return {"allow_nan": _safe_tags(estimator, "allow_nan")}