- """Bagging meta-estimator."""
- # Author: Gilles Louppe <g.louppe@gmail.com>
- # License: BSD 3 clause
- import itertools
- import numbers
- from abc import ABCMeta, abstractmethod
- from functools import partial
- from numbers import Integral
- from warnings import warn
- import numpy as np
- from ..base import ClassifierMixin, RegressorMixin, _fit_context
- from ..metrics import accuracy_score, r2_score
- from ..tree import DecisionTreeClassifier, DecisionTreeRegressor
- from ..utils import check_random_state, column_or_1d, indices_to_mask
- from ..utils._param_validation import HasMethods, Interval, RealNotInt, StrOptions
- from ..utils._tags import _safe_tags
- from ..utils.metaestimators import available_if
- from ..utils.multiclass import check_classification_targets
- from ..utils.parallel import Parallel, delayed
- from ..utils.random import sample_without_replacement
- from ..utils.validation import _check_sample_weight, check_is_fitted, has_fit_parameter
- from ._base import BaseEnsemble, _partition_estimators
- __all__ = ["BaggingClassifier", "BaggingRegressor"]
- MAX_INT = np.iinfo(np.int32).max
- def _generate_indices(random_state, bootstrap, n_population, n_samples):
- """Draw randomly sampled indices."""
- # Draw sample indices
- if bootstrap:
- indices = random_state.randint(0, n_population, n_samples)
- else:
- indices = sample_without_replacement(
- n_population, n_samples, random_state=random_state
- )
- return indices
- def _generate_bagging_indices(
- random_state,
- bootstrap_features,
- bootstrap_samples,
- n_features,
- n_samples,
- max_features,
- max_samples,
- ):
- """Randomly draw feature and sample indices."""
- # Get valid random state
- random_state = check_random_state(random_state)
- # Draw indices
- feature_indices = _generate_indices(
- random_state, bootstrap_features, n_features, max_features
- )
- sample_indices = _generate_indices(
- random_state, bootstrap_samples, n_samples, max_samples
- )
- return feature_indices, sample_indices
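- # Illustrative sketch (not part of the module): both helpers above are
- # deterministic given a seed -- `_generate_bagging_indices` draws the feature
- # and sample index sets from a single validated RandomState, so replaying the
- # same seed reproduces the same pair. `_get_estimators_indices` further below
- # relies on exactly this property instead of storing the drawn indices.
- #
- #   >>> feat, samp = _generate_bagging_indices(
- #   ...     0, False, True, n_features=4, n_samples=10,
- #   ...     max_features=2, max_samples=10)
- #   >>> feat2, samp2 = _generate_bagging_indices(
- #   ...     0, False, True, n_features=4, n_samples=10,
- #   ...     max_features=2, max_samples=10)
- #   >>> bool((feat == feat2).all() and (samp == samp2).all())
- #   True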
- def _parallel_build_estimators(
- n_estimators,
- ensemble,
- X,
- y,
- sample_weight,
- seeds,
- total_n_estimators,
- verbose,
- check_input,
- ):
- """Private function used to build a batch of estimators within a job."""
- # Retrieve settings
- n_samples, n_features = X.shape
- max_features = ensemble._max_features
- max_samples = ensemble._max_samples
- bootstrap = ensemble.bootstrap
- bootstrap_features = ensemble.bootstrap_features
- support_sample_weight = has_fit_parameter(ensemble.estimator_, "sample_weight")
- has_check_input = has_fit_parameter(ensemble.estimator_, "check_input")
- requires_feature_indexing = bootstrap_features or max_features != n_features
- if not support_sample_weight and sample_weight is not None:
- raise ValueError("The base estimator doesn't support sample weight")
- # Build estimators
- estimators = []
- estimators_features = []
- for i in range(n_estimators):
- if verbose > 1:
- print(
- "Building estimator %d of %d for this parallel run (total %d)..."
- % (i + 1, n_estimators, total_n_estimators)
- )
- random_state = seeds[i]
- estimator = ensemble._make_estimator(append=False, random_state=random_state)
- if has_check_input:
- estimator_fit = partial(estimator.fit, check_input=check_input)
- else:
- estimator_fit = estimator.fit
- # Draw random feature, sample indices
- features, indices = _generate_bagging_indices(
- random_state,
- bootstrap_features,
- bootstrap,
- n_features,
- n_samples,
- max_features,
- max_samples,
- )
- # Draw samples, using sample weights, and then fit
- if support_sample_weight:
- if sample_weight is None:
- curr_sample_weight = np.ones((n_samples,))
- else:
- curr_sample_weight = sample_weight.copy()
- if bootstrap:
- sample_counts = np.bincount(indices, minlength=n_samples)
- curr_sample_weight *= sample_counts
- else:
- not_indices_mask = ~indices_to_mask(indices, n_samples)
- curr_sample_weight[not_indices_mask] = 0
- X_ = X[:, features] if requires_feature_indexing else X
- estimator_fit(X_, y, sample_weight=curr_sample_weight)
- else:
- X_ = X[indices][:, features] if requires_feature_indexing else X[indices]
- estimator_fit(X_, y[indices])
- estimators.append(estimator)
- estimators_features.append(features)
- return estimators, estimators_features
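- # Note on the weighted branch above (illustrative sketch, assuming the base
- # estimator accepts `sample_weight`): the bootstrap is emulated by multiplying
- # each row's weight by the number of times it was drawn, which is equivalent
- # to materialising `X[indices]` but avoids the copy:
- #
- #   >>> import numpy as np
- #   >>> indices = np.array([0, 0, 3, 3, 3])   # a bootstrap draw, n_samples=5
- #   >>> np.bincount(indices, minlength=5)     # per-row repeat counts
- #   array([2, 0, 0, 3, 0])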
- def _parallel_predict_proba(estimators, estimators_features, X, n_classes):
- """Private function used to compute (proba-)predictions within a job."""
- n_samples = X.shape[0]
- proba = np.zeros((n_samples, n_classes))
- for estimator, features in zip(estimators, estimators_features):
- if hasattr(estimator, "predict_proba"):
- proba_estimator = estimator.predict_proba(X[:, features])
- if n_classes == len(estimator.classes_):
- proba += proba_estimator
- else:
- proba[:, estimator.classes_] += proba_estimator[
- :, range(len(estimator.classes_))
- ]
- else:
- # Resort to voting
- predictions = estimator.predict(X[:, features])
- for i in range(n_samples):
- proba[i, predictions[i]] += 1
- return proba
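- # Illustrative sketch of the `estimator.classes_` re-indexing above: a member
- # fitted on a small bootstrap sample may have seen only a subset of the
- # ensemble's classes. Because `y` is label-encoded before fitting, a member's
- # `classes_` doubles as the column positions to add into. Assuming a member
- # that saw classes 0 and 2 out of n_classes=3:
- #
- #   >>> import numpy as np
- #   >>> proba = np.zeros((1, 3))
- #   >>> member_classes = np.array([0, 2])
- #   >>> member_proba = np.array([[0.7, 0.3]])
- #   >>> proba[:, member_classes] += member_proba[:, range(2)]
- #   >>> proba
- #   array([[0.7, 0. , 0.3]])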
- def _parallel_predict_log_proba(estimators, estimators_features, X, n_classes):
- """Private function used to compute log probabilities within a job."""
- n_samples = X.shape[0]
- log_proba = np.empty((n_samples, n_classes))
- log_proba.fill(-np.inf)
- all_classes = np.arange(n_classes, dtype=int)
- for estimator, features in zip(estimators, estimators_features):
- log_proba_estimator = estimator.predict_log_proba(X[:, features])
- if n_classes == len(estimator.classes_):
- log_proba = np.logaddexp(log_proba, log_proba_estimator)
- else:
- log_proba[:, estimator.classes_] = np.logaddexp(
- log_proba[:, estimator.classes_],
- log_proba_estimator[:, range(len(estimator.classes_))],
- )
- missing = np.setdiff1d(all_classes, estimator.classes_)
- log_proba[:, missing] = np.logaddexp(log_proba[:, missing], -np.inf)
- return log_proba
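- # Illustrative sketch: `logaddexp` accumulates a *sum* of probabilities
- # without leaving log space; `predict_log_proba` below subtracts
- # `log(n_estimators)` afterwards, turning the sum into the log of the mean
- # probability:
- #
- #   >>> import numpy as np
- #   >>> p1, p2 = np.log(0.2), np.log(0.6)
- #   >>> log_mean = np.logaddexp(p1, p2) - np.log(2)
- #   >>> bool(np.isclose(log_mean, np.log((0.2 + 0.6) / 2)))
- #   True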
- def _parallel_decision_function(estimators, estimators_features, X):
- """Private function used to compute decisions within a job."""
- return sum(
- estimator.decision_function(X[:, features])
- for estimator, features in zip(estimators, estimators_features)
- )
- def _parallel_predict_regression(estimators, estimators_features, X):
- """Private function used to compute predictions within a job."""
- return sum(
- estimator.predict(X[:, features])
- for estimator, features in zip(estimators, estimators_features)
- )
- def _estimator_has(attr):
- """Check if we can delegate a method to the underlying estimator.
- First, we check the first fitted estimator if available, otherwise we
- check the estimator attribute.
- """
- def check(self):
- if hasattr(self, "estimators_"):
- return hasattr(self.estimators_[0], attr)
- elif self.estimator is not None:
- return hasattr(self.estimator, attr)
- else: # TODO(1.4): Remove when the base_estimator deprecation cycle ends
- return hasattr(self.base_estimator, attr)
- return check
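- # Illustrative sketch: `_estimator_has` is the predicate handed to
- # `available_if`, so methods such as `decision_function` appear on the bagging
- # object only when the underlying estimator provides them -- `hasattr`
- # mirrors the base estimator:
- #
- #   >>> from sklearn.svm import SVC
- #   >>> from sklearn.tree import DecisionTreeClassifier
- #   >>> hasattr(BaggingClassifier(estimator=SVC()), "decision_function")
- #   True
- #   >>> hasattr(BaggingClassifier(estimator=DecisionTreeClassifier()),
- #   ...         "decision_function")
- #   False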
- class BaseBagging(BaseEnsemble, metaclass=ABCMeta):
- """Base class for Bagging meta-estimator.
- Warning: This class should not be used directly. Use derived classes
- instead.
- """
- _parameter_constraints: dict = {
- "estimator": [HasMethods(["fit", "predict"]), None],
- "n_estimators": [Interval(Integral, 1, None, closed="left")],
- "max_samples": [
- Interval(Integral, 1, None, closed="left"),
- Interval(RealNotInt, 0, 1, closed="right"),
- ],
- "max_features": [
- Interval(Integral, 1, None, closed="left"),
- Interval(RealNotInt, 0, 1, closed="right"),
- ],
- "bootstrap": ["boolean"],
- "bootstrap_features": ["boolean"],
- "oob_score": ["boolean"],
- "warm_start": ["boolean"],
- "n_jobs": [None, Integral],
- "random_state": ["random_state"],
- "verbose": ["verbose"],
- "base_estimator": [
- HasMethods(["fit", "predict"]),
- StrOptions({"deprecated"}),
- None,
- ],
- }
- @abstractmethod
- def __init__(
- self,
- estimator=None,
- n_estimators=10,
- *,
- max_samples=1.0,
- max_features=1.0,
- bootstrap=True,
- bootstrap_features=False,
- oob_score=False,
- warm_start=False,
- n_jobs=None,
- random_state=None,
- verbose=0,
- base_estimator="deprecated",
- ):
- super().__init__(
- estimator=estimator,
- n_estimators=n_estimators,
- base_estimator=base_estimator,
- )
- self.max_samples = max_samples
- self.max_features = max_features
- self.bootstrap = bootstrap
- self.bootstrap_features = bootstrap_features
- self.oob_score = oob_score
- self.warm_start = warm_start
- self.n_jobs = n_jobs
- self.random_state = random_state
- self.verbose = verbose
- @_fit_context(
- # BaseBagging.estimator is not validated yet
- prefer_skip_nested_validation=False
- )
- def fit(self, X, y, sample_weight=None):
- """Build a Bagging ensemble of estimators from the training set (X, y).
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The training input samples. Sparse matrices are accepted only if
- they are supported by the base estimator.
- y : array-like of shape (n_samples,)
- The target values (class labels in classification, real numbers in
- regression).
- sample_weight : array-like of shape (n_samples,), default=None
- Sample weights. If None, then samples are equally weighted.
- Note that this is supported only if the base estimator supports
- sample weighting.
- Returns
- -------
- self : object
- Fitted estimator.
- """
- # Convert data (X is required to be 2d and indexable)
- X, y = self._validate_data(
- X,
- y,
- accept_sparse=["csr", "csc"],
- dtype=None,
- force_all_finite=False,
- multi_output=True,
- )
- return self._fit(X, y, self.max_samples, sample_weight=sample_weight)
- def _parallel_args(self):
- return {}
- def _fit(
- self,
- X,
- y,
- max_samples=None,
- max_depth=None,
- sample_weight=None,
- check_input=True,
- ):
- """Build a Bagging ensemble of estimators from the training
- set (X, y).
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The training input samples. Sparse matrices are accepted only if
- they are supported by the base estimator.
- y : array-like of shape (n_samples,)
- The target values (class labels in classification, real numbers in
- regression).
- max_samples : int or float, default=None
- Argument to use instead of self.max_samples.
- max_depth : int, default=None
- Override value used when constructing the base estimator. Only
- supported if the base estimator has a max_depth parameter.
- sample_weight : array-like of shape (n_samples,), default=None
- Sample weights. If None, then samples are equally weighted.
- Note that this is supported only if the base estimator supports
- sample weighting.
- check_input : bool, default=True
- Override value used when fitting the base estimator. Only supported
- if the base estimator's fit method has a check_input parameter.
- Returns
- -------
- self : object
- Fitted estimator.
- """
- random_state = check_random_state(self.random_state)
- if sample_weight is not None:
- sample_weight = _check_sample_weight(sample_weight, X, dtype=None)
- # Remap output
- n_samples = X.shape[0]
- self._n_samples = n_samples
- y = self._validate_y(y)
- # Check parameters
- self._validate_estimator()
- if max_depth is not None:
- self.estimator_.max_depth = max_depth
- # Validate max_samples
- if max_samples is None:
- max_samples = self.max_samples
- elif not isinstance(max_samples, numbers.Integral):
- max_samples = int(max_samples * X.shape[0])
- if max_samples > X.shape[0]:
- raise ValueError("max_samples must be <= n_samples")
- # Store validated integer row sampling value
- self._max_samples = max_samples
- # Validate max_features
- if isinstance(self.max_features, numbers.Integral):
- max_features = self.max_features
- elif isinstance(self.max_features, float):
- max_features = int(self.max_features * self.n_features_in_)
- if max_features > self.n_features_in_:
- raise ValueError("max_features must be <= n_features")
- max_features = max(1, int(max_features))
- # Store validated integer feature sampling value
- self._max_features = max_features
- # Other checks
- if not self.bootstrap and self.oob_score:
- raise ValueError("Out of bag estimation only available if bootstrap=True")
- if self.warm_start and self.oob_score:
- raise ValueError("Out of bag estimate only available if warm_start=False")
- if hasattr(self, "oob_score_") and self.warm_start:
- del self.oob_score_
- if not self.warm_start or not hasattr(self, "estimators_"):
- # Free allocated memory, if any
- self.estimators_ = []
- self.estimators_features_ = []
- n_more_estimators = self.n_estimators - len(self.estimators_)
- if n_more_estimators < 0:
- raise ValueError(
- "n_estimators=%d must be larger or equal to "
- "len(estimators_)=%d when warm_start==True"
- % (self.n_estimators, len(self.estimators_))
- )
- elif n_more_estimators == 0:
- warn(
- "Warm-start fitting without increasing n_estimators does not "
- "fit new trees."
- )
- return self
- # Parallel loop
- n_jobs, n_estimators, starts = _partition_estimators(
- n_more_estimators, self.n_jobs
- )
- total_n_estimators = sum(n_estimators)
- # Advance random state to state after training
- # the first n_estimators
- if self.warm_start and len(self.estimators_) > 0:
- random_state.randint(MAX_INT, size=len(self.estimators_))
- seeds = random_state.randint(MAX_INT, size=n_more_estimators)
- self._seeds = seeds
- all_results = Parallel(
- n_jobs=n_jobs, verbose=self.verbose, **self._parallel_args()
- )(
- delayed(_parallel_build_estimators)(
- n_estimators[i],
- self,
- X,
- y,
- sample_weight,
- seeds[starts[i] : starts[i + 1]],
- total_n_estimators,
- verbose=self.verbose,
- check_input=check_input,
- )
- for i in range(n_jobs)
- )
- # Reduce
- self.estimators_ += list(
- itertools.chain.from_iterable(t[0] for t in all_results)
- )
- self.estimators_features_ += list(
- itertools.chain.from_iterable(t[1] for t in all_results)
- )
- if self.oob_score:
- self._set_oob_score(X, y)
- return self
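- # Illustrative usage sketch for the warm-start path above: refitting with a
- # larger `n_estimators` keeps the members already built and trains only the
- # difference (note the seed bookkeeping above, which first advances the
- # random state past the existing members):
- #
- #   >>> from sklearn.datasets import make_classification
- #   >>> X, y = make_classification(random_state=0)
- #   >>> clf = BaggingClassifier(n_estimators=5, warm_start=True,
- #   ...                         random_state=0).fit(X, y)
- #   >>> _ = clf.set_params(n_estimators=10).fit(X, y)  # fits 5 more members
- #   >>> len(clf.estimators_)
- #   10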
- @abstractmethod
- def _set_oob_score(self, X, y):
- """Calculate out of bag predictions and score."""
- def _validate_y(self, y):
- if len(y.shape) == 1 or y.shape[1] == 1:
- return column_or_1d(y, warn=True)
- return y
- def _get_estimators_indices(self):
- # Get drawn indices along both sample and feature axes
- for seed in self._seeds:
- # Operations accessing random_state must be performed identically
- # to those in `_parallel_build_estimators()`
- feature_indices, sample_indices = _generate_bagging_indices(
- seed,
- self.bootstrap_features,
- self.bootstrap,
- self.n_features_in_,
- self._n_samples,
- self._max_features,
- self._max_samples,
- )
- yield feature_indices, sample_indices
- @property
- def estimators_samples_(self):
- """
- The subset of drawn samples for each base estimator.
- Returns a dynamically generated list of indices identifying
- the samples used for fitting each member of the ensemble, i.e.,
- the in-bag samples.
- Note: the list is re-created at each call to the property in order
- to reduce the object memory footprint by not storing the sampling
- data. Thus fetching the property may be slower than expected.
- """
- return [sample_indices for _, sample_indices in self._get_estimators_indices()]
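- # Illustrative sketch: the in-bag indices exposed by `estimators_samples_`
- # are enough to recover each member's out-of-bag rows, which is how the
- # `_set_oob_score` implementations below build their masks (cf.
- # `indices_to_mask`):
- #
- #   >>> import numpy as np
- #   >>> in_bag = np.array([1, 1, 4])      # drawn rows for one member, n=5
- #   >>> mask = np.zeros(5, dtype=bool)
- #   >>> mask[in_bag] = True
- #   >>> np.flatnonzero(~mask)             # the member's OOB rows
- #   array([0, 2, 3])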
- class BaggingClassifier(ClassifierMixin, BaseBagging):
- """A Bagging classifier.
- A Bagging classifier is an ensemble meta-estimator that fits base
- classifiers each on random subsets of the original dataset and then
- aggregates their individual predictions (either by voting or by averaging)
- to form a final prediction. Such a meta-estimator can typically be used as
- a way to reduce the variance of a black-box estimator (e.g., a decision
- tree), by introducing randomization into its construction procedure and
- then making an ensemble out of it.
- This algorithm encompasses several works from the literature. When random
- subsets of the dataset are drawn as random subsets of the samples, then
- this algorithm is known as Pasting [1]_. If samples are drawn with
- replacement, then the method is known as Bagging [2]_. When random subsets
- of the dataset are drawn as random subsets of the features, then the method
- is known as Random Subspaces [3]_. Finally, when base estimators are built
- on subsets of both samples and features, then the method is known as
- Random Patches [4]_.
- Read more in the :ref:`User Guide <bagging>`.
- .. versionadded:: 0.15
- Parameters
- ----------
- estimator : object, default=None
- The base estimator to fit on random subsets of the dataset.
- If None, then the base estimator is a
- :class:`~sklearn.tree.DecisionTreeClassifier`.
- .. versionadded:: 1.2
- `base_estimator` was renamed to `estimator`.
- n_estimators : int, default=10
- The number of base estimators in the ensemble.
- max_samples : int or float, default=1.0
- The number of samples to draw from X to train each base estimator (with
- replacement by default, see `bootstrap` for more details).
- - If int, then draw `max_samples` samples.
- - If float, then draw `max_samples * X.shape[0]` samples.
- max_features : int or float, default=1.0
- The number of features to draw from X to train each base estimator (
- without replacement by default, see `bootstrap_features` for more
- details).
- - If int, then draw `max_features` features.
- - If float, then draw `max(1, int(max_features * n_features_in_))` features.
- bootstrap : bool, default=True
- Whether samples are drawn with replacement. If False, sampling
- without replacement is performed.
- bootstrap_features : bool, default=False
- Whether features are drawn with replacement.
- oob_score : bool, default=False
- Whether to use out-of-bag samples to estimate
- the generalization error. Only available if bootstrap=True.
- warm_start : bool, default=False
- When set to True, reuse the solution of the previous call to fit
- and add more estimators to the ensemble, otherwise, just fit
- a whole new ensemble. See :term:`the Glossary <warm_start>`.
- .. versionadded:: 0.17
- *warm_start* constructor parameter.
- n_jobs : int, default=None
- The number of jobs to run in parallel for both :meth:`fit` and
- :meth:`predict`. ``None`` means 1 unless in a
- :obj:`joblib.parallel_backend` context. ``-1`` means using all
- processors. See :term:`Glossary <n_jobs>` for more details.
- random_state : int, RandomState instance or None, default=None
- Controls the random resampling of the original dataset
- (sample wise and feature wise).
- If the base estimator accepts a `random_state` parameter, a different
- seed is generated for each instance in the ensemble.
- Pass an int for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- verbose : int, default=0
- Controls the verbosity when fitting and predicting.
- base_estimator : object, default="deprecated"
- Use `estimator` instead.
- .. deprecated:: 1.2
- `base_estimator` is deprecated and will be removed in 1.4.
- Use `estimator` instead.
- Attributes
- ----------
- estimator_ : estimator
- The base estimator from which the ensemble is grown.
- .. versionadded:: 1.2
- `base_estimator_` was renamed to `estimator_`.
- base_estimator_ : estimator
- The base estimator from which the ensemble is grown.
- .. deprecated:: 1.2
- `base_estimator_` is deprecated and will be removed in 1.4.
- Use `estimator_` instead.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- estimators_ : list of estimators
- The collection of fitted base estimators.
- estimators_samples_ : list of arrays
- The subset of drawn samples (i.e., the in-bag samples) for each base
- estimator. Each subset is defined by an array of the indices selected.
- estimators_features_ : list of arrays
- The subset of drawn features for each base estimator.
- classes_ : ndarray of shape (n_classes,)
- The class labels.
- n_classes_ : int
- The number of classes.
- oob_score_ : float
- Score of the training dataset obtained using an out-of-bag estimate.
- This attribute exists only when ``oob_score`` is True.
- oob_decision_function_ : ndarray of shape (n_samples, n_classes)
- Decision function computed with out-of-bag estimate on the training
- set. If n_estimators is small, it is possible that a data point
- was never left out during the bootstrap. In this case,
- `oob_decision_function_` might contain NaN. This attribute exists
- only when ``oob_score`` is True.
- See Also
- --------
- BaggingRegressor : A Bagging regressor.
- References
- ----------
- .. [1] L. Breiman, "Pasting small votes for classification in large
- databases and on-line", Machine Learning, 36(1), 85-103, 1999.
- .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140,
- 1996.
- .. [3] T. Ho, "The random subspace method for constructing decision
- forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844,
- 1998.
- .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine
- Learning and Knowledge Discovery in Databases, 346-361, 2012.
- Examples
- --------
- >>> from sklearn.svm import SVC
- >>> from sklearn.ensemble import BaggingClassifier
- >>> from sklearn.datasets import make_classification
- >>> X, y = make_classification(n_samples=100, n_features=4,
- ... n_informative=2, n_redundant=0,
- ... random_state=0, shuffle=False)
- >>> clf = BaggingClassifier(estimator=SVC(),
- ... n_estimators=10, random_state=0).fit(X, y)
- >>> clf.predict([[0, 0, 0, 0]])
- array([1])
- """
- def __init__(
- self,
- estimator=None,
- n_estimators=10,
- *,
- max_samples=1.0,
- max_features=1.0,
- bootstrap=True,
- bootstrap_features=False,
- oob_score=False,
- warm_start=False,
- n_jobs=None,
- random_state=None,
- verbose=0,
- base_estimator="deprecated",
- ):
- super().__init__(
- estimator=estimator,
- n_estimators=n_estimators,
- max_samples=max_samples,
- max_features=max_features,
- bootstrap=bootstrap,
- bootstrap_features=bootstrap_features,
- oob_score=oob_score,
- warm_start=warm_start,
- n_jobs=n_jobs,
- random_state=random_state,
- verbose=verbose,
- base_estimator=base_estimator,
- )
- def _validate_estimator(self):
- """Check the estimator and set the estimator_ attribute."""
- super()._validate_estimator(default=DecisionTreeClassifier())
- def _set_oob_score(self, X, y):
- n_samples = y.shape[0]
- n_classes_ = self.n_classes_
- predictions = np.zeros((n_samples, n_classes_))
- for estimator, samples, features in zip(
- self.estimators_, self.estimators_samples_, self.estimators_features_
- ):
- # Create mask for OOB samples
- mask = ~indices_to_mask(samples, n_samples)
- if hasattr(estimator, "predict_proba"):
- predictions[mask, :] += estimator.predict_proba(
- (X[mask, :])[:, features]
- )
- else:
- p = estimator.predict((X[mask, :])[:, features])
- j = 0
- for i in range(n_samples):
- if mask[i]:
- predictions[i, p[j]] += 1
- j += 1
- if (predictions.sum(axis=1) == 0).any():
- warn(
- "Some inputs do not have OOB scores. "
- "This probably means too few estimators were used "
- "to compute any reliable oob estimates."
- )
- oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
- oob_score = accuracy_score(y, np.argmax(predictions, axis=1))
- self.oob_decision_function_ = oob_decision_function
- self.oob_score_ = oob_score
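- # Note (illustrative): a row that was in-bag for every member accumulates no
- # OOB votes, so the normalisation above divides 0 by 0 and leaves NaN in that
- # row of `oob_decision_function_` -- the situation the warning and the class
- # docstring refer to:
- #
- #   >>> import numpy as np
- #   >>> votes = np.array([[2.0, 1.0], [0.0, 0.0]])  # second row: never OOB
- #   >>> with np.errstate(invalid="ignore"):
- #   ...     votes / votes.sum(axis=1)[:, np.newaxis]
- #   array([[0.66666667, 0.33333333],
- #          [       nan,        nan]])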
- def _validate_y(self, y):
- y = column_or_1d(y, warn=True)
- check_classification_targets(y)
- self.classes_, y = np.unique(y, return_inverse=True)
- self.n_classes_ = len(self.classes_)
- return y
- def predict(self, X):
- """Predict class for X.
- The predicted class of an input sample is computed as the class with
- the highest mean predicted probability. If base estimators do not
- implement a ``predict_proba`` method, then it resorts to voting.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Sparse matrices are accepted only if
- they are supported by the base estimator.
- Returns
- -------
- y : ndarray of shape (n_samples,)
- The predicted classes.
- """
- predicted_probability = self.predict_proba(X)
- return self.classes_.take(np.argmax(predicted_probability, axis=1), axis=0)
- def predict_proba(self, X):
- """Predict class probabilities for X.
- The predicted class probabilities of an input sample are computed as
- the mean predicted class probabilities of the base estimators in the
- ensemble. If base estimators do not implement a ``predict_proba``
- method, then it resorts to voting and the predicted class probabilities
- of an input sample represent the proportion of estimators predicting
- each class.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Sparse matrices are accepted only if
- they are supported by the base estimator.
- Returns
- -------
- p : ndarray of shape (n_samples, n_classes)
- The class probabilities of the input samples. The order of the
- classes corresponds to that in the attribute :term:`classes_`.
- """
- check_is_fitted(self)
- # Check data
- X = self._validate_data(
- X,
- accept_sparse=["csr", "csc"],
- dtype=None,
- force_all_finite=False,
- reset=False,
- )
- # Parallel loop
- n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs)
- all_proba = Parallel(
- n_jobs=n_jobs, verbose=self.verbose, **self._parallel_args()
- )(
- delayed(_parallel_predict_proba)(
- self.estimators_[starts[i] : starts[i + 1]],
- self.estimators_features_[starts[i] : starts[i + 1]],
- X,
- self.n_classes_,
- )
- for i in range(n_jobs)
- )
- # Reduce
- proba = sum(all_proba) / self.n_estimators
- return proba
- def predict_log_proba(self, X):
- """Predict class log-probabilities for X.
- The predicted class log-probabilities of an input sample are computed as
- the log of the mean predicted class probabilities of the base
- estimators in the ensemble.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Sparse matrices are accepted only if
- they are supported by the base estimator.
- Returns
- -------
- p : ndarray of shape (n_samples, n_classes)
- The class log-probabilities of the input samples. The order of the
- classes corresponds to that in the attribute :term:`classes_`.
- """
- check_is_fitted(self)
- if hasattr(self.estimator_, "predict_log_proba"):
- # Check data
- X = self._validate_data(
- X,
- accept_sparse=["csr", "csc"],
- dtype=None,
- force_all_finite=False,
- reset=False,
- )
- # Parallel loop
- n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs)
- all_log_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
- delayed(_parallel_predict_log_proba)(
- self.estimators_[starts[i] : starts[i + 1]],
- self.estimators_features_[starts[i] : starts[i + 1]],
- X,
- self.n_classes_,
- )
- for i in range(n_jobs)
- )
- # Reduce
- log_proba = all_log_proba[0]
- for j in range(1, len(all_log_proba)):
- log_proba = np.logaddexp(log_proba, all_log_proba[j])
- log_proba -= np.log(self.n_estimators)
- else:
- log_proba = np.log(self.predict_proba(X))
- return log_proba
- @available_if(_estimator_has("decision_function"))
- def decision_function(self, X):
- """Average of the decision functions of the base classifiers.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Sparse matrices are accepted only if
- they are supported by the base estimator.
- Returns
- -------
- score : ndarray of shape (n_samples, k)
- The decision function of the input samples. The columns correspond
- to the classes in sorted order, as they appear in the attribute
- ``classes_``. Regression and binary classification are special
- cases with ``k == 1``, otherwise ``k == n_classes``.
- """
- check_is_fitted(self)
- # Check data
- X = self._validate_data(
- X,
- accept_sparse=["csr", "csc"],
- dtype=None,
- force_all_finite=False,
- reset=False,
- )
- # Parallel loop
- n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs)
- all_decisions = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
- delayed(_parallel_decision_function)(
- self.estimators_[starts[i] : starts[i + 1]],
- self.estimators_features_[starts[i] : starts[i + 1]],
- X,
- )
- for i in range(n_jobs)
- )
- # Reduce
- decisions = sum(all_decisions) / self.n_estimators
- return decisions
- def _more_tags(self):
- if self.estimator is None:
- estimator = DecisionTreeClassifier()
- else:
- estimator = self.estimator
- return {"allow_nan": _safe_tags(estimator, "allow_nan")}
- class BaggingRegressor(RegressorMixin, BaseBagging):
- """A Bagging regressor.
- A Bagging regressor is an ensemble meta-estimator that fits base
- regressors each on random subsets of the original dataset and then
- aggregates their individual predictions by averaging
- to form a final prediction. Such a meta-estimator can typically be used as
- a way to reduce the variance of a black-box estimator (e.g., a decision
- tree), by introducing randomization into its construction procedure and
- then making an ensemble out of it.
- This algorithm encompasses several works from the literature. When random
- subsets of the dataset are drawn as random subsets of the samples, then
- this algorithm is known as Pasting [1]_. If samples are drawn with
- replacement, then the method is known as Bagging [2]_. When random subsets
- of the dataset are drawn as random subsets of the features, then the method
- is known as Random Subspaces [3]_. Finally, when base estimators are built
- on subsets of both samples and features, then the method is known as
- Random Patches [4]_.
- Read more in the :ref:`User Guide <bagging>`.
- .. versionadded:: 0.15
- Parameters
- ----------
- estimator : object, default=None
- The base estimator to fit on random subsets of the dataset.
- If None, then the base estimator is a
- :class:`~sklearn.tree.DecisionTreeRegressor`.
- .. versionadded:: 1.2
- `base_estimator` was renamed to `estimator`.
- n_estimators : int, default=10
- The number of base estimators in the ensemble.
- max_samples : int or float, default=1.0
- The number of samples to draw from X to train each base estimator (with
- replacement by default, see `bootstrap` for more details).
- - If int, then draw `max_samples` samples.
- - If float, then draw `max_samples * X.shape[0]` samples.
- max_features : int or float, default=1.0
- The number of features to draw from X to train each base estimator (
- without replacement by default, see `bootstrap_features` for more
- details).
- - If int, then draw `max_features` features.
- - If float, then draw `max(1, int(max_features * n_features_in_))` features.
- bootstrap : bool, default=True
- Whether samples are drawn with replacement. If False, sampling
- without replacement is performed.
- bootstrap_features : bool, default=False
- Whether features are drawn with replacement.
- oob_score : bool, default=False
- Whether to use out-of-bag samples to estimate
- the generalization error. Only available if bootstrap=True.
- warm_start : bool, default=False
- When set to True, reuse the solution of the previous call to fit
- and add more estimators to the ensemble, otherwise, just fit
- a whole new ensemble. See :term:`the Glossary <warm_start>`.
- n_jobs : int, default=None
- The number of jobs to run in parallel for both :meth:`fit` and
- :meth:`predict`. ``None`` means 1 unless in a
- :obj:`joblib.parallel_backend` context. ``-1`` means using all
- processors. See :term:`Glossary <n_jobs>` for more details.
- random_state : int, RandomState instance or None, default=None
- Controls the random resampling of the original dataset
- (sample wise and feature wise).
- If the base estimator accepts a `random_state` parameter, a different
- seed is generated for each instance in the ensemble.
- Pass an int for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- verbose : int, default=0
- Controls the verbosity when fitting and predicting.
- base_estimator : object, default="deprecated"
- Use `estimator` instead.
- .. deprecated:: 1.2
- `base_estimator` is deprecated and will be removed in 1.4.
- Use `estimator` instead.
- Attributes
- ----------
- estimator_ : estimator
- The base estimator from which the ensemble is grown.
- .. versionadded:: 1.2
- `base_estimator_` was renamed to `estimator_`.
- base_estimator_ : estimator
- The base estimator from which the ensemble is grown.
- .. deprecated:: 1.2
- `base_estimator_` is deprecated and will be removed in 1.4.
- Use `estimator_` instead.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- estimators_ : list of estimators
- The collection of fitted sub-estimators.
- estimators_samples_ : list of arrays
- The subset of drawn samples (i.e., the in-bag samples) for each base
- estimator. Each subset is defined by an array of the indices selected.
- estimators_features_ : list of arrays
- The subset of drawn features for each base estimator.
- oob_score_ : float
- Score of the training dataset obtained using an out-of-bag estimate.
- This attribute exists only when ``oob_score`` is True.
- oob_prediction_ : ndarray of shape (n_samples,)
- Prediction computed with out-of-bag estimate on the training
- set. If n_estimators is small, it is possible that a data point
- was never left out during the bootstrap. In this case,
- `oob_prediction_` might contain NaN. This attribute exists only
- when ``oob_score`` is True.
- See Also
- --------
- BaggingClassifier : A Bagging classifier.
- References
- ----------
- .. [1] L. Breiman, "Pasting small votes for classification in large
- databases and on-line", Machine Learning, 36(1), 85-103, 1999.
- .. [2] L. Breiman, "Bagging predictors", Machine Learning, 24(2), 123-140,
- 1996.
- .. [3] T. Ho, "The random subspace method for constructing decision
- forests", Pattern Analysis and Machine Intelligence, 20(8), 832-844,
- 1998.
- .. [4] G. Louppe and P. Geurts, "Ensembles on Random Patches", Machine
- Learning and Knowledge Discovery in Databases, 346-361, 2012.
- Examples
- --------
- >>> from sklearn.svm import SVR
- >>> from sklearn.ensemble import BaggingRegressor
- >>> from sklearn.datasets import make_regression
- >>> X, y = make_regression(n_samples=100, n_features=4,
- ... n_informative=2, n_targets=1,
- ... random_state=0, shuffle=False)
- >>> regr = BaggingRegressor(estimator=SVR(),
- ... n_estimators=10, random_state=0).fit(X, y)
- >>> regr.predict([[0, 0, 0, 0]])
- array([-2.8720...])
- """
- def __init__(
- self,
- estimator=None,
- n_estimators=10,
- *,
- max_samples=1.0,
- max_features=1.0,
- bootstrap=True,
- bootstrap_features=False,
- oob_score=False,
- warm_start=False,
- n_jobs=None,
- random_state=None,
- verbose=0,
- base_estimator="deprecated",
- ):
- super().__init__(
- estimator=estimator,
- n_estimators=n_estimators,
- max_samples=max_samples,
- max_features=max_features,
- bootstrap=bootstrap,
- bootstrap_features=bootstrap_features,
- oob_score=oob_score,
- warm_start=warm_start,
- n_jobs=n_jobs,
- random_state=random_state,
- verbose=verbose,
- base_estimator=base_estimator,
- )
- def predict(self, X):
- """Predict regression target for X.
- The predicted regression target of an input sample is computed as the
- mean of the predicted regression targets of the estimators in the ensemble.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Sparse matrices are accepted only if
- they are supported by the base estimator.
- Returns
- -------
- y : ndarray of shape (n_samples,)
- The predicted values.
- """
- check_is_fitted(self)
- # Check data
- X = self._validate_data(
- X,
- accept_sparse=["csr", "csc"],
- dtype=None,
- force_all_finite=False,
- reset=False,
- )
- # Parallel loop
- n_jobs, _, starts = _partition_estimators(self.n_estimators, self.n_jobs)
- all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
- delayed(_parallel_predict_regression)(
- self.estimators_[starts[i] : starts[i + 1]],
- self.estimators_features_[starts[i] : starts[i + 1]],
- X,
- )
- for i in range(n_jobs)
- )
- # Reduce
- y_hat = sum(all_y_hat) / self.n_estimators
- return y_hat
- def _validate_estimator(self):
- """Check the estimator and set the estimator_ attribute."""
- super()._validate_estimator(default=DecisionTreeRegressor())
- def _set_oob_score(self, X, y):
- n_samples = y.shape[0]
- predictions = np.zeros((n_samples,))
- n_predictions = np.zeros((n_samples,))
- for estimator, samples, features in zip(
- self.estimators_, self.estimators_samples_, self.estimators_features_
- ):
- # Create mask for OOB samples
- mask = ~indices_to_mask(samples, n_samples)
- predictions[mask] += estimator.predict((X[mask, :])[:, features])
- n_predictions[mask] += 1
- if (n_predictions == 0).any():
- warn(
- "Some inputs do not have OOB scores. "
- "This probably means too few estimators were used "
- "to compute any reliable oob estimates."
- )
- n_predictions[n_predictions == 0] = 1
- predictions /= n_predictions
- self.oob_prediction_ = predictions
- self.oob_score_ = r2_score(y, predictions)
- def _more_tags(self):
- if self.estimator is None:
- estimator = DecisionTreeRegressor()
- else:
- estimator = self.estimator
- return {"allow_nan": _safe_tags(estimator, "allow_nan")}
|