- """
- Soft Voting/Majority Rule classifier and Voting regressor.
- This module contains:
- - A Soft Voting/Majority Rule classifier for classification estimators.
- - A Voting regressor for regression estimators.
- """
- # Authors: Sebastian Raschka <se.raschka@gmail.com>,
- # Gilles Louppe <g.louppe@gmail.com>,
- # Ramil Nugmanov <stsouko@live.ru>
- # Mohamed Ali Jamaoui <m.ali.jamaoui@gmail.com>
- #
- # License: BSD 3 clause

from abc import abstractmethod
from numbers import Integral

import numpy as np

from ..base import (
    ClassifierMixin,
    RegressorMixin,
    TransformerMixin,
    _fit_context,
    clone,
)
from ..exceptions import NotFittedError
from ..preprocessing import LabelEncoder
from ..utils import Bunch
from ..utils._estimator_html_repr import _VisualBlock
from ..utils._param_validation import StrOptions
from ..utils.metaestimators import available_if
from ..utils.multiclass import check_classification_targets
from ..utils.parallel import Parallel, delayed
from ..utils.validation import _check_feature_names_in, check_is_fitted, column_or_1d

from ._base import _BaseHeterogeneousEnsemble, _fit_single_estimator


class _BaseVoting(TransformerMixin, _BaseHeterogeneousEnsemble):
    """Base class for voting.

    Warning: This class should not be used directly. Use derived classes
    instead.
    """

    _parameter_constraints: dict = {
        "estimators": [list],
        "weights": ["array-like", None],
        "n_jobs": [None, Integral],
        "verbose": ["verbose"],
    }

    def _log_message(self, name, idx, total):
        if not self.verbose:
            return None
        return f"({idx} of {total}) Processing {name}"

    @property
    def _weights_not_none(self):
- """Get the weights of not `None` estimators."""
        if self.weights is None:
            return None
        return [w for est, w in zip(self.estimators, self.weights) if est[1] != "drop"]

    def _predict(self, X):
        """Collect results from clf.predict calls."""
        return np.asarray([est.predict(X) for est in self.estimators_]).T

    @abstractmethod
    def fit(self, X, y, sample_weight=None):
        """Get common fit operations."""
        names, clfs = self._validate_estimators()

        if self.weights is not None and len(self.weights) != len(self.estimators):
            raise ValueError(
                "Number of `estimators` and weights must be equal; got"
                f" {len(self.weights)} weights, {len(self.estimators)} estimators"
            )

        self.estimators_ = Parallel(n_jobs=self.n_jobs)(
            delayed(_fit_single_estimator)(
                clone(clf),
                X,
                y,
                sample_weight=sample_weight,
                message_clsname="Voting",
                message=self._log_message(names[idx], idx + 1, len(clfs)),
            )
            for idx, clf in enumerate(clfs)
            if clf != "drop"
        )

        self.named_estimators_ = Bunch()

        # Uses 'drop' as a placeholder for dropped estimators.
        est_iter = iter(self.estimators_)
        for name, est in self.estimators:
            current_est = est if est == "drop" else next(est_iter)
            self.named_estimators_[name] = current_est

            if hasattr(current_est, "feature_names_in_"):
                self.feature_names_in_ = current_est.feature_names_in_

        return self

    def fit_transform(self, X, y=None, **fit_params):
        """Return class labels or probabilities for each estimator.

        Fits the estimators on `(X, y)` and then returns their predictions
        for X.

        Parameters
        ----------
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features)
            Input samples.

        y : ndarray of shape (n_samples,), default=None
            Target values (None for unsupervised transformations).

        **fit_params : dict
            Additional fit parameters.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_features_new)
            Transformed array.
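
        Examples
        --------
        A brief sketch on illustrative toy data (the data and estimator names
        here are assumptions for this example, not documented behavior):
        `fit_transform(X, y)` behaves like `fit(X, y).transform(X)`.

        >>> import numpy as np
        >>> from sklearn.ensemble import VotingClassifier
        >>> from sklearn.linear_model import LogisticRegression
        >>> from sklearn.naive_bayes import GaussianNB
        >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
        >>> y = np.array([1, 1, 2, 2])
        >>> eclf = VotingClassifier(
        ...     estimators=[('lr', LogisticRegression()), ('gnb', GaussianNB())]
        ... )
        >>> # With the default voting='hard', one column per classifier.
        >>> eclf.fit_transform(X, y).shape
        (4, 2)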
- """
- return super().fit_transform(X, y, **fit_params)

    @property
    def n_features_in_(self):
        """Number of features seen during :term:`fit`."""
        # For consistency with other estimators we raise an AttributeError so
        # that hasattr() fails if the estimator isn't fitted.
        try:
            check_is_fitted(self)
        except NotFittedError as nfe:
            raise AttributeError(
                "{} object has no n_features_in_ attribute.".format(
                    self.__class__.__name__
                )
            ) from nfe

        return self.estimators_[0].n_features_in_

    def _sk_visual_block_(self):
        names, estimators = zip(*self.estimators)
        return _VisualBlock("parallel", estimators, names=names)

    def _more_tags(self):
        return {"preserves_dtype": []}


class VotingClassifier(ClassifierMixin, _BaseVoting):
    """Soft Voting/Majority Rule classifier for unfitted estimators.

    Read more in the :ref:`User Guide <voting_classifier>`.

    .. versionadded:: 0.17

    Parameters
    ----------
    estimators : list of (str, estimator) tuples
        Invoking the ``fit`` method on the ``VotingClassifier`` will fit clones
        of those original estimators that will be stored in the class attribute
        ``self.estimators_``. An estimator can be set to ``'drop'`` using
        :meth:`set_params`.

        .. versionchanged:: 0.21
            ``'drop'`` is accepted. Using None was deprecated in 0.22 and
            support was removed in 0.24.

    voting : {'hard', 'soft'}, default='hard'
        If 'hard', uses predicted class labels for majority rule voting.
        Else if 'soft', predicts the class label based on the argmax of
        the sums of the predicted probabilities, which is recommended for
        an ensemble of well-calibrated classifiers.

    weights : array-like of shape (n_classifiers,), default=None
        Sequence of weights (`float` or `int`) to weight the occurrences of
        predicted class labels (`hard` voting) or class probabilities
        before averaging (`soft` voting). Uses uniform weights if `None`.

    n_jobs : int, default=None
        The number of jobs to run in parallel for ``fit``.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

        .. versionadded:: 0.18

    flatten_transform : bool, default=True
        Affects the shape of the transform output only when voting='soft'.
        If voting='soft' and flatten_transform=True, the transform method
        returns a matrix with shape (n_samples, n_classifiers * n_classes).
        If flatten_transform=False, it returns
        (n_classifiers, n_samples, n_classes).

    verbose : bool, default=False
        If True, the time elapsed while fitting will be printed as it
        is completed.

        .. versionadded:: 0.23

    Attributes
    ----------
    estimators_ : list of classifiers
        The collection of fitted sub-estimators as defined in ``estimators``
        that are not 'drop'.

    named_estimators_ : :class:`~sklearn.utils.Bunch`
        Attribute to access any fitted sub-estimators by name.

        .. versionadded:: 0.20

    le_ : :class:`~sklearn.preprocessing.LabelEncoder`
        Transformer used to encode the labels during fit and decode during
        prediction.

    classes_ : ndarray of shape (n_classes,)
        The class labels.

    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying classifier exposes such an attribute when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Only defined if the
        underlying estimators expose such an attribute when fit.

        .. versionadded:: 1.0

    See Also
    --------
    VotingRegressor : Prediction voting regressor.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.naive_bayes import GaussianNB
    >>> from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    >>> clf1 = LogisticRegression(multi_class='multinomial', random_state=1)
    >>> clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
    >>> clf3 = GaussianNB()
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> eclf1 = VotingClassifier(estimators=[
    ...         ('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')
    >>> eclf1 = eclf1.fit(X, y)
    >>> print(eclf1.predict(X))
    [1 1 1 2 2 2]
    >>> np.array_equal(eclf1.named_estimators_.lr.predict(X),
    ...                eclf1.named_estimators_['lr'].predict(X))
    True
    >>> eclf2 = VotingClassifier(estimators=[
    ...         ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    ...         voting='soft')
    >>> eclf2 = eclf2.fit(X, y)
    >>> print(eclf2.predict(X))
    [1 1 1 2 2 2]

    To drop an estimator, :meth:`set_params` can be used to remove it. Here we
    drop one of the estimators, resulting in 2 fitted estimators:

    >>> eclf2 = eclf2.set_params(lr='drop')
    >>> eclf2 = eclf2.fit(X, y)
    >>> len(eclf2.estimators_)
    2

    Setting `flatten_transform=True` with `voting='soft'` flattens the output
    shape of `transform`:

    >>> eclf3 = VotingClassifier(estimators=[
    ...         ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
    ...         voting='soft', weights=[2, 1, 1],
    ...         flatten_transform=True)
    >>> eclf3 = eclf3.fit(X, y)
    >>> print(eclf3.predict(X))
    [1 1 1 2 2 2]
    >>> print(eclf3.transform(X).shape)
    (6, 6)
    """

    _parameter_constraints: dict = {
        **_BaseVoting._parameter_constraints,
        "voting": [StrOptions({"hard", "soft"})],
        "flatten_transform": ["boolean"],
    }

    def __init__(
        self,
        estimators,
        *,
        voting="hard",
        weights=None,
        n_jobs=None,
        flatten_transform=True,
        verbose=False,
    ):
        super().__init__(estimators=estimators)
        self.voting = voting
        self.weights = weights
        self.n_jobs = n_jobs
        self.flatten_transform = flatten_transform
        self.verbose = verbose

    @_fit_context(
        # estimators in VotingClassifier.estimators are not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y, sample_weight=None):
        """Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if all underlying estimators
            support sample weights.

            .. versionadded:: 0.18

        Returns
        -------
        self : object
            Returns the instance itself.
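
        Examples
        --------
        A minimal, illustrative sketch (the toy data and estimator names are
        assumptions for this example):

        >>> import numpy as np
        >>> from sklearn.ensemble import VotingClassifier
        >>> from sklearn.linear_model import LogisticRegression
        >>> from sklearn.naive_bayes import GaussianNB
        >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
        >>> y = np.array([1, 1, 2, 2])
        >>> eclf = VotingClassifier(
        ...     estimators=[('lr', LogisticRegression()), ('gnb', GaussianNB())]
        ... )
        >>> # sample_weight is forwarded to every underlying estimator, so it
        >>> # is only valid when all of them accept it.
        >>> eclf = eclf.fit(X, y, sample_weight=np.array([1.0, 1.0, 2.0, 2.0]))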
- """
- check_classification_targets(y)
- if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
- raise NotImplementedError(
- "Multilabel and multi-output classification is not supported."
- )
- self.le_ = LabelEncoder().fit(y)
- self.classes_ = self.le_.classes_
- transformed_y = self.le_.transform(y)
- return super().fit(X, transformed_y, sample_weight)

    def predict(self, X):
        """Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        maj : array-like of shape (n_samples,)
            Predicted class labels.
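
        Examples
        --------
        With `voting='soft'`, `predict` is the argmax of the averaged
        probabilities, mapped back to the original class labels. A sketch on
        illustrative toy data (names and values are assumptions):

        >>> import numpy as np
        >>> from sklearn.ensemble import VotingClassifier
        >>> from sklearn.linear_model import LogisticRegression
        >>> from sklearn.naive_bayes import GaussianNB
        >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
        >>> y = np.array([1, 1, 2, 2])
        >>> eclf = VotingClassifier(
        ...     estimators=[('lr', LogisticRegression()), ('gnb', GaussianNB())],
        ...     voting='soft',
        ... ).fit(X, y)
        >>> np.array_equal(
        ...     eclf.predict(X),
        ...     eclf.classes_[np.argmax(eclf.predict_proba(X), axis=1)],
        ... )
        True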
- """
- check_is_fitted(self)
- if self.voting == "soft":
- maj = np.argmax(self.predict_proba(X), axis=1)
- else: # 'hard' voting
- predictions = self._predict(X)
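            # For each sample, `x` below holds one integer-encoded label per
            # estimator; `np.bincount(x, weights=...)` adds each estimator's
            # weight to the bin of the label it voted for, and `np.argmax`
            # picks the label with the largest total weight.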
            maj = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self._weights_not_none)),
                axis=1,
                arr=predictions,
            )

        maj = self.le_.inverse_transform(maj)

        return maj

    def _collect_probas(self, X):
        """Collect results from clf.predict_proba calls."""
        return np.asarray([clf.predict_proba(X) for clf in self.estimators_])

    def _check_voting(self):
        if self.voting == "hard":
            raise AttributeError(
                f"predict_proba is not available when voting={repr(self.voting)}"
            )
        return True

    @available_if(_check_voting)
    def predict_proba(self, X):
        """Compute probabilities of possible outcomes for samples in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        avg : array-like of shape (n_samples, n_classes)
            Weighted average probability for each class per sample.
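
        Examples
        --------
        A sketch on illustrative toy data (names and values are assumptions
        for this example):

        >>> import numpy as np
        >>> from sklearn.ensemble import VotingClassifier
        >>> from sklearn.linear_model import LogisticRegression
        >>> from sklearn.naive_bayes import GaussianNB
        >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
        >>> y = np.array([1, 1, 2, 2])
        >>> eclf = VotingClassifier(
        ...     estimators=[('lr', LogisticRegression()), ('gnb', GaussianNB())],
        ...     voting='soft',
        ...     weights=[2, 1],
        ... ).fit(X, y)
        >>> # One row per sample, one column per class; each row sums to 1
        >>> # because the weighted average is normalized by the weights.
        >>> eclf.predict_proba(X).shape
        (4, 2)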
- """
- check_is_fitted(self)
- avg = np.average(
- self._collect_probas(X), axis=0, weights=self._weights_not_none
- )
- return avg

    def transform(self, X):
        """Return class labels or probabilities for X for each estimator.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        Returns
        -------
        probabilities_or_labels
            If `voting='soft'` and `flatten_transform=True`:
                returns ndarray of shape (n_samples, n_classifiers * n_classes),
                being class probabilities calculated by each classifier.
            If `voting='soft'` and `flatten_transform=False`:
                ndarray of shape (n_classifiers, n_samples, n_classes)
            If `voting='hard'`:
                ndarray of shape (n_samples, n_classifiers), being
                class labels predicted by each classifier.
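
        Examples
        --------
        A shape-only sketch on illustrative toy data (2 classifiers,
        2 classes, 4 samples; names and values are assumptions):

        >>> import numpy as np
        >>> from sklearn.ensemble import VotingClassifier
        >>> from sklearn.linear_model import LogisticRegression
        >>> from sklearn.naive_bayes import GaussianNB
        >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
        >>> y = np.array([1, 1, 2, 2])
        >>> estimators = [('lr', LogisticRegression()), ('gnb', GaussianNB())]
        >>> soft = VotingClassifier(estimators, voting='soft').fit(X, y)
        >>> soft.transform(X).shape  # (n_samples, n_classifiers * n_classes)
        (4, 4)
        >>> hard = VotingClassifier(estimators, voting='hard').fit(X, y)
        >>> hard.transform(X).shape  # (n_samples, n_classifiers)
        (4, 2)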
- """
- check_is_fitted(self)
- if self.voting == "soft":
- probas = self._collect_probas(X)
- if not self.flatten_transform:
- return probas
- return np.hstack(probas)
- else:
- return self._predict(X)

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Not used, present here for API consistency by convention.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
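
        Examples
        --------
        A sketch on illustrative toy data (names and values are assumptions);
        with `voting='soft'` one name is produced per (classifier, class)
        pair:

        >>> import numpy as np
        >>> from sklearn.ensemble import VotingClassifier
        >>> from sklearn.linear_model import LogisticRegression
        >>> from sklearn.naive_bayes import GaussianNB
        >>> X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
        >>> y = np.array([1, 1, 2, 2])
        >>> eclf = VotingClassifier(
        ...     estimators=[('lr', LogisticRegression()), ('gnb', GaussianNB())],
        ...     voting='soft',
        ... ).fit(X, y)
        >>> eclf.get_feature_names_out().tolist()
        ['votingclassifier_lr0', 'votingclassifier_lr1', 'votingclassifier_gnb0', 'votingclassifier_gnb1']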
- """
- check_is_fitted(self, "n_features_in_")
- if self.voting == "soft" and not self.flatten_transform:
- raise ValueError(
- "get_feature_names_out is not supported when `voting='soft'` and "
- "`flatten_transform=False`"
- )
- _check_feature_names_in(self, input_features, generate_names=False)
- class_name = self.__class__.__name__.lower()
- active_names = [name for name, est in self.estimators if est != "drop"]
- if self.voting == "hard":
- return np.asarray(
- [f"{class_name}_{name}" for name in active_names], dtype=object
- )
- # voting == "soft"
- n_classes = len(self.classes_)
- names_out = [
- f"{class_name}_{name}{i}" for name in active_names for i in range(n_classes)
- ]
- return np.asarray(names_out, dtype=object)


class VotingRegressor(RegressorMixin, _BaseVoting):
    """Prediction voting regressor for unfitted estimators.

    A voting regressor is an ensemble meta-estimator that fits several base
    regressors, each on the whole dataset. Then it averages the individual
    predictions to form a final prediction.

    Read more in the :ref:`User Guide <voting_regressor>`.

    .. versionadded:: 0.21

    Parameters
    ----------
    estimators : list of (str, estimator) tuples
        Invoking the ``fit`` method on the ``VotingRegressor`` will fit clones
        of those original estimators that will be stored in the class attribute
        ``self.estimators_``. An estimator can be set to ``'drop'`` using
        :meth:`set_params`.

        .. versionchanged:: 0.21
            ``'drop'`` is accepted. Using None was deprecated in 0.22 and
            support was removed in 0.24.

    weights : array-like of shape (n_regressors,), default=None
        Sequence of weights (`float` or `int`) to weight the occurrences of
        predicted values before averaging. Uses uniform weights if `None`.

    n_jobs : int, default=None
        The number of jobs to run in parallel for ``fit``.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : bool, default=False
        If True, the time elapsed while fitting will be printed as it
        is completed.

        .. versionadded:: 0.23

    Attributes
    ----------
    estimators_ : list of regressors
        The collection of fitted sub-estimators as defined in ``estimators``
        that are not 'drop'.

    named_estimators_ : :class:`~sklearn.utils.Bunch`
        Attribute to access any fitted sub-estimators by name.

        .. versionadded:: 0.20

    n_features_in_ : int
        Number of features seen during :term:`fit`. Only defined if the
        underlying regressor exposes such an attribute when fit.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Only defined if the
        underlying estimators expose such an attribute when fit.

        .. versionadded:: 1.0

    See Also
    --------
    VotingClassifier : Soft Voting/Majority Rule classifier.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import LinearRegression
    >>> from sklearn.ensemble import RandomForestRegressor
    >>> from sklearn.ensemble import VotingRegressor
    >>> from sklearn.neighbors import KNeighborsRegressor
    >>> r1 = LinearRegression()
    >>> r2 = RandomForestRegressor(n_estimators=10, random_state=1)
    >>> r3 = KNeighborsRegressor()
    >>> X = np.array([[1, 1], [2, 4], [3, 9], [4, 16], [5, 25], [6, 36]])
    >>> y = np.array([2, 6, 12, 20, 30, 42])
    >>> er = VotingRegressor([('lr', r1), ('rf', r2), ('r3', r3)])
    >>> print(er.fit(X, y).predict(X))
    [ 6.8... 8.4... 12.5... 17.8... 26... 34...]

    In the following example, we drop the `'lr'` estimator with
    :meth:`~VotingRegressor.set_params` and fit the remaining two estimators:

    >>> er = er.set_params(lr='drop')
    >>> er = er.fit(X, y)
    >>> len(er.estimators_)
    2
    """

    def __init__(self, estimators, *, weights=None, n_jobs=None, verbose=False):
        super().__init__(estimators=estimators)
        self.weights = weights
        self.n_jobs = n_jobs
        self.verbose = verbose

    @_fit_context(
        # estimators in VotingRegressor.estimators are not validated yet
        prefer_skip_nested_validation=False
    )
    def fit(self, X, y, sample_weight=None):
        """Fit the estimators.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights. If None, then samples are equally weighted.
            Note that this is supported only if all underlying estimators
            support sample weights.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        y = column_or_1d(y, warn=True)
        return super().fit(X, y, sample_weight)

    def predict(self, X):
        """Predict regression target for X.

        The predicted regression target of an input sample is computed as the
        mean predicted regression targets of the estimators in the ensemble.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            The predicted values.
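
        Examples
        --------
        A sketch on illustrative toy data (names and values are assumptions):
        the prediction is the (optionally weighted) average of the individual
        regressors' predictions.

        >>> import numpy as np
        >>> from sklearn.ensemble import VotingRegressor
        >>> from sklearn.linear_model import LinearRegression
        >>> from sklearn.dummy import DummyRegressor
        >>> X = np.array([[1], [2], [3], [4]])
        >>> y = np.array([2.0, 4.0, 6.0, 8.0])
        >>> er = VotingRegressor(
        ...     [('lr', LinearRegression()), ('dummy', DummyRegressor())]
        ... ).fit(X, y)
        >>> per_estimator = np.column_stack(
        ...     [est.predict(X) for est in er.estimators_]
        ... )
        >>> np.allclose(er.predict(X), per_estimator.mean(axis=1))
        True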
- """
- check_is_fitted(self)
- return np.average(self._predict(X), axis=1, weights=self._weights_not_none)

    def transform(self, X):
        """Return predictions for X for each estimator.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        predictions : ndarray of shape (n_samples, n_regressors)
            Values predicted by each regressor.
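
        Examples
        --------
        A shape-only sketch on illustrative toy data (2 regressors,
        4 samples; names and values are assumptions):

        >>> import numpy as np
        >>> from sklearn.ensemble import VotingRegressor
        >>> from sklearn.linear_model import LinearRegression
        >>> from sklearn.dummy import DummyRegressor
        >>> X = np.array([[1], [2], [3], [4]])
        >>> y = np.array([2.0, 4.0, 6.0, 8.0])
        >>> er = VotingRegressor(
        ...     [('lr', LinearRegression()), ('dummy', DummyRegressor())]
        ... ).fit(X, y)
        >>> er.transform(X).shape  # one column per fitted regressor
        (4, 2)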
- """
- check_is_fitted(self)
- return self._predict(X)

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Not used, present here for API consistency by convention.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
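
        Examples
        --------
        A sketch on illustrative toy data (names and values are assumptions);
        one name is produced per non-dropped regressor:

        >>> import numpy as np
        >>> from sklearn.ensemble import VotingRegressor
        >>> from sklearn.linear_model import LinearRegression
        >>> from sklearn.dummy import DummyRegressor
        >>> X = np.array([[1], [2], [3], [4]])
        >>> y = np.array([2.0, 4.0, 6.0, 8.0])
        >>> er = VotingRegressor(
        ...     [('lr', LinearRegression()), ('dummy', DummyRegressor())]
        ... ).fit(X, y)
        >>> er.get_feature_names_out().tolist()
        ['votingregressor_lr', 'votingregressor_dummy']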
- """
- check_is_fitted(self, "n_features_in_")
- _check_feature_names_in(self, input_features, generate_names=False)
- class_name = self.__class__.__name__.lower()
- return np.asarray(
- [f"{class_name}_{name}" for name, est in self.estimators if est != "drop"],
- dtype=object,
- )