# Author: Mathieu Blondel <mathieu@mblondel.org>
#         Arnaud Joly <a.joly@ulg.ac.be>
#         Maheshakya Wijewardena <maheshakya.10@cse.mrt.ac.lk>
# License: BSD 3 clause

import warnings
from numbers import Integral, Real

import numpy as np
import scipy.sparse as sp

from .base import (
    BaseEstimator,
    ClassifierMixin,
    MultiOutputMixin,
    RegressorMixin,
    _fit_context,
)
from .utils import check_random_state
from .utils._param_validation import Interval, StrOptions
from .utils.multiclass import class_distribution
from .utils.random import _random_choice_csc
from .utils.stats import _weighted_percentile
from .utils.validation import (
    _check_sample_weight,
    _num_samples,
    check_array,
    check_consistent_length,
    check_is_fitted,
)


class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator):
    """DummyClassifier makes predictions that ignore the input features.

    This classifier serves as a simple baseline to compare against other more
    complex classifiers.

    The specific behavior of the baseline is selected with the `strategy`
    parameter.

    All strategies make predictions that ignore the input feature values passed
    as the `X` argument to `fit` and `predict`. The predictions, however,
    typically depend on values observed in the `y` parameter passed to `fit`.

    Note that the "stratified" and "uniform" strategies lead to
    non-deterministic predictions that can be rendered deterministic by setting
    the `random_state` parameter if needed. The other strategies are naturally
    deterministic and, once fit, always return the same constant prediction
    for any value of `X`.

    Read more in the :ref:`User Guide <dummy_estimators>`.

    .. versionadded:: 0.13

    Parameters
    ----------
    strategy : {"most_frequent", "prior", "stratified", "uniform", \
            "constant"}, default="prior"
        Strategy to use to generate predictions.

        * "most_frequent": the `predict` method always returns the most
          frequent class label in the observed `y` argument passed to `fit`.
          The `predict_proba` method returns the matching one-hot encoded
          vector.

        * "prior": the `predict` method always returns the most frequent
          class label in the observed `y` argument passed to `fit` (like
          "most_frequent"). ``predict_proba`` always returns the empirical
          class distribution of `y`, also known as the empirical class prior
          distribution (illustrated in the Examples section below).

        * "stratified": the `predict_proba` method randomly samples one-hot
          vectors from a multinomial distribution parametrized by the
          empirical class prior probabilities.

          The `predict` method returns the class label which got probability
          one in the one-hot vector of `predict_proba`.

          Each sampled row of both methods is therefore independent and
          identically distributed.

        * "uniform": generates predictions uniformly at random from the list
          of unique classes observed in `y`, i.e. each class has equal
          probability.

        * "constant": always predicts a constant label that is provided by
          the user. This is useful for metrics that evaluate a non-majority
          class.

        .. versionchanged:: 0.24
           The default value of `strategy` has changed to "prior" in version
           0.24.
    random_state : int, RandomState instance or None, default=None
        Controls the randomness to generate the predictions when
        ``strategy='stratified'`` or ``strategy='uniform'``.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    constant : int or str or array-like of shape (n_outputs,), default=None
        The explicit constant as predicted by the "constant" strategy. This
        parameter is useful only for the "constant" strategy.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,) or list of such arrays
        Unique class labels observed in `y`. For multi-output classification
        problems, this attribute is a list of arrays as each output has an
        independent set of possible classes.
    n_classes_ : int or list of int
        Number of labels for each output.
    class_prior_ : ndarray of shape (n_classes,) or list of such arrays
        Frequency of each class observed in `y`. For multioutput
        classification problems, this is computed independently for each
        output.

    n_outputs_ : int
        Number of outputs.

    sparse_output_ : bool
        True if the array returned from predict is to be in sparse CSC format.
        Is automatically set to True if the input `y` is passed in sparse
        format.

    See Also
    --------
    DummyRegressor : Regressor that makes predictions using simple rules.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.dummy import DummyClassifier
    >>> X = np.array([-1, 1, 1, 1])
    >>> y = np.array([0, 1, 1, 1])
    >>> dummy_clf = DummyClassifier(strategy="most_frequent")
    >>> dummy_clf.fit(X, y)
    DummyClassifier(strategy='most_frequent')
    >>> dummy_clf.predict(X)
    array([1, 1, 1, 1])
    >>> dummy_clf.score(X, y)
    0.75
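
    With the "prior" strategy, `predict_proba` returns the empirical class
    frequencies of `y` for every sample. An illustrative sketch: the arrays
    below hold for this toy data, where class 1 occurs in 3 of 4 samples:

    >>> dummy_clf = DummyClassifier(strategy="prior").fit(X, y)
    >>> dummy_clf.predict_proba(X)
    array([[0.25, 0.75],
           [0.25, 0.75],
           [0.25, 0.75],
           [0.25, 0.75]])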
- """
- _parameter_constraints: dict = {
- "strategy": [
- StrOptions({"most_frequent", "prior", "stratified", "uniform", "constant"})
- ],
- "random_state": ["random_state"],
- "constant": [Integral, str, "array-like", None],
- }
- def __init__(self, *, strategy="prior", random_state=None, constant=None):
- self.strategy = strategy
- self.random_state = random_state
- self.constant = constant
- @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """Fit the baseline classifier.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        self : object
            Returns the instance itself.
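
        Examples
        --------
        A minimal sketch of the "constant" strategy's validation on toy
        data: the constant must be one of the labels observed in `y`,
        otherwise `fit` raises the error built below:

        >>> from sklearn.dummy import DummyClassifier
        >>> X, y = [[0], [0], [0]], [0, 1, 1]
        >>> DummyClassifier(strategy="constant", constant=2).fit(X, y)
        Traceback (most recent call last):
        ...
        ValueError: The constant target value must be present in the training data. You provided constant=2. Possible values are: [0, 1].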
- """
- self._strategy = self.strategy
- if self._strategy == "uniform" and sp.issparse(y):
- y = y.toarray()
- warnings.warn(
- (
- "A local copy of the target data has been converted "
- "to a numpy array. Predicting on sparse target data "
- "with the uniform strategy would not save memory "
- "and would be slower."
- ),
- UserWarning,
- )
- self.sparse_output_ = sp.issparse(y)
- if not self.sparse_output_:
- y = np.asarray(y)
- y = np.atleast_1d(y)
- if y.ndim == 1:
- y = np.reshape(y, (-1, 1))
- self.n_outputs_ = y.shape[1]
- check_consistent_length(X, y)
- if sample_weight is not None:
- sample_weight = _check_sample_weight(sample_weight, X)
- if self._strategy == "constant":
- if self.constant is None:
- raise ValueError(
- "Constant target value has to be specified "
- "when the constant strategy is used."
- )
- else:
- constant = np.reshape(np.atleast_1d(self.constant), (-1, 1))
- if constant.shape[0] != self.n_outputs_:
- raise ValueError(
- "Constant target value should have shape (%d, 1)."
- % self.n_outputs_
- )
- (self.classes_, self.n_classes_, self.class_prior_) = class_distribution(
- y, sample_weight
- )
- if self._strategy == "constant":
- for k in range(self.n_outputs_):
- if not any(constant[k][0] == c for c in self.classes_[k]):
- # Checking in case of constant strategy if the constant
- # provided by the user is in y.
- err_msg = (
- "The constant target value must be present in "
- "the training data. You provided constant={}. "
- "Possible values are: {}.".format(
- self.constant, self.classes_[k].tolist()
- )
- )
- raise ValueError(err_msg)
- if self.n_outputs_ == 1:
- self.n_classes_ = self.n_classes_[0]
- self.classes_ = self.classes_[0]
- self.class_prior_ = self.class_prior_[0]
- return self

    def predict(self, X):
        """Perform classification on test vectors X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test data.

        Returns
        -------
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Predicted target values for X.
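
        Examples
        --------
        An illustrative sketch on toy data (the features in `X` are
        ignored):

        >>> import numpy as np
        >>> from sklearn.dummy import DummyClassifier
        >>> X, y = np.zeros((4, 1)), np.array([0, 1, 1, 1])
        >>> clf = DummyClassifier(strategy="constant", constant=0).fit(X, y)
        >>> clf.predict(X)
        array([0, 0, 0, 0])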
- """
- check_is_fitted(self)
- # numpy random_state expects Python int and not long as size argument
- # under Windows
- n_samples = _num_samples(X)
- rs = check_random_state(self.random_state)
- n_classes_ = self.n_classes_
- classes_ = self.classes_
- class_prior_ = self.class_prior_
- constant = self.constant
- if self.n_outputs_ == 1:
- # Get same type even for self.n_outputs_ == 1
- n_classes_ = [n_classes_]
- classes_ = [classes_]
- class_prior_ = [class_prior_]
- constant = [constant]
- # Compute probability only once
- if self._strategy == "stratified":
- proba = self.predict_proba(X)
- if self.n_outputs_ == 1:
- proba = [proba]
- if self.sparse_output_:
- class_prob = None
- if self._strategy in ("most_frequent", "prior"):
- classes_ = [np.array([cp.argmax()]) for cp in class_prior_]
- elif self._strategy == "stratified":
- class_prob = class_prior_
- elif self._strategy == "uniform":
- raise ValueError(
- "Sparse target prediction is not "
- "supported with the uniform strategy"
- )
- elif self._strategy == "constant":
- classes_ = [np.array([c]) for c in constant]
- y = _random_choice_csc(n_samples, classes_, class_prob, self.random_state)
- else:
- if self._strategy in ("most_frequent", "prior"):
- y = np.tile(
- [
- classes_[k][class_prior_[k].argmax()]
- for k in range(self.n_outputs_)
- ],
- [n_samples, 1],
- )
- elif self._strategy == "stratified":
- y = np.vstack(
- [
- classes_[k][proba[k].argmax(axis=1)]
- for k in range(self.n_outputs_)
- ]
- ).T
- elif self._strategy == "uniform":
- ret = [
- classes_[k][rs.randint(n_classes_[k], size=n_samples)]
- for k in range(self.n_outputs_)
- ]
- y = np.vstack(ret).T
- elif self._strategy == "constant":
- y = np.tile(self.constant, (n_samples, 1))
- if self.n_outputs_ == 1:
- y = np.ravel(y)
- return y

    def predict_proba(self, X):
        """
        Return probability estimates for the test vectors X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test data.

        Returns
        -------
        P : ndarray of shape (n_samples, n_classes) or list of such arrays
            Returns the probability of the sample for each class in
            the model, where classes are ordered arithmetically, for each
            output.
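
        Examples
        --------
        An illustrative sketch on toy data: with the "most_frequent"
        strategy, the returned probabilities are the one-hot encoding of the
        majority class:

        >>> import numpy as np
        >>> from sklearn.dummy import DummyClassifier
        >>> X, y = np.zeros((3, 1)), np.array([0, 1, 1])
        >>> clf = DummyClassifier(strategy="most_frequent").fit(X, y)
        >>> clf.predict_proba(X)
        array([[0., 1.],
               [0., 1.],
               [0., 1.]])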
- """
- check_is_fitted(self)
- # numpy random_state expects Python int and not long as size argument
- # under Windows
- n_samples = _num_samples(X)
- rs = check_random_state(self.random_state)
- n_classes_ = self.n_classes_
- classes_ = self.classes_
- class_prior_ = self.class_prior_
- constant = self.constant
- if self.n_outputs_ == 1:
- # Get same type even for self.n_outputs_ == 1
- n_classes_ = [n_classes_]
- classes_ = [classes_]
- class_prior_ = [class_prior_]
- constant = [constant]
- P = []
- for k in range(self.n_outputs_):
- if self._strategy == "most_frequent":
- ind = class_prior_[k].argmax()
- out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)
- out[:, ind] = 1.0
- elif self._strategy == "prior":
- out = np.ones((n_samples, 1)) * class_prior_[k]
- elif self._strategy == "stratified":
- out = rs.multinomial(1, class_prior_[k], size=n_samples)
- out = out.astype(np.float64)
- elif self._strategy == "uniform":
- out = np.ones((n_samples, n_classes_[k]), dtype=np.float64)
- out /= n_classes_[k]
- elif self._strategy == "constant":
- ind = np.where(classes_[k] == constant[k])
- out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)
- out[:, ind] = 1.0
- P.append(out)
- if self.n_outputs_ == 1:
- P = P[0]
- return P

    def predict_log_proba(self, X):
        """
        Return log probability estimates for the test vectors X.

        Parameters
        ----------
        X : {array-like, object with finite length or shape}
            Test data.

        Returns
        -------
        P : ndarray of shape (n_samples, n_classes) or list of such arrays
            Returns the log probability of the sample for each class in
            the model, where classes are ordered arithmetically for each
            output.
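
        Examples
        --------
        An illustrative sketch on toy data: with the "prior" strategy this
        is the elementwise log of the class prior, here log(0.5) for both
        classes:

        >>> import numpy as np
        >>> from sklearn.dummy import DummyClassifier
        >>> X, y = np.zeros((2, 1)), np.array([0, 1])
        >>> clf = DummyClassifier(strategy="prior").fit(X, y)
        >>> clf.predict_log_proba(X)
        array([[-0.69314718, -0.69314718],
               [-0.69314718, -0.69314718]])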
- """
- proba = self.predict_proba(X)
- if self.n_outputs_ == 1:
- return np.log(proba)
- else:
- return [np.log(p) for p in proba]

    def _more_tags(self):
        return {
            "poor_score": True,
            "no_validation": True,
            "_xfail_checks": {
                "check_methods_subset_invariance": "fails for the predict method",
                "check_methods_sample_order_invariance": (
                    "fails for the predict method"
                ),
            },
        }

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : None or array-like of shape (n_samples, n_features)
            Test samples. Passing None as test samples gives the same result
            as passing real test samples, since DummyClassifier
            operates independently of the sampled observations.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) w.r.t. y.
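
        Examples
        --------
        An illustrative sketch on toy data: since the features are ignored,
        `X` may be None at scoring time:

        >>> import numpy as np
        >>> from sklearn.dummy import DummyClassifier
        >>> X, y = np.zeros((4, 1)), np.array([0, 1, 1, 1])
        >>> clf = DummyClassifier(strategy="most_frequent").fit(X, y)
        >>> clf.score(None, y)
        0.75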
- """
- if X is None:
- X = np.zeros(shape=(len(y), 1))
- return super().score(X, y, sample_weight)


class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
    """Regressor that makes predictions using simple rules.

    This regressor is useful as a simple baseline to compare with other
    (real) regressors. Do not use it for real problems.

    Read more in the :ref:`User Guide <dummy_estimators>`.

    .. versionadded:: 0.13

    Parameters
    ----------
    strategy : {"mean", "median", "quantile", "constant"}, default="mean"
        Strategy to use to generate predictions.

        * "mean": always predicts the mean of the training set.
        * "median": always predicts the median of the training set.
        * "quantile": always predicts a specified quantile of the training
          set, provided with the `quantile` parameter (see the Examples
          section below).
        * "constant": always predicts a constant value that is provided by
          the user.

    constant : int or float or array-like of shape (n_outputs,), default=None
        The explicit constant as predicted by the "constant" strategy. This
        parameter is useful only for the "constant" strategy.

    quantile : float in [0.0, 1.0], default=None
        The quantile to predict using the "quantile" strategy. A quantile of
        0.5 corresponds to the median, while 0.0 to the minimum and 1.0 to
        the maximum.

    Attributes
    ----------
    constant_ : ndarray of shape (1, n_outputs)
        Mean or median or quantile of the training targets or constant value
        given by the user.

    n_outputs_ : int
        Number of outputs.

    See Also
    --------
    DummyClassifier : Classifier that makes predictions using simple rules.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.dummy import DummyRegressor
    >>> X = np.array([1.0, 2.0, 3.0, 4.0])
    >>> y = np.array([2.0, 3.0, 5.0, 10.0])
    >>> dummy_regr = DummyRegressor(strategy="mean")
    >>> dummy_regr.fit(X, y)
    DummyRegressor()
    >>> dummy_regr.predict(X)
    array([5., 5., 5., 5.])
    >>> dummy_regr.score(X, y)
    0.0
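
    The "quantile" strategy predicts a fixed quantile of the training
    targets. As an illustrative sketch, 6.25 below is the 0.75 quantile of
    this toy `y` under NumPy's default linear interpolation:

    >>> dummy_regr = DummyRegressor(strategy="quantile", quantile=0.75)
    >>> dummy_regr.fit(X, y).predict(X)
    array([6.25, 6.25, 6.25, 6.25])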
- """
- _parameter_constraints: dict = {
- "strategy": [StrOptions({"mean", "median", "quantile", "constant"})],
- "quantile": [Interval(Real, 0.0, 1.0, closed="both"), None],
- "constant": [
- Interval(Real, None, None, closed="neither"),
- "array-like",
- None,
- ],
- }
- def __init__(self, *, strategy="mean", constant=None, quantile=None):
- self.strategy = strategy
- self.constant = constant
- self.quantile = quantile

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """Fit the baseline regressor.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        self : object
            Fitted estimator.
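
        Examples
        --------
        An illustrative sketch on toy data: with `sample_weight`, the "mean"
        strategy stores a weighted average in `constant_`:

        >>> import numpy as np
        >>> from sklearn.dummy import DummyRegressor
        >>> X, y = np.zeros((2, 1)), np.array([1.0, 3.0])
        >>> reg = DummyRegressor(strategy="mean").fit(X, y, sample_weight=[3, 1])
        >>> reg.constant_
        array([[1.5]])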
- """
- y = check_array(y, ensure_2d=False, input_name="y")
- if len(y) == 0:
- raise ValueError("y must not be empty.")
- if y.ndim == 1:
- y = np.reshape(y, (-1, 1))
- self.n_outputs_ = y.shape[1]
- check_consistent_length(X, y, sample_weight)
- if sample_weight is not None:
- sample_weight = _check_sample_weight(sample_weight, X)
- if self.strategy == "mean":
- self.constant_ = np.average(y, axis=0, weights=sample_weight)
- elif self.strategy == "median":
- if sample_weight is None:
- self.constant_ = np.median(y, axis=0)
- else:
- self.constant_ = [
- _weighted_percentile(y[:, k], sample_weight, percentile=50.0)
- for k in range(self.n_outputs_)
- ]
- elif self.strategy == "quantile":
- if self.quantile is None:
- raise ValueError(
- "When using `strategy='quantile', you have to specify the desired "
- "quantile in the range [0, 1]."
- )
- percentile = self.quantile * 100.0
- if sample_weight is None:
- self.constant_ = np.percentile(y, axis=0, q=percentile)
- else:
- self.constant_ = [
- _weighted_percentile(y[:, k], sample_weight, percentile=percentile)
- for k in range(self.n_outputs_)
- ]
- elif self.strategy == "constant":
- if self.constant is None:
- raise TypeError(
- "Constant target value has to be specified "
- "when the constant strategy is used."
- )
- self.constant_ = check_array(
- self.constant,
- accept_sparse=["csr", "csc", "coo"],
- ensure_2d=False,
- ensure_min_samples=0,
- )
- if self.n_outputs_ != 1 and self.constant_.shape[0] != y.shape[1]:
- raise ValueError(
- "Constant target value should have shape (%d, 1)." % y.shape[1]
- )
- self.constant_ = np.reshape(self.constant_, (1, -1))
- return self

    def predict(self, X, return_std=False):
        """Perform regression on test vectors X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test data.

        return_std : bool, default=False
            Whether to return the standard deviation of posterior prediction.
            All zeros in this case.

            .. versionadded:: 0.20

        Returns
        -------
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Predicted target values for X.

        y_std : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Standard deviation of predictive distribution of query points.
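
        Examples
        --------
        An illustrative sketch on toy data: with `return_std=True`, a vector
        of zeros is returned alongside the constant predictions:

        >>> import numpy as np
        >>> from sklearn.dummy import DummyRegressor
        >>> X, y = np.zeros((2, 1)), np.array([3.0, 5.0])
        >>> DummyRegressor().fit(X, y).predict(X, return_std=True)
        (array([4., 4.]), array([0., 0.]))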
- """
- check_is_fitted(self)
- n_samples = _num_samples(X)
- y = np.full(
- (n_samples, self.n_outputs_),
- self.constant_,
- dtype=np.array(self.constant_).dtype,
- )
- y_std = np.zeros((n_samples, self.n_outputs_))
- if self.n_outputs_ == 1:
- y = np.ravel(y)
- y_std = np.ravel(y_std)
- return (y, y_std) if return_std else y

    def _more_tags(self):
        return {"poor_score": True, "no_validation": True}

    def score(self, X, y, sample_weight=None):
        """Return the coefficient of determination R^2 of the prediction.

        The coefficient R^2 is defined as `(1 - u/v)`, where `u` is the
        residual sum of squares `((y_true - y_pred) ** 2).sum()` and `v` is
        the total sum of squares `((y_true - y_true.mean()) ** 2).sum()`. The
        best possible score is 1.0 and it can be negative (because the model
        can be arbitrarily worse). A constant model that always predicts the
        expected value of y, disregarding the input features, would get a
        R^2 score of 0.0.

        Parameters
        ----------
        X : None or array-like of shape (n_samples, n_features)
            Test samples. Passing None as test samples gives the same result
            as passing real test samples, since `DummyRegressor`
            operates independently of the sampled observations.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True values for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            R^2 of `self.predict(X)` w.r.t. y.
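
        Examples
        --------
        An illustrative sketch on toy data: the "mean" strategy scores
        R^2 = 0.0 on its own training targets by construction:

        >>> import numpy as np
        >>> from sklearn.dummy import DummyRegressor
        >>> X, y = np.zeros((3, 1)), np.array([1.0, 2.0, 3.0])
        >>> DummyRegressor(strategy="mean").fit(X, y).score(None, y)
        0.0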
- """
- if X is None:
- X = np.zeros(shape=(len(y), 1))
- return super().score(X, y, sample_weight)
|