dummy.py

# Author: Mathieu Blondel <mathieu@mblondel.org>
#         Arnaud Joly <a.joly@ulg.ac.be>
#         Maheshakya Wijewardena <maheshakya.10@cse.mrt.ac.lk>
# License: BSD 3 clause

import warnings
from numbers import Integral, Real

import numpy as np
import scipy.sparse as sp

from .base import (
    BaseEstimator,
    ClassifierMixin,
    MultiOutputMixin,
    RegressorMixin,
    _fit_context,
)
from .utils import check_random_state
from .utils._param_validation import Interval, StrOptions
from .utils.multiclass import class_distribution
from .utils.random import _random_choice_csc
from .utils.stats import _weighted_percentile
from .utils.validation import (
    _check_sample_weight,
    _num_samples,
    check_array,
    check_consistent_length,
    check_is_fitted,
)


class DummyClassifier(MultiOutputMixin, ClassifierMixin, BaseEstimator):
    """DummyClassifier makes predictions that ignore the input features.

    This classifier serves as a simple baseline to compare against other more
    complex classifiers.

    The specific behavior of the baseline is selected with the `strategy`
    parameter.

    All strategies make predictions that ignore the input feature values passed
    as the `X` argument to `fit` and `predict`. The predictions, however,
    typically depend on values observed in the `y` parameter passed to `fit`.

    Note that the "stratified" and "uniform" strategies lead to
    non-deterministic predictions that can be rendered deterministic by setting
    the `random_state` parameter if needed. The other strategies are naturally
    deterministic and, once fit, always return the same constant prediction
    for any value of `X`.

    Read more in the :ref:`User Guide <dummy_estimators>`.

    .. versionadded:: 0.13

    Parameters
    ----------
    strategy : {"most_frequent", "prior", "stratified", "uniform", \
            "constant"}, default="prior"
        Strategy to use to generate predictions.

        * "most_frequent": the `predict` method always returns the most
          frequent class label in the observed `y` argument passed to `fit`.
          The `predict_proba` method returns the matching one-hot encoded
          vector.
        * "prior": the `predict` method always returns the most frequent
          class label in the observed `y` argument passed to `fit` (like
          "most_frequent"). ``predict_proba`` always returns the empirical
          class distribution of `y`, also known as the empirical class prior
          distribution.
        * "stratified": the `predict_proba` method randomly samples one-hot
          vectors from a multinomial distribution parametrized by the empirical
          class prior probabilities.
          The `predict` method returns the class label which got probability
          one in the one-hot vector of `predict_proba`.
          Each sampled row of both methods is therefore independent and
          identically distributed.
        * "uniform": generates predictions uniformly at random from the list
          of unique classes observed in `y`, i.e. each class has equal
          probability.
        * "constant": always predicts a constant label that is provided by
          the user. This is useful for metrics that evaluate a non-majority
          class.

        .. versionchanged:: 0.24
           The default value of `strategy` has changed to "prior" in version
           0.24.

    random_state : int, RandomState instance or None, default=None
        Controls the randomness to generate the predictions when
        ``strategy='stratified'`` or ``strategy='uniform'``.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    constant : int or str or array-like of shape (n_outputs,), default=None
        The explicit constant as predicted by the "constant" strategy. This
        parameter is useful only for the "constant" strategy.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,) or list of such arrays
        Unique class labels observed in `y`. For multi-output classification
        problems, this attribute is a list of arrays as each output has an
        independent set of possible classes.

    n_classes_ : int or list of int
        Number of labels for each output.

    class_prior_ : ndarray of shape (n_classes,) or list of such arrays
        Frequency of each class observed in `y`. For multioutput classification
        problems, this is computed independently for each output.

    n_outputs_ : int
        Number of outputs.

    sparse_output_ : bool
        True if the array returned from predict is to be in sparse CSC format.
        Is automatically set to True if the input `y` is passed in sparse
        format.

    See Also
    --------
    DummyRegressor : Regressor that makes predictions using simple rules.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.dummy import DummyClassifier
    >>> X = np.array([-1, 1, 1, 1])
    >>> y = np.array([0, 1, 1, 1])
    >>> dummy_clf = DummyClassifier(strategy="most_frequent")
    >>> dummy_clf.fit(X, y)
    DummyClassifier(strategy='most_frequent')
    >>> dummy_clf.predict(X)
    array([1, 1, 1, 1])
    >>> dummy_clf.score(X, y)
    0.75
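
    For illustration, ``predict_proba`` under the "most_frequent" strategy
    returns the one-hot vector of the majority class for every sample (a
    minimal sketch on the toy data above):

    >>> dummy_clf.predict_proba(X)
    array([[0., 1.],
           [0., 1.],
           [0., 1.],
           [0., 1.]])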
  115. """
  116. _parameter_constraints: dict = {
  117. "strategy": [
  118. StrOptions({"most_frequent", "prior", "stratified", "uniform", "constant"})
  119. ],
  120. "random_state": ["random_state"],
  121. "constant": [Integral, str, "array-like", None],
  122. }
  123. def __init__(self, *, strategy="prior", random_state=None, constant=None):
  124. self.strategy = strategy
  125. self.random_state = random_state
  126. self.constant = constant
  127. @_fit_context(prefer_skip_nested_validation=True)
  128. def fit(self, X, y, sample_weight=None):
  129. """Fit the baseline classifier.
  130. Parameters
  131. ----------
  132. X : array-like of shape (n_samples, n_features)
  133. Training data.
  134. y : array-like of shape (n_samples,) or (n_samples, n_outputs)
  135. Target values.
  136. sample_weight : array-like of shape (n_samples,), default=None
  137. Sample weights.
  138. Returns
  139. -------
  140. self : object
  141. Returns the instance itself.
  142. """
  143. self._strategy = self.strategy
  144. if self._strategy == "uniform" and sp.issparse(y):
  145. y = y.toarray()
  146. warnings.warn(
  147. (
  148. "A local copy of the target data has been converted "
  149. "to a numpy array. Predicting on sparse target data "
  150. "with the uniform strategy would not save memory "
  151. "and would be slower."
  152. ),
  153. UserWarning,
  154. )
  155. self.sparse_output_ = sp.issparse(y)
  156. if not self.sparse_output_:
  157. y = np.asarray(y)
  158. y = np.atleast_1d(y)
  159. if y.ndim == 1:
  160. y = np.reshape(y, (-1, 1))
  161. self.n_outputs_ = y.shape[1]
  162. check_consistent_length(X, y)
  163. if sample_weight is not None:
  164. sample_weight = _check_sample_weight(sample_weight, X)
  165. if self._strategy == "constant":
  166. if self.constant is None:
  167. raise ValueError(
  168. "Constant target value has to be specified "
  169. "when the constant strategy is used."
  170. )
  171. else:
  172. constant = np.reshape(np.atleast_1d(self.constant), (-1, 1))
  173. if constant.shape[0] != self.n_outputs_:
  174. raise ValueError(
  175. "Constant target value should have shape (%d, 1)."
  176. % self.n_outputs_
  177. )
  178. (self.classes_, self.n_classes_, self.class_prior_) = class_distribution(
  179. y, sample_weight
  180. )
  181. if self._strategy == "constant":
  182. for k in range(self.n_outputs_):
  183. if not any(constant[k][0] == c for c in self.classes_[k]):
  184. # Checking in case of constant strategy if the constant
  185. # provided by the user is in y.
  186. err_msg = (
  187. "The constant target value must be present in "
  188. "the training data. You provided constant={}. "
  189. "Possible values are: {}.".format(
  190. self.constant, self.classes_[k].tolist()
  191. )
  192. )
  193. raise ValueError(err_msg)
  194. if self.n_outputs_ == 1:
  195. self.n_classes_ = self.n_classes_[0]
  196. self.classes_ = self.classes_[0]
  197. self.class_prior_ = self.class_prior_[0]
  198. return self

    def predict(self, X):
        """Perform classification on test vectors X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test data.

        Returns
        -------
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Predicted target values for X.
        """
        check_is_fitted(self)

        # numpy random_state expects Python int and not long as size argument
        # under Windows
        n_samples = _num_samples(X)
        rs = check_random_state(self.random_state)

        n_classes_ = self.n_classes_
        classes_ = self.classes_
        class_prior_ = self.class_prior_
        constant = self.constant
        if self.n_outputs_ == 1:
            # Get same type even for self.n_outputs_ == 1
            n_classes_ = [n_classes_]
            classes_ = [classes_]
            class_prior_ = [class_prior_]
            constant = [constant]

        # Compute probability only once
        if self._strategy == "stratified":
            proba = self.predict_proba(X)
            if self.n_outputs_ == 1:
                proba = [proba]

        if self.sparse_output_:
            class_prob = None
            if self._strategy in ("most_frequent", "prior"):
                classes_ = [np.array([cp.argmax()]) for cp in class_prior_]

            elif self._strategy == "stratified":
                class_prob = class_prior_

            elif self._strategy == "uniform":
                raise ValueError(
                    "Sparse target prediction is not "
                    "supported with the uniform strategy"
                )

            elif self._strategy == "constant":
                classes_ = [np.array([c]) for c in constant]

            y = _random_choice_csc(n_samples, classes_, class_prob, self.random_state)
        else:
            if self._strategy in ("most_frequent", "prior"):
                y = np.tile(
                    [
                        classes_[k][class_prior_[k].argmax()]
                        for k in range(self.n_outputs_)
                    ],
                    [n_samples, 1],
                )

            elif self._strategy == "stratified":
                y = np.vstack(
                    [
                        classes_[k][proba[k].argmax(axis=1)]
                        for k in range(self.n_outputs_)
                    ]
                ).T

            elif self._strategy == "uniform":
                ret = [
                    classes_[k][rs.randint(n_classes_[k], size=n_samples)]
                    for k in range(self.n_outputs_)
                ]
                y = np.vstack(ret).T

            elif self._strategy == "constant":
                y = np.tile(self.constant, (n_samples, 1))

            if self.n_outputs_ == 1:
                y = np.ravel(y)

        return y

    def predict_proba(self, X):
        """
        Return probability estimates for the test vectors X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test data.

        Returns
        -------
        P : ndarray of shape (n_samples, n_classes) or list of such arrays
            Returns the probability of the sample for each class in
            the model, where classes are ordered arithmetically, for each
            output.
        """
        check_is_fitted(self)

        # numpy random_state expects Python int and not long as size argument
        # under Windows
        n_samples = _num_samples(X)
        rs = check_random_state(self.random_state)

        n_classes_ = self.n_classes_
        classes_ = self.classes_
        class_prior_ = self.class_prior_
        constant = self.constant
        if self.n_outputs_ == 1:
            # Get same type even for self.n_outputs_ == 1
            n_classes_ = [n_classes_]
            classes_ = [classes_]
            class_prior_ = [class_prior_]
            constant = [constant]

        P = []
        for k in range(self.n_outputs_):
            if self._strategy == "most_frequent":
                ind = class_prior_[k].argmax()
                out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)
                out[:, ind] = 1.0

            elif self._strategy == "prior":
                out = np.ones((n_samples, 1)) * class_prior_[k]

            elif self._strategy == "stratified":
                out = rs.multinomial(1, class_prior_[k], size=n_samples)
                out = out.astype(np.float64)

            elif self._strategy == "uniform":
                out = np.ones((n_samples, n_classes_[k]), dtype=np.float64)
                out /= n_classes_[k]

            elif self._strategy == "constant":
                ind = np.where(classes_[k] == constant[k])
                out = np.zeros((n_samples, n_classes_[k]), dtype=np.float64)
                out[:, ind] = 1.0

            P.append(out)

        if self.n_outputs_ == 1:
            P = P[0]

        return P

    def predict_log_proba(self, X):
        """
        Return log probability estimates for the test vectors X.

        Parameters
        ----------
        X : {array-like, object with finite length or shape}
            Test data.

        Returns
        -------
        P : ndarray of shape (n_samples, n_classes) or list of such arrays
            Returns the log probability of the sample for each class in
            the model, where classes are ordered arithmetically for each
            output.
        """
        proba = self.predict_proba(X)
        if self.n_outputs_ == 1:
            return np.log(proba)
        else:
            return [np.log(p) for p in proba]

    def _more_tags(self):
        return {
            "poor_score": True,
            "no_validation": True,
            "_xfail_checks": {
                "check_methods_subset_invariance": "fails for the predict method",
                "check_methods_sample_order_invariance": "fails for the predict method",
            },
        }

    def score(self, X, y, sample_weight=None):
        """Return the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : None or array-like of shape (n_samples, n_features)
            Test samples. Passing None as test samples gives the same result
            as passing real test samples, since DummyClassifier
            operates independently of the sampled observations.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of self.predict(X) w.r.t. y.
        """
        if X is None:
            X = np.zeros(shape=(len(y), 1))
        return super().score(X, y, sample_weight)


class DummyRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
    """Regressor that makes predictions using simple rules.

    This regressor is useful as a simple baseline to compare with other
    (real) regressors. Do not use it for real problems.

    Read more in the :ref:`User Guide <dummy_estimators>`.

    .. versionadded:: 0.13

    Parameters
    ----------
    strategy : {"mean", "median", "quantile", "constant"}, default="mean"
        Strategy to use to generate predictions.

        * "mean": always predicts the mean of the training set
        * "median": always predicts the median of the training set
        * "quantile": always predicts a specified quantile of the training set,
          provided with the quantile parameter.
        * "constant": always predicts a constant value that is provided by
          the user.

    constant : int or float or array-like of shape (n_outputs,), default=None
        The explicit constant as predicted by the "constant" strategy. This
        parameter is useful only for the "constant" strategy.

    quantile : float in [0.0, 1.0], default=None
        The quantile to predict using the "quantile" strategy. A quantile of
        0.5 corresponds to the median, while 0.0 to the minimum and 1.0 to the
        maximum.

    Attributes
    ----------
    constant_ : ndarray of shape (1, n_outputs)
        Mean or median or quantile of the training targets or constant value
        given by the user.

    n_outputs_ : int
        Number of outputs.

    See Also
    --------
    DummyClassifier : Classifier that makes predictions using simple rules.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.dummy import DummyRegressor
    >>> X = np.array([1.0, 2.0, 3.0, 4.0])
    >>> y = np.array([2.0, 3.0, 5.0, 10.0])
    >>> dummy_regr = DummyRegressor(strategy="mean")
    >>> dummy_regr.fit(X, y)
    DummyRegressor()
    >>> dummy_regr.predict(X)
    array([5., 5., 5., 5.])
    >>> dummy_regr.score(X, y)
    0.0
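
    A "median" baseline can be sketched the same way; on this toy data the
    training median of ``y`` is 4.0, so every prediction is 4.0:

    >>> DummyRegressor(strategy="median").fit(X, y).predict(X)
    array([4., 4., 4., 4.])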
  419. """
  420. _parameter_constraints: dict = {
  421. "strategy": [StrOptions({"mean", "median", "quantile", "constant"})],
  422. "quantile": [Interval(Real, 0.0, 1.0, closed="both"), None],
  423. "constant": [
  424. Interval(Real, None, None, closed="neither"),
  425. "array-like",
  426. None,
  427. ],
  428. }
  429. def __init__(self, *, strategy="mean", constant=None, quantile=None):
  430. self.strategy = strategy
  431. self.constant = constant
  432. self.quantile = quantile

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """Fit the baseline regressor.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Target values.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        y = check_array(y, ensure_2d=False, input_name="y")
        if len(y) == 0:
            raise ValueError("y must not be empty.")

        if y.ndim == 1:
            y = np.reshape(y, (-1, 1))
        self.n_outputs_ = y.shape[1]

        check_consistent_length(X, y, sample_weight)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X)

        if self.strategy == "mean":
            self.constant_ = np.average(y, axis=0, weights=sample_weight)

        elif self.strategy == "median":
            if sample_weight is None:
                self.constant_ = np.median(y, axis=0)
            else:
                self.constant_ = [
                    _weighted_percentile(y[:, k], sample_weight, percentile=50.0)
                    for k in range(self.n_outputs_)
                ]

        elif self.strategy == "quantile":
            if self.quantile is None:
                raise ValueError(
                    "When using `strategy='quantile'`, you have to specify the "
                    "desired quantile in the range [0, 1]."
                )
            percentile = self.quantile * 100.0
            if sample_weight is None:
                self.constant_ = np.percentile(y, axis=0, q=percentile)
            else:
                self.constant_ = [
                    _weighted_percentile(y[:, k], sample_weight, percentile=percentile)
                    for k in range(self.n_outputs_)
                ]

        elif self.strategy == "constant":
            if self.constant is None:
                raise TypeError(
                    "Constant target value has to be specified "
                    "when the constant strategy is used."
                )

            self.constant_ = check_array(
                self.constant,
                accept_sparse=["csr", "csc", "coo"],
                ensure_2d=False,
                ensure_min_samples=0,
            )

            if self.n_outputs_ != 1 and self.constant_.shape[0] != y.shape[1]:
                raise ValueError(
                    "Constant target value should have shape (%d, 1)." % y.shape[1]
                )

        self.constant_ = np.reshape(self.constant_, (1, -1))
        return self

    def predict(self, X, return_std=False):
        """Perform regression on test vectors X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test data.

        return_std : bool, default=False
            Whether to return the standard deviation of posterior prediction.
            All zeros in this case.

            .. versionadded:: 0.20

        Returns
        -------
        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Predicted target values for X.

        y_std : array-like of shape (n_samples,) or (n_samples, n_outputs)
            Standard deviation of predictive distribution of query points.
        """
        check_is_fitted(self)
        n_samples = _num_samples(X)

        y = np.full(
            (n_samples, self.n_outputs_),
            self.constant_,
            dtype=np.array(self.constant_).dtype,
        )
        y_std = np.zeros((n_samples, self.n_outputs_))

        if self.n_outputs_ == 1:
            y = np.ravel(y)
            y_std = np.ravel(y_std)

        return (y, y_std) if return_std else y

    def _more_tags(self):
        return {"poor_score": True, "no_validation": True}

    def score(self, X, y, sample_weight=None):
        """Return the coefficient of determination R^2 of the prediction.

        The coefficient R^2 is defined as `(1 - u/v)`, where `u` is the
        residual sum of squares `((y_true - y_pred) ** 2).sum()` and `v` is the
        total sum of squares `((y_true - y_true.mean()) ** 2).sum()`. The best
        possible score is 1.0 and it can be negative (because the model can be
        arbitrarily worse). A constant model that always predicts the expected
        value of y, disregarding the input features, would get an R^2 score of
        0.0.

        Parameters
        ----------
        X : None or array-like of shape (n_samples, n_features)
            Test samples. Passing None as test samples gives the same result
            as passing real test samples, since `DummyRegressor`
            operates independently of the sampled observations.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True values for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            R^2 of `self.predict(X)` w.r.t. y.
        """
        if X is None:
            X = np.zeros(shape=(len(y), 1))
        return super().score(X, y, sample_weight)