# test_dummy.py

import numpy as np
import pytest
import scipy.sparse as sp

from sklearn.base import clone
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.exceptions import NotFittedError
from sklearn.utils._testing import (
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
    ignore_warnings,
)
from sklearn.utils.stats import _weighted_percentile


@ignore_warnings
def _check_predict_proba(clf, X, y):
    proba = clf.predict_proba(X)
    # We know that we can have division by zero
    log_proba = clf.predict_log_proba(X)

    y = np.atleast_1d(y)
    if y.ndim == 1:
        y = np.reshape(y, (-1, 1))

    n_outputs = y.shape[1]
    n_samples = len(X)

    if n_outputs == 1:
        proba = [proba]
        log_proba = [log_proba]

    for k in range(n_outputs):
        assert proba[k].shape[0] == n_samples
        assert proba[k].shape[1] == len(np.unique(y[:, k]))
        assert_array_almost_equal(proba[k].sum(axis=1), np.ones(len(X)))
        # We know that we can have division by zero
        assert_array_almost_equal(np.log(proba[k]), log_proba[k])


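# Illustrative sketch (hypothetical helper, not part of the original suite): for a
# multi-output target, ``DummyClassifier.predict_proba`` returns a list with one
# probability array per output, which is why the helper above wraps the
# single-output case in a list. Assumes the module-level imports above.
def _example_multioutput_predict_proba_is_list():
    X = [[0]] * 4  # ignored
    y = np.array([[1, 0], [2, 0], [1, 0], [1, 3]])
    clf = DummyClassifier(strategy="prior").fit(X, y)
    proba = clf.predict_proba(X)
    assert isinstance(proba, list) and len(proba) == y.shape[1]
    for proba_k in proba:
        assert_array_almost_equal(proba_k.sum(axis=1), np.ones(len(X)))

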
def _check_behavior_2d(clf):
    # 1d case
    X = np.array([[0], [0], [0], [0]])  # ignored
    y = np.array([1, 2, 1, 1])
    est = clone(clf)
    est.fit(X, y)
    y_pred = est.predict(X)
    assert y.shape == y_pred.shape

    # 2d case
    y = np.array([[1, 0], [2, 0], [1, 0], [1, 3]])
    est = clone(clf)
    est.fit(X, y)
    y_pred = est.predict(X)
    assert y.shape == y_pred.shape


def _check_behavior_2d_for_constant(clf):
    # 2d case only
    X = np.array([[0], [0], [0], [0]])  # ignored
    y = np.array([[1, 0, 5, 4, 3], [2, 0, 1, 2, 5], [1, 0, 4, 5, 2], [1, 3, 3, 2, 0]])
    est = clone(clf)
    est.fit(X, y)
    y_pred = est.predict(X)
    assert y.shape == y_pred.shape


def _check_equality_regressor(statistic, y_learn, y_pred_learn, y_test, y_pred_test):
    assert_array_almost_equal(np.tile(statistic, (y_learn.shape[0], 1)), y_pred_learn)
    assert_array_almost_equal(np.tile(statistic, (y_test.shape[0], 1)), y_pred_test)


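# Illustrative sketch (hypothetical helper, not part of the original suite):
# ``_check_equality_regressor`` relies on the fact that a fitted DummyRegressor
# repeats one statistic per output for every sample, i.e. its predictions are a
# tiled copy of the per-column statistic. Assumes the module-level imports above.
def _example_regressor_predictions_are_tiled():
    X = np.zeros((4, 1))  # ignored by DummyRegressor
    y = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0], [4.0, 40.0]])
    reg = DummyRegressor(strategy="mean").fit(X, y)
    # one row of column means per sample
    expected = np.tile(np.mean(y, axis=0), (len(X), 1))
    assert_array_almost_equal(reg.predict(X), expected)

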
def test_most_frequent_and_prior_strategy():
    X = [[0], [0], [0], [0]]  # ignored
    y = [1, 2, 1, 1]

    for strategy in ("most_frequent", "prior"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)

        assert_array_equal(clf.predict(X), np.ones(len(X)))
        _check_predict_proba(clf, X, y)

        if strategy == "prior":
            assert_array_almost_equal(
                clf.predict_proba([X[0]]), clf.class_prior_.reshape((1, -1))
            )
        else:
            assert_array_almost_equal(
                clf.predict_proba([X[0]]), clf.class_prior_.reshape((1, -1)) > 0.5
            )


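# Illustrative sketch (hypothetical helper, not part of the original suite): with
# the same training labels as above, ``strategy="prior"`` returns the empirical
# class frequencies as probabilities, while ``strategy="most_frequent"`` puts all
# of the mass on the majority class. Assumes the module-level imports above.
def _example_prior_vs_most_frequent_proba():
    X, y = [[0]] * 4, [1, 2, 1, 1]
    proba_prior = DummyClassifier(strategy="prior").fit(X, y).predict_proba([[0]])
    proba_most_frequent = (
        DummyClassifier(strategy="most_frequent").fit(X, y).predict_proba([[0]])
    )
    assert_array_almost_equal(proba_prior, [[0.75, 0.25]])
    assert_array_almost_equal(proba_most_frequent, [[1.0, 0.0]])

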
def test_most_frequent_and_prior_strategy_with_2d_column_y():
    # non-regression test added in
    # https://github.com/scikit-learn/scikit-learn/pull/13545
    X = [[0], [0], [0], [0]]
    y_1d = [1, 2, 1, 1]
    y_2d = [[1], [2], [1], [1]]

    for strategy in ("most_frequent", "prior"):
        clf_1d = DummyClassifier(strategy=strategy, random_state=0)
        clf_2d = DummyClassifier(strategy=strategy, random_state=0)

        clf_1d.fit(X, y_1d)
        clf_2d.fit(X, y_2d)
        assert_array_equal(clf_1d.predict(X), clf_2d.predict(X))


def test_most_frequent_and_prior_strategy_multioutput():
    X = [[0], [0], [0], [0]]  # ignored
    y = np.array([[1, 0], [2, 0], [1, 0], [1, 3]])

    n_samples = len(X)

    for strategy in ("prior", "most_frequent"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)
        assert_array_equal(
            clf.predict(X),
            np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))]),
        )
        _check_predict_proba(clf, X, y)
        _check_behavior_2d(clf)


def test_stratified_strategy(global_random_seed):
    X = [[0]] * 5  # ignored
    y = [1, 2, 1, 1, 2]
    clf = DummyClassifier(strategy="stratified", random_state=global_random_seed)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    p = np.bincount(y_pred) / float(len(X))
    assert_almost_equal(p[1], 3.0 / 5, decimal=1)
    assert_almost_equal(p[2], 2.0 / 5, decimal=1)
    _check_predict_proba(clf, X, y)


def test_stratified_strategy_multioutput(global_random_seed):
    X = [[0]] * 5  # ignored
    y = np.array([[2, 1], [2, 2], [1, 1], [1, 2], [1, 1]])

    clf = DummyClassifier(strategy="stratified", random_state=global_random_seed)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        assert_almost_equal(p[1], 3.0 / 5, decimal=1)
        assert_almost_equal(p[2], 2.0 / 5, decimal=1)
        _check_predict_proba(clf, X, y)

    _check_behavior_2d(clf)


def test_uniform_strategy(global_random_seed):
    X = [[0]] * 4  # ignored
    y = [1, 2, 1, 1]
    clf = DummyClassifier(strategy="uniform", random_state=global_random_seed)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    p = np.bincount(y_pred) / float(len(X))
    assert_almost_equal(p[1], 0.5, decimal=1)
    assert_almost_equal(p[2], 0.5, decimal=1)
    _check_predict_proba(clf, X, y)


def test_uniform_strategy_multioutput(global_random_seed):
    X = [[0]] * 4  # ignored
    y = np.array([[2, 1], [2, 2], [1, 2], [1, 1]])
    clf = DummyClassifier(strategy="uniform", random_state=global_random_seed)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        assert_almost_equal(p[1], 0.5, decimal=1)
        assert_almost_equal(p[2], 0.5, decimal=1)
        _check_predict_proba(clf, X, y)

    _check_behavior_2d(clf)


def test_string_labels():
    X = [[0]] * 5
    y = ["paris", "paris", "tokyo", "amsterdam", "berlin"]
    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), ["paris"] * 5)


@pytest.mark.parametrize(
    "y,y_test",
    [
        ([2, 1, 1, 1], [2, 2, 1, 1]),
        (
            np.array([[2, 2], [1, 1], [1, 1], [1, 1]]),
            np.array([[2, 2], [2, 2], [1, 1], [1, 1]]),
        ),
    ],
)
def test_classifier_score_with_None(y, y_test):
    clf = DummyClassifier(strategy="most_frequent")
    clf.fit(None, y)
    assert clf.score(None, y_test) == 0.5


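# Illustrative sketch (hypothetical helper, not part of the original suite): because
# the dummy estimators ignore X entirely, ``X=None`` is accepted at fit and score
# time, as in the test above, and ``score`` is plain accuracy of the constant
# prediction. Assumes the module-level imports above.
def _example_score_ignores_X():
    clf = DummyClassifier(strategy="most_frequent").fit(None, [1, 1, 2])
    # the majority class 1 matches two of the three test labels -> accuracy 2/3
    assert clf.score(None, [1, 1, 2]) == pytest.approx(2 / 3)

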
@pytest.mark.parametrize(
    "strategy", ["stratified", "most_frequent", "prior", "uniform", "constant"]
)
def test_classifier_prediction_independent_of_X(strategy, global_random_seed):
    y = [0, 2, 1, 1]
    X1 = [[0]] * 4
    clf1 = DummyClassifier(
        strategy=strategy, random_state=global_random_seed, constant=0
    )
    clf1.fit(X1, y)
    predictions1 = clf1.predict(X1)

    X2 = [[1]] * 4
    clf2 = DummyClassifier(
        strategy=strategy, random_state=global_random_seed, constant=0
    )
    clf2.fit(X2, y)
    predictions2 = clf2.predict(X2)

    assert_array_equal(predictions1, predictions2)


def test_mean_strategy_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X = [[0]] * 4  # ignored
    y = random_state.randn(4)

    reg = DummyRegressor()
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.mean(y)] * len(X))


def test_mean_strategy_multioutput_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    mean = np.mean(y_learn, axis=0).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor()
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(mean, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)


def test_regressor_exceptions():
    reg = DummyRegressor()
    with pytest.raises(NotFittedError):
        reg.predict([])


def test_median_strategy_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="median")
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.median(y)] * len(X))


def test_median_strategy_multioutput_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    median = np.median(y_learn, axis=0).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="median")
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(median, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)


def test_quantile_strategy_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="quantile", quantile=0.5)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.median(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=0)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.min(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=1)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.max(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=0.3)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.percentile(y, q=30)] * len(X))


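# Illustrative sketch (hypothetical helper, not part of the original suite):
# mirroring the checks above, the "quantile" strategy predicts
# ``np.percentile(y, 100 * quantile)`` for every sample, so quantile=0.5 reduces
# to the median. Assumes the module-level imports above.
def _example_quantile_matches_percentile():
    X, y = [[0]] * 5, np.array([1.0, 3.0, 2.0, 5.0, 4.0])
    for q in (0.0, 0.25, 0.5, 1.0):
        reg = DummyRegressor(strategy="quantile", quantile=q).fit(X, y)
        assert_array_almost_equal(
            reg.predict(X), [np.percentile(y, 100 * q)] * len(X)
        )

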
def test_quantile_strategy_multioutput_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    median = np.median(y_learn, axis=0).reshape((1, -1))
    quantile_values = np.percentile(y_learn, axis=0, q=80).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="quantile", quantile=0.5)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(median, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)

    # Correctness oracle
    est = DummyRegressor(strategy="quantile", quantile=0.8)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(
        quantile_values, y_learn, y_pred_learn, y_test, y_pred_test
    )
    _check_behavior_2d(est)


def test_quantile_invalid():
    X = [[0]] * 5  # ignored
    y = [0] * 5  # ignored

    est = DummyRegressor(strategy="quantile", quantile=None)
    err_msg = (
        "When using `strategy='quantile', you have to specify the desired quantile"
    )
    with pytest.raises(ValueError, match=err_msg):
        est.fit(X, y)


def test_quantile_strategy_empty_train():
    est = DummyRegressor(strategy="quantile", quantile=0.4)
    with pytest.raises(ValueError):
        est.fit([], [])


def test_constant_strategy_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="constant", constant=[43])
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [43] * len(X))

    reg = DummyRegressor(strategy="constant", constant=43)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [43] * len(X))

    # non-regression test for #22478
    assert not isinstance(reg.constant, np.ndarray)


def test_constant_strategy_multioutput_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    # test with 2d array
    constants = random_state.randn(5)

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="constant", constant=constants)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(constants, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d_for_constant(est)


def test_y_mean_attribute_regressor():
    X = [[0]] * 5
    y = [1, 2, 4, 6, 8]
    # when strategy = 'mean'
    est = DummyRegressor(strategy="mean")
    est.fit(X, y)

    assert est.constant_ == np.mean(y)


def test_constants_not_specified_regressor():
    X = [[0]] * 5
    y = [1, 2, 4, 6, 8]

    est = DummyRegressor(strategy="constant")
    err_msg = "Constant target value has to be specified"
    with pytest.raises(TypeError, match=err_msg):
        est.fit(X, y)


def test_constant_size_multioutput_regressor(global_random_seed):
    random_state = np.random.RandomState(seed=global_random_seed)
    X = random_state.randn(10, 10)
    y = random_state.randn(10, 5)

    est = DummyRegressor(strategy="constant", constant=[1, 2, 3, 4])
    err_msg = r"Constant target value should have shape \(5, 1\)."
    with pytest.raises(ValueError, match=err_msg):
        est.fit(X, y)


def test_constant_strategy():
    X = [[0], [0], [0], [0]]  # ignored
    y = [2, 1, 2, 2]

    clf = DummyClassifier(strategy="constant", random_state=0, constant=1)
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), np.ones(len(X)))
    _check_predict_proba(clf, X, y)

    X = [[0], [0], [0], [0]]  # ignored
    y = ["two", "one", "two", "two"]
    clf = DummyClassifier(strategy="constant", random_state=0, constant="one")
    clf.fit(X, y)
    assert_array_equal(clf.predict(X), np.array(["one"] * 4))
    _check_predict_proba(clf, X, y)


def test_constant_strategy_multioutput():
    X = [[0], [0], [0], [0]]  # ignored
    y = np.array([[2, 3], [1, 3], [2, 3], [2, 0]])

    n_samples = len(X)

    clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0])
    clf.fit(X, y)
    assert_array_equal(
        clf.predict(X), np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])
    )
    _check_predict_proba(clf, X, y)


@pytest.mark.parametrize(
    "y, params, err_msg",
    [
        ([2, 1, 2, 2], {"random_state": 0}, "Constant.*has to be specified"),
        ([2, 1, 2, 2], {"constant": [2, 0]}, "Constant.*should have shape"),
        (
            np.transpose([[2, 1, 2, 2], [2, 1, 2, 2]]),
            {"constant": 2},
            "Constant.*should have shape",
        ),
        (
            [2, 1, 2, 2],
            {"constant": "my-constant"},
            "constant=my-constant.*Possible values.*\\[1, 2]",
        ),
        (
            np.transpose([[2, 1, 2, 2], [2, 1, 2, 2]]),
            {"constant": [2, "unknown"]},
            "constant=\\[2, 'unknown'].*Possible values.*\\[1, 2]",
        ),
    ],
    ids=[
        "no-constant",
        "too-many-constant",
        "not-enough-output",
        "single-output",
        "multi-output",
    ],
)
def test_constant_strategy_exceptions(y, params, err_msg):
    X = [[0], [0], [0], [0]]
    clf = DummyClassifier(strategy="constant", **params)
    with pytest.raises(ValueError, match=err_msg):
        clf.fit(X, y)


def test_classification_sample_weight():
    X = [[0], [0], [1]]
    y = [0, 1, 0]
    sample_weight = [0.1, 1.0, 0.1]

    clf = DummyClassifier(strategy="stratified").fit(X, y, sample_weight)
    assert_array_almost_equal(clf.class_prior_, [0.2 / 1.2, 1.0 / 1.2])


def test_constant_strategy_sparse_target():
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[0, 1], [4, 0], [1, 1], [1, 4], [1, 1]]))

    n_samples = len(X)

    clf = DummyClassifier(strategy="constant", random_state=0, constant=[1, 0])
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert sp.issparse(y_pred)
    assert_array_equal(
        y_pred.toarray(), np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])
    )


def test_uniform_strategy_sparse_target_warning(global_random_seed):
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[2, 1], [2, 2], [1, 4], [4, 2], [1, 1]]))

    clf = DummyClassifier(strategy="uniform", random_state=global_random_seed)
    with pytest.warns(UserWarning, match="the uniform strategy would not save memory"):
        clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        assert_almost_equal(p[1], 1 / 3, decimal=1)
        assert_almost_equal(p[2], 1 / 3, decimal=1)
        assert_almost_equal(p[4], 1 / 3, decimal=1)


def test_stratified_strategy_sparse_target(global_random_seed):
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[4, 1], [0, 0], [1, 1], [1, 4], [1, 1]]))

    clf = DummyClassifier(strategy="stratified", random_state=global_random_seed)
    clf.fit(X, y)

    X = [[0]] * 500
    y_pred = clf.predict(X)
    assert sp.issparse(y_pred)
    y_pred = y_pred.toarray()

    for k in range(y.shape[1]):
        p = np.bincount(y_pred[:, k]) / float(len(X))
        assert_almost_equal(p[1], 3.0 / 5, decimal=1)
        assert_almost_equal(p[0], 1.0 / 5, decimal=1)
        assert_almost_equal(p[4], 1.0 / 5, decimal=1)


def test_most_frequent_and_prior_strategy_sparse_target():
    X = [[0]] * 5  # ignored
    y = sp.csc_matrix(np.array([[1, 0], [1, 3], [4, 0], [0, 1], [1, 0]]))

    n_samples = len(X)
    y_expected = np.hstack([np.ones((n_samples, 1)), np.zeros((n_samples, 1))])
    for strategy in ("most_frequent", "prior"):
        clf = DummyClassifier(strategy=strategy, random_state=0)
        clf.fit(X, y)

        y_pred = clf.predict(X)
        assert sp.issparse(y_pred)
        assert_array_equal(y_pred.toarray(), y_expected)


def test_dummy_regressor_sample_weight(global_random_seed, n_samples=10):
    random_state = np.random.RandomState(seed=global_random_seed)

    X = [[0]] * n_samples
    y = random_state.rand(n_samples)
    sample_weight = random_state.rand(n_samples)

    est = DummyRegressor(strategy="mean").fit(X, y, sample_weight)
    assert est.constant_ == np.average(y, weights=sample_weight)

    est = DummyRegressor(strategy="median").fit(X, y, sample_weight)
    assert est.constant_ == _weighted_percentile(y, sample_weight, 50.0)

    est = DummyRegressor(strategy="quantile", quantile=0.95).fit(X, y, sample_weight)
    assert est.constant_ == _weighted_percentile(y, sample_weight, 95.0)


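# Illustrative sketch (hypothetical helper, not part of the original suite): for the
# "mean" strategy a zero sample weight is equivalent to dropping that sample, since
# the fitted constant is the weighted average of ``y``. Assumes the module-level
# imports above.
def _example_zero_weight_equals_dropping_sample():
    X = [[0]] * 4
    y = np.array([1.0, 2.0, 3.0, 100.0])
    weights = np.array([1.0, 1.0, 1.0, 0.0])  # last sample is ignored
    est_weighted = DummyRegressor(strategy="mean").fit(X, y, sample_weight=weights)
    est_dropped = DummyRegressor(strategy="mean").fit(X[:3], y[:3])
    assert_almost_equal(est_weighted.constant_, est_dropped.constant_)

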
def test_dummy_regressor_on_3D_array():
    X = np.array([[["foo"]], [["bar"]], [["baz"]]])
    y = np.array([2, 2, 2])
    y_expected = np.array([2, 2, 2])
    cls = DummyRegressor()
    cls.fit(X, y)
    y_pred = cls.predict(X)
    assert_array_equal(y_pred, y_expected)


def test_dummy_classifier_on_3D_array():
    X = np.array([[["foo"]], [["bar"]], [["baz"]]])
    y = [2, 2, 2]
    y_expected = [2, 2, 2]
    y_proba_expected = [[1], [1], [1]]
    cls = DummyClassifier(strategy="stratified")
    cls.fit(X, y)
    y_pred = cls.predict(X)
    y_pred_proba = cls.predict_proba(X)
    assert_array_equal(y_pred, y_expected)
    assert_array_equal(y_pred_proba, y_proba_expected)


def test_dummy_regressor_return_std():
    X = [[0]] * 3  # ignored
    y = np.array([2, 2, 2])
    y_std_expected = np.array([0, 0, 0])
    cls = DummyRegressor()
    cls.fit(X, y)
    y_pred_list = cls.predict(X, return_std=True)
    # there should be two elements when return_std is True
    assert len(y_pred_list) == 2
    # the second element should be all zeros
    assert_array_equal(y_pred_list[1], y_std_expected)


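# Illustrative sketch (hypothetical helper, not part of the original suite):
# ``predict(X, return_std=True)`` returns a ``(y_pred, y_std)`` pair whose two
# arrays have one entry per sample, as the test above relies on. Assumes the
# module-level imports above.
def _example_return_std_pair():
    X = [[0]] * 4  # ignored
    y = np.array([1.0, 2.0, 3.0, 4.0])
    reg = DummyRegressor(strategy="mean").fit(X, y)
    y_pred, y_std = reg.predict(X, return_std=True)  # a (prediction, std) pair
    assert y_pred.shape == y_std.shape == (len(X),)

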
@pytest.mark.parametrize(
    "y,y_test",
    [
        ([1, 1, 1, 2], [1.25] * 4),
        (np.array([[2, 2], [1, 1], [1, 1], [1, 1]]), [[1.25, 1.25]] * 4),
    ],
)
def test_regressor_score_with_None(y, y_test):
    reg = DummyRegressor()
    reg.fit(None, y)
    assert reg.score(None, y_test) == 1.0


@pytest.mark.parametrize("strategy", ["mean", "median", "quantile", "constant"])
def test_regressor_prediction_independent_of_X(strategy):
    y = [0, 2, 1, 1]
    X1 = [[0]] * 4
    reg1 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
    reg1.fit(X1, y)
    predictions1 = reg1.predict(X1)

    X2 = [[1]] * 4
    reg2 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
    reg2.fit(X2, y)
    predictions2 = reg2.predict(X2)

    assert_array_equal(predictions1, predictions2)


@pytest.mark.parametrize(
    "strategy", ["stratified", "most_frequent", "prior", "uniform", "constant"]
)
def test_dtype_of_classifier_probas(strategy):
    y = [0, 2, 1, 1]
    X = np.zeros(4)
    model = DummyClassifier(strategy=strategy, random_state=0, constant=0)
    probas = model.fit(X, y).predict_proba(X)

    assert probas.dtype == np.float64