# test_multioutput.py

import re

import numpy as np
import pytest
import scipy.sparse as sp
from joblib import cpu_count

from sklearn import datasets
from sklearn.base import ClassifierMixin, clone
from sklearn.datasets import (
    load_linnerud,
    make_classification,
    make_multilabel_classification,
    make_regression,
)
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestClassifier,
    StackingRegressor,
)
from sklearn.exceptions import NotFittedError
from sklearn.impute import SimpleImputer
from sklearn.linear_model import (
    Lasso,
    LinearRegression,
    LogisticRegression,
    OrthogonalMatchingPursuit,
    PassiveAggressiveClassifier,
    Ridge,
    SGDClassifier,
    SGDRegressor,
)
from sklearn.metrics import jaccard_score, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import (
    ClassifierChain,
    MultiOutputClassifier,
    MultiOutputRegressor,
    RegressorChain,
)
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from sklearn.utils._testing import (
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
)


def test_multi_target_regression():
    X, y = datasets.make_regression(n_targets=3, random_state=0)
    X_train, y_train = X[:50], y[:50]
    X_test, y_test = X[50:], y[50:]

    references = np.zeros_like(y_test)
    for n in range(3):
        rgr = GradientBoostingRegressor(random_state=0)
        rgr.fit(X_train, y_train[:, n])
        references[:, n] = rgr.predict(X_test)

    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr.fit(X_train, y_train)
    y_pred = rgr.predict(X_test)

    assert_almost_equal(references, y_pred)


def test_multi_target_regression_partial_fit():
    X, y = datasets.make_regression(n_targets=3, random_state=0)
    X_train, y_train = X[:50], y[:50]
    X_test, y_test = X[50:], y[50:]

    references = np.zeros_like(y_test)
    half_index = 25
    for n in range(3):
        sgr = SGDRegressor(random_state=0, max_iter=5)
        sgr.partial_fit(X_train[:half_index], y_train[:half_index, n])
        sgr.partial_fit(X_train[half_index:], y_train[half_index:, n])
        references[:, n] = sgr.predict(X_test)

    sgr = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
    sgr.partial_fit(X_train[:half_index], y_train[:half_index])
    sgr.partial_fit(X_train[half_index:], y_train[half_index:])
    y_pred = sgr.predict(X_test)

    assert_almost_equal(references, y_pred)
    # Lasso has no partial_fit, so the meta-estimator must not expose it
    assert not hasattr(MultiOutputRegressor(Lasso), "partial_fit")


def test_multi_target_regression_one_target():
    # Test that multi-target regression raises when y has a single target
    X, y = datasets.make_regression(n_targets=1, random_state=0)
    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    msg = "at least two dimensions"
    with pytest.raises(ValueError, match=msg):
        rgr.fit(X, y)


def test_multi_target_sparse_regression():
    X, y = datasets.make_regression(n_targets=3, random_state=0)
    X_train, y_train = X[:50], y[:50]
    X_test = X[50:]

    for sparse in [
        sp.csr_matrix,
        sp.csc_matrix,
        sp.coo_matrix,
        sp.dok_matrix,
        sp.lil_matrix,
    ]:
        rgr = MultiOutputRegressor(Lasso(random_state=0))
        rgr_sparse = MultiOutputRegressor(Lasso(random_state=0))

        rgr.fit(X_train, y_train)
        rgr_sparse.fit(sparse(X_train), y_train)

        assert_almost_equal(rgr.predict(X_test), rgr_sparse.predict(sparse(X_test)))


def test_multi_target_sample_weights_api():
    X = [[1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [2.718, 3.141]]
    w = [0.8, 0.6]

    rgr = MultiOutputRegressor(OrthogonalMatchingPursuit())
    msg = "does not support sample weights"
    with pytest.raises(ValueError, match=msg):
        rgr.fit(X, y, w)

    # no exception should be raised if the base estimator supports weights
    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr.fit(X, y, w)


def test_multi_target_sample_weight_partial_fit():
    # weighted regressor
    X = [[1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [2.718, 3.141]]
    w = [2.0, 1.0]
    rgr_w = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
    rgr_w.partial_fit(X, y, w)

    # weighted with different weights
    w = [2.0, 2.0]
    rgr = MultiOutputRegressor(SGDRegressor(random_state=0, max_iter=5))
    rgr.partial_fit(X, y, w)

    assert rgr.predict(X)[0][0] != rgr_w.predict(X)[0][0]


def test_multi_target_sample_weights():
    # weighted regressor
    Xw = [[1, 2, 3], [4, 5, 6]]
    yw = [[3.141, 2.718], [2.718, 3.141]]
    w = [2.0, 1.0]
    rgr_w = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    y = [[3.141, 2.718], [3.141, 2.718], [2.718, 3.141]]
    rgr = MultiOutputRegressor(GradientBoostingRegressor(random_state=0))
    rgr.fit(X, y)

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(rgr.predict(X_test), rgr_w.predict(X_test))


# Import the data
iris = datasets.load_iris()
# create multiple targets by randomly shuffling and concatenating y.
X = iris.data
y1 = iris.target
y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)
y = np.column_stack((y1, y2, y3))
n_samples, n_features = X.shape
n_outputs = y.shape[1]
n_classes = len(np.unique(y1))
classes = list(map(np.unique, (y1, y2, y3)))
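
# Note: X, y, and classes above are module-level fixtures shared by the
# classification tests below. Each output column of y is a shuffled copy of
# iris.target, so all three outputs draw from the same three classes while
# being (mostly) independent of one another.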


def test_multi_output_classification_partial_fit_parallelism():
    sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
    mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=4)
    mor.partial_fit(X, y, classes)
    est1 = mor.estimators_[0]
    mor.partial_fit(X, y)
    est2 = mor.estimators_[0]
    if cpu_count() > 1:
        # parallelism requires this to be the case for a sane implementation
        assert est1 is not est2


# check multioutput has predict_proba
def test_hasattr_multi_output_predict_proba():
    # default SGDClassifier has loss='hinge'
    # which does not expose a predict_proba method
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    assert not hasattr(multi_target_linear, "predict_proba")

    # case where predict_proba attribute exists
    sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    assert hasattr(multi_target_linear, "predict_proba")


# check predict_proba passes
def test_multi_output_predict_proba():
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
    param = {"loss": ("hinge", "log_loss", "modified_huber")}

    # inner function for custom scoring
    def custom_scorer(estimator, X, y):
        if hasattr(estimator, "predict_proba"):
            return 1.0
        else:
            return 0.0

    grid_clf = GridSearchCV(
        sgd_linear_clf,
        param_grid=param,
        scoring=custom_scorer,
        cv=3,
        error_score="raise",
    )
    multi_target_linear = MultiOutputClassifier(grid_clf)
    multi_target_linear.fit(X, y)
    multi_target_linear.predict_proba(X)

    # SGDClassifier defaults to loss='hinge' which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    err_msg = "probability estimates are not available for loss='hinge'"
    with pytest.raises(AttributeError, match=err_msg):
        multi_target_linear.predict_proba(X)


def test_multi_output_classification_partial_fit():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict
    sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)

    # train the multi_target_linear and also get the predictions.
    half_index = X.shape[0] // 2
    multi_target_linear.partial_fit(X[:half_index], y[:half_index], classes=classes)

    first_predictions = multi_target_linear.predict(X)
    assert (n_samples, n_outputs) == first_predictions.shape

    multi_target_linear.partial_fit(X[half_index:], y[half_index:])
    second_predictions = multi_target_linear.predict(X)
    assert (n_samples, n_outputs) == second_predictions.shape

    # train the linear classification with each column and assert that
    # predictions are equal after first partial_fit and second partial_fit
    for i in range(3):
        # create a clone with the same state
        sgd_linear_clf = clone(sgd_linear_clf)
        sgd_linear_clf.partial_fit(
            X[:half_index], y[:half_index, i], classes=classes[i]
        )
        assert_array_equal(sgd_linear_clf.predict(X), first_predictions[:, i])
        sgd_linear_clf.partial_fit(X[half_index:], y[half_index:, i])
        assert_array_equal(sgd_linear_clf.predict(X), second_predictions[:, i])


def test_multi_output_classification_partial_fit_no_first_classes_exception():
    sgd_linear_clf = SGDClassifier(loss="log_loss", random_state=1, max_iter=5)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    msg = "classes must be passed on the first call to partial_fit."
    with pytest.raises(ValueError, match=msg):
        multi_target_linear.partial_fit(X, y)


def test_multi_output_classification():
    # test if multi_target initializes correctly with base estimator and fit
    # assert predictions work as expected for predict, predict_proba and score
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    multi_target_forest = MultiOutputClassifier(forest)

    # train the multi_target_forest and also get the predictions.
    multi_target_forest.fit(X, y)

    predictions = multi_target_forest.predict(X)
    assert (n_samples, n_outputs) == predictions.shape

    predict_proba = multi_target_forest.predict_proba(X)

    assert len(predict_proba) == n_outputs
    for class_probabilities in predict_proba:
        assert (n_samples, n_classes) == class_probabilities.shape

    assert_array_equal(np.argmax(np.dstack(predict_proba), axis=1), predictions)

    # train the forest with each column and assert that predictions are equal
    for i in range(3):
        forest_ = clone(forest)  # create a clone with the same state
        forest_.fit(X, y[:, i])
        assert list(forest_.predict(X)) == list(predictions[:, i])
        assert_array_equal(list(forest_.predict_proba(X)), list(predict_proba[i]))


def test_multiclass_multioutput_estimator():
    # test to check meta of meta estimators
    svc = LinearSVC(dual="auto", random_state=0)
    multi_class_svc = OneVsRestClassifier(svc)
    multi_target_svc = MultiOutputClassifier(multi_class_svc)

    multi_target_svc.fit(X, y)

    predictions = multi_target_svc.predict(X)
    assert (n_samples, n_outputs) == predictions.shape

    # train the svc with each column and assert that predictions are equal
    for i in range(3):
        multi_class_svc_ = clone(multi_class_svc)  # create a clone
        multi_class_svc_.fit(X, y[:, i])
        assert list(multi_class_svc_.predict(X)) == list(predictions[:, i])


def test_multiclass_multioutput_estimator_predict_proba():
    seed = 542

    # make test deterministic
    rng = np.random.RandomState(seed)

    # random features
    X = rng.normal(size=(5, 5))

    # random labels
    y1 = np.array(["b", "a", "a", "b", "a"]).reshape(5, 1)  # 2 classes
    y2 = np.array(["d", "e", "f", "e", "d"]).reshape(5, 1)  # 3 classes

    Y = np.concatenate([y1, y2], axis=1)

    clf = MultiOutputClassifier(
        LogisticRegression(solver="liblinear", random_state=seed)
    )

    clf.fit(X, Y)

    y_result = clf.predict_proba(X)
    y_actual = [
        np.array(
            [
                [0.23481764, 0.76518236],
                [0.67196072, 0.32803928],
                [0.54681448, 0.45318552],
                [0.34883923, 0.65116077],
                [0.73687069, 0.26312931],
            ]
        ),
        np.array(
            [
                [0.5171785, 0.23878628, 0.24403522],
                [0.22141451, 0.64102704, 0.13755846],
                [0.16751315, 0.18256843, 0.64991843],
                [0.27357372, 0.55201592, 0.17441036],
                [0.65745193, 0.26062899, 0.08191907],
            ]
        ),
    ]

    for i in range(len(y_actual)):
        assert_almost_equal(y_result[i], y_actual[i])


def test_multi_output_classification_sample_weights():
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6]]
    yw = [[3, 2], [2, 3]]
    w = np.asarray([2.0, 1.0])
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    clf_w = MultiOutputClassifier(forest)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6]]
    y = [[3, 2], [3, 2], [2, 3]]
    forest = RandomForestClassifier(n_estimators=10, random_state=1)
    clf = MultiOutputClassifier(forest)
    clf.fit(X, y)

    X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
    assert_almost_equal(clf.predict(X_test), clf_w.predict(X_test))


def test_multi_output_classification_partial_fit_sample_weights():
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    yw = [[3, 2], [2, 3], [3, 2]]
    w = np.asarray([2.0, 1.0, 1.0])
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=20)
    clf_w = MultiOutputClassifier(sgd_linear_clf)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [3, 2], [2, 3], [3, 2]]
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=20)
    clf = MultiOutputClassifier(sgd_linear_clf)
    clf.fit(X, y)
    X_test = [[1.5, 2.5, 3.5]]
    assert_array_almost_equal(clf.predict(X_test), clf_w.predict(X_test))


def test_multi_output_exceptions():
    # NotFittedError when fit is not done but score, predict and
    # predict_proba are called
    moc = MultiOutputClassifier(LinearSVC(dual="auto", random_state=0))

    with pytest.raises(NotFittedError):
        moc.score(X, y)

    # ValueError when number of outputs is different
    # for fit and score
    y_new = np.column_stack((y1, y2))
    moc.fit(X, y)
    with pytest.raises(ValueError):
        moc.score(X, y_new)

    # ValueError when y is continuous
    msg = "Unknown label type"
    with pytest.raises(ValueError, match=msg):
        moc.fit(X, X[:, 1])


@pytest.mark.parametrize("response_method", ["predict_proba", "predict"])
def test_multi_output_not_fitted_error(response_method):
    """Check that we raise the proper error when the estimator is not fitted."""
    moc = MultiOutputClassifier(LogisticRegression())
    with pytest.raises(NotFittedError):
        getattr(moc, response_method)(X)


def test_multi_output_delegate_predict_proba():
    """Check the behavior for the delegation of predict_proba to the
    underlying estimator."""

    # A base estimator with `predict_proba` should expose the method even
    # before fit
    moc = MultiOutputClassifier(LogisticRegression())
    assert hasattr(moc, "predict_proba")
    moc.fit(X, y)
    assert hasattr(moc, "predict_proba")

    # A base estimator without `predict_proba` should raise an AttributeError
    moc = MultiOutputClassifier(LinearSVC(dual="auto"))
    assert not hasattr(moc, "predict_proba")

    msg = "'LinearSVC' object has no attribute 'predict_proba'"
    with pytest.raises(AttributeError, match=msg):
        moc.predict_proba(X)
    moc.fit(X, y)
    assert not hasattr(moc, "predict_proba")
    with pytest.raises(AttributeError, match=msg):
        moc.predict_proba(X)


def generate_multilabel_dataset_with_correlations():
    # Generate a multilabel dataset from a multiclass dataset by encoding the
    # integer class label of each sample in binary, so that the resulting
    # output labels are correlated.
    X, y = make_classification(
        n_samples=1000, n_features=100, n_classes=16, n_informative=10, random_state=0
    )

    Y_multi = np.array([[int(yyy) for yyy in format(yy, "#06b")[2:]] for yy in y])
    return X, Y_multi
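
# An illustrative example of the binary encoding above (the value 5 is just
# for demonstration): `format(yy, "#06b")` renders an integer as a zero-padded
# binary literal of total width 6 including the "0b" prefix, which `[2:]`
# then strips:
#   format(5, "#06b")      == "0b0101"
#   format(5, "#06b")[2:]  == "0101"
# so each of the 16 classes maps to 4 correlated binary output labels.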


def test_classifier_chain_fit_and_predict_with_linear_svc():
    # Fit classifier chain and verify predict performance using LinearSVC
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain = ClassifierChain(LinearSVC(dual="auto"))
    classifier_chain.fit(X, Y)

    Y_pred = classifier_chain.predict(X)
    assert Y_pred.shape == Y.shape

    Y_decision = classifier_chain.decision_function(X)

    Y_binary = Y_decision >= 0
    assert_array_equal(Y_binary, Y_pred)
    assert not hasattr(classifier_chain, "predict_proba")


def test_classifier_chain_fit_and_predict_with_sparse_data():
    # Fit classifier chain with sparse data
    X, Y = generate_multilabel_dataset_with_correlations()
    X_sparse = sp.csr_matrix(X)

    classifier_chain = ClassifierChain(LogisticRegression())
    classifier_chain.fit(X_sparse, Y)
    Y_pred_sparse = classifier_chain.predict(X_sparse)

    classifier_chain = ClassifierChain(LogisticRegression())
    classifier_chain.fit(X, Y)
    Y_pred_dense = classifier_chain.predict(X)

    assert_array_equal(Y_pred_sparse, Y_pred_dense)


def test_classifier_chain_vs_independent_models():
    # Verify that a classifier chain can achieve a higher Jaccard similarity
    # score than a set of independent one-vs-rest models on correlated labels
    X, Y = generate_multilabel_dataset_with_correlations()
    X_train = X[:600, :]
    X_test = X[600:, :]
    Y_train = Y[:600, :]
    Y_test = Y[600:, :]

    ovr = OneVsRestClassifier(LogisticRegression())
    ovr.fit(X_train, Y_train)
    Y_pred_ovr = ovr.predict(X_test)

    chain = ClassifierChain(LogisticRegression())
    chain.fit(X_train, Y_train)
    Y_pred_chain = chain.predict(X_test)

    assert jaccard_score(Y_test, Y_pred_chain, average="samples") > jaccard_score(
        Y_test, Y_pred_ovr, average="samples"
    )


def test_base_chain_fit_and_predict():
    # Fit base chain and verify predict performance
    X, Y = generate_multilabel_dataset_with_correlations()
    chains = [RegressorChain(Ridge()), ClassifierChain(LogisticRegression())]
    for chain in chains:
        chain.fit(X, Y)
        Y_pred = chain.predict(X)
        assert Y_pred.shape == Y.shape
        # each estimator in the chain sees the predictions of the previous
        # estimators as extra features, so coef_ grows by one per position
        assert [c.coef_.size for c in chain.estimators_] == list(
            range(X.shape[1], X.shape[1] + Y.shape[1])
        )

    Y_prob = chains[1].predict_proba(X)
    Y_binary = Y_prob >= 0.5
    assert_array_equal(Y_binary, Y_pred)

    assert isinstance(chains[1], ClassifierMixin)


def test_base_chain_fit_and_predict_with_sparse_data_and_cv():
    # Fit base chain with sparse data and cross_val_predict
    X, Y = generate_multilabel_dataset_with_correlations()
    X_sparse = sp.csr_matrix(X)
    base_chains = [
        ClassifierChain(LogisticRegression(), cv=3),
        RegressorChain(Ridge(), cv=3),
    ]
    for chain in base_chains:
        chain.fit(X_sparse, Y)
        Y_pred = chain.predict(X_sparse)
        assert Y_pred.shape == Y.shape


def test_base_chain_random_order():
    # Fit base chain with random order
    X, Y = generate_multilabel_dataset_with_correlations()
    for chain in [ClassifierChain(LogisticRegression()), RegressorChain(Ridge())]:
        chain_random = clone(chain).set_params(order="random", random_state=42)
        chain_random.fit(X, Y)
        chain_fixed = clone(chain).set_params(order=chain_random.order_)
        chain_fixed.fit(X, Y)
        assert_array_equal(chain_fixed.order_, chain_random.order_)
        assert list(chain_random.order_) != list(range(4))
        assert len(chain_random.order_) == 4
        assert len(set(chain_random.order_)) == 4
        # Randomly ordered chain should behave identically to a fixed order
        # chain with the same order.
        for est1, est2 in zip(chain_random.estimators_, chain_fixed.estimators_):
            assert_array_almost_equal(est1.coef_, est2.coef_)


def test_base_chain_crossval_fit_and_predict():
    # Fit chain with cross_val_predict and verify predict
    # performance
    X, Y = generate_multilabel_dataset_with_correlations()

    for chain in [ClassifierChain(LogisticRegression()), RegressorChain(Ridge())]:
        chain.fit(X, Y)
        chain_cv = clone(chain).set_params(cv=3)
        chain_cv.fit(X, Y)
        Y_pred_cv = chain_cv.predict(X)
        Y_pred = chain.predict(X)

        assert Y_pred_cv.shape == Y_pred.shape
        assert not np.all(Y_pred == Y_pred_cv)
        if isinstance(chain, ClassifierChain):
            assert jaccard_score(Y, Y_pred_cv, average="samples") > 0.4
        else:
            assert mean_squared_error(Y, Y_pred_cv) < 0.25


@pytest.mark.parametrize(
    "estimator",
    [
        RandomForestClassifier(n_estimators=2),
        MultiOutputClassifier(RandomForestClassifier(n_estimators=2)),
        ClassifierChain(RandomForestClassifier(n_estimators=2)),
    ],
)
def test_multi_output_classes_(estimator):
    # Tests classes_ attribute of multioutput classifiers
    # RandomForestClassifier supports multioutput out-of-the-box
    estimator.fit(X, y)
    assert isinstance(estimator.classes_, list)
    assert len(estimator.classes_) == n_outputs
    for estimator_classes, expected_classes in zip(classes, estimator.classes_):
        assert_array_equal(estimator_classes, expected_classes)


class DummyRegressorWithFitParams(DummyRegressor):
    def fit(self, X, y, sample_weight=None, **fit_params):
        self._fit_params = fit_params
        return super().fit(X, y, sample_weight)


class DummyClassifierWithFitParams(DummyClassifier):
    def fit(self, X, y, sample_weight=None, **fit_params):
        self._fit_params = fit_params
        return super().fit(X, y, sample_weight)
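
# The two Dummy*WithFitParams helpers above record whatever extra fit params
# they receive in `_fit_params`; the parametrized test below relies on this
# to assert that the multioutput meta-estimators forward `some_param` to
# every underlying sub-estimator.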


@pytest.mark.filterwarnings("ignore:`n_features_in_` is deprecated")
@pytest.mark.parametrize(
    "estimator, dataset",
    [
        (
            MultiOutputClassifier(DummyClassifierWithFitParams(strategy="prior")),
            datasets.make_multilabel_classification(),
        ),
        (
            MultiOutputRegressor(DummyRegressorWithFitParams()),
            datasets.make_regression(n_targets=3, random_state=0),
        ),
    ],
)
def test_multioutput_estimator_with_fit_params(estimator, dataset):
    X, y = dataset
    some_param = np.zeros_like(X)
    estimator.fit(X, y, some_param=some_param)
    for dummy_estimator in estimator.estimators_:
        assert "some_param" in dummy_estimator._fit_params


def test_regressor_chain_w_fit_params():
    # Make sure fit_params are properly propagated to the sub-estimators
    rng = np.random.RandomState(0)
    X, y = datasets.make_regression(n_targets=3, random_state=0)
    weight = rng.rand(y.shape[0])

    class MySGD(SGDRegressor):
        def fit(self, X, y, **fit_params):
            self.sample_weight_ = fit_params["sample_weight"]
            super().fit(X, y, **fit_params)

    model = RegressorChain(MySGD())

    # Fitting with params
    fit_param = {"sample_weight": weight}
    model.fit(X, y, **fit_param)

    for est in model.estimators_:
        assert est.sample_weight_ is weight


@pytest.mark.parametrize(
    "MultiOutputEstimator, Estimator",
    [(MultiOutputClassifier, LogisticRegression), (MultiOutputRegressor, Ridge)],
)
# FIXME: we should move this test in `estimator_checks` once we are able
# to construct meta-estimator instances
def test_support_missing_values(MultiOutputEstimator, Estimator):
    # smoke test to check that multioutput estimators delegate the validation
    # of missing values to the underlying pipeline, regressor or classifier
    rng = np.random.RandomState(42)
    X, y = rng.randn(50, 2), rng.binomial(1, 0.5, (50, 3))
    mask = rng.choice([1, 0], X.shape, p=[0.01, 0.99]).astype(bool)
    X[mask] = np.nan

    pipe = make_pipeline(SimpleImputer(), Estimator())
    MultiOutputEstimator(pipe).fit(X, y).score(X, y)


@pytest.mark.parametrize("order_type", [list, np.array, tuple])
def test_classifier_chain_tuple_order(order_type):
    X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [2, 3], [3, 2]]
    order = order_type([1, 0])

    chain = ClassifierChain(RandomForestClassifier(), order=order)

    chain.fit(X, y)
    X_test = [[1.5, 2.5, 3.5]]
    y_test = [[3, 2]]
    assert_array_almost_equal(chain.predict(X_test), y_test)


def test_classifier_chain_tuple_invalid_order():
    X = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [2, 3], [3, 2]]
    order = tuple([1, 2])

    chain = ClassifierChain(RandomForestClassifier(), order=order)

    with pytest.raises(ValueError, match="invalid order"):
        chain.fit(X, y)


def test_classifier_chain_verbose(capsys):
    X, y = make_multilabel_classification(
        n_samples=100, n_features=5, n_classes=3, n_labels=3, random_state=0
    )
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    pattern = (
        r"\[Chain\].*\(1 of 3\) Processing order 0, total=.*\n"
        r"\[Chain\].*\(2 of 3\) Processing order 1, total=.*\n"
        r"\[Chain\].*\(3 of 3\) Processing order 2, total=.*\n$"
    )

    classifier = ClassifierChain(
        DecisionTreeClassifier(),
        order=[0, 1, 2],
        random_state=0,
        verbose=True,
    )
    classifier.fit(X_train, y_train)
    assert re.match(pattern, capsys.readouterr()[0])


def test_regressor_chain_verbose(capsys):
    X, y = make_regression(n_samples=125, n_targets=3, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    pattern = (
        r"\[Chain\].*\(1 of 3\) Processing order 1, total=.*\n"
        r"\[Chain\].*\(2 of 3\) Processing order 0, total=.*\n"
        r"\[Chain\].*\(3 of 3\) Processing order 2, total=.*\n$"
    )

    regressor = RegressorChain(
        LinearRegression(),
        order=[1, 0, 2],
        random_state=0,
        verbose=True,
    )
    regressor.fit(X_train, y_train)
    assert re.match(pattern, capsys.readouterr()[0])


def test_multioutputregressor_ducktypes_fitted_estimator():
    """Test that MultiOutputRegressor checks the fitted estimator for
    predict. Non-regression test for #16549."""
    X, y = load_linnerud(return_X_y=True)
    stacker = StackingRegressor(
        estimators=[("sgd", SGDRegressor(random_state=1))],
        final_estimator=Ridge(),
        cv=2,
    )

    reg = MultiOutputRegressor(estimator=stacker).fit(X, y)

    # Does not raise
    reg.predict(X)


@pytest.mark.parametrize(
    "Cls, method", [(ClassifierChain, "fit"), (MultiOutputClassifier, "partial_fit")]
)
def test_fit_params_no_routing(Cls, method):
    """Check that we raise an error when passing metadata not requested by the
    underlying classifier.
    """
    X, y = make_classification(n_samples=50)
    clf = Cls(PassiveAggressiveClassifier())

    with pytest.raises(ValueError, match="is only supported if"):
        getattr(clf, method)(X, y, test=1)


def test_multioutput_regressor_has_partial_fit():
    # Test that an unfitted MultiOutputRegressor handles available_if for
    # partial_fit correctly
    est = MultiOutputRegressor(LinearRegression())
    msg = "This 'MultiOutputRegressor' has no attribute 'partial_fit'"
    with pytest.raises(AttributeError, match=msg):
        getattr(est, "partial_fit")
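
# A minimal sketch of running this module directly (assuming pytest is
# installed and scikit-learn is importable from the current environment):
#   pytest test_multioutput.py -q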