  1. """Testing for the VotingClassifier and VotingRegressor"""
  2. import re
  3. import numpy as np
  4. import pytest
  5. from sklearn import datasets
  6. from sklearn.base import BaseEstimator, ClassifierMixin, clone
  7. from sklearn.datasets import make_multilabel_classification
  8. from sklearn.dummy import DummyRegressor
  9. from sklearn.ensemble import (
  10. RandomForestClassifier,
  11. RandomForestRegressor,
  12. VotingClassifier,
  13. VotingRegressor,
  14. )
  15. from sklearn.exceptions import NotFittedError
  16. from sklearn.linear_model import LinearRegression, LogisticRegression
  17. from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
  18. from sklearn.multiclass import OneVsRestClassifier
  19. from sklearn.naive_bayes import GaussianNB
  20. from sklearn.neighbors import KNeighborsClassifier
  21. from sklearn.preprocessing import StandardScaler
  22. from sklearn.svm import SVC
  23. from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
  24. from sklearn.utils._testing import (
  25. assert_almost_equal,
  26. assert_array_almost_equal,
  27. assert_array_equal,
  28. )

# Load datasets
iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

# Scaled to avoid the ConvergenceWarning thrown by LogisticRegression
X_scaled = StandardScaler().fit_transform(X)

X_r, y_r = datasets.load_diabetes(return_X_y=True)
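

# For orientation: VotingClassifier combines its members in one of two ways.
# With voting="hard", each estimator casts its predicted label and the
# (optionally weighted) majority wins; with voting="soft", the class
# probabilities are averaged. A rough sketch of the soft rule (not executed
# here; `fitted`, `w`, and `classes_` are placeholder names):
#
#     probas = np.asarray([clf.predict_proba(X) for clf in fitted])
#     avg = np.average(probas, axis=0, weights=w)
#     y_pred = classes_[np.argmax(avg, axis=1)]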


@pytest.mark.parametrize(
    "params, err_msg",
    [
        (
            {"estimators": []},
            "Invalid 'estimators' attribute, 'estimators' should be a non-empty list",
        ),
        (
            {"estimators": [("lr", LogisticRegression())], "weights": [1, 2]},
            "Number of `estimators` and weights must be equal",
        ),
    ],
)
def test_voting_classifier_estimator_init(params, err_msg):
    ensemble = VotingClassifier(**params)
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)


def test_predictproba_hardvoting():
    eclf = VotingClassifier(
        estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())],
        voting="hard",
    )
    msg = "predict_proba is not available when voting='hard'"
    with pytest.raises(AttributeError, match=msg):
        eclf.predict_proba

    assert not hasattr(eclf, "predict_proba")
    eclf.fit(X_scaled, y)
    assert not hasattr(eclf, "predict_proba")


def test_notfitted():
    eclf = VotingClassifier(
        estimators=[("lr1", LogisticRegression()), ("lr2", LogisticRegression())],
        voting="soft",
    )
    ereg = VotingRegressor([("dr", DummyRegressor())])
    msg = (
        "This %s instance is not fitted yet. Call 'fit'"
        " with appropriate arguments before using this estimator."
    )
    with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
        eclf.predict(X)
    with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
        eclf.predict_proba(X)
    with pytest.raises(NotFittedError, match=msg % "VotingClassifier"):
        eclf.transform(X)
    with pytest.raises(NotFittedError, match=msg % "VotingRegressor"):
        ereg.predict(X_r)
    with pytest.raises(NotFittedError, match=msg % "VotingRegressor"):
        ereg.transform(X_r)


def test_majority_label_iris(global_random_seed):
    """Check classification by majority label on dataset iris."""
    clf1 = LogisticRegression(solver="liblinear", random_state=global_random_seed)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
    clf3 = GaussianNB()
    eclf = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard"
    )
    scores = cross_val_score(eclf, X, y, scoring="accuracy")
    assert scores.mean() >= 0.9


def test_tie_situation():
    """Check voting classifier selects smaller class label in tie situation."""
    clf1 = LogisticRegression(random_state=123, solver="liblinear")
    clf2 = RandomForestClassifier(random_state=123)
    eclf = VotingClassifier(estimators=[("lr", clf1), ("rf", clf2)], voting="hard")
    assert clf1.fit(X, y).predict(X)[73] == 2
    assert clf2.fit(X, y).predict(X)[73] == 1
    assert eclf.fit(X, y).predict(X)[73] == 1
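

# The tie above is broken in favor of the smaller encoded label: hard voting
# takes an argmax over the (weighted) vote counts, and np.argmax returns the
# first maximum it encounters, i.e. the lowest class index.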


def test_weights_iris(global_random_seed):
    """Check classification by average probabilities on dataset iris."""
    clf1 = LogisticRegression(random_state=global_random_seed)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
    clf3 = GaussianNB()
    eclf = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
        voting="soft",
        weights=[1, 2, 10],
    )
    scores = cross_val_score(eclf, X_scaled, y, scoring="accuracy")
    assert scores.mean() >= 0.9


def test_weights_regressor():
    """Check weighted average regression prediction on diabetes dataset."""
    reg1 = DummyRegressor(strategy="mean")
    reg2 = DummyRegressor(strategy="median")
    reg3 = DummyRegressor(strategy="quantile", quantile=0.2)
    ereg = VotingRegressor(
        [("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 2, 10]
    )
    X_r_train, X_r_test, y_r_train, y_r_test = train_test_split(
        X_r, y_r, test_size=0.25
    )

    reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test)
    reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test)
    reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test)
    ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)

    avg = np.average(
        np.asarray([reg1_pred, reg2_pred, reg3_pred]), axis=0, weights=[1, 2, 10]
    )
    assert_almost_equal(ereg_pred, avg, decimal=2)

    ereg_weights_none = VotingRegressor(
        [("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=None
    )
    ereg_weights_equal = VotingRegressor(
        [("mean", reg1), ("median", reg2), ("quantile", reg3)], weights=[1, 1, 1]
    )
    ereg_weights_none.fit(X_r_train, y_r_train)
    ereg_weights_equal.fit(X_r_train, y_r_train)
    ereg_none_pred = ereg_weights_none.predict(X_r_test)
    ereg_equal_pred = ereg_weights_equal.predict(X_r_test)
    assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2)
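

# VotingRegressor.predict is expected to match the weighted mean computed by
# hand above; a minimal equivalent sketch (`fitted` and `w` are placeholder
# names, not part of this module):
#
#     np.average([est.predict(X) for est in fitted], axis=0, weights=w)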


def test_predict_on_toy_problem(global_random_seed):
    """Manually check predicted class labels for toy dataset."""
    clf1 = LogisticRegression(random_state=global_random_seed)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
    clf3 = GaussianNB()

    X = np.array(
        [[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2], [2.1, 1.4], [3.1, 2.3]]
    )
    y = np.array([1, 1, 1, 2, 2, 2])

    assert_array_equal(clf1.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
    assert_array_equal(clf2.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])
    assert_array_equal(clf3.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])

    eclf = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
        voting="hard",
        weights=[1, 1, 1],
    )
    assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])

    eclf = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
        voting="soft",
        weights=[1, 1, 1],
    )
    assert_array_equal(eclf.fit(X, y).predict(X), [1, 1, 1, 2, 2, 2])


def test_predict_proba_on_toy_problem():
    """Calculate predicted probabilities on toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    clf1_res = np.array(
        [
            [0.59790391, 0.40209609],
            [0.57622162, 0.42377838],
            [0.50728456, 0.49271544],
            [0.40241774, 0.59758226],
        ]
    )
    clf2_res = np.array([[0.8, 0.2], [0.8, 0.2], [0.2, 0.8], [0.3, 0.7]])
    clf3_res = np.array(
        [[0.9985082, 0.0014918], [0.99845843, 0.00154157], [0.0, 1.0], [0.0, 1.0]]
    )

    t00 = (2 * clf1_res[0][0] + clf2_res[0][0] + clf3_res[0][0]) / 4
    t11 = (2 * clf1_res[1][1] + clf2_res[1][1] + clf3_res[1][1]) / 4
    t21 = (2 * clf1_res[2][1] + clf2_res[2][1] + clf3_res[2][1]) / 4
    t31 = (2 * clf1_res[3][1] + clf2_res[3][1] + clf3_res[3][1]) / 4

    eclf = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
        voting="soft",
        weights=[2, 1, 1],
    )
    eclf_res = eclf.fit(X, y).predict_proba(X)

    assert_almost_equal(t00, eclf_res[0][0], decimal=1)
    assert_almost_equal(t11, eclf_res[1][1], decimal=1)
    assert_almost_equal(t21, eclf_res[2][1], decimal=1)
    assert_almost_equal(t31, eclf_res[3][1], decimal=1)

    with pytest.raises(
        AttributeError, match="predict_proba is not available when voting='hard'"
    ):
        eclf = VotingClassifier(
            estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="hard"
        )
        eclf.fit(X, y).predict_proba(X)
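

# The hand-computed targets t00..t31 above follow the soft-voting rule with
# weights [2, 1, 1]: p = (2 * p_lr + 1 * p_rf + 1 * p_gnb) / (2 + 1 + 1),
# applied entry-wise to the stacked predict_proba outputs.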


def test_multilabel():
    """Check if error is raised for multilabel classification."""
    X, y = make_multilabel_classification(
        n_classes=2, n_labels=1, allow_unlabeled=False, random_state=123
    )
    clf = OneVsRestClassifier(SVC(kernel="linear"))
    eclf = VotingClassifier(estimators=[("ovr", clf)], voting="hard")

    try:
        eclf.fit(X, y)
    except NotImplementedError:
        return


def test_gridsearch():
    """Check GridSearch support."""
    clf1 = LogisticRegression(random_state=1)
    clf2 = RandomForestClassifier(random_state=1, n_estimators=3)
    clf3 = GaussianNB()
    eclf = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft"
    )

    params = {
        "lr__C": [1.0, 100.0],
        "voting": ["soft", "hard"],
        "weights": [[0.5, 0.5, 0.5], [1.0, 0.5, 0.5]],
    }

    grid = GridSearchCV(estimator=eclf, param_grid=params, cv=2)
    grid.fit(X_scaled, y)


def test_parallel_fit(global_random_seed):
    """Check parallel backend of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=global_random_seed)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    eclf1 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=1
    ).fit(X, y)
    eclf2 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft", n_jobs=2
    ).fit(X, y)

    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))


def test_sample_weight(global_random_seed):
    """Tests sample_weight parameter of VotingClassifier."""
    clf1 = LogisticRegression(random_state=global_random_seed)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
    clf3 = SVC(probability=True, random_state=global_random_seed)
    eclf1 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft"
    ).fit(X_scaled, y, sample_weight=np.ones((len(y),)))
    eclf2 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("svc", clf3)], voting="soft"
    ).fit(X_scaled, y)
    assert_array_equal(eclf1.predict(X_scaled), eclf2.predict(X_scaled))
    assert_array_almost_equal(
        eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
    )

    sample_weight = np.random.RandomState(global_random_seed).uniform(size=(len(y),))
    eclf3 = VotingClassifier(estimators=[("lr", clf1)], voting="soft")
    eclf3.fit(X_scaled, y, sample_weight)
    clf1.fit(X_scaled, y, sample_weight)
    assert_array_equal(eclf3.predict(X_scaled), clf1.predict(X_scaled))
    assert_array_almost_equal(
        eclf3.predict_proba(X_scaled), clf1.predict_proba(X_scaled)
    )

    # check that an error is raised and indicative if sample_weight is not
    # supported.
    clf4 = KNeighborsClassifier()
    eclf3 = VotingClassifier(
        estimators=[("lr", clf1), ("svc", clf3), ("knn", clf4)], voting="soft"
    )
    msg = "Underlying estimator KNeighborsClassifier does not support sample weights."
    with pytest.raises(TypeError, match=msg):
        eclf3.fit(X_scaled, y, sample_weight)

    # check that _fit_single_estimator will raise the right error
    # it should raise the original error if this is not linked to sample_weight
    class ClassifierErrorFit(ClassifierMixin, BaseEstimator):
        def fit(self, X_scaled, y, sample_weight):
            raise TypeError("Error unrelated to sample_weight.")

    clf = ClassifierErrorFit()
    with pytest.raises(TypeError, match="Error unrelated to sample_weight"):
        clf.fit(X_scaled, y, sample_weight=sample_weight)


def test_sample_weight_kwargs():
    """Check that VotingClassifier passes sample_weight as kwargs."""

    class MockClassifier(ClassifierMixin, BaseEstimator):
        """Mock Classifier to check that sample_weight is received as kwargs."""

        def fit(self, X, y, *args, **sample_weight):
            assert "sample_weight" in sample_weight

    clf = MockClassifier()
    eclf = VotingClassifier(estimators=[("mock", clf)], voting="soft")

    # Should not raise an error.
    eclf.fit(X, y, sample_weight=np.ones((len(y),)))


def test_voting_classifier_set_params(global_random_seed):
    # check equivalence in the output when setting underlying estimators
    clf1 = LogisticRegression(random_state=global_random_seed)
    clf2 = RandomForestClassifier(
        n_estimators=10, random_state=global_random_seed, max_depth=None
    )
    clf3 = GaussianNB()

    eclf1 = VotingClassifier(
        [("lr", clf1), ("rf", clf2)], voting="soft", weights=[1, 2]
    ).fit(X_scaled, y)
    eclf2 = VotingClassifier(
        [("lr", clf1), ("nb", clf3)], voting="soft", weights=[1, 2]
    )
    eclf2.set_params(nb=clf2).fit(X_scaled, y)

    assert_array_equal(eclf1.predict(X_scaled), eclf2.predict(X_scaled))
    assert_array_almost_equal(
        eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
    )
    assert eclf2.estimators[0][1].get_params() == clf1.get_params()
    assert eclf2.estimators[1][1].get_params() == clf2.get_params()


def test_set_estimator_drop():
    # VotingClassifier set_params should be able to set estimators as drop
    # Test predict
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=123)
    clf3 = GaussianNB()
    eclf1 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
        voting="hard",
        weights=[1, 0, 0.5],
    ).fit(X, y)

    eclf2 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("nb", clf3)],
        voting="hard",
        weights=[1, 1, 0.5],
    )
    eclf2.set_params(rf="drop").fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))

    assert dict(eclf2.estimators)["rf"] == "drop"
    assert len(eclf2.estimators_) == 2
    assert all(
        isinstance(est, (LogisticRegression, GaussianNB)) for est in eclf2.estimators_
    )
    assert eclf2.get_params()["rf"] == "drop"

    eclf1.set_params(voting="soft").fit(X, y)
    eclf2.set_params(voting="soft").fit(X, y)
    assert_array_equal(eclf1.predict(X), eclf2.predict(X))
    assert_array_almost_equal(eclf1.predict_proba(X), eclf2.predict_proba(X))

    msg = "All estimators are dropped. At least one is required"
    with pytest.raises(ValueError, match=msg):
        eclf2.set_params(lr="drop", rf="drop", nb="drop").fit(X, y)

    # Test soft voting transform
    X1 = np.array([[1], [2]])
    y1 = np.array([1, 2])
    eclf1 = VotingClassifier(
        estimators=[("rf", clf2), ("nb", clf3)],
        voting="soft",
        weights=[0, 0.5],
        flatten_transform=False,
    ).fit(X1, y1)

    eclf2 = VotingClassifier(
        estimators=[("rf", clf2), ("nb", clf3)],
        voting="soft",
        weights=[1, 0.5],
        flatten_transform=False,
    )
    eclf2.set_params(rf="drop").fit(X1, y1)
    assert_array_almost_equal(
        eclf1.transform(X1),
        np.array([[[0.7, 0.3], [0.3, 0.7]], [[1.0, 0.0], [0.0, 1.0]]]),
    )
    assert_array_almost_equal(eclf2.transform(X1), np.array([[[1.0, 0.0], [0.0, 1.0]]]))

    eclf1.set_params(voting="hard")
    eclf2.set_params(voting="hard")
    assert_array_equal(eclf1.transform(X1), np.array([[0, 0], [1, 1]]))
    assert_array_equal(eclf2.transform(X1), np.array([[0], [1]]))
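

# Note on the hard-voting transform above: it returns one column per
# remaining estimator containing that estimator's predicted class, using the
# label-encoded classes (0 and 1) rather than the original labels (1 and 2),
# which is why dropping "rf" shrinks the output from two columns to one.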


def test_estimator_weights_format(global_random_seed):
    # Test estimator weights inputs as list and array
    clf1 = LogisticRegression(random_state=global_random_seed)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
    eclf1 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2)], weights=[1, 2], voting="soft"
    )
    eclf2 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2)], weights=np.array((1, 2)), voting="soft"
    )
    eclf1.fit(X_scaled, y)
    eclf2.fit(X_scaled, y)
    assert_array_almost_equal(
        eclf1.predict_proba(X_scaled), eclf2.predict_proba(X_scaled)
    )


def test_transform(global_random_seed):
    """Check transform method of VotingClassifier on toy dataset."""
    clf1 = LogisticRegression(random_state=global_random_seed)
    clf2 = RandomForestClassifier(n_estimators=10, random_state=global_random_seed)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    eclf1 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)], voting="soft"
    ).fit(X, y)
    eclf2 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
        voting="soft",
        flatten_transform=True,
    ).fit(X, y)
    eclf3 = VotingClassifier(
        estimators=[("lr", clf1), ("rf", clf2), ("gnb", clf3)],
        voting="soft",
        flatten_transform=False,
    ).fit(X, y)

    assert_array_equal(eclf1.transform(X).shape, (4, 6))
    assert_array_equal(eclf2.transform(X).shape, (4, 6))
    assert_array_equal(eclf3.transform(X).shape, (3, 4, 2))
    assert_array_almost_equal(eclf1.transform(X), eclf2.transform(X))
    assert_array_almost_equal(
        eclf3.transform(X).swapaxes(0, 1).reshape((4, 6)), eclf2.transform(X)
    )
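

# Shape summary for the soft-voting transforms asserted above, with
# n_samples=4, 3 estimators and 2 classes:
#     flatten_transform=True (the default) -> (4, 3 * 2) == (4, 6)
#     flatten_transform=False              -> (3, 4, 2)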


@pytest.mark.parametrize(
    "X, y, voter",
    [
        (
            X,
            y,
            VotingClassifier(
                [
                    ("lr", LogisticRegression()),
                    ("rf", RandomForestClassifier(n_estimators=5)),
                ]
            ),
        ),
        (
            X_r,
            y_r,
            VotingRegressor(
                [
                    ("lr", LinearRegression()),
                    ("rf", RandomForestRegressor(n_estimators=5)),
                ]
            ),
        ),
    ],
)
def test_none_estimator_with_weights(X, y, voter):
    # check that an estimator can be set to 'drop' while still passing
    # sample weights; regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/13777
    voter = clone(voter)
    # Scaled to avoid the ConvergenceWarning thrown by LogisticRegression
    X_scaled = StandardScaler().fit_transform(X)
    voter.fit(X_scaled, y, sample_weight=np.ones(y.shape))
    voter.set_params(lr="drop")
    voter.fit(X_scaled, y, sample_weight=np.ones(y.shape))
    y_pred = voter.predict(X_scaled)
    assert y_pred.shape == y.shape


@pytest.mark.parametrize(
    "est",
    [
        VotingRegressor(
            estimators=[
                ("lr", LinearRegression()),
                ("tree", DecisionTreeRegressor(random_state=0)),
            ]
        ),
        VotingClassifier(
            estimators=[
                ("lr", LogisticRegression(random_state=0)),
                ("tree", DecisionTreeClassifier(random_state=0)),
            ]
        ),
    ],
    ids=["VotingRegressor", "VotingClassifier"],
)
def test_n_features_in(est):
    X = [[1, 2], [3, 4], [5, 6]]
    y = [0, 1, 2]

    assert not hasattr(est, "n_features_in_")
    est.fit(X, y)
    assert est.n_features_in_ == 2


@pytest.mark.parametrize(
    "estimator",
    [
        VotingRegressor(
            estimators=[
                ("lr", LinearRegression()),
                ("rf", RandomForestRegressor(random_state=123)),
            ],
            verbose=True,
        ),
        VotingClassifier(
            estimators=[
                ("lr", LogisticRegression(random_state=123)),
                ("rf", RandomForestClassifier(random_state=123)),
            ],
            verbose=True,
        ),
    ],
)
def test_voting_verbose(estimator, capsys):
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2], [1.1, 1.2]])
    y = np.array([1, 1, 2, 2])

    pattern = (
        r"\[Voting\].*\(1 of 2\) Processing lr, total=.*\n"
        r"\[Voting\].*\(2 of 2\) Processing rf, total=.*\n$"
    )
    estimator.fit(X, y)
    assert re.match(pattern, capsys.readouterr()[0])


def test_get_features_names_out_regressor():
    """Check get_feature_names_out output for regressor."""
    X = [[1, 2], [3, 4], [5, 6]]
    y = [0, 1, 2]

    voting = VotingRegressor(
        estimators=[
            ("lr", LinearRegression()),
            ("tree", DecisionTreeRegressor(random_state=0)),
            ("ignore", "drop"),
        ]
    )
    voting.fit(X, y)

    names_out = voting.get_feature_names_out()
    expected_names = ["votingregressor_lr", "votingregressor_tree"]
    assert_array_equal(names_out, expected_names)
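

# The expected names follow the convention checked here: the lowercased
# ensemble class name joined with each remaining estimator's name, with
# dropped estimators omitted (and, for soft-voting classifiers, a class
# index appended; see the test below).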


@pytest.mark.parametrize(
    "kwargs, expected_names",
    [
        (
            {"voting": "soft", "flatten_transform": True},
            [
                "votingclassifier_lr0",
                "votingclassifier_lr1",
                "votingclassifier_lr2",
                "votingclassifier_tree0",
                "votingclassifier_tree1",
                "votingclassifier_tree2",
            ],
        ),
        ({"voting": "hard"}, ["votingclassifier_lr", "votingclassifier_tree"]),
    ],
)
def test_get_features_names_out_classifier(kwargs, expected_names):
    """Check get_feature_names_out for classifier for different settings."""
    X = [[1, 2], [3, 4], [5, 6], [1, 1.2]]
    y = [0, 1, 2, 0]

    voting = VotingClassifier(
        estimators=[
            ("lr", LogisticRegression(random_state=0)),
            ("tree", DecisionTreeClassifier(random_state=0)),
        ],
        **kwargs,
    )
    voting.fit(X, y)
    X_trans = voting.transform(X)
    names_out = voting.get_feature_names_out()

    assert X_trans.shape[1] == len(expected_names)
    assert_array_equal(names_out, expected_names)


def test_get_features_names_out_classifier_error():
    """Check that error is raised when voting="soft" and flatten_transform=False."""
    X = [[1, 2], [3, 4], [5, 6]]
    y = [0, 1, 2]

    voting = VotingClassifier(
        estimators=[
            ("lr", LogisticRegression(random_state=0)),
            ("tree", DecisionTreeClassifier(random_state=0)),
        ],
        voting="soft",
        flatten_transform=False,
    )
    voting.fit(X, y)

    msg = (
        "get_feature_names_out is not supported when `voting='soft'` and "
        "`flatten_transform=False`"
    )
    with pytest.raises(ValueError, match=msg):
        voting.get_feature_names_out()