# test_common.py

import numpy as np
import pytest

from sklearn.base import ClassifierMixin, clone, is_classifier
from sklearn.datasets import (
    load_diabetes,
    load_iris,
    make_classification,
    make_regression,
)
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    StackingClassifier,
    StackingRegressor,
    VotingClassifier,
    VotingRegressor,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR

X, y = load_iris(return_X_y=True)
X_r, y_r = load_diabetes(return_X_y=True)


@pytest.mark.parametrize(
    "X, y, estimator",
    [
        (
            *make_classification(n_samples=10),
            StackingClassifier(
                estimators=[
                    ("lr", LogisticRegression()),
                    ("svm", LinearSVC(dual="auto")),
                    ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
                ],
                cv=2,
            ),
        ),
        (
            *make_classification(n_samples=10),
            VotingClassifier(
                estimators=[
                    ("lr", LogisticRegression()),
                    ("svm", LinearSVC(dual="auto")),
                    ("rf", RandomForestClassifier(n_estimators=5, max_depth=3)),
                ]
            ),
        ),
        (
            *make_regression(n_samples=10),
            StackingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", LinearSVR(dual="auto")),
                    ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
                ],
                cv=2,
            ),
        ),
        (
            *make_regression(n_samples=10),
            VotingRegressor(
                estimators=[
                    ("lr", LinearRegression()),
                    ("svm", LinearSVR(dual="auto")),
                    ("rf", RandomForestRegressor(n_estimators=5, max_depth=3)),
                ]
            ),
        ),
    ],
    ids=[
        "stacking-classifier",
        "voting-classifier",
        "stacking-regressor",
        "voting-regressor",
    ],
)
def test_ensemble_heterogeneous_estimators_behavior(X, y, estimator):
    # check that the behavior of `estimators`, `estimators_`,
    # `named_estimators`, `named_estimators_` is consistent across all
    # ensemble classes and when using `set_params()`.

    # before fit
    assert "svm" in estimator.named_estimators
    assert estimator.named_estimators.svm is estimator.estimators[1][1]
    assert estimator.named_estimators.svm is estimator.named_estimators["svm"]

    # check fitted attributes
    estimator.fit(X, y)
    assert len(estimator.named_estimators) == 3
    assert len(estimator.named_estimators_) == 3
    assert sorted(list(estimator.named_estimators_.keys())) == sorted(
        ["lr", "svm", "rf"]
    )

    # check that set_params() does not add a new attribute
    estimator_new_params = clone(estimator)
    svm_estimator = SVC() if is_classifier(estimator) else SVR()
    estimator_new_params.set_params(svm=svm_estimator).fit(X, y)
    assert not hasattr(estimator_new_params, "svm")
    assert (
        estimator_new_params.named_estimators.lr.get_params()
        == estimator.named_estimators.lr.get_params()
    )
    assert (
        estimator_new_params.named_estimators.rf.get_params()
        == estimator.named_estimators.rf.get_params()
    )

    # check the behavior when setting and dropping an estimator
    estimator_dropped = clone(estimator)
    estimator_dropped.set_params(svm="drop")
    estimator_dropped.fit(X, y)
    assert len(estimator_dropped.named_estimators) == 3
    assert estimator_dropped.named_estimators.svm == "drop"
    assert len(estimator_dropped.named_estimators_) == 3
    assert sorted(list(estimator_dropped.named_estimators_.keys())) == sorted(
        ["lr", "svm", "rf"]
    )
    for sub_est in estimator_dropped.named_estimators_:
        # check that the correspondence is correct
        assert not isinstance(sub_est, type(estimator.named_estimators.svm))

    # check that we can set the parameters of the underlying classifier
    estimator.set_params(svm__C=10.0)
    estimator.set_params(rf__max_depth=5)
    assert (
        estimator.get_params()["svm__C"]
        == estimator.get_params()["svm"].get_params()["C"]
    )
    assert (
        estimator.get_params()["rf__max_depth"]
        == estimator.get_params()["rf"].get_params()["max_depth"]
    )
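

# Illustrative sketch (not part of the original test suite): the
# `<name>__<param>` routing and the "drop" sentinel exercised above, shown on
# a minimal VotingClassifier. The names "lr" and "svm" are assumptions chosen
# for this example only.
def _example_set_params_routing():
    clf = VotingClassifier(
        estimators=[("lr", LogisticRegression()), ("svm", LinearSVC(dual="auto"))]
    )
    # "svm__C" is forwarded to the sub-estimator registered under the name "svm"
    clf.set_params(svm__C=0.5)
    assert clf.get_params()["svm"].get_params()["C"] == 0.5
    # a sub-estimator can be disabled by replacing it with the string "drop"
    clf.set_params(lr="drop")
    assert clf.get_params()["lr"] == "drop"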


@pytest.mark.parametrize(
    "Ensemble",
    [VotingClassifier, StackingRegressor, VotingRegressor],
)
def test_ensemble_heterogeneous_estimators_type(Ensemble):
    # check that ensemble will fail during validation if the underlying
    # estimators are not of the same type (i.e. classifier or regressor)
    # StackingClassifier can have an underlying regressor, so it is not checked
    if issubclass(Ensemble, ClassifierMixin):
        X, y = make_classification(n_samples=10)
        estimators = [("lr", LinearRegression())]
        ensemble_type = "classifier"
    else:
        X, y = make_regression(n_samples=10)
        estimators = [("lr", LogisticRegression())]
        ensemble_type = "regressor"
    ensemble = Ensemble(estimators=estimators)
    err_msg = "should be a {}".format(ensemble_type)
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)
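

# Illustrative sketch (not part of the original tests): the same type
# validation as seen from user code, assuming a VotingRegressor handed a
# classifier; the error message is matched loosely, as in the test above.
def _example_type_validation():
    X, y = make_regression(n_samples=10)
    reg = VotingRegressor(estimators=[("clf", LogisticRegression())])
    with pytest.raises(ValueError, match="should be a regressor"):
        reg.fit(X, y)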


@pytest.mark.parametrize(
    "X, y, Ensemble",
    [
        (*make_classification(n_samples=10), StackingClassifier),
        (*make_classification(n_samples=10), VotingClassifier),
        (*make_regression(n_samples=10), StackingRegressor),
        (*make_regression(n_samples=10), VotingRegressor),
    ],
)
def test_ensemble_heterogeneous_estimators_name_validation(X, y, Ensemble):
    # raise an error when the name contains dunder
    if issubclass(Ensemble, ClassifierMixin):
        estimators = [("lr__", LogisticRegression())]
    else:
        estimators = [("lr__", LinearRegression())]
    ensemble = Ensemble(estimators=estimators)
    err_msg = r"Estimator names must not contain __: got \['lr__'\]"
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)

    # raise an error when the name is not unique
    if issubclass(Ensemble, ClassifierMixin):
        estimators = [("lr", LogisticRegression()), ("lr", LogisticRegression())]
    else:
        estimators = [("lr", LinearRegression()), ("lr", LinearRegression())]
    ensemble = Ensemble(estimators=estimators)
    err_msg = r"Names provided are not unique: \['lr', 'lr'\]"
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)

    # raise an error when the name conflicts with the parameters
    if issubclass(Ensemble, ClassifierMixin):
        estimators = [("estimators", LogisticRegression())]
    else:
        estimators = [("estimators", LinearRegression())]
    ensemble = Ensemble(estimators=estimators)
    err_msg = "Estimator names conflict with constructor arguments"
    with pytest.raises(ValueError, match=err_msg):
        ensemble.fit(X, y)
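

# Illustrative sketch (not part of the original tests): names satisfying the
# three rules checked above (no "__", unique, and not clashing with a
# constructor argument such as "estimators"). The names are illustrative
# assumptions.
def _example_valid_estimator_names():
    X, y = make_classification(n_samples=10)
    clf = VotingClassifier(
        estimators=[("logreg", LogisticRegression()), ("svm", LinearSVC(dual="auto"))]
    )
    clf.fit(X, y)  # no naming error is raised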


@pytest.mark.parametrize(
    "X, y, estimator",
    [
        (
            *make_classification(n_samples=10),
            StackingClassifier(estimators=[("lr", LogisticRegression())]),
        ),
        (
            *make_classification(n_samples=10),
            VotingClassifier(estimators=[("lr", LogisticRegression())]),
        ),
        (
            *make_regression(n_samples=10),
            StackingRegressor(estimators=[("lr", LinearRegression())]),
        ),
        (
            *make_regression(n_samples=10),
            VotingRegressor(estimators=[("lr", LinearRegression())]),
        ),
    ],
    ids=[
        "stacking-classifier",
        "voting-classifier",
        "stacking-regressor",
        "voting-regressor",
    ],
)
def test_ensemble_heterogeneous_estimators_all_dropped(X, y, estimator):
    # check that we raise a consistent error when all estimators are
    # dropped
    estimator.set_params(lr="drop")
    with pytest.raises(ValueError, match="All estimators are dropped."):
        estimator.fit(X, y)


@pytest.mark.parametrize(
    "Ensemble, Estimator, X, y",
    [
        (StackingClassifier, LogisticRegression, X, y),
        (StackingRegressor, LinearRegression, X_r, y_r),
        (VotingClassifier, LogisticRegression, X, y),
        (VotingRegressor, LinearRegression, X_r, y_r),
    ],
)
# FIXME: we should move this test to `estimator_checks` once we are able
# to construct meta-estimator instances
def test_heterogeneous_ensemble_support_missing_values(Ensemble, Estimator, X, y):
    # check that Voting and Stacking predictors delegate the missing values
    # validation to the underlying estimator.
    X = X.copy()
    mask = np.random.choice([1, 0], X.shape, p=[0.1, 0.9]).astype(bool)
    X[mask] = np.nan
    pipe = make_pipeline(SimpleImputer(), Estimator())
    ensemble = Ensemble(estimators=[("pipe1", pipe), ("pipe2", pipe)])
    ensemble.fit(X, y).score(X, y)
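

# Illustrative sketch (not part of the original tests): the delegation checked
# above from a user's point of view. A bare LinearRegression inside the
# ensemble rejects NaN input, while the same data is accepted once a
# SimpleImputer pipeline handles the missing values. The missing-value rate
# and random seed are illustrative assumptions.
def _example_missing_value_delegation():
    rng = np.random.RandomState(0)
    X_nan = X_r.copy()
    X_nan[rng.choice([True, False], X_nan.shape, p=[0.1, 0.9])] = np.nan
    # the ensemble itself does not validate X, so the bare estimator complains
    with pytest.raises(ValueError):
        VotingRegressor(estimators=[("lr", LinearRegression())]).fit(X_nan, y_r)
    # delegating to an imputing pipeline makes the same input acceptable
    pipe = make_pipeline(SimpleImputer(), LinearRegression())
    VotingRegressor(estimators=[("pipe", pipe)]).fit(X_nan, y_r)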