test_common.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219
  1. import numpy as np
  2. import pytest
  3. from scipy import sparse
  4. from sklearn.experimental import enable_iterative_imputer # noqa
  5. from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
  6. from sklearn.utils._testing import (
  7. assert_allclose,
  8. assert_allclose_dense_sparse,
  9. assert_array_equal,
  10. )
  11. def imputers():
  12. return [IterativeImputer(tol=0.1), KNNImputer(), SimpleImputer()]
  13. def sparse_imputers():
  14. return [SimpleImputer()]
  15. # ConvergenceWarning will be raised by the IterativeImputer
  16. @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
  17. @pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
  18. def test_imputation_missing_value_in_test_array(imputer):
  19. # [Non Regression Test for issue #13968] Missing value in test set should
  20. # not throw an error and return a finite dataset
  21. train = [[1], [2]]
  22. test = [[3], [np.nan]]
  23. imputer.set_params(add_indicator=True)
  24. imputer.fit(train).transform(test)
  25. # ConvergenceWarning will be raised by the IterativeImputer
  26. @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
  27. @pytest.mark.parametrize("marker", [np.nan, -1, 0])
  28. @pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
  29. def test_imputers_add_indicator(marker, imputer):
  30. X = np.array(
  31. [
  32. [marker, 1, 5, marker, 1],
  33. [2, marker, 1, marker, 2],
  34. [6, 3, marker, marker, 3],
  35. [1, 2, 9, marker, 4],
  36. ]
  37. )
  38. X_true_indicator = np.array(
  39. [
  40. [1.0, 0.0, 0.0, 1.0],
  41. [0.0, 1.0, 0.0, 1.0],
  42. [0.0, 0.0, 1.0, 1.0],
  43. [0.0, 0.0, 0.0, 1.0],
  44. ]
  45. )
  46. imputer.set_params(missing_values=marker, add_indicator=True)
  47. X_trans = imputer.fit_transform(X)
  48. assert_allclose(X_trans[:, -4:], X_true_indicator)
  49. assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))
  50. imputer.set_params(add_indicator=False)
  51. X_trans_no_indicator = imputer.fit_transform(X)
  52. assert_allclose(X_trans[:, :-4], X_trans_no_indicator)
  53. # ConvergenceWarning will be raised by the IterativeImputer
  54. @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
  55. @pytest.mark.parametrize("marker", [np.nan, -1])
  56. @pytest.mark.parametrize(
  57. "imputer", sparse_imputers(), ids=lambda x: x.__class__.__name__
  58. )
  59. def test_imputers_add_indicator_sparse(imputer, marker):
  60. X = sparse.csr_matrix(
  61. [
  62. [marker, 1, 5, marker, 1],
  63. [2, marker, 1, marker, 2],
  64. [6, 3, marker, marker, 3],
  65. [1, 2, 9, marker, 4],
  66. ]
  67. )
  68. X_true_indicator = sparse.csr_matrix(
  69. [
  70. [1.0, 0.0, 0.0, 1.0],
  71. [0.0, 1.0, 0.0, 1.0],
  72. [0.0, 0.0, 1.0, 1.0],
  73. [0.0, 0.0, 0.0, 1.0],
  74. ]
  75. )
  76. imputer.set_params(missing_values=marker, add_indicator=True)
  77. X_trans = imputer.fit_transform(X)
  78. assert_allclose_dense_sparse(X_trans[:, -4:], X_true_indicator)
  79. assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))
  80. imputer.set_params(add_indicator=False)
  81. X_trans_no_indicator = imputer.fit_transform(X)
  82. assert_allclose_dense_sparse(X_trans[:, :-4], X_trans_no_indicator)
  83. # ConvergenceWarning will be raised by the IterativeImputer
  84. @pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
  85. @pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
  86. @pytest.mark.parametrize("add_indicator", [True, False])
  87. def test_imputers_pandas_na_integer_array_support(imputer, add_indicator):
  88. # Test pandas IntegerArray with pd.NA
  89. pd = pytest.importorskip("pandas")
  90. marker = np.nan
  91. imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker)
  92. X = np.array(
  93. [
  94. [marker, 1, 5, marker, 1],
  95. [2, marker, 1, marker, 2],
  96. [6, 3, marker, marker, 3],
  97. [1, 2, 9, marker, 4],
  98. ]
  99. )
  100. # fit on numpy array
  101. X_trans_expected = imputer.fit_transform(X)
  102. # Creates dataframe with IntegerArrays with pd.NA
  103. X_df = pd.DataFrame(X, dtype="Int16", columns=["a", "b", "c", "d", "e"])
  104. # fit on pandas dataframe with IntegerArrays
  105. X_trans = imputer.fit_transform(X_df)
  106. assert_allclose(X_trans_expected, X_trans)
  107. @pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
  108. @pytest.mark.parametrize("add_indicator", [True, False])
  109. def test_imputers_feature_names_out_pandas(imputer, add_indicator):
  110. """Check feature names out for imputers."""
  111. pd = pytest.importorskip("pandas")
  112. marker = np.nan
  113. imputer = imputer.set_params(add_indicator=add_indicator, missing_values=marker)
  114. X = np.array(
  115. [
  116. [marker, 1, 5, 3, marker, 1],
  117. [2, marker, 1, 4, marker, 2],
  118. [6, 3, 7, marker, marker, 3],
  119. [1, 2, 9, 8, marker, 4],
  120. ]
  121. )
  122. X_df = pd.DataFrame(X, columns=["a", "b", "c", "d", "e", "f"])
  123. imputer.fit(X_df)
  124. names = imputer.get_feature_names_out()
  125. if add_indicator:
  126. expected_names = [
  127. "a",
  128. "b",
  129. "c",
  130. "d",
  131. "f",
  132. "missingindicator_a",
  133. "missingindicator_b",
  134. "missingindicator_d",
  135. "missingindicator_e",
  136. ]
  137. assert_array_equal(expected_names, names)
  138. else:
  139. expected_names = ["a", "b", "c", "d", "f"]
  140. assert_array_equal(expected_names, names)
  141. @pytest.mark.parametrize("keep_empty_features", [True, False])
  142. @pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
  143. def test_keep_empty_features(imputer, keep_empty_features):
  144. """Check that the imputer keeps features with only missing values."""
  145. X = np.array([[np.nan, 1], [np.nan, 2], [np.nan, 3]])
  146. imputer = imputer.set_params(
  147. add_indicator=False, keep_empty_features=keep_empty_features
  148. )
  149. for method in ["fit_transform", "transform"]:
  150. X_imputed = getattr(imputer, method)(X)
  151. if keep_empty_features:
  152. assert X_imputed.shape == X.shape
  153. else:
  154. assert X_imputed.shape == (X.shape[0], X.shape[1] - 1)
  155. @pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
  156. @pytest.mark.parametrize("missing_value_test", [np.nan, 1])
  157. def test_imputation_adds_missing_indicator_if_add_indicator_is_true(
  158. imputer, missing_value_test
  159. ):
  160. """Check that missing indicator always exists when add_indicator=True.
  161. Non-regression test for gh-26590.
  162. """
  163. X_train = np.array([[0, np.nan], [1, 2]])
  164. # Test data where missing_value_test variable can be set to np.nan or 1.
  165. X_test = np.array([[0, missing_value_test], [1, 2]])
  166. imputer.set_params(add_indicator=True)
  167. imputer.fit(X_train)
  168. X_test_imputed_with_indicator = imputer.transform(X_test)
  169. assert X_test_imputed_with_indicator.shape == (2, 3)
  170. imputer.set_params(add_indicator=False)
  171. imputer.fit(X_train)
  172. X_test_imputed_without_indicator = imputer.transform(X_test)
  173. assert X_test_imputed_without_indicator.shape == (2, 2)
  174. assert_allclose(
  175. X_test_imputed_with_indicator[:, :-1], X_test_imputed_without_indicator
  176. )
  177. if np.isnan(missing_value_test):
  178. expected_missing_indicator = [1, 0]
  179. else:
  180. expected_missing_indicator = [0, 0]
  181. assert_allclose(X_test_imputed_with_indicator[:, -1], expected_missing_indicator)