| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219 |
- import numpy as np
- import pytest
- from scipy import sparse
- from sklearn.experimental import enable_iterative_imputer # noqa
- from sklearn.impute import IterativeImputer, KNNImputer, SimpleImputer
- from sklearn.utils._testing import (
- assert_allclose,
- assert_allclose_dense_sparse,
- assert_array_equal,
- )
def imputers():
    """Build a fresh instance of each imputer covered by the common tests."""
    iterative = IterativeImputer(tol=0.1)
    knn = KNNImputer()
    simple = SimpleImputer()
    return [iterative, knn, simple]
def sparse_imputers():
    """Build a fresh instance of each imputer used by the sparse-input test."""
    simple = SimpleImputer()
    return [simple]
# ConvergenceWarning will be raised by the IterativeImputer
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
def test_imputation_missing_value_in_test_array(imputer):
    """Check that a value missing only at transform time is imputed.

    Non-regression test for issue #13968: fitting on data without missing
    values and then transforming data that contains one must not raise, and
    the transformed output must contain only finite values.
    """
    train = [[1], [2]]
    test = [[3], [np.nan]]
    imputer.set_params(add_indicator=True)
    X_trans = imputer.fit(train).transform(test)
    # The original test only exercised the call; verify the documented
    # expectation that the result is a finite dataset as well.
    assert np.all(np.isfinite(X_trans))
# IterativeImputer is expected to emit a ConvergenceWarning here
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("marker", [np.nan, -1, 0])
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
def test_imputers_add_indicator(marker, imputer):
    """Check the indicator columns appended when ``add_indicator=True``.

    The trailing columns of the transformed data must match the expected
    missingness mask, and the leading columns must equal the output obtained
    with ``add_indicator=False``.
    """
    m = marker
    X = np.array(
        [
            [m, 1, 5, m, 1],
            [2, m, 1, m, 2],
            [6, 3, m, m, 3],
            [1, 2, 9, m, 4],
        ]
    )
    X_true_indicator = np.array(
        [
            [1.0, 0.0, 0.0, 1.0],
            [0.0, 1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0],
        ]
    )
    # Number of indicator columns expected at the end of the output.
    n_indicator = X_true_indicator.shape[1]

    imputer.set_params(missing_values=marker, add_indicator=True)
    with_indicator = imputer.fit_transform(X)
    assert_allclose(with_indicator[:, -n_indicator:], X_true_indicator)
    assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))

    imputer.set_params(add_indicator=False)
    without_indicator = imputer.fit_transform(X)
    assert_allclose(with_indicator[:, :-n_indicator], without_indicator)
# IterativeImputer is expected to emit a ConvergenceWarning here
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("marker", [np.nan, -1])
@pytest.mark.parametrize(
    "imputer", sparse_imputers(), ids=lambda x: x.__class__.__name__
)
def test_imputers_add_indicator_sparse(imputer, marker):
    """Check the indicator columns appended for CSR input.

    Same checks as the dense variant, with both the data and the expected
    mask built as ``scipy.sparse.csr_matrix``.
    """
    m = marker
    X = sparse.csr_matrix(
        [
            [m, 1, 5, m, 1],
            [2, m, 1, m, 2],
            [6, 3, m, m, 3],
            [1, 2, 9, m, 4],
        ]
    )
    X_true_indicator = sparse.csr_matrix(
        [
            [1.0, 0.0, 0.0, 1.0],
            [0.0, 1.0, 0.0, 1.0],
            [0.0, 0.0, 1.0, 1.0],
            [0.0, 0.0, 0.0, 1.0],
        ]
    )
    # Number of indicator columns expected at the end of the output.
    n_indicator = X_true_indicator.shape[1]

    imputer.set_params(missing_values=marker, add_indicator=True)
    with_indicator = imputer.fit_transform(X)
    assert_allclose_dense_sparse(
        with_indicator[:, -n_indicator:], X_true_indicator
    )
    assert_array_equal(imputer.indicator_.features_, np.array([0, 1, 2, 3]))

    imputer.set_params(add_indicator=False)
    without_indicator = imputer.fit_transform(X)
    assert_allclose_dense_sparse(
        with_indicator[:, :-n_indicator], without_indicator
    )
# IterativeImputer is expected to emit a ConvergenceWarning here
@pytest.mark.filterwarnings("ignore::sklearn.exceptions.ConvergenceWarning")
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
@pytest.mark.parametrize("add_indicator", [True, False])
def test_imputers_pandas_na_integer_array_support(imputer, add_indicator):
    """Check that "Int16" dataframes holding pd.NA match the NumPy result."""
    pd = pytest.importorskip("pandas")
    nan = np.nan
    imputer = imputer.set_params(add_indicator=add_indicator, missing_values=nan)
    X = np.array(
        [
            [nan, 1, 5, nan, 1],
            [2, nan, 1, nan, 2],
            [6, 3, nan, nan, 3],
            [1, 2, 9, nan, 4],
        ]
    )

    # Reference output: fit and transform the plain NumPy array.
    expected = imputer.fit_transform(X)

    # Same data as a dataframe of nullable integer columns (NaN becomes pd.NA).
    column_names = ["a", "b", "c", "d", "e"]
    X_df = pd.DataFrame(X, dtype="Int16", columns=column_names)
    actual = imputer.fit_transform(X_df)

    assert_allclose(expected, actual)
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
@pytest.mark.parametrize("add_indicator", [True, False])
def test_imputers_feature_names_out_pandas(imputer, add_indicator):
    """Check ``get_feature_names_out`` for imputers fitted on a dataframe."""
    pd = pytest.importorskip("pandas")
    nan = np.nan
    imputer = imputer.set_params(add_indicator=add_indicator, missing_values=nan)
    X = np.array(
        [
            [nan, 1, 5, 3, nan, 1],
            [2, nan, 1, 4, nan, 2],
            [6, 3, 7, nan, nan, 3],
            [1, 2, 9, 8, nan, 4],
        ]
    )
    X_df = pd.DataFrame(X, columns=["a", "b", "c", "d", "e", "f"])
    imputer.fit(X_df)
    names = imputer.get_feature_names_out()

    # Column "e" holds only missing values and is absent from the
    # expected imputed feature names.
    expected_names = ["a", "b", "c", "d", "f"]
    if add_indicator:
        # One indicator name per column containing at least one missing value.
        expected_names = expected_names + [
            "missingindicator_a",
            "missingindicator_b",
            "missingindicator_d",
            "missingindicator_e",
        ]
    assert_array_equal(expected_names, names)
@pytest.mark.parametrize("keep_empty_features", [True, False])
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
def test_keep_empty_features(imputer, keep_empty_features):
    """Check the output shape for a feature made only of missing values.

    With ``keep_empty_features=True`` the all-NaN column is kept, otherwise
    the output has one column fewer than the input.
    """
    X = np.array([[np.nan, 1], [np.nan, 2], [np.nan, 3]])
    imputer = imputer.set_params(
        add_indicator=False, keep_empty_features=keep_empty_features
    )

    n_samples, n_features = X.shape
    if keep_empty_features:
        expected_shape = X.shape
    else:
        expected_shape = (n_samples, n_features - 1)

    # `fit_transform` runs first so that the later `transform` call operates
    # on an already fitted imputer.
    for method_name in ("fit_transform", "transform"):
        run = getattr(imputer, method_name)
        X_imputed = run(X)
        assert X_imputed.shape == expected_shape
@pytest.mark.parametrize("imputer", imputers(), ids=lambda x: x.__class__.__name__)
@pytest.mark.parametrize("missing_value_test", [np.nan, 1])
def test_imputation_adds_missing_indicator_if_add_indicator_is_true(
    imputer, missing_value_test
):
    """Check that missing indicator always exists when add_indicator=True.

    Non-regression test for gh-26590.
    """
    X_train = np.array([[0, np.nan], [1, 2]])
    # The second feature of the first test sample is either np.nan or 1,
    # depending on the parametrization.
    X_test = np.array([[0, missing_value_test], [1, 2]])

    imputer.set_params(add_indicator=True)
    with_indicator = imputer.fit(X_train).transform(X_test)
    assert with_indicator.shape == (2, 3)

    imputer.set_params(add_indicator=False)
    without_indicator = imputer.fit(X_train).transform(X_test)
    assert without_indicator.shape == (2, 2)

    # Apart from the trailing indicator column both outputs must agree.
    assert_allclose(with_indicator[:, :-1], without_indicator)

    # The indicator column flags the first sample only when it is missing.
    expected_missing_indicator = [1, 0] if np.isnan(missing_value_test) else [0, 0]
    assert_allclose(with_indicator[:, -1], expected_missing_indicator)
|