| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274 |
- import pickle
- import numpy as np
- import pytest
- from numpy.testing import assert_array_equal
- from sklearn.utils._encode import _check_unknown, _encode, _get_counts, _unique
@pytest.mark.parametrize(
    "values, expected",
    [
        pytest.param(
            np.array([2, 1, 3, 1, 3], dtype="int64"),
            np.array([1, 2, 3], dtype="int64"),
            id="int64",
        ),
        pytest.param(
            np.array([2, 1, np.nan, 1, np.nan], dtype="float32"),
            np.array([1, 2, np.nan], dtype="float32"),
            id="float32-nan",
        ),
        pytest.param(
            np.array(["b", "a", "c", "a", "c"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
            id="object",
        ),
        pytest.param(
            np.array(["b", "a", None, "a", None], dtype=object),
            np.array(["a", "b", None], dtype=object),
            id="object-None",
        ),
        pytest.param(
            np.array(["b", "a", "c", "a", "c"]),
            np.array(["a", "b", "c"]),
            id="str",
        ),
    ],
)
def test_encode_util(values, expected):
    """Check ``_unique`` and ``_encode`` on several dtypes.

    Every parametrized input holds three distinct values arranged in the same
    pattern, so the expected inverse and the expected counts are shared by
    all cases.
    """
    expected_codes = np.array([1, 0, 2, 0, 2])
    expected_counts = np.array([2, 1, 2])

    # Uniques only.
    uniques = _unique(values)
    assert_array_equal(uniques, expected)

    # Uniques together with the inverse mapping.
    found, codes = _unique(values, return_inverse=True)
    assert_array_equal(found, expected)
    assert_array_equal(codes, expected_codes)

    # Encoding against the uniques computed above gives the same codes.
    codes = _encode(values, uniques=uniques)
    assert_array_equal(codes, expected_codes)

    # Uniques together with the number of occurrences of each one.
    found, occurrences = _unique(values, return_counts=True)
    assert_array_equal(found, expected)
    assert_array_equal(occurrences, expected_counts)

    # All three outputs requested at once.
    found, codes, occurrences = _unique(
        values, return_inverse=True, return_counts=True
    )
    assert_array_equal(found, expected)
    assert_array_equal(codes, expected_codes)
    assert_array_equal(occurrences, expected_counts)
def test_encode_with_check_unknown():
    """Exercise the ``check_unknown`` parameter of ``_encode``."""
    unseen_labels_msg = "y contains previously unseen labels"

    # Numeric dtype: the label 4 is absent from the known uniques.
    known = np.array([1, 2, 3])
    to_encode = np.array([1, 2, 3, 4])

    # With the check switched on, the unseen label is reported.
    with pytest.raises(ValueError, match=unseen_labels_msg):
        _encode(to_encode, uniques=known, check_unknown=True)

    # With the check switched off, the call must go through without raising.
    _encode(to_encode, uniques=known, check_unknown=False)

    # Object dtype: the parameter is ignored, so the unseen label "d" raises
    # even though the check was switched off.
    known = np.array(["a", "b", "c"], dtype=object)
    to_encode = np.array(["a", "b", "c", "d"], dtype=object)
    with pytest.raises(ValueError, match=unseen_labels_msg):
        _encode(to_encode, uniques=known, check_unknown=False)
def _assert_check_unknown(values, uniques, expected_diff, expected_mask):
    """Assert the output of ``_check_unknown`` with and without the mask.

    Parameters
    ----------
    values : array-like
        Values passed to ``_check_unknown`` as first argument.
    uniques : array-like
        Known values passed to ``_check_unknown`` as second argument.
    expected_diff : array-like
        Expected unknown values, for both call styles.
    expected_mask : array-like
        Expected mask returned when ``return_mask=True``.
    """
    # Default call: only the diff is returned.
    assert_array_equal(_check_unknown(values, uniques), expected_diff)

    # With return_mask=True a (diff, mask) pair is returned.
    diff_with_mask, mask = _check_unknown(values, uniques, return_mask=True)
    assert_array_equal(diff_with_mask, expected_diff)
    assert_array_equal(mask, expected_mask)
@pytest.mark.parametrize(
    "values, uniques, expected_diff, expected_mask",
    [
        # Integers: one value missing from the known values.
        (
            np.array([1, 2, 3, 4]),
            np.array([1, 2, 3]),
            [4],
            [True, True, True, False],
        ),
        # Integers: known values listed in a different order.
        (
            np.array([2, 1, 4, 5]),
            np.array([2, 5, 1]),
            [4],
            [True, True, False, True],
        ),
        # nan only in the values: it is reported as unknown.
        (
            np.array([2, 1, np.nan]),
            np.array([2, 5, 1]),
            [np.nan],
            [True, True, False],
        ),
        # nan on both sides: it is not reported.
        (
            np.array([2, 1, 4, np.nan]),
            np.array([2, 5, 1, np.nan]),
            [4],
            [True, True, False, True],
        ),
        # nan and another value both missing from the known values.
        (
            np.array([2, 1, 4, np.nan]),
            np.array([2, 5, 1]),
            [4, np.nan],
            [True, True, False, False],
        ),
        # nan only in the known values.
        (
            np.array([2, 1, 4, 5]),
            np.array([2, 5, 1, np.nan]),
            [4],
            [True, True, False, True],
        ),
        # Strings stored with object dtype.
        (
            np.array(["a", "b", "c", "d"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
            np.array(["d"], dtype=object),
            [True, True, True, False],
        ),
        (
            np.array(["d", "c", "a", "b"], dtype=object),
            np.array(["a", "c", "b"], dtype=object),
            np.array(["d"], dtype=object),
            [False, True, True, True],
        ),
        # Strings stored with numpy's own string dtype.
        (
            np.array(["a", "b", "c", "d"]),
            np.array(["a", "b", "c"]),
            np.array(["d"]),
            [True, True, True, False],
        ),
        (
            np.array(["d", "c", "a", "b"]),
            np.array(["a", "c", "b"]),
            np.array(["d"]),
            [False, True, True, True],
        ),
    ],
)
def test_check_unknown(values, uniques, expected_diff, expected_mask):
    """Check the diff and mask of ``_check_unknown`` on numbers and strings."""
    _assert_check_unknown(
        values=values,
        uniques=uniques,
        expected_diff=expected_diff,
        expected_mask=expected_mask,
    )
@pytest.mark.parametrize("missing_value", [None, np.nan, float("nan")])
@pytest.mark.parametrize("pickle_uniques", [True, False])
def test_check_unknown_missing_values(missing_value, pickle_uniques):
    """Check ``_check_unknown`` on object arrays holding a missing value.

    Three scenarios are run in sequence, each described as
    ``(values, uniques, expected_diff, expected_mask)``.
    """
    scenarios = [
        # Missing value on both sides: only "d" is unknown.
        (
            ["d", "c", "a", "b", missing_value],
            ["c", "a", "b", missing_value],
            ["d"],
            [False, True, True, True, True],
        ),
        # Missing value absent from the uniques: it is unknown as well.
        (
            ["d", "c", "a", "b", missing_value],
            ["c", "a", "b"],
            ["d", missing_value],
            [False, True, True, True, False],
        ),
        # The missing value is the only unknown entry.
        (
            ["a", missing_value],
            ["a", "b", "z"],
            [missing_value],
            [True, False],
        ),
    ]

    for raw_values, raw_uniques, expected_diff, expected_mask in scenarios:
        values = np.array(raw_values, dtype=object)
        uniques = np.array(raw_uniques, dtype=object)
        if pickle_uniques:
            # NOTE(review): presumably the pickle round trip is there so that
            # a nan stored in `uniques` is no longer the same object as the
            # one in `values` -- confirm.
            uniques = pickle.loads(pickle.dumps(uniques))
        _assert_check_unknown(values, uniques, expected_diff, expected_mask)
@pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")])
@pytest.mark.parametrize("pickle_uniques", [True, False])
def test_unique_util_missing_values_objects(missing_value, pickle_uniques):
    """Check ``_unique`` and ``_encode`` on object arrays with a missing value."""
    values = np.array(["a", "c", "c", missing_value, "b"], dtype=object)
    expected_uniques = np.array(["a", "b", "c", missing_value], dtype=object)

    uniques = _unique(values)

    if missing_value is not None:
        # nan case: compare everything but the last entry directly, then
        # check the last entry with np.isnan.
        assert_array_equal(uniques[:-1], expected_uniques[:-1])
        assert np.isnan(uniques[-1])
    else:
        assert_array_equal(uniques, expected_uniques)

    if pickle_uniques:
        # Encode against a pickled-then-unpickled copy of the uniques.
        uniques = pickle.loads(pickle.dumps(uniques))

    # The missing value is encoded with the last code (3).
    assert_array_equal(_encode(values, uniques=uniques), np.array([0, 2, 2, 3, 1]))
def test_unique_util_missing_values_numeric():
    """Check ``_unique`` and ``_encode`` on a float array containing nan."""
    values = np.array([3, 1, np.nan, 5, 3, np.nan], dtype=float)
    expected_uniques = np.array([1, 3, 5, np.nan], dtype=float)
    # nan is expected as the last unique, hence code 3.
    expected_inverse = np.array([1, 0, 3, 2, 1, 3])

    # Uniques only.
    assert_array_equal(_unique(values), expected_uniques)

    # Uniques together with the inverse mapping.
    uniques, inverse = _unique(values, return_inverse=True)
    assert_array_equal(uniques, expected_uniques)
    assert_array_equal(inverse, expected_inverse)

    # Encoding against those uniques must reproduce the inverse.
    assert_array_equal(_encode(values, uniques=uniques), expected_inverse)
def test_unique_util_with_all_missing_values():
    """Check ``_unique`` on an object array mixing None with two nan objects."""
    values = np.array(
        [np.nan, "a", "c", "c", None, float("nan"), None], dtype=object
    )
    # Both nan objects share code 3; None gets code 2.
    expected_inverse = [3, 0, 1, 1, 2, 3, 2]

    uniques = _unique(values)
    # Strings come first, then None, and nan is the last entry.
    assert_array_equal(uniques[:-1], ["a", "c", None])
    assert np.isnan(uniques[-1])

    inverse = _unique(values, return_inverse=True)[1]
    assert_array_equal(inverse, expected_inverse)
def test_check_unknown_with_both_missing_values():
    """Check ``_check_unknown`` on an object array holding both None and nan."""
    values = np.array([np.nan, "a", "c", "c", None, np.nan, None], dtype=object)

    def make_known():
        # A fresh array of known values for every call, as in each scenario.
        return np.array(["a", "c"], dtype=object)

    # Diff only: None is reported first, nan second.
    unknown = _check_unknown(values, known_values=make_known())
    assert unknown[0] is None
    assert np.isnan(unknown[1])

    # Diff together with the mask.
    unknown, mask = _check_unknown(
        values, known_values=make_known(), return_mask=True
    )
    assert unknown[0] is None
    assert np.isnan(unknown[1])
    # Only the "a" / "c" entries are flagged as valid.
    assert_array_equal(mask, [False, True, True, True, False, False, False])
@pytest.mark.parametrize(
    "values, uniques, expected_counts",
    [
        # Integers: every unique occurs in the values.
        (
            np.array([1] * 10 + [2] * 4 + [3] * 15),
            np.array([1, 2, 3]),
            [10, 4, 15],
        ),
        # Integers: the unique 5 never occurs, so its count is 0.
        (
            np.array([1] * 10 + [2] * 4 + [3] * 15),
            np.array([1, 2, 3, 5]),
            [10, 4, 15, 0],
        ),
        # Floats: nan is counted like any other unique.
        (
            np.array([np.nan] * 10 + [2] * 4 + [3] * 15),
            np.array([2, 3, np.nan]),
            [4, 15, 10],
        ),
        # Object strings, uniques given as a plain list.
        (
            np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
            ["a", "b", "c"],
            [16, 4, 20],
        ),
        # Same values: the counts follow the order of the uniques.
        (
            np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
            ["c", "b", "a"],
            [20, 4, 16],
        ),
        # Object dtype with nan among the values and the uniques.
        (
            np.array([np.nan] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
            ["c", np.nan, "a"],
            [20, 4, 16],
        ),
        # Object strings: the unique "e" never occurs, so its count is 0.
        (
            np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
            ["a", "b", "c", "e"],
            [16, 4, 20, 0],
        ),
    ],
)
def test_get_counts(values, uniques, expected_counts):
    """Check that ``_get_counts`` returns one count per entry of ``uniques``."""
    assert_array_equal(_get_counts(values, uniques), expected_counts)
|