test_encode.py 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. import pickle
  2. import numpy as np
  3. import pytest
  4. from numpy.testing import assert_array_equal
  5. from sklearn.utils._encode import _check_unknown, _encode, _get_counts, _unique
  6. @pytest.mark.parametrize(
  7. "values, expected",
  8. [
  9. (np.array([2, 1, 3, 1, 3], dtype="int64"), np.array([1, 2, 3], dtype="int64")),
  10. (
  11. np.array([2, 1, np.nan, 1, np.nan], dtype="float32"),
  12. np.array([1, 2, np.nan], dtype="float32"),
  13. ),
  14. (
  15. np.array(["b", "a", "c", "a", "c"], dtype=object),
  16. np.array(["a", "b", "c"], dtype=object),
  17. ),
  18. (
  19. np.array(["b", "a", None, "a", None], dtype=object),
  20. np.array(["a", "b", None], dtype=object),
  21. ),
  22. (np.array(["b", "a", "c", "a", "c"]), np.array(["a", "b", "c"])),
  23. ],
  24. ids=["int64", "float32-nan", "object", "object-None", "str"],
  25. )
  26. def test_encode_util(values, expected):
  27. uniques = _unique(values)
  28. assert_array_equal(uniques, expected)
  29. result, encoded = _unique(values, return_inverse=True)
  30. assert_array_equal(result, expected)
  31. assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
  32. encoded = _encode(values, uniques=uniques)
  33. assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
  34. result, counts = _unique(values, return_counts=True)
  35. assert_array_equal(result, expected)
  36. assert_array_equal(counts, np.array([2, 1, 2]))
  37. result, encoded, counts = _unique(values, return_inverse=True, return_counts=True)
  38. assert_array_equal(result, expected)
  39. assert_array_equal(encoded, np.array([1, 0, 2, 0, 2]))
  40. assert_array_equal(counts, np.array([2, 1, 2]))
  41. def test_encode_with_check_unknown():
  42. # test for the check_unknown parameter of _encode()
  43. uniques = np.array([1, 2, 3])
  44. values = np.array([1, 2, 3, 4])
  45. # Default is True, raise error
  46. with pytest.raises(ValueError, match="y contains previously unseen labels"):
  47. _encode(values, uniques=uniques, check_unknown=True)
  48. # dont raise error if False
  49. _encode(values, uniques=uniques, check_unknown=False)
  50. # parameter is ignored for object dtype
  51. uniques = np.array(["a", "b", "c"], dtype=object)
  52. values = np.array(["a", "b", "c", "d"], dtype=object)
  53. with pytest.raises(ValueError, match="y contains previously unseen labels"):
  54. _encode(values, uniques=uniques, check_unknown=False)
  55. def _assert_check_unknown(values, uniques, expected_diff, expected_mask):
  56. diff = _check_unknown(values, uniques)
  57. assert_array_equal(diff, expected_diff)
  58. diff, valid_mask = _check_unknown(values, uniques, return_mask=True)
  59. assert_array_equal(diff, expected_diff)
  60. assert_array_equal(valid_mask, expected_mask)
  61. @pytest.mark.parametrize(
  62. "values, uniques, expected_diff, expected_mask",
  63. [
  64. (np.array([1, 2, 3, 4]), np.array([1, 2, 3]), [4], [True, True, True, False]),
  65. (np.array([2, 1, 4, 5]), np.array([2, 5, 1]), [4], [True, True, False, True]),
  66. (np.array([2, 1, np.nan]), np.array([2, 5, 1]), [np.nan], [True, True, False]),
  67. (
  68. np.array([2, 1, 4, np.nan]),
  69. np.array([2, 5, 1, np.nan]),
  70. [4],
  71. [True, True, False, True],
  72. ),
  73. (
  74. np.array([2, 1, 4, np.nan]),
  75. np.array([2, 5, 1]),
  76. [4, np.nan],
  77. [True, True, False, False],
  78. ),
  79. (
  80. np.array([2, 1, 4, 5]),
  81. np.array([2, 5, 1, np.nan]),
  82. [4],
  83. [True, True, False, True],
  84. ),
  85. (
  86. np.array(["a", "b", "c", "d"], dtype=object),
  87. np.array(["a", "b", "c"], dtype=object),
  88. np.array(["d"], dtype=object),
  89. [True, True, True, False],
  90. ),
  91. (
  92. np.array(["d", "c", "a", "b"], dtype=object),
  93. np.array(["a", "c", "b"], dtype=object),
  94. np.array(["d"], dtype=object),
  95. [False, True, True, True],
  96. ),
  97. (
  98. np.array(["a", "b", "c", "d"]),
  99. np.array(["a", "b", "c"]),
  100. np.array(["d"]),
  101. [True, True, True, False],
  102. ),
  103. (
  104. np.array(["d", "c", "a", "b"]),
  105. np.array(["a", "c", "b"]),
  106. np.array(["d"]),
  107. [False, True, True, True],
  108. ),
  109. ],
  110. )
  111. def test_check_unknown(values, uniques, expected_diff, expected_mask):
  112. _assert_check_unknown(values, uniques, expected_diff, expected_mask)
  113. @pytest.mark.parametrize("missing_value", [None, np.nan, float("nan")])
  114. @pytest.mark.parametrize("pickle_uniques", [True, False])
  115. def test_check_unknown_missing_values(missing_value, pickle_uniques):
  116. # check for check_unknown with missing values with object dtypes
  117. values = np.array(["d", "c", "a", "b", missing_value], dtype=object)
  118. uniques = np.array(["c", "a", "b", missing_value], dtype=object)
  119. if pickle_uniques:
  120. uniques = pickle.loads(pickle.dumps(uniques))
  121. expected_diff = ["d"]
  122. expected_mask = [False, True, True, True, True]
  123. _assert_check_unknown(values, uniques, expected_diff, expected_mask)
  124. values = np.array(["d", "c", "a", "b", missing_value], dtype=object)
  125. uniques = np.array(["c", "a", "b"], dtype=object)
  126. if pickle_uniques:
  127. uniques = pickle.loads(pickle.dumps(uniques))
  128. expected_diff = ["d", missing_value]
  129. expected_mask = [False, True, True, True, False]
  130. _assert_check_unknown(values, uniques, expected_diff, expected_mask)
  131. values = np.array(["a", missing_value], dtype=object)
  132. uniques = np.array(["a", "b", "z"], dtype=object)
  133. if pickle_uniques:
  134. uniques = pickle.loads(pickle.dumps(uniques))
  135. expected_diff = [missing_value]
  136. expected_mask = [True, False]
  137. _assert_check_unknown(values, uniques, expected_diff, expected_mask)
  138. @pytest.mark.parametrize("missing_value", [np.nan, None, float("nan")])
  139. @pytest.mark.parametrize("pickle_uniques", [True, False])
  140. def test_unique_util_missing_values_objects(missing_value, pickle_uniques):
  141. # check for _unique and _encode with missing values with object dtypes
  142. values = np.array(["a", "c", "c", missing_value, "b"], dtype=object)
  143. expected_uniques = np.array(["a", "b", "c", missing_value], dtype=object)
  144. uniques = _unique(values)
  145. if missing_value is None:
  146. assert_array_equal(uniques, expected_uniques)
  147. else: # missing_value == np.nan
  148. assert_array_equal(uniques[:-1], expected_uniques[:-1])
  149. assert np.isnan(uniques[-1])
  150. if pickle_uniques:
  151. uniques = pickle.loads(pickle.dumps(uniques))
  152. encoded = _encode(values, uniques=uniques)
  153. assert_array_equal(encoded, np.array([0, 2, 2, 3, 1]))
  154. def test_unique_util_missing_values_numeric():
  155. # Check missing values in numerical values
  156. values = np.array([3, 1, np.nan, 5, 3, np.nan], dtype=float)
  157. expected_uniques = np.array([1, 3, 5, np.nan], dtype=float)
  158. expected_inverse = np.array([1, 0, 3, 2, 1, 3])
  159. uniques = _unique(values)
  160. assert_array_equal(uniques, expected_uniques)
  161. uniques, inverse = _unique(values, return_inverse=True)
  162. assert_array_equal(uniques, expected_uniques)
  163. assert_array_equal(inverse, expected_inverse)
  164. encoded = _encode(values, uniques=uniques)
  165. assert_array_equal(encoded, expected_inverse)
  166. def test_unique_util_with_all_missing_values():
  167. # test for all types of missing values for object dtype
  168. values = np.array([np.nan, "a", "c", "c", None, float("nan"), None], dtype=object)
  169. uniques = _unique(values)
  170. assert_array_equal(uniques[:-1], ["a", "c", None])
  171. # last value is nan
  172. assert np.isnan(uniques[-1])
  173. expected_inverse = [3, 0, 1, 1, 2, 3, 2]
  174. _, inverse = _unique(values, return_inverse=True)
  175. assert_array_equal(inverse, expected_inverse)
  176. def test_check_unknown_with_both_missing_values():
  177. # test for both types of missing values for object dtype
  178. values = np.array([np.nan, "a", "c", "c", None, np.nan, None], dtype=object)
  179. diff = _check_unknown(values, known_values=np.array(["a", "c"], dtype=object))
  180. assert diff[0] is None
  181. assert np.isnan(diff[1])
  182. diff, valid_mask = _check_unknown(
  183. values, known_values=np.array(["a", "c"], dtype=object), return_mask=True
  184. )
  185. assert diff[0] is None
  186. assert np.isnan(diff[1])
  187. assert_array_equal(valid_mask, [False, True, True, True, False, False, False])
  188. @pytest.mark.parametrize(
  189. "values, uniques, expected_counts",
  190. [
  191. (np.array([1] * 10 + [2] * 4 + [3] * 15), np.array([1, 2, 3]), [10, 4, 15]),
  192. (
  193. np.array([1] * 10 + [2] * 4 + [3] * 15),
  194. np.array([1, 2, 3, 5]),
  195. [10, 4, 15, 0],
  196. ),
  197. (
  198. np.array([np.nan] * 10 + [2] * 4 + [3] * 15),
  199. np.array([2, 3, np.nan]),
  200. [4, 15, 10],
  201. ),
  202. (
  203. np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
  204. ["a", "b", "c"],
  205. [16, 4, 20],
  206. ),
  207. (
  208. np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
  209. ["c", "b", "a"],
  210. [20, 4, 16],
  211. ),
  212. (
  213. np.array([np.nan] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
  214. ["c", np.nan, "a"],
  215. [20, 4, 16],
  216. ),
  217. (
  218. np.array(["b"] * 4 + ["a"] * 16 + ["c"] * 20, dtype=object),
  219. ["a", "b", "c", "e"],
  220. [16, 4, 20, 0],
  221. ),
  222. ],
  223. )
  224. def test_get_counts(values, uniques, expected_counts):
  225. counts = _get_counts(values, uniques)
  226. assert_array_equal(counts, expected_counts)