test_label.py 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688
  1. import numpy as np
  2. import pytest
  3. from scipy.sparse import (
  4. coo_matrix,
  5. csc_matrix,
  6. csr_matrix,
  7. dok_matrix,
  8. issparse,
  9. lil_matrix,
  10. )
  11. from sklearn import datasets
  12. from sklearn.preprocessing._label import (
  13. LabelBinarizer,
  14. LabelEncoder,
  15. MultiLabelBinarizer,
  16. _inverse_binarize_multiclass,
  17. _inverse_binarize_thresholding,
  18. label_binarize,
  19. )
  20. from sklearn.utils import _to_object_array
  21. from sklearn.utils._testing import assert_array_equal, ignore_warnings
  22. from sklearn.utils.multiclass import type_of_target
# Iris dataset, loaded once when the module is imported. None of the tests in the
# visible part of this file reference it.
iris = datasets.load_iris()
  24. def toarray(a):
  25. if hasattr(a, "toarray"):
  26. a = a.toarray()
  27. return a
  28. def test_label_binarizer():
  29. # one-class case defaults to negative label
  30. # For dense case:
  31. inp = ["pos", "pos", "pos", "pos"]
  32. lb = LabelBinarizer(sparse_output=False)
  33. expected = np.array([[0, 0, 0, 0]]).T
  34. got = lb.fit_transform(inp)
  35. assert_array_equal(lb.classes_, ["pos"])
  36. assert_array_equal(expected, got)
  37. assert_array_equal(lb.inverse_transform(got), inp)
  38. # For sparse case:
  39. lb = LabelBinarizer(sparse_output=True)
  40. got = lb.fit_transform(inp)
  41. assert issparse(got)
  42. assert_array_equal(lb.classes_, ["pos"])
  43. assert_array_equal(expected, got.toarray())
  44. assert_array_equal(lb.inverse_transform(got.toarray()), inp)
  45. lb = LabelBinarizer(sparse_output=False)
  46. # two-class case
  47. inp = ["neg", "pos", "pos", "neg"]
  48. expected = np.array([[0, 1, 1, 0]]).T
  49. got = lb.fit_transform(inp)
  50. assert_array_equal(lb.classes_, ["neg", "pos"])
  51. assert_array_equal(expected, got)
  52. to_invert = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
  53. assert_array_equal(lb.inverse_transform(to_invert), inp)
  54. # multi-class case
  55. inp = ["spam", "ham", "eggs", "ham", "0"]
  56. expected = np.array(
  57. [[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]
  58. )
  59. got = lb.fit_transform(inp)
  60. assert_array_equal(lb.classes_, ["0", "eggs", "ham", "spam"])
  61. assert_array_equal(expected, got)
  62. assert_array_equal(lb.inverse_transform(got), inp)
  63. def test_label_binarizer_unseen_labels():
  64. lb = LabelBinarizer()
  65. expected = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
  66. got = lb.fit_transform(["b", "d", "e"])
  67. assert_array_equal(expected, got)
  68. expected = np.array(
  69. [[0, 0, 0], [1, 0, 0], [0, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0]]
  70. )
  71. got = lb.transform(["a", "b", "c", "d", "e", "f"])
  72. assert_array_equal(expected, got)
  73. def test_label_binarizer_set_label_encoding():
  74. lb = LabelBinarizer(neg_label=-2, pos_label=0)
  75. # two-class case with pos_label=0
  76. inp = np.array([0, 1, 1, 0])
  77. expected = np.array([[-2, 0, 0, -2]]).T
  78. got = lb.fit_transform(inp)
  79. assert_array_equal(expected, got)
  80. assert_array_equal(lb.inverse_transform(got), inp)
  81. lb = LabelBinarizer(neg_label=-2, pos_label=2)
  82. # multi-class case
  83. inp = np.array([3, 2, 1, 2, 0])
  84. expected = np.array(
  85. [
  86. [-2, -2, -2, +2],
  87. [-2, -2, +2, -2],
  88. [-2, +2, -2, -2],
  89. [-2, -2, +2, -2],
  90. [+2, -2, -2, -2],
  91. ]
  92. )
  93. got = lb.fit_transform(inp)
  94. assert_array_equal(expected, got)
  95. assert_array_equal(lb.inverse_transform(got), inp)
  96. @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
  97. @pytest.mark.parametrize("unique_first", [True, False])
  98. def test_label_binarizer_pandas_nullable(dtype, unique_first):
  99. """Checks that LabelBinarizer works with pandas nullable dtypes.
  100. Non-regression test for gh-25637.
  101. """
  102. pd = pytest.importorskip("pandas")
  103. y_true = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)
  104. if unique_first:
  105. # Calling unique creates a pandas array which has a different interface
  106. # compared to a pandas Series. Specifically, pandas arrays do not have "iloc".
  107. y_true = y_true.unique()
  108. lb = LabelBinarizer().fit(y_true)
  109. y_out = lb.transform([1, 0])
  110. assert_array_equal(y_out, [[1], [0]])
  111. @ignore_warnings
  112. def test_label_binarizer_errors():
  113. # Check that invalid arguments yield ValueError
  114. one_class = np.array([0, 0, 0, 0])
  115. lb = LabelBinarizer().fit(one_class)
  116. multi_label = [(2, 3), (0,), (0, 2)]
  117. err_msg = "You appear to be using a legacy multi-label data representation."
  118. with pytest.raises(ValueError, match=err_msg):
  119. lb.transform(multi_label)
  120. lb = LabelBinarizer()
  121. err_msg = "This LabelBinarizer instance is not fitted yet"
  122. with pytest.raises(ValueError, match=err_msg):
  123. lb.transform([])
  124. with pytest.raises(ValueError, match=err_msg):
  125. lb.inverse_transform([])
  126. input_labels = [0, 1, 0, 1]
  127. err_msg = "neg_label=2 must be strictly less than pos_label=1."
  128. lb = LabelBinarizer(neg_label=2, pos_label=1)
  129. with pytest.raises(ValueError, match=err_msg):
  130. lb.fit(input_labels)
  131. err_msg = "neg_label=2 must be strictly less than pos_label=2."
  132. lb = LabelBinarizer(neg_label=2, pos_label=2)
  133. with pytest.raises(ValueError, match=err_msg):
  134. lb.fit(input_labels)
  135. err_msg = (
  136. "Sparse binarization is only supported with non zero pos_label and zero "
  137. "neg_label, got pos_label=2 and neg_label=1"
  138. )
  139. lb = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)
  140. with pytest.raises(ValueError, match=err_msg):
  141. lb.fit(input_labels)
  142. # Fail on y_type
  143. err_msg = "foo format is not supported"
  144. with pytest.raises(ValueError, match=err_msg):
  145. _inverse_binarize_thresholding(
  146. y=csr_matrix([[1, 2], [2, 1]]),
  147. output_type="foo",
  148. classes=[1, 2],
  149. threshold=0,
  150. )
  151. # Sequence of seq type should raise ValueError
  152. y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
  153. err_msg = "You appear to be using a legacy multi-label data representation"
  154. with pytest.raises(ValueError, match=err_msg):
  155. LabelBinarizer().fit_transform(y_seq_of_seqs)
  156. # Fail on the number of classes
  157. err_msg = "The number of class is not equal to the number of dimension of y."
  158. with pytest.raises(ValueError, match=err_msg):
  159. _inverse_binarize_thresholding(
  160. y=csr_matrix([[1, 2], [2, 1]]),
  161. output_type="foo",
  162. classes=[1, 2, 3],
  163. threshold=0,
  164. )
  165. # Fail on the dimension of 'binary'
  166. err_msg = "output_type='binary', but y.shape"
  167. with pytest.raises(ValueError, match=err_msg):
  168. _inverse_binarize_thresholding(
  169. y=np.array([[1, 2, 3], [2, 1, 3]]),
  170. output_type="binary",
  171. classes=[1, 2, 3],
  172. threshold=0,
  173. )
  174. # Fail on multioutput data
  175. err_msg = "Multioutput target data is not supported with label binarization"
  176. with pytest.raises(ValueError, match=err_msg):
  177. LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
  178. with pytest.raises(ValueError, match=err_msg):
  179. label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
  180. @pytest.mark.parametrize(
  181. "values, classes, unknown",
  182. [
  183. (
  184. np.array([2, 1, 3, 1, 3], dtype="int64"),
  185. np.array([1, 2, 3], dtype="int64"),
  186. np.array([4], dtype="int64"),
  187. ),
  188. (
  189. np.array(["b", "a", "c", "a", "c"], dtype=object),
  190. np.array(["a", "b", "c"], dtype=object),
  191. np.array(["d"], dtype=object),
  192. ),
  193. (
  194. np.array(["b", "a", "c", "a", "c"]),
  195. np.array(["a", "b", "c"]),
  196. np.array(["d"]),
  197. ),
  198. ],
  199. ids=["int64", "object", "str"],
  200. )
  201. def test_label_encoder(values, classes, unknown):
  202. # Test LabelEncoder's transform, fit_transform and
  203. # inverse_transform methods
  204. le = LabelEncoder()
  205. le.fit(values)
  206. assert_array_equal(le.classes_, classes)
  207. assert_array_equal(le.transform(values), [1, 0, 2, 0, 2])
  208. assert_array_equal(le.inverse_transform([1, 0, 2, 0, 2]), values)
  209. le = LabelEncoder()
  210. ret = le.fit_transform(values)
  211. assert_array_equal(ret, [1, 0, 2, 0, 2])
  212. with pytest.raises(ValueError, match="unseen labels"):
  213. le.transform(unknown)
  214. def test_label_encoder_negative_ints():
  215. le = LabelEncoder()
  216. le.fit([1, 1, 4, 5, -1, 0])
  217. assert_array_equal(le.classes_, [-1, 0, 1, 4, 5])
  218. assert_array_equal(le.transform([0, 1, 4, 4, 5, -1, -1]), [1, 2, 3, 3, 4, 0, 0])
  219. assert_array_equal(
  220. le.inverse_transform([1, 2, 3, 3, 4, 0, 0]), [0, 1, 4, 4, 5, -1, -1]
  221. )
  222. with pytest.raises(ValueError):
  223. le.transform([0, 6])
  224. @pytest.mark.parametrize("dtype", ["str", "object"])
  225. def test_label_encoder_str_bad_shape(dtype):
  226. le = LabelEncoder()
  227. le.fit(np.array(["apple", "orange"], dtype=dtype))
  228. msg = "should be a 1d array"
  229. with pytest.raises(ValueError, match=msg):
  230. le.transform("apple")
  231. def test_label_encoder_errors():
  232. # Check that invalid arguments yield ValueError
  233. le = LabelEncoder()
  234. with pytest.raises(ValueError):
  235. le.transform([])
  236. with pytest.raises(ValueError):
  237. le.inverse_transform([])
  238. # Fail on unseen labels
  239. le = LabelEncoder()
  240. le.fit([1, 2, 3, -1, 1])
  241. msg = "contains previously unseen labels"
  242. with pytest.raises(ValueError, match=msg):
  243. le.inverse_transform([-2])
  244. with pytest.raises(ValueError, match=msg):
  245. le.inverse_transform([-2, -3, -4])
  246. # Fail on inverse_transform("")
  247. msg = r"should be a 1d array.+shape \(\)"
  248. with pytest.raises(ValueError, match=msg):
  249. le.inverse_transform("")
  250. @pytest.mark.parametrize(
  251. "values",
  252. [
  253. np.array([2, 1, 3, 1, 3], dtype="int64"),
  254. np.array(["b", "a", "c", "a", "c"], dtype=object),
  255. np.array(["b", "a", "c", "a", "c"]),
  256. ],
  257. ids=["int64", "object", "str"],
  258. )
  259. def test_label_encoder_empty_array(values):
  260. le = LabelEncoder()
  261. le.fit(values)
  262. # test empty transform
  263. transformed = le.transform([])
  264. assert_array_equal(np.array([]), transformed)
  265. # test empty inverse transform
  266. inverse_transformed = le.inverse_transform([])
  267. assert_array_equal(np.array([]), inverse_transformed)
  268. def test_sparse_output_multilabel_binarizer():
  269. # test input as iterable of iterables
  270. inputs = [
  271. lambda: [(2, 3), (1,), (1, 2)],
  272. lambda: ({2, 3}, {1}, {1, 2}),
  273. lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
  274. ]
  275. indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
  276. inverse = inputs[0]()
  277. for sparse_output in [True, False]:
  278. for inp in inputs:
  279. # With fit_transform
  280. mlb = MultiLabelBinarizer(sparse_output=sparse_output)
  281. got = mlb.fit_transform(inp())
  282. assert issparse(got) == sparse_output
  283. if sparse_output:
  284. # verify CSR assumption that indices and indptr have same dtype
  285. assert got.indices.dtype == got.indptr.dtype
  286. got = got.toarray()
  287. assert_array_equal(indicator_mat, got)
  288. assert_array_equal([1, 2, 3], mlb.classes_)
  289. assert mlb.inverse_transform(got) == inverse
  290. # With fit
  291. mlb = MultiLabelBinarizer(sparse_output=sparse_output)
  292. got = mlb.fit(inp()).transform(inp())
  293. assert issparse(got) == sparse_output
  294. if sparse_output:
  295. # verify CSR assumption that indices and indptr have same dtype
  296. assert got.indices.dtype == got.indptr.dtype
  297. got = got.toarray()
  298. assert_array_equal(indicator_mat, got)
  299. assert_array_equal([1, 2, 3], mlb.classes_)
  300. assert mlb.inverse_transform(got) == inverse
  301. with pytest.raises(ValueError):
  302. mlb.inverse_transform(csr_matrix(np.array([[0, 1, 1], [2, 0, 0], [1, 1, 0]])))
  303. def test_multilabel_binarizer():
  304. # test input as iterable of iterables
  305. inputs = [
  306. lambda: [(2, 3), (1,), (1, 2)],
  307. lambda: ({2, 3}, {1}, {1, 2}),
  308. lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
  309. ]
  310. indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
  311. inverse = inputs[0]()
  312. for inp in inputs:
  313. # With fit_transform
  314. mlb = MultiLabelBinarizer()
  315. got = mlb.fit_transform(inp())
  316. assert_array_equal(indicator_mat, got)
  317. assert_array_equal([1, 2, 3], mlb.classes_)
  318. assert mlb.inverse_transform(got) == inverse
  319. # With fit
  320. mlb = MultiLabelBinarizer()
  321. got = mlb.fit(inp()).transform(inp())
  322. assert_array_equal(indicator_mat, got)
  323. assert_array_equal([1, 2, 3], mlb.classes_)
  324. assert mlb.inverse_transform(got) == inverse
  325. def test_multilabel_binarizer_empty_sample():
  326. mlb = MultiLabelBinarizer()
  327. y = [[1, 2], [1], []]
  328. Y = np.array([[1, 1], [1, 0], [0, 0]])
  329. assert_array_equal(mlb.fit_transform(y), Y)
  330. def test_multilabel_binarizer_unknown_class():
  331. mlb = MultiLabelBinarizer()
  332. y = [[1, 2]]
  333. Y = np.array([[1, 0], [0, 1]])
  334. warning_message = "unknown class.* will be ignored"
  335. with pytest.warns(UserWarning, match=warning_message):
  336. matrix = mlb.fit(y).transform([[4, 1], [2, 0]])
  337. Y = np.array([[1, 0, 0], [0, 1, 0]])
  338. mlb = MultiLabelBinarizer(classes=[1, 2, 3])
  339. with pytest.warns(UserWarning, match=warning_message):
  340. matrix = mlb.fit(y).transform([[4, 1], [2, 0]])
  341. assert_array_equal(matrix, Y)
  342. def test_multilabel_binarizer_given_classes():
  343. inp = [(2, 3), (1,), (1, 2)]
  344. indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
  345. # fit_transform()
  346. mlb = MultiLabelBinarizer(classes=[1, 3, 2])
  347. assert_array_equal(mlb.fit_transform(inp), indicator_mat)
  348. assert_array_equal(mlb.classes_, [1, 3, 2])
  349. # fit().transform()
  350. mlb = MultiLabelBinarizer(classes=[1, 3, 2])
  351. assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
  352. assert_array_equal(mlb.classes_, [1, 3, 2])
  353. # ensure works with extra class
  354. mlb = MultiLabelBinarizer(classes=[4, 1, 3, 2])
  355. assert_array_equal(
  356. mlb.fit_transform(inp), np.hstack(([[0], [0], [0]], indicator_mat))
  357. )
  358. assert_array_equal(mlb.classes_, [4, 1, 3, 2])
  359. # ensure fit is no-op as iterable is not consumed
  360. inp = iter(inp)
  361. mlb = MultiLabelBinarizer(classes=[1, 3, 2])
  362. assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
  363. # ensure a ValueError is thrown if given duplicate classes
  364. err_msg = (
  365. "The classes argument contains duplicate classes. Remove "
  366. "these duplicates before passing them to MultiLabelBinarizer."
  367. )
  368. mlb = MultiLabelBinarizer(classes=[1, 3, 2, 3])
  369. with pytest.raises(ValueError, match=err_msg):
  370. mlb.fit(inp)
  371. def test_multilabel_binarizer_multiple_calls():
  372. inp = [(2, 3), (1,), (1, 2)]
  373. indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
  374. indicator_mat2 = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
  375. # first call
  376. mlb = MultiLabelBinarizer(classes=[1, 3, 2])
  377. assert_array_equal(mlb.fit_transform(inp), indicator_mat)
  378. # second call change class
  379. mlb.classes = [1, 2, 3]
  380. assert_array_equal(mlb.fit_transform(inp), indicator_mat2)
  381. def test_multilabel_binarizer_same_length_sequence():
  382. # Ensure sequences of the same length are not interpreted as a 2-d array
  383. inp = [[1], [0], [2]]
  384. indicator_mat = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])
  385. # fit_transform()
  386. mlb = MultiLabelBinarizer()
  387. assert_array_equal(mlb.fit_transform(inp), indicator_mat)
  388. assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
  389. # fit().transform()
  390. mlb = MultiLabelBinarizer()
  391. assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
  392. assert_array_equal(mlb.inverse_transform(indicator_mat), inp)
  393. def test_multilabel_binarizer_non_integer_labels():
  394. tuple_classes = _to_object_array([(1,), (2,), (3,)])
  395. inputs = [
  396. ([("2", "3"), ("1",), ("1", "2")], ["1", "2", "3"]),
  397. ([("b", "c"), ("a",), ("a", "b")], ["a", "b", "c"]),
  398. ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
  399. ]
  400. indicator_mat = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
  401. for inp, classes in inputs:
  402. # fit_transform()
  403. mlb = MultiLabelBinarizer()
  404. inp = np.array(inp, dtype=object)
  405. assert_array_equal(mlb.fit_transform(inp), indicator_mat)
  406. assert_array_equal(mlb.classes_, classes)
  407. indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object)
  408. assert_array_equal(indicator_mat_inv, inp)
  409. # fit().transform()
  410. mlb = MultiLabelBinarizer()
  411. assert_array_equal(mlb.fit(inp).transform(inp), indicator_mat)
  412. assert_array_equal(mlb.classes_, classes)
  413. indicator_mat_inv = np.array(mlb.inverse_transform(indicator_mat), dtype=object)
  414. assert_array_equal(indicator_mat_inv, inp)
  415. mlb = MultiLabelBinarizer()
  416. with pytest.raises(TypeError):
  417. mlb.fit_transform([({}), ({}, {"a": "b"})])
  418. def test_multilabel_binarizer_non_unique():
  419. inp = [(1, 1, 1, 0)]
  420. indicator_mat = np.array([[1, 1]])
  421. mlb = MultiLabelBinarizer()
  422. assert_array_equal(mlb.fit_transform(inp), indicator_mat)
  423. def test_multilabel_binarizer_inverse_validation():
  424. inp = [(1, 1, 1, 0)]
  425. mlb = MultiLabelBinarizer()
  426. mlb.fit_transform(inp)
  427. # Not binary
  428. with pytest.raises(ValueError):
  429. mlb.inverse_transform(np.array([[1, 3]]))
  430. # The following binary cases are fine, however
  431. mlb.inverse_transform(np.array([[0, 0]]))
  432. mlb.inverse_transform(np.array([[1, 1]]))
  433. mlb.inverse_transform(np.array([[1, 0]]))
  434. # Wrong shape
  435. with pytest.raises(ValueError):
  436. mlb.inverse_transform(np.array([[1]]))
  437. with pytest.raises(ValueError):
  438. mlb.inverse_transform(np.array([[1, 1, 1]]))
  439. def test_label_binarize_with_class_order():
  440. out = label_binarize([1, 6], classes=[1, 2, 4, 6])
  441. expected = np.array([[1, 0, 0, 0], [0, 0, 0, 1]])
  442. assert_array_equal(out, expected)
  443. # Modified class order
  444. out = label_binarize([1, 6], classes=[1, 6, 4, 2])
  445. expected = np.array([[1, 0, 0, 0], [0, 1, 0, 0]])
  446. assert_array_equal(out, expected)
  447. out = label_binarize([0, 1, 2, 3], classes=[3, 2, 0, 1])
  448. expected = np.array([[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]])
  449. assert_array_equal(out, expected)
  450. def check_binarized_results(y, classes, pos_label, neg_label, expected):
  451. for sparse_output in [True, False]:
  452. if (pos_label == 0 or neg_label != 0) and sparse_output:
  453. with pytest.raises(ValueError):
  454. label_binarize(
  455. y,
  456. classes=classes,
  457. neg_label=neg_label,
  458. pos_label=pos_label,
  459. sparse_output=sparse_output,
  460. )
  461. continue
  462. # check label_binarize
  463. binarized = label_binarize(
  464. y,
  465. classes=classes,
  466. neg_label=neg_label,
  467. pos_label=pos_label,
  468. sparse_output=sparse_output,
  469. )
  470. assert_array_equal(toarray(binarized), expected)
  471. assert issparse(binarized) == sparse_output
  472. # check inverse
  473. y_type = type_of_target(y)
  474. if y_type == "multiclass":
  475. inversed = _inverse_binarize_multiclass(binarized, classes=classes)
  476. else:
  477. inversed = _inverse_binarize_thresholding(
  478. binarized,
  479. output_type=y_type,
  480. classes=classes,
  481. threshold=((neg_label + pos_label) / 2.0),
  482. )
  483. assert_array_equal(toarray(inversed), toarray(y))
  484. # Check label binarizer
  485. lb = LabelBinarizer(
  486. neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output
  487. )
  488. binarized = lb.fit_transform(y)
  489. assert_array_equal(toarray(binarized), expected)
  490. assert issparse(binarized) == sparse_output
  491. inverse_output = lb.inverse_transform(binarized)
  492. assert_array_equal(toarray(inverse_output), toarray(y))
  493. assert issparse(inverse_output) == issparse(y)
  494. def test_label_binarize_binary():
  495. y = [0, 1, 0]
  496. classes = [0, 1]
  497. pos_label = 2
  498. neg_label = -1
  499. expected = np.array([[2, -1], [-1, 2], [2, -1]])[:, 1].reshape((-1, 1))
  500. check_binarized_results(y, classes, pos_label, neg_label, expected)
  501. # Binary case where sparse_output = True will not result in a ValueError
  502. y = [0, 1, 0]
  503. classes = [0, 1]
  504. pos_label = 3
  505. neg_label = 0
  506. expected = np.array([[3, 0], [0, 3], [3, 0]])[:, 1].reshape((-1, 1))
  507. check_binarized_results(y, classes, pos_label, neg_label, expected)
  508. def test_label_binarize_multiclass():
  509. y = [0, 1, 2]
  510. classes = [0, 1, 2]
  511. pos_label = 2
  512. neg_label = 0
  513. expected = 2 * np.eye(3)
  514. check_binarized_results(y, classes, pos_label, neg_label, expected)
  515. with pytest.raises(ValueError):
  516. label_binarize(
  517. y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
  518. )
  519. def test_label_binarize_multilabel():
  520. y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
  521. classes = [0, 1, 2]
  522. pos_label = 2
  523. neg_label = 0
  524. expected = pos_label * y_ind
  525. y_sparse = [
  526. sparse_matrix(y_ind)
  527. for sparse_matrix in [
  528. coo_matrix,
  529. csc_matrix,
  530. csr_matrix,
  531. dok_matrix,
  532. lil_matrix,
  533. ]
  534. ]
  535. for y in [y_ind] + y_sparse:
  536. check_binarized_results(y, classes, pos_label, neg_label, expected)
  537. with pytest.raises(ValueError):
  538. label_binarize(
  539. y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
  540. )
  541. def test_invalid_input_label_binarize():
  542. with pytest.raises(ValueError):
  543. label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)
  544. with pytest.raises(ValueError, match="continuous target data is not "):
  545. label_binarize([1.2, 2.7], classes=[0, 1])
  546. with pytest.raises(ValueError, match="mismatch with the labels"):
  547. label_binarize([[1, 3]], classes=[1, 2, 3])
  548. def test_inverse_binarize_multiclass():
  549. got = _inverse_binarize_multiclass(
  550. csr_matrix([[0, 1, 0], [-1, 0, -1], [0, 0, 0]]), np.arange(3)
  551. )
  552. assert_array_equal(got, np.array([1, 1, 0]))
  553. def test_nan_label_encoder():
  554. """Check that label encoder encodes nans in transform.
  555. Non-regression test for #22628.
  556. """
  557. le = LabelEncoder()
  558. le.fit(["a", "a", "b", np.nan])
  559. y_trans = le.transform([np.nan])
  560. assert_array_equal(y_trans, [2])
  561. @pytest.mark.parametrize(
  562. "encoder", [LabelEncoder(), LabelBinarizer(), MultiLabelBinarizer()]
  563. )
  564. def test_label_encoders_do_not_have_set_output(encoder):
  565. """Check that label encoders do not define set_output and work with y as a kwarg.
  566. Non-regression test for #26854.
  567. """
  568. assert not hasattr(encoder, "set_output")
  569. y_encoded_with_kwarg = encoder.fit_transform(y=["a", "b", "c"])
  570. y_encoded_positional = encoder.fit_transform(["a", "b", "c"])
  571. assert_array_equal(y_encoded_with_kwarg, y_encoded_positional)