| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688 |
- import numpy as np
- import pytest
- from scipy.sparse import (
- coo_matrix,
- csc_matrix,
- csr_matrix,
- dok_matrix,
- issparse,
- lil_matrix,
- )
- from sklearn import datasets
- from sklearn.preprocessing._label import (
- LabelBinarizer,
- LabelEncoder,
- MultiLabelBinarizer,
- _inverse_binarize_multiclass,
- _inverse_binarize_thresholding,
- label_binarize,
- )
- from sklearn.utils import _to_object_array
- from sklearn.utils._testing import assert_array_equal, ignore_warnings
- from sklearn.utils.multiclass import type_of_target
# Iris dataset loaded once at import time. It is not referenced by any test
# visible in this section of the file.
iris = datasets.load_iris()
def toarray(a):
    """Return ``a.toarray()`` when ``a`` exposes ``toarray``, else ``a`` itself.

    Lets the checks below compare sparse and dense results uniformly.
    """
    return a.toarray() if hasattr(a, "toarray") else a
def test_label_binarizer():
    """Exercise LabelBinarizer on one-class, two-class and multi-class input."""
    # A single class is encoded with the negative label only.
    single_class = ["pos", "pos", "pos", "pos"]
    single_expected = np.array([[0, 0, 0, 0]]).T

    # Dense output.
    binarizer = LabelBinarizer(sparse_output=False)
    dense = binarizer.fit_transform(single_class)
    assert_array_equal(binarizer.classes_, ["pos"])
    assert_array_equal(single_expected, dense)
    assert_array_equal(binarizer.inverse_transform(dense), single_class)

    # Sparse output.
    binarizer = LabelBinarizer(sparse_output=True)
    sparse = binarizer.fit_transform(single_class)
    assert issparse(sparse)
    assert_array_equal(binarizer.classes_, ["pos"])
    assert_array_equal(single_expected, sparse.toarray())
    assert_array_equal(binarizer.inverse_transform(sparse.toarray()), single_class)

    binarizer = LabelBinarizer(sparse_output=False)

    # Two classes collapse to a single column.
    two_class = ["neg", "pos", "pos", "neg"]
    dense = binarizer.fit_transform(two_class)
    assert_array_equal(binarizer.classes_, ["neg", "pos"])
    assert_array_equal(np.array([[0, 1, 1, 0]]).T, dense)

    # A two-column indicator is accepted by inverse_transform as well.
    two_column = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(binarizer.inverse_transform(two_column), two_class)

    # Several classes produce one column per class.
    multi_class = ["spam", "ham", "eggs", "ham", "0"]
    multi_expected = np.array(
        [[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1, 0, 0, 0]]
    )
    dense = binarizer.fit_transform(multi_class)
    assert_array_equal(binarizer.classes_, ["0", "eggs", "ham", "spam"])
    assert_array_equal(multi_expected, dense)
    assert_array_equal(binarizer.inverse_transform(dense), multi_class)
def test_label_binarizer_unseen_labels():
    """Labels that were not part of ``fit`` are expected to give all-zero rows."""
    binarizer = LabelBinarizer()

    fitted_output = binarizer.fit_transform(["b", "d", "e"])
    assert_array_equal(np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]]), fitted_output)

    # "a", "c" and "f" were never passed to fit.
    mixed_expected = np.array(
        [[0, 0, 0], [1, 0, 0], [0, 0, 0], [0, 1, 0], [0, 0, 1], [0, 0, 0]]
    )
    mixed_output = binarizer.transform(["a", "b", "c", "d", "e", "f"])
    assert_array_equal(mixed_expected, mixed_output)
def test_label_binarizer_set_label_encoding():
    """Check that custom ``neg_label``/``pos_label`` values round-trip."""
    # Binary labels, using pos_label=0.
    binary_labels = np.array([0, 1, 1, 0])
    binarizer = LabelBinarizer(neg_label=-2, pos_label=0)
    encoded = binarizer.fit_transform(binary_labels)
    assert_array_equal(np.array([[-2, 0, 0, -2]]).T, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), binary_labels)

    # Multi-class labels.
    multiclass_labels = np.array([3, 2, 1, 2, 0])
    multiclass_expected = np.array(
        [
            [-2, -2, -2, +2],
            [-2, -2, +2, -2],
            [-2, +2, -2, -2],
            [-2, -2, +2, -2],
            [+2, -2, -2, -2],
        ]
    )
    binarizer = LabelBinarizer(neg_label=-2, pos_label=2)
    encoded = binarizer.fit_transform(multiclass_labels)
    assert_array_equal(multiclass_expected, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), multiclass_labels)
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
@pytest.mark.parametrize("unique_first", [True, False])
def test_label_binarizer_pandas_nullable(dtype, unique_first):
    """LabelBinarizer must accept pandas nullable dtypes.

    Non-regression test for gh-25637.
    """
    pd = pytest.importorskip("pandas")

    series = pd.Series([1, 0, 0, 1, 0, 1, 1, 0, 1], dtype=dtype)
    # ``unique`` yields a pandas array, whose interface differs from a Series
    # (in particular it has no "iloc"), so both container kinds are covered.
    y_true = series.unique() if unique_first else series

    binarizer = LabelBinarizer().fit(y_true)
    assert_array_equal(binarizer.transform([1, 0]), [[1], [0]])
@ignore_warnings
def test_label_binarizer_errors():
    """Invalid arguments and unsupported inputs must raise ``ValueError``."""
    # Legacy multi-label (sequence of sequences) input passed to transform.
    fitted = LabelBinarizer().fit(np.array([0, 0, 0, 0]))
    legacy_msg = "You appear to be using a legacy multi-label data representation."
    with pytest.raises(ValueError, match=legacy_msg):
        fitted.transform([(2, 3), (0,), (0, 2)])

    # Calling either direction on an unfitted estimator.
    unfitted = LabelBinarizer()
    not_fitted_msg = "This LabelBinarizer instance is not fitted yet"
    with pytest.raises(ValueError, match=not_fitted_msg):
        unfitted.transform([])
    with pytest.raises(ValueError, match=not_fitted_msg):
        unfitted.inverse_transform([])

    labels = [0, 1, 0, 1]

    # neg_label has to be strictly smaller than pos_label.
    label_order_cases = (
        (1, "neg_label=2 must be strictly less than pos_label=1."),
        (2, "neg_label=2 must be strictly less than pos_label=2."),
    )
    for pos_label, order_msg in label_order_cases:
        binarizer = LabelBinarizer(neg_label=2, pos_label=pos_label)
        with pytest.raises(ValueError, match=order_msg):
            binarizer.fit(labels)

    # Sparse output with a non-zero neg_label.
    sparse_msg = (
        "Sparse binarization is only supported with non zero pos_label and zero "
        "neg_label, got pos_label=2 and neg_label=1"
    )
    binarizer = LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)
    with pytest.raises(ValueError, match=sparse_msg):
        binarizer.fit(labels)

    # Unknown output type.
    with pytest.raises(ValueError, match="foo format is not supported"):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2],
            threshold=0,
        )

    # A sequence of sequences passed to fit_transform.
    seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    legacy_msg = "You appear to be using a legacy multi-label data representation"
    with pytest.raises(ValueError, match=legacy_msg):
        LabelBinarizer().fit_transform(seq_of_seqs)

    # Number of classes differs from the number of columns in y.
    n_classes_msg = "The number of class is not equal to the number of dimension of y."
    with pytest.raises(ValueError, match=n_classes_msg):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]),
            output_type="foo",
            classes=[1, 2, 3],
            threshold=0,
        )

    # 'binary' output requested for a y with three columns.
    with pytest.raises(ValueError, match="output_type='binary', but y.shape"):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]),
            output_type="binary",
            classes=[1, 2, 3],
            threshold=0,
        )

    # Multioutput targets, for both the estimator and the function.
    multioutput_msg = "Multioutput target data is not supported with label binarization"
    with pytest.raises(ValueError, match=multioutput_msg):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError, match=multioutput_msg):
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
@pytest.mark.parametrize(
    "values, classes, unknown",
    [
        (
            np.array([2, 1, 3, 1, 3], dtype="int64"),
            np.array([1, 2, 3], dtype="int64"),
            np.array([4], dtype="int64"),
        ),
        (
            np.array(["b", "a", "c", "a", "c"], dtype=object),
            np.array(["a", "b", "c"], dtype=object),
            np.array(["d"], dtype=object),
        ),
        (
            np.array(["b", "a", "c", "a", "c"]),
            np.array(["a", "b", "c"]),
            np.array(["d"]),
        ),
    ],
    ids=["int64", "object", "str"],
)
def test_label_encoder(values, classes, unknown):
    """Check LabelEncoder's fit, transform, fit_transform and inverse_transform."""
    expected_codes = [1, 0, 2, 0, 2]

    encoder = LabelEncoder()
    encoder.fit(values)
    assert_array_equal(encoder.classes_, classes)
    assert_array_equal(encoder.transform(values), expected_codes)
    assert_array_equal(encoder.inverse_transform([1, 0, 2, 0, 2]), values)

    encoder = LabelEncoder()
    assert_array_equal(encoder.fit_transform(values), expected_codes)

    # A label absent from the fitted data is rejected by transform.
    with pytest.raises(ValueError, match="unseen labels"):
        encoder.transform(unknown)
def test_label_encoder_negative_ints():
    """LabelEncoder handles negative integers in both directions."""
    encoder = LabelEncoder()
    encoder.fit([1, 1, 4, 5, -1, 0])
    assert_array_equal(encoder.classes_, [-1, 0, 1, 4, 5])

    raw_labels = [0, 1, 4, 4, 5, -1, -1]
    encoded_labels = [1, 2, 3, 3, 4, 0, 0]
    assert_array_equal(encoder.transform(raw_labels), encoded_labels)
    assert_array_equal(encoder.inverse_transform(encoded_labels), raw_labels)

    # 6 was not part of the fitted labels.
    with pytest.raises(ValueError):
        encoder.transform([0, 6])
@pytest.mark.parametrize("dtype", ["str", "object"])
def test_label_encoder_str_bad_shape(dtype):
    """Passing a bare string to ``transform`` must raise a shape error."""
    fruits = np.array(["apple", "orange"], dtype=dtype)
    encoder = LabelEncoder()
    encoder.fit(fruits)

    with pytest.raises(ValueError, match="should be a 1d array"):
        encoder.transform("apple")
def test_label_encoder_errors():
    """Invalid LabelEncoder usage must raise ``ValueError``."""
    # Neither direction is usable before fit.
    unfitted = LabelEncoder()
    with pytest.raises(ValueError):
        unfitted.transform([])
    with pytest.raises(ValueError):
        unfitted.inverse_transform([])

    # Codes outside the fitted range are reported as unseen labels.
    encoder = LabelEncoder()
    encoder.fit([1, 2, 3, -1, 1])
    unseen_msg = "contains previously unseen labels"
    for bad_codes in ([-2], [-2, -3, -4]):
        with pytest.raises(ValueError, match=unseen_msg):
            encoder.inverse_transform(bad_codes)

    # An empty string is a 0-d input, not a 1d array.
    scalar_msg = r"should be a 1d array.+shape \(\)"
    with pytest.raises(ValueError, match=scalar_msg):
        encoder.inverse_transform("")
@pytest.mark.parametrize(
    "values",
    [
        np.array([2, 1, 3, 1, 3], dtype="int64"),
        np.array(["b", "a", "c", "a", "c"], dtype=object),
        np.array(["b", "a", "c", "a", "c"]),
    ],
    ids=["int64", "object", "str"],
)
def test_label_encoder_empty_array(values):
    """A fitted LabelEncoder maps empty input to empty output both ways."""
    encoder = LabelEncoder()
    encoder.fit(values)

    # Empty transform.
    encoded = encoder.transform([])
    assert_array_equal(np.array([]), encoded)

    # Empty inverse transform.
    decoded = encoder.inverse_transform([])
    assert_array_equal(np.array([]), decoded)
def test_sparse_output_multilabel_binarizer():
    """MultiLabelBinarizer gives the same result for sparse and dense output."""
    # Factories rather than values, so one-shot iterators are rebuilt per use.
    input_factories = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    expected_indicator = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    expected_inverse = input_factories[0]()

    def check_output(binarizer, output, sparse_output):
        # Shared assertions for the fit_transform and fit + transform paths.
        assert issparse(output) == sparse_output
        if sparse_output:
            # verify CSR assumption that indices and indptr have same dtype
            assert output.indices.dtype == output.indptr.dtype
            output = output.toarray()
        assert_array_equal(expected_indicator, output)
        assert_array_equal([1, 2, 3], binarizer.classes_)
        assert binarizer.inverse_transform(output) == expected_inverse

    for sparse_output in [True, False]:
        for make_input in input_factories:
            # fit_transform in one step.
            mlb = MultiLabelBinarizer(sparse_output=sparse_output)
            check_output(mlb, mlb.fit_transform(make_input()), sparse_output)

            # fit followed by transform.
            mlb = MultiLabelBinarizer(sparse_output=sparse_output)
            check_output(
                mlb, mlb.fit(make_input()).transform(make_input()), sparse_output
            )

    # A sparse matrix holding a value other than 0/1 is rejected; this uses
    # the binarizer left over from the last loop iteration.
    non_binary = csr_matrix(np.array([[0, 1, 1], [2, 0, 0], [1, 1, 0]]))
    with pytest.raises(ValueError):
        mlb.inverse_transform(non_binary)
def test_multilabel_binarizer():
    """MultiLabelBinarizer accepts lists, sets and iterators of labels."""
    # Factories rather than values, so one-shot iterators are rebuilt per use.
    input_factories = [
        lambda: [(2, 3), (1,), (1, 2)],
        lambda: ({2, 3}, {1}, {1, 2}),
        lambda: iter([iter((2, 3)), iter((1,)), {1, 2}]),
    ]
    expected_indicator = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    expected_inverse = input_factories[0]()

    def check_output(binarizer, output):
        # Shared assertions for the fit_transform and fit + transform paths.
        assert_array_equal(expected_indicator, output)
        assert_array_equal([1, 2, 3], binarizer.classes_)
        assert binarizer.inverse_transform(output) == expected_inverse

    for make_input in input_factories:
        # fit_transform in one step.
        mlb = MultiLabelBinarizer()
        check_output(mlb, mlb.fit_transform(make_input()))

        # fit followed by transform.
        mlb = MultiLabelBinarizer()
        check_output(mlb, mlb.fit(make_input()).transform(make_input()))
def test_multilabel_binarizer_empty_sample():
    """A sample with no labels is encoded as an all-zero row."""
    samples = [[1, 2], [1], []]
    expected = np.array([[1, 1], [1, 0], [0, 0]])

    binarized = MultiLabelBinarizer().fit_transform(samples)
    assert_array_equal(binarized, expected)
def test_multilabel_binarizer_unknown_class():
    """Unknown classes passed to ``transform`` warn and get no column.

    Two configurations are checked: classes inferred from the fitted data and
    classes supplied explicitly. In both, ``transform`` must emit a
    ``UserWarning`` matching "unknown class.* will be ignored" and the
    resulting indicator matrix must only reflect the known classes.
    """
    y = [[1, 2]]
    with_unknown = [[4, 1], [2, 0]]
    warning_message = "unknown class.* will be ignored"

    # Classes inferred from ``y``: columns are [1, 2].
    mlb = MultiLabelBinarizer()
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform(with_unknown)
    # This expectation used to be defined but never compared, which left the
    # inferred-classes output unchecked.
    assert_array_equal(matrix, np.array([[1, 0], [0, 1]]))

    # Classes given explicitly: columns are [1, 2, 3].
    mlb = MultiLabelBinarizer(classes=[1, 2, 3])
    with pytest.warns(UserWarning, match=warning_message):
        matrix = mlb.fit(y).transform(with_unknown)
    assert_array_equal(matrix, np.array([[1, 0, 0], [0, 1, 0]]))
def test_multilabel_binarizer_given_classes():
    """Explicit ``classes`` fix the column order of the indicator matrix."""
    samples = [(2, 3), (1,), (1, 2)]
    # Columns follow the requested order [1, 3, 2].
    expected = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])

    # fit_transform()
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(binarizer.fit_transform(samples), expected)
    assert_array_equal(binarizer.classes_, [1, 3, 2])

    # fit().transform()
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(binarizer.fit(samples).transform(samples), expected)
    assert_array_equal(binarizer.classes_, [1, 3, 2])

    # An extra class that never occurs adds a leading all-zero column.
    binarizer = MultiLabelBinarizer(classes=[4, 1, 3, 2])
    expected_with_extra = np.hstack(([[0], [0], [0]], expected))
    assert_array_equal(binarizer.fit_transform(samples), expected_with_extra)
    assert_array_equal(binarizer.classes_, [4, 1, 3, 2])

    # ensure fit is no-op as iterable is not consumed
    lazy_samples = iter(samples)
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(
        binarizer.fit(lazy_samples).transform(lazy_samples), expected
    )

    # Duplicate entries in ``classes`` are rejected at fit time.
    duplicate_msg = (
        "The classes argument contains duplicate classes. Remove "
        "these duplicates before passing them to MultiLabelBinarizer."
    )
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2, 3])
    with pytest.raises(ValueError, match=duplicate_msg):
        binarizer.fit(lazy_samples)
def test_multilabel_binarizer_multiple_calls():
    """Changing ``classes`` between fit_transform calls changes the columns."""
    samples = [(2, 3), (1,), (1, 2)]
    expected_first = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
    expected_second = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])

    # First call, with class order [1, 3, 2].
    binarizer = MultiLabelBinarizer(classes=[1, 3, 2])
    assert_array_equal(binarizer.fit_transform(samples), expected_first)

    # Second call on the same estimator after changing the class order.
    binarizer.classes = [1, 2, 3]
    assert_array_equal(binarizer.fit_transform(samples), expected_second)
def test_multilabel_binarizer_same_length_sequence():
    """Equal-length label sequences must not be read as a 2-d array."""
    samples = [[1], [0], [2]]
    expected = np.array([[0, 1, 0], [1, 0, 0], [0, 0, 1]])

    encoders = (
        lambda est: est.fit_transform(samples),  # fit_transform()
        lambda est: est.fit(samples).transform(samples),  # fit().transform()
    )
    for encode in encoders:
        binarizer = MultiLabelBinarizer()
        assert_array_equal(encode(binarizer), expected)
        assert_array_equal(binarizer.inverse_transform(expected), samples)
def test_multilabel_binarizer_non_integer_labels():
    """String and tuple labels are supported; unhashable labels are not."""
    tuple_classes = _to_object_array([(1,), (2,), (3,)])
    cases = [
        ([("2", "3"), ("1",), ("1", "2")], ["1", "2", "3"]),
        ([("b", "c"), ("a",), ("a", "b")], ["a", "b", "c"]),
        ([((2,), (3,)), ((1,),), ((1,), (2,))], tuple_classes),
    ]
    expected = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])

    for raw_samples, classes in cases:
        samples = np.array(raw_samples, dtype=object)

        # fit_transform()
        binarizer = MultiLabelBinarizer()
        assert_array_equal(binarizer.fit_transform(samples), expected)
        assert_array_equal(binarizer.classes_, classes)
        recovered = np.array(binarizer.inverse_transform(expected), dtype=object)
        assert_array_equal(recovered, samples)

        # fit().transform()
        binarizer = MultiLabelBinarizer()
        assert_array_equal(binarizer.fit(samples).transform(samples), expected)
        assert_array_equal(binarizer.classes_, classes)
        recovered = np.array(binarizer.inverse_transform(expected), dtype=object)
        assert_array_equal(recovered, samples)

    # Dict labels are expected to fail with a TypeError.
    binarizer = MultiLabelBinarizer()
    with pytest.raises(TypeError):
        binarizer.fit_transform([({}), ({}, {"a": "b"})])
def test_multilabel_binarizer_non_unique():
    """Repeated labels within one sample are counted once."""
    repeated = [(1, 1, 1, 0)]
    binarized = MultiLabelBinarizer().fit_transform(repeated)
    assert_array_equal(binarized, np.array([[1, 1]]))
def test_multilabel_binarizer_inverse_validation():
    """``inverse_transform`` validates both the values and the shape."""
    binarizer = MultiLabelBinarizer()
    binarizer.fit_transform([(1, 1, 1, 0)])

    # Values other than 0/1 are rejected.
    with pytest.raises(ValueError):
        binarizer.inverse_transform(np.array([[1, 3]]))

    # Any binary row of the fitted width is accepted.
    for binary_row in ([[0, 0]], [[1, 1]], [[1, 0]]):
        binarizer.inverse_transform(np.array(binary_row))

    # A column count different from the number of classes is rejected.
    for wrong_width in ([[1]], [[1, 1, 1]]):
        with pytest.raises(ValueError):
            binarizer.inverse_transform(np.array(wrong_width))
def test_label_binarize_with_class_order():
    """Output columns of ``label_binarize`` follow the order of ``classes``."""
    cases = [
        # (labels, classes, expected indicator matrix)
        ([1, 6], [1, 2, 4, 6], [[1, 0, 0, 0], [0, 0, 0, 1]]),
        # Modified class order
        ([1, 6], [1, 6, 4, 2], [[1, 0, 0, 0], [0, 1, 0, 0]]),
        (
            [0, 1, 2, 3],
            [3, 2, 0, 1],
            [[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1, 0, 0, 0]],
        ),
    ]
    for labels, classes, expected_rows in cases:
        binarized = label_binarize(labels, classes=classes)
        assert_array_equal(binarized, np.array(expected_rows))
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Check ``label_binarize`` and ``LabelBinarizer`` against ``expected``.

    Runs once with sparse and once with dense output. When sparse output is
    requested together with ``pos_label == 0`` or ``neg_label != 0``, only the
    ``ValueError`` from ``label_binarize`` is checked. Otherwise the binarized
    result, its sparsity and the inverse mapping back to ``y`` are verified
    for both the function and the estimator.
    """
    sparse_supported = pos_label != 0 and neg_label == 0

    for sparse_output in [True, False]:
        binarize_options = {
            "classes": classes,
            "neg_label": neg_label,
            "pos_label": pos_label,
            "sparse_output": sparse_output,
        }

        if sparse_output and not sparse_supported:
            with pytest.raises(ValueError):
                label_binarize(y, **binarize_options)
            continue

        # Function API.
        binarized = label_binarize(y, **binarize_options)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output

        # Invert with the private helper matching the target type.
        target_type = type_of_target(y)
        if target_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)
        else:
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=target_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.0),
            )
        assert_array_equal(toarray(inversed), toarray(y))

        # Estimator API.
        estimator = LabelBinarizer(
            neg_label=neg_label, pos_label=pos_label, sparse_output=sparse_output
        )
        binarized = estimator.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output

        round_trip = estimator.inverse_transform(binarized)
        assert_array_equal(toarray(round_trip), toarray(y))
        assert issparse(round_trip) == issparse(y)
def test_label_binarize_binary():
    """Binary targets are binarized into a single column."""
    y = [0, 1, 0]
    classes = [0, 1]

    cases = [
        # (pos_label, neg_label, expected single-column output)
        (2, -1, np.array([[-1], [2], [-1]])),
        # Binary case where sparse_output = True will not result in a ValueError
        (3, 0, np.array([[0], [3], [0]])),
    ]
    for pos_label, neg_label, expected in cases:
        check_binarized_results(y, classes, pos_label, neg_label, expected)
def test_label_binarize_multiclass():
    """Multiclass targets give a scaled identity; sparse needs neg_label=0."""
    y = [0, 1, 2]
    classes = [0, 1, 2]
    pos_label, neg_label = 2, 0

    check_binarized_results(y, classes, pos_label, neg_label, 2 * np.eye(3))

    # Sparse output with a non-zero neg_label.
    with pytest.raises(ValueError):
        label_binarize(
            y, classes=classes, neg_label=-1, pos_label=pos_label, sparse_output=True
        )
def test_label_binarize_multilabel():
    """Multilabel indicator targets work for dense and sparse input formats."""
    y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
    classes = [0, 1, 2]
    pos_label, neg_label = 2, 0
    expected = pos_label * y_ind

    # The dense indicator first, then the same data in each sparse format.
    sparse_formats = (coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix)
    targets = [y_ind]
    targets.extend(to_sparse(y_ind) for to_sparse in sparse_formats)

    for target in targets:
        check_binarized_results(target, classes, pos_label, neg_label, expected)

    # Sparse output with a non-zero neg_label, checked on the last target
    # (the lil_matrix).
    with pytest.raises(ValueError):
        label_binarize(
            targets[-1],
            classes=classes,
            neg_label=-1,
            pos_label=pos_label,
            sparse_output=True,
        )
def test_invalid_input_label_binarize():
    """``label_binarize`` rejects bad label values and unsupported targets."""
    # pos_label smaller than neg_label.
    with pytest.raises(ValueError):
        label_binarize([0, 2], classes=[0, 2], pos_label=0, neg_label=1)

    # Continuous targets.
    continuous_msg = "continuous target data is not "
    with pytest.raises(ValueError, match=continuous_msg):
        label_binarize([1.2, 2.7], classes=[0, 1])

    # 2-d input that does not line up with the given classes.
    mismatch_msg = "mismatch with the labels"
    with pytest.raises(ValueError, match=mismatch_msg):
        label_binarize([[1, 3]], classes=[1, 2, 3])
def test_inverse_binarize_multiclass():
    """Check ``_inverse_binarize_multiclass`` on a sparse score matrix."""
    scores = csr_matrix([[0, 1, 0], [-1, 0, -1], [0, 0, 0]])
    decoded = _inverse_binarize_multiclass(scores, np.arange(3))
    assert_array_equal(decoded, np.array([1, 1, 0]))
def test_nan_label_encoder():
    """LabelEncoder must encode NaN in ``transform``.

    Non-regression test for #22628.
    """
    encoder = LabelEncoder()
    encoder.fit(["a", "a", "b", np.nan])

    encoded_nan = encoder.transform([np.nan])
    assert_array_equal(encoded_nan, [2])
@pytest.mark.parametrize(
    "encoder", [LabelEncoder(), LabelBinarizer(), MultiLabelBinarizer()]
)
def test_label_encoders_do_not_have_set_output(encoder):
    """Label encoders expose no ``set_output`` and accept ``y`` as a keyword.

    Non-regression test for #26854.
    """
    assert not hasattr(encoder, "set_output")

    labels = ["a", "b", "c"]
    via_keyword = encoder.fit_transform(y=labels)
    via_position = encoder.fit_transform(labels)
    assert_array_equal(via_keyword, via_position)
|