| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951 |
- # Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
- # Mathieu Blondel <mathieu@mblondel.org>
- # Olivier Grisel <olivier.grisel@ensta.org>
- # Andreas Mueller <amueller@ais.uni-bonn.de>
- # Joel Nothman <joel.nothman@gmail.com>
- # Hamzeh Alsalhi <ha258@cornell.edu>
- # License: BSD 3 clause
- import array
- import itertools
- import warnings
- from collections import defaultdict
- from numbers import Integral
- import numpy as np
- import scipy.sparse as sp
- from ..base import BaseEstimator, TransformerMixin, _fit_context
- from ..utils import column_or_1d
- from ..utils._encode import _encode, _unique
- from ..utils._param_validation import Interval, validate_params
- from ..utils.multiclass import type_of_target, unique_labels
- from ..utils.sparsefuncs import min_max_axis
- from ..utils.validation import _num_samples, check_array, check_is_fitted
# Public names of this module, i.e. what ``from <module> import *`` exports.
__all__ = [
    "label_binarize",
    "LabelBinarizer",
    "LabelEncoder",
    "MultiLabelBinarizer",
]
class LabelEncoder(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Encode target labels as integers between 0 and n_classes-1.

    Use this transformer on target values, *i.e.* `y`; it is not meant for
    the input `X`.

    Read more in the :ref:`User Guide <preprocessing_targets>`.

    .. versionadded:: 0.12

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Holds the label for each class.

    See Also
    --------
    OrdinalEncoder : Encode categorical features using an ordinal encoding
        scheme.
    OneHotEncoder : Encode categorical features as a one-hot numeric array.

    Examples
    --------
    `LabelEncoder` can be used to normalize labels.

    >>> from sklearn.preprocessing import LabelEncoder
    >>> le = LabelEncoder()
    >>> le.fit([1, 2, 2, 6])
    LabelEncoder()
    >>> le.classes_
    array([1, 2, 6])
    >>> le.transform([1, 1, 2, 6])
    array([0, 0, 1, 2]...)
    >>> le.inverse_transform([0, 0, 1, 2])
    array([1, 1, 2, 6])

    It can also be used to transform non-numerical labels (as long as they are
    hashable and comparable) to numerical labels.

    >>> le = LabelEncoder()
    >>> le.fit(["paris", "paris", "tokyo", "amsterdam"])
    LabelEncoder()
    >>> list(le.classes_)
    ['amsterdam', 'paris', 'tokyo']
    >>> le.transform(["tokyo", "tokyo", "paris"])
    array([2, 2, 1]...)
    >>> list(le.inverse_transform([2, 2, 1]))
    ['tokyo', 'tokyo', 'paris']
    """

    def fit(self, y):
        """Learn the set of distinct labels present in `y`.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : returns an instance of self.
            Fitted label encoder.
        """
        labels = column_or_1d(y, warn=True)
        self.classes_ = _unique(labels)
        return self

    def fit_transform(self, y):
        """Learn the distinct labels of `y` and return `y` encoded.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : array-like of shape (n_samples,)
            Encoded labels.
        """
        labels = column_or_1d(y, warn=True)
        uniques, codes = _unique(labels, return_inverse=True)
        self.classes_ = uniques
        return codes

    def transform(self, y):
        """Map labels to their normalized integer encoding.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        y : array-like of shape (n_samples,)
            Labels as normalized encodings.
        """
        check_is_fitted(self)
        labels = column_or_1d(y, dtype=self.classes_.dtype, warn=True)
        if _num_samples(labels) != 0:
            return _encode(labels, uniques=self.classes_)
        # An empty input encodes to an empty array.
        return np.array([])

    def inverse_transform(self, y):
        """Map integer encodings back to the original labels.

        Parameters
        ----------
        y : ndarray of shape (n_samples,)
            Target values.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            Original encoding.

        Raises
        ------
        ValueError
            If `y` contains a value outside ``0 .. len(classes_) - 1``.
        """
        check_is_fitted(self)
        codes = column_or_1d(y, warn=True)
        # An empty input decodes to an empty array.
        if _num_samples(codes) == 0:
            return np.array([])

        valid_codes = np.arange(len(self.classes_))
        unseen = np.setdiff1d(codes, valid_codes)
        if len(unseen) > 0:
            raise ValueError("y contains previously unseen labels: %s" % str(unseen))
        return self.classes_[np.asarray(codes)]

    def _more_tags(self):
        return {"X_types": ["1dlabels"]}
class LabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Binarize labels in a one-vs-all fashion.

    Several regression and binary classification algorithms are
    available in scikit-learn. A simple way to extend these algorithms
    to the multi-class classification case is to use the so-called
    one-vs-all scheme.

    At learning time, one regressor or binary classifier is learned per
    class, which requires converting multi-class labels to binary labels
    (belongs or does not belong to the class). `LabelBinarizer` makes this
    process easy with the transform method.

    At prediction time, one assigns the class for which the corresponding
    model gave the greatest confidence. `LabelBinarizer` makes this easy
    with the :meth:`inverse_transform` method.

    Read more in the :ref:`User Guide <preprocessing_targets>`.

    Parameters
    ----------
    neg_label : int, default=0
        Value with which negative labels must be encoded.

    pos_label : int, default=1
        Value with which positive labels must be encoded.

    sparse_output : bool, default=False
        True if the returned array from transform is desired to be in sparse
        CSR format.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        Holds the label for each class.

    y_type_ : str
        Represents the type of the target data as evaluated by
        :func:`~sklearn.utils.multiclass.type_of_target`. Possible types are
        'continuous', 'continuous-multioutput', 'binary', 'multiclass',
        'multiclass-multioutput', 'multilabel-indicator', and 'unknown'.

    sparse_input_ : bool
        `True` if the input data to transform is given as a sparse matrix,
        `False` otherwise.

    See Also
    --------
    label_binarize : Function to perform the transform operation of
        LabelBinarizer with fixed classes.
    OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
        scheme.

    Examples
    --------
    >>> from sklearn.preprocessing import LabelBinarizer
    >>> lb = LabelBinarizer()
    >>> lb.fit([1, 2, 6, 4, 2])
    LabelBinarizer()
    >>> lb.classes_
    array([1, 2, 4, 6])
    >>> lb.transform([1, 6])
    array([[1, 0, 0, 0],
           [0, 0, 0, 1]])

    Binary targets transform to a column vector

    >>> lb = LabelBinarizer()
    >>> lb.fit_transform(['yes', 'no', 'no', 'yes'])
    array([[1],
           [0],
           [0],
           [1]])

    Passing a 2D matrix for multilabel classification

    >>> import numpy as np
    >>> lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))
    LabelBinarizer()
    >>> lb.classes_
    array([0, 1, 2])
    >>> lb.transform([0, 1, 2, 1])
    array([[1, 0, 0],
           [0, 1, 0],
           [0, 0, 1],
           [0, 1, 0]])
    """

    _parameter_constraints: dict = {
        "neg_label": [Integral],
        "pos_label": [Integral],
        "sparse_output": ["boolean"],
    }

    def __init__(self, *, neg_label=0, pos_label=1, sparse_output=False):
        self.neg_label = neg_label
        self.pos_label = pos_label
        self.sparse_output = sparse_output

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, y):
        """Fit label binarizer.

        Parameters
        ----------
        y : ndarray of shape (n_samples,) or (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification.

        Returns
        -------
        self : object
            Returns the instance itself.

        Raises
        ------
        ValueError
            If `neg_label` is not strictly less than `pos_label`, if sparse
            output is requested with `pos_label == 0` or `neg_label != 0`,
            if `y` is a multioutput target, or if `y` has no samples.
        """
        neg, pos = self.neg_label, self.pos_label

        if neg >= pos:
            raise ValueError(
                f"neg_label={neg} must be strictly less than pos_label={pos}."
            )

        # A sparse result stores negatives implicitly as zeros, so it needs
        # neg_label == 0 and a non-zero pos_label.
        sparse_compatible = pos != 0 and neg == 0
        if self.sparse_output and not sparse_compatible:
            raise ValueError(
                "Sparse binarization is only supported with non "
                "zero pos_label and zero neg_label, got "
                f"pos_label={pos} and neg_label={neg}"
            )

        self.y_type_ = type_of_target(y, input_name="y")
        if "multioutput" in self.y_type_:
            raise ValueError(
                "Multioutput target data is not supported with label binarization"
            )

        if _num_samples(y) == 0:
            raise ValueError("y has 0 samples: %r" % y)

        self.sparse_input_ = sp.issparse(y)
        self.classes_ = unique_labels(y)
        return self

    def fit_transform(self, y):
        """Fit label binarizer/transform multi-class labels to binary labels.

        The output of transform is sometimes referred to as
        the 1-of-K coding scheme.

        Parameters
        ----------
        y : {ndarray, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification. Sparse matrix can be
            CSR, CSC, COO, DOK, or LIL.

        Returns
        -------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Shape will be (n_samples, 1) for binary problems. Sparse matrix
            will be of CSR format.
        """
        fitted = self.fit(y)
        return fitted.transform(y)

    def transform(self, y):
        """Transform multi-class labels to binary labels.

        The output of transform is sometimes referred to by some authors as
        the 1-of-K coding scheme.

        Parameters
        ----------
        y : {array, sparse matrix} of shape (n_samples,) or \
                (n_samples, n_classes)
            Target values. The 2-d matrix should only contain 0 and 1,
            represents multilabel classification. Sparse matrix can be
            CSR, CSC, COO, DOK, or LIL.

        Returns
        -------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Shape will be (n_samples, 1) for binary problems. Sparse matrix
            will be of CSR format.

        Raises
        ------
        ValueError
            If `y` is multilabel but the binarizer was fitted on
            non-multilabel targets.
        """
        check_is_fitted(self)

        input_is_multilabel = type_of_target(y).startswith("multilabel")
        fitted_on_multilabel = self.y_type_.startswith("multilabel")
        if input_is_multilabel and not fitted_on_multilabel:
            raise ValueError("The object was not fitted with multilabel input.")

        return label_binarize(
            y,
            classes=self.classes_,
            pos_label=self.pos_label,
            neg_label=self.neg_label,
            sparse_output=self.sparse_output,
        )

    def inverse_transform(self, Y, threshold=None):
        """Transform binary labels back to multi-class labels.

        Parameters
        ----------
        Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            Target values. All sparse matrices are converted to CSR before
            inverse transformation.

        threshold : float, default=None
            Threshold used in the binary and multi-label cases.

            Use 0 when ``Y`` contains the output of :term:`decision_function`
            (classifier).
            Use 0.5 when ``Y`` contains the output of :term:`predict_proba`.

            If None, the threshold is assumed to be half way between
            neg_label and pos_label.

        Returns
        -------
        y : {ndarray, sparse matrix} of shape (n_samples,)
            Target values. Sparse matrix will be of CSR format.

        Notes
        -----
        In the case when the binary labels are fractional
        (probabilistic), :meth:`inverse_transform` chooses the class with the
        greatest value. Typically, this allows to use the output of a
        linear model's :term:`decision_function` method directly as the input
        of :meth:`inverse_transform`.
        """
        check_is_fitted(self)

        if threshold is None:
            threshold = (self.pos_label + self.neg_label) / 2.0

        # Multiclass targets are decoded by arg-max; every other target type
        # is decoded by thresholding.
        if self.y_type_ == "multiclass":
            decoded = _inverse_binarize_multiclass(Y, self.classes_)
        else:
            decoded = _inverse_binarize_thresholding(
                Y, self.y_type_, self.classes_, threshold
            )

        # Return the same kind of container (sparse or dense) that was seen
        # during fit.
        if self.sparse_input_:
            return sp.csr_matrix(decoded)
        if sp.issparse(decoded):
            return decoded.toarray()
        return decoded

    def _more_tags(self):
        return {"X_types": ["1dlabels"]}
@validate_params(
    {
        "y": ["array-like"],
        "classes": ["array-like"],
        "neg_label": [Interval(Integral, None, None, closed="neither")],
        "pos_label": [Interval(Integral, None, None, closed="neither")],
        "sparse_output": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def label_binarize(y, *, classes, neg_label=0, pos_label=1, sparse_output=False):
    """Binarize labels in a one-vs-all fashion.

    Several regression and binary classification algorithms are
    available in scikit-learn. A simple way to extend these algorithms
    to the multi-class classification case is to use the so-called
    one-vs-all scheme.

    This function makes it possible to compute this transformation for a
    fixed set of class labels known ahead of time.

    Parameters
    ----------
    y : array-like
        Sequence of integer labels or multilabel data to encode.

    classes : array-like of shape (n_classes,)
        Uniquely holds the label for each class.

    neg_label : int, default=0
        Value with which negative labels must be encoded.

    pos_label : int, default=1
        Value with which positive labels must be encoded.

    sparse_output : bool, default=False
        Set to true if output binary array is desired in CSR sparse format.

    Returns
    -------
    Y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
        Shape will be (n_samples, 1) for binary problems. Sparse matrix will
        be of CSR format.

    See Also
    --------
    LabelBinarizer : Class used to wrap the functionality of label_binarize and
        allow for fitting to classes independently of the transform operation.

    Examples
    --------
    >>> from sklearn.preprocessing import label_binarize
    >>> label_binarize([1, 6], classes=[1, 2, 4, 6])
    array([[1, 0, 0, 0],
           [0, 0, 0, 1]])

    The class ordering is preserved:

    >>> label_binarize([1, 6], classes=[1, 6, 4, 2])
    array([[1, 0, 0, 0],
           [0, 1, 0, 0]])

    Binary targets transform to a column vector

    >>> label_binarize(['yes', 'no', 'no', 'yes'], classes=['no', 'yes'])
    array([[1],
           [0],
           [0],
           [1]])
    """
    if isinstance(y, list):
        if _num_samples(y) == 0:
            raise ValueError("y has 0 samples: %r" % y)
    else:
        # XXX Workaround that will be removed when list of list format is
        # dropped
        y = check_array(
            y, input_name="y", accept_sparse="csr", ensure_2d=False, dtype=None
        )

    if neg_label >= pos_label:
        raise ValueError(
            "neg_label={0} must be strictly less than pos_label={1}.".format(
                neg_label, pos_label
            )
        )

    sparse_compatible = pos_label != 0 and neg_label == 0
    if sparse_output and not sparse_compatible:
        raise ValueError(
            "Sparse binarization is only supported with non "
            "zero pos_label and zero neg_label, got "
            "pos_label={0} and neg_label={1}"
            "".format(pos_label, neg_label)
        )

    # A positive label of 0 would be indistinguishable from the implicit
    # zeros of the sparse intermediate. Use -neg_label as a stand-in and
    # swap it back to 0 once the dense array has been built.
    restore_zero_pos = pos_label == 0
    if restore_zero_pos:
        pos_label = -neg_label

    target_kind = type_of_target(y)
    if "multioutput" in target_kind:
        raise ValueError(
            "Multioutput target data is not supported with label binarization"
        )
    if target_kind == "unknown":
        raise ValueError("The type of target data is not known")

    n_samples = y.shape[0] if sp.issparse(y) else len(y)
    n_classes = len(classes)
    classes = np.asarray(classes)

    if target_kind == "binary":
        if n_classes == 1:
            # Only one known class: every sample is encoded as negative.
            if sparse_output:
                return sp.csr_matrix((n_samples, 1), dtype=int)
            all_negative = np.zeros((len(y), 1), dtype=int)
            all_negative += neg_label
            return all_negative
        if len(classes) >= 3:
            # y shows at most two labels, but `classes` declares more.
            target_kind = "multiclass"

    ordered_classes = np.sort(classes)

    if target_kind == "multilabel-indicator":
        n_label_columns = y.shape[1] if hasattr(y, "shape") else len(y[0])
        if classes.size != n_label_columns:
            raise ValueError(
                "classes {0} mismatch with the labels {1} found in the data".format(
                    classes, unique_labels(y)
                )
            )

        encoded = sp.csr_matrix(y)
        if pos_label != 1:
            replacement = np.empty_like(encoded.data)
            replacement.fill(pos_label)
            encoded.data = replacement
    elif target_kind in ("binary", "multiclass"):
        y = column_or_1d(y)

        # Labels not listed in `classes` produce an empty row.
        is_known = np.isin(y, classes)
        known_labels = y[is_known]
        col_indices = np.searchsorted(ordered_classes, known_labels)
        row_pointers = np.hstack((0, np.cumsum(is_known)))

        values = np.empty_like(col_indices)
        values.fill(pos_label)
        encoded = sp.csr_matrix(
            (values, col_indices, row_pointers), shape=(n_samples, n_classes)
        )
    else:
        raise ValueError(
            "%s target data is not supported with label binarization" % target_kind
        )

    if sparse_output:
        encoded.data = encoded.data.astype(int, copy=False)
    else:
        encoded = encoded.toarray()
        encoded = encoded.astype(int, copy=False)

        if neg_label != 0:
            encoded[encoded == 0] = neg_label

        if restore_zero_pos:
            encoded[encoded == pos_label] = 0

    # Columns were built in sorted-class order; put them back in the order
    # the caller gave in `classes`.
    if np.any(classes != ordered_classes):
        column_order = np.searchsorted(ordered_classes, classes)
        encoded = encoded[:, column_order]

    if target_kind == "binary":
        # Binary problems are reported as a single column (the last class).
        if sparse_output:
            encoded = encoded.getcol(-1)
        else:
            encoded = encoded[:, -1].reshape((-1, 1))

    return encoded
def _inverse_binarize_multiclass(y, classes):
    """Inverse label binarization transformation for multiclass.

    Multiclass uses the maximal score instead of a threshold.

    Parameters
    ----------
    y : {ndarray, sparse matrix} of shape (n_samples, n_classes)
        Per-class scores; sparse input is converted to CSR.

    classes : array-like of shape (n_classes,)
        Label associated with each column of `y`.

    Returns
    -------
    labels : ndarray of shape (n_samples,)
        For each row, the entry of `classes` at the column with the
        greatest score.
    """
    classes = np.asarray(classes)

    if sp.issparse(y):
        # Find the argmax for each row in y where y is a CSR matrix

        y = y.tocsr()
        n_samples, n_outputs = y.shape
        outputs = np.arange(n_outputs)
        row_max = min_max_axis(y, 1)[1]
        row_nnz = np.diff(y.indptr)

        y_data_repeated_max = np.repeat(row_max, row_nnz)
        # picks out all indices obtaining the maximum per row
        y_i_all_argmax = np.flatnonzero(y_data_repeated_max == y.data)

        # For corner case where last row has a max of 0
        if row_max[-1] == 0:
            y_i_all_argmax = np.append(y_i_all_argmax, [len(y.data)])

        # Gets the index of the first argmax in each row from y_i_all_argmax
        index_first_argmax = np.searchsorted(y_i_all_argmax, y.indptr[:-1])
        # first argmax of each row
        y_ind_ext = np.append(y.indices, [0])
        y_i_argmax = y_ind_ext[y_i_all_argmax[index_first_argmax]]
        # Handle rows of all 0
        y_i_argmax[np.where(row_nnz == 0)[0]] = 0

        # Handles rows with max of 0 that contain negative numbers: the
        # maximum sits on a zero entry, which may be implicit (not stored).
        samples = np.arange(n_samples)[(row_nnz > 0) & (row_max.ravel() == 0)]
        for i in samples:
            start, stop = y.indptr[i], y.indptr[i + 1]
            stored_cols = y.indices[start:stop]
            nonzero_cols = stored_cols[y.data[start:stop] != 0]
            # Columns holding a zero, implicit or explicit, in sorted order.
            zero_cols = np.setdiff1d(outputs, nonzero_cols)
            # y_i_argmax holds column indices and is mapped through `classes`
            # only once below, so store the column index here, not the label.
            y_i_argmax[i] = zero_cols[0]

        return classes[y_i_argmax]
    else:
        return classes.take(y.argmax(axis=1), mode="clip")
def _inverse_binarize_thresholding(y, output_type, classes, threshold):
    """Inverse label binarization transformation using thresholding.

    Parameters
    ----------
    y : {ndarray, sparse matrix}
        Scores to threshold. The input is never modified.

    output_type : str
        Target type; ``"binary"`` and ``"multilabel-indicator"`` are
        supported.

    classes : array-like of shape (n_classes,)
        Labels used to decode binary targets.

    threshold : float
        Entries strictly greater than this value are treated as positive.

    Returns
    -------
    y_inv : {ndarray, sparse matrix}
        For ``"binary"``, the decoded labels. For
        ``"multilabel-indicator"``, the thresholded 0/1 indicator matrix.

    Raises
    ------
    ValueError
        If the shape of `y` does not match `output_type`/`classes`, or if
        `output_type` is not supported.
    """
    if output_type == "binary" and y.ndim == 2 and y.shape[1] > 2:
        raise ValueError("output_type='binary', but y.shape = {0}".format(y.shape))

    if output_type != "binary" and y.shape[1] != len(classes):
        raise ValueError(
            "The number of class is not equal to the number of dimension of y."
        )

    classes = np.asarray(classes)

    # Perform thresholding
    if sp.issparse(y):
        if threshold > 0:
            # With a positive threshold the implicit zeros stay zero, so the
            # result can remain sparse. Work on a copy: overwriting ``data``
            # and eliminating zeros in place would destroy the caller's
            # matrix.
            if y.format in ("csr", "csc"):
                y = y.copy()
            else:
                y = y.tocsr()
            y.data = np.array(y.data > threshold, dtype=int)
            y.eliminate_zeros()
        else:
            # Implicit zeros may exceed a non-positive threshold, so the
            # comparison has to be done on the dense array.
            y = np.array(y.toarray() > threshold, dtype=int)
    else:
        y = np.array(y > threshold, dtype=int)

    # Inverse transform data
    if output_type == "binary":
        if sp.issparse(y):
            y = y.toarray()
        if y.ndim == 2 and y.shape[1] == 2:
            # Two columns: the second one is the positive-class indicator.
            return classes[y[:, 1]]
        if len(classes) == 1:
            return np.repeat(classes[0], len(y))
        return classes[y.ravel()]

    if output_type == "multilabel-indicator":
        return y

    raise ValueError("{0} format is not supported".format(output_type))
class MultiLabelBinarizer(TransformerMixin, BaseEstimator, auto_wrap_output_keys=None):
    """Transform between iterable of iterables and a multilabel format.

    Although a list of sets or tuples is a very intuitive format for multilabel
    data, it is unwieldy to process. This transformer converts between this
    intuitive format and the supported multilabel format: a (samples x classes)
    binary matrix indicating the presence of a class label.

    Parameters
    ----------
    classes : array-like of shape (n_classes,), default=None
        Indicates an ordering for the class labels.
        All entries should be unique (cannot contain duplicate classes).

    sparse_output : bool, default=False
        Set to True if output binary array is desired in CSR sparse format.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes,)
        A copy of the `classes` parameter when provided.
        Otherwise it corresponds to the sorted set of classes found
        when fitting.

    See Also
    --------
    OneHotEncoder : Encode categorical features using a one-hot aka one-of-K
        scheme.

    Examples
    --------
    >>> from sklearn.preprocessing import MultiLabelBinarizer
    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit_transform([(1, 2), (3,)])
    array([[1, 1, 0],
           [0, 0, 1]])
    >>> mlb.classes_
    array([1, 2, 3])

    >>> mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}])
    array([[0, 1, 1],
           [1, 0, 0]])
    >>> list(mlb.classes_)
    ['comedy', 'sci-fi', 'thriller']

    A common mistake is to pass in a list, which leads to the following issue:

    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit(['sci-fi', 'thriller', 'comedy'])
    MultiLabelBinarizer()
    >>> mlb.classes_
    array(['-', 'c', 'd', 'e', 'f', 'h', 'i', 'l', 'm', 'o', 'r', 's', 't',
        'y'], dtype=object)

    To correct this, the list of labels should be passed in as:

    >>> mlb = MultiLabelBinarizer()
    >>> mlb.fit([['sci-fi', 'thriller', 'comedy']])
    MultiLabelBinarizer()
    >>> mlb.classes_
    array(['comedy', 'sci-fi', 'thriller'], dtype=object)
    """

    _parameter_constraints: dict = {
        "classes": ["array-like", None],
        "sparse_output": ["boolean"],
    }

    def __init__(self, *, classes=None, sparse_output=False):
        self.classes = classes
        self.sparse_output = sparse_output

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, y):
        """Fit the label sets binarizer, storing :term:`classes_`.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. If the `classes` parameter is set, `y` will not be
            iterated.

        Returns
        -------
        self : object
            Fitted estimator.

        Raises
        ------
        ValueError
            If the `classes` parameter contains duplicates.
        """
        # Drop any label -> column cache built for a previous fit.
        self._cached_dict = None

        if self.classes is None:
            classes = sorted(set(itertools.chain.from_iterable(y)))
        elif len(set(self.classes)) < len(self.classes):
            raise ValueError(
                "The classes argument contains duplicate "
                "classes. Remove these duplicates before passing "
                "them to MultiLabelBinarizer."
            )
        else:
            classes = self.classes

        # Allocate first and fill by slice assignment so that each label
        # ends up as one element of a 1-D array.
        dtype = int if all(isinstance(c, int) for c in classes) else object
        self.classes_ = np.empty(len(classes), dtype=dtype)
        self.classes_[:] = classes
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, y):
        """Fit the label sets binarizer and transform the given label sets.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. If the `classes` parameter is set, `y` will not be
            iterated.

        Returns
        -------
        y_indicator : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]`
            is in `y[i]`, and 0 otherwise. Sparse matrix will be of CSR
            format.
        """
        if self.classes is not None:
            return self.fit(y).transform(y)

        self._cached_dict = None

        # Automatically increment on new class: an unseen label is assigned
        # the next free column index (the current size of the mapping).
        class_mapping = defaultdict(int)
        class_mapping.default_factory = class_mapping.__len__
        yt = self._transform(y, class_mapping)

        # sort classes and reorder columns
        seen_order = sorted(class_mapping, key=class_mapping.get)

        # (make safe for tuples)
        dtype = int if all(isinstance(c, int) for c in seen_order) else object
        seen_classes = np.empty(len(seen_order), dtype=dtype)
        seen_classes[:] = seen_order
        self.classes_, inverse = np.unique(seen_classes, return_inverse=True)
        # Ensure yt.indices keeps its current dtype. ``np.asarray`` copies
        # only when the cast requires it, whereas ``np.array(...,
        # copy=False)`` raises under NumPy >= 2 whenever a copy is needed.
        yt.indices = np.asarray(inverse[yt.indices], dtype=yt.indices.dtype)

        if not self.sparse_output:
            yt = yt.toarray()

        return yt

    def transform(self, y):
        """Transform the given label sets.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. If the `classes` parameter is set, `y` will not be
            iterated.

        Returns
        -------
        y_indicator : array or CSR matrix, shape (n_samples, n_classes)
            A matrix such that `y_indicator[i, j] = 1` iff `classes_[j]` is in
            `y[i]`, and 0 otherwise.
        """
        check_is_fitted(self)

        class_to_index = self._build_cache()
        yt = self._transform(y, class_to_index)

        if not self.sparse_output:
            yt = yt.toarray()

        return yt

    def _build_cache(self):
        """Return the label -> column index dict, building it on first use."""
        if self._cached_dict is None:
            self._cached_dict = dict(zip(self.classes_, range(len(self.classes_))))

        return self._cached_dict

    def _transform(self, y, class_mapping):
        """Transforms the label sets with a given mapping.

        Labels for which `class_mapping` raises ``KeyError`` are skipped and
        reported together in a single warning.

        Parameters
        ----------
        y : iterable of iterables
            A set of labels (any orderable and hashable object) for each
            sample. If the `classes` parameter is set, `y` will not be
            iterated.

        class_mapping : Mapping
            Maps from label to column index in label indicator matrix.

        Returns
        -------
        y_indicator : sparse matrix of shape (n_samples, n_classes)
            Label indicator matrix. Will be of CSR format.
        """
        indices = array.array("i")
        indptr = array.array("i", [0])
        unknown = set()
        for labels in y:
            # A set, so a label repeated within one sample is stored once.
            index = set()
            for label in labels:
                try:
                    index.add(class_mapping[label])
                except KeyError:
                    unknown.add(label)
            indices.extend(index)
            indptr.append(len(indices))
        if unknown:
            warnings.warn(
                "unknown class(es) {0} will be ignored".format(sorted(unknown, key=str))
            )
        data = np.ones(len(indices), dtype=int)

        return sp.csr_matrix(
            (data, indices, indptr), shape=(len(indptr) - 1, len(class_mapping))
        )

    def inverse_transform(self, yt):
        """Transform the given indicator matrix into label sets.

        Parameters
        ----------
        yt : {ndarray, sparse matrix} of shape (n_samples, n_classes)
            A matrix containing only 1s and 0s.

        Returns
        -------
        y : list of tuples
            The set of labels for each sample such that `y[i]` consists of
            `classes_[j]` for each `yt[i, j] == 1`.

        Raises
        ------
        ValueError
            If `yt` does not have one column per fitted class, or if it
            contains values other than 0 and 1.
        """
        check_is_fitted(self)

        if yt.shape[1] != len(self.classes_):
            raise ValueError(
                "Expected indicator for {0} classes, but got {1}".format(
                    len(self.classes_), yt.shape[1]
                )
            )

        if sp.issparse(yt):
            yt = yt.tocsr()
            if len(yt.data) != 0 and len(np.setdiff1d(yt.data, [0, 1])) > 0:
                raise ValueError("Expected only 0s and 1s in label indicator.")
            return [
                tuple(self.classes_.take(yt.indices[start:end]))
                for start, end in zip(yt.indptr[:-1], yt.indptr[1:])
            ]
        else:
            unexpected = np.setdiff1d(yt, [0, 1])
            if len(unexpected) > 0:
                raise ValueError(
                    "Expected only 0s and 1s in label indicator. Also got {0}".format(
                        unexpected
                    )
                )
            return [tuple(self.classes_.compress(indicators)) for indicators in yt]

    def _more_tags(self):
        return {"X_types": ["2dlabels"]}
|