| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546 |
- # Author: Arnaud Joly, Joel Nothman, Hamzeh Alsalhi
- #
- # License: BSD 3 clause
- """
- Multi-class / multi-label utility function
- ==========================================
- """
- import warnings
- from collections.abc import Sequence
- from itertools import chain
- import numpy as np
- from scipy.sparse import issparse
- from ..utils._array_api import get_namespace
- from ..utils.fixes import VisibleDeprecationWarning
- from .validation import _assert_all_finite, check_array
def _unique_multiclass(y):
    """Return the distinct labels found in a binary or multiclass target.

    Inputs that expose ``__array__`` or that ``get_namespace`` reports as
    array API compliant are deduplicated with ``xp.unique_values``; any other
    iterable is collapsed into a plain ``set``.
    """
    xp, is_array_api_compliant = get_namespace(y)
    treat_as_array = hasattr(y, "__array__") or is_array_api_compliant
    if not treat_as_array:
        return set(y)
    as_array = xp.asarray(y)
    return xp.unique_values(as_array)
def _unique_indicator(y):
    """Return ``0 .. n_columns - 1`` as the labels of an indicator matrix.

    ``y`` is validated with ``check_array`` (CSR, CSC and COO sparse formats
    are accepted) and the number of columns of the validated result
    determines how many labels are returned.
    """
    validated = check_array(y, input_name="y", accept_sparse=["csr", "csc", "coo"])
    n_labels = validated.shape[1]
    return np.arange(n_labels)
# Dispatch table used by ``unique_labels``: maps a target type name to the
# helper that extracts the unique labels for that kind of target.
_FN_UNIQUE_LABELS = {
    **dict.fromkeys(("binary", "multiclass"), _unique_multiclass),
    "multilabel-indicator": _unique_indicator,
}
def unique_labels(*ys):
    """Extract an ordered array of unique labels.

    The following combinations are rejected:

        - a mix of multilabel and multiclass (single label) targets
        - a mix of a label indicator matrix and anything else
          (an indicator matrix carries no explicit labels)
        - label indicator matrices with different numbers of columns
        - a mix of string and integer labels

    "multiclass-multioutput" inputs are not supported at the moment either.

    Parameters
    ----------
    *ys : array-likes
        Label values.

    Returns
    -------
    out : ndarray of shape (n_unique_labels,)
        An ordered array of unique labels.

    Raises
    ------
    ValueError
        If no argument is given, if the inputs mix target types, if
        indicator matrices disagree on their number of columns, if the target
        type has no registered label extractor, or if string and non-string
        labels are mixed.

    Examples
    --------
    >>> from sklearn.utils.multiclass import unique_labels
    >>> unique_labels([3, 5, 5, 5, 7, 7])
    array([3, 5, 7])
    >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4])
    array([1, 2, 3, 4])
    >>> unique_labels([1, 2, 10], [5, 11])
    array([ 1,  2,  5, 10, 11])
    """
    xp, is_array_api_compliant = get_namespace(*ys)
    if len(ys) == 0:
        raise ValueError("No argument has been passed.")

    # All inputs must agree on a single target type; binary is treated as a
    # special case of multiclass when both are present.
    ys_types = {type_of_target(x) for x in ys}
    if ys_types == {"binary", "multiclass"}:
        ys_types = {"multiclass"}

    if len(ys_types) > 1:
        raise ValueError("Mix type of y not allowed, got types %s" % ys_types)

    (label_type,) = ys_types

    # Indicator matrices must all have the same number of columns.
    if label_type == "multilabel-indicator":
        column_counts = {
            check_array(y, accept_sparse=["csr", "csc", "coo"]).shape[1] for y in ys
        }
        if len(column_counts) > 1:
            raise ValueError(
                "Multi-label binary indicator input with different numbers of labels"
            )

    # Look up the extractor matching the target type.
    extract_labels = _FN_UNIQUE_LABELS.get(label_type, None)
    if not extract_labels:
        raise ValueError("Unknown label type: %s" % repr(ys))

    if is_array_api_compliant:
        # array_api does not allow for mixed dtypes
        per_input = [extract_labels(y) for y in ys]
        return xp.unique_values(xp.concat(per_input))

    ys_labels = set()
    for y in ys:
        ys_labels.update(extract_labels(y))

    # String labels and non-string labels cannot be sorted together.
    is_str_flags = {isinstance(label, str) for label in ys_labels}
    if len(is_str_flags) > 1:
        raise ValueError("Mix of label input types (string and number)")

    return xp.asarray(sorted(ys_labels))
- def _is_integral_float(y):
- return y.dtype.kind == "f" and np.all(y.astype(int) == y)
def is_multilabel(y):
    """Check if ``y`` is in a multilabel format.

    Parameters
    ----------
    y : ndarray of shape (n_samples,)
        Target values.

    Returns
    -------
    out : bool
        Return ``True``, if ``y`` is in a multilabel format, else ``False``.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1])
    False
    >>> is_multilabel([[1], [0, 2], []])
    False
    >>> is_multilabel(np.array([[1, 0], [0, 0]]))
    True
    >>> is_multilabel(np.array([[1], [0], [0]]))
    False
    >>> is_multilabel(np.array([[1, 0, 0]]))
    True
    """
    xp, is_array_api_compliant = get_namespace(y)
    if hasattr(y, "__array__") or isinstance(y, Sequence) or is_array_api_compliant:
        # DeprecationWarning will be replaced by ValueError, see NEP 34
        # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
        check_y_kwargs = {
            "accept_sparse": True,
            "allow_nd": True,
            "force_all_finite": False,
            "ensure_2d": False,
            "ensure_min_samples": 0,
            "ensure_min_features": 0,
        }
        with warnings.catch_warnings():
            # Turn the ragged-array warning into an exception so that both
            # the warning and the ValueError paths are handled in one place.
            warnings.simplefilter("error", VisibleDeprecationWarning)
            try:
                y = check_array(y, dtype=None, **check_y_kwargs)
            except (VisibleDeprecationWarning, ValueError) as e:
                if str(e).startswith("Complex data not supported"):
                    raise

                # dtype=object should be provided explicitly for ragged arrays,
                # see NEP 34
                y = check_array(y, dtype=object, **check_y_kwargs)

    # Only 2-D inputs with at least two columns can be indicator matrices.
    has_indicator_shape = hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1
    if not has_indicator_shape:
        return False

    if issparse(y):
        if y.format in ("dok", "lil"):
            y = y.tocsr()
        labels = xp.unique_values(y.data)
        if len(y.data) == 0:
            # No stored entries at all.
            return True
        n_stored_labels = labels.size
        # Either a single stored value, or two stored values one of which is 0.
        at_most_binary = n_stored_labels == 1 or (
            n_stored_labels == 2 and 0 in labels
        )
        return at_most_binary and (
            y.dtype.kind in "biu" or _is_integral_float(labels)  # bool, int, uint
        )

    labels = xp.unique_values(y)
    return len(labels) < 3 and (
        y.dtype.kind in "biu" or _is_integral_float(labels)  # bool, int, uint
    )
def check_classification_targets(y):
    """Ensure that target y is of a non-regression type.

    The target type reported by ``type_of_target`` must be one of:
    'binary', 'multiclass', 'multiclass-multioutput',
    'multilabel-indicator', 'multilabel-sequences'.

    Parameters
    ----------
    y : array-like
        Target values.

    Raises
    ------
    ValueError
        If the detected target type is not in the list above.
    """
    accepted_types = (
        "binary",
        "multiclass",
        "multiclass-multioutput",
        "multilabel-indicator",
        "multilabel-sequences",
    )
    y_type = type_of_target(y, input_name="y")
    if y_type in accepted_types:
        return
    raise ValueError(
        f"Unknown label type: {y_type}. Maybe you are trying to fit a "
        "classifier, which expects discrete classes on a "
        "regression target with continuous values."
    )
def type_of_target(y, input_name=""):
    """Determine the type of data indicated by the target.

    The returned type is the most specific one that can be inferred.
    For example:

        * ``binary`` is more specific but compatible with ``multiclass``.
        * ``multiclass`` of integers is more specific but compatible with
          ``continuous``.
        * ``multilabel-indicator`` is more specific but compatible with
          ``multiclass-multioutput``.

    Parameters
    ----------
    y : {array-like, sparse matrix}
        Target values. If a sparse matrix, `y` is expected to be a
        CSR/CSC matrix.

    input_name : str, default=""
        The data name used to construct the error message.

        .. versionadded:: 1.1.0

    Returns
    -------
    target_type : str
        One of:

        * 'continuous': `y` is an array-like of floats that are not all
          integers, and is 1d or a column vector.
        * 'continuous-multioutput': `y` is a 2d array of floats that are
          not all integers, and both dimensions are of size > 1.
        * 'binary': `y` contains <= 2 discrete values and is 1d or a column
          vector.
        * 'multiclass': `y` contains more than two discrete values, is not a
          sequence of sequences, and is 1d or a column vector.
        * 'multiclass-multioutput': `y` is a 2d array that contains more
          than two discrete values, is not a sequence of sequences, and both
          dimensions are of size > 1.
        * 'multilabel-indicator': `y` is a label indicator matrix, an array
          of two dimensions with at least two columns, and at most 2 unique
          values.
        * 'unknown': `y` is array-like but none of the above, such as a 3d
          array, sequence of sequences, or an array of non-sequence objects.

    Raises
    ------
    ValueError
        If `y` is not array-like, is a pandas ``SparseSeries`` /
        ``SparseArray``, or uses the legacy sequence-of-sequences
        multilabel representation.

    Examples
    --------
    >>> from sklearn.utils.multiclass import type_of_target
    >>> import numpy as np
    >>> type_of_target([0.1, 0.6])
    'continuous'
    >>> type_of_target([1, -1, -1, 1])
    'binary'
    >>> type_of_target(['a', 'b', 'a'])
    'binary'
    >>> type_of_target([1.0, 2.0])
    'binary'
    >>> type_of_target([1, 0, 2])
    'multiclass'
    >>> type_of_target([1.0, 0.0, 3.0])
    'multiclass'
    >>> type_of_target(['a', 'b', 'c'])
    'multiclass'
    >>> type_of_target(np.array([[1, 2], [3, 1]]))
    'multiclass-multioutput'
    >>> type_of_target([[1, 2]])
    'multilabel-indicator'
    >>> type_of_target(np.array([[1.5, 2.0], [3.0, 1.6]]))
    'continuous-multioutput'
    >>> type_of_target(np.array([[0, 1], [1, 1]]))
    'multilabel-indicator'
    """
    xp, is_array_api_compliant = get_namespace(y)

    # Accept non-string sequences, sparse matrices, anything exposing
    # ``__array__``, and array API compliant inputs.
    container_like = (
        isinstance(y, Sequence) or issparse(y) or hasattr(y, "__array__")
    )
    valid = (container_like and not isinstance(y, str)) or is_array_api_compliant
    if not valid:
        raise ValueError(
            "Expected array-like (array or non-string sequence), got %r" % y
        )

    if y.__class__.__name__ in ("SparseSeries", "SparseArray"):
        raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")

    if is_multilabel(y):
        return "multilabel-indicator"

    # DeprecationWarning will be replaced by ValueError, see NEP 34
    # https://numpy.org/neps/nep-0034-infer-dtype-is-object.html
    # We therefore catch both deprecation (NumPy < 1.24) warning and
    # value error (NumPy >= 1.24).
    check_y_kwargs = {
        "accept_sparse": True,
        "allow_nd": True,
        "force_all_finite": False,
        "ensure_2d": False,
        "ensure_min_samples": 0,
        "ensure_min_features": 0,
    }

    with warnings.catch_warnings():
        warnings.simplefilter("error", VisibleDeprecationWarning)
        if not issparse(y):
            try:
                y = check_array(y, dtype=None, **check_y_kwargs)
            except (VisibleDeprecationWarning, ValueError) as e:
                if str(e).startswith("Complex data not supported"):
                    raise

                # dtype=object should be provided explicitly for ragged arrays,
                # see NEP 34
                y = check_array(y, dtype=object, **check_y_kwargs)

    # The old sequence of sequences format
    try:
        head = y[0]
        if (
            not hasattr(head, "__array__")
            and isinstance(head, Sequence)
            and not isinstance(head, str)
        ):
            raise ValueError(
                "You appear to be using a legacy multi-label data"
                " representation. Sequence of sequences are no"
                " longer supported; use a binary array or sparse"
                " matrix instead - the MultiLabelBinarizer"
                " transformer can convert to this format."
            )
    except IndexError:
        # Empty input: there is no first element to inspect.
        pass

    # Invalid inputs
    if y.ndim not in (1, 2):
        # Number of dimension greater than 2: [[[1, 2]]]
        return "unknown"
    if min(y.shape) == 0:
        # Empty ndarray: [] is reported as binary, [[]] as unknown.
        return "binary" if y.ndim == 1 else "unknown"
    if not issparse(y) and y.dtype == object and not isinstance(y.flat[0], str):
        # [obj_1] and not ["label_1"]
        return "unknown"

    # Check if multioutput:
    # [[1, 2], [1, 2]] gets the suffix, [1, 2, 3] or [[1], [2], [3]] does not.
    suffix = "-multioutput" if (y.ndim == 2 and y.shape[1] > 1) else ""

    # Check float and contains non-integer float values
    if xp.isdtype(y.dtype, "real floating"):
        # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
        data = y.data if issparse(y) else y
        if xp.any(data != xp.astype(data, int)):
            _assert_all_finite(data, input_name=input_name)
            return "continuous" + suffix

    # Check multiclass
    if issparse(y):
        first_row = y.getrow(0).data
    else:
        first_row = y[0]
    n_distinct = xp.unique_values(y).shape[0]
    if n_distinct > 2 or (y.ndim == 2 and len(first_row) > 1):
        # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
        return "multiclass" + suffix
    return "binary"  # [1, 2] or [["a"], ["b"]]
- def _check_partial_fit_first_call(clf, classes=None):
- """Private helper function for factorizing common classes param logic.
- Estimators that implement the ``partial_fit`` API need to be provided with
- the list of possible classes at the first call to partial_fit.
- Subsequent calls to partial_fit should check that ``classes`` is still
- consistent with a previous value of ``clf.classes_`` when provided.
- This function returns True if it detects that this was the first call to
- ``partial_fit`` on ``clf``. In that case the ``classes_`` attribute is also
- set on ``clf``.
- """
- if getattr(clf, "classes_", None) is None and classes is None:
- raise ValueError("classes must be passed on the first call to partial_fit.")
- elif classes is not None:
- if getattr(clf, "classes_", None) is not None:
- if not np.array_equal(clf.classes_, unique_labels(classes)):
- raise ValueError(
- "`classes=%r` is not the same as on last call "
- "to partial_fit, was: %r" % (classes, clf.classes_)
- )
- else:
- # This is the first call to partial_fit
- clf.classes_ = unique_labels(classes)
- return True
- # classes is None and clf.classes_ has already previously been set:
- # nothing to do
- return False
def class_distribution(y, sample_weight=None):
    """Compute class priors from multioutput-multiclass target data.

    Parameters
    ----------
    y : {array-like, sparse matrix} of size (n_samples, n_outputs)
        The labels for each example. Dense inputs are converted with
        ``np.asarray``, so nested lists and other array-likes are accepted
        in addition to ndarrays.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    classes : list of size n_outputs of ndarray of size (n_classes,)
        List of classes for each column.

    n_classes : list of int of size n_outputs
        Number of classes in each column.

    class_prior : list of size n_outputs of ndarray of size (n_classes,)
        Class distribution of each column.
    """
    if not issparse(y):
        # The dense branch relies on ``y.shape`` and ``y[:, k]``; coerce
        # array-likes (e.g. nested lists) so they work like ndarrays.
        y = np.asarray(y)

    classes = []
    n_classes = []
    class_prior = []

    n_samples, n_outputs = y.shape
    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight)

    if issparse(y):
        # CSC gives direct access to the stored entries of each column.
        y = y.tocsc()
        y_nnz = np.diff(y.indptr)

        for k in range(n_outputs):
            col_start, col_stop = y.indptr[k], y.indptr[k + 1]
            col_nonzero = y.indices[col_start:col_stop]
            # separate sample weights for zero and non-zero elements
            if sample_weight is not None:
                nz_samp_weight = sample_weight[col_nonzero]
                zeros_samp_weight_sum = np.sum(sample_weight) - np.sum(nz_samp_weight)
            else:
                nz_samp_weight = None
                zeros_samp_weight_sum = y.shape[0] - y_nnz[k]

            classes_k, y_k = np.unique(
                y.data[col_start:col_stop], return_inverse=True
            )
            class_prior_k = np.bincount(y_k, weights=nz_samp_weight)

            # An explicit zero was found, combine its weight with the weight
            # of the implicit zeros
            if 0 in classes_k:
                class_prior_k[classes_k == 0] += zeros_samp_weight_sum

            # If there is an implicit zero and it is not in classes and
            # class_prior, make an entry for it
            if 0 not in classes_k and y_nnz[k] < y.shape[0]:
                classes_k = np.insert(classes_k, 0, 0)
                class_prior_k = np.insert(class_prior_k, 0, zeros_samp_weight_sum)

            classes.append(classes_k)
            n_classes.append(classes_k.shape[0])
            class_prior.append(class_prior_k / class_prior_k.sum())
    else:
        for k in range(n_outputs):
            classes_k, y_k = np.unique(y[:, k], return_inverse=True)
            classes.append(classes_k)
            n_classes.append(classes_k.shape[0])
            class_prior_k = np.bincount(y_k, weights=sample_weight)
            class_prior.append(class_prior_k / class_prior_k.sum())

    return (classes, n_classes, class_prior)
- def _ovr_decision_function(predictions, confidences, n_classes):
- """Compute a continuous, tie-breaking OvR decision function from OvO.
- It is important to include a continuous value, not only votes,
- to make computing AUC or calibration meaningful.
- Parameters
- ----------
- predictions : array-like of shape (n_samples, n_classifiers)
- Predicted classes for each binary classifier.
- confidences : array-like of shape (n_samples, n_classifiers)
- Decision functions or predicted probabilities for positive class
- for each binary classifier.
- n_classes : int
- Number of classes. n_classifiers must be
- ``n_classes * (n_classes - 1 ) / 2``.
- """
- n_samples = predictions.shape[0]
- votes = np.zeros((n_samples, n_classes))
- sum_of_confidences = np.zeros((n_samples, n_classes))
- k = 0
- for i in range(n_classes):
- for j in range(i + 1, n_classes):
- sum_of_confidences[:, i] -= confidences[:, k]
- sum_of_confidences[:, j] += confidences[:, k]
- votes[predictions[:, k] == 0, i] += 1
- votes[predictions[:, k] == 1, j] += 1
- k += 1
- # Monotonically transform the sum_of_confidences to (-1/3, 1/3)
- # and add it with votes. The monotonic transformation is
- # f: x -> x / (3 * (|x| + 1)), it uses 1/3 instead of 1/2
- # to ensure that we won't reach the limits and change vote order.
- # The motivation is to use confidence levels as a way to break ties in
- # the votes without switching any decision made based on a difference
- # of 1 vote.
- transformed_confidences = sum_of_confidences / (
- 3 * (np.abs(sum_of_confidences) + 1)
- )
- return votes + transformed_confidences
|