- """Utilities for input validation"""
- # Authors: Olivier Grisel
- # Gael Varoquaux
- # Andreas Mueller
- # Lars Buitinck
- # Alexandre Gramfort
- # Nicolas Tresegnie
- # Sylvain Marie
- # License: BSD 3 clause
- import numbers
- import operator
- import warnings
- from contextlib import suppress
- from functools import reduce, wraps
- from inspect import Parameter, isclass, signature
- import joblib
- import numpy as np
- import scipy.sparse as sp
- from .. import get_config as _get_config
- from ..exceptions import DataConversionWarning, NotFittedError, PositiveSpectrumWarning
- from ..utils._array_api import _asarray_with_order, _is_numpy_namespace, get_namespace
- from ..utils.fixes import ComplexWarning
- from ._isfinite import FiniteStatus, cy_isfinite
- from .fixes import _object_dtype_isnan
- FLOAT_DTYPES = (np.float64, np.float32, np.float16)
- # This function is not used anymore at this moment in the code base, but we keep it in
- # case we merge a new public function without keyword-only arguments by mistake, which
- # would require a deprecation cycle to fix.
- def _deprecate_positional_args(func=None, *, version="1.3"):
- """Decorator for methods that issue warnings for positional arguments.
- Using the keyword-only argument syntax in PEP 3102, arguments after the
- * will issue a warning when passed as positional arguments.
- Parameters
- ----------
- func : callable, default=None
- Function to check arguments on.
- version : str, default="1.3"
- The version in which passing these arguments positionally will result in an error.
- """
- def _inner_deprecate_positional_args(f):
- sig = signature(f)
- kwonly_args = []
- all_args = []
- for name, param in sig.parameters.items():
- if param.kind == Parameter.POSITIONAL_OR_KEYWORD:
- all_args.append(name)
- elif param.kind == Parameter.KEYWORD_ONLY:
- kwonly_args.append(name)
- @wraps(f)
- def inner_f(*args, **kwargs):
- extra_args = len(args) - len(all_args)
- if extra_args <= 0:
- return f(*args, **kwargs)
- # extra_args > 0
- args_msg = [
- "{}={}".format(name, arg)
- for name, arg in zip(kwonly_args[:extra_args], args[-extra_args:])
- ]
- args_msg = ", ".join(args_msg)
- warnings.warn(
- (
- f"Pass {args_msg} as keyword args. From version "
- f"{version} passing these as positional arguments "
- "will result in an error"
- ),
- FutureWarning,
- )
- kwargs.update(zip(sig.parameters, args))
- return f(**kwargs)
- return inner_f
- if func is not None:
- return _inner_deprecate_positional_args(func)
- return _inner_deprecate_positional_args
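- # Illustrative usage sketch (not part of the original module): decorating a
- # hypothetical helper makes positional use of keyword-only arguments emit a
- # FutureWarning instead of failing outright during the deprecation cycle.
- def _example_deprecate_positional_args():  # hypothetical, for illustration only
-     @_deprecate_positional_args(version="1.3")
-     def clip(values, *, low=0, high=1):
-         return [min(max(v, low), high) for v in values]
-     # low/high passed positionally: warns, then forwards them as keywords.
-     return clip([0.5, 2.0], 0, 1)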
- def _assert_all_finite(
- X, allow_nan=False, msg_dtype=None, estimator_name=None, input_name=""
- ):
- """Like assert_all_finite, but only for ndarray."""
- xp, _ = get_namespace(X)
- if _get_config()["assume_finite"]:
- return
- X = xp.asarray(X)
- # for object dtype data, we only check for NaNs (GH-13254)
- if X.dtype == np.dtype("object") and not allow_nan:
- if _object_dtype_isnan(X).any():
- raise ValueError("Input contains NaN")
- # We need only consider float arrays, hence can early return for all else.
- if not xp.isdtype(X.dtype, ("real floating", "complex floating")):
- return
- # First try an O(n) time, O(1) space solution for the common case that
- # everything is finite; fall back to O(n) space `np.isinf/isnan` or custom
- # Cython implementation to prevent false positives and provide a detailed
- # error message.
- with np.errstate(over="ignore"):
- first_pass_isfinite = xp.isfinite(xp.sum(X))
- if first_pass_isfinite:
- return
- _assert_all_finite_element_wise(
- X,
- xp=xp,
- allow_nan=allow_nan,
- msg_dtype=msg_dtype,
- estimator_name=estimator_name,
- input_name=input_name,
- )
- def _assert_all_finite_element_wise(
- X, *, xp, allow_nan, msg_dtype=None, estimator_name=None, input_name=""
- ):
- # Cython implementation doesn't support FP16 or complex numbers
- use_cython = (
- xp is np and X.data.contiguous and X.dtype.type in {np.float32, np.float64}
- )
- if use_cython:
- out = cy_isfinite(X.reshape(-1), allow_nan=allow_nan)
- has_nan_error = False if allow_nan else out == FiniteStatus.has_nan
- has_inf = out == FiniteStatus.has_infinite
- else:
- has_inf = xp.any(xp.isinf(X))
- has_nan_error = False if allow_nan else xp.any(xp.isnan(X))
- if has_inf or has_nan_error:
- if has_nan_error:
- type_err = "NaN"
- else:
- msg_dtype = msg_dtype if msg_dtype is not None else X.dtype
- type_err = f"infinity or a value too large for {msg_dtype!r}"
- padded_input_name = input_name + " " if input_name else ""
- msg_err = f"Input {padded_input_name}contains {type_err}."
- if estimator_name and input_name == "X" and has_nan_error:
- # Improve the error message on how to handle missing values in
- # scikit-learn.
- msg_err += (
- f"\n{estimator_name} does not accept missing values"
- " encoded as NaN natively. For supervised learning, you might want"
- " to consider sklearn.ensemble.HistGradientBoostingClassifier and"
- " Regressor which accept missing values encoded as NaNs natively."
- " Alternatively, it is possible to preprocess the data, for"
- " instance by using an imputer transformer in a pipeline or drop"
- " samples with missing values. See"
- " https://scikit-learn.org/stable/modules/impute.html"
- " You can find a list of all estimators that handle NaN values"
- " at the following page:"
- " https://scikit-learn.org/stable/modules/impute.html"
- "#estimators-that-handle-nan-values"
- )
- raise ValueError(msg_err)
- def assert_all_finite(
- X,
- *,
- allow_nan=False,
- estimator_name=None,
- input_name="",
- ):
- """Throw a ValueError if X contains NaN or infinity.
- Parameters
- ----------
- X : {ndarray, sparse matrix}
- The input data.
- allow_nan : bool, default=False
- If True, do not throw error when `X` contains NaN.
- estimator_name : str, default=None
- The estimator name, used to construct the error message.
- input_name : str, default=""
- The data name used to construct the error message. In particular
- if `input_name` is "X" and the data has NaN values and
- allow_nan is False, the error message will link to the imputer
- documentation.
- """
- _assert_all_finite(
- X.data if sp.issparse(X) else X,
- allow_nan=allow_nan,
- estimator_name=estimator_name,
- input_name=input_name,
- )
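- # Illustrative usage sketch (not part of the original module): assert_all_finite
- # passes silently on finite data, tolerates NaN when allow_nan=True, and raises
- # ValueError on infinity.
- def _example_assert_all_finite():  # hypothetical, for illustration only
-     import numpy as np
-     assert_all_finite(np.array([[1.0, 2.0], [3.0, 4.0]]))  # finite: no error
-     assert_all_finite(np.array([1.0, np.nan]), allow_nan=True)  # NaN tolerated
-     try:
-         assert_all_finite(np.array([1.0, np.inf]))
-     except ValueError as exc:
-         return str(exc)  # "Input contains infinity or a value too large ..."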
- def as_float_array(X, *, copy=True, force_all_finite=True):
- """Convert an array-like to an array of floats.
- The new dtype will be np.float32 or np.float64, depending on the original
- type. The function can create a copy or modify the argument depending
- on the argument copy.
- Parameters
- ----------
- X : {array-like, sparse matrix}
- The input data.
- copy : bool, default=True
- If True, a copy of X will be created. If False, a copy may still be
- returned if X's dtype is not a floating point type.
- force_all_finite : bool or 'allow-nan', default=True
- Whether to raise an error on np.inf, np.nan, pd.NA in X. The
- possibilities are:
- - True: Force all values of X to be finite.
- - False: accepts np.inf, np.nan, pd.NA in X.
- - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
- be infinite.
- .. versionadded:: 0.20
- ``force_all_finite`` accepts the string ``'allow-nan'``.
- .. versionchanged:: 0.23
- Accepts `pd.NA` and converts it into `np.nan`
- Returns
- -------
- XT : {ndarray, sparse matrix}
- An array of type float.
- """
- if isinstance(X, np.matrix) or (
- not isinstance(X, np.ndarray) and not sp.issparse(X)
- ):
- return check_array(
- X,
- accept_sparse=["csr", "csc", "coo"],
- dtype=np.float64,
- copy=copy,
- force_all_finite=force_all_finite,
- ensure_2d=False,
- )
- elif sp.issparse(X) and X.dtype in [np.float32, np.float64]:
- return X.copy() if copy else X
- elif X.dtype in [np.float32, np.float64]: # is numpy array
- return X.copy("F" if X.flags["F_CONTIGUOUS"] else "C") if copy else X
- else:
- if X.dtype.kind in "uib" and X.dtype.itemsize <= 4:
- return_dtype = np.float32
- else:
- return_dtype = np.float64
- return X.astype(return_dtype)
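- # Illustrative usage sketch (not part of the original module): as_float_array
- # upcasts small integer dtypes to float32 and larger ones to float64, while
- # float32/float64 input is returned as-is (or copied when copy=True).
- def _example_as_float_array():  # hypothetical, for illustration only
-     import numpy as np
-     X_int = np.array([[1, 2], [3, 4]], dtype=np.int32)
-     X_float = as_float_array(X_int, copy=False)
-     return X_float.dtype  # dtype('float32'), since int32 uses only 4 bytes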
- def _is_arraylike(x):
- """Returns whether the input is array-like."""
- return hasattr(x, "__len__") or hasattr(x, "shape") or hasattr(x, "__array__")
- def _is_arraylike_not_scalar(array):
- """Return True if array is array-like and not a scalar"""
- return _is_arraylike(array) and not np.isscalar(array)
- def _num_features(X):
- """Return the number of features in an array-like X.
- This helper function tries hard to avoid materializing an array version
- of X unless necessary. For instance, if X is a list of lists,
- this function will return the length of the first element, assuming
- that subsequent elements are all lists of the same length without
- checking.
- Parameters
- ----------
- X : array-like
- array-like to get the number of features.
- Returns
- -------
- features : int
- Number of features
- """
- type_ = type(X)
- if type_.__module__ == "builtins":
- type_name = type_.__qualname__
- else:
- type_name = f"{type_.__module__}.{type_.__qualname__}"
- message = f"Unable to find the number of features from X of type {type_name}"
- if not hasattr(X, "__len__") and not hasattr(X, "shape"):
- if not hasattr(X, "__array__"):
- raise TypeError(message)
- # Only convert X to a numpy array if there is no cheaper, heuristic
- # option.
- X = np.asarray(X)
- if hasattr(X, "shape"):
- if not hasattr(X.shape, "__len__") or len(X.shape) <= 1:
- message += f" with shape {X.shape}"
- raise TypeError(message)
- return X.shape[1]
- first_sample = X[0]
- # Do not consider an array-like of strings or dicts to be a 2D array
- if isinstance(first_sample, (str, bytes, dict)):
- message += f" where the samples are of type {type(first_sample).__qualname__}"
- raise TypeError(message)
- try:
- # If X is a list of lists, for instance, we assume that all nested
- # lists have the same length without checking or converting to
- # a numpy array to keep this function call as cheap as possible.
- return len(first_sample)
- except Exception as err:
- raise TypeError(message) from err
- def _num_samples(x):
- """Return number of samples in array-like x."""
- message = "Expected sequence or array-like, got %s" % type(x)
- if hasattr(x, "fit") and callable(x.fit):
- # Don't get num_samples from an ensemble's length!
- raise TypeError(message)
- if not hasattr(x, "__len__") and not hasattr(x, "shape"):
- if hasattr(x, "__array__"):
- x = np.asarray(x)
- else:
- raise TypeError(message)
- if hasattr(x, "shape") and x.shape is not None:
- if len(x.shape) == 0:
- raise TypeError(
- "Singleton array %r cannot be considered a valid collection." % x
- )
- # Check that shape is returning an integer or default to len
- # Dask dataframes may not return numeric shape[0] value
- if isinstance(x.shape[0], numbers.Integral):
- return x.shape[0]
- try:
- return len(x)
- except TypeError as type_error:
- raise TypeError(message) from type_error
- def check_memory(memory):
- """Check that ``memory`` is joblib.Memory-like.
- joblib.Memory-like means that ``memory`` can be converted into a
- joblib.Memory instance (typically a str denoting the ``location``)
- or has the same interface (has a ``cache`` method).
- Parameters
- ----------
- memory : None, str or object with the joblib.Memory interface
- - If string, the location where to create the `joblib.Memory` interface.
- - If None, no caching is done and the Memory object is completely transparent.
- Returns
- -------
- memory : object with the joblib.Memory interface
- A correct joblib.Memory object.
- Raises
- ------
- ValueError
- If ``memory`` is not joblib.Memory-like.
- """
- if memory is None or isinstance(memory, str):
- memory = joblib.Memory(location=memory, verbose=0)
- elif not hasattr(memory, "cache"):
- raise ValueError(
- "'memory' should be None, a string or have the same"
- " interface as joblib.Memory."
- " Got memory='{}' instead.".format(memory)
- )
- return memory
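- # Illustrative usage sketch (not part of the original module): check_memory
- # turns None or a location string into a joblib.Memory object exposing `cache`.
- def _example_check_memory():  # hypothetical, for illustration only
-     memory = check_memory(None)  # no caching, but the Memory interface is available
-     cached_sum = memory.cache(sum)
-     return cached_sum([1, 2, 3])  # 6, computed transparently without caching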
- def check_consistent_length(*arrays):
- """Check that all arrays have consistent first dimensions.
- Checks whether all objects in arrays have the same shape or length.
- Parameters
- ----------
- *arrays : list or tuple of input objects.
- Objects that will be checked for consistent length.
- """
- lengths = [_num_samples(X) for X in arrays if X is not None]
- uniques = np.unique(lengths)
- if len(uniques) > 1:
- raise ValueError(
- "Found input variables with inconsistent numbers of samples: %r"
- % [int(l) for l in lengths]
- )
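- # Illustrative usage sketch (not part of the original module):
- # check_consistent_length accepts any mix of lists and arrays and raises when
- # their first dimensions disagree.
- def _example_check_consistent_length():  # hypothetical, for illustration only
-     import numpy as np
-     check_consistent_length([1, 2, 3], np.zeros((3, 2)))  # OK: both have 3 samples
-     try:
-         check_consistent_length([1, 2, 3], [1, 2])
-     except ValueError as exc:
-         return str(exc)  # "... inconsistent numbers of samples: [3, 2]"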
- def _make_indexable(iterable):
- """Ensure iterable supports indexing or convert to an indexable variant.
- Convert sparse matrices to csr and other non-indexable iterables to arrays.
- Let `None` and indexable objects (e.g. pandas dataframes) pass unchanged.
- Parameters
- ----------
- iterable : {list, dataframe, ndarray, sparse matrix} or None
- Object to be converted to an indexable iterable.
- """
- if sp.issparse(iterable):
- return iterable.tocsr()
- elif hasattr(iterable, "__getitem__") or hasattr(iterable, "iloc"):
- return iterable
- elif iterable is None:
- return iterable
- return np.array(iterable)
- def indexable(*iterables):
- """Make arrays indexable for cross-validation.
- Checks consistent length, passes through None, and ensures that everything
- can be indexed by converting sparse matrices to csr and converting
- non-iterable objects to arrays.
- Parameters
- ----------
- *iterables : {lists, dataframes, ndarrays, sparse matrices}
- List of objects to ensure sliceability.
- Returns
- -------
- result : list of {ndarray, sparse matrix, dataframe} or None
- Returns a list containing indexable arrays (i.e. NumPy array,
- sparse matrix, or dataframe) or `None`.
- """
- result = [_make_indexable(X) for X in iterables]
- check_consistent_length(*result)
- return result
- def _ensure_sparse_format(
- spmatrix,
- accept_sparse,
- dtype,
- copy,
- force_all_finite,
- accept_large_sparse,
- estimator_name=None,
- input_name="",
- ):
- """Convert a sparse matrix to a given format.
- Checks the sparse format of spmatrix and converts if necessary.
- Parameters
- ----------
- spmatrix : sparse matrix
- Input to validate and convert.
- accept_sparse : str, bool or list/tuple of str
- String[s] representing allowed sparse matrix formats ('csc',
- 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia'). If the input is sparse but
- not in the allowed format, it will be converted to the first listed
- format. True allows the input to be any format. False means
- that a sparse matrix input will raise an error.
- dtype : str, type or None
- Data type of result. If None, the dtype of the input is preserved.
- copy : bool
- Whether a forced copy will be triggered. If copy=False, a copy might
- be triggered by a conversion.
- force_all_finite : bool or 'allow-nan'
- Whether to raise an error on np.inf, np.nan, pd.NA in X. The
- possibilities are:
- - True: Force all values of X to be finite.
- - False: accepts np.inf, np.nan, pd.NA in X.
- - 'allow-nan': accepts only np.nan and pd.NA values in X. Values cannot
- be infinite.
- .. versionadded:: 0.20
- ``force_all_finite`` accepts the string ``'allow-nan'``.
- .. versionchanged:: 0.23
- Accepts `pd.NA` and converts it into `np.nan`
- estimator_name : str, default=None
- The estimator name, used to construct the error message.
- input_name : str, default=""
- The data name used to construct the error message. In particular
- if `input_name` is "X" and the data has NaN values and
- allow_nan is False, the error message will link to the imputer
- documentation.
- Returns
- -------
- spmatrix_converted : sparse matrix.
- Matrix that is ensured to have an allowed type.
- """
- if dtype is None:
- dtype = spmatrix.dtype
- changed_format = False
- if isinstance(accept_sparse, str):
- accept_sparse = [accept_sparse]
- # Indices dtype validation
- _check_large_sparse(spmatrix, accept_large_sparse)
- if accept_sparse is False:
- raise TypeError(
- "A sparse matrix was passed, but dense "
- "data is required. Use X.toarray() to "
- "convert to a dense numpy array."
- )
- elif isinstance(accept_sparse, (list, tuple)):
- if len(accept_sparse) == 0:
- raise ValueError(
- "When providing 'accept_sparse' "
- "as a tuple or list, it must contain at "
- "least one string value."
- )
- # ensure correct sparse format
- if spmatrix.format not in accept_sparse:
- # create new with correct sparse
- spmatrix = spmatrix.asformat(accept_sparse[0])
- changed_format = True
- elif accept_sparse is not True:
- # any other type
- raise ValueError(
- "Parameter 'accept_sparse' should be a string, "
- "boolean or list of strings. You provided "
- "'accept_sparse={}'.".format(accept_sparse)
- )
- if dtype != spmatrix.dtype:
- # convert dtype
- spmatrix = spmatrix.astype(dtype)
- elif copy and not changed_format:
- # force copy
- spmatrix = spmatrix.copy()
- if force_all_finite:
- if not hasattr(spmatrix, "data"):
- warnings.warn(
- "Can't check %s sparse matrix for nan or inf." % spmatrix.format,
- stacklevel=2,
- )
- else:
- _assert_all_finite(
- spmatrix.data,
- allow_nan=force_all_finite == "allow-nan",
- estimator_name=estimator_name,
- input_name=input_name,
- )
- return spmatrix
- def _ensure_no_complex_data(array):
- if (
- hasattr(array, "dtype")
- and array.dtype is not None
- and hasattr(array.dtype, "kind")
- and array.dtype.kind == "c"
- ):
- raise ValueError("Complex data not supported\n{}\n".format(array))
- def _check_estimator_name(estimator):
- if estimator is not None:
- if isinstance(estimator, str):
- return estimator
- else:
- return estimator.__class__.__name__
- return None
- def _pandas_dtype_needs_early_conversion(pd_dtype):
- """Return True if the pandas extension pd_dtype needs to be converted early."""
- # Check these early for pandas versions without extension dtypes
- from pandas import SparseDtype
- from pandas.api.types import (
- is_bool_dtype,
- is_float_dtype,
- is_integer_dtype,
- )
- if is_bool_dtype(pd_dtype):
- # bool and extension booleans need early conversion because __array__
- # converts mixed dtype dataframes into object dtypes
- return True
- if isinstance(pd_dtype, SparseDtype):
- # Sparse arrays will be converted later in `check_array`
- return False
- try:
- from pandas.api.types import is_extension_array_dtype
- except ImportError:
- return False
- if not is_extension_array_dtype(pd_dtype):
- # Sparse dtypes were already handled above; only extension arrays for
- # integers and floats need early conversion.
- return False
- elif is_float_dtype(pd_dtype):
- # Float ndarrays can normally support nans. They need to be converted
- # first to map pd.NA to np.nan
- return True
- elif is_integer_dtype(pd_dtype):
- # XXX: Warn when converting from a high integer to a float
- return True
- return False
- def _is_extension_array_dtype(array):
- # Pandas extension arrays have a dtype with an na_value
- return hasattr(array, "dtype") and hasattr(array.dtype, "na_value")
- def check_array(
- array,
- accept_sparse=False,
- *,
- accept_large_sparse=True,
- dtype="numeric",
- order=None,
- copy=False,
- force_all_finite=True,
- ensure_2d=True,
- allow_nd=False,
- ensure_min_samples=1,
- ensure_min_features=1,
- estimator=None,
- input_name="",
- ):
- """Input validation on an array, list, sparse matrix or similar.
- By default, the input is checked to be a non-empty 2D array containing
- only finite values. If the dtype of the array is object, attempt
- converting to float, raising on failure.
- Parameters
- ----------
- array : object
- Input object to check / convert.
- accept_sparse : str, bool or list/tuple of str, default=False
- String[s] representing allowed sparse matrix formats, such as 'csc',
- 'csr', etc. If the input is sparse but not in the allowed format,
- it will be converted to the first listed format. True allows the input
- to be any format. False means that a sparse matrix input will
- raise an error.
- accept_large_sparse : bool, default=True
- If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
- accept_sparse, accept_large_sparse=False will cause it to be accepted
- only if its indices are stored with a 32-bit dtype.
- .. versionadded:: 0.20
- dtype : 'numeric', type, list of type or None, default='numeric'
- Data type of result. If None, the dtype of the input is preserved.
- If "numeric", dtype is preserved unless array.dtype is object.
- If dtype is a list of types, conversion on the first type is only
- performed if the dtype of the input is not in the list.
- order : {'F', 'C'} or None, default=None
- Whether an array will be forced to be fortran or c-style.
- When order is None (default), then if copy=False, nothing is ensured
- about the memory layout of the output array; otherwise (copy=True)
- the memory layout of the returned array is kept as close as possible
- to the original array.
- copy : bool, default=False
- Whether a forced copy will be triggered. If copy=False, a copy might
- be triggered by a conversion.
- force_all_finite : bool or 'allow-nan', default=True
- Whether to raise an error on np.inf, np.nan, pd.NA in array. The
- possibilities are:
- - True: Force all values of array to be finite.
- - False: accepts np.inf, np.nan, pd.NA in array.
- - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
- cannot be infinite.
- .. versionadded:: 0.20
- ``force_all_finite`` accepts the string ``'allow-nan'``.
- .. versionchanged:: 0.23
- Accepts `pd.NA` and converts it into `np.nan`
- ensure_2d : bool, default=True
- Whether to raise a value error if array is not 2D.
- allow_nd : bool, default=False
- Whether to allow array.ndim > 2.
- ensure_min_samples : int, default=1
- Make sure that the array has a minimum number of samples in its first
- axis (rows for a 2D array). Setting to 0 disables this check.
- ensure_min_features : int, default=1
- Make sure that the 2D array has some minimum number of features
- (columns). The default value of 1 rejects empty datasets.
- This check is only enforced when the input data has effectively 2
- dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
- disables this check.
- estimator : str or estimator instance, default=None
- If passed, include the name of the estimator in warning messages.
- input_name : str, default=""
- The data name used to construct the error message. In particular
- if `input_name` is "X" and the data has NaN values and
- allow_nan is False, the error message will link to the imputer
- documentation.
- .. versionadded:: 1.1.0
- Returns
- -------
- array_converted : object
- The converted and validated array.
- """
- if isinstance(array, np.matrix):
- raise TypeError(
- "np.matrix is not supported. Please convert to a numpy array with "
- "np.asarray. For more information see: "
- "https://numpy.org/doc/stable/reference/generated/numpy.matrix.html"
- )
- xp, is_array_api_compliant = get_namespace(array)
- # store reference to original array to check if copy is needed when
- # function returns
- array_orig = array
- # store whether originally we wanted numeric dtype
- dtype_numeric = isinstance(dtype, str) and dtype == "numeric"
- dtype_orig = getattr(array, "dtype", None)
- if not is_array_api_compliant and not hasattr(dtype_orig, "kind"):
- # not a data type (e.g. a column named dtype in a pandas DataFrame)
- dtype_orig = None
- # check if the object contains several dtypes (typically a pandas
- # DataFrame), and store them. If not, store None.
- dtypes_orig = None
- pandas_requires_conversion = False
- if hasattr(array, "dtypes") and hasattr(array.dtypes, "__array__"):
- # throw warning if columns are sparse. If all columns are sparse, then
- # array.sparse exists and sparsity will be preserved (later).
- with suppress(ImportError):
- from pandas import SparseDtype
- def is_sparse(dtype):
- return isinstance(dtype, SparseDtype)
- if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
- warnings.warn(
- "pandas.DataFrame with sparse columns found. "
- "It will be converted to a dense numpy array."
- )
- dtypes_orig = list(array.dtypes)
- pandas_requires_conversion = any(
- _pandas_dtype_needs_early_conversion(i) for i in dtypes_orig
- )
- if all(isinstance(dtype_iter, np.dtype) for dtype_iter in dtypes_orig):
- dtype_orig = np.result_type(*dtypes_orig)
- elif pandas_requires_conversion and any(d == object for d in dtypes_orig):
- # Force object if any of the dtypes is an object
- dtype_orig = object
- elif (_is_extension_array_dtype(array) or hasattr(array, "iloc")) and hasattr(
- array, "dtype"
- ):
- # array is a pandas series
- pandas_requires_conversion = _pandas_dtype_needs_early_conversion(array.dtype)
- if isinstance(array.dtype, np.dtype):
- dtype_orig = array.dtype
- else:
- # Set to None to let array.astype work out the best dtype
- dtype_orig = None
- if dtype_numeric:
- if (
- dtype_orig is not None
- and hasattr(dtype_orig, "kind")
- and dtype_orig.kind == "O"
- ):
- # if input is object, convert to float.
- dtype = xp.float64
- else:
- dtype = None
- if isinstance(dtype, (list, tuple)):
- if dtype_orig is not None and dtype_orig in dtype:
- # no dtype conversion required
- dtype = None
- else:
- # dtype conversion required. Let's select the first element of the
- # list of accepted types.
- dtype = dtype[0]
- if pandas_requires_conversion:
- # pandas dataframe requires conversion earlier to handle extension dtypes with
- # nans
- # Use the original dtype for conversion if dtype is None
- new_dtype = dtype_orig if dtype is None else dtype
- array = array.astype(new_dtype)
- # Since we converted here, we do not need to convert again later
- dtype = None
- if force_all_finite not in (True, False, "allow-nan"):
- raise ValueError(
- 'force_all_finite should be a bool or "allow-nan". Got {!r} instead'.format(
- force_all_finite
- )
- )
- if dtype is not None and _is_numpy_namespace(xp):
- # convert to a dtype object to conform to the Array API, so that `xp.isdtype` can be used later
- dtype = np.dtype(dtype)
- estimator_name = _check_estimator_name(estimator)
- context = " by %s" % estimator_name if estimator is not None else ""
- # When all dataframe columns are sparse, convert to a sparse array
- if hasattr(array, "sparse") and array.ndim > 1:
- with suppress(ImportError):
- from pandas import SparseDtype # noqa: F811
- def is_sparse(dtype):
- return isinstance(dtype, SparseDtype)
- if array.dtypes.apply(is_sparse).all():
- # DataFrame.sparse only supports `to_coo`
- array = array.sparse.to_coo()
- if array.dtype == np.dtype("object"):
- unique_dtypes = set([dt.subtype.name for dt in array_orig.dtypes])
- if len(unique_dtypes) > 1:
- raise ValueError(
- "Pandas DataFrame with mixed sparse extension arrays "
- "generated a sparse matrix with object dtype which "
- "cannot be converted to a scipy sparse matrix. "
- "Sparse extension arrays should all have the same "
- "numeric type."
- )
- if sp.issparse(array):
- _ensure_no_complex_data(array)
- array = _ensure_sparse_format(
- array,
- accept_sparse=accept_sparse,
- dtype=dtype,
- copy=copy,
- force_all_finite=force_all_finite,
- accept_large_sparse=accept_large_sparse,
- estimator_name=estimator_name,
- input_name=input_name,
- )
- else:
- # If np.array(..) gives ComplexWarning, then we convert the warning
- # to an error. This is needed because specifying a non-complex
- # dtype to the function converts complex data to a real dtype,
- # which would otherwise slip past the complex-data check performed
- # after this warnings context manager.
- with warnings.catch_warnings():
- try:
- warnings.simplefilter("error", ComplexWarning)
- if dtype is not None and xp.isdtype(dtype, "integral"):
- # Conversion float -> int should not contain NaN or
- # inf (numpy#14412). We cannot use casting='safe' because
- # then conversion float -> int would be disallowed.
- array = _asarray_with_order(array, order=order, xp=xp)
- if xp.isdtype(array.dtype, ("real floating", "complex floating")):
- _assert_all_finite(
- array,
- allow_nan=False,
- msg_dtype=dtype,
- estimator_name=estimator_name,
- input_name=input_name,
- )
- array = xp.astype(array, dtype, copy=False)
- else:
- array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
- except ComplexWarning as complex_warning:
- raise ValueError(
- "Complex data not supported\n{}\n".format(array)
- ) from complex_warning
- # It is possible that the np.array(..) gave no warning. This happens
- # when no dtype conversion happened, for example dtype = None. The
- # result is that np.array(..) produces an array of complex dtype
- # and we need to catch and raise exception for such cases.
- _ensure_no_complex_data(array)
- if ensure_2d:
- # If input is scalar raise error
- if array.ndim == 0:
- raise ValueError(
- "Expected 2D array, got scalar array instead:\narray={}.\n"
- "Reshape your data either using array.reshape(-1, 1) if "
- "your data has a single feature or array.reshape(1, -1) "
- "if it contains a single sample.".format(array)
- )
- # If input is 1D raise error
- if array.ndim == 1:
- raise ValueError(
- "Expected 2D array, got 1D array instead:\narray={}.\n"
- "Reshape your data either using array.reshape(-1, 1) if "
- "your data has a single feature or array.reshape(1, -1) "
- "if it contains a single sample.".format(array)
- )
- if dtype_numeric and hasattr(array.dtype, "kind") and array.dtype.kind in "USV":
- raise ValueError(
- "dtype='numeric' is not compatible with arrays of bytes/strings. "
- "Convert your data to numeric values explicitly instead."
- )
- if not allow_nd and array.ndim >= 3:
- raise ValueError(
- "Found array with dim %d. %s expected <= 2."
- % (array.ndim, estimator_name)
- )
- if force_all_finite:
- _assert_all_finite(
- array,
- input_name=input_name,
- estimator_name=estimator_name,
- allow_nan=force_all_finite == "allow-nan",
- )
- if ensure_min_samples > 0:
- n_samples = _num_samples(array)
- if n_samples < ensure_min_samples:
- raise ValueError(
- "Found array with %d sample(s) (shape=%s) while a"
- " minimum of %d is required%s."
- % (n_samples, array.shape, ensure_min_samples, context)
- )
- if ensure_min_features > 0 and array.ndim == 2:
- n_features = array.shape[1]
- if n_features < ensure_min_features:
- raise ValueError(
- "Found array with %d feature(s) (shape=%s) while"
- " a minimum of %d is required%s."
- % (n_features, array.shape, ensure_min_features, context)
- )
- if copy:
- if _is_numpy_namespace(xp):
- # only make a copy if `array` and `array_orig` may share memory
- if np.may_share_memory(array, array_orig):
- array = _asarray_with_order(
- array, dtype=dtype, order=order, copy=True, xp=xp
- )
- else:
- # always make a copy for non-numpy arrays
- array = _asarray_with_order(
- array, dtype=dtype, order=order, copy=True, xp=xp
- )
- return array
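- # Illustrative usage sketch (not part of the original module): check_array
- # converts list input to a validated 2D ndarray, enforces finiteness by
- # default, and only lets NaN through when force_all_finite="allow-nan".
- def _example_check_array():  # hypothetical, for illustration only
-     import numpy as np
-     X = check_array([[1, 2], [3, 4]])  # 2D ndarray, dtype preserved
-     X_nan = check_array(np.array([[1.0, np.nan]]), force_all_finite="allow-nan")
-     return X.shape, X_nan.shape  # ((2, 2), (1, 2))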
- def _check_large_sparse(X, accept_large_sparse=False):
- """Raise a ValueError if X has 64bit indices and accept_large_sparse=False"""
- if not accept_large_sparse:
- supported_indices = ["int32"]
- if X.getformat() == "coo":
- index_keys = ["col", "row"]
- elif X.getformat() in ["csr", "csc", "bsr"]:
- index_keys = ["indices", "indptr"]
- else:
- return
- for key in index_keys:
- indices_datatype = getattr(X, key).dtype
- if indices_datatype not in supported_indices:
- raise ValueError(
- "Only sparse matrices with 32-bit integer indices are accepted."
- f" Got {indices_datatype} indices. Please do report a minimal"
- " reproducer on scikit-learn issue tracker so that support for"
- " your use-case can be studied by maintainers. See:"
- " https://scikit-learn.org/dev/developers/minimal_reproducer.html"
- )
- def check_X_y(
- X,
- y,
- accept_sparse=False,
- *,
- accept_large_sparse=True,
- dtype="numeric",
- order=None,
- copy=False,
- force_all_finite=True,
- ensure_2d=True,
- allow_nd=False,
- multi_output=False,
- ensure_min_samples=1,
- ensure_min_features=1,
- y_numeric=False,
- estimator=None,
- ):
- """Input validation for standard estimators.
- Checks X and y for consistent length, enforces X to be 2D and y 1D. By
- default, X is checked to be non-empty and containing only finite values.
- Standard input checks are also applied to y, such as checking that y
- does not have np.nan or np.inf targets. For multi-label y, set
- multi_output=True to allow 2D and sparse y. If the dtype of X is
- object, attempt converting to float, raising on failure.
- Parameters
- ----------
- X : {ndarray, list, sparse matrix}
- Input data.
- y : {ndarray, list, sparse matrix}
- Labels.
- accept_sparse : str, bool or list of str, default=False
- String[s] representing allowed sparse matrix formats, such as 'csc',
- 'csr', etc. If the input is sparse but not in the allowed format,
- it will be converted to the first listed format. True allows the input
- to be any format. False means that a sparse matrix input will
- raise an error.
- accept_large_sparse : bool, default=True
- If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
- accept_sparse, accept_large_sparse=False will cause it to be accepted only
- if its indices are stored with a 32-bit dtype.
- .. versionadded:: 0.20
- dtype : 'numeric', type, list of type or None, default='numeric'
- Data type of result. If None, the dtype of the input is preserved.
- If "numeric", dtype is preserved unless array.dtype is object.
- If dtype is a list of types, conversion on the first type is only
- performed if the dtype of the input is not in the list.
- order : {'F', 'C'}, default=None
- Whether an array will be forced to be fortran or c-style. If
- `None`, then the input data's order is preserved when possible.
- copy : bool, default=False
- Whether a forced copy will be triggered. If copy=False, a copy might
- be triggered by a conversion.
- force_all_finite : bool or 'allow-nan', default=True
- Whether to raise an error on np.inf, np.nan, pd.NA in X. This parameter
- does not influence whether y can have np.inf, np.nan, pd.NA values.
- The possibilities are:
- - True: Force all values of X to be finite.
- - False: accepts np.inf, np.nan, pd.NA in X.
- - 'allow-nan': accepts only np.nan or pd.NA values in X. Values cannot
- be infinite.
- .. versionadded:: 0.20
- ``force_all_finite`` accepts the string ``'allow-nan'``.
- .. versionchanged:: 0.23
- Accepts `pd.NA` and converts it into `np.nan`
- ensure_2d : bool, default=True
- Whether to raise a value error if X is not 2D.
- allow_nd : bool, default=False
- Whether to allow X.ndim > 2.
- multi_output : bool, default=False
- Whether to allow 2D y (array or sparse matrix). If false, y will be
- validated as a vector. y cannot have np.nan or np.inf values if
- multi_output=True.
- ensure_min_samples : int, default=1
- Make sure that X has a minimum number of samples in its first
- axis (rows for a 2D array).
- ensure_min_features : int, default=1
- Make sure that the 2D array has some minimum number of features
- (columns). The default value of 1 rejects empty datasets.
- This check is only enforced when X has effectively 2 dimensions or
- is originally 1D and ``ensure_2d`` is True. Setting to 0 disables
- this check.
- y_numeric : bool, default=False
- Whether to ensure that y has a numeric type. If dtype of y is object,
- it is converted to float64. Should only be used for regression
- algorithms.
- estimator : str or estimator instance, default=None
- If passed, include the name of the estimator in warning messages.
- Returns
- -------
- X_converted : object
- The converted and validated X.
- y_converted : object
- The converted and validated y.
- """
- if y is None:
- if estimator is None:
- estimator_name = "estimator"
- else:
- estimator_name = _check_estimator_name(estimator)
- raise ValueError(
- f"{estimator_name} requires y to be passed, but the target y is None"
- )
- X = check_array(
- X,
- accept_sparse=accept_sparse,
- accept_large_sparse=accept_large_sparse,
- dtype=dtype,
- order=order,
- copy=copy,
- force_all_finite=force_all_finite,
- ensure_2d=ensure_2d,
- allow_nd=allow_nd,
- ensure_min_samples=ensure_min_samples,
- ensure_min_features=ensure_min_features,
- estimator=estimator,
- input_name="X",
- )
- y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
- check_consistent_length(X, y)
- return X, y
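- # Illustrative usage sketch (not part of the original module): check_X_y
- # validates X and y together, forcing X to 2D and y to 1D, and checking that
- # both have the same number of samples.
- def _example_check_X_y():  # hypothetical, for illustration only
-     X = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
-     y = [0, 1, 0]
-     X_checked, y_checked = check_X_y(X, y)
-     return X_checked.shape, y_checked.shape  # ((3, 2), (3,))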
- def _check_y(y, multi_output=False, y_numeric=False, estimator=None):
- """Isolated part of check_X_y dedicated to y validation"""
- if multi_output:
- y = check_array(
- y,
- accept_sparse="csr",
- force_all_finite=True,
- ensure_2d=False,
- dtype=None,
- input_name="y",
- estimator=estimator,
- )
- else:
- estimator_name = _check_estimator_name(estimator)
- y = column_or_1d(y, warn=True)
- _assert_all_finite(y, input_name="y", estimator_name=estimator_name)
- _ensure_no_complex_data(y)
- if y_numeric and y.dtype.kind == "O":
- y = y.astype(np.float64)
- return y
- def column_or_1d(y, *, dtype=None, warn=False):
- """Ravel column or 1d numpy array, else raises an error.
- Parameters
- ----------
- y : array-like
- Input data.
- dtype : data-type, default=None
- Data type for `y`.
- .. versionadded:: 1.2
- warn : bool, default=False
- To control display of warnings.
- Returns
- -------
- y : ndarray
- Output data.
- Raises
- ------
- ValueError
- If `y` is not a 1D array or a 2D array with a single row or column.
- """
- xp, _ = get_namespace(y)
- y = check_array(
- y,
- ensure_2d=False,
- dtype=dtype,
- input_name="y",
- force_all_finite=False,
- ensure_min_samples=0,
- )
- shape = y.shape
- if len(shape) == 1:
- return _asarray_with_order(xp.reshape(y, (-1,)), order="C", xp=xp)
- if len(shape) == 2 and shape[1] == 1:
- if warn:
- warnings.warn(
- (
- "A column-vector y was passed when a 1d array was"
- " expected. Please change the shape of y to "
- "(n_samples, ), for example using ravel()."
- ),
- DataConversionWarning,
- stacklevel=2,
- )
- return _asarray_with_order(xp.reshape(y, (-1,)), order="C", xp=xp)
- raise ValueError(
- "y should be a 1d array, got an array of shape {} instead.".format(shape)
- )
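- # Illustrative usage sketch (not part of the original module): column_or_1d
- # ravels a (n_samples, 1) column vector to shape (n_samples,), emitting a
- # DataConversionWarning when warn=True.
- def _example_column_or_1d():  # hypothetical, for illustration only
-     import numpy as np
-     y_col = np.array([[1], [2], [3]])
-     return column_or_1d(y_col, warn=True).shape  # (3,)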
- def check_random_state(seed):
- """Turn seed into a np.random.RandomState instance.
- Parameters
- ----------
- seed : None, int or instance of RandomState
- If seed is None, return the RandomState singleton used by np.random.
- If seed is an int, return a new RandomState instance seeded with seed.
- If seed is already a RandomState instance, return it.
- Otherwise raise ValueError.
- Returns
- -------
- :class:`numpy:numpy.random.RandomState`
- The random state object based on `seed` parameter.
- """
- if seed is None or seed is np.random:
- return np.random.mtrand._rand
- if isinstance(seed, numbers.Integral):
- return np.random.RandomState(seed)
- if isinstance(seed, np.random.RandomState):
- return seed
- raise ValueError(
- "%r cannot be used to seed a numpy.random.RandomState instance" % seed
- )
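- # Illustrative usage sketch (not part of the original module): check_random_state
- # normalizes None, an int seed, or an existing RandomState into a RandomState.
- def _example_check_random_state():  # hypothetical, for illustration only
-     rng = check_random_state(0)  # int seed -> new RandomState(0)
-     same_rng = check_random_state(rng)  # RandomState instances pass through
-     return rng is same_rng  # True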
- def has_fit_parameter(estimator, parameter):
- """Check whether the estimator's fit method supports the given parameter.
- Parameters
- ----------
- estimator : object
- An estimator to inspect.
- parameter : str
- The searched parameter.
- Returns
- -------
- is_parameter : bool
- Whether the parameter was found to be a named parameter of the
- estimator's fit method.
- Examples
- --------
- >>> from sklearn.svm import SVC
- >>> from sklearn.utils.validation import has_fit_parameter
- >>> has_fit_parameter(SVC(), "sample_weight")
- True
- """
- return parameter in signature(estimator.fit).parameters
- def check_symmetric(array, *, tol=1e-10, raise_warning=True, raise_exception=False):
- """Make sure that array is 2D, square and symmetric.
- If the array is not symmetric, then a symmetrized version is returned.
- Optionally, a warning or exception is raised if the matrix is not
- symmetric.
- Parameters
- ----------
- array : {ndarray, sparse matrix}
- Input object to check / convert. Must be two-dimensional and square,
- otherwise a ValueError will be raised.
- tol : float, default=1e-10
- Absolute tolerance for equivalence of arrays. Default = 1E-10.
- raise_warning : bool, default=True
- If True then raise a warning if conversion is required.
- raise_exception : bool, default=False
- If True then raise an exception if array is not symmetric.
- Returns
- -------
- array_sym : {ndarray, sparse matrix}
- Symmetrized version of the input array, i.e. the average of array
- and array.transpose(). If sparse, then duplicate entries are first
- summed and zeros are eliminated.
- """
- if (array.ndim != 2) or (array.shape[0] != array.shape[1]):
- raise ValueError(
- "array must be 2-dimensional and square. shape = {0}".format(array.shape)
- )
- if sp.issparse(array):
- diff = array - array.T
- # only csr, csc, and coo have `data` attribute
- if diff.format not in ["csr", "csc", "coo"]:
- diff = diff.tocsr()
- symmetric = np.all(abs(diff.data) < tol)
- else:
- symmetric = np.allclose(array, array.T, atol=tol)
- if not symmetric:
- if raise_exception:
- raise ValueError("Array must be symmetric")
- if raise_warning:
- warnings.warn(
- (
- "Array is not symmetric, and will be converted "
- "to symmetric by averaging it with its transpose."
- ),
- stacklevel=2,
- )
- if sp.issparse(array):
- conversion = "to" + array.format
- array = getattr(0.5 * (array + array.T), conversion)()
- else:
- array = 0.5 * (array + array.T)
- return array
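- # Illustrative usage sketch (not part of the original module): check_symmetric
- # replaces a nearly-symmetric matrix by the average of itself and its transpose.
- def _example_check_symmetric():  # hypothetical, for illustration only
-     import numpy as np
-     A = np.array([[1.0, 2.0], [2.001, 1.0]])  # asymmetry above the tolerance
-     A_sym = check_symmetric(A, raise_warning=False)
-     return A_sym[0, 1], A_sym[1, 0]  # both 2.0005 after averaging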
- def _is_fitted(estimator, attributes=None, all_or_any=all):
- """Determine if an estimator is fitted
- Parameters
- ----------
- estimator : estimator instance
- Estimator instance for which the check is performed.
- attributes : str, list or tuple of str, default=None
- Attribute name(s) given as string or a list/tuple of strings
- Eg.: ``["coef_", "estimator_", ...], "coef_"``
- If `None`, `estimator` is considered fitted if there exists an
- attribute that ends with an underscore and does not start with a double
- underscore.
- all_or_any : callable, {all, any}, default=all
- Specify whether all or any of the given attributes must exist.
- Returns
- -------
- fitted : bool
- Whether the estimator is fitted.
- """
- if attributes is not None:
- if not isinstance(attributes, (list, tuple)):
- attributes = [attributes]
- return all_or_any([hasattr(estimator, attr) for attr in attributes])
- if hasattr(estimator, "__sklearn_is_fitted__"):
- return estimator.__sklearn_is_fitted__()
- fitted_attrs = [
- v for v in vars(estimator) if v.endswith("_") and not v.startswith("__")
- ]
- return len(fitted_attrs) > 0
- def check_is_fitted(estimator, attributes=None, *, msg=None, all_or_any=all):
- """Perform is_fitted validation for estimator.
- Checks if the estimator is fitted by verifying the presence of
- fitted attributes (ending with a trailing underscore) and otherwise
- raises a NotFittedError with the given message.
- If an estimator does not set any attributes with a trailing underscore, it
- can define a ``__sklearn_is_fitted__`` method returning a boolean to specify if the
- estimator is fitted or not.
- Parameters
- ----------
- estimator : estimator instance
- Estimator instance for which the check is performed.
- attributes : str, list or tuple of str, default=None
- Attribute name(s) given as string or a list/tuple of strings
- Eg.: ``["coef_", "estimator_", ...], "coef_"``
- If `None`, `estimator` is considered fitted if there exists an
- attribute that ends with an underscore and does not start with a double
- underscore.
- msg : str, default=None
- The default error message is, "This %(name)s instance is not fitted
- yet. Call 'fit' with appropriate arguments before using this
- estimator."
- For custom messages if "%(name)s" is present in the message string,
- it is substituted for the estimator name.
- Eg. : "Estimator, %(name)s, must be fitted before sparsifying".
- all_or_any : callable, {all, any}, default=all
- Specify whether all or any of the given attributes must exist.
- Raises
- ------
- TypeError
- If the estimator is a class or not an estimator instance.
- NotFittedError
- If the attributes are not found.
- """
- if isclass(estimator):
- raise TypeError("{} is a class, not an instance.".format(estimator))
- if msg is None:
- msg = (
- "This %(name)s instance is not fitted yet. Call 'fit' with "
- "appropriate arguments before using this estimator."
- )
- if not hasattr(estimator, "fit"):
- raise TypeError("%s is not an estimator instance." % (estimator))
- if not _is_fitted(estimator, attributes, all_or_any):
- raise NotFittedError(msg % {"name": type(estimator).__name__})
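- # Illustrative usage sketch (not part of the original module): check_is_fitted
- # raises NotFittedError until the estimator exposes a trailing-underscore
- # attribute (or reports fitted via __sklearn_is_fitted__).
- def _example_check_is_fitted():  # hypothetical, for illustration only
-     class TinyEstimator:  # minimal object with a fit method
-         def fit(self, X):
-             self.mean_ = sum(X) / len(X)
-             return self
-     est = TinyEstimator()
-     try:
-         check_is_fitted(est)
-     except NotFittedError:
-         pass  # not fitted yet: no attribute ends with "_"
-     est.fit([1.0, 2.0, 3.0])
-     check_is_fitted(est)  # passes now that `mean_` exists
-     return est.mean_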
- def check_non_negative(X, whom):
- """
- Check if there is any negative value in an array.
- Parameters
- ----------
- X : {array-like, sparse matrix}
- Input data.
- whom : str
- Who passed X to this function.
- """
- xp, _ = get_namespace(X)
- # avoid X.min() on sparse matrix since it also sorts the indices
- if sp.issparse(X):
- if X.format in ["lil", "dok"]:
- X = X.tocsr()
- if X.data.size == 0:
- X_min = 0
- else:
- X_min = X.data.min()
- else:
- X_min = xp.min(X)
- if X_min < 0:
- raise ValueError("Negative values in data passed to %s" % whom)
- def check_scalar(
- x,
- name,
- target_type,
- *,
- min_val=None,
- max_val=None,
- include_boundaries="both",
- ):
- """Validate scalar parameters type and value.
- Parameters
- ----------
- x : object
- The scalar parameter to validate.
- name : str
- The name of the parameter to be printed in error messages.
- target_type : type or tuple
- Acceptable data types for the parameter.
- min_val : float or int, default=None
- The minimum valid value the parameter can take. If None (default) it
- is implied that the parameter does not have a lower bound.
- max_val : float or int, default=None
- The maximum valid value the parameter can take. If None (default) it
- is implied that the parameter does not have an upper bound.
- include_boundaries : {"left", "right", "both", "neither"}, default="both"
- Whether the interval defined by `min_val` and `max_val` should include
- the boundaries. Possible choices are:
- - `"left"`: only `min_val` is included in the valid interval.
- It is equivalent to the interval `[ min_val, max_val )`.
- - `"right"`: only `max_val` is included in the valid interval.
- It is equivalent to the interval `( min_val, max_val ]`.
- - `"both"`: `min_val` and `max_val` are included in the valid interval.
- It is equivalent to the interval `[ min_val, max_val ]`.
- - `"neither"`: neither `min_val` nor `max_val` are included in the
- valid interval. It is equivalent to the interval `( min_val, max_val )`.
- Returns
- -------
- x : numbers.Number
- The validated number.
- Raises
- ------
- TypeError
- If the parameter's type does not match the desired type.
- ValueError
- If the parameter's value violates the given bounds.
- If `min_val`, `max_val` and `include_boundaries` are inconsistent.
- """
- def type_name(t):
- """Convert type into a human-readable string."""
- module = t.__module__
- qualname = t.__qualname__
- if module == "builtins":
- return qualname
- elif t == numbers.Real:
- return "float"
- elif t == numbers.Integral:
- return "int"
- return f"{module}.{qualname}"
- if not isinstance(x, target_type):
- if isinstance(target_type, tuple):
- types_str = ", ".join(type_name(t) for t in target_type)
- target_type_str = f"{{{types_str}}}"
- else:
- target_type_str = type_name(target_type)
- raise TypeError(
- f"{name} must be an instance of {target_type_str}, not"
- f" {type(x).__qualname__}."
- )
- expected_include_boundaries = ("left", "right", "both", "neither")
- if include_boundaries not in expected_include_boundaries:
- raise ValueError(
- f"Unknown value for `include_boundaries`: {repr(include_boundaries)}. "
- f"Possible values are: {expected_include_boundaries}."
- )
- if max_val is None and include_boundaries == "right":
- raise ValueError(
- "`include_boundaries`='right' without specifying explicitly `max_val` "
- "is inconsistent."
- )
- if min_val is None and include_boundaries == "left":
- raise ValueError(
- "`include_boundaries`='left' without specifying explicitly `min_val` "
- "is inconsistent."
- )
- comparison_operator = (
- operator.lt if include_boundaries in ("left", "both") else operator.le
- )
- if min_val is not None and comparison_operator(x, min_val):
- raise ValueError(
- f"{name} == {x}, must be"
- f" {'>=' if include_boundaries in ('left', 'both') else '>'} {min_val}."
- )
- comparison_operator = (
- operator.gt if include_boundaries in ("right", "both") else operator.ge
- )
- if max_val is not None and comparison_operator(x, max_val):
- raise ValueError(
- f"{name} == {x}, must be"
- f" {'<=' if include_boundaries in ('right', 'both') else '<'} {max_val}."
- )
- return x
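- # Illustrative usage sketch (not part of the original module): check_scalar
- # validates a parameter's type and range, honouring the boundary policy.
- def _example_check_scalar():  # hypothetical, for illustration only
-     alpha = check_scalar(0.5, "alpha", target_type=float, min_val=0.0, max_val=1.0)
-     try:
-         check_scalar(0, "degree", target_type=int, min_val=1)
-     except ValueError as exc:
-         return alpha, str(exc)  # "degree == 0, must be >= 1."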
- def _check_psd_eigenvalues(lambdas, enable_warnings=False):
- """Check the eigenvalues of a positive semidefinite (PSD) matrix.
- Checks the provided array of PSD matrix eigenvalues for numerical or
- conditioning issues and returns a fixed validated version. This method
- should typically be used if the PSD matrix is user-provided (e.g. a
- Gram matrix) or computed using a user-provided dissimilarity metric
- (e.g. kernel function), or if the decomposition process uses approximation
- methods (randomized SVD, etc.).
- It checks for three things:
- - that there are no significant imaginary parts in eigenvalues (more than
- 1e-5 times the maximum real part). If this check fails, it raises a
- ``ValueError``. Otherwise all non-significant imaginary parts that may
- remain are set to zero. This operation is traced with a
- ``PositiveSpectrumWarning`` when ``enable_warnings=True``.
- - that eigenvalues are not all negative. If this check fails, it raises a
- ``ValueError``.
- - that there are no significant negative eigenvalues with absolute value
- more than 1e-10 (1e-6) and more than 1e-5 (5e-3) times the largest
- positive eigenvalue in double (single) precision. If this check fails,
- it raises a ``ValueError``. Otherwise all negative eigenvalues that may
- remain are set to zero. This operation is traced with a
- ``PositiveSpectrumWarning`` when ``enable_warnings=True``.
- Finally, all the positive eigenvalues that are too small (with a value
- smaller than the maximum eigenvalue multiplied by 1e-12 (2e-7)) are set to
- zero. This operation is traced with a ``PositiveSpectrumWarning`` when
- ``enable_warnings=True``.
- Parameters
- ----------
- lambdas : array-like of shape (n_eigenvalues,)
- Array of eigenvalues to check / fix.
- enable_warnings : bool, default=False
- When this is set to ``True``, a ``PositiveSpectrumWarning`` will be
- raised when there are imaginary parts, negative eigenvalues, or
- extremely small non-zero eigenvalues. Otherwise no warning will be
- raised. In both cases, imaginary parts, negative eigenvalues, and
- extremely small non-zero eigenvalues will be set to zero.
- Returns
- -------
- lambdas_fixed : ndarray of shape (n_eigenvalues,)
- A fixed validated copy of the array of eigenvalues.
- Examples
- --------
- >>> from sklearn.utils.validation import _check_psd_eigenvalues
- >>> _check_psd_eigenvalues([1, 2]) # nominal case
- array([1, 2])
- >>> _check_psd_eigenvalues([5, 5j]) # significant imag part
- Traceback (most recent call last):
- ...
- ValueError: There are significant imaginary parts in eigenvalues (1
- of the maximum real part). Either the matrix is not PSD, or there was
- an issue while computing the eigendecomposition of the matrix.
- >>> _check_psd_eigenvalues([5, 5e-5j]) # insignificant imag part
- array([5., 0.])
- >>> _check_psd_eigenvalues([-5, -1]) # all negative
- Traceback (most recent call last):
- ...
- ValueError: All eigenvalues are negative (maximum is -1). Either the
- matrix is not PSD, or there was an issue while computing the
- eigendecomposition of the matrix.
- >>> _check_psd_eigenvalues([5, -1]) # significant negative
- Traceback (most recent call last):
- ...
- ValueError: There are significant negative eigenvalues (0.2 of the
- maximum positive). Either the matrix is not PSD, or there was an issue
- while computing the eigendecomposition of the matrix.
- >>> _check_psd_eigenvalues([5, -5e-5]) # insignificant negative
- array([5., 0.])
- >>> _check_psd_eigenvalues([5, 4e-12]) # bad conditioning (too small)
- array([5., 0.])
- """
- lambdas = np.array(lambdas)
- is_double_precision = lambdas.dtype == np.float64
- # note: the minimum value available is
- # - single-precision: np.finfo('float32').eps = 1.2e-07
- # - double-precision: np.finfo('float64').eps = 2.2e-16
- # the various thresholds used for validation
- # we may wish to change the value according to precision.
- significant_imag_ratio = 1e-5
- significant_neg_ratio = 1e-5 if is_double_precision else 5e-3
- significant_neg_value = 1e-10 if is_double_precision else 1e-6
- small_pos_ratio = 1e-12 if is_double_precision else 2e-7
- # Check that there are no significant imaginary parts
- if not np.isreal(lambdas).all():
- max_imag_abs = np.abs(np.imag(lambdas)).max()
- max_real_abs = np.abs(np.real(lambdas)).max()
- if max_imag_abs > significant_imag_ratio * max_real_abs:
- raise ValueError(
- "There are significant imaginary parts in eigenvalues (%g "
- "of the maximum real part). Either the matrix is not PSD, or "
- "there was an issue while computing the eigendecomposition "
- "of the matrix." % (max_imag_abs / max_real_abs)
- )
- # warn about imaginary parts being removed
- if enable_warnings:
- warnings.warn(
- "There are imaginary parts in eigenvalues (%g "
- "of the maximum real part). Either the matrix is not"
- " PSD, or there was an issue while computing the "
- "eigendecomposition of the matrix. Only the real "
- "parts will be kept." % (max_imag_abs / max_real_abs),
- PositiveSpectrumWarning,
- )
- # Remove all imaginary parts (even if zero)
- lambdas = np.real(lambdas)
- # Check that there are no significant negative eigenvalues
- max_eig = lambdas.max()
- if max_eig < 0:
- raise ValueError(
- "All eigenvalues are negative (maximum is %g). "
- "Either the matrix is not PSD, or there was an "
- "issue while computing the eigendecomposition of "
- "the matrix." % max_eig
- )
- else:
- min_eig = lambdas.min()
- if (
- min_eig < -significant_neg_ratio * max_eig
- and min_eig < -significant_neg_value
- ):
- raise ValueError(
- "There are significant negative eigenvalues (%g"
- " of the maximum positive). Either the matrix is "
- "not PSD, or there was an issue while computing "
- "the eigendecomposition of the matrix." % (-min_eig / max_eig)
- )
- elif min_eig < 0:
- # Remove all negative values and warn about it
- if enable_warnings:
- warnings.warn(
- "There are negative eigenvalues (%g of the "
- "maximum positive). Either the matrix is not "
- "PSD, or there was an issue while computing the"
- " eigendecomposition of the matrix. Negative "
- "eigenvalues will be replaced with 0." % (-min_eig / max_eig),
- PositiveSpectrumWarning,
- )
- lambdas[lambdas < 0] = 0
- # Check for conditioning (small positive non-zeros)
- too_small_lambdas = (0 < lambdas) & (lambdas < small_pos_ratio * max_eig)
- if too_small_lambdas.any():
- if enable_warnings:
- warnings.warn(
- "Badly conditioned PSD matrix spectrum: the largest "
- "eigenvalue is more than %g times the smallest. "
- "Small eigenvalues will be replaced with 0."
- "" % (1 / small_pos_ratio),
- PositiveSpectrumWarning,
- )
- lambdas[too_small_lambdas] = 0
- return lambdas
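- # Illustrative sketch: with `enable_warnings=True`, the insignificant negative
- # eigenvalue below is zeroed and a `PositiveSpectrumWarning` is emitted (the
- # docstring examples above cover the default, silent behaviour):
- #   >>> import warnings
- #   >>> with warnings.catch_warnings(record=True) as caught:
- #   ...     warnings.simplefilter("always")
- #   ...     _check_psd_eigenvalues([5, -5e-5], enable_warnings=True)
- #   array([5., 0.])
- #   >>> caught[0].category.__name__
- #   'PositiveSpectrumWarning'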
- def _check_sample_weight(
- sample_weight, X, dtype=None, copy=False, only_non_negative=False
- ):
- """Validate sample weights.
- Note that passing sample_weight=None will output an array of ones.
- Therefore, in some cases, you may want to protect the call with:
- if sample_weight is not None:
- sample_weight = _check_sample_weight(...)
- Parameters
- ----------
- sample_weight : {ndarray, Number or None}, shape (n_samples,)
- Input sample weights.
- X : {ndarray, list, sparse matrix}
- Input data.
- only_non_negative : bool, default=False
- Whether or not the weights are expected to be non-negative.
- .. versionadded:: 1.0
- dtype : dtype, default=None
- dtype of the validated `sample_weight`.
- If None, and the input `sample_weight` is an array, the dtype of the
- input is preserved; otherwise an array with the default numpy dtype
- is allocated. If `dtype` is not one of `float32`, `float64`,
- `None`, the output will be of dtype `float64`.
- copy : bool, default=False
- If True, a copy of sample_weight will be created.
- Returns
- -------
- sample_weight : ndarray of shape (n_samples,)
- Validated sample weight. It is guaranteed to be "C" contiguous.
- """
- n_samples = _num_samples(X)
- if dtype is not None and dtype not in [np.float32, np.float64]:
- dtype = np.float64
- if sample_weight is None:
- sample_weight = np.ones(n_samples, dtype=dtype)
- elif isinstance(sample_weight, numbers.Number):
- sample_weight = np.full(n_samples, sample_weight, dtype=dtype)
- else:
- if dtype is None:
- dtype = [np.float64, np.float32]
- sample_weight = check_array(
- sample_weight,
- accept_sparse=False,
- ensure_2d=False,
- dtype=dtype,
- order="C",
- copy=copy,
- input_name="sample_weight",
- )
- if sample_weight.ndim != 1:
- raise ValueError("Sample weights must be 1D array or scalar")
- if sample_weight.shape != (n_samples,):
- raise ValueError(
- "sample_weight.shape == {}, expected {}!".format(
- sample_weight.shape, (n_samples,)
- )
- )
- if only_non_negative:
- check_non_negative(sample_weight, "`sample_weight`")
- return sample_weight
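- # Illustrative sketch (toy values): `None` yields uniform weights, a scalar is
- # broadcast to `n_samples`, and an array is validated against `X`:
- #   >>> X = np.ones((3, 2))
- #   >>> _check_sample_weight(None, X)
- #   array([1., 1., 1.])
- #   >>> _check_sample_weight(2.0, X)
- #   array([2., 2., 2.])
- #   >>> _check_sample_weight(np.array([0.5, 0.5]), X)
- #   Traceback (most recent call last):
- #   ...
- #   ValueError: sample_weight.shape == (2,), expected (3,)!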
- def _allclose_dense_sparse(x, y, rtol=1e-7, atol=1e-9):
- """Check allclose for sparse and dense data.
- Both x and y need to be either sparse or dense, they
- can't be mixed.
- Parameters
- ----------
- x : {array-like, sparse matrix}
- First array to compare.
- y : {array-like, sparse matrix}
- Second array to compare.
- rtol : float, default=1e-7
- Relative tolerance; see numpy.allclose.
- atol : float, default=1e-9
- Absolute tolerance; see numpy.allclose. Note that the default here is
- more tolerant than the default for numpy.testing.assert_allclose, where
- atol=0.
- """
- if sp.issparse(x) and sp.issparse(y):
- x = x.tocsr()
- y = y.tocsr()
- x.sum_duplicates()
- y.sum_duplicates()
- return (
- np.array_equal(x.indices, y.indices)
- and np.array_equal(x.indptr, y.indptr)
- and np.allclose(x.data, y.data, rtol=rtol, atol=atol)
- )
- elif not sp.issparse(x) and not sp.issparse(y):
- return np.allclose(x, y, rtol=rtol, atol=atol)
- raise ValueError(
- "Can only compare two sparse matrices, not a sparse matrix and an array"
- )
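- # Illustrative sketch: both inputs must be sparse or both dense; mixing them
- # raises a ValueError (toy matrices for illustration):
- #   >>> a = sp.csr_matrix(np.eye(2))
- #   >>> _allclose_dense_sparse(a, a.copy())
- #   True
- #   >>> _allclose_dense_sparse(np.eye(2), np.eye(2) + 1e-12)
- #   True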
- def _check_response_method(estimator, response_method):
- """Check if `response_method` is available in estimator and return it.
- .. versionadded:: 1.3
- Parameters
- ----------
- estimator : estimator instance
- Classifier or regressor to check.
- response_method : {"predict_proba", "decision_function", "predict"} or \
- list of such str
- Specifies the response method to use to get predictions from an estimator
- (i.e. :term:`predict_proba`, :term:`decision_function` or
- :term:`predict`). Possible choices are:
- - if `str`, it corresponds to the name of the method to return;
- - if a list of `str`, it provides the method names in order of
- preference. The method returned is the first one in the list that is
- implemented by `estimator`.
- Returns
- -------
- prediction_method : callable
- Prediction method of estimator.
- Raises
- ------
- AttributeError
- If `response_method` is not available in `estimator`.
- """
- if isinstance(response_method, str):
- list_methods = [response_method]
- else:
- list_methods = response_method
- prediction_method = [getattr(estimator, method, None) for method in list_methods]
- prediction_method = reduce(lambda x, y: x or y, prediction_method)
- if prediction_method is None:
- raise AttributeError(
- f"{estimator.__class__.__name__} has none of the following attributes: "
- f"{', '.join(list_methods)}."
- )
- return prediction_method
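- # Illustrative sketch: a list encodes a preference order and the first method
- # implemented by the estimator wins (toy estimator and data for illustration):
- #   >>> from sklearn.linear_model import LogisticRegression
- #   >>> est = LogisticRegression().fit([[0.0], [1.0]], [0, 1])
- #   >>> _check_response_method(est, ["decision_function", "predict"]).__name__
- #   'decision_function'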
- def _check_fit_params(X, fit_params, indices=None):
- """Check and validate the parameters passed during `fit`.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- Data array.
- fit_params : dict
- Dictionary containing the parameters passed at fit.
- indices : array-like of shape (n_samples,), default=None
- Indices to be selected if the parameter has the same size as `X`.
- Returns
- -------
- fit_params_validated : dict
- Validated parameters. We ensure that the values support indexing.
- """
- from . import _safe_indexing
- fit_params_validated = {}
- for param_key, param_value in fit_params.items():
- if not _is_arraylike(param_value) or _num_samples(param_value) != _num_samples(
- X
- ):
- # Non-indexable pass-through (for now for backward-compatibility).
- # https://github.com/scikit-learn/scikit-learn/issues/15805
- fit_params_validated[param_key] = param_value
- else:
- # Any other fit_params should support indexing
- # (e.g. for cross-validation).
- fit_params_validated[param_key] = _make_indexable(param_value)
- fit_params_validated[param_key] = _safe_indexing(
- fit_params_validated[param_key], indices
- )
- return fit_params_validated
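- # Illustrative sketch (made-up parameters): array-valued fit params aligned
- # with `X` are made indexable and sliced with `indices`; anything else passes
- # through unchanged:
- #   >>> X = np.zeros((4, 2))
- #   >>> fit_params = {"sample_weight": np.arange(4), "verbose": True}
- #   >>> _check_fit_params(X, fit_params, indices=[0, 2])
- #   {'sample_weight': array([0, 2]), 'verbose': True}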
- def _get_feature_names(X):
- """Get feature names from X.
- Support for other array containers should be implemented here.
- Parameters
- ----------
- X : {ndarray, dataframe} of shape (n_samples, n_features)
- Array container to extract feature names.
- - pandas dataframe : The columns will be considered to be feature
- names. If the dataframe contains non-string feature names, `None` is
- returned.
- - All other array containers will return `None`.
- Returns
- -------
- names : ndarray or None
- Feature names of `X`. Unrecognized array containers will return `None`.
- """
- feature_names = None
- # extract feature names from supported array containers
- if hasattr(X, "columns"):
- feature_names = np.asarray(X.columns, dtype=object)
- if feature_names is None or len(feature_names) == 0:
- return
- types = sorted(t.__qualname__ for t in set(type(v) for v in feature_names))
- # mixed type of string and non-string is not supported
- if len(types) > 1 and "str" in types:
- raise TypeError(
- "Feature names are only supported if all input features have string names, "
- f"but your input has {types} as feature name / column name types. "
- "If you want feature names to be stored and validated, you must convert "
- "them all to strings, by using X.columns = X.columns.astype(str) for "
- "example. Otherwise you can remove feature / column names from your input "
- "data, or convert them all to a non-string data type."
- )
- # Only feature names of all strings are supported
- if len(types) == 1 and types[0] == "str":
- return feature_names
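- # Illustrative sketch: only all-string pandas column names are returned and
- # plain arrays yield None (pandas assumed available):
- #   >>> import pandas as pd
- #   >>> _get_feature_names(pd.DataFrame({"a": [1], "b": [2]}))
- #   array(['a', 'b'], dtype=object)
- #   >>> _get_feature_names(np.ones((1, 2))) is None
- #   True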
- def _check_feature_names_in(estimator, input_features=None, *, generate_names=True):
- """Check `input_features` and generate names if needed.
- Commonly used in :term:`get_feature_names_out`.
- Parameters
- ----------
- estimator : estimator instance
- Estimator from which `feature_names_in_` and `n_features_in_` are
- retrieved when available.
- input_features : array-like of str or None, default=None
- Input features.
- - If `input_features` is `None`, then `feature_names_in_` is
- used as feature names in. If `feature_names_in_` is not defined,
- then the following input feature names are generated:
- `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
- - If `input_features` is an array-like, then `input_features` must
- match `feature_names_in_` if `feature_names_in_` is defined.
- generate_names : bool, default=True
- Whether to generate names when `input_features` is `None` and
- `estimator.feature_names_in_` is not defined. This is useful for transformers
- that validate `input_features` but do not require them in
- :term:`get_feature_names_out`, e.g. `PCA`.
- Returns
- -------
- feature_names_in : ndarray of str or `None`
- Feature names in.
- """
- feature_names_in_ = getattr(estimator, "feature_names_in_", None)
- n_features_in_ = getattr(estimator, "n_features_in_", None)
- if input_features is not None:
- input_features = np.asarray(input_features, dtype=object)
- if feature_names_in_ is not None and not np.array_equal(
- feature_names_in_, input_features
- ):
- raise ValueError("input_features is not equal to feature_names_in_")
- if n_features_in_ is not None and len(input_features) != n_features_in_:
- raise ValueError(
- "input_features should have length equal to number of "
- f"features ({n_features_in_}), got {len(input_features)}"
- )
- return input_features
- if feature_names_in_ is not None:
- return feature_names_in_
- if not generate_names:
- return
- # Generates feature names if `n_features_in_` is defined
- if n_features_in_ is None:
- raise ValueError("Unable to generate feature names without n_features_in_")
- return np.asarray([f"x{i}" for i in range(n_features_in_)], dtype=object)
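- # Illustrative sketch: when the estimator was fitted without feature names,
- # placeholder names "x0", "x1", ... are generated (toy fit for illustration):
- #   >>> from sklearn.preprocessing import StandardScaler
- #   >>> scaler = StandardScaler().fit(np.ones((3, 2)))
- #   >>> _check_feature_names_in(scaler)
- #   array(['x0', 'x1'], dtype=object)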
- def _generate_get_feature_names_out(estimator, n_features_out, input_features=None):
- """Generate feature names out for estimator using the estimator name as the prefix.
- The `input_features` names are validated but not used. This function is useful
- for estimators that generate their own names based on `n_features_out`, e.g. `PCA`.
- Parameters
- ----------
- estimator : estimator instance
- Estimator producing output feature names.
- n_features_out : int
- Number of feature names out.
- input_features : array-like of str or None, default=None
- Only used to validate feature names with `estimator.feature_names_in_`.
- Returns
- -------
- feature_names_out : ndarray of str
- Feature names out, of length `n_features_out`.
- """
- _check_feature_names_in(estimator, input_features, generate_names=False)
- estimator_name = estimator.__class__.__name__.lower()
- return np.asarray(
- [f"{estimator_name}{i}" for i in range(n_features_out)], dtype=object
- )
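- # Illustrative sketch: output names use the lowercased class name as a prefix
- # (toy PCA fit for illustration):
- #   >>> from sklearn.decomposition import PCA
- #   >>> pca = PCA(n_components=2).fit(np.random.RandomState(0).rand(5, 3))
- #   >>> _generate_get_feature_names_out(pca, 2)
- #   array(['pca0', 'pca1'], dtype=object)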
- def _check_monotonic_cst(estimator, monotonic_cst=None):
- """Check the monotonic constraints and return the corresponding array.
- This helper function should be used in the `fit` method of an estimator
- that supports monotonic constraints and called after the estimator has
- introspected input data to set the `n_features_in_` and optionally the
- `feature_names_in_` attributes.
- .. versionadded:: 1.2
- Parameters
- ----------
- estimator : estimator instance
- The estimator to check; `n_features_in_` (and `feature_names_in_` when
- `monotonic_cst` is a dict) must already be set.
- monotonic_cst : array-like of int, dict of str or None, default=None
- Monotonic constraints for the features.
- - If array-like, then it should contain only -1, 0 or 1: 1 means the
- corresponding feature is constrained to be monotonically increasing,
- -1 monotonically decreasing, and 0 unconstrained.
- - If dict, then the keys should be the feature names occurring in
- `estimator.feature_names_in_` and the values should be -1, 0 or 1.
- - If None, then an array of 0s will be allocated.
- Returns
- -------
- monotonic_cst : ndarray of int
- Monotonic constraints for each feature.
- """
- original_monotonic_cst = monotonic_cst
- if monotonic_cst is None or isinstance(monotonic_cst, dict):
- monotonic_cst = np.full(
- shape=estimator.n_features_in_,
- fill_value=0,
- dtype=np.int8,
- )
- if isinstance(original_monotonic_cst, dict):
- if not hasattr(estimator, "feature_names_in_"):
- raise ValueError(
- f"{estimator.__class__.__name__} was not fitted on data "
- "with feature names. Pass monotonic_cst as an integer "
- "array instead."
- )
- unexpected_feature_names = list(
- set(original_monotonic_cst) - set(estimator.feature_names_in_)
- )
- unexpected_feature_names.sort() # deterministic error message
- n_unexpected = len(unexpected_feature_names)
- if unexpected_feature_names:
- if len(unexpected_feature_names) > 5:
- unexpected_feature_names = unexpected_feature_names[:5]
- unexpected_feature_names.append("...")
- raise ValueError(
- f"monotonic_cst contains {n_unexpected} unexpected feature "
- f"names: {unexpected_feature_names}."
- )
- for feature_idx, feature_name in enumerate(estimator.feature_names_in_):
- if feature_name in original_monotonic_cst:
- cst = original_monotonic_cst[feature_name]
- if cst not in [-1, 0, 1]:
- raise ValueError(
- f"monotonic_cst['{feature_name}'] must be either "
- f"-1, 0 or 1. Got {cst!r}."
- )
- monotonic_cst[feature_idx] = cst
- else:
- unexpected_cst = np.setdiff1d(monotonic_cst, [-1, 0, 1])
- if unexpected_cst.shape[0]:
- raise ValueError(
- "monotonic_cst must be an array-like of -1, 0 or 1. Observed "
- f"values: {unexpected_cst.tolist()}."
- )
- monotonic_cst = np.asarray(monotonic_cst, dtype=np.int8)
- if monotonic_cst.shape[0] != estimator.n_features_in_:
- raise ValueError(
- f"monotonic_cst has shape {monotonic_cst.shape} but the input data "
- f"X has {estimator.n_features_in_} features."
- )
- return monotonic_cst
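- # Illustrative sketch (stand-in estimator with hand-set attributes): a dict
- # keyed on `feature_names_in_` is expanded into a full int8 array:
- #   >>> est = type("Toy", (), {})()
- #   >>> est.n_features_in_ = 2
- #   >>> est.feature_names_in_ = np.asarray(["a", "b"], dtype=object)
- #   >>> _check_monotonic_cst(est, {"a": 1})
- #   array([1, 0], dtype=int8)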
- def _check_pos_label_consistency(pos_label, y_true):
- """Check whether `pos_label` needs to be specified or not.
- In binary classification, we fix `pos_label=1` if the labels are in the set
- {-1, 1} or {0, 1}. Otherwise, we raise an error asking to specify the
- `pos_label` parameter.
- Parameters
- ----------
- pos_label : int, float, bool, str or None
- The positive label.
- y_true : ndarray of shape (n_samples,)
- The target vector.
- Returns
- -------
- pos_label : int, float, bool or str
- If `pos_label` can be inferred, it will be returned.
- Raises
- ------
- ValueError
- If `y_true` does not have labels in {-1, 1} or {0, 1} and `pos_label`
- is not specified.
- """
- # ensure binary classification if pos_label is not specified
- # classes.dtype.kind in ('O', 'U', 'S') is required to avoid
- # triggering a FutureWarning by calling np.array_equal(a, b)
- # when elements in the two arrays are not comparable.
- classes = np.unique(y_true)
- if pos_label is None and (
- classes.dtype.kind in "OUS"
- or not (
- np.array_equal(classes, [0, 1])
- or np.array_equal(classes, [-1, 1])
- or np.array_equal(classes, [0])
- or np.array_equal(classes, [-1])
- or np.array_equal(classes, [1])
- )
- ):
- classes_repr = ", ".join([repr(c) for c in classes.tolist()])
- raise ValueError(
- f"y_true takes value in {{{classes_repr}}} and pos_label is not "
- "specified: either make y_true take value in {0, 1} or "
- "{-1, 1} or pass pos_label explicitly."
- )
- elif pos_label is None:
- pos_label = 1
- return pos_label
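- # Illustrative sketch: `pos_label` defaults to 1 for {0, 1} or {-1, 1} targets
- # and is returned as-is otherwise (toy arrays for illustration):
- #   >>> _check_pos_label_consistency(None, np.array([0, 1, 1]))
- #   1
- #   >>> _check_pos_label_consistency("yes", np.array(["no", "yes"]))
- #   'yes'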