- """
- The :mod:`sklearn.utils` module includes various utilities.
- """
- import math
- import numbers
- import platform
- import struct
- import timeit
- import warnings
- from collections.abc import Sequence
- from contextlib import contextmanager, suppress
- from itertools import compress, islice
- import numpy as np
- from scipy.sparse import issparse
- from .. import get_config
- from ..exceptions import DataConversionWarning
- from . import _joblib, metadata_routing
- from ._bunch import Bunch
- from ._estimator_html_repr import estimator_html_repr
- from ._param_validation import Interval, validate_params
- from .class_weight import compute_class_weight, compute_sample_weight
- from .deprecation import deprecated
- from .discovery import all_estimators
- from .fixes import parse_version, threadpool_info
- from .murmurhash import murmurhash3_32
- from .validation import (
- _is_arraylike_not_scalar,
- as_float_array,
- assert_all_finite,
- check_array,
- check_consistent_length,
- check_random_state,
- check_scalar,
- check_symmetric,
- check_X_y,
- column_or_1d,
- indexable,
- )
- # Do not deprecate parallel_backend and register_parallel_backend as they are
- # needed to tune `scikit-learn` behavior and have a different effect if called
- # from the vendored version or the site-package version. The others are
- # utilities that are independent of scikit-learn so they are not part of
- # scikit-learn public API.
- parallel_backend = _joblib.parallel_backend
- register_parallel_backend = _joblib.register_parallel_backend
- __all__ = [
- "murmurhash3_32",
- "as_float_array",
- "assert_all_finite",
- "check_array",
- "check_random_state",
- "compute_class_weight",
- "compute_sample_weight",
- "column_or_1d",
- "check_consistent_length",
- "check_X_y",
- "check_scalar",
- "indexable",
- "check_symmetric",
- "indices_to_mask",
- "deprecated",
- "parallel_backend",
- "register_parallel_backend",
- "resample",
- "shuffle",
- "check_matplotlib_support",
- "all_estimators",
- "DataConversionWarning",
- "estimator_html_repr",
- "Bunch",
- "metadata_routing",
- ]
- IS_PYPY = platform.python_implementation() == "PyPy"
- _IS_32BIT = 8 * struct.calcsize("P") == 32
- def _in_unstable_openblas_configuration():
- """Return True if in an unstable configuration for OpenBLAS"""
- # Import libraries which might load OpenBLAS.
- import numpy # noqa
- import scipy # noqa
- modules_info = threadpool_info()
- open_blas_used = any(info["internal_api"] == "openblas" for info in modules_info)
- if not open_blas_used:
- return False
- # OpenBLAS 0.3.16 fixed instability on arm64, see:
- # https://github.com/xianyi/OpenBLAS/blob/1b6db3dbba672b4f8af935bd43a1ff6cff4d20b7/Changelog.txt#L56-L58 # noqa
- openblas_arm64_stable_version = parse_version("0.3.16")
- for info in modules_info:
- if info["internal_api"] != "openblas":
- continue
- openblas_version = info.get("version")
- openblas_architecture = info.get("architecture")
- if openblas_version is None or openblas_architecture is None:
- # Cannot be sure that OpenBLAS is good enough. Assume unstable:
- return True
- if (
- openblas_architecture == "neoversen1"
- and parse_version(openblas_version) < openblas_arm64_stable_version
- ):
- # See discussions in https://github.com/numpy/numpy/issues/19411
- return True
- return False
- def safe_mask(X, mask):
- """Return a mask which is safe to use on X.
- Parameters
- ----------
- X : {array-like, sparse matrix}
- Data on which to apply mask.
- mask : ndarray
- Mask to be used on X.
- Returns
- -------
- mask : ndarray
- Array that is safe to use on X.
- """
- mask = np.asarray(mask)
- if np.issubdtype(mask.dtype, np.signedinteger):
- return mask
- if hasattr(X, "toarray"):
- ind = np.arange(mask.shape[0])
- mask = ind[mask]
- return mask
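- # A minimal usage sketch (illustrative, not part of the original module):
- # a boolean mask is converted to integer indices only when `X` is sparse,
- # so the result can be used to slice CSR/CSC matrices safely.
- #
- #   >>> import numpy as np
- #   >>> from scipy.sparse import csr_matrix
- #   >>> X = csr_matrix([[1, 2], [3, 4], [5, 6]])
- #   >>> mask = safe_mask(X, np.array([False, True, True]))
- #   >>> mask
- #   array([1, 2])
- #   >>> X[mask].toarray()
- #   array([[3, 4],
- #          [5, 6]])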
- def axis0_safe_slice(X, mask, len_mask):
- """Return a mask which is safer to use on X than safe_mask.
- This mask is safer than safe_mask since it returns an
- empty array, when a sparse matrix is sliced with a boolean mask
- with all False, instead of raising an unhelpful error in older
- versions of SciPy.
- See: https://github.com/scipy/scipy/issues/5361
- Also note that we can avoid doing the dot product by checking if
- the len_mask is not zero in _huber_loss_and_gradient but this
- is not going to be the bottleneck, since the number of outliers
- and non_outliers are typically non-zero and it makes the code
- tougher to follow.
- Parameters
- ----------
- X : {array-like, sparse matrix}
- Data on which to apply mask.
- mask : ndarray
- Mask to be used on X.
- len_mask : int
- The length of the mask.
- Returns
- -------
- subset : {array-like, sparse matrix}
- Rows of X selected by mask; an empty 2D array when len_mask is zero.
- """
- if len_mask != 0:
- return X[safe_mask(X, mask), :]
- return np.zeros(shape=(0, X.shape[1]))
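- # A small sketch of the all-False case this helper guards against
- # (illustrative only): slicing a sparse matrix with an empty mask
- # yields an empty 2D array instead of an error.
- #
- #   >>> import numpy as np
- #   >>> from scipy.sparse import csr_matrix
- #   >>> X = csr_matrix([[1.0, 2.0], [3.0, 4.0]])
- #   >>> axis0_safe_slice(X, np.array([False, False]), 0)
- #   array([], shape=(0, 2), dtype=float64)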
- def _array_indexing(array, key, key_dtype, axis):
- """Index an array or scipy.sparse consistently across NumPy version."""
- if issparse(array) and key_dtype == "bool":
- key = np.asarray(key)
- if isinstance(key, tuple):
- key = list(key)
- return array[key] if axis == 0 else array[:, key]
- def _pandas_indexing(X, key, key_dtype, axis):
- """Index a pandas dataframe or a series."""
- if _is_arraylike_not_scalar(key):
- key = np.asarray(key)
- if key_dtype == "int" and not (isinstance(key, slice) or np.isscalar(key)):
- # using take() instead of iloc[] ensures the return value is a "proper"
- # copy that will not raise SettingWithCopyWarning
- return X.take(key, axis=axis)
- else:
- # check whether we should index with loc or iloc
- indexer = X.iloc if key_dtype == "int" else X.loc
- return indexer[:, key] if axis else indexer[key]
- def _list_indexing(X, key, key_dtype):
- """Index a Python list."""
- if np.isscalar(key) or isinstance(key, slice):
- # key is a slice or a scalar
- return X[key]
- if key_dtype == "bool":
- # key is a boolean array-like
- return list(compress(X, key))
- # key is an integer array-like
- return [X[idx] for idx in key]
- def _determine_key_type(key, accept_slice=True):
- """Determine the data type of key.
- Parameters
- ----------
- key : scalar, slice or array-like
- The key from which we want to infer the data type.
- accept_slice : bool, default=True
- Whether a slice is accepted as a valid key. If False, a TypeError
- is raised when the key is a slice.
- Returns
- -------
- dtype : {'int', 'str', 'bool', None}
- Returns the data type of key.
- """
- err_msg = (
- "No valid specification of the columns. Only a scalar, list or "
- "slice of all integers or all strings, or boolean mask is "
- "allowed"
- )
- dtype_to_str = {int: "int", str: "str", bool: "bool", np.bool_: "bool"}
- array_dtype_to_str = {
- "i": "int",
- "u": "int",
- "b": "bool",
- "O": "str",
- "U": "str",
- "S": "str",
- }
- if key is None:
- return None
- if isinstance(key, tuple(dtype_to_str.keys())):
- try:
- return dtype_to_str[type(key)]
- except KeyError:
- raise ValueError(err_msg)
- if isinstance(key, slice):
- if not accept_slice:
- raise TypeError(
- "Only array-like or scalar are supported. A Python slice was given."
- )
- if key.start is None and key.stop is None:
- return None
- key_start_type = _determine_key_type(key.start)
- key_stop_type = _determine_key_type(key.stop)
- if key_start_type is not None and key_stop_type is not None:
- if key_start_type != key_stop_type:
- raise ValueError(err_msg)
- if key_start_type is not None:
- return key_start_type
- return key_stop_type
- if isinstance(key, (list, tuple)):
- unique_key = set(key)
- key_type = {_determine_key_type(elt) for elt in unique_key}
- if not key_type:
- return None
- if len(key_type) != 1:
- raise ValueError(err_msg)
- return key_type.pop()
- if hasattr(key, "dtype"):
- try:
- return array_dtype_to_str[key.dtype.kind]
- except KeyError:
- raise ValueError(err_msg)
- raise ValueError(err_msg)
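- # Illustrative examples of the inferred key types (not part of the module):
- #
- #   >>> _determine_key_type(0)
- #   'int'
- #   >>> _determine_key_type('a')
- #   'str'
- #   >>> _determine_key_type([True, False])
- #   'bool'
- #   >>> _determine_key_type(slice(0, 2))
- #   'int'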
- def _safe_indexing(X, indices, *, axis=0):
- """Return rows, items or columns of X using indices.
- .. warning::
- This utility is documented, but **private**. This means that
- backward compatibility might be broken without any deprecation
- cycle.
- Parameters
- ----------
- X : array-like, sparse-matrix, list, pandas.DataFrame, pandas.Series
- Data from which to sample rows, items or columns. `list` is only
- supported when `axis=0`.
- indices : bool, int, str, slice, array-like
- - If `axis=0`, boolean and integer array-like, integer slice,
- and scalar integer are supported.
- - If `axis=1`:
- - to select a single column, `indices` can be of `int` type for
- all `X` types and `str` only for dataframe. The selected subset
- will be 1D, unless `X` is a sparse matrix in which case it will
- be 2D.
- - to select multiple columns, `indices` can be one of the
- following: `list`, `array`, `slice`. The type used in
- these containers can be one of the following: `int`, `bool` and
- `str`. However, `str` is only supported when `X` is a dataframe.
- The selected subset will be 2D.
- axis : int, default=0
- The axis along which `X` will be subsampled. `axis=0` will select
- rows while `axis=1` will select columns.
- Returns
- -------
- subset
- Subset of X on axis 0 or 1.
- Notes
- -----
- CSR, CSC, and LIL sparse matrices are supported. COO sparse matrices are
- not supported.
- """
- if indices is None:
- return X
- if axis not in (0, 1):
- raise ValueError(
- "'axis' should be either 0 (to index rows) or 1 (to index "
- " column). Got {} instead.".format(axis)
- )
- indices_dtype = _determine_key_type(indices)
- if axis == 0 and indices_dtype == "str":
- raise ValueError("String indexing is not supported with 'axis=0'")
- if axis == 1 and X.ndim != 2:
- raise ValueError(
- "'X' should be a 2D NumPy array, 2D sparse matrix or pandas "
- "dataframe when indexing the columns (i.e. 'axis=1'). "
- "Got {} instead with {} dimension(s).".format(type(X), X.ndim)
- )
- if axis == 1 and indices_dtype == "str" and not hasattr(X, "loc"):
- raise ValueError(
- "Specifying the columns using strings is only supported for "
- "pandas DataFrames"
- )
- if hasattr(X, "iloc"):
- return _pandas_indexing(X, indices, indices_dtype, axis=axis)
- elif hasattr(X, "shape"):
- return _array_indexing(X, indices, indices_dtype, axis=axis)
- else:
- return _list_indexing(X, indices, indices_dtype)
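- # A usage sketch (illustrative): the same call works for NumPy arrays,
- # sparse matrices, lists and dataframes.
- #
- #   >>> import numpy as np
- #   >>> X = np.arange(10).reshape(5, 2)
- #   >>> _safe_indexing(X, [1, 3], axis=0)
- #   array([[2, 3],
- #          [6, 7]])
- #   >>> _safe_indexing(X, 1, axis=1)
- #   array([1, 3, 5, 7, 9])
- #   >>> _safe_indexing(["a", "b", "c"], [0, 2], axis=0)
- #   ['a', 'c']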
- def _safe_assign(X, values, *, row_indexer=None, column_indexer=None):
- """Safe assignment to a numpy array, sparse matrix, or pandas dataframe.
- Parameters
- ----------
- X : {ndarray, sparse-matrix, dataframe}
- Array to be modified. It is expected to be 2-dimensional.
- values : ndarray
- The values to be assigned to `X`.
- row_indexer : array-like, dtype={int, bool}, default=None
- A 1-dimensional array to select the rows of interest. If `None`, all
- rows are selected.
- column_indexer : array-like, dtype={int, bool}, default=None
- A 1-dimensional array to select the columns of interest. If `None`, all
- columns are selected.
- """
- row_indexer = slice(None, None, None) if row_indexer is None else row_indexer
- column_indexer = (
- slice(None, None, None) if column_indexer is None else column_indexer
- )
- if hasattr(X, "iloc"): # pandas dataframe
- with warnings.catch_warnings():
- # pandas >= 1.5 raises a warning when using iloc to set values in a column
- # that does not have the same type as the column being set. It happens
- # for instance when setting a categorical column with a string.
- # In the future the behavior won't change and the warning should disappear.
- # TODO(1.3): check if the warning is still raised or remove the filter.
- warnings.simplefilter("ignore", FutureWarning)
- X.iloc[row_indexer, column_indexer] = values
- else: # numpy array or sparse matrix
- X[row_indexer, column_indexer] = values
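- # A minimal sketch of in-place assignment (illustrative only):
- #
- #   >>> import numpy as np
- #   >>> X = np.zeros((3, 2))
- #   >>> _safe_assign(X, np.array([[1.0], [2.0], [3.0]]), column_indexer=[1])
- #   >>> X
- #   array([[0., 1.],
- #          [0., 2.],
- #          [0., 3.]])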
- def _get_column_indices(X, key):
- """Get feature column indices for input data X and key.
- For accepted values of `key`, see the docstring of
- :func:`_safe_indexing`.
- """
- n_columns = X.shape[1]
- key_dtype = _determine_key_type(key)
- if isinstance(key, (list, tuple)) and not key:
- # we get an empty list
- return []
- elif key_dtype in ("bool", "int"):
- # Convert key into positive indexes
- try:
- idx = _safe_indexing(np.arange(n_columns), key)
- except IndexError as e:
- raise ValueError(
- "all features must be in [0, {}] or [-{}, 0]".format(
- n_columns - 1, n_columns
- )
- ) from e
- return np.atleast_1d(idx).tolist()
- elif key_dtype == "str":
- try:
- all_columns = X.columns
- except AttributeError:
- raise ValueError(
- "Specifying the columns using strings is only "
- "supported for pandas DataFrames"
- )
- if isinstance(key, str):
- columns = [key]
- elif isinstance(key, slice):
- start, stop = key.start, key.stop
- if start is not None:
- start = all_columns.get_loc(start)
- if stop is not None:
- # pandas indexing with strings is endpoint included
- stop = all_columns.get_loc(stop) + 1
- else:
- stop = n_columns + 1
- return list(islice(range(n_columns), start, stop))
- else:
- columns = list(key)
- try:
- column_indices = []
- for col in columns:
- col_idx = all_columns.get_loc(col)
- if not isinstance(col_idx, numbers.Integral):
- raise ValueError(
- f"Selected columns, {columns}, are not unique in dataframe"
- )
- column_indices.append(col_idx)
- except KeyError as e:
- raise ValueError("A given column is not a column of the dataframe") from e
- return column_indices
- else:
- raise ValueError(
- "No valid specification of the columns. Only a "
- "scalar, list or slice of all integers or all "
- "strings, or boolean mask is allowed"
- )
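- # Illustrative behaviour on a dataframe (assumes pandas is installed):
- #
- #   >>> import pandas as pd
- #   >>> df = pd.DataFrame({"a": [1], "b": [2], "c": [3]})
- #   >>> _get_column_indices(df, ["a", "c"])
- #   [0, 2]
- #   >>> _get_column_indices(df, slice("a", "b"))   # endpoint included
- #   [0, 1]
- #   >>> _get_column_indices(df, [True, False, True])
- #   [0, 2]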
- @validate_params(
- {
- "replace": ["boolean"],
- "n_samples": [Interval(numbers.Integral, 1, None, closed="left"), None],
- "random_state": ["random_state"],
- "stratify": ["array-like", None],
- },
- prefer_skip_nested_validation=True,
- )
- def resample(*arrays, replace=True, n_samples=None, random_state=None, stratify=None):
- """Resample arrays or sparse matrices in a consistent way.
- The default strategy implements one step of the bootstrapping
- procedure.
- Parameters
- ----------
- *arrays : sequence of array-like of shape (n_samples,) or \
- (n_samples, n_outputs)
- Indexable data-structures can be arrays, lists, dataframes or scipy
- sparse matrices with consistent first dimension.
- replace : bool, default=True
- Implements resampling with replacement. If False, this will implement
- (sliced) random permutations.
- n_samples : int, default=None
- Number of samples to generate. If left to None this is
- automatically set to the first dimension of the arrays.
- If replace is False it should not be larger than the length of
- arrays.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for shuffling
- the data.
- Pass an int for reproducible results across multiple function calls.
- See :term:`Glossary <random_state>`.
- stratify : array-like of shape (n_samples,) or (n_samples, n_outputs), \
- default=None
- If not None, data is split in a stratified fashion, using this as
- the class labels.
- Returns
- -------
- resampled_arrays : sequence of array-like of shape (n_samples,) or \
- (n_samples, n_outputs)
- Sequence of resampled copies of the collections. The original arrays
- are not impacted.
- See Also
- --------
- shuffle : Shuffle arrays or sparse matrices in a consistent way.
- Examples
- --------
- It is possible to mix sparse and dense arrays in the same run::
- >>> import numpy as np
- >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
- >>> y = np.array([0, 1, 2])
- >>> from scipy.sparse import coo_matrix
- >>> X_sparse = coo_matrix(X)
- >>> from sklearn.utils import resample
- >>> X, X_sparse, y = resample(X, X_sparse, y, random_state=0)
- >>> X
- array([[1., 0.],
- [2., 1.],
- [1., 0.]])
- >>> X_sparse
- <3x2 sparse matrix of type '<... 'numpy.float64'>'
- with 4 stored elements in Compressed Sparse Row format>
- >>> X_sparse.toarray()
- array([[1., 0.],
- [2., 1.],
- [1., 0.]])
- >>> y
- array([0, 1, 0])
- >>> resample(y, n_samples=2, random_state=0)
- array([0, 1])
- Example using stratification::
- >>> y = [0, 0, 1, 1, 1, 1, 1, 1, 1]
- >>> resample(y, n_samples=5, replace=False, stratify=y,
- ... random_state=0)
- [1, 1, 1, 0, 1]
- """
- max_n_samples = n_samples
- random_state = check_random_state(random_state)
- if len(arrays) == 0:
- return None
- first = arrays[0]
- n_samples = first.shape[0] if hasattr(first, "shape") else len(first)
- if max_n_samples is None:
- max_n_samples = n_samples
- elif (max_n_samples > n_samples) and (not replace):
- raise ValueError(
- "Cannot sample %d out of arrays with dim %d when replace is False"
- % (max_n_samples, n_samples)
- )
- check_consistent_length(*arrays)
- if stratify is None:
- if replace:
- indices = random_state.randint(0, n_samples, size=(max_n_samples,))
- else:
- indices = np.arange(n_samples)
- random_state.shuffle(indices)
- indices = indices[:max_n_samples]
- else:
- # Code adapted from StratifiedShuffleSplit()
- y = check_array(stratify, ensure_2d=False, dtype=None)
- if y.ndim == 2:
- # for multi-label y, map each distinct row to a string repr
- # using join because str(row) uses an ellipsis if len(row) > 1000
- y = np.array([" ".join(row.astype("str")) for row in y])
- classes, y_indices = np.unique(y, return_inverse=True)
- n_classes = classes.shape[0]
- class_counts = np.bincount(y_indices)
- # Find the sorted list of instances for each class:
- # (np.unique above performs a sort, so code is O(n logn) already)
- class_indices = np.split(
- np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1]
- )
- n_i = _approximate_mode(class_counts, max_n_samples, random_state)
- indices = []
- for i in range(n_classes):
- indices_i = random_state.choice(class_indices[i], n_i[i], replace=replace)
- indices.extend(indices_i)
- indices = random_state.permutation(indices)
- # convert sparse matrices to CSR for row-based indexing
- arrays = [a.tocsr() if issparse(a) else a for a in arrays]
- resampled_arrays = [_safe_indexing(a, indices) for a in arrays]
- if len(resampled_arrays) == 1:
- # syntactic sugar for the unit argument case
- return resampled_arrays[0]
- else:
- return resampled_arrays
- def shuffle(*arrays, random_state=None, n_samples=None):
- """Shuffle arrays or sparse matrices in a consistent way.
- This is a convenience alias to ``resample(*arrays, replace=False)`` to do
- random permutations of the collections.
- Parameters
- ----------
- *arrays : sequence of indexable data-structures
- Indexable data-structures can be arrays, lists, dataframes or scipy
- sparse matrices with consistent first dimension.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for shuffling
- the data.
- Pass an int for reproducible results across multiple function calls.
- See :term:`Glossary <random_state>`.
- n_samples : int, default=None
- Number of samples to generate. If left to None this is
- automatically set to the first dimension of the arrays. It should
- not be larger than the length of arrays.
- Returns
- -------
- shuffled_arrays : sequence of indexable data-structures
- Sequence of shuffled copies of the collections. The original arrays
- are not impacted.
- See Also
- --------
- resample : Resample arrays or sparse matrices in a consistent way.
- Examples
- --------
- It is possible to mix sparse and dense arrays in the same run::
- >>> import numpy as np
- >>> X = np.array([[1., 0.], [2., 1.], [0., 0.]])
- >>> y = np.array([0, 1, 2])
- >>> from scipy.sparse import coo_matrix
- >>> X_sparse = coo_matrix(X)
- >>> from sklearn.utils import shuffle
- >>> X, X_sparse, y = shuffle(X, X_sparse, y, random_state=0)
- >>> X
- array([[0., 0.],
- [2., 1.],
- [1., 0.]])
- >>> X_sparse
- <3x2 sparse matrix of type '<... 'numpy.float64'>'
- with 3 stored elements in Compressed Sparse Row format>
- >>> X_sparse.toarray()
- array([[0., 0.],
- [2., 1.],
- [1., 0.]])
- >>> y
- array([2, 1, 0])
- >>> shuffle(y, n_samples=2, random_state=0)
- array([0, 1])
- """
- return resample(
- *arrays, replace=False, n_samples=n_samples, random_state=random_state
- )
- def safe_sqr(X, *, copy=True):
- """Element wise squaring of array-likes and sparse matrices.
- Parameters
- ----------
- X : {array-like, ndarray, sparse matrix}
- copy : bool, default=True
- Whether to create a copy of X and operate on it (the default) or to
- perform the computation in place.
- Returns
- -------
- X ** 2 : element wise square
- Return the element-wise square of the input.
- """
- X = check_array(X, accept_sparse=["csr", "csc", "coo"], ensure_2d=False)
- if issparse(X):
- if copy:
- X = X.copy()
- X.data **= 2
- else:
- if copy:
- X = X**2
- else:
- X **= 2
- return X
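- # Quick illustration (not part of the module): works for dense and sparse input.
- #
- #   >>> import numpy as np
- #   >>> safe_sqr(np.array([1, 2, 3]))
- #   array([1, 4, 9])
- #   >>> from scipy.sparse import csr_matrix
- #   >>> safe_sqr(csr_matrix([[1, -2], [0, 3]])).toarray()
- #   array([[1, 4],
- #          [0, 9]])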
- def _chunk_generator(gen, chunksize):
- """Chunk generator, ``gen`` into lists of length ``chunksize``. The last
- chunk may have a length less than ``chunksize``."""
- while True:
- chunk = list(islice(gen, chunksize))
- if chunk:
- yield chunk
- else:
- return
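- # Illustrative use (not part of the module):
- #
- #   >>> list(_chunk_generator(iter(range(7)), 3))
- #   [[0, 1, 2], [3, 4, 5], [6]]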
- @validate_params(
- {
- "n": [Interval(numbers.Integral, 1, None, closed="left")],
- "batch_size": [Interval(numbers.Integral, 1, None, closed="left")],
- "min_batch_size": [Interval(numbers.Integral, 0, None, closed="left")],
- },
- prefer_skip_nested_validation=True,
- )
- def gen_batches(n, batch_size, *, min_batch_size=0):
- """Generator to create slices containing `batch_size` elements from 0 to `n`.
- The last slice may contain fewer than `batch_size` elements, when
- `batch_size` does not divide `n`.
- Parameters
- ----------
- n : int
- Size of the sequence.
- batch_size : int
- Number of elements in each batch.
- min_batch_size : int, default=0
- Minimum number of elements in each batch.
- Yields
- ------
- slice of `batch_size` elements
- See Also
- --------
- gen_even_slices: Generator to create n_packs slices going up to n.
- Examples
- --------
- >>> from sklearn.utils import gen_batches
- >>> list(gen_batches(7, 3))
- [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
- >>> list(gen_batches(6, 3))
- [slice(0, 3, None), slice(3, 6, None)]
- >>> list(gen_batches(2, 3))
- [slice(0, 2, None)]
- >>> list(gen_batches(7, 3, min_batch_size=0))
- [slice(0, 3, None), slice(3, 6, None), slice(6, 7, None)]
- >>> list(gen_batches(7, 3, min_batch_size=2))
- [slice(0, 3, None), slice(3, 7, None)]
- """
- start = 0
- for _ in range(int(n // batch_size)):
- end = start + batch_size
- if end + min_batch_size > n:
- continue
- yield slice(start, end)
- start = end
- if start < n:
- yield slice(start, n)
- def gen_even_slices(n, n_packs, *, n_samples=None):
- """Generator to create `n_packs` evenly spaced slices going up to `n`.
- If `n_packs` does not divide `n`, the first `n % n_packs` slices each
- contain one more element than the remaining slices.
- Parameters
- ----------
- n : int
- Size of the sequence.
- n_packs : int
- Number of slices to generate.
- n_samples : int, default=None
- Number of samples. Pass `n_samples` when the slices are to be used for
- sparse matrix indexing; slicing off-the-end raises an exception, while
- it works for NumPy arrays.
- Yields
- ------
- `slice` representing a set of indices from 0 to n.
- See Also
- --------
- gen_batches: Generator to create slices containing batch_size elements
- from 0 to n.
- Examples
- --------
- >>> from sklearn.utils import gen_even_slices
- >>> list(gen_even_slices(10, 1))
- [slice(0, 10, None)]
- >>> list(gen_even_slices(10, 10))
- [slice(0, 1, None), slice(1, 2, None), ..., slice(9, 10, None)]
- >>> list(gen_even_slices(10, 5))
- [slice(0, 2, None), slice(2, 4, None), ..., slice(8, 10, None)]
- >>> list(gen_even_slices(10, 3))
- [slice(0, 4, None), slice(4, 7, None), slice(7, 10, None)]
- """
- start = 0
- if n_packs < 1:
- raise ValueError("gen_even_slices got n_packs=%s, must be >=1" % n_packs)
- for pack_num in range(n_packs):
- this_n = n // n_packs
- if pack_num < n % n_packs:
- this_n += 1
- if this_n > 0:
- end = start + this_n
- if n_samples is not None:
- end = min(n_samples, end)
- yield slice(start, end, None)
- start = end
- def tosequence(x):
- """Cast iterable x to a Sequence, avoiding a copy if possible.
- Parameters
- ----------
- x : iterable
- The iterable to be converted.
- Returns
- -------
- x : Sequence
- If `x` is a NumPy array, it is returned as an `ndarray`. If `x`
- is a `Sequence`, `x` is returned as-is. If `x` is of any other
- type, it is cast to a list.
- """
- if isinstance(x, np.ndarray):
- return np.asarray(x)
- elif isinstance(x, Sequence):
- return x
- else:
- return list(x)
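- # A small illustration (not from the original file): generators are
- # materialized, while existing sequences and arrays are passed through.
- #
- #   >>> tosequence(x * x for x in range(4))
- #   [0, 1, 4, 9]
- #   >>> tosequence((1, 2, 3))
- #   (1, 2, 3)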
- def _to_object_array(sequence):
- """Convert sequence to a 1-D NumPy array of object dtype.
- The numpy.array constructor has a similar use but its output
- is ambiguous. It can be a 1-D NumPy array of object dtype if
- the input is a ragged array, but if the input is a list of
- equal-length arrays, then the output is a 2-D numpy.array.
- _to_object_array resolves this ambiguity by guaranteeing that
- the output is a 1-D NumPy array of objects for any input.
- Parameters
- ----------
- sequence : array-like of shape (n_elements,)
- The sequence to be converted.
- Returns
- -------
- out : ndarray of shape (n_elements,), dtype=object
- The input sequence converted into a 1-D NumPy array of object dtype.
- Examples
- --------
- >>> import numpy as np
- >>> from sklearn.utils import _to_object_array
- >>> _to_object_array([np.array([0]), np.array([1])])
- array([array([0]), array([1])], dtype=object)
- >>> _to_object_array([np.array([0]), np.array([1, 2])])
- array([array([0]), array([1, 2])], dtype=object)
- """
- out = np.empty(len(sequence), dtype=object)
- out[:] = sequence
- return out
- def indices_to_mask(indices, mask_length):
- """Convert list of indices to boolean mask.
- Parameters
- ----------
- indices : list-like
- List of integers treated as indices.
- mask_length : int
- Length of boolean mask to be generated.
- This parameter must be greater than max(indices).
- Returns
- -------
- mask : 1d boolean nd-array
- Boolean array that is True where indices are present, else False.
- Examples
- --------
- >>> from sklearn.utils import indices_to_mask
- >>> indices = [1, 2, 3, 4]
- >>> indices_to_mask(indices, 5)
- array([False, True, True, True, True])
- """
- if mask_length <= np.max(indices):
- raise ValueError("mask_length must be greater than max(indices)")
- mask = np.zeros(mask_length, dtype=bool)
- mask[indices] = True
- return mask
- def _message_with_time(source, message, time):
- """Create one line message for logging purposes.
- Parameters
- ----------
- source : str
- String indicating the source or the reference of the message.
- message : str
- Short message.
- time : int
- Time in seconds.
- """
- start_message = "[%s] " % source
- # adapted from joblib.logger.short_format_time without the Windows -.1s
- # adjustment
- if time > 60:
- time_str = "%4.1fmin" % (time / 60)
- else:
- time_str = " %5.1fs" % time
- end_message = " %s, total=%s" % (message, time_str)
- dots_len = 70 - len(start_message) - len(end_message)
- return "%s%s%s" % (start_message, dots_len * ".", end_message)
- @contextmanager
- def _print_elapsed_time(source, message=None):
- """Log elapsed time to stdout when the context is exited.
- Parameters
- ----------
- source : str
- String indicating the source or the reference of the message.
- message : str, default=None
- Short message. If None, nothing will be printed.
- Returns
- -------
- context_manager
- Prints elapsed time upon exit if verbose.
- """
- if message is None:
- yield
- else:
- start = timeit.default_timer()
- yield
- print(_message_with_time(source, message, timeit.default_timer() - start))
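- # A sketch of the expected log format (illustrative; the run of dots pads
- # the line to 70 characters and the measured time varies):
- #
- #   >>> with _print_elapsed_time("Pipeline", "fitting step 1 of 2"):
- #   ...     pass
- #   [Pipeline] ........... fitting step 1 of 2, total=   0.0s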
- def get_chunk_n_rows(row_bytes, *, max_n_rows=None, working_memory=None):
- """Calculate how many rows can be processed within `working_memory`.
- Parameters
- ----------
- row_bytes : int
- The expected number of bytes of memory that will be consumed
- during the processing of each row.
- max_n_rows : int, default=None
- The maximum return value.
- working_memory : int or float, default=None
- The amount of memory, in MiB, that the returned number of rows
- should fit within. When None (default), the value of
- ``sklearn.get_config()['working_memory']`` is used.
- Returns
- -------
- int
- The number of rows which can be processed within `working_memory`.
- Warns
- -----
- Issues a UserWarning if `row_bytes` exceeds `working_memory` MiB.
- """
- if working_memory is None:
- working_memory = get_config()["working_memory"]
- chunk_n_rows = int(working_memory * (2**20) // row_bytes)
- if max_n_rows is not None:
- chunk_n_rows = min(chunk_n_rows, max_n_rows)
- if chunk_n_rows < 1:
- warnings.warn(
- "Could not adhere to working_memory config. "
- "Currently %.0fMiB, %.0fMiB required."
- % (working_memory, np.ceil(row_bytes * 2**-20))
- )
- chunk_n_rows = 1
- return chunk_n_rows
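- # Worked example (illustrative): with 1 MiB of working memory and rows of
- # 8000 bytes each, 2**20 // 8000 == 131 rows fit per chunk.
- #
- #   >>> get_chunk_n_rows(row_bytes=8000, working_memory=1)
- #   131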
- def _is_pandas_na(x):
- """Test if x is pandas.NA.
- We intentionally do not use this function to return `True` for `pd.NA` in
- `is_scalar_nan`, because estimators that support `pd.NA` are the exception
- rather than the rule at the moment. When `pd.NA` is more universally
- supported, we may reconsider this decision.
- Parameters
- ----------
- x : any type
- Returns
- -------
- boolean
- """
- with suppress(ImportError):
- from pandas import NA
- return x is NA
- return False
- def is_scalar_nan(x):
- """Test if x is NaN.
- This function is meant to overcome the issue that np.isnan does not allow
- non-numerical types as input, and that np.nan is not float('nan').
- Parameters
- ----------
- x : any type
- Any scalar value.
- Returns
- -------
- bool
- Returns true if x is NaN, and false otherwise.
- Examples
- --------
- >>> import numpy as np
- >>> from sklearn.utils import is_scalar_nan
- >>> is_scalar_nan(np.nan)
- True
- >>> is_scalar_nan(float("nan"))
- True
- >>> is_scalar_nan(None)
- False
- >>> is_scalar_nan("")
- False
- >>> is_scalar_nan([np.nan])
- False
- """
- return isinstance(x, numbers.Real) and math.isnan(x)
- def _approximate_mode(class_counts, n_draws, rng):
- """Computes approximate mode of multivariate hypergeometric.
- This is an approximation to the mode of the multivariate
- hypergeometric given by class_counts and n_draws.
- It shouldn't be off by more than one.
- It is the most likely outcome of drawing n_draws many
- samples from the population given by class_counts.
- Parameters
- ----------
- class_counts : ndarray of int
- Population per class.
- n_draws : int
- Number of draws (samples to draw) from the overall population.
- rng : random state
- Used to break ties.
- Returns
- -------
- sampled_classes : ndarray of int
- Number of samples drawn from each class.
- np.sum(sampled_classes) == n_draws
- Examples
- --------
- >>> import numpy as np
- >>> from sklearn.utils import _approximate_mode
- >>> _approximate_mode(class_counts=np.array([4, 2]), n_draws=3, rng=0)
- array([2, 1])
- >>> _approximate_mode(class_counts=np.array([5, 2]), n_draws=4, rng=0)
- array([3, 1])
- >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
- ... n_draws=2, rng=0)
- array([0, 1, 1, 0])
- >>> _approximate_mode(class_counts=np.array([2, 2, 2, 1]),
- ... n_draws=2, rng=42)
- array([1, 1, 0, 0])
- """
- rng = check_random_state(rng)
- # this computes a bad approximation to the mode of the
- # multivariate hypergeometric given by class_counts and n_draws
- continuous = class_counts / class_counts.sum() * n_draws
- # floored means we don't overshoot n_samples, but probably undershoot
- floored = np.floor(continuous)
- # we add samples according to how much "left over" probability
- # they had, until we arrive at n_samples
- need_to_add = int(n_draws - floored.sum())
- if need_to_add > 0:
- remainder = continuous - floored
- values = np.sort(np.unique(remainder))[::-1]
- # add according to remainder, but break ties
- # randomly to avoid biases
- for value in values:
- (inds,) = np.where(remainder == value)
- # if we need_to_add less than what's in inds
- # we draw randomly from them.
- # if we need to add more, we add them all and
- # go to the next value
- add_now = min(len(inds), need_to_add)
- inds = rng.choice(inds, size=add_now, replace=False)
- floored[inds] += 1
- need_to_add -= add_now
- if need_to_add == 0:
- break
- return floored.astype(int)
- def check_matplotlib_support(caller_name):
- """Raise ImportError with detailed error message if mpl is not installed.
- Plot utilities, such as the plotting functions of any Display class, should
- lazily import matplotlib and call this helper before any computation.
- Parameters
- ----------
- caller_name : str
- The name of the caller that requires matplotlib.
- """
- try:
- import matplotlib # noqa
- except ImportError as e:
- raise ImportError(
- "{} requires matplotlib. You can install matplotlib with "
- "`pip install matplotlib`".format(caller_name)
- ) from e
- def check_pandas_support(caller_name):
- """Raise ImportError with detailed error message if pandas is not installed.
- Utilities like :func:`fetch_openml` should lazily import
- pandas and call this helper before any computation.
- Parameters
- ----------
- caller_name : str
- The name of the caller that requires pandas.
- Returns
- -------
- pandas
- The pandas package.
- """
- try:
- import pandas # noqa
- return pandas
- except ImportError as e:
- raise ImportError("{} requires pandas.".format(caller_name)) from e