- """Base classes for all estimators."""
- # Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
- # License: BSD 3 clause
- import copy
- import functools
- import inspect
- import platform
- import re
- import warnings
- from collections import defaultdict
- import numpy as np
- from . import __version__
- from ._config import config_context, get_config
- from .exceptions import InconsistentVersionWarning
- from .utils import _IS_32BIT
- from .utils._estimator_html_repr import estimator_html_repr
- from .utils._metadata_requests import _MetadataRequester
- from .utils._param_validation import validate_parameter_constraints
- from .utils._set_output import _SetOutputMixin
- from .utils._tags import (
- _DEFAULT_TAGS,
- )
- from .utils.validation import (
- _check_feature_names_in,
- _check_y,
- _generate_get_feature_names_out,
- _get_feature_names,
- _is_fitted,
- _num_features,
- check_array,
- check_is_fitted,
- check_X_y,
- )


def clone(estimator, *, safe=True):
    """Construct a new unfitted estimator with the same parameters.

    Clone does a deep copy of the model in an estimator
    without actually copying attached data. It returns a new estimator
    with the same parameters that has not been fitted on any data.

    .. versionchanged:: 1.3
        Delegates to `estimator.__sklearn_clone__` if the method exists.

    Parameters
    ----------
    estimator : {list, tuple, set} of estimator instance or a single \
            estimator instance
        The estimator or group of estimators to be cloned.

    safe : bool, default=True
        If safe is False, clone will fall back to a deep copy on objects
        that are not estimators. Ignored if `estimator.__sklearn_clone__`
        exists.

    Returns
    -------
    estimator : object
        The deep copy of the input, an estimator if input is an estimator.

    Notes
    -----
    If the estimator's `random_state` parameter is an integer (or if the
    estimator doesn't have a `random_state` parameter), an *exact clone* is
    returned: the clone and the original estimator will give the exact same
    results. Otherwise, a *statistical clone* is returned: the clone might
    return different results from the original estimator. More details can be
    found in :ref:`randomness`.
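
    Examples
    --------
    A minimal usage sketch; `LogisticRegression` stands in for any estimator,
    and the clone is unfitted even though the original has been fitted:

    >>> from sklearn.base import clone
    >>> from sklearn.linear_model import LogisticRegression
    >>> X = [[-1, 0], [0, 1]]
    >>> y = [0, 1]
    >>> classifier = LogisticRegression().fit(X, y)
    >>> cloned_classifier = clone(classifier)
    >>> hasattr(classifier, "coef_")
    True
    >>> hasattr(cloned_classifier, "coef_")
    False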
- """
- if hasattr(estimator, "__sklearn_clone__") and not inspect.isclass(estimator):
- return estimator.__sklearn_clone__()
- return _clone_parametrized(estimator, safe=safe)


def _clone_parametrized(estimator, *, safe=True):
    """Default implementation of clone. See :func:`sklearn.base.clone` for details."""
    estimator_type = type(estimator)
    if estimator_type is dict:
        return {k: clone(v, safe=safe) for k, v in estimator.items()}
    elif estimator_type in (list, tuple, set, frozenset):
        return estimator_type([clone(e, safe=safe) for e in estimator])
    elif not hasattr(estimator, "get_params") or isinstance(estimator, type):
        if not safe:
            return copy.deepcopy(estimator)
        else:
            if isinstance(estimator, type):
                raise TypeError(
                    "Cannot clone object. "
                    + "You should provide an instance of "
                    + "scikit-learn estimator instead of a class."
                )
            else:
                raise TypeError(
                    "Cannot clone object '%s' (type %s): "
                    "it does not seem to be a scikit-learn "
                    "estimator as it does not implement a "
                    "'get_params' method." % (repr(estimator), type(estimator))
                )

    klass = estimator.__class__
    new_object_params = estimator.get_params(deep=False)
    for name, param in new_object_params.items():
        new_object_params[name] = clone(param, safe=False)

    new_object = klass(**new_object_params)
    try:
        new_object._metadata_request = copy.deepcopy(estimator._metadata_request)
    except AttributeError:
        pass

    params_set = new_object.get_params(deep=False)

    # quick sanity check of the parameters of the clone
    for name in new_object_params:
        param1 = new_object_params[name]
        param2 = params_set[name]
        if param1 is not param2:
            raise RuntimeError(
                "Cannot clone object %s, as the constructor "
                "either does not set or modifies parameter %s" % (estimator, name)
            )

    # _sklearn_output_config is used by `set_output` to configure the output
    # container of an estimator.
    if hasattr(estimator, "_sklearn_output_config"):
        new_object._sklearn_output_config = copy.deepcopy(
            estimator._sklearn_output_config
        )
    return new_object


class BaseEstimator(_MetadataRequester):
    """Base class for all estimators in scikit-learn.

    Notes
    -----
    All estimators should specify all the parameters that can be set
    at the class level in their ``__init__`` as explicit keyword
    arguments (no ``*args`` or ``**kwargs``).
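
    Examples
    --------
    An illustrative sketch of a conforming estimator; ``MyEstimator`` is
    hypothetical and only demonstrates the ``__init__`` convention above:

    >>> from sklearn.base import BaseEstimator
    >>> class MyEstimator(BaseEstimator):
    ...     def __init__(self, *, param=1):
    ...         self.param = param
    >>> MyEstimator(param=2).get_params()
    {'param': 2}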
- """
- @classmethod
- def _get_param_names(cls):
- """Get parameter names for the estimator"""
- # fetch the constructor or the original constructor before
- # deprecation wrapping if any
- init = getattr(cls.__init__, "deprecated_original", cls.__init__)
- if init is object.__init__:
- # No explicit constructor to introspect
- return []
- # introspect the constructor arguments to find the model parameters
- # to represent
- init_signature = inspect.signature(init)
- # Consider the constructor parameters excluding 'self'
- parameters = [
- p
- for p in init_signature.parameters.values()
- if p.name != "self" and p.kind != p.VAR_KEYWORD
- ]
- for p in parameters:
- if p.kind == p.VAR_POSITIONAL:
- raise RuntimeError(
- "scikit-learn estimators should always "
- "specify their parameters in the signature"
- " of their __init__ (no varargs)."
- " %s with constructor %s doesn't "
- " follow this convention." % (cls, init_signature)
- )
- # Extract and sort argument names excluding 'self'
- return sorted([p.name for p in parameters])

    def get_params(self, deep=True):
        """
        Get parameters for this estimator.

        Parameters
        ----------
        deep : bool, default=True
            If True, will return the parameters for this estimator and
            contained subobjects that are estimators.

        Returns
        -------
        params : dict
            Parameter names mapped to their values.
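
        Examples
        --------
        A small sketch; `StandardScaler` stands in for any estimator:

        >>> from sklearn.preprocessing import StandardScaler
        >>> StandardScaler(with_mean=False).get_params()["with_mean"]
        False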
- """
- out = dict()
- for key in self._get_param_names():
- value = getattr(self, key)
- if deep and hasattr(value, "get_params") and not isinstance(value, type):
- deep_items = value.get_params().items()
- out.update((key + "__" + k, val) for k, val in deep_items)
- out[key] = value
- return out

    def set_params(self, **params):
        """Set the parameters of this estimator.

        The method works on simple estimators as well as on nested objects
        (such as :class:`~sklearn.pipeline.Pipeline`). The latter have
        parameters of the form ``<component>__<parameter>`` so that it's
        possible to update each component of a nested object.

        Parameters
        ----------
        **params : dict
            Estimator parameters.

        Returns
        -------
        self : estimator instance
            Estimator instance.
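
        Examples
        --------
        A brief sketch of the nested ``<component>__<parameter>`` syntax,
        assuming a `Pipeline` with a step named ``"svc"``:

        >>> from sklearn.pipeline import Pipeline
        >>> from sklearn.svm import SVC
        >>> pipe = Pipeline([("svc", SVC())])
        >>> _ = pipe.set_params(svc__C=10.0)
        >>> pipe.get_params()["svc__C"]
        10.0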
- """
- if not params:
- # Simple optimization to gain speed (inspect is slow)
- return self
- valid_params = self.get_params(deep=True)
- nested_params = defaultdict(dict) # grouped by prefix
- for key, value in params.items():
- key, delim, sub_key = key.partition("__")
- if key not in valid_params:
- local_valid_params = self._get_param_names()
- raise ValueError(
- f"Invalid parameter {key!r} for estimator {self}. "
- f"Valid parameters are: {local_valid_params!r}."
- )
- if delim:
- nested_params[key][sub_key] = value
- else:
- setattr(self, key, value)
- valid_params[key] = value
- for key, sub_params in nested_params.items():
- # TODO(1.4): remove specific handling of "base_estimator".
- # The "base_estimator" key is special. It was deprecated and
- # renamed to "estimator" for several estimators. This means we
- # need to translate it here and set sub-parameters on "estimator",
- # but only if the user did not explicitly set a value for
- # "base_estimator".
- if (
- key == "base_estimator"
- and valid_params[key] == "deprecated"
- and self.__module__.startswith("sklearn.")
- ):
- warnings.warn(
- (
- f"Parameter 'base_estimator' of {self.__class__.__name__} is"
- " deprecated in favor of 'estimator'. See"
- f" {self.__class__.__name__}'s docstring for more details."
- ),
- FutureWarning,
- stacklevel=2,
- )
- key = "estimator"
- valid_params[key].set_params(**sub_params)
- return self
- def __sklearn_clone__(self):
- return _clone_parametrized(self)

    def __repr__(self, N_CHAR_MAX=700):
        # N_CHAR_MAX is the (approximate) maximum number of non-blank
        # characters to render. We pass it as an optional parameter to ease
        # the tests.

        from .utils._pprint import _EstimatorPrettyPrinter

        N_MAX_ELEMENTS_TO_SHOW = 30  # number of elements to show in sequences

        # use ellipsis for sequences with a lot of elements
        pp = _EstimatorPrettyPrinter(
            compact=True,
            indent=1,
            indent_at_name=True,
            n_max_elements_to_show=N_MAX_ELEMENTS_TO_SHOW,
        )

        repr_ = pp.pformat(self)

        # Use bruteforce ellipsis when there are a lot of non-blank characters
        n_nonblank = len("".join(repr_.split()))
        if n_nonblank > N_CHAR_MAX:
            lim = N_CHAR_MAX // 2  # approx. number of chars to keep on both ends
            regex = r"^(\s*\S){%d}" % lim
            # The regex '^(\s*\S){%d}' % n
            # matches from the start of the string until the nth non-blank
            # character:
            # - ^ matches the start of string
            # - (pattern){n} matches n repetitions of pattern
            # - \s*\S matches a non-blank char following zero or more blanks
            left_lim = re.match(regex, repr_).end()
            right_lim = re.match(regex, repr_[::-1]).end()

            if "\n" in repr_[left_lim:-right_lim]:
                # The left side and right side aren't on the same line.
                # To avoid weird cuts, e.g.:
                # categoric...ore',
                # we need to start the right side with an appropriate newline
                # character so that it renders properly as:
                # categoric...
                # handle_unknown='ignore',
                # so we add [^\n]*\n which matches until the next \n
                regex += r"[^\n]*\n"
                right_lim = re.match(regex, repr_[::-1]).end()

            ellipsis = "..."
            if left_lim + len(ellipsis) < len(repr_) - right_lim:
                # Only add ellipsis if it results in a shorter repr
                repr_ = repr_[:left_lim] + "..." + repr_[-right_lim:]

        return repr_

    def __getstate__(self):
        if getattr(self, "__slots__", None):
            raise TypeError(
                "You cannot use `__slots__` in objects inheriting from "
                "`sklearn.base.BaseEstimator`."
            )

        try:
            state = super().__getstate__()
            if state is None:
                # For Python 3.11+, an empty instance (no `__slots__` and an
                # empty `__dict__`) returns a state equal to `None`.
                state = self.__dict__.copy()
        except AttributeError:
            # Python < 3.11
            state = self.__dict__.copy()

        if type(self).__module__.startswith("sklearn."):
            return dict(state.items(), _sklearn_version=__version__)
        else:
            return state

    def __setstate__(self, state):
        if type(self).__module__.startswith("sklearn."):
            pickle_version = state.pop("_sklearn_version", "pre-0.18")
            if pickle_version != __version__:
                warnings.warn(
                    InconsistentVersionWarning(
                        estimator_name=self.__class__.__name__,
                        current_sklearn_version=__version__,
                        original_sklearn_version=pickle_version,
                    ),
                )
        try:
            super().__setstate__(state)
        except AttributeError:
            self.__dict__.update(state)

    def _more_tags(self):
        return _DEFAULT_TAGS

    def _get_tags(self):
        collected_tags = {}
        for base_class in reversed(inspect.getmro(self.__class__)):
            if hasattr(base_class, "_more_tags"):
                # need the if because mixins might not have _more_tags
                # but might do redundant work in estimators
                # (i.e. calling more tags on BaseEstimator multiple times)
                more_tags = base_class._more_tags(self)
                collected_tags.update(more_tags)
        return collected_tags

    def _check_n_features(self, X, reset):
        """Set the `n_features_in_` attribute, or check against it.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            The input samples.
        reset : bool
            If True, the `n_features_in_` attribute is set to `X.shape[1]`.
            If False and the attribute exists, then check that it is equal to
            `X.shape[1]`. If False and the attribute does *not* exist, then
            the check is skipped.

            .. note::
               It is recommended to call reset=True in `fit` and in the first
               call to `partial_fit`. All other methods that validate `X`
               should set `reset=False`.
        """
        try:
            n_features = _num_features(X)
        except TypeError as e:
            if not reset and hasattr(self, "n_features_in_"):
                raise ValueError(
                    "X does not contain any features, but "
                    f"{self.__class__.__name__} is expecting "
                    f"{self.n_features_in_} features"
                ) from e
            # If the number of features is not defined and reset=True,
            # then we skip this check
            return

        if reset:
            self.n_features_in_ = n_features
            return

        if not hasattr(self, "n_features_in_"):
            # Skip this check if the expected number of input features was not
            # recorded by calling fit first. This is typically the case for
            # stateless transformers.
            return

        if n_features != self.n_features_in_:
            raise ValueError(
                f"X has {n_features} features, but {self.__class__.__name__} "
                f"is expecting {self.n_features_in_} features as input."
            )

    def _check_feature_names(self, X, *, reset):
        """Set or check the `feature_names_in_` attribute.

        .. versionadded:: 1.0

        Parameters
        ----------
        X : {ndarray, dataframe} of shape (n_samples, n_features)
            The input samples.

        reset : bool
            Whether to reset the `feature_names_in_` attribute.
            If False, the input will be checked for consistency with
            feature names of data provided when reset was last True.

            .. note::
               It is recommended to call `reset=True` in `fit` and in the first
               call to `partial_fit`. All other methods that validate `X`
               should set `reset=False`.
        """
        if reset:
            feature_names_in = _get_feature_names(X)
            if feature_names_in is not None:
                self.feature_names_in_ = feature_names_in
            elif hasattr(self, "feature_names_in_"):
                # Delete the attribute when the estimator is fitted on a new dataset
                # that has no feature names.
                delattr(self, "feature_names_in_")
            return

        fitted_feature_names = getattr(self, "feature_names_in_", None)
        X_feature_names = _get_feature_names(X)

        if fitted_feature_names is None and X_feature_names is None:
            # no feature names seen in fit and in X
            return

        if X_feature_names is not None and fitted_feature_names is None:
            warnings.warn(
                f"X has feature names, but {self.__class__.__name__} was fitted without"
                " feature names"
            )
            return

        if X_feature_names is None and fitted_feature_names is not None:
            warnings.warn(
                "X does not have valid feature names, but"
                f" {self.__class__.__name__} was fitted with feature names"
            )
            return

        # validate the feature names against the `feature_names_in_` attribute
        if len(fitted_feature_names) != len(X_feature_names) or np.any(
            fitted_feature_names != X_feature_names
        ):
            message = (
                "The feature names should match those that were passed during fit.\n"
            )
            fitted_feature_names_set = set(fitted_feature_names)
            X_feature_names_set = set(X_feature_names)
            unexpected_names = sorted(X_feature_names_set - fitted_feature_names_set)
            missing_names = sorted(fitted_feature_names_set - X_feature_names_set)

            def add_names(names):
                output = ""
                max_n_names = 5
                for i, name in enumerate(names):
                    if i >= max_n_names:
                        output += "- ...\n"
                        break
                    output += f"- {name}\n"
                return output

            if unexpected_names:
                message += "Feature names unseen at fit time:\n"
                message += add_names(unexpected_names)

            if missing_names:
                message += "Feature names seen at fit time, yet now missing:\n"
                message += add_names(missing_names)

            if not missing_names and not unexpected_names:
                message += (
                    "Feature names must be in the same order as they were in fit.\n"
                )

            raise ValueError(message)

    def _validate_data(
        self,
        X="no_validation",
        y="no_validation",
        reset=True,
        validate_separately=False,
        cast_to_ndarray=True,
        **check_params,
    ):
        """Validate input data and set or check the `n_features_in_` attribute.

        Parameters
        ----------
        X : {array-like, sparse matrix, dataframe} of shape \
                (n_samples, n_features), default='no_validation'
            The input samples.
            If `'no_validation'`, no validation is performed on `X`. This is
            useful for meta-estimators which can delegate input validation to
            their underlying estimator(s). In that case `y` must be passed and
            the only accepted `check_params` are `multi_output` and
            `y_numeric`.

        y : array-like of shape (n_samples,), default='no_validation'
            The targets.

            - If `None`, `check_array` is called on `X`. If the estimator's
              `requires_y` tag is True, then an error will be raised.
            - If `'no_validation'`, `check_array` is called on `X` and the
              estimator's `requires_y` tag is ignored. This is a default
              placeholder and is never meant to be explicitly set. In that case
              `X` must be passed.
            - Otherwise, only `y` with `_check_y` or both `X` and `y` are
              checked with either `check_array` or `check_X_y` depending on
              `validate_separately`.

        reset : bool, default=True
            Whether to reset the `n_features_in_` attribute.
            If False, the input will be checked for consistency with data
            provided when reset was last True.

            .. note::
               It is recommended to call `reset=True` in `fit` and in the first
               call to `partial_fit`. All other methods that validate `X`
               should set `reset=False`.

        validate_separately : False or tuple of dicts, default=False
            Only used if `y` is not None.
            If False, call `check_X_y()`. Else, it must be a tuple of kwargs
            to be used for calling `check_array()` on `X` and `y` respectively.

            `estimator=self` is automatically added to these dicts to generate
            more informative error messages in case of invalid input data.

        cast_to_ndarray : bool, default=True
            Cast `X` and `y` to ndarray with checks in `check_params`. If
            `False`, `X` and `y` are unchanged and only `feature_names_in_` and
            `n_features_in_` are checked.

        **check_params : kwargs
            Parameters passed to :func:`sklearn.utils.check_array` or
            :func:`sklearn.utils.check_X_y`. Ignored if `validate_separately`
            is not False.

            `estimator=self` is automatically added to these params to generate
            more informative error messages in case of invalid input data.

        Returns
        -------
        out : {ndarray, sparse matrix} or tuple of these
            The validated input. A tuple is returned if both `X` and `y` are
            validated.
        """
        self._check_feature_names(X, reset=reset)

        if y is None and self._get_tags()["requires_y"]:
            raise ValueError(
                f"This {self.__class__.__name__} estimator "
                "requires y to be passed, but the target y is None."
            )

        no_val_X = isinstance(X, str) and X == "no_validation"
        no_val_y = y is None or isinstance(y, str) and y == "no_validation"

        if no_val_X and no_val_y:
            raise ValueError("Validation should be done on X, y or both.")

        default_check_params = {"estimator": self}
        check_params = {**default_check_params, **check_params}

        if not cast_to_ndarray:
            if not no_val_X and no_val_y:
                out = X
            elif no_val_X and not no_val_y:
                out = y
            else:
                out = X, y
        elif not no_val_X and no_val_y:
            out = check_array(X, input_name="X", **check_params)
        elif no_val_X and not no_val_y:
            out = _check_y(y, **check_params)
        else:
            if validate_separately:
                # We need this because some estimators validate X and y
                # separately, and in general, separately calling check_array()
                # on X and y isn't equivalent to just calling check_X_y()
                # :(
                check_X_params, check_y_params = validate_separately
                if "estimator" not in check_X_params:
                    check_X_params = {**default_check_params, **check_X_params}
                X = check_array(X, input_name="X", **check_X_params)
                if "estimator" not in check_y_params:
                    check_y_params = {**default_check_params, **check_y_params}
                y = check_array(y, input_name="y", **check_y_params)
            else:
                X, y = check_X_y(X, y, **check_params)
            out = X, y

        if not no_val_X and check_params.get("ensure_2d", True):
            self._check_n_features(X, reset=reset)

        return out

    def _validate_params(self):
        """Validate types and values of constructor parameters.

        The expected type and values must be defined in the `_parameter_constraints`
        class attribute, which is a dictionary `param_name: list of constraints`. See
        the docstring of `validate_parameter_constraints` for a description of the
        accepted constraints.
        """
        validate_parameter_constraints(
            self._parameter_constraints,
            self.get_params(deep=False),
            caller_name=self.__class__.__name__,
        )

    @property
    def _repr_html_(self):
        """HTML representation of estimator.

        This is redundant with the logic of `_repr_mimebundle_`. The latter
        should be favored in the long term, `_repr_html_` is only
        implemented for consumers who do not interpret `_repr_mimebundle_`.
        """
        if get_config()["display"] != "diagram":
            raise AttributeError(
                "_repr_html_ is only defined when the "
                "'display' configuration option is set to "
                "'diagram'"
            )
        return self._repr_html_inner

    def _repr_html_inner(self):
        """This function is returned by the @property `_repr_html_` to make
        `hasattr(estimator, "_repr_html_")` return `True` or `False` depending
        on `get_config()["display"]`.
        """
        return estimator_html_repr(self)

    def _repr_mimebundle_(self, **kwargs):
        """Mime bundle used by jupyter kernels to display estimator"""
        output = {"text/plain": repr(self)}
        if get_config()["display"] == "diagram":
            output["text/html"] = estimator_html_repr(self)
        return output


class ClassifierMixin:
    """Mixin class for all classifiers in scikit-learn."""

    _estimator_type = "classifier"

    def score(self, X, y, sample_weight=None):
        """
        Return the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for `X`.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
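
        Examples
        --------
        A small sketch; `DecisionTreeClassifier` stands in for any classifier
        and is scored on its own training data for brevity:

        >>> from sklearn.tree import DecisionTreeClassifier
        >>> X, y = [[0], [1], [2]], [0, 1, 1]
        >>> clf = DecisionTreeClassifier().fit(X, y)
        >>> clf.score(X, y)
        1.0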
- """
- from .metrics import accuracy_score
- return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
- def _more_tags(self):
- return {"requires_y": True}


class RegressorMixin:
    """Mixin class for all regression estimators in scikit-learn."""

    _estimator_type = "regressor"

    def score(self, X, y, sample_weight=None):
        """Return the coefficient of determination of the prediction.

        The coefficient of determination :math:`R^2` is defined as
        :math:`(1 - \\frac{u}{v})`, where :math:`u` is the residual
        sum of squares ``((y_true - y_pred) ** 2).sum()`` and :math:`v`
        is the total sum of squares ``((y_true - y_true.mean()) ** 2).sum()``.
        The best possible score is 1.0 and it can be negative (because the
        model can be arbitrarily worse). A constant model that always predicts
        the expected value of `y`, disregarding the input features, would get
        an :math:`R^2` score of 0.0.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples. For some estimators this may be a precomputed
            kernel matrix or a list of generic objects instead with shape
            ``(n_samples, n_samples_fitted)``, where ``n_samples_fitted``
            is the number of samples used in the fitting for the estimator.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True values for `X`.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            :math:`R^2` of ``self.predict(X)`` w.r.t. `y`.

        Notes
        -----
        The :math:`R^2` score used when calling ``score`` on a regressor uses
        ``multioutput='uniform_average'`` from version 0.23 to keep consistent
        with default value of :func:`~sklearn.metrics.r2_score`.
        This influences the ``score`` method of all the multioutput
        regressors (except for
        :class:`~sklearn.multioutput.MultiOutputRegressor`).
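
        Examples
        --------
        A small sketch; `LinearRegression` stands in for any regressor, and
        an exactly linear target makes the training-set score equal to 1.0:

        >>> from sklearn.linear_model import LinearRegression
        >>> X, y = [[1], [2], [3]], [1.0, 2.0, 3.0]
        >>> LinearRegression().fit(X, y).score(X, y)
        1.0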
- """
- from .metrics import r2_score
- y_pred = self.predict(X)
- return r2_score(y, y_pred, sample_weight=sample_weight)
- def _more_tags(self):
- return {"requires_y": True}


class ClusterMixin:
    """Mixin class for all cluster estimators in scikit-learn."""

    _estimator_type = "clusterer"

    def fit_predict(self, X, y=None):
        """
        Perform clustering on `X` and return cluster labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        labels : ndarray of shape (n_samples,), dtype=np.int64
            Cluster labels.
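
        Examples
        --------
        A brief sketch; `KMeans` stands in for any clusterer, and only the
        shape of the returned labels is shown since label numbering is
        implementation-dependent:

        >>> import numpy as np
        >>> from sklearn.cluster import KMeans
        >>> X = np.array([[0, 0], [0, 1], [10, 10], [10, 11]])
        >>> KMeans(n_clusters=2, n_init="auto", random_state=0).fit_predict(X).shape
        (4,)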
- """
- # non-optimized default implementation; override when a better
- # method is possible for a given clustering algorithm
- self.fit(X)
- return self.labels_
- def _more_tags(self):
- return {"preserves_dtype": []}


class BiclusterMixin:
    """Mixin class for all bicluster estimators in scikit-learn."""

    @property
    def biclusters_(self):
        """Convenient way to get row and column indicators together.

        Returns the ``rows_`` and ``columns_`` members.
        """
        return self.rows_, self.columns_

    def get_indices(self, i):
        """Row and column indices of the `i`'th bicluster.

        Only works if ``rows_`` and ``columns_`` attributes exist.

        Parameters
        ----------
        i : int
            The index of the cluster.

        Returns
        -------
        row_ind : ndarray, dtype=np.intp
            Indices of rows in the dataset that belong to the bicluster.
        col_ind : ndarray, dtype=np.intp
            Indices of columns in the dataset that belong to the bicluster.
        """
        rows = self.rows_[i]
        columns = self.columns_[i]
        return np.nonzero(rows)[0], np.nonzero(columns)[0]

    def get_shape(self, i):
        """Shape of the `i`'th bicluster.

        Parameters
        ----------
        i : int
            The index of the cluster.

        Returns
        -------
        n_rows : int
            Number of rows in the bicluster.

        n_cols : int
            Number of columns in the bicluster.
        """
        indices = self.get_indices(i)
        return tuple(len(i) for i in indices)

    def get_submatrix(self, i, data):
        """Return the submatrix corresponding to bicluster `i`.

        Parameters
        ----------
        i : int
            The index of the cluster.
        data : array-like of shape (n_samples, n_features)
            The data.

        Returns
        -------
        submatrix : ndarray of shape (n_rows, n_cols)
            The submatrix corresponding to bicluster `i`.

        Notes
        -----
        Works with sparse matrices. Only works if ``rows_`` and
        ``columns_`` attributes exist.
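
        Examples
        --------
        An illustrative sketch with a hypothetical ``DummyBiClustering``
        carrying hand-built ``rows_`` and ``columns_`` indicator masks:

        >>> import numpy as np
        >>> from sklearn.base import BiclusterMixin
        >>> class DummyBiClustering(BiclusterMixin):
        ...     def __init__(self, rows, columns):
        ...         self.rows_ = rows
        ...         self.columns_ = columns
        >>> data = np.arange(9).reshape(3, 3)
        >>> model = DummyBiClustering(
        ...     np.array([[True, True, False]]), np.array([[False, True, True]])
        ... )
        >>> model.get_submatrix(0, data)
        array([[1, 2],
               [4, 5]])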
- """
- from .utils.validation import check_array
- data = check_array(data, accept_sparse="csr")
- row_ind, col_ind = self.get_indices(i)
- return data[row_ind[:, np.newaxis], col_ind]


class TransformerMixin(_SetOutputMixin):
    """Mixin class for all transformers in scikit-learn.

    If :term:`get_feature_names_out` is defined, then :class:`BaseEstimator` will
    automatically wrap `transform` and `fit_transform` to follow the `set_output`
    API. See the :ref:`developer_api_set_output` for details.

    :class:`OneToOneFeatureMixin` and
    :class:`ClassNamePrefixFeaturesOutMixin` are helpful mixins for
    defining :term:`get_feature_names_out`.
    """

    def fit_transform(self, X, y=None, **fit_params):
        """
        Fit to data, then transform it.

        Fits transformer to `X` and `y` with optional parameters `fit_params`
        and returns a transformed version of `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input samples.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs), \
                default=None
            Target values (None for unsupervised transformations).

        **fit_params : dict
            Additional fit parameters.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_features_new)
            Transformed array.
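
        Examples
        --------
        A minimal sketch; `StandardScaler` stands in for any transformer:

        >>> import numpy as np
        >>> from sklearn.preprocessing import StandardScaler
        >>> StandardScaler().fit_transform(np.array([[0.0], [2.0]]))
        array([[-1.],
               [ 1.]])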
- """
- # non-optimized default implementation; override when a better
- # method is possible for a given clustering algorithm
- if y is None:
- # fit method of arity 1 (unsupervised transformation)
- return self.fit(X, **fit_params).transform(X)
- else:
- # fit method of arity 2 (supervised transformation)
- return self.fit(X, y, **fit_params).transform(X)


class OneToOneFeatureMixin:
    """Provides `get_feature_names_out` for simple transformers.

    This mixin assumes there's a 1-to-1 correspondence between input features
    and output features, such as :class:`~sklearn.preprocessing.StandardScaler`.
    """

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Same as input features.
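
        Examples
        --------
        A short sketch; `StandardScaler` stands in for any one-to-one
        transformer, fitted on unnamed data so default names are generated:

        >>> import numpy as np
        >>> from sklearn.preprocessing import StandardScaler
        >>> scaler = StandardScaler().fit(np.array([[1.0, 2.0], [3.0, 4.0]]))
        >>> scaler.get_feature_names_out()
        array(['x0', 'x1'], dtype=object)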
- """
- check_is_fitted(self, "n_features_in_")
- return _check_feature_names_in(self, input_features)


class ClassNamePrefixFeaturesOutMixin:
    """Mixin class for transformers that generate their own names by prefixing.

    This mixin is useful when the transformer needs to generate its own feature
    names out, such as :class:`~sklearn.decomposition.PCA`. For example, if
    :class:`~sklearn.decomposition.PCA` outputs 3 features, then the generated feature
    names out are: `["pca0", "pca1", "pca2"]`.

    This mixin assumes that a `_n_features_out` attribute is defined when the
    transformer is fitted. `_n_features_out` is the number of output features
    that the transformer will return in `transform` or `fit_transform`.
    """

    def get_feature_names_out(self, input_features=None):
        """Get output feature names for transformation.

        The feature names out will be prefixed by the lowercased class name. For
        example, if the transformer outputs 3 features, then the feature names
        out are: `["class_name0", "class_name1", "class_name2"]`.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Only used to validate feature names with the names seen in `fit`.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
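
        Examples
        --------
        A short sketch; `PCA` stands in for any transformer using this mixin:

        >>> from sklearn.decomposition import PCA
        >>> pca = PCA(n_components=2).fit([[1, 2, 3], [4, 5, 7], [9, 5, 6]])
        >>> pca.get_feature_names_out()
        array(['pca0', 'pca1'], dtype=object)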
- """
- check_is_fitted(self, "_n_features_out")
- return _generate_get_feature_names_out(
- self, self._n_features_out, input_features=input_features
- )


class DensityMixin:
    """Mixin class for all density estimators in scikit-learn."""

    _estimator_type = "DensityEstimator"

    def score(self, X, y=None):
        """Return the score of the model on the data `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        score : float
        """
        pass


class OutlierMixin:
    """Mixin class for all outlier detection estimators in scikit-learn."""

    _estimator_type = "outlier_detector"

    def fit_predict(self, X, y=None):
        """Perform fit on X and return labels for X.

        Returns -1 for outliers and 1 for inliers.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The input samples.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        y : ndarray of shape (n_samples,)
            1 for inliers, -1 for outliers.
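
        Examples
        --------
        A small sketch; `IsolationForest` stands in for any outlier detector,
        with an obvious outlier at 100:

        >>> import numpy as np
        >>> from sklearn.ensemble import IsolationForest
        >>> X = np.array([[-1.1], [0.3], [0.5], [100]])
        >>> IsolationForest(random_state=0).fit_predict(X)
        array([ 1,  1,  1, -1])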
- """
- # override for transductive outlier detectors like LocalOulierFactor
- return self.fit(X).predict(X)


class MetaEstimatorMixin:
    """Mixin class for all meta estimators in scikit-learn."""

    _required_parameters = ["estimator"]


class MultiOutputMixin:
    """Mixin to mark estimators that support multioutput."""

    def _more_tags(self):
        return {"multioutput": True}


class _UnstableArchMixin:
    """Mark estimators that are non-deterministic on 32bit or PowerPC"""

    def _more_tags(self):
        return {
            "non_deterministic": (
                _IS_32BIT or platform.machine().startswith(("ppc", "powerpc"))
            )
        }


def is_classifier(estimator):
    """Return True if the given estimator is (probably) a classifier.

    Parameters
    ----------
    estimator : object
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is a classifier and False otherwise.
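
    Examples
    --------
    A quick sketch using two stock estimators:

    >>> from sklearn.base import is_classifier
    >>> from sklearn.svm import SVC, SVR
    >>> is_classifier(SVC())
    True
    >>> is_classifier(SVR())
    False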
- """
- return getattr(estimator, "_estimator_type", None) == "classifier"


def is_regressor(estimator):
    """Return True if the given estimator is (probably) a regressor.

    Parameters
    ----------
    estimator : estimator instance
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is a regressor and False otherwise.
    """
    return getattr(estimator, "_estimator_type", None) == "regressor"


def is_outlier_detector(estimator):
    """Return True if the given estimator is (probably) an outlier detector.

    Parameters
    ----------
    estimator : estimator instance
        Estimator object to test.

    Returns
    -------
    out : bool
        True if estimator is an outlier detector and False otherwise.
    """
    return getattr(estimator, "_estimator_type", None) == "outlier_detector"


def _fit_context(*, prefer_skip_nested_validation):
    """Decorator to run the fit methods of estimators within context managers.

    Parameters
    ----------
    prefer_skip_nested_validation : bool
        If True, the validation of parameters of inner estimators or functions
        called during fit will be skipped.

        This is useful to avoid validating the parameters passed by the user
        from the public facing API multiple times. It's also useful to avoid
        validating parameters that we pass internally to inner functions that
        are guaranteed to be valid by the test suite.

        It should be set to True for most estimators, except for those that
        receive non-validated objects as parameters, such as meta-estimators
        that are given estimator objects.

    Returns
    -------
    decorated_fit : method
        The decorated fit method.
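
    Examples
    --------
    An illustrative sketch; ``MyEstimator`` is hypothetical and only shows
    where the decorator sits relative to `fit`:

    >>> from sklearn.base import BaseEstimator, _fit_context
    >>> class MyEstimator(BaseEstimator):
    ...     _parameter_constraints = {"alpha": [float]}
    ...     def __init__(self, alpha=1.0):
    ...         self.alpha = alpha
    ...     @_fit_context(prefer_skip_nested_validation=True)
    ...     def fit(self, X=None, y=None):
    ...         # parameters have already been validated at this point
    ...         return self
    >>> _ = MyEstimator(alpha=2.0).fit()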
- """
- def decorator(fit_method):
- @functools.wraps(fit_method)
- def wrapper(estimator, *args, **kwargs):
- global_skip_validation = get_config()["skip_parameter_validation"]
- # we don't want to validate again for each call to partial_fit
- partial_fit_and_fitted = (
- fit_method.__name__ == "partial_fit" and _is_fitted(estimator)
- )
- if not global_skip_validation and not partial_fit_and_fitted:
- estimator._validate_params()
- with config_context(
- skip_parameter_validation=(
- prefer_skip_nested_validation or global_skip_validation
- )
- ):
- return fit_method(estimator, *args, **kwargs)
- return wrapper
- return decorator