- """
- The :mod:`sklearn.model_selection._search` includes utilities to fine-tune the
- parameters of an estimator.
- """
- # Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>,
- # Gael Varoquaux <gael.varoquaux@normalesup.org>
- # Andreas Mueller <amueller@ais.uni-bonn.de>
- # Olivier Grisel <olivier.grisel@ensta.org>
- # Raghav RV <rvraghav93@gmail.com>
- # License: BSD 3 clause
- import numbers
- import operator
- import time
- import warnings
- from abc import ABCMeta, abstractmethod
- from collections import defaultdict
- from collections.abc import Iterable, Mapping, Sequence
- from functools import partial, reduce
- from itertools import product
- import numpy as np
- from numpy.ma import MaskedArray
- from scipy.stats import rankdata
- from ..base import BaseEstimator, MetaEstimatorMixin, _fit_context, clone, is_classifier
- from ..exceptions import NotFittedError
- from ..metrics import check_scoring
- from ..metrics._scorer import _check_multimetric_scoring, get_scorer_names
- from ..utils import check_random_state
- from ..utils._param_validation import HasMethods, Interval, StrOptions
- from ..utils._tags import _safe_tags
- from ..utils.metaestimators import available_if
- from ..utils.parallel import Parallel, delayed
- from ..utils.random import sample_without_replacement
- from ..utils.validation import _check_fit_params, check_is_fitted, indexable
- from ._split import check_cv
- from ._validation import (
- _aggregate_score_dicts,
- _fit_and_score,
- _insert_error_scores,
- _normalize_score_results,
- _warn_or_raise_about_fit_failures,
- )
- __all__ = ["GridSearchCV", "ParameterGrid", "ParameterSampler", "RandomizedSearchCV"]
- class ParameterGrid:
- """Grid of parameters with a discrete number of values for each.
- Can be used to iterate over parameter value combinations with the
- Python built-in function iter.
- The order of the generated parameter combinations is deterministic.
- Read more in the :ref:`User Guide <grid_search>`.
- Parameters
- ----------
- param_grid : dict of str to sequence, or sequence of such
- The parameter grid to explore, as a dictionary mapping estimator
- parameters to sequences of allowed values.
- An empty dict signifies default parameters.
- A sequence of dicts signifies a sequence of grids to search, and is
- useful to avoid exploring parameter combinations that make no sense
- or have no effect. See the examples below.
- Examples
- --------
- >>> from sklearn.model_selection import ParameterGrid
- >>> param_grid = {'a': [1, 2], 'b': [True, False]}
- >>> list(ParameterGrid(param_grid)) == (
- ... [{'a': 1, 'b': True}, {'a': 1, 'b': False},
- ... {'a': 2, 'b': True}, {'a': 2, 'b': False}])
- True
- >>> grid = [{'kernel': ['linear']}, {'kernel': ['rbf'], 'gamma': [1, 10]}]
- >>> list(ParameterGrid(grid)) == [{'kernel': 'linear'},
- ... {'kernel': 'rbf', 'gamma': 1},
- ... {'kernel': 'rbf', 'gamma': 10}]
- True
- >>> ParameterGrid(grid)[1] == {'kernel': 'rbf', 'gamma': 1}
- True
- See Also
- --------
- GridSearchCV : Uses :class:`ParameterGrid` to perform a full parallelized
- parameter search.
- """
- def __init__(self, param_grid):
- if not isinstance(param_grid, (Mapping, Iterable)):
- raise TypeError(
- f"Parameter grid should be a dict or a list, got: {param_grid!r} of"
- f" type {type(param_grid).__name__}"
- )
- if isinstance(param_grid, Mapping):
- # wrap dictionary in a singleton list to support either dict
- # or list of dicts
- param_grid = [param_grid]
- # check if all entries are dictionaries of lists
- for grid in param_grid:
- if not isinstance(grid, dict):
- raise TypeError(f"Parameter grid is not a dict ({grid!r})")
- for key, value in grid.items():
- if isinstance(value, np.ndarray) and value.ndim > 1:
- raise ValueError(
- f"Parameter array for {key!r} should be one-dimensional, got:"
- f" {value!r} with shape {value.shape}"
- )
- if isinstance(value, str) or not isinstance(
- value, (np.ndarray, Sequence)
- ):
- raise TypeError(
- f"Parameter grid for parameter {key!r} needs to be a list or a"
- f" numpy array, but got {value!r} (of type "
- f"{type(value).__name__}) instead. Single values "
- "need to be wrapped in a list with one element."
- )
- if len(value) == 0:
- raise ValueError(
- f"Parameter grid for parameter {key!r} need "
- f"to be a non-empty sequence, got: {value!r}"
- )
- self.param_grid = param_grid
- def __iter__(self):
- """Iterate over the points in the grid.
- Returns
- -------
- params : iterator over dict of str to any
- Yields dictionaries mapping each estimator parameter to one of its
- allowed values.
- """
- for p in self.param_grid:
- # Always sort the keys of a dictionary, for reproducibility
- items = sorted(p.items())
- if not items:
- yield {}
- else:
- keys, values = zip(*items)
- for v in product(*values):
- params = dict(zip(keys, v))
- yield params
- def __len__(self):
- """Number of points on the grid."""
- # Product function that can handle iterables (np.prod can't).
- product = partial(reduce, operator.mul)
- return sum(
- product(len(v) for v in p.values()) if p else 1 for p in self.param_grid
- )
- def __getitem__(self, ind):
- """Get the parameters that would be ``ind``th in iteration
- Parameters
- ----------
- ind : int
- The iteration index
- Returns
- -------
- params : dict of str to any
- Equal to list(self)[ind]
- """
- # This is used to make discrete sampling without replacement memory
- # efficient.
- for sub_grid in self.param_grid:
- # XXX: could memoize information used here
- if not sub_grid:
- if ind == 0:
- return {}
- else:
- ind -= 1
- continue
- # Reverse so most frequent cycling parameter comes first
- keys, values_lists = zip(*sorted(sub_grid.items())[::-1])
- sizes = [len(v_list) for v_list in values_lists]
- total = np.prod(sizes)
- if ind >= total:
- # Try the next grid
- ind -= total
- else:
- out = {}
- for key, v_list, n in zip(keys, values_lists, sizes):
- ind, offset = divmod(ind, n)
- out[key] = v_list[offset]
- return out
- raise IndexError("ParameterGrid index out of range")
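- # Illustrative sketch: ``__len__`` and ``__getitem__`` let callers draw
- # candidates lazily instead of materialising the whole grid, which is what
- # ParameterSampler relies on for memory-efficient sampling without
- # replacement. The grid values are arbitrary examples.
- #
- #     >>> from sklearn.model_selection import ParameterGrid
- #     >>> grid = ParameterGrid({'a': [1, 2], 'b': [10, 20, 30]})
- #     >>> len(grid)
- #     6
- #     >>> grid[4] == list(grid)[4]
- #     True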
- class ParameterSampler:
- """Generator on parameters sampled from given distributions.
- Non-deterministic iterable over random candidate combinations for hyper-
- parameter search. If all parameters are presented as a list,
- sampling without replacement is performed. If at least one parameter
- is given as a distribution, sampling with replacement is used.
- It is highly recommended to use continuous distributions for continuous
- parameters.
- Read more in the :ref:`User Guide <grid_search>`.
- Parameters
- ----------
- param_distributions : dict
- Dictionary with parameter names (`str`) as keys and distributions
- or lists of parameters to try as values. Distributions must provide a ``rvs``
- method for sampling (such as those from scipy.stats.distributions).
- If a list is given, it is sampled uniformly.
- If a list of dicts is given, first a dict is sampled uniformly, and
- then a parameter is sampled using that dict as above.
- n_iter : int
- Number of parameter settings that are produced.
- random_state : int, RandomState instance or None, default=None
- Pseudo random number generator state used for random uniform sampling
- from lists of possible values instead of scipy.stats distributions.
- Pass an int for reproducible output across multiple
- function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- params : dict of str to any
- **Yields** dictionaries mapping each estimator parameter to
- a sampled value.
- Examples
- --------
- >>> from sklearn.model_selection import ParameterSampler
- >>> from scipy.stats.distributions import expon
- >>> import numpy as np
- >>> rng = np.random.RandomState(0)
- >>> param_grid = {'a':[1, 2], 'b': expon()}
- >>> param_list = list(ParameterSampler(param_grid, n_iter=4,
- ... random_state=rng))
- >>> rounded_list = [dict((k, round(v, 6)) for (k, v) in d.items())
- ... for d in param_list]
- >>> rounded_list == [{'b': 0.89856, 'a': 1},
- ... {'b': 0.923223, 'a': 1},
- ... {'b': 1.878964, 'a': 2},
- ... {'b': 1.038159, 'a': 2}]
- True
- """
- def __init__(self, param_distributions, n_iter, *, random_state=None):
- if not isinstance(param_distributions, (Mapping, Iterable)):
- raise TypeError(
- "Parameter distribution is not a dict or a list,"
- f" got: {param_distributions!r} of type "
- f"{type(param_distributions).__name__}"
- )
- if isinstance(param_distributions, Mapping):
- # wrap dictionary in a singleton list to support either dict
- # or list of dicts
- param_distributions = [param_distributions]
- for dist in param_distributions:
- if not isinstance(dist, dict):
- raise TypeError(
- "Parameter distribution is not a dict ({!r})".format(dist)
- )
- for key in dist:
- if not isinstance(dist[key], Iterable) and not hasattr(
- dist[key], "rvs"
- ):
- raise TypeError(
- f"Parameter grid for parameter {key!r} is not iterable "
- f"or a distribution (value={dist[key]})"
- )
- self.n_iter = n_iter
- self.random_state = random_state
- self.param_distributions = param_distributions
- def _is_all_lists(self):
- return all(
- all(not hasattr(v, "rvs") for v in dist.values())
- for dist in self.param_distributions
- )
- def __iter__(self):
- rng = check_random_state(self.random_state)
- # if all distributions are given as lists, we want to sample without
- # replacement
- if self._is_all_lists():
- # look up sampled parameter settings in parameter grid
- param_grid = ParameterGrid(self.param_distributions)
- grid_size = len(param_grid)
- n_iter = self.n_iter
- if grid_size < n_iter:
- warnings.warn(
- "The total space of parameters %d is smaller "
- "than n_iter=%d. Running %d iterations. For exhaustive "
- "searches, use GridSearchCV." % (grid_size, self.n_iter, grid_size),
- UserWarning,
- )
- n_iter = grid_size
- for i in sample_without_replacement(grid_size, n_iter, random_state=rng):
- yield param_grid[i]
- else:
- for _ in range(self.n_iter):
- dist = rng.choice(self.param_distributions)
- # Always sort the keys of a dictionary, for reproducibility
- items = sorted(dist.items())
- params = dict()
- for k, v in items:
- if hasattr(v, "rvs"):
- params[k] = v.rvs(random_state=rng)
- else:
- params[k] = v[rng.randint(len(v))]
- yield params
- def __len__(self):
- """Number of points that will be sampled."""
- if self._is_all_lists():
- grid_size = len(ParameterGrid(self.param_distributions))
- return min(self.n_iter, grid_size)
- else:
- return self.n_iter
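- # Illustrative sketch: when every parameter is given as a list, sampling is
- # done without replacement over the underlying ParameterGrid, so the length
- # is capped at the grid size even if ``n_iter`` is larger (a UserWarning is
- # emitted on iteration in that case). The grid below is an arbitrary example.
- #
- #     >>> from sklearn.model_selection import ParameterSampler
- #     >>> sampler = ParameterSampler({'a': [1, 2]}, n_iter=5, random_state=0)
- #     >>> len(sampler)
- #     2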
- def _check_refit(search_cv, attr):
- if not search_cv.refit:
- raise AttributeError(
- f"This {type(search_cv).__name__} instance was initialized with "
- f"`refit=False`. {attr} is available only after refitting on the best "
- "parameters. You can refit an estimator manually using the "
- "`best_params_` attribute"
- )
- def _estimator_has(attr):
- """Check if we can delegate a method to the underlying estimator.
- Calling a prediction method is only possible if `refit=True`. In
- that case, we first check the fitted best estimator. If it is not
- fitted, we check the unfitted estimator.
- Checking the unfitted estimator allows using `hasattr` on the `SearchCV`
- instance even before calling `fit`.
- """
- def check(self):
- _check_refit(self, attr)
- if hasattr(self, "best_estimator_"):
- # raise an AttributeError if `attr` does not exist
- getattr(self.best_estimator_, attr)
- return True
- # raise an AttributeError if `attr` does not exist
- getattr(self.estimator, attr)
- return True
- return check
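- # Illustrative sketch of the delegation implemented by `_estimator_has`:
- # ``hasattr`` works on an unfitted search object and mirrors what the wrapped
- # estimator exposes. LogisticRegression is just an example estimator.
- #
- #     >>> from sklearn.linear_model import LogisticRegression
- #     >>> from sklearn.model_selection import GridSearchCV
- #     >>> search = GridSearchCV(LogisticRegression(), {'C': [1, 10]})
- #     >>> hasattr(search, 'predict_proba')
- #     True
- #     >>> hasattr(search, 'transform')
- #     False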
- class BaseSearchCV(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta):
- """Abstract base class for hyper parameter search with cross-validation."""
- _parameter_constraints: dict = {
- "estimator": [HasMethods(["fit"])],
- "scoring": [
- StrOptions(set(get_scorer_names())),
- callable,
- list,
- tuple,
- dict,
- None,
- ],
- "n_jobs": [numbers.Integral, None],
- "refit": ["boolean", str, callable],
- "cv": ["cv_object"],
- "verbose": ["verbose"],
- "pre_dispatch": [numbers.Integral, str],
- "error_score": [StrOptions({"raise"}), numbers.Real],
- "return_train_score": ["boolean"],
- }
- @abstractmethod
- def __init__(
- self,
- estimator,
- *,
- scoring=None,
- n_jobs=None,
- refit=True,
- cv=None,
- verbose=0,
- pre_dispatch="2*n_jobs",
- error_score=np.nan,
- return_train_score=True,
- ):
- self.scoring = scoring
- self.estimator = estimator
- self.n_jobs = n_jobs
- self.refit = refit
- self.cv = cv
- self.verbose = verbose
- self.pre_dispatch = pre_dispatch
- self.error_score = error_score
- self.return_train_score = return_train_score
- @property
- def _estimator_type(self):
- return self.estimator._estimator_type
- def _more_tags(self):
- # allows cross-validation to see 'precomputed' metrics
- return {
- "pairwise": _safe_tags(self.estimator, "pairwise"),
- "_xfail_checks": {
- "check_supervised_y_2d": "DataConversionWarning not caught"
- },
- }
- def score(self, X, y=None):
- """Return the score on the given data, if the estimator has been refit.
- This uses the score defined by ``scoring`` where provided, and the
- ``best_estimator_.score`` method otherwise.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- Input data, where `n_samples` is the number of samples and
- `n_features` is the number of features.
- y : array-like of shape (n_samples, n_output) \
- or (n_samples,), default=None
- Target relative to X for classification or regression;
- None for unsupervised learning.
- Returns
- -------
- score : float
- The score defined by ``scoring`` if provided, and the
- ``best_estimator_.score`` method otherwise.
- """
- _check_refit(self, "score")
- check_is_fitted(self)
- if self.scorer_ is None:
- raise ValueError(
- "No score function explicitly defined, "
- "and the estimator doesn't provide one %s"
- % self.best_estimator_
- )
- if isinstance(self.scorer_, dict):
- if self.multimetric_:
- scorer = self.scorer_[self.refit]
- else:
- scorer = self.scorer_
- return scorer(self.best_estimator_, X, y)
- # callable
- score = self.scorer_(self.best_estimator_, X, y)
- if self.multimetric_:
- score = score[self.refit]
- return score
- @available_if(_estimator_has("score_samples"))
- def score_samples(self, X):
- """Call score_samples on the estimator with the best found parameters.
- Only available if ``refit=True`` and the underlying estimator supports
- ``score_samples``.
- .. versionadded:: 0.24
- Parameters
- ----------
- X : iterable
- Data to predict on. Must fulfill input requirements
- of the underlying estimator.
- Returns
- -------
- y_score : ndarray of shape (n_samples,)
- The ``best_estimator_.score_samples`` method.
- """
- check_is_fitted(self)
- return self.best_estimator_.score_samples(X)
- @available_if(_estimator_has("predict"))
- def predict(self, X):
- """Call predict on the estimator with the best found parameters.
- Only available if ``refit=True`` and the underlying estimator supports
- ``predict``.
- Parameters
- ----------
- X : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
- Returns
- -------
- y_pred : ndarray of shape (n_samples,)
- The predicted labels or values for `X` based on the estimator with
- the best found parameters.
- """
- check_is_fitted(self)
- return self.best_estimator_.predict(X)
- @available_if(_estimator_has("predict_proba"))
- def predict_proba(self, X):
- """Call predict_proba on the estimator with the best found parameters.
- Only available if ``refit=True`` and the underlying estimator supports
- ``predict_proba``.
- Parameters
- ----------
- X : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
- Returns
- -------
- y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes)
- Predicted class probabilities for `X` based on the estimator with
- the best found parameters. The order of the classes corresponds
- to that in the fitted attribute :term:`classes_`.
- """
- check_is_fitted(self)
- return self.best_estimator_.predict_proba(X)
- @available_if(_estimator_has("predict_log_proba"))
- def predict_log_proba(self, X):
- """Call predict_log_proba on the estimator with the best found parameters.
- Only available if ``refit=True`` and the underlying estimator supports
- ``predict_log_proba``.
- Parameters
- ----------
- X : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
- Returns
- -------
- y_pred : ndarray of shape (n_samples,) or (n_samples, n_classes)
- Predicted class log-probabilities for `X` based on the estimator
- with the best found parameters. The order of the classes
- corresponds to that in the fitted attribute :term:`classes_`.
- """
- check_is_fitted(self)
- return self.best_estimator_.predict_log_proba(X)
- @available_if(_estimator_has("decision_function"))
- def decision_function(self, X):
- """Call decision_function on the estimator with the best found parameters.
- Only available if ``refit=True`` and the underlying estimator supports
- ``decision_function``.
- Parameters
- ----------
- X : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
- Returns
- -------
- y_score : ndarray of shape (n_samples,) or (n_samples, n_classes) \
- or (n_samples, n_classes * (n_classes-1) / 2)
- Result of the decision function for `X` based on the estimator with
- the best found parameters.
- """
- check_is_fitted(self)
- return self.best_estimator_.decision_function(X)
- @available_if(_estimator_has("transform"))
- def transform(self, X):
- """Call transform on the estimator with the best found parameters.
- Only available if the underlying estimator supports ``transform`` and
- ``refit=True``.
- Parameters
- ----------
- X : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
- Returns
- -------
- Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
- `X` transformed in the new space based on the estimator with
- the best found parameters.
- """
- check_is_fitted(self)
- return self.best_estimator_.transform(X)
- @available_if(_estimator_has("inverse_transform"))
- def inverse_transform(self, Xt):
- """Call inverse_transform on the estimator with the best found params.
- Only available if the underlying estimator implements
- ``inverse_transform`` and ``refit=True``.
- Parameters
- ----------
- Xt : indexable, length n_samples
- Must fulfill the input assumptions of the
- underlying estimator.
- Returns
- -------
- X : {ndarray, sparse matrix} of shape (n_samples, n_features)
- Result of the `inverse_transform` function for `Xt` based on the
- estimator with the best found parameters.
- """
- check_is_fitted(self)
- return self.best_estimator_.inverse_transform(Xt)
- @property
- def n_features_in_(self):
- """Number of features seen during :term:`fit`.
- Only available when `refit=True`.
- """
- # For consistency with other estimators we raise an AttributeError so
- # that hasattr() fails if the search estimator isn't fitted.
- try:
- check_is_fitted(self)
- except NotFittedError as nfe:
- raise AttributeError(
- "{} object has no n_features_in_ attribute.".format(
- self.__class__.__name__
- )
- ) from nfe
- return self.best_estimator_.n_features_in_
- @property
- def classes_(self):
- """Class labels.
- Only available when `refit=True` and the estimator is a classifier.
- """
- _estimator_has("classes_")(self)
- return self.best_estimator_.classes_
- def _run_search(self, evaluate_candidates):
- """Repeatedly calls `evaluate_candidates` to conduct a search.
- This method, implemented in sub-classes, makes it possible to
- customize the scheduling of evaluations: GridSearchCV and
- RandomizedSearchCV schedule evaluations for their whole parameter
- search space at once but other more sequential approaches are also
- possible: for instance it is possible to iteratively schedule evaluations
- for new regions of the parameter search space based on previously
- collected evaluation results. This makes it possible to implement
- Bayesian optimization or more generally sequential model-based
- optimization by deriving from the BaseSearchCV abstract base class.
- For example, Successive Halving is implemented by calling
- `evaluate_candidates` multiples times (once per iteration of the SH
- process), each time passing a different set of candidates with `X`
- and `y` of increasing sizes.
- Parameters
- ----------
- evaluate_candidates : callable
- This callback accepts:
- - a list of candidates, where each candidate is a dict of
- parameter settings.
- - an optional `cv` parameter which can be used to e.g.
- evaluate candidates on different dataset splits, or
- evaluate candidates on subsampled data (as done in the
- SuccessiveHalving estimators). By default, the original `cv`
- parameter is used, and it is available as a private
- `_checked_cv_orig` attribute.
- - an optional `more_results` dict. Each key will be added to
- the `cv_results_` attribute. Values should be lists of
- length `n_candidates`
- It returns a dict of all results so far, formatted like
- ``cv_results_``.
- Important note (relevant whether the default cv is used or not):
- in randomized splitters, and unless the random_state parameter of
- cv was set to an int, calling cv.split() multiple times will
- yield different splits. Since cv.split() is called in
- evaluate_candidates, this means that candidates will be evaluated
- on different splits each time evaluate_candidates is called. This
- might be a methodological issue depending on the search strategy
- that you're implementing. To prevent randomized splitters from
- being used, you may use _split._yields_constant_splits()
- Examples
- --------
- ::
- def _run_search(self, evaluate_candidates):
- 'Try C=0.1 only if C=1 is better than C=10'
- all_results = evaluate_candidates([{'C': 1}, {'C': 10}])
- score = all_results['mean_test_score']
- if score[0] < score[1]:
- evaluate_candidates([{'C': 0.1}])
- """
- raise NotImplementedError("_run_search not implemented.")
- def _check_refit_for_multimetric(self, scores):
- """Check `refit` is compatible with `scores` is valid"""
- multimetric_refit_msg = (
- "For multi-metric scoring, the parameter refit must be set to a "
- "scorer key or a callable to refit an estimator with the best "
- "parameter setting on the whole data and make the best_* "
- "attributes available for that metric. If this is not needed, "
- f"refit should be set to False explicitly. {self.refit!r} was "
- "passed."
- )
- valid_refit_dict = isinstance(self.refit, str) and self.refit in scores
- if (
- self.refit is not False
- and not valid_refit_dict
- and not callable(self.refit)
- ):
- raise ValueError(multimetric_refit_msg)
- @staticmethod
- def _select_best_index(refit, refit_metric, results):
- """Select index of the best combination of hyperparemeters."""
- if callable(refit):
- # If callable, refit is expected to return the index of the best
- # parameter set.
- best_index = refit(results)
- if not isinstance(best_index, numbers.Integral):
- raise TypeError("best_index_ returned is not an integer")
- if best_index < 0 or best_index >= len(results["params"]):
- raise IndexError("best_index_ index out of range")
- else:
- best_index = results[f"rank_test_{refit_metric}"].argmin()
- return best_index
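- # Illustrative sketch of a callable ``refit`` that satisfies the contract
- # checked above: it receives ``cv_results_`` and must return an integer index
- # into ``cv_results_['params']``. This toy rule picks the candidate with the
- # highest mean test score, which matches the default selection when all fits
- # succeed; with a callable ``refit``, ``best_score_`` is not set.
- #
- #     >>> import numpy as np
- #     >>> def pick_best_index(cv_results):
- #     ...     return int(np.argmax(cv_results["mean_test_score"]))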
- @_fit_context(
- # *SearchCV.estimator is not validated yet
- prefer_skip_nested_validation=False
- )
- def fit(self, X, y=None, *, groups=None, **fit_params):
- """Run fit with all sets of parameters.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- Training vector, where `n_samples` is the number of samples and
- `n_features` is the number of features.
- y : array-like of shape (n_samples, n_output) \
- or (n_samples,), default=None
- Target relative to X for classification or regression;
- None for unsupervised learning.
- groups : array-like of shape (n_samples,), default=None
- Group labels for the samples used while splitting the dataset into
- train/test set. Only used in conjunction with a "Group" :term:`cv`
- instance (e.g., :class:`~sklearn.model_selection.GroupKFold`).
- **fit_params : dict of str -> object
- Parameters passed to the `fit` method of the estimator.
- If a fit parameter is an array-like whose length is equal to
- `num_samples` then it will be split across CV groups along with `X`
- and `y`. For example, the :term:`sample_weight` parameter is split
- because `len(sample_weights) = len(X)`.
- Returns
- -------
- self : object
- Instance of fitted estimator.
- """
- estimator = self.estimator
- refit_metric = "score"
- if callable(self.scoring):
- scorers = self.scoring
- elif self.scoring is None or isinstance(self.scoring, str):
- scorers = check_scoring(self.estimator, self.scoring)
- else:
- scorers = _check_multimetric_scoring(self.estimator, self.scoring)
- self._check_refit_for_multimetric(scorers)
- refit_metric = self.refit
- X, y, groups = indexable(X, y, groups)
- fit_params = _check_fit_params(X, fit_params)
- cv_orig = check_cv(self.cv, y, classifier=is_classifier(estimator))
- n_splits = cv_orig.get_n_splits(X, y, groups)
- base_estimator = clone(self.estimator)
- parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch)
- fit_and_score_kwargs = dict(
- scorer=scorers,
- fit_params=fit_params,
- return_train_score=self.return_train_score,
- return_n_test_samples=True,
- return_times=True,
- return_parameters=False,
- error_score=self.error_score,
- verbose=self.verbose,
- )
- results = {}
- with parallel:
- all_candidate_params = []
- all_out = []
- all_more_results = defaultdict(list)
- def evaluate_candidates(candidate_params, cv=None, more_results=None):
- cv = cv or cv_orig
- candidate_params = list(candidate_params)
- n_candidates = len(candidate_params)
- if self.verbose > 0:
- print(
- "Fitting {0} folds for each of {1} candidates,"
- " totalling {2} fits".format(
- n_splits, n_candidates, n_candidates * n_splits
- )
- )
- out = parallel(
- delayed(_fit_and_score)(
- clone(base_estimator),
- X,
- y,
- train=train,
- test=test,
- parameters=parameters,
- split_progress=(split_idx, n_splits),
- candidate_progress=(cand_idx, n_candidates),
- **fit_and_score_kwargs,
- )
- for (cand_idx, parameters), (split_idx, (train, test)) in product(
- enumerate(candidate_params), enumerate(cv.split(X, y, groups))
- )
- )
- if len(out) < 1:
- raise ValueError(
- "No fits were performed. "
- "Was the CV iterator empty? "
- "Were there no candidates?"
- )
- elif len(out) != n_candidates * n_splits:
- raise ValueError(
- "cv.split and cv.get_n_splits returned "
- "inconsistent results. Expected {} "
- "splits, got {}".format(n_splits, len(out) // n_candidates)
- )
- _warn_or_raise_about_fit_failures(out, self.error_score)
- # For callable self.scoring, the return type is only known after
- # calling. If the return type is a dictionary, the error scores
- # can now be inserted with the correct key. The type checking
- # of out will be done in `_insert_error_scores`.
- if callable(self.scoring):
- _insert_error_scores(out, self.error_score)
- all_candidate_params.extend(candidate_params)
- all_out.extend(out)
- if more_results is not None:
- for key, value in more_results.items():
- all_more_results[key].extend(value)
- nonlocal results
- results = self._format_results(
- all_candidate_params, n_splits, all_out, all_more_results
- )
- return results
- self._run_search(evaluate_candidates)
- # multimetric is determined here because in the case of a callable
- # self.scoring the return type is only known after calling
- first_test_score = all_out[0]["test_scores"]
- self.multimetric_ = isinstance(first_test_score, dict)
- # check refit_metric now for a callable scorer that is multimetric
- if callable(self.scoring) and self.multimetric_:
- self._check_refit_for_multimetric(first_test_score)
- refit_metric = self.refit
- # For multi-metric evaluation, store the best_index_, best_params_ and
- # best_score_ iff refit is one of the scorer names
- # In single metric evaluation, refit_metric is "score"
- if self.refit or not self.multimetric_:
- self.best_index_ = self._select_best_index(
- self.refit, refit_metric, results
- )
- if not callable(self.refit):
- # With a non-custom callable, we can select the best score
- # based on the best index
- self.best_score_ = results[f"mean_test_{refit_metric}"][
- self.best_index_
- ]
- self.best_params_ = results["params"][self.best_index_]
- if self.refit:
- # here we clone the estimator as well as the parameters, since
- # sometimes the parameters themselves might be estimators, e.g.
- # when we search over different estimators in a pipeline.
- # ref: https://github.com/scikit-learn/scikit-learn/pull/26786
- self.best_estimator_ = clone(base_estimator).set_params(
- **clone(self.best_params_, safe=False)
- )
- refit_start_time = time.time()
- if y is not None:
- self.best_estimator_.fit(X, y, **fit_params)
- else:
- self.best_estimator_.fit(X, **fit_params)
- refit_end_time = time.time()
- self.refit_time_ = refit_end_time - refit_start_time
- if hasattr(self.best_estimator_, "feature_names_in_"):
- self.feature_names_in_ = self.best_estimator_.feature_names_in_
- # Store the only scorer not as a dict for single metric evaluation
- self.scorer_ = scorers
- self.cv_results_ = results
- self.n_splits_ = n_splits
- return self
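- # Illustrative sketch of the ``**fit_params`` routing described in ``fit``:
- # an array-like of length ``n_samples`` (such as ``sample_weight``) is sliced
- # per CV split together with ``X`` and ``y``. The data, estimator and grid
- # below are arbitrary examples.
- #
- #     >>> import numpy as np
- #     >>> from sklearn.datasets import make_classification
- #     >>> from sklearn.linear_model import LogisticRegression
- #     >>> from sklearn.model_selection import GridSearchCV
- #     >>> X, y = make_classification(n_samples=60, random_state=0)
- #     >>> weights = np.ones(len(X))
- #     >>> search = GridSearchCV(LogisticRegression(), {'C': [0.1, 1.0]}, cv=3)
- #     >>> search = search.fit(X, y, sample_weight=weights)
- #     >>> search.best_params_['C'] in (0.1, 1.0)
- #     True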
- def _format_results(self, candidate_params, n_splits, out, more_results=None):
- n_candidates = len(candidate_params)
- out = _aggregate_score_dicts(out)
- results = dict(more_results or {})
- for key, val in results.items():
- # each value is a list (as per the `evaluate_candidates` convention)
- # we convert it to an array for consistency with the other keys
- results[key] = np.asarray(val)
- def _store(key_name, array, weights=None, splits=False, rank=False):
- """A small helper to store the scores/times to the cv_results_"""
- # When iterated first by splits, then by parameters
- # We want `array` to have `n_candidates` rows and `n_splits` cols.
- array = np.array(array, dtype=np.float64).reshape(n_candidates, n_splits)
- if splits:
- for split_idx in range(n_splits):
- # Uses closure to alter the results
- results["split%d_%s" % (split_idx, key_name)] = array[:, split_idx]
- array_means = np.average(array, axis=1, weights=weights)
- results["mean_%s" % key_name] = array_means
- if key_name.startswith(("train_", "test_")) and np.any(
- ~np.isfinite(array_means)
- ):
- warnings.warn(
- (
- f"One or more of the {key_name.split('_')[0]} scores "
- f"are non-finite: {array_means}"
- ),
- category=UserWarning,
- )
- # Weighted std is not directly available in numpy
- array_stds = np.sqrt(
- np.average(
- (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights
- )
- )
- results["std_%s" % key_name] = array_stds
- if rank:
- # When the fit/scoring fails `array_means` contains NaNs, we
- # will exclude them from the ranking process and consider them
- # as tied with the worst performers.
- if np.isnan(array_means).all():
- # All fit/scoring routines failed.
- rank_result = np.ones_like(array_means, dtype=np.int32)
- else:
- min_array_means = np.nanmin(array_means) - 1
- array_means = np.nan_to_num(array_means, nan=min_array_means)
- rank_result = rankdata(-array_means, method="min").astype(
- np.int32, copy=False
- )
- results["rank_%s" % key_name] = rank_result
- _store("fit_time", out["fit_time"])
- _store("score_time", out["score_time"])
- # Use one MaskedArray and mask all the places where the param is not
- # applicable for that candidate. Use defaultdict as each candidate may
- # not contain all the params
- param_results = defaultdict(
- partial(
- MaskedArray,
- np.empty(
- n_candidates,
- ),
- mask=True,
- dtype=object,
- )
- )
- for cand_idx, params in enumerate(candidate_params):
- for name, value in params.items():
- # An all masked empty array gets created for the key
- # `"param_%s" % name` at the first occurrence of `name`.
- # Setting the value at an index also unmasks that index
- param_results["param_%s" % name][cand_idx] = value
- results.update(param_results)
- # Store a list of param dicts at the key 'params'
- results["params"] = candidate_params
- test_scores_dict = _normalize_score_results(out["test_scores"])
- if self.return_train_score:
- train_scores_dict = _normalize_score_results(out["train_scores"])
- for scorer_name in test_scores_dict:
- # Compute the (weighted) mean and std for test scores alone
- _store(
- "test_%s" % scorer_name,
- test_scores_dict[scorer_name],
- splits=True,
- rank=True,
- weights=None,
- )
- if self.return_train_score:
- _store(
- "train_%s" % scorer_name,
- train_scores_dict[scorer_name],
- splits=True,
- )
- return results
- class GridSearchCV(BaseSearchCV):
- """Exhaustive search over specified parameter values for an estimator.
- Important members are fit, predict.
- GridSearchCV implements a "fit" and a "score" method.
- It also implements "score_samples", "predict", "predict_proba",
- "decision_function", "transform" and "inverse_transform" if they are
- implemented in the estimator used.
- The parameters of the estimator used to apply these methods are optimized
- by cross-validated grid-search over a parameter grid.
- Read more in the :ref:`User Guide <grid_search>`.
- Parameters
- ----------
- estimator : estimator object
- This is assumed to implement the scikit-learn estimator interface.
- Either estimator needs to provide a ``score`` function,
- or ``scoring`` must be passed.
- param_grid : dict or list of dictionaries
- Dictionary with parameter names (`str`) as keys and lists of
- parameter settings to try as values, or a list of such
- dictionaries, in which case the grids spanned by each dictionary
- in the list are explored. This enables searching over any sequence
- of parameter settings.
- scoring : str, callable, list, tuple or dict, default=None
- Strategy to evaluate the performance of the cross-validated model on
- the test set.
- If `scoring` represents a single score, one can use:
- - a single string (see :ref:`scoring_parameter`);
- - a callable (see :ref:`scoring`) that returns a single value.
- If `scoring` represents multiple scores, one can use:
- - a list or tuple of unique strings;
- - a callable returning a dictionary where the keys are the metric
- names and the values are the metric scores;
- - a dictionary with metric names as keys and callables as values.
- See :ref:`multimetric_grid_search` for an example.
- n_jobs : int, default=None
- Number of jobs to run in parallel.
- ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
- ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
- for more details.
- .. versionchanged:: v0.20
- `n_jobs` default changed from 1 to None
- refit : bool, str, or callable, default=True
- Refit an estimator using the best found parameters on the whole
- dataset.
- For multiple metric evaluation, this needs to be a `str` denoting the
- scorer that would be used to find the best parameters for refitting
- the estimator at the end.
- Where there are considerations other than maximum score in
- choosing a best estimator, ``refit`` can be set to a function which
- returns the selected ``best_index_`` given ``cv_results_``. In that
- case, the ``best_estimator_`` and ``best_params_`` will be set
- according to the returned ``best_index_`` while the ``best_score_``
- attribute will not be available.
- The refitted estimator is made available at the ``best_estimator_``
- attribute and permits using ``predict`` directly on this
- ``GridSearchCV`` instance.
- Also for multiple metric evaluation, the attributes ``best_index_``,
- ``best_score_`` and ``best_params_`` will only be available if
- ``refit`` is set and all of them will be determined w.r.t this specific
- scorer.
- See ``scoring`` parameter to know more about multiple metric
- evaluation.
- See :ref:`sphx_glr_auto_examples_model_selection_plot_grid_search_digits.py`
- to see how to design a custom selection strategy using a callable
- via `refit`.
- .. versionchanged:: 0.20
- Support for callable added.
- cv : int, cross-validation generator or an iterable, default=None
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
- - None, to use the default 5-fold cross validation,
- - integer, to specify the number of folds in a `(Stratified)KFold`,
- - :term:`CV splitter`,
- - An iterable yielding (train, test) splits as arrays of indices.
- For integer/None inputs, if the estimator is a classifier and ``y`` is
- either binary or multiclass, :class:`StratifiedKFold` is used. In all
- other cases, :class:`KFold` is used. These splitters are instantiated
- with `shuffle=False` so the splits will be the same across calls.
- Refer to the :ref:`User Guide <cross_validation>` for the various
- cross-validation strategies that can be used here.
- .. versionchanged:: 0.22
- ``cv`` default value if None changed from 3-fold to 5-fold.
- verbose : int
- Controls the verbosity: the higher, the more messages.
- - >1 : the computation time for each fold and parameter candidate is
- displayed;
- - >2 : the score is also displayed;
- - >3 : the fold and candidate parameter indexes are also displayed
- together with the starting time of the computation.
- pre_dispatch : int, or str, default='2*n_jobs'
- Controls the number of jobs that get dispatched during parallel
- execution. Reducing this number can be useful to avoid an
- explosion of memory consumption when more jobs get dispatched
- than CPUs can process. This parameter can be:
- - None, in which case all the jobs are immediately
- created and spawned. Use this for lightweight and
- fast-running jobs, to avoid delays due to on-demand
- spawning of the jobs
- - An int, giving the exact number of total jobs that are
- spawned
- - A str, giving an expression as a function of n_jobs,
- as in '2*n_jobs'
- error_score : 'raise' or numeric, default=np.nan
- Value to assign to the score if an error occurs in estimator fitting.
- If set to 'raise', the error is raised. If a numeric value is given,
- FitFailedWarning is raised. This parameter does not affect the refit
- step, which will always raise the error.
- return_train_score : bool, default=False
- If ``False``, the ``cv_results_`` attribute will not include training
- scores.
- Computing training scores is used to get insights on how different
- parameter settings impact the overfitting/underfitting trade-off.
- However computing the scores on the training set can be computationally
- expensive and is not strictly required to select the parameters that
- yield the best generalization performance.
- .. versionadded:: 0.19
- .. versionchanged:: 0.21
- Default value was changed from ``True`` to ``False``
- Attributes
- ----------
- cv_results_ : dict of numpy (masked) ndarrays
- A dict with keys as column headers and values as columns, that can be
- imported into a pandas ``DataFrame``.
- For instance, the table below
- +------------+-----------+------------+-----------------+---+---------+
- |param_kernel|param_gamma|param_degree|split0_test_score|...|rank_t...|
- +============+===========+============+=================+===+=========+
- | 'poly' | -- | 2 | 0.80 |...| 2 |
- +------------+-----------+------------+-----------------+---+---------+
- | 'poly' | -- | 3 | 0.70 |...| 4 |
- +------------+-----------+------------+-----------------+---+---------+
- | 'rbf' | 0.1 | -- | 0.80 |...| 3 |
- +------------+-----------+------------+-----------------+---+---------+
- | 'rbf' | 0.2 | -- | 0.93 |...| 1 |
- +------------+-----------+------------+-----------------+---+---------+
- will be represented by a ``cv_results_`` dict of::
- {
- 'param_kernel': masked_array(data = ['poly', 'poly', 'rbf', 'rbf'],
- mask = [False False False False]...)
- 'param_gamma': masked_array(data = [-- -- 0.1 0.2],
- mask = [ True True False False]...),
- 'param_degree': masked_array(data = [2.0 3.0 -- --],
- mask = [False False True True]...),
- 'split0_test_score' : [0.80, 0.70, 0.80, 0.93],
- 'split1_test_score' : [0.82, 0.50, 0.70, 0.78],
- 'mean_test_score' : [0.81, 0.60, 0.75, 0.85],
- 'std_test_score' : [0.01, 0.10, 0.05, 0.08],
- 'rank_test_score' : [2, 4, 3, 1],
- 'split0_train_score' : [0.80, 0.92, 0.70, 0.93],
- 'split1_train_score' : [0.82, 0.55, 0.70, 0.87],
- 'mean_train_score' : [0.81, 0.74, 0.70, 0.90],
- 'std_train_score' : [0.01, 0.19, 0.00, 0.03],
- 'mean_fit_time' : [0.73, 0.63, 0.43, 0.49],
- 'std_fit_time' : [0.01, 0.02, 0.01, 0.01],
- 'mean_score_time' : [0.01, 0.06, 0.04, 0.04],
- 'std_score_time' : [0.00, 0.00, 0.00, 0.01],
- 'params' : [{'kernel': 'poly', 'degree': 2}, ...],
- }
- NOTE
- The key ``'params'`` is used to store a list of parameter
- settings dicts for all the parameter candidates.
- The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
- ``std_score_time`` are all in seconds.
- For multi-metric evaluation, the scores for all the scorers are
- available in the ``cv_results_`` dict at the keys ending with that
- scorer's name (``'_<scorer_name>'``) instead of ``'_score'`` shown
- above. ('split0_test_precision', 'mean_train_precision' etc.)
- best_estimator_ : estimator
- Estimator that was chosen by the search, i.e. estimator
- which gave highest score (or smallest loss if specified)
- on the left out data. Not available if ``refit=False``.
- See ``refit`` parameter for more information on allowed values.
- best_score_ : float
- Mean cross-validated score of the best_estimator
- For multi-metric evaluation, this is present only if ``refit`` is
- specified.
- This attribute is not available if ``refit`` is a function.
- best_params_ : dict
- Parameter setting that gave the best results on the hold out data.
- For multi-metric evaluation, this is present only if ``refit`` is
- specified.
- best_index_ : int
- The index (of the ``cv_results_`` arrays) which corresponds to the best
- candidate parameter setting.
- The dict at ``search.cv_results_['params'][search.best_index_]`` gives
- the parameter setting for the best model, that gives the highest
- mean score (``search.best_score_``).
- For multi-metric evaluation, this is present only if ``refit`` is
- specified.
- scorer_ : function or a dict
- Scorer function used on the held out data to choose the best
- parameters for the model.
- For multi-metric evaluation, this attribute holds the validated
- ``scoring`` dict which maps the scorer key to the scorer callable.
- n_splits_ : int
- The number of cross-validation splits (folds/iterations).
- refit_time_ : float
- Seconds used for refitting the best model on the whole dataset.
- This is present only if ``refit`` is not False.
- .. versionadded:: 0.20
- multimetric_ : bool
- Whether or not the scorers compute several metrics.
- classes_ : ndarray of shape (n_classes,)
- The class labels. This is present only if ``refit`` is specified and
- the underlying estimator is a classifier.
- n_features_in_ : int
- Number of features seen during :term:`fit`. Only defined if
- `best_estimator_` is defined (see the documentation for the `refit`
- parameter for more details) and that `best_estimator_` exposes
- `n_features_in_` when fit.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Only defined if
- `best_estimator_` is defined (see the documentation for the `refit`
- parameter for more details) and that `best_estimator_` exposes
- `feature_names_in_` when fit.
- .. versionadded:: 1.0
- See Also
- --------
- ParameterGrid : Generates all the combinations of a hyperparameter grid.
- train_test_split : Utility function to split the data into a development
- set usable for fitting a GridSearchCV instance and an evaluation set
- for its final evaluation.
- sklearn.metrics.make_scorer : Make a scorer from a performance metric or
- loss function.
- Notes
- -----
- The parameters selected are those that maximize the score of the left out
- data, unless an explicit score is passed in which case it is used instead.
- If `n_jobs` was set to a value higher than one, the data is copied for each
- point in the grid (and not `n_jobs` times). This is done for efficiency
- reasons if individual jobs take very little time, but may raise errors if
- the dataset is large and not enough memory is available. A workaround in
- this case is to set `pre_dispatch`. Then, the memory is copied only
- `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
- n_jobs`.
- Examples
- --------
- >>> from sklearn import svm, datasets
- >>> from sklearn.model_selection import GridSearchCV
- >>> iris = datasets.load_iris()
- >>> parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
- >>> svc = svm.SVC()
- >>> clf = GridSearchCV(svc, parameters)
- >>> clf.fit(iris.data, iris.target)
- GridSearchCV(estimator=SVC(),
- param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')})
- >>> sorted(clf.cv_results_.keys())
- ['mean_fit_time', 'mean_score_time', 'mean_test_score',...
- 'param_C', 'param_kernel', 'params',...
- 'rank_test_score', 'split0_test_score',...
- 'split2_test_score', ...
- 'std_fit_time', 'std_score_time', 'std_test_score']
- """
- _required_parameters = ["estimator", "param_grid"]
- _parameter_constraints: dict = {
- **BaseSearchCV._parameter_constraints,
- "param_grid": [dict, list],
- }
- def __init__(
- self,
- estimator,
- param_grid,
- *,
- scoring=None,
- n_jobs=None,
- refit=True,
- cv=None,
- verbose=0,
- pre_dispatch="2*n_jobs",
- error_score=np.nan,
- return_train_score=False,
- ):
- super().__init__(
- estimator=estimator,
- scoring=scoring,
- n_jobs=n_jobs,
- refit=refit,
- cv=cv,
- verbose=verbose,
- pre_dispatch=pre_dispatch,
- error_score=error_score,
- return_train_score=return_train_score,
- )
- self.param_grid = param_grid
- def _run_search(self, evaluate_candidates):
- """Search all candidates in param_grid"""
- evaluate_candidates(ParameterGrid(self.param_grid))
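- # Illustrative sketch of multi-metric scoring with GridSearchCV, reusing the
- # iris setup from the docstring above: ``refit`` names the metric used to
- # select ``best_params_``, and ``cv_results_`` can be loaded into a pandas
- # DataFrame. The metric names 'acc' and 'bacc' are arbitrary keys.
- #
- #     >>> import pandas as pd
- #     >>> from sklearn import svm, datasets
- #     >>> from sklearn.model_selection import GridSearchCV
- #     >>> iris = datasets.load_iris()
- #     >>> search = GridSearchCV(
- #     ...     svm.SVC(),
- #     ...     {'kernel': ('linear', 'rbf'), 'C': [1, 10]},
- #     ...     scoring={'acc': 'accuracy', 'bacc': 'balanced_accuracy'},
- #     ...     refit='acc',
- #     ... ).fit(iris.data, iris.target)
- #     >>> df = pd.DataFrame(search.cv_results_)
- #     >>> 'mean_test_acc' in df.columns and 'rank_test_bacc' in df.columns
- #     True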
- class RandomizedSearchCV(BaseSearchCV):
- """Randomized search on hyper parameters.
- RandomizedSearchCV implements a "fit" and a "score" method.
- It also implements "score_samples", "predict", "predict_proba",
- "decision_function", "transform" and "inverse_transform" if they are
- implemented in the estimator used.
- The parameters of the estimator used to apply these methods are optimized
- by cross-validated search over parameter settings.
- In contrast to GridSearchCV, not all parameter values are tried out, but
- rather a fixed number of parameter settings is sampled from the specified
- distributions. The number of parameter settings that are tried is
- given by n_iter.
- If all parameters are presented as a list,
- sampling without replacement is performed. If at least one parameter
- is given as a distribution, sampling with replacement is used.
- It is highly recommended to use continuous distributions for continuous
- parameters.
- Read more in the :ref:`User Guide <randomized_parameter_search>`.
- .. versionadded:: 0.14
- Parameters
- ----------
- estimator : estimator object
- An object of that type is instantiated for each grid point.
- This is assumed to implement the scikit-learn estimator interface.
- Either estimator needs to provide a ``score`` function,
- or ``scoring`` must be passed.
- param_distributions : dict or list of dicts
- Dictionary with parameter names (`str`) as keys and distributions
- or lists of parameters to try as values. Distributions must provide a ``rvs``
- method for sampling (such as those from scipy.stats.distributions).
- If a list is given, it is sampled uniformly.
- If a list of dicts is given, first a dict is sampled uniformly, and
- then a parameter is sampled using that dict as above.
- n_iter : int, default=10
- Number of parameter settings that are sampled. n_iter trades
- off runtime vs quality of the solution.
- scoring : str, callable, list, tuple or dict, default=None
- Strategy to evaluate the performance of the cross-validated model on
- the test set.
- If `scoring` represents a single score, one can use:
- - a single string (see :ref:`scoring_parameter`);
- - a callable (see :ref:`scoring`) that returns a single value.
- If `scoring` represents multiple scores, one can use:
- - a list or tuple of unique strings;
- - a callable returning a dictionary where the keys are the metric
- names and the values are the metric scores;
- - a dictionary with metric names as keys and callables as values.
- See :ref:`multimetric_grid_search` for an example.
- If None, the estimator's score method is used.
- n_jobs : int, default=None
- Number of jobs to run in parallel.
- ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
- ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
- for more details.
- .. versionchanged:: v0.20
- `n_jobs` default changed from 1 to None
- refit : bool, str, or callable, default=True
- Refit an estimator using the best found parameters on the whole
- dataset.
- For multiple metric evaluation, this needs to be a `str` denoting the
- scorer that would be used to find the best parameters for refitting
- the estimator at the end.
- Where there are considerations other than maximum score in
- choosing a best estimator, ``refit`` can be set to a function which
- returns the selected ``best_index_`` given ``cv_results_``. In that
- case, the ``best_estimator_`` and ``best_params_`` will be set
- according to the returned ``best_index_`` while the ``best_score_``
- attribute will not be available.
- The refitted estimator is made available at the ``best_estimator_``
- attribute and permits using ``predict`` directly on this
- ``RandomizedSearchCV`` instance.
- Also for multiple metric evaluation, the attributes ``best_index_``,
- ``best_score_`` and ``best_params_`` will only be available if
- ``refit`` is set and all of them will be determined w.r.t this specific
- scorer.
- See ``scoring`` parameter to know more about multiple metric
- evaluation.
- .. versionchanged:: 0.20
- Support for callable added.
- cv : int, cross-validation generator or an iterable, default=None
- Determines the cross-validation splitting strategy.
- Possible inputs for cv are:
- - None, to use the default 5-fold cross validation,
- - integer, to specify the number of folds in a `(Stratified)KFold`,
- - :term:`CV splitter`,
- - An iterable yielding (train, test) splits as arrays of indices.
- For integer/None inputs, if the estimator is a classifier and ``y`` is
- either binary or multiclass, :class:`StratifiedKFold` is used. In all
- other cases, :class:`KFold` is used. These splitters are instantiated
- with `shuffle=False` so the splits will be the same across calls.
- Refer to the :ref:`User Guide <cross_validation>` for the various
- cross-validation strategies that can be used here.
- .. versionchanged:: 0.22
- ``cv`` default value if None changed from 3-fold to 5-fold.
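- For example, to make fold shuffling explicit and reproducible, an
- instantiated splitter can be passed instead of an integer (a sketch)::
- >>> from sklearn.model_selection import StratifiedKFold
- >>> cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)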
- verbose : int, default=0
- Controls the verbosity: the higher, the more messages.
- - >1 : the computation time for each fold and parameter candidate is
- displayed;
- - >2 : the score is also displayed;
- - >3 : the fold and candidate parameter indexes are also displayed
- together with the starting time of the computation.
- pre_dispatch : int, or str, default='2*n_jobs'
- Controls the number of jobs that get dispatched during parallel
- execution. Reducing this number can be useful to avoid an
- explosion of memory consumption when more jobs get dispatched
- than CPUs can process. This parameter can be:
- - None, in which case all the jobs are immediately
- created and spawned. Use this for lightweight and
- fast-running jobs, to avoid delays due to on-demand
- spawning of the jobs
- - An int, giving the exact number of total jobs that are
- spawned
- - A str, giving an expression as a function of n_jobs,
- as in '2*n_jobs'
- random_state : int, RandomState instance or None, default=None
- Pseudo random number generator state used for random uniform sampling
- from lists of possible values instead of scipy.stats distributions.
- Pass an int for reproducible output across multiple
- function calls.
- See :term:`Glossary <random_state>`.
- error_score : 'raise' or numeric, default=np.nan
- Value to assign to the score if an error occurs in estimator fitting.
- If set to 'raise', the error is raised. If a numeric value is given,
- FitFailedWarning is raised. This parameter does not affect the refit
- step, which will always raise the error.
- return_train_score : bool, default=False
- If ``False``, the ``cv_results_`` attribute will not include training
- scores.
- Computing training scores is used to get insights on how different
- parameter settings impact the overfitting/underfitting trade-off.
- However computing the scores on the training set can be computationally
- expensive and is not strictly required to select the parameters that
- yield the best generalization performance.
- .. versionadded:: 0.19
- .. versionchanged:: 0.21
- Default value was changed from ``True`` to ``False``
- Attributes
- ----------
- cv_results_ : dict of numpy (masked) ndarrays
- A dict with keys as column headers and values as columns, that can be
- imported into a pandas ``DataFrame``.
- For instance, the table below
- +--------------+-------------+-------------------+---+---------------+
- | param_kernel | param_gamma | split0_test_score |...|rank_test_score|
- +==============+=============+===================+===+===============+
- | 'rbf' | 0.1 | 0.80 |...| 1 |
- +--------------+-------------+-------------------+---+---------------+
- | 'rbf' | 0.2 | 0.84 |...| 3 |
- +--------------+-------------+-------------------+---+---------------+
- | 'rbf' | 0.3 | 0.70 |...| 2 |
- +--------------+-------------+-------------------+---+---------------+
- will be represented by a ``cv_results_`` dict of::
- {
- 'param_kernel' : masked_array(data = ['rbf', 'rbf', 'rbf'],
- mask = False),
- 'param_gamma' : masked_array(data = [0.1, 0.2, 0.3], mask = False),
- 'split0_test_score' : [0.80, 0.84, 0.70],
- 'split1_test_score' : [0.82, 0.50, 0.70],
- 'mean_test_score' : [0.81, 0.67, 0.70],
- 'std_test_score' : [0.01, 0.24, 0.00],
- 'rank_test_score' : [1, 3, 2],
- 'split0_train_score' : [0.80, 0.92, 0.70],
- 'split1_train_score' : [0.82, 0.55, 0.70],
- 'mean_train_score' : [0.81, 0.74, 0.70],
- 'std_train_score' : [0.01, 0.19, 0.00],
- 'mean_fit_time' : [0.73, 0.63, 0.43],
- 'std_fit_time' : [0.01, 0.02, 0.01],
- 'mean_score_time' : [0.01, 0.06, 0.04],
- 'std_score_time' : [0.00, 0.00, 0.00],
- 'params' : [{'kernel' : 'rbf', 'gamma' : 0.1}, ...],
- }
- NOTE
- The key ``'params'`` is used to store a list of parameter
- settings dicts for all the parameter candidates.
- The ``mean_fit_time``, ``std_fit_time``, ``mean_score_time`` and
- ``std_score_time`` are all in seconds.
- For multi-metric evaluation, the scores for all the scorers are
- available in the ``cv_results_`` dict at the keys ending with that
- scorer's name (``'_<scorer_name>'``) instead of ``'_score'`` shown
- above. ('split0_test_precision', 'mean_train_precision' etc.)
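- Because every key maps to an equal-length column, the dict converts
- directly into a ``pandas`` ``DataFrame``; a sketch, assuming ``search``
- is an already-fitted ``RandomizedSearchCV`` instance::
- >>> import pandas as pd
- >>> results = pd.DataFrame(search.cv_results_)
- >>> results.sort_values('rank_test_score').head()  # doctest: +SKIP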
- best_estimator_ : estimator
- Estimator that was chosen by the search, i.e. the estimator
- which gave the highest score (or smallest loss if specified)
- on the left out data. Not available if ``refit=False``.
- For multi-metric evaluation, this attribute is present only if
- ``refit`` is specified.
- See ``refit`` parameter for more information on allowed values.
- best_score_ : float
- Mean cross-validated score of the ``best_estimator_``.
- For multi-metric evaluation, this is not available if ``refit`` is
- ``False``. See ``refit`` parameter for more information.
- This attribute is not available if ``refit`` is a function.
- best_params_ : dict
- Parameter setting that gave the best results on the hold out data.
- For multi-metric evaluation, this is not available if ``refit`` is
- ``False``. See ``refit`` parameter for more information.
- best_index_ : int
- The index (of the ``cv_results_`` arrays) which corresponds to the best
- candidate parameter setting.
- The dict at ``search.cv_results_['params'][search.best_index_]`` gives
- the parameter setting for the best model, i.e. the one that gives the highest
- mean score (``search.best_score_``).
- For multi-metric evaluation, this is not available if ``refit`` is
- ``False``. See ``refit`` parameter for more information.
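- In other words (a sketch, assuming a fitted single-metric ``search``
- with ``refit=True``)::
- >>> idx = search.best_index_
- >>> search.cv_results_['params'][idx] == search.best_params_
- True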
- scorer_ : function or a dict
- Scorer function used on the held out data to choose the best
- parameters for the model.
- For multi-metric evaluation, this attribute holds the validated
- ``scoring`` dict which maps the scorer key to the scorer callable.
- n_splits_ : int
- The number of cross-validation splits (folds/iterations).
- refit_time_ : float
- Seconds used for refitting the best model on the whole dataset.
- This is present only if ``refit`` is not False.
- .. versionadded:: 0.20
- multimetric_ : bool
- Whether or not the scorers compute several metrics.
- classes_ : ndarray of shape (n_classes,)
- The class labels. This is present only if ``refit`` is specified and
- the underlying estimator is a classifier.
- n_features_in_ : int
- Number of features seen during :term:`fit`. Only defined if
- `best_estimator_` is defined (see the documentation for the `refit`
- parameter for more details) and if `best_estimator_` exposes
- `n_features_in_` when fit.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Only defined if
- `best_estimator_` is defined (see the documentation for the `refit`
- parameter for more details) and if `best_estimator_` exposes
- `feature_names_in_` when fit.
- .. versionadded:: 1.0
- See Also
- --------
- GridSearchCV : Does exhaustive search over a grid of parameters.
- ParameterSampler : A generator over parameter settings, constructed from
- param_distributions.
- Notes
- -----
- The parameters selected are those that maximize the score of the held-out
- data, according to the scoring parameter.
- If `n_jobs` was set to a value higher than one, the data is copied for each
- parameter setting (and not `n_jobs` times). This is done for efficiency
- reasons if individual jobs take very little time, but may raise errors if
- the dataset is large and not enough memory is available. A workaround in
- this case is to set `pre_dispatch`. Then, the memory is copied only
- `pre_dispatch` many times. A reasonable value for `pre_dispatch` is `2 *
- n_jobs`.
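- A sketch of capping dispatch explicitly (the estimator and grid are
- purely illustrative)::
- >>> from sklearn.svm import SVC
- >>> search = RandomizedSearchCV(SVC(), {'C': [1, 10, 100]}, n_iter=3,
- ...                             n_jobs=2, pre_dispatch='2*n_jobs')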
- Examples
- --------
- >>> from sklearn.datasets import load_iris
- >>> from sklearn.linear_model import LogisticRegression
- >>> from sklearn.model_selection import RandomizedSearchCV
- >>> from scipy.stats import uniform
- >>> iris = load_iris()
- >>> logistic = LogisticRegression(solver='saga', tol=1e-2, max_iter=200,
- ... random_state=0)
- >>> distributions = dict(C=uniform(loc=0, scale=4),
- ... penalty=['l2', 'l1'])
- >>> clf = RandomizedSearchCV(logistic, distributions, random_state=0)
- >>> search = clf.fit(iris.data, iris.target)
- >>> search.best_params_
- {'C': 2..., 'penalty': 'l1'}
- """
- _required_parameters = ["estimator", "param_distributions"]
- _parameter_constraints: dict = {
- **BaseSearchCV._parameter_constraints,
- "param_distributions": [dict, list],
- "n_iter": [Interval(numbers.Integral, 1, None, closed="left")],
- "random_state": ["random_state"],
- }
- def __init__(
- self,
- estimator,
- param_distributions,
- *,
- n_iter=10,
- scoring=None,
- n_jobs=None,
- refit=True,
- cv=None,
- verbose=0,
- pre_dispatch="2*n_jobs",
- random_state=None,
- error_score=np.nan,
- return_train_score=False,
- ):
- self.param_distributions = param_distributions
- self.n_iter = n_iter
- self.random_state = random_state
- super().__init__(
- estimator=estimator,
- scoring=scoring,
- n_jobs=n_jobs,
- refit=refit,
- cv=cv,
- verbose=verbose,
- pre_dispatch=pre_dispatch,
- error_score=error_score,
- return_train_score=return_train_score,
- )
- def _run_search(self, evaluate_candidates):
- """Search n_iter candidates from param_distributions"""
- evaluate_candidates(
- ParameterSampler(
- self.param_distributions, self.n_iter, random_state=self.random_state
- )
- )
|