"""
The :mod:`sklearn.model_selection._split` module includes classes and
functions to split the data based on a preset strategy.
"""

# Author: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#         Gael Varoquaux <gael.varoquaux@normalesup.org>
#         Olivier Grisel <olivier.grisel@ensta.org>
#         Raghav RV <rvraghav93@gmail.com>
#         Leandro Hermida <hermidal@cs.umd.edu>
#         Rodion Martynov <marrodion@gmail.com>
# License: BSD 3 clause
- import numbers
- import warnings
- from abc import ABCMeta, abstractmethod
- from collections import defaultdict
- from collections.abc import Iterable
- from inspect import signature
- from itertools import chain, combinations
- from math import ceil, floor
- import numpy as np
- from scipy.special import comb
- from ..utils import (
- _approximate_mode,
- _safe_indexing,
- check_random_state,
- indexable,
- metadata_routing,
- )
- from ..utils._param_validation import Interval, RealNotInt, validate_params
- from ..utils.metadata_routing import _MetadataRequester
- from ..utils.multiclass import type_of_target
- from ..utils.validation import _num_samples, check_array, column_or_1d
- __all__ = [
- "BaseCrossValidator",
- "KFold",
- "GroupKFold",
- "LeaveOneGroupOut",
- "LeaveOneOut",
- "LeavePGroupsOut",
- "LeavePOut",
- "RepeatedStratifiedKFold",
- "RepeatedKFold",
- "ShuffleSplit",
- "GroupShuffleSplit",
- "StratifiedKFold",
- "StratifiedGroupKFold",
- "StratifiedShuffleSplit",
- "PredefinedSplit",
- "train_test_split",
- "check_cv",
- ]
class GroupsConsumerMixin(_MetadataRequester):
    """A Mixin to ``groups`` by default.

    This Mixin makes the object to request ``groups`` by default as ``True``.

    .. versionadded:: 1.3
    """

    # Metadata routing: splitters inheriting this mixin request the
    # ``groups`` argument to their ``split`` method by default.
    __metadata_request__split = {"groups": True}
class BaseCrossValidator(_MetadataRequester, metaclass=ABCMeta):
    """Base class for all cross-validators.

    Implementations must define `_iter_test_masks` or `_iter_test_indices`.
    """

    # This indicates that by default CV splitters don't have a "groups" kwarg,
    # unless indicated by inheriting from ``GroupsConsumerMixin``.
    # This also prevents ``set_split_request`` to be generated for splitters
    # which don't support ``groups``.
    __metadata_request__split = {"groups": metadata_routing.UNUSED}

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            The target variable for supervised learning problems.

        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        sample_indices = np.arange(_num_samples(X))
        for test_mask in self._iter_test_masks(X, y, groups):
            # Train = complement of the boolean test mask.
            yield sample_indices[~test_mask], sample_indices[test_mask]

    # Since subclasses must implement either _iter_test_masks or
    # _iter_test_indices, neither can be abstract.
    def _iter_test_masks(self, X=None, y=None, groups=None):
        """Generates boolean masks corresponding to test sets.

        By default, delegates to _iter_test_indices(X, y, groups)
        """
        n_samples = _num_samples(X)
        for test_index in self._iter_test_indices(X, y, groups):
            mask = np.zeros(n_samples, dtype=bool)
            mask[test_index] = True
            yield mask

    def _iter_test_indices(self, X=None, y=None, groups=None):
        """Generates integer indices corresponding to test sets."""
        raise NotImplementedError

    @abstractmethod
    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator"""

    def __repr__(self):
        return _build_repr(self)
class LeaveOneOut(BaseCrossValidator):
    """Leave-One-Out cross-validator

    Provides train/test indices to split data in train/test sets. Each
    sample is used once as a test set (singleton) while the remaining
    samples form the training set.

    Note: ``LeaveOneOut()`` is equivalent to ``KFold(n_splits=n)`` and
    ``LeavePOut(p=1)`` where ``n`` is the number of samples.

    Due to the high number of test sets (which is the same as the
    number of samples) this cross-validation method can be very costly.
    For large datasets one should favor :class:`KFold`, :class:`ShuffleSplit`
    or :class:`StratifiedKFold`.

    Read more in the :ref:`User Guide <leave_one_out>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import LeaveOneOut
    >>> X = np.array([[1, 2], [3, 4]])
    >>> y = np.array([1, 2])
    >>> loo = LeaveOneOut()
    >>> loo.get_n_splits(X)
    2
    >>> print(loo)
    LeaveOneOut()
    >>> for i, (train_index, test_index) in enumerate(loo.split(X)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test:  index={test_index}")
    Fold 0:
      Train: index=[1]
      Test:  index=[0]
    Fold 1:
      Train: index=[0]
      Test:  index=[1]

    See Also
    --------
    LeaveOneGroupOut : For splitting the data according to explicit,
        domain-specific stratification of the dataset.
    GroupKFold : K-fold iterator variant with non-overlapping groups.
    """

    def _iter_test_indices(self, X, y=None, groups=None):
        # Each sample index is, in turn, a singleton test set.
        n_samples = _num_samples(X)
        if n_samples > 1:
            return range(n_samples)
        raise ValueError(
            "Cannot perform LeaveOneOut with n_samples={}.".format(n_samples)
        )

    def get_n_splits(self, X, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        if X is None:
            raise ValueError("The 'X' parameter should not be None.")
        # One split per sample.
        return _num_samples(X)
class LeavePOut(BaseCrossValidator):
    """Leave-P-Out cross-validator

    Provides train/test indices to split data in train/test sets. This results
    in testing on all distinct samples of size p, while the remaining n - p
    samples form the training set in each iteration.

    Note: ``LeavePOut(p)`` is NOT equivalent to
    ``KFold(n_splits=n_samples // p)`` which creates non-overlapping test sets.

    Due to the high number of iterations which grows combinatorically with the
    number of samples this cross-validation method can be very costly. For
    large datasets one should favor :class:`KFold`, :class:`StratifiedKFold`
    or :class:`ShuffleSplit`.

    Read more in the :ref:`User Guide <leave_p_out>`.

    Parameters
    ----------
    p : int
        Size of the test sets. Must be strictly less than the number of
        samples.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import LeavePOut
    >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    >>> y = np.array([1, 2, 3, 4])
    >>> lpo = LeavePOut(2)
    >>> lpo.get_n_splits(X)
    6
    >>> print(lpo)
    LeavePOut(p=2)
    >>> for i, (train_index, test_index) in enumerate(lpo.split(X)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test:  index={test_index}")
    Fold 0:
      Train: index=[2 3]
      Test:  index=[0 1]
    Fold 1:
      Train: index=[1 3]
      Test:  index=[0 2]
    Fold 2:
      Train: index=[1 2]
      Test:  index=[0 3]
    Fold 3:
      Train: index=[0 3]
      Test:  index=[1 2]
    Fold 4:
      Train: index=[0 2]
      Test:  index=[1 3]
    Fold 5:
      Train: index=[0 1]
      Test:  index=[2 3]
    """

    def __init__(self, p):
        self.p = p

    def _iter_test_indices(self, X, y=None, groups=None):
        # Every size-p subset of sample indices becomes one test set,
        # in the (lexicographic) order produced by itertools.combinations.
        n_samples = _num_samples(X)
        if n_samples <= self.p:
            raise ValueError(
                "p={} must be strictly less than the number of samples={}".format(
                    self.p, n_samples
                )
            )
        yield from map(np.array, combinations(range(n_samples), self.p))

    def get_n_splits(self, X, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.
        """
        if X is None:
            raise ValueError("The 'X' parameter should not be None.")
        # C(n_samples, p) distinct test sets.
        return int(comb(_num_samples(X), self.p, exact=True))
class _BaseKFold(BaseCrossValidator, metaclass=ABCMeta):
    """Base class for KFold, GroupKFold, and StratifiedKFold"""

    @abstractmethod
    def __init__(self, n_splits, *, shuffle, random_state):
        # Validate n_splits: integral and at least 2.
        if not isinstance(n_splits, numbers.Integral):
            raise ValueError(
                "The number of folds must be of Integral type. "
                f"{n_splits} of type {type(n_splits)} was passed."
            )
        n_splits = int(n_splits)
        if n_splits <= 1:
            raise ValueError(
                "k-fold cross-validation requires at least one"
                " train/test split by setting n_splits=2 or more,"
                f" got n_splits={n_splits}."
            )

        # shuffle must be a real bool, not merely truthy.
        if not isinstance(shuffle, bool):
            raise TypeError(f"shuffle must be True or False; got {shuffle}")

        # A random_state without shuffling would silently do nothing.
        if not shuffle and random_state is not None:  # None is the default
            raise ValueError(
                "Setting a random_state has no effect since shuffle is "
                "False. You should leave "
                "random_state to its default (None), or set shuffle=True.",
            )

        self.n_splits = n_splits
        self.shuffle = shuffle
        self.random_state = random_state

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : array-like of shape (n_samples,), default=None
            The target variable for supervised learning problems.

        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        if self.n_splits > n_samples:
            raise ValueError(
                f"Cannot have number of splits n_splits={self.n_splits} greater"
                f" than the number of samples: n_samples={n_samples}."
            )

        yield from super().split(X, y, groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        return self.n_splits
class KFold(_BaseKFold):
    """K-Folds cross-validator

    Provides train/test indices to split data in train/test sets. Split
    dataset into k consecutive folds (without shuffling by default).

    Each fold is then used once as a validation while the k - 1 remaining
    folds form the training set.

    Read more in the :ref:`User Guide <k_fold>`.

    For visualisation of cross-validation behaviour and
    comparison between common scikit-learn split methods
    refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py`

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.

        .. versionchanged:: 0.22
            ``n_splits`` default value changed from 3 to 5.

    shuffle : bool, default=False
        Whether to shuffle the data before splitting into batches.
        Note that the samples within each split will not be shuffled.

    random_state : int, RandomState instance or None, default=None
        When `shuffle` is True, `random_state` affects the ordering of the
        indices, which controls the randomness of each fold. Otherwise, this
        parameter has no effect.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import KFold
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([1, 2, 3, 4])
    >>> kf = KFold(n_splits=2)
    >>> kf.get_n_splits(X)
    2
    >>> print(kf)
    KFold(n_splits=2, random_state=None, shuffle=False)
    >>> for i, (train_index, test_index) in enumerate(kf.split(X)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test:  index={test_index}")
    Fold 0:
      Train: index=[2 3]
      Test:  index=[0 1]
    Fold 1:
      Train: index=[0 1]
      Test:  index=[2 3]

    Notes
    -----
    The first ``n_samples % n_splits`` folds have size
    ``n_samples // n_splits + 1``, other folds have size
    ``n_samples // n_splits``, where ``n_samples`` is the number of samples.

    Randomized CV splitters may return different results for each call of
    split. You can make the results identical by setting `random_state`
    to an integer.

    See Also
    --------
    StratifiedKFold : Takes class information into account to avoid building
        folds with imbalanced class distributions (for binary or multiclass
        classification tasks).

    GroupKFold : K-fold iterator variant with non-overlapping groups.

    RepeatedKFold : Repeats K-Fold n times.
    """

    def __init__(self, n_splits=5, *, shuffle=False, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def _iter_test_indices(self, X, y=None, groups=None):
        n_samples = _num_samples(X)
        indices = np.arange(n_samples)
        if self.shuffle:
            check_random_state(self.random_state).shuffle(indices)

        # The first ``remainder`` folds get one extra sample each, so fold
        # sizes differ by at most one.
        n_splits = self.n_splits
        base_size, remainder = divmod(n_samples, n_splits)
        start = 0
        for fold_index in range(n_splits):
            stop = start + base_size + (1 if fold_index < remainder else 0)
            yield indices[start:stop]
            start = stop
class GroupKFold(GroupsConsumerMixin, _BaseKFold):
    """K-fold iterator variant with non-overlapping groups.

    Each group will appear exactly once in the test set across all folds (the
    number of distinct groups has to be at least equal to the number of folds).

    The folds are approximately balanced in the sense that the number of
    distinct groups is approximately the same in each fold.

    Read more in the :ref:`User Guide <group_k_fold>`.

    For visualisation of cross-validation behaviour and
    comparison between common scikit-learn split methods
    refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py`

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.

        .. versionchanged:: 0.22
            ``n_splits`` default value changed from 3 to 5.

    Notes
    -----
    Groups appear in an arbitrary order throughout the folds.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import GroupKFold
    >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]])
    >>> y = np.array([1, 2, 3, 4, 5, 6])
    >>> groups = np.array([0, 0, 2, 2, 3, 3])
    >>> group_kfold = GroupKFold(n_splits=2)
    >>> group_kfold.get_n_splits(X, y, groups)
    2
    >>> print(group_kfold)
    GroupKFold(n_splits=2)
    >>> for i, (train_index, test_index) in enumerate(group_kfold.split(X, y, groups)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}, group={groups[train_index]}")
    ...     print(f"  Test:  index={test_index}, group={groups[test_index]}")
    Fold 0:
      Train: index=[2 3], group=[2 2]
      Test:  index=[0 1 4 5], group=[0 0 3 3]
    Fold 1:
      Train: index=[0 1 4 5], group=[0 0 3 3]
      Test:  index=[2 3], group=[2 2]

    See Also
    --------
    LeaveOneGroupOut : For splitting the data according to explicit
        domain-specific stratification of the dataset.

    StratifiedKFold : Takes class information into account to avoid building
        folds with imbalanced class proportions (for binary or multiclass
        classification tasks).
    """

    def __init__(self, n_splits=5):
        # Group assignment is deterministic, so shuffle/random_state are fixed.
        super().__init__(n_splits, shuffle=False, random_state=None)

    def _iter_test_indices(self, X, y, groups):
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None)

        unique_groups, groups = np.unique(groups, return_inverse=True)
        n_groups = len(unique_groups)

        if self.n_splits > n_groups:
            raise ValueError(
                "Cannot have number of splits n_splits=%d greater"
                " than the number of groups: %d." % (self.n_splits, n_groups)
            )

        # Weight groups by their number of occurrences
        n_samples_per_group = np.bincount(groups)

        # Distribute the most frequent groups first (greedy balancing works
        # best when large items are placed before small ones)
        indices = np.argsort(n_samples_per_group)[::-1]
        n_samples_per_group = n_samples_per_group[indices]

        # Total weight of each fold
        n_samples_per_fold = np.zeros(self.n_splits)

        # Mapping from group index to fold index.
        # Fix: use an integer dtype; the previous default (float) only worked
        # because the fold labels were later compared with ``==`` rather than
        # used as array indices.
        group_to_fold = np.zeros(len(unique_groups), dtype=int)

        # Distribute samples by adding the largest weight to the lightest fold
        for group_index, weight in enumerate(n_samples_per_group):
            lightest_fold = np.argmin(n_samples_per_fold)
            n_samples_per_fold[lightest_fold] += weight
            group_to_fold[indices[group_index]] = lightest_fold

        # Per-sample fold assignment, derived from each sample's group.
        indices = group_to_fold[groups]

        for f in range(self.n_splits):
            yield np.where(indices == f)[0]

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : array-like of shape (n_samples,), default=None
            The target variable for supervised learning problems.

        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        return super().split(X, y, groups)
- class StratifiedKFold(_BaseKFold):
- """Stratified K-Folds cross-validator.
- Provides train/test indices to split data in train/test sets.
- This cross-validation object is a variation of KFold that returns
- stratified folds. The folds are made by preserving the percentage of
- samples for each class.
- Read more in the :ref:`User Guide <stratified_k_fold>`.
- For visualisation of cross-validation behaviour and
- comparison between common scikit-learn split methods
- refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py`
- Parameters
- ----------
- n_splits : int, default=5
- Number of folds. Must be at least 2.
- .. versionchanged:: 0.22
- ``n_splits`` default value changed from 3 to 5.
- shuffle : bool, default=False
- Whether to shuffle each class's samples before splitting into batches.
- Note that the samples within each split will not be shuffled.
- random_state : int, RandomState instance or None, default=None
- When `shuffle` is True, `random_state` affects the ordering of the
- indices, which controls the randomness of each fold for each class.
- Otherwise, leave `random_state` as `None`.
- Pass an int for reproducible output across multiple function calls.
- See :term:`Glossary <random_state>`.
- Examples
- --------
- >>> import numpy as np
- >>> from sklearn.model_selection import StratifiedKFold
- >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
- >>> y = np.array([0, 0, 1, 1])
- >>> skf = StratifiedKFold(n_splits=2)
- >>> skf.get_n_splits(X, y)
- 2
- >>> print(skf)
- StratifiedKFold(n_splits=2, random_state=None, shuffle=False)
- >>> for i, (train_index, test_index) in enumerate(skf.split(X, y)):
- ... print(f"Fold {i}:")
- ... print(f" Train: index={train_index}")
- ... print(f" Test: index={test_index}")
- Fold 0:
- Train: index=[1 3]
- Test: index=[0 2]
- Fold 1:
- Train: index=[0 2]
- Test: index=[1 3]
- Notes
- -----
- The implementation is designed to:
- * Generate test sets such that all contain the same distribution of
- classes, or as close as possible.
- * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to
- ``y = [1, 0]`` should not change the indices generated.
- * Preserve order dependencies in the dataset ordering, when
- ``shuffle=False``: all samples from class k in some test set were
- contiguous in y, or separated in y by samples from classes other than k.
- * Generate test sets where the smallest and largest differ by at most one
- sample.
- .. versionchanged:: 0.22
- The previous implementation did not follow the last constraint.
- See Also
- --------
- RepeatedStratifiedKFold : Repeats Stratified K-Fold n times.
- """
    def __init__(self, n_splits=5, *, shuffle=False, random_state=None):
        # All parameter validation happens in _BaseKFold.__init__.
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    def _make_test_folds(self, X, y=None):
        """Assign each sample to a test fold, preserving class proportions.

        Returns an integer array of shape (n_samples,) where entry i is the
        index of the test fold that sample i belongs to. Only binary and
        multiclass targets are supported.
        """
        rng = check_random_state(self.random_state)
        y = np.asarray(y)
        type_of_target_y = type_of_target(y)
        allowed_target_types = ("binary", "multiclass")
        if type_of_target_y not in allowed_target_types:
            raise ValueError(
                "Supported target types are: {}. Got {!r} instead.".format(
                    allowed_target_types, type_of_target_y
                )
            )

        y = column_or_1d(y)

        _, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True)
        # y_inv encodes y according to lexicographic order. We invert y_idx to
        # map the classes so that they are encoded by order of appearance:
        # 0 represents the first label appearing in y, 1 the second, etc.
        _, class_perm = np.unique(y_idx, return_inverse=True)
        y_encoded = class_perm[y_inv]

        n_classes = len(y_idx)
        y_counts = np.bincount(y_encoded)
        min_groups = np.min(y_counts)
        # Splitting is impossible if every class has fewer members than
        # n_splits; if only some do, warn but proceed.
        if np.all(self.n_splits > y_counts):
            raise ValueError(
                "n_splits=%d cannot be greater than the"
                " number of members in each class." % (self.n_splits)
            )
        if self.n_splits > min_groups:
            warnings.warn(
                "The least populated class in y has only %d"
                " members, which is less than n_splits=%d."
                % (min_groups, self.n_splits),
                UserWarning,
            )

        # Determine the optimal number of samples from each class in each fold,
        # using round robin over the sorted y. (This can be done direct from
        # counts, but that code is unreadable.)
        y_order = np.sort(y_encoded)
        # allocation[f, k] = number of samples of class k placed in fold f.
        allocation = np.asarray(
            [
                np.bincount(y_order[i :: self.n_splits], minlength=n_classes)
                for i in range(self.n_splits)
            ]
        )

        # To maintain the data order dependencies as best as possible within
        # the stratification constraint, we assign samples from each class in
        # blocks (and then mess that up when shuffle=True).
        test_folds = np.empty(len(y), dtype="i")
        for k in range(n_classes):
            # since the kth column of allocation stores the number of samples
            # of class k in each test set, this generates blocks of fold
            # indices corresponding to the allocation for class k.
            folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k])
            if self.shuffle:
                rng.shuffle(folds_for_class)
            test_folds[y_encoded == k] = folds_for_class
        return test_folds
- def _iter_test_masks(self, X, y=None, groups=None):
- test_folds = self._make_test_folds(X, y)
- for i in range(self.n_splits):
- yield test_folds == i
- def split(self, X, y, groups=None):
- """Generate indices to split data into training and test set.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- Training data, where `n_samples` is the number of samples
- and `n_features` is the number of features.
- Note that providing ``y`` is sufficient to generate the splits and
- hence ``np.zeros(n_samples)`` may be used as a placeholder for
- ``X`` instead of actual training data.
- y : array-like of shape (n_samples,)
- The target variable for supervised learning problems.
- Stratification is done based on the y labels.
- groups : object
- Always ignored, exists for compatibility.
- Yields
- ------
- train : ndarray
- The training set indices for that split.
- test : ndarray
- The testing set indices for that split.
- Notes
- -----
- Randomized CV splitters may return different results for each call of
- split. You can make the results identical by setting `random_state`
- to an integer.
- """
- y = check_array(y, input_name="y", ensure_2d=False, dtype=None)
- return super().split(X, y, groups)
class StratifiedGroupKFold(GroupsConsumerMixin, _BaseKFold):
    """Stratified K-Folds iterator variant with non-overlapping groups.

    This cross-validation object is a variation of StratifiedKFold attempts to
    return stratified folds with non-overlapping groups. The folds are made by
    preserving the percentage of samples for each class.

    Each group will appear exactly once in the test set across all folds (the
    number of distinct groups has to be at least equal to the number of folds).

    The difference between :class:`~sklearn.model_selection.GroupKFold`
    and :class:`~sklearn.model_selection.StratifiedGroupKFold` is that
    the former attempts to create balanced folds such that the number of
    distinct groups is approximately the same in each fold, whereas
    StratifiedGroupKFold attempts to create folds which preserve the
    percentage of samples for each class as much as possible given the
    constraint of non-overlapping groups between splits.

    Read more in the :ref:`User Guide <cross_validation>`.

    For visualisation of cross-validation behaviour and
    comparison between common scikit-learn split methods
    refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py`

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.

    shuffle : bool, default=False
        Whether to shuffle each class's samples before splitting into batches.
        Note that the samples within each split will not be shuffled.
        This implementation can only shuffle groups that have approximately the
        same y distribution, no global shuffle will be performed.

    random_state : int or RandomState instance, default=None
        When `shuffle` is True, `random_state` affects the ordering of the
        indices, which controls the randomness of each fold for each class.
        Otherwise, leave `random_state` as `None`.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import StratifiedGroupKFold
    >>> X = np.ones((17, 2))
    >>> y = np.array([0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    >>> groups = np.array([1, 1, 2, 2, 3, 3, 3, 4, 5, 5, 5, 5, 6, 6, 7, 8, 8])
    >>> sgkf = StratifiedGroupKFold(n_splits=3)
    >>> sgkf.get_n_splits(X, y)
    3
    >>> print(sgkf)
    StratifiedGroupKFold(n_splits=3, random_state=None, shuffle=False)
    >>> for i, (train_index, test_index) in enumerate(sgkf.split(X, y, groups)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"         group={groups[train_index]}")
    ...     print(f"  Test:  index={test_index}")
    ...     print(f"         group={groups[test_index]}")
    Fold 0:
      Train: index=[ 0  1  2  3  7  8  9 10 11 15 16]
             group=[1 1 2 2 4 5 5 5 5 8 8]
      Test:  index=[ 4  5  6 12 13 14]
             group=[3 3 3 6 6 7]
    Fold 1:
      Train: index=[ 4  5  6  7  8  9 10 11 12 13 14]
             group=[3 3 3 4 5 5 5 5 6 6 7]
      Test:  index=[ 0  1  2  3 15 16]
             group=[1 1 2 2 8 8]
    Fold 2:
      Train: index=[ 0  1  2  3  4  5  6 12 13 14 15 16]
             group=[1 1 2 2 3 3 3 6 6 7 8 8]
      Test:  index=[ 7  8  9 10 11]
             group=[4 5 5 5 5]

    Notes
    -----
    The implementation is designed to:

    * Mimic the behavior of StratifiedKFold as much as possible for trivial
      groups (e.g. when each group contains only one sample).
    * Be invariant to class label: relabelling ``y = ["Happy", "Sad"]`` to
      ``y = [1, 0]`` should not change the indices generated.
    * Stratify based on samples as much as possible while keeping
      non-overlapping groups constraint. That means that in some cases when
      there is a small number of groups containing a large number of samples
      the stratification will not be possible and the behavior will be close
      to GroupKFold.

    See also
    --------
    StratifiedKFold: Takes class information into account to build folds which
        retain class distributions (for binary or multiclass classification
        tasks).

    GroupKFold: K-fold iterator variant with non-overlapping groups.
    """

    # NOTE(review): unlike StratifiedKFold.__init__, shuffle/random_state are
    # not keyword-only here; kept as-is for backward compatibility.
    def __init__(self, n_splits=5, shuffle=False, random_state=None):
        super().__init__(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    def _iter_test_indices(self, X, y, groups):
        # Implementation is based on this kaggle kernel:
        # https://www.kaggle.com/jakubwasikowski/stratified-group-k-fold-cross-validation
        # and is subject to the Apache 2.0 License. You may obtain a copy of
        # the License at http://www.apache.org/licenses/LICENSE-2.0
        # Changelist:
        # - Refactored function to a class following scikit-learn KFold
        #   interface.
        # - Added heuristic for assigning group to the least populated fold in
        #   cases when all other criteria are equal
        # - Switch from using python ``Counter`` to ``np.unique`` to get class
        #   distribution
        # - Added scikit-learn checks for input: checking that target is binary
        #   or multiclass, checking passed random state, checking that number
        #   of splits is less than number of members in each class, checking
        #   that least populated class has more members than there are splits.
        rng = check_random_state(self.random_state)
        y = np.asarray(y)
        # Only classification-style targets can be stratified.
        type_of_target_y = type_of_target(y)
        allowed_target_types = ("binary", "multiclass")
        if type_of_target_y not in allowed_target_types:
            raise ValueError(
                "Supported target types are: {}. Got {!r} instead.".format(
                    allowed_target_types, type_of_target_y
                )
            )

        y = column_or_1d(y)
        _, y_inv, y_cnt = np.unique(y, return_inverse=True, return_counts=True)
        # Raises only when n_splits exceeds the size of *every* class; a
        # class merely smaller than n_splits only triggers the warning below.
        if np.all(self.n_splits > y_cnt):
            raise ValueError(
                "n_splits=%d cannot be greater than the"
                " number of members in each class." % (self.n_splits)
            )
        n_smallest_class = np.min(y_cnt)
        if self.n_splits > n_smallest_class:
            warnings.warn(
                "The least populated class in y has only %d"
                " members, which is less than n_splits=%d."
                % (n_smallest_class, self.n_splits),
                UserWarning,
            )
        n_classes = len(y_cnt)

        _, groups_inv, groups_cnt = np.unique(
            groups, return_inverse=True, return_counts=True
        )
        # y_counts_per_group[g, k] == number of samples of class k in group g.
        y_counts_per_group = np.zeros((len(groups_cnt), n_classes))
        for class_idx, group_idx in zip(y_inv, groups_inv):
            y_counts_per_group[group_idx, class_idx] += 1

        y_counts_per_fold = np.zeros((self.n_splits, n_classes))
        groups_per_fold = defaultdict(set)

        if self.shuffle:
            rng.shuffle(y_counts_per_group)

        # Stable sort to keep shuffled order for groups with the same
        # class distribution variance.  Groups with the most skewed class
        # distribution are assigned first (greedy heuristic).
        sorted_groups_idx = np.argsort(
            -np.std(y_counts_per_group, axis=1), kind="mergesort"
        )

        for group_idx in sorted_groups_idx:
            group_y_counts = y_counts_per_group[group_idx]
            best_fold = self._find_best_fold(
                y_counts_per_fold=y_counts_per_fold,
                y_cnt=y_cnt,
                group_y_counts=group_y_counts,
            )
            y_counts_per_fold[best_fold] += group_y_counts
            groups_per_fold[best_fold].add(group_idx)

        for i in range(self.n_splits):
            test_indices = [
                idx
                for idx, group_idx in enumerate(groups_inv)
                if group_idx in groups_per_fold[i]
            ]
            yield test_indices

    def _find_best_fold(self, y_counts_per_fold, y_cnt, group_y_counts):
        """Pick the fold whose class distribution is least degraded by
        adding this group, breaking ties by the smaller fold."""
        best_fold = None
        min_eval = np.inf
        min_samples_in_fold = np.inf
        for i in range(self.n_splits):
            # Tentatively add the group to fold i, score, then undo.
            y_counts_per_fold[i] += group_y_counts

            # Summarise the distribution over classes in each proposed fold
            std_per_class = np.std(y_counts_per_fold / y_cnt.reshape(1, -1), axis=0)

            y_counts_per_fold[i] -= group_y_counts

            fold_eval = np.mean(std_per_class)
            samples_in_fold = np.sum(y_counts_per_fold[i])
            # `and` binds tighter than `or`: fold i wins if its eval is
            # strictly lower, or tied (isclose) but with fewer samples.
            is_current_fold_better = (
                fold_eval < min_eval
                or np.isclose(fold_eval, min_eval)
                and samples_in_fold < min_samples_in_fold
            )
            if is_current_fold_better:
                min_eval = fold_eval
                min_samples_in_fold = samples_in_fold
                best_fold = i
        return best_fold
class TimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator

    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals, in train/test sets.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.

    This cross-validation object is a variation of :class:`KFold`.
    In the kth split, it returns first k folds as train set and the
    (k+1)th fold as test set.

    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them.

    Read more in the :ref:`User Guide <time_series_split>`.

    For visualisation of cross-validation behaviour and
    comparison between common scikit-learn split methods
    refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py`

    .. versionadded:: 0.18

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.

        .. versionchanged:: 0.22
            ``n_splits`` default value changed from 3 to 5.

    max_train_size : int, default=None
        Maximum size for a single training set.

    test_size : int, default=None
        Used to limit the size of the test set. Defaults to
        ``n_samples // (n_splits + 1)``, which is the maximum allowed value
        with ``gap=0``.

        .. versionadded:: 0.24

    gap : int, default=0
        Number of samples to exclude from the end of each train set before
        the test set.

        .. versionadded:: 0.24

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import TimeSeriesSplit
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([1, 2, 3, 4, 5, 6])
    >>> tscv = TimeSeriesSplit()
    >>> print(tscv)
    TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)
    >>> for i, (train_index, test_index) in enumerate(tscv.split(X)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test:  index={test_index}")
    Fold 0:
      Train: index=[0]
      Test:  index=[1]
    Fold 1:
      Train: index=[0 1]
      Test:  index=[2]
    Fold 2:
      Train: index=[0 1 2]
      Test:  index=[3]
    Fold 3:
      Train: index=[0 1 2 3]
      Test:  index=[4]
    Fold 4:
      Train: index=[0 1 2 3 4]
      Test:  index=[5]
    >>> # Fix test_size to 2 with 12 samples
    >>> X = np.random.randn(12, 2)
    >>> y = np.random.randint(0, 2, 12)
    >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2)
    >>> for i, (train_index, test_index) in enumerate(tscv.split(X)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test:  index={test_index}")
    Fold 0:
      Train: index=[0 1 2 3 4 5]
      Test:  index=[6 7]
    Fold 1:
      Train: index=[0 1 2 3 4 5 6 7]
      Test:  index=[8 9]
    Fold 2:
      Train: index=[0 1 2 3 4 5 6 7 8 9]
      Test:  index=[10 11]
    >>> # Add in a 2 period gap
    >>> tscv = TimeSeriesSplit(n_splits=3, test_size=2, gap=2)
    >>> for i, (train_index, test_index) in enumerate(tscv.split(X)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test:  index={test_index}")
    Fold 0:
      Train: index=[0 1 2 3]
      Test:  index=[6 7]
    Fold 1:
      Train: index=[0 1 2 3 4 5]
      Test:  index=[8 9]
    Fold 2:
      Train: index=[0 1 2 3 4 5 6 7]
      Test:  index=[10 11]

    Notes
    -----
    The training set has size ``i * n_samples // (n_splits + 1)
    + n_samples % (n_splits + 1)`` in the ``i`` th split,
    with a test set of size ``n_samples//(n_splits + 1)`` by default,
    where ``n_samples`` is the number of samples.
    """

    def __init__(self, n_splits=5, *, max_train_size=None, test_size=None, gap=0):
        # Time-ordered data must never be shuffled.
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_size = max_train_size
        self.test_size = test_size
        self.gap = gap

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.

        groups : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        gap = self.gap
        test_size = (
            self.test_size if self.test_size is not None else n_samples // n_folds
        )

        # Make sure we have enough samples for the given split parameters
        if n_folds > n_samples:
            raise ValueError(
                f"Cannot have number of folds={n_folds} greater"
                f" than the number of samples={n_samples}."
            )
        if n_samples - gap - (test_size * n_splits) <= 0:
            raise ValueError(
                f"Too many splits={n_splits} for number of samples"
                f"={n_samples} with test_size={test_size} and gap={gap}."
            )

        indices = np.arange(n_samples)
        first_test = n_samples - n_splits * test_size
        for test_start in range(first_test, n_samples, test_size):
            # Training window ends `gap` samples before the test window and
            # is optionally capped at max_train_size from the right.
            train_end = test_start - gap
            train_start = 0
            if self.max_train_size and self.max_train_size < train_end:
                train_start = train_end - self.max_train_size
            yield (
                indices[train_start:train_end],
                indices[test_start : test_start + test_size],
            )
class LeaveOneGroupOut(GroupsConsumerMixin, BaseCrossValidator):
    """Leave One Group Out cross-validator

    Provides train/test indices to split data such that each training set is
    comprised of all samples except ones belonging to one specific group.
    Arbitrary domain specific group information is provided an array integers
    that encodes the group of each sample.

    For instance the groups could be the year of collection of the samples
    and thus allow for cross-validation against time-based splits.

    Read more in the :ref:`User Guide <leave_one_group_out>`.

    Notes
    -----
    Splits are ordered according to the index of the group left out. The first
    split has testing set consisting of the group whose index in `groups` is
    lowest, and so on.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import LeaveOneGroupOut
    >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
    >>> y = np.array([1, 2, 1, 2])
    >>> groups = np.array([1, 1, 2, 2])
    >>> logo = LeaveOneGroupOut()
    >>> logo.get_n_splits(X, y, groups)
    2
    >>> logo.get_n_splits(groups=groups)  # 'groups' is always required
    2
    >>> print(logo)
    LeaveOneGroupOut()
    >>> for i, (train_index, test_index) in enumerate(logo.split(X, y, groups)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}, group={groups[train_index]}")
    ...     print(f"  Test:  index={test_index}, group={groups[test_index]}")
    Fold 0:
      Train: index=[2 3], group=[2 2]
      Test:  index=[0 1], group=[1 1]
    Fold 1:
      Train: index=[0 1], group=[1 1]
      Test:  index=[2 3], group=[2 2]

    See also
    --------
    GroupKFold: K-fold iterator variant with non-overlapping groups.
    """

    def _iter_test_masks(self, X, y, groups):
        # Each distinct group value (in sorted order) becomes one test set.
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        # We make a copy of groups to avoid side-effects during iteration
        groups = check_array(
            groups, input_name="groups", copy=True, ensure_2d=False, dtype=None
        )
        unique_groups = np.unique(groups)
        if len(unique_groups) <= 1:
            raise ValueError(
                "The groups parameter contains fewer than 2 unique groups "
                "(%s). LeaveOneGroupOut expects at least 2." % unique_groups
            )
        for group in unique_groups:
            yield groups == group

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set. This 'groups' parameter must always be specified to
            calculate the number of splits, though the other parameters can be
            omitted.

        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None)
        # One split per distinct group value.
        unique_groups = np.unique(groups)
        return len(unique_groups)

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : array-like of shape (n_samples,), default=None
            The target variable for supervised learning problems.

        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        return super().split(X, y, groups)
class LeavePGroupsOut(GroupsConsumerMixin, BaseCrossValidator):
    """Leave P Group(s) Out cross-validator

    Provides train/test indices to split data according to a third-party
    provided group. This group information can be used to encode arbitrary
    domain specific stratifications of the samples as integers.

    For instance the groups could be the year of collection of the samples
    and thus allow for cross-validation against time-based splits.

    The difference between LeavePGroupsOut and LeaveOneGroupOut is that
    the former builds the test sets with all the samples assigned to
    ``p`` different values of the groups while the latter uses samples
    all assigned the same groups.

    Read more in the :ref:`User Guide <leave_p_groups_out>`.

    Parameters
    ----------
    n_groups : int
        Number of groups (``p``) to leave out in the test split.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import LeavePGroupsOut
    >>> X = np.array([[1, 2], [3, 4], [5, 6]])
    >>> y = np.array([1, 2, 1])
    >>> groups = np.array([1, 2, 3])
    >>> lpgo = LeavePGroupsOut(n_groups=2)
    >>> lpgo.get_n_splits(X, y, groups)
    3
    >>> lpgo.get_n_splits(groups=groups)  # 'groups' is always required
    3
    >>> print(lpgo)
    LeavePGroupsOut(n_groups=2)
    >>> for i, (train_index, test_index) in enumerate(lpgo.split(X, y, groups)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}, group={groups[train_index]}")
    ...     print(f"  Test:  index={test_index}, group={groups[test_index]}")
    Fold 0:
      Train: index=[2], group=[3]
      Test:  index=[0 1], group=[1 2]
    Fold 1:
      Train: index=[1], group=[2]
      Test:  index=[0 2], group=[1 3]
    Fold 2:
      Train: index=[0], group=[1]
      Test:  index=[1 2], group=[2 3]

    See Also
    --------
    GroupKFold : K-fold iterator variant with non-overlapping groups.
    """

    def __init__(self, n_groups):
        self.n_groups = n_groups

    def _iter_test_masks(self, X, y, groups):
        # Every combination of n_groups distinct group values becomes one
        # test set, in combinatorial (lexicographic) order.
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        groups = check_array(
            groups, input_name="groups", copy=True, ensure_2d=False, dtype=None
        )
        unique_groups = np.unique(groups)
        if self.n_groups >= len(unique_groups):
            raise ValueError(
                "The groups parameter contains fewer than (or equal to) "
                "n_groups (%d) numbers of unique groups (%s). LeavePGroupsOut "
                "expects that at least n_groups + 1 (%d) unique groups be "
                "present" % (self.n_groups, unique_groups, self.n_groups + 1)
            )
        for group_subset in combinations(range(len(unique_groups)), self.n_groups):
            mask = np.zeros(_num_samples(X), dtype=bool)
            for label in unique_groups[np.array(group_subset)]:
                mask[groups == label] = True
            yield mask

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set. This 'groups' parameter must always be specified to
            calculate the number of splits, though the other parameters can be
            omitted.

        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None)
        # C(number of distinct groups, n_groups) splits in total.
        n_unique = len(np.unique(groups))
        return int(comb(n_unique, self.n_groups, exact=True))

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : array-like of shape (n_samples,), default=None
            The target variable for supervised learning problems.

        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        return super().split(X, y, groups)
class _RepeatedSplits(_MetadataRequester, metaclass=ABCMeta):
    """Repeated splits for an arbitrary randomized CV splitter.

    Repeats splits for cross-validators n times with different randomization
    in each repetition.

    Parameters
    ----------
    cv : callable
        Cross-validator class.

    n_repeats : int, default=10
        Number of times cross-validator needs to be repeated.

    random_state : int, RandomState instance or None, default=None
        Passes `random_state` to the arbitrary repeating cross validator.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    **cvargs : additional params
        Constructor parameters for cv. Must not contain random_state
        and shuffle.
    """

    # This indicates that by default CV splitters don't have a "groups" kwarg,
    # unless indicated by inheriting from ``GroupsConsumerMixin``.
    # This also prevents ``set_split_request`` to be generated for splitters
    # which don't support ``groups``.
    __metadata_request__split = {"groups": metadata_routing.UNUSED}

    def __init__(self, cv, *, n_repeats=10, random_state=None, **cvargs):
        if not isinstance(n_repeats, numbers.Integral):
            raise ValueError("Number of repetitions must be of Integral type.")
        if n_repeats <= 0:
            raise ValueError("Number of repetitions must be greater than 0.")
        # random_state/shuffle are controlled by this wrapper, so the wrapped
        # splitter must not receive them twice.
        if "random_state" in cvargs or "shuffle" in cvargs:
            raise ValueError("cvargs must not contain random_state or shuffle.")
        self.cv = cv
        self.n_repeats = n_repeats
        self.random_state = random_state
        self.cvargs = cvargs

    def split(self, X, y=None, groups=None):
        """Generates indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            The target variable for supervised learning problems.

        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        # A single RNG is threaded through all repetitions, so each
        # repetition shuffles differently but the sequence is reproducible.
        rng = check_random_state(self.random_state)
        for _ in range(self.n_repeats):
            splitter = self.cv(random_state=rng, shuffle=True, **self.cvargs)
            yield from splitter.split(X, y, groups)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.
            ``np.zeros(n_samples)`` may be used as a placeholder.

        y : object
            Always ignored, exists for compatibility.
            ``np.zeros(n_samples)`` may be used as a placeholder.

        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        rng = check_random_state(self.random_state)
        splitter = self.cv(random_state=rng, shuffle=True, **self.cvargs)
        return splitter.get_n_splits(X, y, groups) * self.n_repeats

    def __repr__(self):
        return _build_repr(self)
class RepeatedKFold(_RepeatedSplits):
    """Repeated K-Fold cross validator.

    Repeats K-Fold n times with different randomization in each repetition.

    Read more in the :ref:`User Guide <repeated_k_fold>`.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.

    n_repeats : int, default=10
        Number of times cross-validator needs to be repeated.

    random_state : int, RandomState instance or None, default=None
        Controls the randomness of each repeated cross-validation instance.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import RepeatedKFold
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([0, 0, 1, 1])
    >>> rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=2652124)
    >>> rkf.get_n_splits(X, y)
    4
    >>> print(rkf)
    RepeatedKFold(n_repeats=2, n_splits=2, random_state=2652124)
    >>> for i, (train_index, test_index) in enumerate(rkf.split(X)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test:  index={test_index}")
    ...
    Fold 0:
      Train: index=[0 1]
      Test:  index=[2 3]
    Fold 1:
      Train: index=[2 3]
      Test:  index=[0 1]
    Fold 2:
      Train: index=[1 2]
      Test:  index=[0 3]
    Fold 3:
      Train: index=[0 3]
      Test:  index=[1 2]

    Notes
    -----
    Randomized CV splitters may return different results for each call of
    split. You can make the results identical by setting `random_state`
    to an integer.

    See Also
    --------
    RepeatedStratifiedKFold : Repeats Stratified K-Fold n times.
    """

    def __init__(self, *, n_splits=5, n_repeats=10, random_state=None):
        # n_splits is forwarded to KFold through **cvargs.
        super().__init__(
            KFold,
            n_splits=n_splits,
            n_repeats=n_repeats,
            random_state=random_state,
        )
class RepeatedStratifiedKFold(_RepeatedSplits):
    """Repeated Stratified K-Fold cross validator.

    Repeats Stratified K-Fold n times with different randomization in each
    repetition.

    Read more in the :ref:`User Guide <repeated_k_fold>`.

    Parameters
    ----------
    n_splits : int, default=5
        Number of folds. Must be at least 2.

    n_repeats : int, default=10
        Number of times cross-validator needs to be repeated.

    random_state : int, RandomState instance or None, default=None
        Controls the generation of the random states for each repetition.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import RepeatedStratifiedKFold
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([0, 0, 1, 1])
    >>> rskf = RepeatedStratifiedKFold(n_splits=2, n_repeats=2,
    ...     random_state=36851234)
    >>> rskf.get_n_splits(X, y)
    4
    >>> print(rskf)
    RepeatedStratifiedKFold(n_repeats=2, n_splits=2, random_state=36851234)
    >>> for i, (train_index, test_index) in enumerate(rskf.split(X, y)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test:  index={test_index}")
    ...
    Fold 0:
      Train: index=[1 2]
      Test:  index=[0 3]
    Fold 1:
      Train: index=[0 3]
      Test:  index=[1 2]
    Fold 2:
      Train: index=[1 3]
      Test:  index=[0 2]
    Fold 3:
      Train: index=[0 2]
      Test:  index=[1 3]

    Notes
    -----
    Randomized CV splitters may return different results for each call of
    split. You can make the results identical by setting `random_state`
    to an integer.

    See Also
    --------
    RepeatedKFold : Repeats K-Fold n times.
    """

    def __init__(self, *, n_splits=5, n_repeats=10, random_state=None):
        # n_splits is forwarded to StratifiedKFold through **cvargs.
        super().__init__(
            StratifiedKFold,
            n_splits=n_splits,
            n_repeats=n_repeats,
            random_state=random_state,
        )
class BaseShuffleSplit(_MetadataRequester, metaclass=ABCMeta):
    """Base class for ShuffleSplit and StratifiedShuffleSplit"""

    # This indicates that by default CV splitters don't have a "groups" kwarg,
    # unless indicated by inheriting from ``GroupsConsumerMixin``.
    # This also prevents ``set_split_request`` to be generated for splitters
    # which don't support ``groups``.
    __metadata_request__split = {"groups": metadata_routing.UNUSED}

    def __init__(
        self, n_splits=10, *, test_size=None, train_size=None, random_state=None
    ):
        self.n_splits = n_splits
        self.test_size = test_size
        self.train_size = train_size
        self.random_state = random_state
        # Fallback fraction used when neither test_size nor train_size is set.
        self._default_test_size = 0.1

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            The target variable for supervised learning problems.

        groups : array-like of shape (n_samples,), default=None
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting `random_state`
        to an integer.
        """
        X, y, groups = indexable(X, y, groups)
        yield from self._iter_indices(X, y, groups)

    @abstractmethod
    def _iter_indices(self, X, y=None, groups=None):
        """Generate (train, test) indices"""

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        return self.n_splits

    def __repr__(self):
        return _build_repr(self)
class ShuffleSplit(BaseShuffleSplit):
    """Random permutation cross-validator.

    Yields indices to split data into training and test sets.

    Note: contrary to other cross-validation strategies, random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.

    Read more in the :ref:`User Guide <ShuffleSplit>`.

    For visualisation of cross-validation behaviour and
    comparison between common scikit-learn split methods
    refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py`

    Parameters
    ----------
    n_splits : int, default=10
        Number of re-shuffling & splitting iterations.

    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.1.

    train_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the training and testing indices produced.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import ShuffleSplit
    >>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]])
    >>> y = np.array([1, 2, 1, 2, 1, 2])
    >>> rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)
    >>> rs.get_n_splits(X)
    5
    >>> print(rs)
    ShuffleSplit(n_splits=5, random_state=0, test_size=0.25, train_size=None)
    >>> for i, (train_index, test_index) in enumerate(rs.split(X)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test: index={test_index}")
    Fold 0:
      Train: index=[1 3 0 4]
      Test: index=[5 2]
    Fold 1:
      Train: index=[4 0 2 5]
      Test: index=[1 3]
    Fold 2:
      Train: index=[1 2 4 0]
      Test: index=[3 5]
    Fold 3:
      Train: index=[3 4 1 0]
      Test: index=[5 2]
    Fold 4:
      Train: index=[3 5 1 0]
      Test: index=[2 4]
    >>> # Specify train and test size
    >>> rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25,
    ...                   random_state=0)
    >>> for i, (train_index, test_index) in enumerate(rs.split(X)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test: index={test_index}")
    Fold 0:
      Train: index=[1 3 0]
      Test: index=[5 2]
    Fold 1:
      Train: index=[4 0 2]
      Test: index=[1 3]
    Fold 2:
      Train: index=[1 2 4]
      Test: index=[3 5]
    Fold 3:
      Train: index=[3 4 1]
      Test: index=[5 2]
    Fold 4:
      Train: index=[3 5 1]
      Test: index=[2 4]
    """

    def __init__(
        self, n_splits=10, *, test_size=None, train_size=None, random_state=None
    ):
        super().__init__(
            n_splits=n_splits,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state,
        )
        self._default_test_size = 0.1

    def _iter_indices(self, X, y=None, groups=None):
        # Resolve the requested sizes into absolute sample counts once.
        n_samples = _num_samples(X)
        n_train, n_test = _validate_shuffle_split(
            n_samples,
            self.test_size,
            self.train_size,
            default_test_size=self._default_test_size,
        )
        rng = check_random_state(self.random_state)
        for _ in range(self.n_splits):
            # Draw a fresh random ordering for every split, then carve the
            # test set off the front and the train set right behind it.
            shuffled = rng.permutation(n_samples)
            yield shuffled[n_test : n_test + n_train], shuffled[:n_test]
class GroupShuffleSplit(GroupsConsumerMixin, ShuffleSplit):
    """Shuffle-Group(s)-Out cross-validation iterator.

    Provides randomized train/test indices to split data according to a
    third-party provided group. This group information can be used to encode
    arbitrary domain specific stratifications of the samples as integers.

    For instance the groups could be the year of collection of the samples
    and thus allow for cross-validation against time-based splits.

    The difference between LeavePGroupsOut and GroupShuffleSplit is that
    the former generates splits using all subsets of size ``p`` unique groups,
    whereas GroupShuffleSplit generates a user-determined number of random
    test splits, each with a user-determined fraction of unique groups.

    For example, a less computationally intensive alternative to
    ``LeavePGroupsOut(p=10)`` would be
    ``GroupShuffleSplit(test_size=10, n_splits=100)``.

    Note: The parameters ``test_size`` and ``train_size`` refer to groups, and
    not to samples, as in ShuffleSplit.

    Read more in the :ref:`User Guide <group_shuffle_split>`.

    For visualisation of cross-validation behaviour and
    comparison between common scikit-learn split methods
    refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py`

    Parameters
    ----------
    n_splits : int, default=5
        Number of re-shuffling & splitting iterations.

    test_size : float, int, default=0.2
        If float, should be between 0.0 and 1.0 and represent the proportion
        of groups to include in the test split (rounded up). If int,
        represents the absolute number of test groups. If None, the value is
        set to the complement of the train size.
        The default will change in version 0.21. It will remain 0.2 only
        if ``train_size`` is unspecified, otherwise it will complement
        the specified ``train_size``.

    train_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the groups to include in the train split. If
        int, represents the absolute number of train groups. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the training and testing indices produced.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import GroupShuffleSplit
    >>> X = np.ones(shape=(8, 2))
    >>> y = np.ones(shape=(8, 1))
    >>> groups = np.array([1, 1, 2, 2, 2, 3, 3, 3])
    >>> print(groups.shape)
    (8,)
    >>> gss = GroupShuffleSplit(n_splits=2, train_size=.7, random_state=42)
    >>> gss.get_n_splits()
    2
    >>> print(gss)
    GroupShuffleSplit(n_splits=2, random_state=42, test_size=None, train_size=0.7)
    >>> for i, (train_index, test_index) in enumerate(gss.split(X, y, groups)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}, group={groups[train_index]}")
    ...     print(f"  Test:  index={test_index}, group={groups[test_index]}")
    Fold 0:
      Train: index=[2 3 4 5 6 7], group=[2 2 2 3 3 3]
      Test:  index=[0 1], group=[1 1]
    Fold 1:
      Train: index=[0 1 5 6 7], group=[1 1 3 3 3]
      Test:  index=[2 3 4], group=[2 2 2]

    See Also
    --------
    ShuffleSplit : Shuffles samples to create independent test/train sets.

    LeavePGroupsOut : Train set leaves out all possible subsets of `p` groups.
    """

    def __init__(
        self, n_splits=5, *, test_size=None, train_size=None, random_state=None
    ):
        super().__init__(
            n_splits=n_splits,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state,
        )
        self._default_test_size = 0.2

    def _iter_indices(self, X, y, groups):
        if groups is None:
            raise ValueError("The 'groups' parameter should not be None.")
        groups = check_array(groups, input_name="groups", ensure_2d=False, dtype=None)
        # ``group_indices`` maps each sample to the index of its group label.
        unique_groups, group_indices = np.unique(groups, return_inverse=True)
        # Shuffle-split the *group labels*, then translate group-level
        # partitions back into sample-level index arrays.
        for train_groups, test_groups in super()._iter_indices(X=unique_groups):
            yield (
                np.flatnonzero(np.isin(group_indices, train_groups)),
                np.flatnonzero(np.isin(group_indices, test_groups)),
            )

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        y : array-like of shape (n_samples,), default=None
            The target variable for supervised learning problems.

        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting `random_state`
        to an integer.
        """
        return super().split(X, y, groups)
class StratifiedShuffleSplit(BaseShuffleSplit):
    """Stratified ShuffleSplit cross-validator

    Provides train/test indices to split data in train/test sets.

    This cross-validation object is a merge of StratifiedKFold and
    ShuffleSplit, which returns stratified randomized folds. The folds
    are made by preserving the percentage of samples for each class.

    Note: like the ShuffleSplit strategy, stratified random splits
    do not guarantee that all folds will be different, although this is
    still very likely for sizeable datasets.

    Read more in the :ref:`User Guide <stratified_shuffle_split>`.

    For visualisation of cross-validation behaviour and
    comparison between common scikit-learn split methods
    refer to :ref:`sphx_glr_auto_examples_model_selection_plot_cv_indices.py`

    Parameters
    ----------
    n_splits : int, default=10
        Number of re-shuffling & splitting iterations.

    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.1.

    train_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, default=None
        Controls the randomness of the training and testing indices produced.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import StratifiedShuffleSplit
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([0, 0, 0, 1, 1, 1])
    >>> sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)
    >>> sss.get_n_splits(X, y)
    5
    >>> print(sss)
    StratifiedShuffleSplit(n_splits=5, random_state=0, ...)
    >>> for i, (train_index, test_index) in enumerate(sss.split(X, y)):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test: index={test_index}")
    Fold 0:
      Train: index=[5 2 3]
      Test: index=[4 1 0]
    Fold 1:
      Train: index=[5 1 4]
      Test: index=[0 2 3]
    Fold 2:
      Train: index=[5 0 2]
      Test: index=[4 3 1]
    Fold 3:
      Train: index=[4 1 0]
      Test: index=[2 3 5]
    Fold 4:
      Train: index=[0 5 1]
      Test: index=[3 4 2]
    """

    def __init__(
        self, n_splits=10, *, test_size=None, train_size=None, random_state=None
    ):
        super().__init__(
            n_splits=n_splits,
            test_size=test_size,
            train_size=train_size,
            random_state=random_state,
        )
        self._default_test_size = 0.1

    def _iter_indices(self, X, y, groups=None):
        # NOTE(review): the exact sequence of ``rng`` draws below defines the
        # reproducible output for a fixed random_state — do not reorder.
        n_samples = _num_samples(X)
        y = check_array(y, input_name="y", ensure_2d=False, dtype=None)
        n_train, n_test = _validate_shuffle_split(
            n_samples,
            self.test_size,
            self.train_size,
            default_test_size=self._default_test_size,
        )

        if y.ndim == 2:
            # for multi-label y, map each distinct row to a string repr
            # using join because str(row) uses an ellipsis if len(row) > 1000
            y = np.array([" ".join(row.astype("str")) for row in y])

        classes, y_indices = np.unique(y, return_inverse=True)
        n_classes = classes.shape[0]

        class_counts = np.bincount(y_indices)
        if np.min(class_counts) < 2:
            raise ValueError(
                "The least populated class in y has only 1"
                " member, which is too few. The minimum"
                " number of groups for any class cannot"
                " be less than 2."
            )

        if n_train < n_classes:
            raise ValueError(
                "The train_size = %d should be greater or "
                "equal to the number of classes = %d" % (n_train, n_classes)
            )
        if n_test < n_classes:
            raise ValueError(
                "The test_size = %d should be greater or "
                "equal to the number of classes = %d" % (n_test, n_classes)
            )

        # Find the sorted list of instances for each class:
        # (np.unique above performs a sort, so code is O(n logn) already)
        class_indices = np.split(
            np.argsort(y_indices, kind="mergesort"), np.cumsum(class_counts)[:-1]
        )

        rng = check_random_state(self.random_state)

        for _ in range(self.n_splits):
            # if there are ties in the class-counts, we want
            # to make sure to break them anew in each iteration
            # n_i / t_i: per-class sample counts for the train / test sets.
            n_i = _approximate_mode(class_counts, n_train, rng)
            class_counts_remaining = class_counts - n_i
            t_i = _approximate_mode(class_counts_remaining, n_test, rng)

            train = []
            test = []

            for i in range(n_classes):
                # Shuffle within each class, then take the first n_i[i]
                # samples for train and the next t_i[i] for test.
                permutation = rng.permutation(class_counts[i])
                perm_indices_class_i = class_indices[i].take(permutation, mode="clip")
                train.extend(perm_indices_class_i[: n_i[i]])
                test.extend(perm_indices_class_i[n_i[i] : n_i[i] + t_i[i]])

            # Final shuffle so samples of the same class are not contiguous.
            train = rng.permutation(train)
            test = rng.permutation(test)

            yield train, test

    def split(self, X, y, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where `n_samples` is the number of samples
            and `n_features` is the number of features.

            Note that providing ``y`` is sufficient to generate the splits and
            hence ``np.zeros(n_samples)`` may be used as a placeholder for
            ``X`` instead of actual training data.

        y : array-like of shape (n_samples,) or (n_samples, n_labels)
            The target variable for supervised learning problems.
            Stratification is done based on the y labels.

        groups : object
            Always ignored, exists for compatibility.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.

        Notes
        -----
        Randomized CV splitters may return different results for each call of
        split. You can make the results identical by setting `random_state`
        to an integer.
        """
        y = check_array(y, input_name="y", ensure_2d=False, dtype=None)
        return super().split(X, y, groups)
- def _validate_shuffle_split(n_samples, test_size, train_size, default_test_size=None):
- """
- Validation helper to check if the test/test sizes are meaningful w.r.t. the
- size of the data (n_samples).
- """
- if test_size is None and train_size is None:
- test_size = default_test_size
- test_size_type = np.asarray(test_size).dtype.kind
- train_size_type = np.asarray(train_size).dtype.kind
- if (
- test_size_type == "i"
- and (test_size >= n_samples or test_size <= 0)
- or test_size_type == "f"
- and (test_size <= 0 or test_size >= 1)
- ):
- raise ValueError(
- "test_size={0} should be either positive and smaller"
- " than the number of samples {1} or a float in the "
- "(0, 1) range".format(test_size, n_samples)
- )
- if (
- train_size_type == "i"
- and (train_size >= n_samples or train_size <= 0)
- or train_size_type == "f"
- and (train_size <= 0 or train_size >= 1)
- ):
- raise ValueError(
- "train_size={0} should be either positive and smaller"
- " than the number of samples {1} or a float in the "
- "(0, 1) range".format(train_size, n_samples)
- )
- if train_size is not None and train_size_type not in ("i", "f"):
- raise ValueError("Invalid value for train_size: {}".format(train_size))
- if test_size is not None and test_size_type not in ("i", "f"):
- raise ValueError("Invalid value for test_size: {}".format(test_size))
- if train_size_type == "f" and test_size_type == "f" and train_size + test_size > 1:
- raise ValueError(
- "The sum of test_size and train_size = {}, should be in the (0, 1)"
- " range. Reduce test_size and/or train_size.".format(train_size + test_size)
- )
- if test_size_type == "f":
- n_test = ceil(test_size * n_samples)
- elif test_size_type == "i":
- n_test = float(test_size)
- if train_size_type == "f":
- n_train = floor(train_size * n_samples)
- elif train_size_type == "i":
- n_train = float(train_size)
- if train_size is None:
- n_train = n_samples - n_test
- elif test_size is None:
- n_test = n_samples - n_train
- if n_train + n_test > n_samples:
- raise ValueError(
- "The sum of train_size and test_size = %d, "
- "should be smaller than the number of "
- "samples %d. Reduce test_size and/or "
- "train_size." % (n_train + n_test, n_samples)
- )
- n_train, n_test = int(n_train), int(n_test)
- if n_train == 0:
- raise ValueError(
- "With n_samples={}, test_size={} and train_size={}, the "
- "resulting train set will be empty. Adjust any of the "
- "aforementioned parameters.".format(n_samples, test_size, train_size)
- )
- return n_train, n_test
class PredefinedSplit(BaseCrossValidator):
    """Predefined split cross-validator.

    Provides train/test indices to split data into train/test sets using a
    predefined scheme specified by the user with the ``test_fold`` parameter.

    Read more in the :ref:`User Guide <predefined_split>`.

    .. versionadded:: 0.16

    Parameters
    ----------
    test_fold : array-like of shape (n_samples,)
        The entry ``test_fold[i]`` represents the index of the test set that
        sample ``i`` belongs to. It is possible to exclude sample ``i`` from
        any test set (i.e. include sample ``i`` in every training set) by
        setting ``test_fold[i]`` equal to -1.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import PredefinedSplit
    >>> X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
    >>> y = np.array([0, 0, 1, 1])
    >>> test_fold = [0, 1, -1, 1]
    >>> ps = PredefinedSplit(test_fold)
    >>> ps.get_n_splits()
    2
    >>> print(ps)
    PredefinedSplit(test_fold=array([ 0,  1, -1,  1]))
    >>> for i, (train_index, test_index) in enumerate(ps.split()):
    ...     print(f"Fold {i}:")
    ...     print(f"  Train: index={train_index}")
    ...     print(f"  Test:  index={test_index}")
    Fold 0:
      Train: index=[1 2 3]
      Test:  index=[0]
    Fold 1:
      Train: index=[0 2]
      Test:  index=[1 3]
    """

    def __init__(self, test_fold):
        self.test_fold = column_or_1d(np.array(test_fold, dtype=int))
        folds = np.unique(self.test_fold)
        # -1 marks samples that belong to every training set and no test set.
        self.unique_folds = folds[folds != -1]

    def split(self, X=None, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        indices = np.arange(len(self.test_fold))
        for test_mask in self._iter_test_masks():
            yield indices[~test_mask], indices[test_mask]

    def _iter_test_masks(self):
        """Generates boolean masks corresponding to test sets."""
        for fold in self.unique_folds:
            yield self.test_fold == fold

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator.

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        return len(self.unique_folds)
class _CVIterableWrapper(BaseCrossValidator):
    """Wrapper class for old style cv objects and iterables."""

    def __init__(self, cv):
        # Materialize the iterable so the splits can be counted and
        # iterated more than once.
        self.cv = list(cv)

    def get_n_splits(self, X=None, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator.

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Returns
        -------
        n_splits : int
            Returns the number of splitting iterations in the cross-validator.
        """
        return len(self.cv)

    def split(self, X=None, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : object
            Always ignored, exists for compatibility.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.

        Yields
        ------
        train : ndarray
            The training set indices for that split.

        test : ndarray
            The testing set indices for that split.
        """
        # Unpack/repack each entry so that a 2-element list in ``cv`` is
        # still yielded as a (train, test) tuple, as callers expect.
        for train_index, test_index in self.cv:
            yield train_index, test_index
def check_cv(cv=5, y=None, *, classifier=False):
    """Input checker utility for building a cross-validator.

    Parameters
    ----------
    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
        - None, to use the default 5-fold cross validation,
        - integer, to specify the number of folds.
        - :term:`CV splitter`,
        - An iterable that generates (train, test) splits as arrays of indices.

        For integer/None inputs, if classifier is True and ``y`` is either
        binary or multiclass, :class:`StratifiedKFold` is used. In all other
        cases, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

        .. versionchanged:: 0.22
            ``cv`` default value changed from 3-fold to 5-fold.

    y : array-like, default=None
        The target variable for supervised learning problems.

    classifier : bool, default=False
        Whether the task is a classification task, in which case
        stratified KFold will be used.

    Returns
    -------
    checked_cv : a cross-validator instance.
        The return value is a cross-validator which generates the train/test
        splits via the ``split`` method.
    """
    if cv is None:
        cv = 5

    if isinstance(cv, numbers.Integral):
        # For classification targets, stratify the folds by class.
        stratify = (
            classifier
            and y is not None
            and type_of_target(y, input_name="y") in ("binary", "multiclass")
        )
        return StratifiedKFold(cv) if stratify else KFold(cv)

    if hasattr(cv, "split") and not isinstance(cv, str):
        # New style cv objects are passed through without any modification.
        return cv

    if not isinstance(cv, Iterable) or isinstance(cv, str):
        raise ValueError(
            "Expected cv as an integer, cross-validation "
            "object (from sklearn.model_selection) "
            "or an iterable. Got %s." % cv
        )
    return _CVIterableWrapper(cv)
@validate_params(
    {
        "test_size": [
            Interval(RealNotInt, 0, 1, closed="neither"),
            Interval(numbers.Integral, 1, None, closed="left"),
            None,
        ],
        "train_size": [
            Interval(RealNotInt, 0, 1, closed="neither"),
            Interval(numbers.Integral, 1, None, closed="left"),
            None,
        ],
        "random_state": ["random_state"],
        "shuffle": ["boolean"],
        "stratify": ["array-like", None],
    },
    prefer_skip_nested_validation=True,
)
def train_test_split(
    *arrays,
    test_size=None,
    train_size=None,
    random_state=None,
    shuffle=True,
    stratify=None,
):
    """Split arrays or matrices into random train and test subsets.

    Quick utility that wraps input validation,
    ``next(ShuffleSplit().split(X, y))``, and application to input data
    into a single call for splitting (and optionally subsampling) data into a
    one-liner.

    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    *arrays : sequence of indexables with same length / shape[0]
        Allowed inputs are lists, numpy arrays, scipy-sparse
        matrices or pandas dataframes.

    test_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is set to the
        complement of the train size. If ``train_size`` is also None, it will
        be set to 0.25.

    train_size : float or int, default=None
        If float, should be between 0.0 and 1.0 and represent the
        proportion of the dataset to include in the train split. If
        int, represents the absolute number of train samples. If None,
        the value is automatically set to the complement of the test size.

    random_state : int, RandomState instance or None, default=None
        Controls the shuffling applied to the data before applying the split.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    shuffle : bool, default=True
        Whether or not to shuffle the data before splitting. If shuffle=False
        then stratify must be None.

    stratify : array-like, default=None
        If not None, data is split in a stratified fashion, using this as
        the class labels.
        Read more in the :ref:`User Guide <stratification>`.

    Returns
    -------
    splitting : list, length=2 * len(arrays)
        List containing train-test split of inputs.

        .. versionadded:: 0.16
            If the input is sparse, the output will be a
            ``scipy.sparse.csr_matrix``. Else, output type is the same as the
            input type.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.model_selection import train_test_split
    >>> X, y = np.arange(10).reshape((5, 2)), range(5)
    >>> X
    array([[0, 1],
           [2, 3],
           [4, 5],
           [6, 7],
           [8, 9]])
    >>> list(y)
    [0, 1, 2, 3, 4]

    >>> X_train, X_test, y_train, y_test = train_test_split(
    ...     X, y, test_size=0.33, random_state=42)
    ...
    >>> X_train
    array([[4, 5],
           [0, 1],
           [6, 7]])
    >>> y_train
    [2, 0, 3]
    >>> X_test
    array([[2, 3],
           [8, 9]])
    >>> y_test
    [1, 4]

    >>> train_test_split(y, shuffle=False)
    [[0, 1, 2], [3, 4]]
    """
    if len(arrays) == 0:
        raise ValueError("At least one array required as input")

    arrays = indexable(*arrays)

    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(
        n_samples, test_size, train_size, default_test_size=0.25
    )

    if shuffle is False:
        if stratify is not None:
            raise ValueError(
                "Stratified train/test split is not implemented for shuffle=False"
            )
        # Without shuffling, the split is simply a contiguous partition.
        train = np.arange(n_train)
        test = np.arange(n_train, n_train + n_test)
    else:
        splitter_cls = ShuffleSplit if stratify is None else StratifiedShuffleSplit
        cv = splitter_cls(
            test_size=n_test, train_size=n_train, random_state=random_state
        )
        train, test = next(cv.split(X=arrays[0], y=stratify))

    # Interleave (train, test) slices of each input:
    # [a_train, a_test, b_train, b_test, ...]
    return list(
        chain.from_iterable(
            (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays
        )
    )
# Tell nose that train_test_split is not a test.
# (Needed for external libraries that may use nose.)
# Use setattr to avoid mypy errors when monkeypatching.
setattr(train_test_split, "__test__", False)
- def _pprint(params, offset=0, printer=repr):
- """Pretty print the dictionary 'params'
- Parameters
- ----------
- params : dict
- The dictionary to pretty print
- offset : int, default=0
- The offset in characters to add at the begin of each line.
- printer : callable, default=repr
- The function to convert entries to strings, typically
- the builtin str or repr
- """
- # Do a multi-line justified repr:
- options = np.get_printoptions()
- np.set_printoptions(precision=5, threshold=64, edgeitems=2)
- params_list = list()
- this_line_length = offset
- line_sep = ",\n" + (1 + offset // 2) * " "
- for i, (k, v) in enumerate(sorted(params.items())):
- if isinstance(v, float):
- # use str for representing floating point numbers
- # this way we get consistent representation across
- # architectures and versions.
- this_repr = "%s=%s" % (k, str(v))
- else:
- # use repr of the rest
- this_repr = "%s=%s" % (k, printer(v))
- if len(this_repr) > 500:
- this_repr = this_repr[:300] + "..." + this_repr[-100:]
- if i > 0:
- if this_line_length + len(this_repr) >= 75 or "\n" in this_repr:
- params_list.append(line_sep)
- this_line_length = len(line_sep)
- else:
- params_list.append(", ")
- this_line_length += 2
- params_list.append(this_repr)
- this_line_length += len(this_repr)
- np.set_printoptions(**options)
- lines = "".join(params_list)
- # Strip trailing space to avoid nightmare in doctests
- lines = "\n".join(l.rstrip(" ") for l in lines.split("\n"))
- return lines
def _build_repr(self):
    """Build an estimator-style repr for a CV splitter.

    XXX Adapted from BaseEstimator's get_params: inspects ``__init__`` to
    discover the constructor parameters and renders their current values.
    """
    cls = self.__class__
    constructor = getattr(cls.__init__, "deprecated_original", cls.__init__)
    # Ignore varargs, kw and default values and pop self.
    constructor_signature = signature(constructor)
    if constructor is object.__init__:
        arg_names = []
    else:
        # Consider the constructor parameters excluding 'self'.
        arg_names = sorted(
            p.name
            for p in constructor_signature.parameters.values()
            if p.name != "self" and p.kind != p.VAR_KEYWORD
        )

    class_name = self.__class__.__name__
    params = {}
    for key in arg_names:
        # Deprecation warnings must always be on so deprecated parameter
        # values are detected below.  This is set in utils/__init__.py, but
        # it gets overwritten when running under python3 somehow; the filter
        # pushed here is popped again in the ``finally`` clause.
        warnings.simplefilter("always", FutureWarning)
        try:
            with warnings.catch_warnings(record=True) as caught:
                value = getattr(self, key, None)
                if value is None and hasattr(self, "cvargs"):
                    value = self.cvargs.get(key, None)
                if len(caught) and caught[0].category == FutureWarning:
                    # if the parameter is deprecated, don't show it
                    continue
        finally:
            warnings.filters.pop(0)
        params[key] = value

    return "%s(%s)" % (class_name, _pprint(params, offset=len(class_name)))
- def _yields_constant_splits(cv):
- # Return True if calling cv.split() always returns the same splits
- # We assume that if a cv doesn't have a shuffle parameter, it shuffles by
- # default (e.g. ShuffleSplit). If it actually doesn't shuffle (e.g.
- # LeaveOneOut), then it won't have a random_state parameter anyway, in
- # which case it will default to 0, leading to output=True
- shuffle = getattr(cv, "shuffle", True)
- random_state = getattr(cv, "random_state", 0)
- return isinstance(random_state, numbers.Integral) or not shuffle
|