- """
- Forest of trees-based ensemble methods.
- Those methods include random forests and extremely randomized trees.
- The module structure is the following:
- - The ``BaseForest`` base class implements a common ``fit`` method for all
- the estimators in the module. The ``fit`` method of the ``BaseForest``
- class calls the ``fit`` method of each sub-estimator on random samples
- (with replacement, a.k.a. bootstrap) of the training set.
- The init of the sub-estimator is further delegated to the
- ``BaseEnsemble`` constructor.
- - The ``ForestClassifier`` and ``ForestRegressor`` base classes further
- implement the prediction logic by computing an average of the predicted
- outcomes of the sub-estimators.
- - The ``RandomForestClassifier`` and ``RandomForestRegressor`` derived
- classes provide the user with concrete implementations of
- the forest ensemble method using classical, deterministic
- ``DecisionTreeClassifier`` and ``DecisionTreeRegressor`` as
- sub-estimator implementations.
- - The ``ExtraTreesClassifier`` and ``ExtraTreesRegressor`` derived
- classes provide the user with concrete implementations of the
- forest ensemble method using the extremely randomized trees
- ``ExtraTreeClassifier`` and ``ExtraTreeRegressor`` as
- sub-estimator implementations.
- Single and multi-output problems are both handled.
- """
- # Authors: Gilles Louppe <g.louppe@gmail.com>
- # Brian Holt <bdholt1@gmail.com>
- # Joly Arnaud <arnaud.v.joly@gmail.com>
- # Fares Hedayati <fares.hedayati@gmail.com>
- #
- # License: BSD 3 clause
- import threading
- from abc import ABCMeta, abstractmethod
- from numbers import Integral, Real
- from warnings import catch_warnings, simplefilter, warn
- import numpy as np
- from scipy.sparse import hstack as sparse_hstack
- from scipy.sparse import issparse
- from ..base import (
- ClassifierMixin,
- MultiOutputMixin,
- RegressorMixin,
- TransformerMixin,
- _fit_context,
- is_classifier,
- )
- from ..exceptions import DataConversionWarning
- from ..metrics import accuracy_score, r2_score
- from ..preprocessing import OneHotEncoder
- from ..tree import (
- BaseDecisionTree,
- DecisionTreeClassifier,
- DecisionTreeRegressor,
- ExtraTreeClassifier,
- ExtraTreeRegressor,
- )
- from ..tree._tree import DOUBLE, DTYPE
- from ..utils import check_random_state, compute_sample_weight
- from ..utils._param_validation import Interval, RealNotInt, StrOptions
- from ..utils.multiclass import check_classification_targets, type_of_target
- from ..utils.parallel import Parallel, delayed
- from ..utils.validation import (
- _check_feature_names_in,
- _check_sample_weight,
- _num_samples,
- check_is_fitted,
- )
- from ._base import BaseEnsemble, _partition_estimators
- __all__ = [
- "RandomForestClassifier",
- "RandomForestRegressor",
- "ExtraTreesClassifier",
- "ExtraTreesRegressor",
- "RandomTreesEmbedding",
- ]
- MAX_INT = np.iinfo(np.int32).max
- def _get_n_samples_bootstrap(n_samples, max_samples):
- """
- Get the number of samples in a bootstrap sample.
- Parameters
- ----------
- n_samples : int
- Number of samples in the dataset.
- max_samples : int or float
- The maximum number of samples to draw from the total available:
- - if float, this indicates a fraction of the total and should be
- in the interval `(0.0, 1.0]`;
- - if int, this indicates the exact number of samples;
- - if None, this indicates the total number of samples.
- Returns
- -------
- n_samples_bootstrap : int
- The total number of samples to draw for the bootstrap sample.
- """
- if max_samples is None:
- return n_samples
- if isinstance(max_samples, Integral):
- if max_samples > n_samples:
- msg = "`max_samples` must be <= n_samples={} but got value {}"
- raise ValueError(msg.format(n_samples, max_samples))
- return max_samples
- if isinstance(max_samples, Real):
- return max(round(n_samples * max_samples), 1)
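- # A minimal illustrative sketch (not part of the module source) of how the
- # helper above maps ``max_samples`` to a bootstrap size; the numbers are
- # assumed for the example:
- #
- # >>> _get_n_samples_bootstrap(n_samples=100, max_samples=None)
- # 100
- # >>> _get_n_samples_bootstrap(n_samples=100, max_samples=30)
- # 30
- # >>> _get_n_samples_bootstrap(n_samples=100, max_samples=0.25)
- # 25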
- def _generate_sample_indices(random_state, n_samples, n_samples_bootstrap):
- """
- Private function used by the _parallel_build_trees function."""
- random_instance = check_random_state(random_state)
- sample_indices = random_instance.randint(0, n_samples, n_samples_bootstrap)
- return sample_indices
- def _generate_unsampled_indices(random_state, n_samples, n_samples_bootstrap):
- """
- Private function used by the forest._compute_oob_predictions function."""
- sample_indices = _generate_sample_indices(
- random_state, n_samples, n_samples_bootstrap
- )
- sample_counts = np.bincount(sample_indices, minlength=n_samples)
- unsampled_mask = sample_counts == 0
- indices_range = np.arange(n_samples)
- unsampled_indices = indices_range[unsampled_mask]
- return unsampled_indices
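- # Illustrative sketch (assumed toy numbers, not part of the module source):
- # for a fixed random state the out-of-bag indices are exactly the samples
- # the bootstrap draw missed, so the two index sets partition the data.
- #
- # >>> import numpy as np
- # >>> drawn = _generate_sample_indices(0, n_samples=10, n_samples_bootstrap=10)
- # >>> oob = _generate_unsampled_indices(0, n_samples=10, n_samples_bootstrap=10)
- # >>> np.intersect1d(drawn, oob).size
- # 0
- # >>> np.union1d(drawn, oob).size
- # 10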
- def _parallel_build_trees(
- tree,
- bootstrap,
- X,
- y,
- sample_weight,
- tree_idx,
- n_trees,
- verbose=0,
- class_weight=None,
- n_samples_bootstrap=None,
- ):
- """
- Private function used to fit a single tree in parallel."""
- if verbose > 1:
- print("building tree %d of %d" % (tree_idx + 1, n_trees))
- if bootstrap:
- n_samples = X.shape[0]
- if sample_weight is None:
- curr_sample_weight = np.ones((n_samples,), dtype=np.float64)
- else:
- curr_sample_weight = sample_weight.copy()
- indices = _generate_sample_indices(
- tree.random_state, n_samples, n_samples_bootstrap
- )
- sample_counts = np.bincount(indices, minlength=n_samples)
- curr_sample_weight *= sample_counts
- if class_weight == "subsample":
- with catch_warnings():
- simplefilter("ignore", DeprecationWarning)
- curr_sample_weight *= compute_sample_weight("auto", y, indices=indices)
- elif class_weight == "balanced_subsample":
- curr_sample_weight *= compute_sample_weight("balanced", y, indices=indices)
- tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
- else:
- tree.fit(X, y, sample_weight=sample_weight, check_input=False)
- return tree
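- # Illustrative sketch (assumed toy numbers, not part of the module source):
- # the bootstrap above is realised through sample weights, i.e. each tree
- # sees the full ``X`` but weights every row by how many times it was drawn.
- #
- # >>> import numpy as np
- # >>> indices = np.array([0, 0, 2, 3, 3, 3])
- # >>> np.bincount(indices, minlength=5)
- # array([2, 0, 1, 3, 0])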
- class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta):
- """
- Base class for forests of trees.
- Warning: This class should not be used directly. Use derived classes
- instead.
- """
- _parameter_constraints: dict = {
- "n_estimators": [Interval(Integral, 1, None, closed="left")],
- "bootstrap": ["boolean"],
- "oob_score": ["boolean", callable],
- "n_jobs": [Integral, None],
- "random_state": ["random_state"],
- "verbose": ["verbose"],
- "warm_start": ["boolean"],
- "max_samples": [
- None,
- Interval(RealNotInt, 0.0, 1.0, closed="right"),
- Interval(Integral, 1, None, closed="left"),
- ],
- }
- @abstractmethod
- def __init__(
- self,
- estimator,
- n_estimators=100,
- *,
- estimator_params=tuple(),
- bootstrap=False,
- oob_score=False,
- n_jobs=None,
- random_state=None,
- verbose=0,
- warm_start=False,
- class_weight=None,
- max_samples=None,
- base_estimator="deprecated",
- ):
- super().__init__(
- estimator=estimator,
- n_estimators=n_estimators,
- estimator_params=estimator_params,
- base_estimator=base_estimator,
- )
- self.bootstrap = bootstrap
- self.oob_score = oob_score
- self.n_jobs = n_jobs
- self.random_state = random_state
- self.verbose = verbose
- self.warm_start = warm_start
- self.class_weight = class_weight
- self.max_samples = max_samples
- def apply(self, X):
- """
- Apply trees in the forest to X, return leaf indices.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Internally, its dtype will be converted to
- ``dtype=np.float32``. If a sparse matrix is provided, it will be
- converted into a sparse ``csr_matrix``.
- Returns
- -------
- X_leaves : ndarray of shape (n_samples, n_estimators)
- For each datapoint x in X and for each tree in the forest,
- return the index of the leaf x ends up in.
- """
- X = self._validate_X_predict(X)
- results = Parallel(
- n_jobs=self.n_jobs,
- verbose=self.verbose,
- prefer="threads",
- )(delayed(tree.apply)(X, check_input=False) for tree in self.estimators_)
- return np.array(results).T
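- # Illustrative usage sketch (assumed data, not part of the module source):
- # ``apply`` returns one leaf index per sample and per tree.
- #
- # >>> from sklearn.ensemble import RandomForestClassifier
- # >>> from sklearn.datasets import make_classification
- # >>> X, y = make_classification(n_samples=20, random_state=0)
- # >>> forest = RandomForestClassifier(n_estimators=3, random_state=0).fit(X, y)
- # >>> forest.apply(X).shape
- # (20, 3)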
- def decision_path(self, X):
- """
- Return the decision path in the forest.
- .. versionadded:: 0.18
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Internally, its dtype will be converted to
- ``dtype=np.float32``. If a sparse matrix is provided, it will be
- converted into a sparse ``csr_matrix``.
- Returns
- -------
- indicator : sparse matrix of shape (n_samples, n_nodes)
- Return a node indicator matrix where non-zero elements indicate
- that the samples go through the nodes. The matrix is in CSR
- format.
- n_nodes_ptr : ndarray of shape (n_estimators + 1,)
- The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
- give the indicator value for the i-th estimator.
- """
- X = self._validate_X_predict(X)
- indicators = Parallel(
- n_jobs=self.n_jobs,
- verbose=self.verbose,
- prefer="threads",
- )(
- delayed(tree.decision_path)(X, check_input=False)
- for tree in self.estimators_
- )
- n_nodes = [0]
- n_nodes.extend([i.shape[1] for i in indicators])
- n_nodes_ptr = np.array(n_nodes).cumsum()
- return sparse_hstack(indicators).tocsr(), n_nodes_ptr
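- # Illustrative sketch (assumed data, not part of the module source): the
- # indicator matrix stacks the per-tree node indicators column-wise and
- # ``n_nodes_ptr`` delimits each tree's block of columns.
- #
- # >>> from sklearn.ensemble import RandomForestClassifier
- # >>> from sklearn.datasets import make_classification
- # >>> X, y = make_classification(n_samples=20, random_state=0)
- # >>> forest = RandomForestClassifier(n_estimators=3, random_state=0).fit(X, y)
- # >>> indicator, n_nodes_ptr = forest.decision_path(X)
- # >>> indicator.shape[0], len(n_nodes_ptr)
- # (20, 4)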
- @_fit_context(prefer_skip_nested_validation=True)
- def fit(self, X, y, sample_weight=None):
- """
- Build a forest of trees from the training set (X, y).
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The training input samples. Internally, its dtype will be converted
- to ``dtype=np.float32``. If a sparse matrix is provided, it will be
- converted into a sparse ``csc_matrix``.
- y : array-like of shape (n_samples,) or (n_samples, n_outputs)
- The target values (class labels in classification, real numbers in
- regression).
- sample_weight : array-like of shape (n_samples,), default=None
- Sample weights. If None, then samples are equally weighted. Splits
- that would create child nodes with net zero or negative weight are
- ignored while searching for a split in each node. In the case of
- classification, splits are also ignored if they would result in any
- single class carrying a negative weight in either child node.
- Returns
- -------
- self : object
- Fitted estimator.
- """
- # Validate or convert input data
- if issparse(y):
- raise ValueError("sparse multilabel-indicator for y is not supported.")
- X, y = self._validate_data(
- X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
- )
- if sample_weight is not None:
- sample_weight = _check_sample_weight(sample_weight, X)
- if issparse(X):
- # Pre-sort indices to avoid having each individual tree of the
- # ensemble sort the indices.
- X.sort_indices()
- y = np.atleast_1d(y)
- if y.ndim == 2 and y.shape[1] == 1:
- warn(
- (
- "A column-vector y was passed when a 1d array was"
- " expected. Please change the shape of y to "
- "(n_samples,), for example using ravel()."
- ),
- DataConversionWarning,
- stacklevel=2,
- )
- if y.ndim == 1:
- # reshape is necessary to preserve the data contiguity, unlike
- # [:, np.newaxis] which does not.
- y = np.reshape(y, (-1, 1))
- if self.criterion == "poisson":
- if np.any(y < 0):
- raise ValueError(
- "Some value(s) of y are negative which is "
- "not allowed for Poisson regression."
- )
- if np.sum(y) <= 0:
- raise ValueError(
- "Sum of y is not strictly positive which "
- "is necessary for Poisson regression."
- )
- self.n_outputs_ = y.shape[1]
- y, expanded_class_weight = self._validate_y_class_weight(y)
- if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
- y = np.ascontiguousarray(y, dtype=DOUBLE)
- if expanded_class_weight is not None:
- if sample_weight is not None:
- sample_weight = sample_weight * expanded_class_weight
- else:
- sample_weight = expanded_class_weight
- if not self.bootstrap and self.max_samples is not None:
- raise ValueError(
- "`max_sample` cannot be set if `bootstrap=False`. "
- "Either switch to `bootstrap=True` or set "
- "`max_sample=None`."
- )
- elif self.bootstrap:
- n_samples_bootstrap = _get_n_samples_bootstrap(
- n_samples=X.shape[0], max_samples=self.max_samples
- )
- else:
- n_samples_bootstrap = None
- self._validate_estimator()
- if not self.bootstrap and self.oob_score:
- raise ValueError("Out of bag estimation only available if bootstrap=True")
- random_state = check_random_state(self.random_state)
- if not self.warm_start or not hasattr(self, "estimators_"):
- # Free allocated memory, if any
- self.estimators_ = []
- n_more_estimators = self.n_estimators - len(self.estimators_)
- if n_more_estimators < 0:
- raise ValueError(
- "n_estimators=%d must be larger or equal to "
- "len(estimators_)=%d when warm_start==True"
- % (self.n_estimators, len(self.estimators_))
- )
- elif n_more_estimators == 0:
- warn(
- "Warm-start fitting without increasing n_estimators does not "
- "fit new trees."
- )
- else:
- if self.warm_start and len(self.estimators_) > 0:
- # We draw from the random state to get the random state we
- # would have got if we hadn't used a warm_start.
- random_state.randint(MAX_INT, size=len(self.estimators_))
- trees = [
- self._make_estimator(append=False, random_state=random_state)
- for i in range(n_more_estimators)
- ]
- # Parallel loop: we prefer the threading backend as the Cython code
- # for fitting the trees is internally releasing the Python GIL
- # making threading more efficient than multiprocessing in
- # that case. However, for joblib 0.12+ we respect any
- # parallel_backend contexts set at a higher level,
- # since correctness does not rely on using threads.
- trees = Parallel(
- n_jobs=self.n_jobs,
- verbose=self.verbose,
- prefer="threads",
- )(
- delayed(_parallel_build_trees)(
- t,
- self.bootstrap,
- X,
- y,
- sample_weight,
- i,
- len(trees),
- verbose=self.verbose,
- class_weight=self.class_weight,
- n_samples_bootstrap=n_samples_bootstrap,
- )
- for i, t in enumerate(trees)
- )
- # Collect newly grown trees
- self.estimators_.extend(trees)
- if self.oob_score and (
- n_more_estimators > 0 or not hasattr(self, "oob_score_")
- ):
- y_type = type_of_target(y)
- if y_type in ("multiclass-multioutput", "unknown"):
- # FIXME: we could consider supporting multiclass-multioutput if
- # we introduce or reuse a constructor parameter (e.g.
- # oob_score) allowing the user to pass a callable defining the
- # scoring strategy on OOB samples.
- raise ValueError(
- "The type of target cannot be used to compute OOB "
- f"estimates. Got {y_type} while only the following are "
- "supported: continuous, continuous-multioutput, binary, "
- "multiclass, multilabel-indicator."
- )
- if callable(self.oob_score):
- self._set_oob_score_and_attributes(
- X, y, scoring_function=self.oob_score
- )
- else:
- self._set_oob_score_and_attributes(X, y)
- # Decapsulate classes_ attributes
- if hasattr(self, "classes_") and self.n_outputs_ == 1:
- self.n_classes_ = self.n_classes_[0]
- self.classes_ = self.classes_[0]
- return self
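- # Illustrative sketch of the warm-start branch handled above (assumed data,
- # not part of the module source): raising ``n_estimators`` between calls to
- # ``fit`` only grows the additional trees instead of refitting the forest.
- #
- # >>> from sklearn.ensemble import RandomForestClassifier
- # >>> from sklearn.datasets import make_classification
- # >>> X, y = make_classification(n_samples=50, random_state=0)
- # >>> forest = RandomForestClassifier(n_estimators=5, warm_start=True,
- # ...                                 random_state=0).fit(X, y)
- # >>> _ = forest.set_params(n_estimators=8).fit(X, y)  # fits 3 more trees
- # >>> len(forest.estimators_)
- # 8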
- @abstractmethod
- def _set_oob_score_and_attributes(self, X, y, scoring_function=None):
- """Compute and set the OOB score and attributes.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- The data matrix.
- y : ndarray of shape (n_samples, n_outputs)
- The target matrix.
- scoring_function : callable, default=None
- Scoring function for OOB score. Default depends on whether
- this is a regression (R2 score) or classification problem
- (accuracy score).
- """
- def _compute_oob_predictions(self, X, y):
- """Compute and set the OOB score.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- The data matrix.
- y : ndarray of shape (n_samples, n_outputs)
- The target matrix.
- Returns
- -------
- oob_pred : ndarray of shape (n_samples, n_classes, n_outputs) or \
- (n_samples, 1, n_outputs)
- The OOB predictions.
- """
- # Prediction requires X to be in CSR format
- if issparse(X):
- X = X.tocsr()
- n_samples = y.shape[0]
- n_outputs = self.n_outputs_
- if is_classifier(self) and hasattr(self, "n_classes_"):
- # n_classes_ is an ndarray at this stage
- # all the supported types of target will have the same number of
- # classes in all outputs
- oob_pred_shape = (n_samples, self.n_classes_[0], n_outputs)
- else:
- # for regression, n_classes_ does not exist and we create an empty
- # axis to be consistent with the classification case and make
- # the array operations compatible with the 2 settings
- oob_pred_shape = (n_samples, 1, n_outputs)
- oob_pred = np.zeros(shape=oob_pred_shape, dtype=np.float64)
- n_oob_pred = np.zeros((n_samples, n_outputs), dtype=np.int64)
- n_samples_bootstrap = _get_n_samples_bootstrap(
- n_samples,
- self.max_samples,
- )
- for estimator in self.estimators_:
- unsampled_indices = _generate_unsampled_indices(
- estimator.random_state,
- n_samples,
- n_samples_bootstrap,
- )
- y_pred = self._get_oob_predictions(estimator, X[unsampled_indices, :])
- oob_pred[unsampled_indices, ...] += y_pred
- n_oob_pred[unsampled_indices, :] += 1
- for k in range(n_outputs):
- if (n_oob_pred == 0).any():
- warn(
- (
- "Some inputs do not have OOB scores. This probably means "
- "too few trees were used to compute any reliable OOB "
- "estimates."
- ),
- UserWarning,
- )
- n_oob_pred[n_oob_pred == 0] = 1
- oob_pred[..., k] /= n_oob_pred[..., [k]]
- return oob_pred
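- # Illustrative sketch (assumed data, not part of the module source): with
- # ``bootstrap=True`` and ``oob_score=True`` the routine above yields the
- # out-of-bag attributes on the fitted forest.
- #
- # >>> from sklearn.ensemble import RandomForestClassifier
- # >>> from sklearn.datasets import make_classification
- # >>> X, y = make_classification(n_samples=100, random_state=0)
- # >>> forest = RandomForestClassifier(oob_score=True, random_state=0).fit(X, y)
- # >>> forest.oob_decision_function_.shape  # (n_samples, n_classes)
- # (100, 2)
- # >>> 0.0 <= forest.oob_score_ <= 1.0
- # True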
- def _validate_y_class_weight(self, y):
- # Default implementation
- return y, None
- def _validate_X_predict(self, X):
- """
- Validate X whenever one tries to predict, apply, or call predict_proba."""
- check_is_fitted(self)
- X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False)
- if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc):
- raise ValueError("No support for np.int64 index based sparse matrices")
- return X
- @property
- def feature_importances_(self):
- """
- The impurity-based feature importances.
- The higher, the more important the feature.
- The importance of a feature is computed as the (normalized)
- total reduction of the criterion brought by that feature. It is also
- known as the Gini importance.
- Warning: impurity-based feature importances can be misleading for
- high cardinality features (many unique values). See
- :func:`sklearn.inspection.permutation_importance` as an alternative.
- Returns
- -------
- feature_importances_ : ndarray of shape (n_features,)
- The values of this array sum to 1, unless all trees are single node
- trees consisting of only the root node, in which case it will be an
- array of zeros.
- """
- check_is_fitted(self)
- all_importances = Parallel(n_jobs=self.n_jobs, prefer="threads")(
- delayed(getattr)(tree, "feature_importances_")
- for tree in self.estimators_
- if tree.tree_.node_count > 1
- )
- if not all_importances:
- return np.zeros(self.n_features_in_, dtype=np.float64)
- all_importances = np.mean(all_importances, axis=0, dtype=np.float64)
- return all_importances / np.sum(all_importances)
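- # Illustrative sketch (assumed data, not part of the module source): the
- # averaged importances computed above are normalised to sum to one.
- #
- # >>> from sklearn.ensemble import RandomForestRegressor
- # >>> from sklearn.datasets import make_regression
- # >>> X, y = make_regression(n_samples=50, n_features=4, random_state=0)
- # >>> forest = RandomForestRegressor(n_estimators=10, random_state=0).fit(X, y)
- # >>> forest.feature_importances_.shape
- # (4,)
- # >>> float(round(forest.feature_importances_.sum(), 6))
- # 1.0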
- def _accumulate_prediction(predict, X, out, lock):
- """
- This is a utility function for joblib's Parallel.
- It can't go locally in ForestClassifier or ForestRegressor, because joblib
- complains that it cannot pickle it when placed there.
- """
- prediction = predict(X, check_input=False)
- with lock:
- if len(out) == 1:
- out[0] += prediction
- else:
- for i in range(len(out)):
- out[i] += prediction[i]
- class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta):
- """
- Base class for forest of trees-based classifiers.
- Warning: This class should not be used directly. Use derived classes
- instead.
- """
- @abstractmethod
- def __init__(
- self,
- estimator,
- n_estimators=100,
- *,
- estimator_params=tuple(),
- bootstrap=False,
- oob_score=False,
- n_jobs=None,
- random_state=None,
- verbose=0,
- warm_start=False,
- class_weight=None,
- max_samples=None,
- base_estimator="deprecated",
- ):
- super().__init__(
- estimator=estimator,
- n_estimators=n_estimators,
- estimator_params=estimator_params,
- bootstrap=bootstrap,
- oob_score=oob_score,
- n_jobs=n_jobs,
- random_state=random_state,
- verbose=verbose,
- warm_start=warm_start,
- class_weight=class_weight,
- max_samples=max_samples,
- base_estimator=base_estimator,
- )
- @staticmethod
- def _get_oob_predictions(tree, X):
- """Compute the OOB predictions for an individual tree.
- Parameters
- ----------
- tree : DecisionTreeClassifier object
- A single decision tree classifier.
- X : ndarray of shape (n_samples, n_features)
- The OOB samples.
- Returns
- -------
- y_pred : ndarray of shape (n_samples, n_classes, n_outputs)
- The OOB associated predictions.
- """
- y_pred = tree.predict_proba(X, check_input=False)
- y_pred = np.array(y_pred, copy=False)
- if y_pred.ndim == 2:
- # binary and multiclass
- y_pred = y_pred[..., np.newaxis]
- else:
- # Roll the first `n_outputs` axis to the last axis. We will reshape
- # from a shape of (n_outputs, n_samples, n_classes) to a shape of
- # (n_samples, n_classes, n_outputs).
- y_pred = np.rollaxis(y_pred, axis=0, start=3)
- return y_pred
- def _set_oob_score_and_attributes(self, X, y, scoring_function=None):
- """Compute and set the OOB score and attributes.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- The data matrix.
- y : ndarray of shape (n_samples, n_outputs)
- The target matrix.
- scoring_function : callable, default=None
- Scoring function for OOB score. Defaults to `accuracy_score`.
- """
- self.oob_decision_function_ = super()._compute_oob_predictions(X, y)
- if self.oob_decision_function_.shape[-1] == 1:
- # drop the n_outputs axis if there is a single output
- self.oob_decision_function_ = self.oob_decision_function_.squeeze(axis=-1)
- if scoring_function is None:
- scoring_function = accuracy_score
- self.oob_score_ = scoring_function(
- y, np.argmax(self.oob_decision_function_, axis=1)
- )
- def _validate_y_class_weight(self, y):
- check_classification_targets(y)
- y = np.copy(y)
- expanded_class_weight = None
- if self.class_weight is not None:
- y_original = np.copy(y)
- self.classes_ = []
- self.n_classes_ = []
- y_store_unique_indices = np.zeros(y.shape, dtype=int)
- for k in range(self.n_outputs_):
- classes_k, y_store_unique_indices[:, k] = np.unique(
- y[:, k], return_inverse=True
- )
- self.classes_.append(classes_k)
- self.n_classes_.append(classes_k.shape[0])
- y = y_store_unique_indices
- if self.class_weight is not None:
- valid_presets = ("balanced", "balanced_subsample")
- if isinstance(self.class_weight, str):
- if self.class_weight not in valid_presets:
- raise ValueError(
- "Valid presets for class_weight include "
- '"balanced" and "balanced_subsample".'
- 'Given "%s".'
- % self.class_weight
- )
- if self.warm_start:
- warn(
- 'class_weight presets "balanced" or '
- '"balanced_subsample" are '
- "not recommended for warm_start if the fitted data "
- "differs from the full dataset. In order to use "
- '"balanced" weights, use compute_class_weight '
- '("balanced", classes, y). In place of y you can use '
- "a large enough sample of the full training set "
- "target to properly estimate the class frequency "
- "distributions. Pass the resulting weights as the "
- "class_weight parameter."
- )
- if self.class_weight != "balanced_subsample" or not self.bootstrap:
- if self.class_weight == "balanced_subsample":
- class_weight = "balanced"
- else:
- class_weight = self.class_weight
- expanded_class_weight = compute_sample_weight(class_weight, y_original)
- return y, expanded_class_weight
- def predict(self, X):
- """
- Predict class for X.
- The predicted class of an input sample is a vote by the trees in
- the forest, weighted by their probability estimates. That is,
- the predicted class is the one with highest mean probability
- estimate across the trees.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Internally, its dtype will be converted to
- ``dtype=np.float32``. If a sparse matrix is provided, it will be
- converted into a sparse ``csr_matrix``.
- Returns
- -------
- y : ndarray of shape (n_samples,) or (n_samples, n_outputs)
- The predicted classes.
- """
- proba = self.predict_proba(X)
- if self.n_outputs_ == 1:
- return self.classes_.take(np.argmax(proba, axis=1), axis=0)
- else:
- n_samples = proba[0].shape[0]
- # all dtypes should be the same, so just take the first
- class_type = self.classes_[0].dtype
- predictions = np.empty((n_samples, self.n_outputs_), dtype=class_type)
- for k in range(self.n_outputs_):
- predictions[:, k] = self.classes_[k].take(
- np.argmax(proba[k], axis=1), axis=0
- )
- return predictions
- def predict_proba(self, X):
- """
- Predict class probabilities for X.
- The predicted class probabilities of an input sample are computed as
- the mean predicted class probabilities of the trees in the forest.
- The class probability of a single tree is the fraction of samples of
- the same class in a leaf.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Internally, its dtype will be converted to
- ``dtype=np.float32``. If a sparse matrix is provided, it will be
- converted into a sparse ``csr_matrix``.
- Returns
- -------
- p : ndarray of shape (n_samples, n_classes), or a list of such arrays
- The class probabilities of the input samples. The order of the
- classes corresponds to that in the attribute :term:`classes_`.
- """
- check_is_fitted(self)
- # Check data
- X = self._validate_X_predict(X)
- # Assign chunk of trees to jobs
- n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)
- # avoid storing the output of every estimator by summing them here
- all_proba = [
- np.zeros((X.shape[0], j), dtype=np.float64)
- for j in np.atleast_1d(self.n_classes_)
- ]
- lock = threading.Lock()
- Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")(
- delayed(_accumulate_prediction)(e.predict_proba, X, all_proba, lock)
- for e in self.estimators_
- )
- for proba in all_proba:
- proba /= len(self.estimators_)
- if len(all_proba) == 1:
- return all_proba[0]
- else:
- return all_proba
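- # Illustrative sketch (assumed data, not part of the module source): the
- # forest probabilities are the per-tree probabilities averaged above, so
- # each row sums to one and ``predict`` is their argmax mapped back to the
- # class labels.
- #
- # >>> import numpy as np
- # >>> from sklearn.ensemble import RandomForestClassifier
- # >>> from sklearn.datasets import make_classification
- # >>> X, y = make_classification(n_samples=30, random_state=0)
- # >>> forest = RandomForestClassifier(n_estimators=5, random_state=0).fit(X, y)
- # >>> proba = forest.predict_proba(X)
- # >>> bool(np.allclose(proba.sum(axis=1), 1.0))
- # True
- # >>> bool(np.array_equal(forest.predict(X), forest.classes_[proba.argmax(axis=1)]))
- # True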
- def predict_log_proba(self, X):
- """
- Predict class log-probabilities for X.
- The predicted class log-probabilities of an input sample are computed as
- the log of the mean predicted class probabilities of the trees in the
- forest.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Internally, its dtype will be converted to
- ``dtype=np.float32``. If a sparse matrix is provided, it will be
- converted into a sparse ``csr_matrix``.
- Returns
- -------
- p : ndarray of shape (n_samples, n_classes), or a list of such arrays
- The class probabilities of the input samples. The order of the
- classes corresponds to that in the attribute :term:`classes_`.
- """
- proba = self.predict_proba(X)
- if self.n_outputs_ == 1:
- return np.log(proba)
- else:
- for k in range(self.n_outputs_):
- proba[k] = np.log(proba[k])
- return proba
- def _more_tags(self):
- return {"multilabel": True}
- class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta):
- """
- Base class for forest of trees-based regressors.
- Warning: This class should not be used directly. Use derived classes
- instead.
- """
- @abstractmethod
- def __init__(
- self,
- estimator,
- n_estimators=100,
- *,
- estimator_params=tuple(),
- bootstrap=False,
- oob_score=False,
- n_jobs=None,
- random_state=None,
- verbose=0,
- warm_start=False,
- max_samples=None,
- base_estimator="deprecated",
- ):
- super().__init__(
- estimator,
- n_estimators=n_estimators,
- estimator_params=estimator_params,
- bootstrap=bootstrap,
- oob_score=oob_score,
- n_jobs=n_jobs,
- random_state=random_state,
- verbose=verbose,
- warm_start=warm_start,
- max_samples=max_samples,
- base_estimator=base_estimator,
- )
- def predict(self, X):
- """
- Predict regression target for X.
- The predicted regression target of an input sample is computed as the
- mean predicted regression targets of the trees in the forest.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Internally, its dtype will be converted to
- ``dtype=np.float32``. If a sparse matrix is provided, it will be
- converted into a sparse ``csr_matrix``.
- Returns
- -------
- y : ndarray of shape (n_samples,) or (n_samples, n_outputs)
- The predicted values.
- """
- check_is_fitted(self)
- # Check data
- X = self._validate_X_predict(X)
- # Assign chunk of trees to jobs
- n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)
- # avoid storing the output of every estimator by summing them here
- if self.n_outputs_ > 1:
- y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64)
- else:
- y_hat = np.zeros((X.shape[0]), dtype=np.float64)
- # Parallel loop
- lock = threading.Lock()
- Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")(
- delayed(_accumulate_prediction)(e.predict, X, [y_hat], lock)
- for e in self.estimators_
- )
- y_hat /= len(self.estimators_)
- return y_hat
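- # Illustrative sketch (assumed data, not part of the module source): the
- # accumulation above is simply an average of the individual trees'
- # predictions.
- #
- # >>> import numpy as np
- # >>> from sklearn.ensemble import RandomForestRegressor
- # >>> from sklearn.datasets import make_regression
- # >>> X, y = make_regression(n_samples=40, n_features=3, random_state=0)
- # >>> forest = RandomForestRegressor(n_estimators=4, random_state=0).fit(X, y)
- # >>> per_tree = np.stack([tree.predict(X) for tree in forest.estimators_])
- # >>> bool(np.allclose(forest.predict(X), per_tree.mean(axis=0)))
- # True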
- @staticmethod
- def _get_oob_predictions(tree, X):
- """Compute the OOB predictions for an individual tree.
- Parameters
- ----------
- tree : DecisionTreeRegressor object
- A single decision tree regressor.
- X : ndarray of shape (n_samples, n_features)
- The OOB samples.
- Returns
- -------
- y_pred : ndarray of shape (n_samples, 1, n_outputs)
- The OOB associated predictions.
- """
- y_pred = tree.predict(X, check_input=False)
- if y_pred.ndim == 1:
- # single output regression
- y_pred = y_pred[:, np.newaxis, np.newaxis]
- else:
- # multioutput regression
- y_pred = y_pred[:, np.newaxis, :]
- return y_pred
- def _set_oob_score_and_attributes(self, X, y, scoring_function=None):
- """Compute and set the OOB score and attributes.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- The data matrix.
- y : ndarray of shape (n_samples, n_outputs)
- The target matrix.
- scoring_function : callable, default=None
- Scoring function for OOB score. Defaults to `r2_score`.
- """
- self.oob_prediction_ = super()._compute_oob_predictions(X, y).squeeze(axis=1)
- if self.oob_prediction_.shape[-1] == 1:
- # drop the n_outputs axis if there is a single output
- self.oob_prediction_ = self.oob_prediction_.squeeze(axis=-1)
- if scoring_function is None:
- scoring_function = r2_score
- self.oob_score_ = scoring_function(y, self.oob_prediction_)
- def _compute_partial_dependence_recursion(self, grid, target_features):
- """Fast partial dependence computation.
- Parameters
- ----------
- grid : ndarray of shape (n_samples, n_target_features)
- The grid points on which the partial dependence should be
- evaluated.
- target_features : ndarray of shape (n_target_features,)
- The set of target features for which the partial dependence
- should be evaluated.
- Returns
- -------
- averaged_predictions : ndarray of shape (n_samples,)
- The value of the partial dependence function on each grid point.
- """
- grid = np.asarray(grid, dtype=DTYPE, order="C")
- averaged_predictions = np.zeros(
- shape=grid.shape[0], dtype=np.float64, order="C"
- )
- for tree in self.estimators_:
- # Note: we don't sum in parallel because the GIL isn't released in
- # the fast method.
- tree.tree_.compute_partial_dependence(
- grid, target_features, averaged_predictions
- )
- # Average over the forest
- averaged_predictions /= len(self.estimators_)
- return averaged_predictions
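- # Illustrative sketch (assumed data, not part of the module source): this
- # fast path is the one used by :func:`sklearn.inspection.partial_dependence`
- # when it is called with ``method="recursion"`` on a fitted forest regressor.
- #
- # >>> from sklearn.ensemble import RandomForestRegressor
- # >>> from sklearn.datasets import make_regression
- # >>> from sklearn.inspection import partial_dependence
- # >>> X, y = make_regression(n_samples=50, n_features=3, random_state=0)
- # >>> forest = RandomForestRegressor(n_estimators=5, random_state=0).fit(X, y)
- # >>> result = partial_dependence(forest, X, features=[0], method="recursion")
- # >>> result["average"].shape[0]
- # 1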
- def _more_tags(self):
- return {"multilabel": True}
- class RandomForestClassifier(ForestClassifier):
- """
- A random forest classifier.
- A random forest is a meta estimator that fits a number of decision tree
- classifiers on various sub-samples of the dataset and uses averaging to
- improve the predictive accuracy and control over-fitting.
- The sub-sample size is controlled with the `max_samples` parameter if
- `bootstrap=True` (default), otherwise the whole dataset is used to build
- each tree.
- For a comparison between tree-based ensemble models see the example
- :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`.
- Read more in the :ref:`User Guide <forest>`.
- Parameters
- ----------
- n_estimators : int, default=100
- The number of trees in the forest.
- .. versionchanged:: 0.22
- The default value of ``n_estimators`` changed from 10 to 100
- in 0.22.
- criterion : {"gini", "entropy", "log_loss"}, default="gini"
- The function to measure the quality of a split. Supported criteria are
- "gini" for the Gini impurity and "log_loss" and "entropy" both for the
- Shannon information gain, see :ref:`tree_mathematical_formulation`.
- Note: This parameter is tree-specific.
- max_depth : int, default=None
- The maximum depth of the tree. If None, then nodes are expanded until
- all leaves are pure or until all leaves contain less than
- min_samples_split samples.
- min_samples_split : int or float, default=2
- The minimum number of samples required to split an internal node:
- - If int, then consider `min_samples_split` as the minimum number.
- - If float, then `min_samples_split` is a fraction and
- `ceil(min_samples_split * n_samples)` are the minimum
- number of samples for each split.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_samples_leaf : int or float, default=1
- The minimum number of samples required to be at a leaf node.
- A split point at any depth will only be considered if it leaves at
- least ``min_samples_leaf`` training samples in each of the left and
- right branches. This may have the effect of smoothing the model,
- especially in regression.
- - If int, then consider `min_samples_leaf` as the minimum number.
- - If float, then `min_samples_leaf` is a fraction and
- `ceil(min_samples_leaf * n_samples)` are the minimum
- number of samples for each node.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_weight_fraction_leaf : float, default=0.0
- The minimum weighted fraction of the sum total of weights (of all
- the input samples) required to be at a leaf node. Samples have
- equal weight when sample_weight is not provided.
- max_features : {"sqrt", "log2", None}, int or float, default="sqrt"
- The number of features to consider when looking for the best split:
- - If int, then consider `max_features` features at each split.
- - If float, then `max_features` is a fraction and
- `max(1, int(max_features * n_features_in_))` features are considered at each
- split.
- - If "sqrt", then `max_features=sqrt(n_features)`.
- - If "log2", then `max_features=log2(n_features)`.
- - If None, then `max_features=n_features`.
- .. versionchanged:: 1.1
- The default of `max_features` changed from `"auto"` to `"sqrt"`.
- Note: the search for a split does not stop until at least one
- valid partition of the node samples is found, even if it requires
- effectively inspecting more than ``max_features`` features.
- max_leaf_nodes : int, default=None
- Grow trees with ``max_leaf_nodes`` in best-first fashion.
- Best nodes are defined as relative reduction in impurity.
- If None then unlimited number of leaf nodes.
- min_impurity_decrease : float, default=0.0
- A node will be split if this split induces a decrease of the impurity
- greater than or equal to this value.
- The weighted impurity decrease equation is the following::
- N_t / N * (impurity - N_t_R / N_t * right_impurity
- - N_t_L / N_t * left_impurity)
- where ``N`` is the total number of samples, ``N_t`` is the number of
- samples at the current node, ``N_t_L`` is the number of samples in the
- left child, and ``N_t_R`` is the number of samples in the right child.
- ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
- if ``sample_weight`` is passed.
- .. versionadded:: 0.19
- bootstrap : bool, default=True
- Whether bootstrap samples are used when building trees. If False, the
- whole dataset is used to build each tree.
- oob_score : bool or callable, default=False
- Whether to use out-of-bag samples to estimate the generalization score.
- By default, :func:`~sklearn.metrics.accuracy_score` is used.
- Provide a callable with signature `metric(y_true, y_pred)` to use a
- custom metric. Only available if `bootstrap=True`.
- n_jobs : int, default=None
- The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,
- :meth:`decision_path` and :meth:`apply` are all parallelized over the
- trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
- context. ``-1`` means using all processors. See :term:`Glossary
- <n_jobs>` for more details.
- random_state : int, RandomState instance or None, default=None
- Controls both the randomness of the bootstrapping of the samples used
- when building trees (if ``bootstrap=True``) and the sampling of the
- features to consider when looking for the best split at each node
- (if ``max_features < n_features``).
- See :term:`Glossary <random_state>` for details.
- verbose : int, default=0
- Controls the verbosity when fitting and predicting.
- warm_start : bool, default=False
- When set to ``True``, reuse the solution of the previous call to fit
- and add more estimators to the ensemble, otherwise, just fit a whole
- new forest. See :term:`Glossary <warm_start>` and
- :ref:`gradient_boosting_warm_start` for details.
- class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \
- default=None
- Weights associated with classes in the form ``{class_label: weight}``.
- If not given, all classes are supposed to have weight one. For
- multi-output problems, a list of dicts can be provided in the same
- order as the columns of y.
- Note that for multioutput (including multilabel) weights should be
- defined for each class of every column in its own dict. For example,
- for four-class multilabel classification weights should be
- [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
- [{1:1}, {2:5}, {3:1}, {4:1}].
- The "balanced" mode uses the values of y to automatically adjust
- weights inversely proportional to class frequencies in the input data
- as ``n_samples / (n_classes * np.bincount(y))``
- The "balanced_subsample" mode is the same as "balanced" except that
- weights are computed based on the bootstrap sample for every tree
- grown.
- For multi-output, the weights of each column of y will be multiplied.
- Note that these weights will be multiplied with sample_weight (passed
- through the fit method) if sample_weight is specified.
- ccp_alpha : non-negative float, default=0.0
- Complexity parameter used for Minimal Cost-Complexity Pruning. The
- subtree with the largest cost complexity that is smaller than
- ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
- :ref:`minimal_cost_complexity_pruning` for details.
- .. versionadded:: 0.22
- max_samples : int or float, default=None
- If bootstrap is True, the number of samples to draw from X
- to train each base estimator.
- - If None (default), then draw `X.shape[0]` samples.
- - If int, then draw `max_samples` samples.
- - If float, then draw `max(round(n_samples * max_samples), 1)` samples. Thus,
- `max_samples` should be in the interval `(0.0, 1.0]`.
- .. versionadded:: 0.22
- Attributes
- ----------
- estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier`
- The child estimator template used to create the collection of fitted
- sub-estimators.
- .. versionadded:: 1.2
- `base_estimator_` was renamed to `estimator_`.
- base_estimator_ : DecisionTreeClassifier
- The child estimator template used to create the collection of fitted
- sub-estimators.
- .. deprecated:: 1.2
- `base_estimator_` is deprecated and will be removed in 1.4.
- Use `estimator_` instead.
- estimators_ : list of DecisionTreeClassifier
- The collection of fitted sub-estimators.
- classes_ : ndarray of shape (n_classes,) or a list of such arrays
- The classes labels (single output problem), or a list of arrays of
- class labels (multi-output problem).
- n_classes_ : int or list
- The number of classes (single output problem), or a list containing the
- number of classes for each output (multi-output problem).
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- n_outputs_ : int
- The number of outputs when ``fit`` is performed.
- feature_importances_ : ndarray of shape (n_features,)
- The impurity-based feature importances.
- The higher, the more important the feature.
- The importance of a feature is computed as the (normalized)
- total reduction of the criterion brought by that feature. It is also
- known as the Gini importance.
- Warning: impurity-based feature importances can be misleading for
- high cardinality features (many unique values). See
- :func:`sklearn.inspection.permutation_importance` as an alternative.
- oob_score_ : float
- Score of the training dataset obtained using an out-of-bag estimate.
- This attribute exists only when ``oob_score`` is True.
- oob_decision_function_ : ndarray of shape (n_samples, n_classes) or \
- (n_samples, n_classes, n_outputs)
- Decision function computed with out-of-bag estimate on the training
- set. If n_estimators is small it might be possible that a data point
- was never left out during the bootstrap. In this case,
- `oob_decision_function_` might contain NaN. This attribute exists
- only when ``oob_score`` is True.
- See Also
- --------
- sklearn.tree.DecisionTreeClassifier : A decision tree classifier.
- sklearn.ensemble.ExtraTreesClassifier : Ensemble of extremely randomized
- tree classifiers.
- sklearn.ensemble.HistGradientBoostingClassifier : A Histogram-based Gradient
- Boosting Classification Tree, very fast for big datasets (n_samples >=
- 10_000).
- Notes
- -----
- The default values for the parameters controlling the size of the trees
- (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
- unpruned trees which can potentially be very large on some data sets. To
- reduce memory consumption, the complexity and size of the trees should be
- controlled by setting those parameter values.
- The features are always randomly permuted at each split. Therefore,
- the best found split may vary, even with the same training data,
- ``max_features=n_features`` and ``bootstrap=False``, if the improvement
- of the criterion is identical for several splits enumerated during the
- search of the best split. To obtain a deterministic behaviour during
- fitting, ``random_state`` has to be fixed.
- References
- ----------
- .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.
- Examples
- --------
- >>> from sklearn.ensemble import RandomForestClassifier
- >>> from sklearn.datasets import make_classification
- >>> X, y = make_classification(n_samples=1000, n_features=4,
- ... n_informative=2, n_redundant=0,
- ... random_state=0, shuffle=False)
- >>> clf = RandomForestClassifier(max_depth=2, random_state=0)
- >>> clf.fit(X, y)
- RandomForestClassifier(...)
- >>> print(clf.predict([[0, 0, 0, 0]]))
- [1]
- """
- _parameter_constraints: dict = {
- **ForestClassifier._parameter_constraints,
- **DecisionTreeClassifier._parameter_constraints,
- "class_weight": [
- StrOptions({"balanced_subsample", "balanced"}),
- dict,
- list,
- None,
- ],
- }
- _parameter_constraints.pop("splitter")
- def __init__(
- self,
- n_estimators=100,
- *,
- criterion="gini",
- max_depth=None,
- min_samples_split=2,
- min_samples_leaf=1,
- min_weight_fraction_leaf=0.0,
- max_features="sqrt",
- max_leaf_nodes=None,
- min_impurity_decrease=0.0,
- bootstrap=True,
- oob_score=False,
- n_jobs=None,
- random_state=None,
- verbose=0,
- warm_start=False,
- class_weight=None,
- ccp_alpha=0.0,
- max_samples=None,
- ):
- super().__init__(
- estimator=DecisionTreeClassifier(),
- n_estimators=n_estimators,
- estimator_params=(
- "criterion",
- "max_depth",
- "min_samples_split",
- "min_samples_leaf",
- "min_weight_fraction_leaf",
- "max_features",
- "max_leaf_nodes",
- "min_impurity_decrease",
- "random_state",
- "ccp_alpha",
- ),
- bootstrap=bootstrap,
- oob_score=oob_score,
- n_jobs=n_jobs,
- random_state=random_state,
- verbose=verbose,
- warm_start=warm_start,
- class_weight=class_weight,
- max_samples=max_samples,
- )
- self.criterion = criterion
- self.max_depth = max_depth
- self.min_samples_split = min_samples_split
- self.min_samples_leaf = min_samples_leaf
- self.min_weight_fraction_leaf = min_weight_fraction_leaf
- self.max_features = max_features
- self.max_leaf_nodes = max_leaf_nodes
- self.min_impurity_decrease = min_impurity_decrease
- self.ccp_alpha = ccp_alpha
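- # Illustrative sketch (assumed data, not part of the module source): the
- # ``estimator_params`` tuple passed above forwards the forest's
- # hyperparameters to every fitted sub-tree.
- #
- # >>> from sklearn.ensemble import RandomForestClassifier
- # >>> from sklearn.datasets import make_classification
- # >>> X, y = make_classification(n_samples=30, random_state=0)
- # >>> forest = RandomForestClassifier(n_estimators=3, max_depth=2,
- # ...                                 random_state=0).fit(X, y)
- # >>> {tree.max_depth for tree in forest.estimators_}
- # {2}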
- class RandomForestRegressor(ForestRegressor):
- """
- A random forest regressor.
- A random forest is a meta estimator that fits a number of decision tree
- regressors on various sub-samples of the dataset and uses averaging
- to improve the predictive accuracy and control over-fitting.
- The sub-sample size is controlled with the `max_samples` parameter if
- `bootstrap=True` (default), otherwise the whole dataset is used to build
- each tree.
- For a comparison between tree-based ensemble models see the example
- :ref:`sphx_glr_auto_examples_ensemble_plot_forest_hist_grad_boosting_comparison.py`.
- Read more in the :ref:`User Guide <forest>`.
- Parameters
- ----------
- n_estimators : int, default=100
- The number of trees in the forest.
- .. versionchanged:: 0.22
- The default value of ``n_estimators`` changed from 10 to 100
- in 0.22.
- criterion : {"squared_error", "absolute_error", "friedman_mse", "poisson"}, \
- default="squared_error"
- The function to measure the quality of a split. Supported criteria
- are "squared_error" for the mean squared error, which is equal to
- variance reduction as feature selection criterion and minimizes the L2
- loss using the mean of each terminal node, "friedman_mse", which uses
- mean squared error with Friedman's improvement score for potential
- splits, "absolute_error" for the mean absolute error, which minimizes
- the L1 loss using the median of each terminal node, and "poisson" which
- uses reduction in Poisson deviance to find splits.
- Training using "absolute_error" is significantly slower
- than when using "squared_error".
- .. versionadded:: 0.18
- Mean Absolute Error (MAE) criterion.
- .. versionadded:: 1.0
- Poisson criterion.
- max_depth : int, default=None
- The maximum depth of the tree. If None, then nodes are expanded until
- all leaves are pure or until all leaves contain less than
- min_samples_split samples.
- min_samples_split : int or float, default=2
- The minimum number of samples required to split an internal node:
- - If int, then consider `min_samples_split` as the minimum number.
- - If float, then `min_samples_split` is a fraction and
- `ceil(min_samples_split * n_samples)` are the minimum
- number of samples for each split.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_samples_leaf : int or float, default=1
- The minimum number of samples required to be at a leaf node.
- A split point at any depth will only be considered if it leaves at
- least ``min_samples_leaf`` training samples in each of the left and
- right branches. This may have the effect of smoothing the model,
- especially in regression.
- - If int, then consider `min_samples_leaf` as the minimum number.
- - If float, then `min_samples_leaf` is a fraction and
- `ceil(min_samples_leaf * n_samples)` are the minimum
- number of samples for each node.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_weight_fraction_leaf : float, default=0.0
- The minimum weighted fraction of the sum total of weights (of all
- the input samples) required to be at a leaf node. Samples have
- equal weight when sample_weight is not provided.
- max_features : {"sqrt", "log2", None}, int or float, default=1.0
- The number of features to consider when looking for the best split:
- - If int, then consider `max_features` features at each split.
- - If float, then `max_features` is a fraction and
- `max(1, int(max_features * n_features_in_))` features are considered at each
- split.
- - If "sqrt", then `max_features=sqrt(n_features)`.
- - If "log2", then `max_features=log2(n_features)`.
- - If None or 1.0, then `max_features=n_features`.
- .. note::
- The default of 1.0 is equivalent to bagged trees and more
- randomness can be achieved by setting smaller values, e.g. 0.3.
- .. versionchanged:: 1.1
- The default of `max_features` changed from `"auto"` to 1.0.
- Note: the search for a split does not stop until at least one
- valid partition of the node samples is found, even if it requires to
- effectively inspect more than ``max_features`` features.
- max_leaf_nodes : int, default=None
- Grow trees with ``max_leaf_nodes`` in best-first fashion.
- Best nodes are defined as relative reduction in impurity.
- If None then unlimited number of leaf nodes.
- min_impurity_decrease : float, default=0.0
- A node will be split if this split induces a decrease of the impurity
- greater than or equal to this value.
- The weighted impurity decrease equation is the following::
- N_t / N * (impurity - N_t_R / N_t * right_impurity
- - N_t_L / N_t * left_impurity)
- where ``N`` is the total number of samples, ``N_t`` is the number of
- samples at the current node, ``N_t_L`` is the number of samples in the
- left child, and ``N_t_R`` is the number of samples in the right child.
- ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
- if ``sample_weight`` is passed.
- .. versionadded:: 0.19
- bootstrap : bool, default=True
- Whether bootstrap samples are used when building trees. If False, the
- whole dataset is used to build each tree.
- oob_score : bool or callable, default=False
- Whether to use out-of-bag samples to estimate the generalization score.
- By default, :func:`~sklearn.metrics.r2_score` is used.
- Provide a callable with signature `metric(y_true, y_pred)` to use a
- custom metric. Only available if `bootstrap=True`.
- n_jobs : int, default=None
- The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,
- :meth:`decision_path` and :meth:`apply` are all parallelized over the
- trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
- context. ``-1`` means using all processors. See :term:`Glossary
- <n_jobs>` for more details.
- random_state : int, RandomState instance or None, default=None
- Controls both the randomness of the bootstrapping of the samples used
- when building trees (if ``bootstrap=True``) and the sampling of the
- features to consider when looking for the best split at each node
- (if ``max_features < n_features``).
- See :term:`Glossary <random_state>` for details.
- verbose : int, default=0
- Controls the verbosity when fitting and predicting.
- warm_start : bool, default=False
- When set to ``True``, reuse the solution of the previous call to fit
- and add more estimators to the ensemble, otherwise, just fit a whole
- new forest. See :term:`Glossary <warm_start>` and
- :ref:`gradient_boosting_warm_start` for details.
- ccp_alpha : non-negative float, default=0.0
- Complexity parameter used for Minimal Cost-Complexity Pruning. The
- subtree with the largest cost complexity that is smaller than
- ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
- :ref:`minimal_cost_complexity_pruning` for details.
- .. versionadded:: 0.22
- max_samples : int or float, default=None
- If bootstrap is True, the number of samples to draw from X
- to train each base estimator.
- - If None (default), then draw `X.shape[0]` samples.
- - If int, then draw `max_samples` samples.
- - If float, then draw `max(round(n_samples * max_samples), 1)` samples. Thus,
- `max_samples` should be in the interval `(0.0, 1.0]`.
- .. versionadded:: 0.22
- Attributes
- ----------
- estimator_ : :class:`~sklearn.tree.DecisionTreeRegressor`
- The child estimator template used to create the collection of fitted
- sub-estimators.
- .. versionadded:: 1.2
- `base_estimator_` was renamed to `estimator_`.
- base_estimator_ : DecisionTreeRegressor
- The child estimator template used to create the collection of fitted
- sub-estimators.
- .. deprecated:: 1.2
- `base_estimator_` is deprecated and will be removed in 1.4.
- Use `estimator_` instead.
- estimators_ : list of DecisionTreeRegressor
- The collection of fitted sub-estimators.
- feature_importances_ : ndarray of shape (n_features,)
- The impurity-based feature importances.
- The higher, the more important the feature.
- The importance of a feature is computed as the (normalized)
- total reduction of the criterion brought by that feature. It is also
- known as the Gini importance.
- Warning: impurity-based feature importances can be misleading for
- high cardinality features (many unique values). See
- :func:`sklearn.inspection.permutation_importance` as an alternative.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- n_outputs_ : int
- The number of outputs when ``fit`` is performed.
- oob_score_ : float
- Score of the training dataset obtained using an out-of-bag estimate.
- This attribute exists only when ``oob_score`` is True.
- oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs)
- Prediction computed with out-of-bag estimate on the training set.
- This attribute exists only when ``oob_score`` is True.
- See Also
- --------
- sklearn.tree.DecisionTreeRegressor : A decision tree regressor.
- sklearn.ensemble.ExtraTreesRegressor : Ensemble of extremely randomized
- tree regressors.
- sklearn.ensemble.HistGradientBoostingRegressor : A Histogram-based Gradient
- Boosting Regression Tree, very fast for big datasets (n_samples >=
- 10_000).
- Notes
- -----
- The default values for the parameters controlling the size of the trees
- (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
- unpruned trees which can potentially be very large on some data sets. To
- reduce memory consumption, the complexity and size of the trees should be
- controlled by setting those parameter values.
- The features are always randomly permuted at each split. Therefore,
- the best found split may vary, even with the same training data,
- ``max_features=n_features`` and ``bootstrap=False``, if the improvement
- of the criterion is identical for several splits enumerated during the
- search of the best split. To obtain a deterministic behaviour during
- fitting, ``random_state`` has to be fixed.
- The default value ``max_features=1.0`` uses ``n_features``
- rather than ``n_features / 3``. The latter was originally suggested in
- [1], whereas the former was more recently justified empirically in [2].
- References
- ----------
- .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.
- .. [2] P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized
- trees", Machine Learning, 63(1), 3-42, 2006.
- Examples
- --------
- >>> from sklearn.ensemble import RandomForestRegressor
- >>> from sklearn.datasets import make_regression
- >>> X, y = make_regression(n_features=4, n_informative=2,
- ... random_state=0, shuffle=False)
- >>> regr = RandomForestRegressor(max_depth=2, random_state=0)
- >>> regr.fit(X, y)
- RandomForestRegressor(...)
- >>> print(regr.predict([[0, 0, 0, 0]]))
- [-8.32987858]
- """
- _parameter_constraints: dict = {
- **ForestRegressor._parameter_constraints,
- **DecisionTreeRegressor._parameter_constraints,
- }
- _parameter_constraints.pop("splitter")
- def __init__(
- self,
- n_estimators=100,
- *,
- criterion="squared_error",
- max_depth=None,
- min_samples_split=2,
- min_samples_leaf=1,
- min_weight_fraction_leaf=0.0,
- max_features=1.0,
- max_leaf_nodes=None,
- min_impurity_decrease=0.0,
- bootstrap=True,
- oob_score=False,
- n_jobs=None,
- random_state=None,
- verbose=0,
- warm_start=False,
- ccp_alpha=0.0,
- max_samples=None,
- ):
- super().__init__(
- estimator=DecisionTreeRegressor(),
- n_estimators=n_estimators,
- estimator_params=(
- "criterion",
- "max_depth",
- "min_samples_split",
- "min_samples_leaf",
- "min_weight_fraction_leaf",
- "max_features",
- "max_leaf_nodes",
- "min_impurity_decrease",
- "random_state",
- "ccp_alpha",
- ),
- bootstrap=bootstrap,
- oob_score=oob_score,
- n_jobs=n_jobs,
- random_state=random_state,
- verbose=verbose,
- warm_start=warm_start,
- max_samples=max_samples,
- )
- self.criterion = criterion
- self.max_depth = max_depth
- self.min_samples_split = min_samples_split
- self.min_samples_leaf = min_samples_leaf
- self.min_weight_fraction_leaf = min_weight_fraction_leaf
- self.max_features = max_features
- self.max_leaf_nodes = max_leaf_nodes
- self.min_impurity_decrease = min_impurity_decrease
- self.ccp_alpha = ccp_alpha
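- # A minimal sketch (illustrative only, not part of the library source): with
- # ``bootstrap=True`` (the default), ``oob_score`` also accepts a callable
- # ``metric(y_true, y_pred)``, as documented above. Assumes scikit-learn is installed.
- #
- # from sklearn.datasets import make_regression
- # from sklearn.ensemble import RandomForestRegressor
- # from sklearn.metrics import mean_absolute_error
- #
- # X, y = make_regression(n_samples=200, n_features=4, random_state=0)
- # reg = RandomForestRegressor(oob_score=mean_absolute_error, random_state=0).fit(X, y)
- # print(reg.oob_score_)  # out-of-bag MAE on the training data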
- class ExtraTreesClassifier(ForestClassifier):
- """
- An extra-trees classifier.
- This class implements a meta estimator that fits a number of
- randomized decision trees (a.k.a. extra-trees) on various sub-samples
- of the dataset and uses averaging to improve the predictive accuracy
- and control over-fitting.
- Read more in the :ref:`User Guide <forest>`.
- Parameters
- ----------
- n_estimators : int, default=100
- The number of trees in the forest.
- .. versionchanged:: 0.22
- The default value of ``n_estimators`` changed from 10 to 100
- in 0.22.
- criterion : {"gini", "entropy", "log_loss"}, default="gini"
- The function to measure the quality of a split. Supported criteria are
- "gini" for the Gini impurity and "log_loss" and "entropy" both for the
- Shannon information gain, see :ref:`tree_mathematical_formulation`.
- Note: This parameter is tree-specific.
- max_depth : int, default=None
- The maximum depth of the tree. If None, then nodes are expanded until
- all leaves are pure or until all leaves contain less than
- min_samples_split samples.
- min_samples_split : int or float, default=2
- The minimum number of samples required to split an internal node:
- - If int, then consider `min_samples_split` as the minimum number.
- - If float, then `min_samples_split` is a fraction and
- `ceil(min_samples_split * n_samples)` are the minimum
- number of samples for each split.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_samples_leaf : int or float, default=1
- The minimum number of samples required to be at a leaf node.
- A split point at any depth will only be considered if it leaves at
- least ``min_samples_leaf`` training samples in each of the left and
- right branches. This may have the effect of smoothing the model,
- especially in regression.
- - If int, then consider `min_samples_leaf` as the minimum number.
- - If float, then `min_samples_leaf` is a fraction and
- `ceil(min_samples_leaf * n_samples)` are the minimum
- number of samples for each node.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_weight_fraction_leaf : float, default=0.0
- The minimum weighted fraction of the sum total of weights (of all
- the input samples) required to be at a leaf node. Samples have
- equal weight when sample_weight is not provided.
- max_features : {"sqrt", "log2", None}, int or float, default="sqrt"
- The number of features to consider when looking for the best split:
- - If int, then consider `max_features` features at each split.
- - If float, then `max_features` is a fraction and
- `max(1, int(max_features * n_features_in_))` features are considered at each
- split.
- - If "sqrt", then `max_features=sqrt(n_features)`.
- - If "log2", then `max_features=log2(n_features)`.
- - If None, then `max_features=n_features`.
- .. versionchanged:: 1.1
- The default of `max_features` changed from `"auto"` to `"sqrt"`.
- Note: the search for a split does not stop until at least one
- valid partition of the node samples is found, even if it requires to
- effectively inspect more than ``max_features`` features.
- max_leaf_nodes : int, default=None
- Grow trees with ``max_leaf_nodes`` in best-first fashion.
- Best nodes are defined as relative reduction in impurity.
- If None then unlimited number of leaf nodes.
- min_impurity_decrease : float, default=0.0
- A node will be split if this split induces a decrease of the impurity
- greater than or equal to this value.
- The weighted impurity decrease equation is the following::
- N_t / N * (impurity - N_t_R / N_t * right_impurity
- - N_t_L / N_t * left_impurity)
- where ``N`` is the total number of samples, ``N_t`` is the number of
- samples at the current node, ``N_t_L`` is the number of samples in the
- left child, and ``N_t_R`` is the number of samples in the right child.
- ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
- if ``sample_weight`` is passed.
- .. versionadded:: 0.19
- bootstrap : bool, default=False
- Whether bootstrap samples are used when building trees. If False, the
- whole dataset is used to build each tree.
- oob_score : bool or callable, default=False
- Whether to use out-of-bag samples to estimate the generalization score.
- By default, :func:`~sklearn.metrics.accuracy_score` is used.
- Provide a callable with signature `metric(y_true, y_pred)` to use a
- custom metric. Only available if `bootstrap=True`.
- n_jobs : int, default=None
- The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,
- :meth:`decision_path` and :meth:`apply` are all parallelized over the
- trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
- context. ``-1`` means using all processors. See :term:`Glossary
- <n_jobs>` for more details.
- random_state : int, RandomState instance or None, default=None
- Controls 3 sources of randomness:
- - the bootstrapping of the samples used when building trees
- (if ``bootstrap=True``)
- - the sampling of the features to consider when looking for the best
- split at each node (if ``max_features < n_features``)
- - the draw of the splits for each of the `max_features`
- See :term:`Glossary <random_state>` for details.
- verbose : int, default=0
- Controls the verbosity when fitting and predicting.
- warm_start : bool, default=False
- When set to ``True``, reuse the solution of the previous call to fit
- and add more estimators to the ensemble, otherwise, just fit a whole
- new forest. See :term:`Glossary <warm_start>` and
- :ref:`gradient_boosting_warm_start` for details.
- class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts, \
- default=None
- Weights associated with classes in the form ``{class_label: weight}``.
- If not given, all classes are supposed to have weight one. For
- multi-output problems, a list of dicts can be provided in the same
- order as the columns of y.
- Note that for multioutput (including multilabel) weights should be
- defined for each class of every column in its own dict. For example,
- for four-class multilabel classification weights should be
- [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
- [{1:1}, {2:5}, {3:1}, {4:1}].
- The "balanced" mode uses the values of y to automatically adjust
- weights inversely proportional to class frequencies in the input data
- as ``n_samples / (n_classes * np.bincount(y))``
- The "balanced_subsample" mode is the same as "balanced" except that
- weights are computed based on the bootstrap sample for every tree
- grown.
- For multi-output, the weights of each column of y will be multiplied.
- Note that these weights will be multiplied with sample_weight (passed
- through the fit method) if sample_weight is specified.
- ccp_alpha : non-negative float, default=0.0
- Complexity parameter used for Minimal Cost-Complexity Pruning. The
- subtree with the largest cost complexity that is smaller than
- ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
- :ref:`minimal_cost_complexity_pruning` for details.
- .. versionadded:: 0.22
- max_samples : int or float, default=None
- If bootstrap is True, the number of samples to draw from X
- to train each base estimator.
- - If None (default), then draw `X.shape[0]` samples.
- - If int, then draw `max_samples` samples.
- - If float, then draw `max(round(n_samples * max_samples), 1)` samples. Thus,
- `max_samples` should be in the interval `(0.0, 1.0]`.
- .. versionadded:: 0.22
- Attributes
- ----------
- estimator_ : :class:`~sklearn.tree.ExtraTreeClassifier`
- The child estimator template used to create the collection of fitted
- sub-estimators.
- .. versionadded:: 1.2
- `base_estimator_` was renamed to `estimator_`.
- base_estimator_ : ExtraTreeClassifier
- The child estimator template used to create the collection of fitted
- sub-estimators.
- .. deprecated:: 1.2
- `base_estimator_` is deprecated and will be removed in 1.4.
- Use `estimator_` instead.
- estimators_ : list of ExtraTreeClassifier
- The collection of fitted sub-estimators.
- classes_ : ndarray of shape (n_classes,) or a list of such arrays
- The classes labels (single output problem), or a list of arrays of
- class labels (multi-output problem).
- n_classes_ : int or list
- The number of classes (single output problem), or a list containing the
- number of classes for each output (multi-output problem).
- feature_importances_ : ndarray of shape (n_features,)
- The impurity-based feature importances.
- The higher, the more important the feature.
- The importance of a feature is computed as the (normalized)
- total reduction of the criterion brought by that feature. It is also
- known as the Gini importance.
- Warning: impurity-based feature importances can be misleading for
- high cardinality features (many unique values). See
- :func:`sklearn.inspection.permutation_importance` as an alternative.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- n_outputs_ : int
- The number of outputs when ``fit`` is performed.
- oob_score_ : float
- Score of the training dataset obtained using an out-of-bag estimate.
- This attribute exists only when ``oob_score`` is True.
- oob_decision_function_ : ndarray of shape (n_samples, n_classes) or \
- (n_samples, n_classes, n_outputs)
- Decision function computed with out-of-bag estimate on the training
- set. If n_estimators is small it might be possible that a data point
- was never left out during the bootstrap. In this case,
- `oob_decision_function_` might contain NaN. This attribute exists
- only when ``oob_score`` is True.
- See Also
- --------
- ExtraTreesRegressor : An extra-trees regressor with random splits.
- RandomForestClassifier : A random forest classifier with optimal splits.
- RandomForestRegressor : Ensemble regressor using trees with optimal splits.
- Notes
- -----
- The default values for the parameters controlling the size of the trees
- (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
- unpruned trees which can potentially be very large on some data sets. To
- reduce memory consumption, the complexity and size of the trees should be
- controlled by setting those parameter values.
- References
- ----------
- .. [1] P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized
- trees", Machine Learning, 63(1), 3-42, 2006.
- Examples
- --------
- >>> from sklearn.ensemble import ExtraTreesClassifier
- >>> from sklearn.datasets import make_classification
- >>> X, y = make_classification(n_features=4, random_state=0)
- >>> clf = ExtraTreesClassifier(n_estimators=100, random_state=0)
- >>> clf.fit(X, y)
- ExtraTreesClassifier(random_state=0)
- >>> clf.predict([[0, 0, 0, 0]])
- array([1])
- """
- _parameter_constraints: dict = {
- **ForestClassifier._parameter_constraints,
- **DecisionTreeClassifier._parameter_constraints,
- "class_weight": [
- StrOptions({"balanced_subsample", "balanced"}),
- dict,
- list,
- None,
- ],
- }
- _parameter_constraints.pop("splitter")
- def __init__(
- self,
- n_estimators=100,
- *,
- criterion="gini",
- max_depth=None,
- min_samples_split=2,
- min_samples_leaf=1,
- min_weight_fraction_leaf=0.0,
- max_features="sqrt",
- max_leaf_nodes=None,
- min_impurity_decrease=0.0,
- bootstrap=False,
- oob_score=False,
- n_jobs=None,
- random_state=None,
- verbose=0,
- warm_start=False,
- class_weight=None,
- ccp_alpha=0.0,
- max_samples=None,
- ):
- super().__init__(
- estimator=ExtraTreeClassifier(),
- n_estimators=n_estimators,
- estimator_params=(
- "criterion",
- "max_depth",
- "min_samples_split",
- "min_samples_leaf",
- "min_weight_fraction_leaf",
- "max_features",
- "max_leaf_nodes",
- "min_impurity_decrease",
- "random_state",
- "ccp_alpha",
- ),
- bootstrap=bootstrap,
- oob_score=oob_score,
- n_jobs=n_jobs,
- random_state=random_state,
- verbose=verbose,
- warm_start=warm_start,
- class_weight=class_weight,
- max_samples=max_samples,
- )
- self.criterion = criterion
- self.max_depth = max_depth
- self.min_samples_split = min_samples_split
- self.min_samples_leaf = min_samples_leaf
- self.min_weight_fraction_leaf = min_weight_fraction_leaf
- self.max_features = max_features
- self.max_leaf_nodes = max_leaf_nodes
- self.min_impurity_decrease = min_impurity_decrease
- self.ccp_alpha = ccp_alpha
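- # A minimal sketch (illustrative only, not part of the library source): unlike
- # random forests, extra-trees default to ``bootstrap=False``, so out-of-bag
- # scoring must be enabled explicitly. Assumes scikit-learn is installed.
- #
- # from sklearn.datasets import make_classification
- # from sklearn.ensemble import ExtraTreesClassifier
- #
- # X, y = make_classification(n_samples=300, random_state=0)
- # clf = ExtraTreesClassifier(bootstrap=True, oob_score=True, random_state=0).fit(X, y)
- # print(clf.oob_score_)  # accuracy estimated from out-of-bag samples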
- class ExtraTreesRegressor(ForestRegressor):
- """
- An extra-trees regressor.
- This class implements a meta estimator that fits a number of
- randomized decision trees (a.k.a. extra-trees) on various sub-samples
- of the dataset and uses averaging to improve the predictive accuracy
- and control over-fitting.
- Read more in the :ref:`User Guide <forest>`.
- Parameters
- ----------
- n_estimators : int, default=100
- The number of trees in the forest.
- .. versionchanged:: 0.22
- The default value of ``n_estimators`` changed from 10 to 100
- in 0.22.
- criterion : {"squared_error", "absolute_error", "friedman_mse", "poisson"}, \
- default="squared_error"
- The function to measure the quality of a split. Supported criteria
- are "squared_error" for the mean squared error, which is equal to
- variance reduction as a feature selection criterion and minimizes the L2
- loss using the mean of each terminal node, "friedman_mse", which uses
- mean squared error with Friedman's improvement score for potential
- splits, "absolute_error" for the mean absolute error, which minimizes
- the L1 loss using the median of each terminal node, and "poisson" which
- uses reduction in Poisson deviance to find splits.
- Training using "absolute_error" is significantly slower
- than when using "squared_error".
- .. versionadded:: 0.18
- Mean Absolute Error (MAE) criterion.
- max_depth : int, default=None
- The maximum depth of the tree. If None, then nodes are expanded until
- all leaves are pure or until all leaves contain less than
- min_samples_split samples.
- min_samples_split : int or float, default=2
- The minimum number of samples required to split an internal node:
- - If int, then consider `min_samples_split` as the minimum number.
- - If float, then `min_samples_split` is a fraction and
- `ceil(min_samples_split * n_samples)` are the minimum
- number of samples for each split.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_samples_leaf : int or float, default=1
- The minimum number of samples required to be at a leaf node.
- A split point at any depth will only be considered if it leaves at
- least ``min_samples_leaf`` training samples in each of the left and
- right branches. This may have the effect of smoothing the model,
- especially in regression.
- - If int, then consider `min_samples_leaf` as the minimum number.
- - If float, then `min_samples_leaf` is a fraction and
- `ceil(min_samples_leaf * n_samples)` are the minimum
- number of samples for each node.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_weight_fraction_leaf : float, default=0.0
- The minimum weighted fraction of the sum total of weights (of all
- the input samples) required to be at a leaf node. Samples have
- equal weight when sample_weight is not provided.
- max_features : {"sqrt", "log2", None}, int or float, default=1.0
- The number of features to consider when looking for the best split:
- - If int, then consider `max_features` features at each split.
- - If float, then `max_features` is a fraction and
- `max(1, int(max_features * n_features_in_))` features are considered at each
- split.
- - If "sqrt", then `max_features=sqrt(n_features)`.
- - If "log2", then `max_features=log2(n_features)`.
- - If None or 1.0, then `max_features=n_features`.
- .. note::
- The default of 1.0 is equivalent to bagged trees and more
- randomness can be achieved by setting smaller values, e.g. 0.3.
- .. versionchanged:: 1.1
- The default of `max_features` changed from `"auto"` to 1.0.
- Note: the search for a split does not stop until at least one
- valid partition of the node samples is found, even if it requires to
- effectively inspect more than ``max_features`` features.
- max_leaf_nodes : int, default=None
- Grow trees with ``max_leaf_nodes`` in best-first fashion.
- Best nodes are defined as relative reduction in impurity.
- If None then unlimited number of leaf nodes.
- min_impurity_decrease : float, default=0.0
- A node will be split if this split induces a decrease of the impurity
- greater than or equal to this value.
- The weighted impurity decrease equation is the following::
- N_t / N * (impurity - N_t_R / N_t * right_impurity
- - N_t_L / N_t * left_impurity)
- where ``N`` is the total number of samples, ``N_t`` is the number of
- samples at the current node, ``N_t_L`` is the number of samples in the
- left child, and ``N_t_R`` is the number of samples in the right child.
- ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
- if ``sample_weight`` is passed.
- .. versionadded:: 0.19
- bootstrap : bool, default=False
- Whether bootstrap samples are used when building trees. If False, the
- whole dataset is used to build each tree.
- oob_score : bool or callable, default=False
- Whether to use out-of-bag samples to estimate the generalization score.
- By default, :func:`~sklearn.metrics.r2_score` is used.
- Provide a callable with signature `metric(y_true, y_pred)` to use a
- custom metric. Only available if `bootstrap=True`.
- n_jobs : int, default=None
- The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,
- :meth:`decision_path` and :meth:`apply` are all parallelized over the
- trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
- context. ``-1`` means using all processors. See :term:`Glossary
- <n_jobs>` for more details.
- random_state : int, RandomState instance or None, default=None
- Controls 3 sources of randomness:
- - the bootstrapping of the samples used when building trees
- (if ``bootstrap=True``)
- - the sampling of the features to consider when looking for the best
- split at each node (if ``max_features < n_features``)
- - the draw of the splits for each of the `max_features`
- See :term:`Glossary <random_state>` for details.
- verbose : int, default=0
- Controls the verbosity when fitting and predicting.
- warm_start : bool, default=False
- When set to ``True``, reuse the solution of the previous call to fit
- and add more estimators to the ensemble, otherwise, just fit a whole
- new forest. See :term:`Glossary <warm_start>` and
- :ref:`gradient_boosting_warm_start` for details.
- ccp_alpha : non-negative float, default=0.0
- Complexity parameter used for Minimal Cost-Complexity Pruning. The
- subtree with the largest cost complexity that is smaller than
- ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
- :ref:`minimal_cost_complexity_pruning` for details.
- .. versionadded:: 0.22
- max_samples : int or float, default=None
- If bootstrap is True, the number of samples to draw from X
- to train each base estimator.
- - If None (default), then draw `X.shape[0]` samples.
- - If int, then draw `max_samples` samples.
- - If float, then draw `max(round(n_samples * max_samples), 1)` samples. Thus,
- `max_samples` should be in the interval `(0.0, 1.0]`.
- .. versionadded:: 0.22
- Attributes
- ----------
- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor`
- The child estimator template used to create the collection of fitted
- sub-estimators.
- .. versionadded:: 1.2
- `base_estimator_` was renamed to `estimator_`.
- base_estimator_ : ExtraTreeRegressor
- The child estimator template used to create the collection of fitted
- sub-estimators.
- .. deprecated:: 1.2
- `base_estimator_` is deprecated and will be removed in 1.4.
- Use `estimator_` instead.
- estimators_ : list of ExtraTreeRegressor
- The collection of fitted sub-estimators.
- feature_importances_ : ndarray of shape (n_features,)
- The impurity-based feature importances.
- The higher, the more important the feature.
- The importance of a feature is computed as the (normalized)
- total reduction of the criterion brought by that feature. It is also
- known as the Gini importance.
- Warning: impurity-based feature importances can be misleading for
- high cardinality features (many unique values). See
- :func:`sklearn.inspection.permutation_importance` as an alternative.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- n_outputs_ : int
- The number of outputs.
- oob_score_ : float
- Score of the training dataset obtained using an out-of-bag estimate.
- This attribute exists only when ``oob_score`` is True.
- oob_prediction_ : ndarray of shape (n_samples,) or (n_samples, n_outputs)
- Prediction computed with out-of-bag estimate on the training set.
- This attribute exists only when ``oob_score`` is True.
- See Also
- --------
- ExtraTreesClassifier : An extra-trees classifier with random splits.
- RandomForestClassifier : A random forest classifier with optimal splits.
- RandomForestRegressor : Ensemble regressor using trees with optimal splits.
- Notes
- -----
- The default values for the parameters controlling the size of the trees
- (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
- unpruned trees which can potentially be very large on some data sets. To
- reduce memory consumption, the complexity and size of the trees should be
- controlled by setting those parameter values.
- References
- ----------
- .. [1] P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees",
- Machine Learning, 63(1), 3-42, 2006.
- Examples
- --------
- >>> from sklearn.datasets import load_diabetes
- >>> from sklearn.model_selection import train_test_split
- >>> from sklearn.ensemble import ExtraTreesRegressor
- >>> X, y = load_diabetes(return_X_y=True)
- >>> X_train, X_test, y_train, y_test = train_test_split(
- ... X, y, random_state=0)
- >>> reg = ExtraTreesRegressor(n_estimators=100, random_state=0).fit(
- ... X_train, y_train)
- >>> reg.score(X_test, y_test)
- 0.2727...
- """
- _parameter_constraints: dict = {
- **ForestRegressor._parameter_constraints,
- **DecisionTreeRegressor._parameter_constraints,
- }
- _parameter_constraints.pop("splitter")
- def __init__(
- self,
- n_estimators=100,
- *,
- criterion="squared_error",
- max_depth=None,
- min_samples_split=2,
- min_samples_leaf=1,
- min_weight_fraction_leaf=0.0,
- max_features=1.0,
- max_leaf_nodes=None,
- min_impurity_decrease=0.0,
- bootstrap=False,
- oob_score=False,
- n_jobs=None,
- random_state=None,
- verbose=0,
- warm_start=False,
- ccp_alpha=0.0,
- max_samples=None,
- ):
- super().__init__(
- estimator=ExtraTreeRegressor(),
- n_estimators=n_estimators,
- estimator_params=(
- "criterion",
- "max_depth",
- "min_samples_split",
- "min_samples_leaf",
- "min_weight_fraction_leaf",
- "max_features",
- "max_leaf_nodes",
- "min_impurity_decrease",
- "random_state",
- "ccp_alpha",
- ),
- bootstrap=bootstrap,
- oob_score=oob_score,
- n_jobs=n_jobs,
- random_state=random_state,
- verbose=verbose,
- warm_start=warm_start,
- max_samples=max_samples,
- )
- self.criterion = criterion
- self.max_depth = max_depth
- self.min_samples_split = min_samples_split
- self.min_samples_leaf = min_samples_leaf
- self.min_weight_fraction_leaf = min_weight_fraction_leaf
- self.max_features = max_features
- self.max_leaf_nodes = max_leaf_nodes
- self.min_impurity_decrease = min_impurity_decrease
- self.ccp_alpha = ccp_alpha
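- # A minimal sketch (illustrative only, not part of the library source):
- # ``max_samples`` only takes effect when ``bootstrap=True``; here each tree is
- # trained on roughly half of the rows. Assumes scikit-learn is installed.
- #
- # from sklearn.datasets import make_regression
- # from sklearn.ensemble import ExtraTreesRegressor
- #
- # X, y = make_regression(n_samples=400, n_features=8, random_state=0)
- # reg = ExtraTreesRegressor(
- #     n_estimators=50, bootstrap=True, max_samples=0.5, random_state=0
- # ).fit(X, y)
- # print(reg.score(X, y))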
- class RandomTreesEmbedding(TransformerMixin, BaseForest):
- """
- An ensemble of totally random trees.
- An unsupervised transformation of a dataset to a high-dimensional
- sparse representation. A datapoint is coded according to which leaf of
- each tree it is sorted into. Using a one-hot encoding of the leaves,
- this leads to a binary coding with as many ones as there are trees in
- the forest.
- The dimensionality of the resulting representation is
- ``n_out <= n_estimators * max_leaf_nodes``. If ``max_leaf_nodes == None``,
- the number of leaf nodes is at most ``n_estimators * 2 ** max_depth``.
- Read more in the :ref:`User Guide <random_trees_embedding>`.
- Parameters
- ----------
- n_estimators : int, default=100
- Number of trees in the forest.
- .. versionchanged:: 0.22
- The default value of ``n_estimators`` changed from 10 to 100
- in 0.22.
- max_depth : int, default=5
- The maximum depth of each tree. If None, then nodes are expanded until
- all leaves are pure or until all leaves contain less than
- min_samples_split samples.
- min_samples_split : int or float, default=2
- The minimum number of samples required to split an internal node:
- - If int, then consider `min_samples_split` as the minimum number.
- - If float, then `min_samples_split` is a fraction and
- `ceil(min_samples_split * n_samples)` is the minimum
- number of samples for each split.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_samples_leaf : int or float, default=1
- The minimum number of samples required to be at a leaf node.
- A split point at any depth will only be considered if it leaves at
- least ``min_samples_leaf`` training samples in each of the left and
- right branches. This may have the effect of smoothing the model,
- especially in regression.
- - If int, then consider `min_samples_leaf` as the minimum number.
- - If float, then `min_samples_leaf` is a fraction and
- `ceil(min_samples_leaf * n_samples)` is the minimum
- number of samples for each node.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_weight_fraction_leaf : float, default=0.0
- The minimum weighted fraction of the sum total of weights (of all
- the input samples) required to be at a leaf node. Samples have
- equal weight when sample_weight is not provided.
- max_leaf_nodes : int, default=None
- Grow trees with ``max_leaf_nodes`` in best-first fashion.
- Best nodes are defined as relative reduction in impurity.
- If None then unlimited number of leaf nodes.
- min_impurity_decrease : float, default=0.0
- A node will be split if this split induces a decrease of the impurity
- greater than or equal to this value.
- The weighted impurity decrease equation is the following::
- N_t / N * (impurity - N_t_R / N_t * right_impurity
- - N_t_L / N_t * left_impurity)
- where ``N`` is the total number of samples, ``N_t`` is the number of
- samples at the current node, ``N_t_L`` is the number of samples in the
- left child, and ``N_t_R`` is the number of samples in the right child.
- ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
- if ``sample_weight`` is passed.
- .. versionadded:: 0.19
- sparse_output : bool, default=True
- Whether to return a sparse CSR matrix (the default) or a dense array
- compatible with dense pipeline operators.
- n_jobs : int, default=None
- The number of jobs to run in parallel. :meth:`fit`, :meth:`transform`,
- :meth:`decision_path` and :meth:`apply` are all parallelized over the
- trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
- context. ``-1`` means using all processors. See :term:`Glossary
- <n_jobs>` for more details.
- random_state : int, RandomState instance or None, default=None
- Controls the generation of the random `y` used to fit the trees
- and the draw of the splits for each feature at the trees' nodes.
- See :term:`Glossary <random_state>` for details.
- verbose : int, default=0
- Controls the verbosity when fitting and predicting.
- warm_start : bool, default=False
- When set to ``True``, reuse the solution of the previous call to fit
- and add more estimators to the ensemble, otherwise, just fit a whole
- new forest. See :term:`Glossary <warm_start>` and
- :ref:`gradient_boosting_warm_start` for details.
- Attributes
- ----------
- estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance
- The child estimator template used to create the collection of fitted
- sub-estimators.
- .. versionadded:: 1.2
- `base_estimator_` was renamed to `estimator_`.
- base_estimator_ : :class:`~sklearn.tree.ExtraTreeRegressor` instance
- The child estimator template used to create the collection of fitted
- sub-estimators.
- .. deprecated:: 1.2
- `base_estimator_` is deprecated and will be removed in 1.4.
- Use `estimator_` instead.
- estimators_ : list of :class:`~sklearn.tree.ExtraTreeRegressor` instances
- The collection of fitted sub-estimators.
- feature_importances_ : ndarray of shape (n_features,)
- The feature importances (the higher, the more important the feature).
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- n_outputs_ : int
- The number of outputs when ``fit`` is performed.
- one_hot_encoder_ : OneHotEncoder instance
- One-hot encoder used to create the sparse embedding.
- See Also
- --------
- ExtraTreesClassifier : An extra-trees classifier.
- ExtraTreesRegressor : An extra-trees regressor.
- RandomForestClassifier : A random forest classifier.
- RandomForestRegressor : A random forest regressor.
- sklearn.tree.ExtraTreeClassifier : An extremely randomized
- tree classifier.
- sklearn.tree.ExtraTreeRegressor : An extremely randomized
- tree regressor.
- References
- ----------
- .. [1] P. Geurts, D. Ernst, and L. Wehenkel, "Extremely randomized trees",
- Machine Learning, 63(1), 3-42, 2006.
- .. [2] Moosmann, F., Triggs, B., and Jurie, F., "Fast discriminative
- visual codebooks using randomized clustering forests",
- NIPS 2007.
- Examples
- --------
- >>> from sklearn.ensemble import RandomTreesEmbedding
- >>> X = [[0,0], [1,0], [0,1], [-1,0], [0,-1]]
- >>> random_trees = RandomTreesEmbedding(
- ... n_estimators=5, random_state=0, max_depth=1).fit(X)
- >>> X_sparse_embedding = random_trees.transform(X)
- >>> X_sparse_embedding.toarray()
- array([[0., 1., 1., 0., 1., 0., 0., 1., 1., 0.],
- [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.],
- [0., 1., 0., 1., 0., 1., 0., 1., 0., 1.],
- [1., 0., 1., 0., 1., 0., 1., 0., 1., 0.],
- [0., 1., 1., 0., 1., 0., 0., 1., 1., 0.]])
- """
- _parameter_constraints: dict = {
- "n_estimators": [Interval(Integral, 1, None, closed="left")],
- "n_jobs": [Integral, None],
- "verbose": ["verbose"],
- "warm_start": ["boolean"],
- **BaseDecisionTree._parameter_constraints,
- "sparse_output": ["boolean"],
- }
- for param in ("max_features", "ccp_alpha", "splitter"):
- _parameter_constraints.pop(param)
- criterion = "squared_error"
- max_features = 1
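- # ``criterion`` and ``max_features`` are fixed class attributes rather than
- # constructor parameters: every tree is grown with squared-error splits over a
- # single randomly drawn feature, and these values are forwarded to the trees
- # through ``estimator_params`` below.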
- def __init__(
- self,
- n_estimators=100,
- *,
- max_depth=5,
- min_samples_split=2,
- min_samples_leaf=1,
- min_weight_fraction_leaf=0.0,
- max_leaf_nodes=None,
- min_impurity_decrease=0.0,
- sparse_output=True,
- n_jobs=None,
- random_state=None,
- verbose=0,
- warm_start=False,
- ):
- super().__init__(
- estimator=ExtraTreeRegressor(),
- n_estimators=n_estimators,
- estimator_params=(
- "criterion",
- "max_depth",
- "min_samples_split",
- "min_samples_leaf",
- "min_weight_fraction_leaf",
- "max_features",
- "max_leaf_nodes",
- "min_impurity_decrease",
- "random_state",
- ),
- bootstrap=False,
- oob_score=False,
- n_jobs=n_jobs,
- random_state=random_state,
- verbose=verbose,
- warm_start=warm_start,
- max_samples=None,
- )
- self.max_depth = max_depth
- self.min_samples_split = min_samples_split
- self.min_samples_leaf = min_samples_leaf
- self.min_weight_fraction_leaf = min_weight_fraction_leaf
- self.max_leaf_nodes = max_leaf_nodes
- self.min_impurity_decrease = min_impurity_decrease
- self.sparse_output = sparse_output
- def _set_oob_score_and_attributes(self, X, y, scoring_function=None):
- raise NotImplementedError("OOB score not supported by tree embedding")
- def fit(self, X, y=None, sample_weight=None):
- """
- Fit estimator.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Use ``dtype=np.float32`` for maximum
- efficiency. Sparse matrices are also supported, use sparse
- ``csc_matrix`` for maximum efficiency.
- y : Ignored
- Not used, present for API consistency by convention.
- sample_weight : array-like of shape (n_samples,), default=None
- Sample weights. If None, then samples are equally weighted. Splits
- that would create child nodes with net zero or negative weight are
- ignored while searching for a split in each node. In the case of
- classification, splits are also ignored if they would result in any
- single class carrying a negative weight in either child node.
- Returns
- -------
- self : object
- Returns the instance itself.
- """
- # Parameters are validated in fit_transform
- self.fit_transform(X, y, sample_weight=sample_weight)
- return self
- @_fit_context(prefer_skip_nested_validation=True)
- def fit_transform(self, X, y=None, sample_weight=None):
- """
- Fit estimator and transform dataset.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Input data used to build forests. Use ``dtype=np.float32`` for
- maximum efficiency.
- y : Ignored
- Not used, present for API consistency by convention.
- sample_weight : array-like of shape (n_samples,), default=None
- Sample weights. If None, then samples are equally weighted. Splits
- that would create child nodes with net zero or negative weight are
- ignored while searching for a split in each node. In the case of
- classification, splits are also ignored if they would result in any
- single class carrying a negative weight in either child node.
- Returns
- -------
- X_transformed : sparse matrix of shape (n_samples, n_out)
- Transformed dataset.
- """
- rnd = check_random_state(self.random_state)
- y = rnd.uniform(size=_num_samples(X))
- super().fit(X, y, sample_weight=sample_weight)
- self.one_hot_encoder_ = OneHotEncoder(sparse_output=self.sparse_output)
- output = self.one_hot_encoder_.fit_transform(self.apply(X))
- self._n_features_out = output.shape[1]
- return output
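- # A minimal sketch (illustrative only, not part of the library source): the
- # sparse embedding produced by ``fit_transform`` is commonly fed to a linear
- # model. Assumes scikit-learn is installed.
- #
- # from sklearn.datasets import make_classification
- # from sklearn.ensemble import RandomTreesEmbedding
- # from sklearn.linear_model import LogisticRegression
- # from sklearn.pipeline import make_pipeline
- #
- # X, y = make_classification(n_samples=300, random_state=0)
- # model = make_pipeline(
- #     RandomTreesEmbedding(n_estimators=10, max_depth=3, random_state=0),
- #     LogisticRegression(max_iter=1000),
- # ).fit(X, y)
- # print(model.score(X, y))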
- def get_feature_names_out(self, input_features=None):
- """Get output feature names for transformation.
- Parameters
- ----------
- input_features : array-like of str or None, default=None
- Only used to validate feature names with the names seen in :meth:`fit`.
- Returns
- -------
- feature_names_out : ndarray of str objects
- Transformed feature names, in the format of
- `randomtreesembedding_{tree}_{leaf}`, where `tree` is the tree used
- to generate the leaf and `leaf` is the index of a leaf node
- in that tree. Note that the node indexing scheme is used to
- index both nodes with children (split nodes) and leaf nodes.
- Only the latter can be present as output features.
- As a consequence, there are missing indices in the output
- feature names.
- """
- check_is_fitted(self, "_n_features_out")
- _check_feature_names_in(
- self, input_features=input_features, generate_names=False
- )
- feature_names = [
- f"randomtreesembedding_{tree}_{leaf}"
- for tree in range(self.n_estimators)
- for leaf in self.one_hot_encoder_.categories_[tree]
- ]
- return np.asarray(feature_names, dtype=object)
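- # A minimal sketch (illustrative only, not part of the library source): output
- # names follow the ``randomtreesembedding_{tree}_{leaf}`` pattern described
- # above, and only leaf-node indices appear. Assumes scikit-learn is installed.
- #
- # from sklearn.ensemble import RandomTreesEmbedding
- #
- # X = [[0, 0], [1, 1], [0, 1], [1, 0]]
- # emb = RandomTreesEmbedding(n_estimators=2, max_depth=1, random_state=0).fit(X)
- # print(emb.get_feature_names_out())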
- def transform(self, X):
- """
- Transform dataset.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Input data to be transformed. Use ``dtype=np.float32`` for maximum
- efficiency. Sparse matrices are also supported, use sparse
- ``csr_matrix`` for maximum efficiency.
- Returns
- -------
- X_transformed : sparse matrix of shape (n_samples, n_out)
- Transformed dataset.
- """
- check_is_fitted(self)
- return self.one_hot_encoder_.transform(self.apply(X))
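- # A minimal sketch (illustrative only, not part of the library source): with
- # ``sparse_output=False`` the transformer returns a dense array rather than a
- # sparse CSR matrix. Assumes scikit-learn is installed.
- #
- # from sklearn.ensemble import RandomTreesEmbedding
- #
- # X = [[0, 0], [1, 1], [0, 1], [1, 0]]
- # dense = RandomTreesEmbedding(
- #     n_estimators=3, max_depth=1, sparse_output=False, random_state=0
- # ).fit_transform(X)
- # print(dense.shape)  # (n_samples, n_out)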