| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842 |
- """
- This module gathers tree-based methods, including decision, regression and
- randomized trees. Single and multi-output problems are both handled.
- """
- # Authors: Gilles Louppe <g.louppe@gmail.com>
- # Peter Prettenhofer <peter.prettenhofer@gmail.com>
- # Brian Holt <bdholt1@gmail.com>
- # Noel Dawe <noel@dawe.me>
- # Satrajit Gosh <satrajit.ghosh@gmail.com>
- # Joly Arnaud <arnaud.v.joly@gmail.com>
- # Fares Hedayati <fares.hedayati@gmail.com>
- # Nelson Liu <nelson@nelsonliu.me>
- #
- # License: BSD 3 clause
- import copy
- import numbers
- import warnings
- from abc import ABCMeta, abstractmethod
- from math import ceil
- from numbers import Integral, Real
- import numpy as np
- from scipy.sparse import issparse
- from ..base import (
- BaseEstimator,
- ClassifierMixin,
- MultiOutputMixin,
- RegressorMixin,
- _fit_context,
- clone,
- is_classifier,
- )
- from ..utils import Bunch, check_random_state, compute_sample_weight
- from ..utils._param_validation import Hidden, Interval, RealNotInt, StrOptions
- from ..utils.multiclass import check_classification_targets
- from ..utils.validation import (
- _assert_all_finite_element_wise,
- _check_sample_weight,
- assert_all_finite,
- check_is_fitted,
- )
- from . import _criterion, _splitter, _tree
- from ._criterion import Criterion
- from ._splitter import Splitter
- from ._tree import (
- BestFirstTreeBuilder,
- DepthFirstTreeBuilder,
- Tree,
- _build_pruned_tree_ccp,
- ccp_pruning_path,
- )
- from ._utils import _any_isnan_axis0
- __all__ = [
- "DecisionTreeClassifier",
- "DecisionTreeRegressor",
- "ExtraTreeClassifier",
- "ExtraTreeRegressor",
- ]
- # =============================================================================
- # Types and constants
- # =============================================================================
- DTYPE = _tree.DTYPE
- DOUBLE = _tree.DOUBLE
- CRITERIA_CLF = {
- "gini": _criterion.Gini,
- "log_loss": _criterion.Entropy,
- "entropy": _criterion.Entropy,
- }
- CRITERIA_REG = {
- "squared_error": _criterion.MSE,
- "friedman_mse": _criterion.FriedmanMSE,
- "absolute_error": _criterion.MAE,
- "poisson": _criterion.Poisson,
- }
- DENSE_SPLITTERS = {"best": _splitter.BestSplitter, "random": _splitter.RandomSplitter}
- SPARSE_SPLITTERS = {
- "best": _splitter.BestSparseSplitter,
- "random": _splitter.RandomSparseSplitter,
- }
- # =============================================================================
- # Base decision tree
- # =============================================================================
- class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta):
- """Base class for decision trees.
- Warning: This class should not be used directly.
- Use derived classes instead.
- """
- _parameter_constraints: dict = {
- "splitter": [StrOptions({"best", "random"})],
- "max_depth": [Interval(Integral, 1, None, closed="left"), None],
- "min_samples_split": [
- Interval(Integral, 2, None, closed="left"),
- Interval(RealNotInt, 0.0, 1.0, closed="right"),
- ],
- "min_samples_leaf": [
- Interval(Integral, 1, None, closed="left"),
- Interval(RealNotInt, 0.0, 1.0, closed="neither"),
- ],
- "min_weight_fraction_leaf": [Interval(Real, 0.0, 0.5, closed="both")],
- "max_features": [
- Interval(Integral, 1, None, closed="left"),
- Interval(RealNotInt, 0.0, 1.0, closed="right"),
- StrOptions({"sqrt", "log2"}),
- None,
- ],
- "random_state": ["random_state"],
- "max_leaf_nodes": [Interval(Integral, 2, None, closed="left"), None],
- "min_impurity_decrease": [Interval(Real, 0.0, None, closed="left")],
- "ccp_alpha": [Interval(Real, 0.0, None, closed="left")],
- }
- @abstractmethod
- def __init__(
- self,
- *,
- criterion,
- splitter,
- max_depth,
- min_samples_split,
- min_samples_leaf,
- min_weight_fraction_leaf,
- max_features,
- max_leaf_nodes,
- random_state,
- min_impurity_decrease,
- class_weight=None,
- ccp_alpha=0.0,
- ):
- self.criterion = criterion
- self.splitter = splitter
- self.max_depth = max_depth
- self.min_samples_split = min_samples_split
- self.min_samples_leaf = min_samples_leaf
- self.min_weight_fraction_leaf = min_weight_fraction_leaf
- self.max_features = max_features
- self.max_leaf_nodes = max_leaf_nodes
- self.random_state = random_state
- self.min_impurity_decrease = min_impurity_decrease
- self.class_weight = class_weight
- self.ccp_alpha = ccp_alpha
- def get_depth(self):
- """Return the depth of the decision tree.
- The depth of a tree is the maximum distance between the root
- and any leaf.
- Returns
- -------
- self.tree_.max_depth : int
- The maximum depth of the tree.
- """
- check_is_fitted(self)
- return self.tree_.max_depth
- def get_n_leaves(self):
- """Return the number of leaves of the decision tree.
- Returns
- -------
- self.tree_.n_leaves : int
- Number of leaves.
- """
- check_is_fitted(self)
- return self.tree_.n_leaves
- def _support_missing_values(self, X):
- return not issparse(X) and self._get_tags()["allow_nan"]
- def _compute_missing_values_in_feature_mask(self, X):
- """Return boolean mask denoting if there are missing values for each feature.
- This method also ensures that X is finite.
- Parameter
- ---------
- X : array-like of shape (n_samples, n_features), dtype=DOUBLE
- Input data.
- Returns
- -------
- missing_values_in_feature_mask : ndarray of shape (n_features,), or None
- Missing value mask. If missing values are not supported or there
- are no missing values, return None.
- """
- common_kwargs = dict(estimator_name=self.__class__.__name__, input_name="X")
- if not self._support_missing_values(X):
- assert_all_finite(X, **common_kwargs)
- return None
- with np.errstate(over="ignore"):
- overall_sum = np.sum(X)
- if not np.isfinite(overall_sum):
- # Raise a ValueError in case of the presence of an infinite element.
- _assert_all_finite_element_wise(X, xp=np, allow_nan=True, **common_kwargs)
- # If the sum is not nan, then there are no missing values
- if not np.isnan(overall_sum):
- return None
- missing_values_in_feature_mask = _any_isnan_axis0(X)
- return missing_values_in_feature_mask
- def _fit(
- self,
- X,
- y,
- sample_weight=None,
- check_input=True,
- missing_values_in_feature_mask=None,
- ):
- random_state = check_random_state(self.random_state)
- if check_input:
- # Need to validate separately here.
- # We can't pass multi_output=True because that would allow y to be
- # csr.
- # _compute_missing_values_in_feature_mask will check for finite values and
- # compute the missing mask if the tree supports missing values
- check_X_params = dict(
- dtype=DTYPE, accept_sparse="csc", force_all_finite=False
- )
- check_y_params = dict(ensure_2d=False, dtype=None)
- X, y = self._validate_data(
- X, y, validate_separately=(check_X_params, check_y_params)
- )
- missing_values_in_feature_mask = (
- self._compute_missing_values_in_feature_mask(X)
- )
- if issparse(X):
- X.sort_indices()
- if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
- raise ValueError(
- "No support for np.int64 index based sparse matrices"
- )
- if self.criterion == "poisson":
- if np.any(y < 0):
- raise ValueError(
- "Some value(s) of y are negative which is"
- " not allowed for Poisson regression."
- )
- if np.sum(y) <= 0:
- raise ValueError(
- "Sum of y is not positive which is "
- "necessary for Poisson regression."
- )
- # Determine output settings
- n_samples, self.n_features_in_ = X.shape
- is_classification = is_classifier(self)
- y = np.atleast_1d(y)
- expanded_class_weight = None
- if y.ndim == 1:
- # reshape is necessary to preserve the data contiguity against vs
- # [:, np.newaxis] that does not.
- y = np.reshape(y, (-1, 1))
- self.n_outputs_ = y.shape[1]
- if is_classification:
- check_classification_targets(y)
- y = np.copy(y)
- self.classes_ = []
- self.n_classes_ = []
- if self.class_weight is not None:
- y_original = np.copy(y)
- y_encoded = np.zeros(y.shape, dtype=int)
- for k in range(self.n_outputs_):
- classes_k, y_encoded[:, k] = np.unique(y[:, k], return_inverse=True)
- self.classes_.append(classes_k)
- self.n_classes_.append(classes_k.shape[0])
- y = y_encoded
- if self.class_weight is not None:
- expanded_class_weight = compute_sample_weight(
- self.class_weight, y_original
- )
- self.n_classes_ = np.array(self.n_classes_, dtype=np.intp)
- if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous:
- y = np.ascontiguousarray(y, dtype=DOUBLE)
- max_depth = np.iinfo(np.int32).max if self.max_depth is None else self.max_depth
- if isinstance(self.min_samples_leaf, numbers.Integral):
- min_samples_leaf = self.min_samples_leaf
- else: # float
- min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples))
- if isinstance(self.min_samples_split, numbers.Integral):
- min_samples_split = self.min_samples_split
- else: # float
- min_samples_split = int(ceil(self.min_samples_split * n_samples))
- min_samples_split = max(2, min_samples_split)
- min_samples_split = max(min_samples_split, 2 * min_samples_leaf)
- if isinstance(self.max_features, str):
- if self.max_features == "auto":
- if is_classification:
- max_features = max(1, int(np.sqrt(self.n_features_in_)))
- warnings.warn(
- (
- "`max_features='auto'` has been deprecated in 1.1 "
- "and will be removed in 1.3. To keep the past behaviour, "
- "explicitly set `max_features='sqrt'`."
- ),
- FutureWarning,
- )
- else:
- max_features = self.n_features_in_
- warnings.warn(
- (
- "`max_features='auto'` has been deprecated in 1.1 "
- "and will be removed in 1.3. To keep the past behaviour, "
- "explicitly set `max_features=1.0'`."
- ),
- FutureWarning,
- )
- elif self.max_features == "sqrt":
- max_features = max(1, int(np.sqrt(self.n_features_in_)))
- elif self.max_features == "log2":
- max_features = max(1, int(np.log2(self.n_features_in_)))
- elif self.max_features is None:
- max_features = self.n_features_in_
- elif isinstance(self.max_features, numbers.Integral):
- max_features = self.max_features
- else: # float
- if self.max_features > 0.0:
- max_features = max(1, int(self.max_features * self.n_features_in_))
- else:
- max_features = 0
- self.max_features_ = max_features
- max_leaf_nodes = -1 if self.max_leaf_nodes is None else self.max_leaf_nodes
- if len(y) != n_samples:
- raise ValueError(
- "Number of labels=%d does not match number of samples=%d"
- % (len(y), n_samples)
- )
- if sample_weight is not None:
- sample_weight = _check_sample_weight(sample_weight, X, DOUBLE)
- if expanded_class_weight is not None:
- if sample_weight is not None:
- sample_weight = sample_weight * expanded_class_weight
- else:
- sample_weight = expanded_class_weight
- # Set min_weight_leaf from min_weight_fraction_leaf
- if sample_weight is None:
- min_weight_leaf = self.min_weight_fraction_leaf * n_samples
- else:
- min_weight_leaf = self.min_weight_fraction_leaf * np.sum(sample_weight)
- # Build tree
- criterion = self.criterion
- if not isinstance(criterion, Criterion):
- if is_classification:
- criterion = CRITERIA_CLF[self.criterion](
- self.n_outputs_, self.n_classes_
- )
- else:
- criterion = CRITERIA_REG[self.criterion](self.n_outputs_, n_samples)
- else:
- # Make a deepcopy in case the criterion has mutable attributes that
- # might be shared and modified concurrently during parallel fitting
- criterion = copy.deepcopy(criterion)
- SPLITTERS = SPARSE_SPLITTERS if issparse(X) else DENSE_SPLITTERS
- splitter = self.splitter
- if not isinstance(self.splitter, Splitter):
- splitter = SPLITTERS[self.splitter](
- criterion,
- self.max_features_,
- min_samples_leaf,
- min_weight_leaf,
- random_state,
- )
- if is_classifier(self):
- self.tree_ = Tree(self.n_features_in_, self.n_classes_, self.n_outputs_)
- else:
- self.tree_ = Tree(
- self.n_features_in_,
- # TODO: tree shouldn't need this in this case
- np.array([1] * self.n_outputs_, dtype=np.intp),
- self.n_outputs_,
- )
- # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise
- if max_leaf_nodes < 0:
- builder = DepthFirstTreeBuilder(
- splitter,
- min_samples_split,
- min_samples_leaf,
- min_weight_leaf,
- max_depth,
- self.min_impurity_decrease,
- )
- else:
- builder = BestFirstTreeBuilder(
- splitter,
- min_samples_split,
- min_samples_leaf,
- min_weight_leaf,
- max_depth,
- max_leaf_nodes,
- self.min_impurity_decrease,
- )
- builder.build(self.tree_, X, y, sample_weight, missing_values_in_feature_mask)
- if self.n_outputs_ == 1 and is_classifier(self):
- self.n_classes_ = self.n_classes_[0]
- self.classes_ = self.classes_[0]
- self._prune_tree()
- return self
- def _validate_X_predict(self, X, check_input):
- """Validate the training data on predict (probabilities)."""
- if check_input:
- if self._support_missing_values(X):
- force_all_finite = "allow-nan"
- else:
- force_all_finite = True
- X = self._validate_data(
- X,
- dtype=DTYPE,
- accept_sparse="csr",
- reset=False,
- force_all_finite=force_all_finite,
- )
- if issparse(X) and (
- X.indices.dtype != np.intc or X.indptr.dtype != np.intc
- ):
- raise ValueError("No support for np.int64 index based sparse matrices")
- else:
- # The number of features is checked regardless of `check_input`
- self._check_n_features(X, reset=False)
- return X
- def predict(self, X, check_input=True):
- """Predict class or regression value for X.
- For a classification model, the predicted class for each sample in X is
- returned. For a regression model, the predicted value based on X is
- returned.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Internally, it will be converted to
- ``dtype=np.float32`` and if a sparse matrix is provided
- to a sparse ``csr_matrix``.
- check_input : bool, default=True
- Allow to bypass several input checking.
- Don't use this parameter unless you know what you're doing.
- Returns
- -------
- y : array-like of shape (n_samples,) or (n_samples, n_outputs)
- The predicted classes, or the predict values.
- """
- check_is_fitted(self)
- X = self._validate_X_predict(X, check_input)
- proba = self.tree_.predict(X)
- n_samples = X.shape[0]
- # Classification
- if is_classifier(self):
- if self.n_outputs_ == 1:
- return self.classes_.take(np.argmax(proba, axis=1), axis=0)
- else:
- class_type = self.classes_[0].dtype
- predictions = np.zeros((n_samples, self.n_outputs_), dtype=class_type)
- for k in range(self.n_outputs_):
- predictions[:, k] = self.classes_[k].take(
- np.argmax(proba[:, k], axis=1), axis=0
- )
- return predictions
- # Regression
- else:
- if self.n_outputs_ == 1:
- return proba[:, 0]
- else:
- return proba[:, :, 0]
- def apply(self, X, check_input=True):
- """Return the index of the leaf that each sample is predicted as.
- .. versionadded:: 0.17
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Internally, it will be converted to
- ``dtype=np.float32`` and if a sparse matrix is provided
- to a sparse ``csr_matrix``.
- check_input : bool, default=True
- Allow to bypass several input checking.
- Don't use this parameter unless you know what you're doing.
- Returns
- -------
- X_leaves : array-like of shape (n_samples,)
- For each datapoint x in X, return the index of the leaf x
- ends up in. Leaves are numbered within
- ``[0; self.tree_.node_count)``, possibly with gaps in the
- numbering.
- """
- check_is_fitted(self)
- X = self._validate_X_predict(X, check_input)
- return self.tree_.apply(X)
- def decision_path(self, X, check_input=True):
- """Return the decision path in the tree.
- .. versionadded:: 0.18
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Internally, it will be converted to
- ``dtype=np.float32`` and if a sparse matrix is provided
- to a sparse ``csr_matrix``.
- check_input : bool, default=True
- Allow to bypass several input checking.
- Don't use this parameter unless you know what you're doing.
- Returns
- -------
- indicator : sparse matrix of shape (n_samples, n_nodes)
- Return a node indicator CSR matrix where non zero elements
- indicates that the samples goes through the nodes.
- """
- X = self._validate_X_predict(X, check_input)
- return self.tree_.decision_path(X)
- def _prune_tree(self):
- """Prune tree using Minimal Cost-Complexity Pruning."""
- check_is_fitted(self)
- if self.ccp_alpha == 0.0:
- return
- # build pruned tree
- if is_classifier(self):
- n_classes = np.atleast_1d(self.n_classes_)
- pruned_tree = Tree(self.n_features_in_, n_classes, self.n_outputs_)
- else:
- pruned_tree = Tree(
- self.n_features_in_,
- # TODO: the tree shouldn't need this param
- np.array([1] * self.n_outputs_, dtype=np.intp),
- self.n_outputs_,
- )
- _build_pruned_tree_ccp(pruned_tree, self.tree_, self.ccp_alpha)
- self.tree_ = pruned_tree
- def cost_complexity_pruning_path(self, X, y, sample_weight=None):
- """Compute the pruning path during Minimal Cost-Complexity Pruning.
- See :ref:`minimal_cost_complexity_pruning` for details on the pruning
- process.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The training input samples. Internally, it will be converted to
- ``dtype=np.float32`` and if a sparse matrix is provided
- to a sparse ``csc_matrix``.
- y : array-like of shape (n_samples,) or (n_samples, n_outputs)
- The target values (class labels) as integers or strings.
- sample_weight : array-like of shape (n_samples,), default=None
- Sample weights. If None, then samples are equally weighted. Splits
- that would create child nodes with net zero or negative weight are
- ignored while searching for a split in each node. Splits are also
- ignored if they would result in any single class carrying a
- negative weight in either child node.
- Returns
- -------
- ccp_path : :class:`~sklearn.utils.Bunch`
- Dictionary-like object, with the following attributes.
- ccp_alphas : ndarray
- Effective alphas of subtree during pruning.
- impurities : ndarray
- Sum of the impurities of the subtree leaves for the
- corresponding alpha value in ``ccp_alphas``.
- """
- est = clone(self).set_params(ccp_alpha=0.0)
- est.fit(X, y, sample_weight=sample_weight)
- return Bunch(**ccp_pruning_path(est.tree_))
- @property
- def feature_importances_(self):
- """Return the feature importances.
- The importance of a feature is computed as the (normalized) total
- reduction of the criterion brought by that feature.
- It is also known as the Gini importance.
- Warning: impurity-based feature importances can be misleading for
- high cardinality features (many unique values). See
- :func:`sklearn.inspection.permutation_importance` as an alternative.
- Returns
- -------
- feature_importances_ : ndarray of shape (n_features,)
- Normalized total reduction of criteria by feature
- (Gini importance).
- """
- check_is_fitted(self)
- return self.tree_.compute_feature_importances()
- # =============================================================================
- # Public estimators
- # =============================================================================
- class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree):
- """A decision tree classifier.
- Read more in the :ref:`User Guide <tree>`.
- Parameters
- ----------
- criterion : {"gini", "entropy", "log_loss"}, default="gini"
- The function to measure the quality of a split. Supported criteria are
- "gini" for the Gini impurity and "log_loss" and "entropy" both for the
- Shannon information gain, see :ref:`tree_mathematical_formulation`.
- splitter : {"best", "random"}, default="best"
- The strategy used to choose the split at each node. Supported
- strategies are "best" to choose the best split and "random" to choose
- the best random split.
- max_depth : int, default=None
- The maximum depth of the tree. If None, then nodes are expanded until
- all leaves are pure or until all leaves contain less than
- min_samples_split samples.
- min_samples_split : int or float, default=2
- The minimum number of samples required to split an internal node:
- - If int, then consider `min_samples_split` as the minimum number.
- - If float, then `min_samples_split` is a fraction and
- `ceil(min_samples_split * n_samples)` are the minimum
- number of samples for each split.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_samples_leaf : int or float, default=1
- The minimum number of samples required to be at a leaf node.
- A split point at any depth will only be considered if it leaves at
- least ``min_samples_leaf`` training samples in each of the left and
- right branches. This may have the effect of smoothing the model,
- especially in regression.
- - If int, then consider `min_samples_leaf` as the minimum number.
- - If float, then `min_samples_leaf` is a fraction and
- `ceil(min_samples_leaf * n_samples)` are the minimum
- number of samples for each node.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_weight_fraction_leaf : float, default=0.0
- The minimum weighted fraction of the sum total of weights (of all
- the input samples) required to be at a leaf node. Samples have
- equal weight when sample_weight is not provided.
- max_features : int, float or {"auto", "sqrt", "log2"}, default=None
- The number of features to consider when looking for the best split:
- - If int, then consider `max_features` features at each split.
- - If float, then `max_features` is a fraction and
- `max(1, int(max_features * n_features_in_))` features are considered at
- each split.
- - If "sqrt", then `max_features=sqrt(n_features)`.
- - If "log2", then `max_features=log2(n_features)`.
- - If None, then `max_features=n_features`.
- Note: the search for a split does not stop until at least one
- valid partition of the node samples is found, even if it requires to
- effectively inspect more than ``max_features`` features.
- random_state : int, RandomState instance or None, default=None
- Controls the randomness of the estimator. The features are always
- randomly permuted at each split, even if ``splitter`` is set to
- ``"best"``. When ``max_features < n_features``, the algorithm will
- select ``max_features`` at random at each split before finding the best
- split among them. But the best found split may vary across different
- runs, even if ``max_features=n_features``. That is the case, if the
- improvement of the criterion is identical for several splits and one
- split has to be selected at random. To obtain a deterministic behaviour
- during fitting, ``random_state`` has to be fixed to an integer.
- See :term:`Glossary <random_state>` for details.
- max_leaf_nodes : int, default=None
- Grow a tree with ``max_leaf_nodes`` in best-first fashion.
- Best nodes are defined as relative reduction in impurity.
- If None then unlimited number of leaf nodes.
- min_impurity_decrease : float, default=0.0
- A node will be split if this split induces a decrease of the impurity
- greater than or equal to this value.
- The weighted impurity decrease equation is the following::
- N_t / N * (impurity - N_t_R / N_t * right_impurity
- - N_t_L / N_t * left_impurity)
- where ``N`` is the total number of samples, ``N_t`` is the number of
- samples at the current node, ``N_t_L`` is the number of samples in the
- left child, and ``N_t_R`` is the number of samples in the right child.
- ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
- if ``sample_weight`` is passed.
- .. versionadded:: 0.19
- class_weight : dict, list of dict or "balanced", default=None
- Weights associated with classes in the form ``{class_label: weight}``.
- If None, all classes are supposed to have weight one. For
- multi-output problems, a list of dicts can be provided in the same
- order as the columns of y.
- Note that for multioutput (including multilabel) weights should be
- defined for each class of every column in its own dict. For example,
- for four-class multilabel classification weights should be
- [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
- [{1:1}, {2:5}, {3:1}, {4:1}].
- The "balanced" mode uses the values of y to automatically adjust
- weights inversely proportional to class frequencies in the input data
- as ``n_samples / (n_classes * np.bincount(y))``
- For multi-output, the weights of each column of y will be multiplied.
- Note that these weights will be multiplied with sample_weight (passed
- through the fit method) if sample_weight is specified.
- ccp_alpha : non-negative float, default=0.0
- Complexity parameter used for Minimal Cost-Complexity Pruning. The
- subtree with the largest cost complexity that is smaller than
- ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
- :ref:`minimal_cost_complexity_pruning` for details.
- .. versionadded:: 0.22
- Attributes
- ----------
- classes_ : ndarray of shape (n_classes,) or list of ndarray
- The classes labels (single output problem),
- or a list of arrays of class labels (multi-output problem).
- feature_importances_ : ndarray of shape (n_features,)
- The impurity-based feature importances.
- The higher, the more important the feature.
- The importance of a feature is computed as the (normalized)
- total reduction of the criterion brought by that feature. It is also
- known as the Gini importance [4]_.
- Warning: impurity-based feature importances can be misleading for
- high cardinality features (many unique values). See
- :func:`sklearn.inspection.permutation_importance` as an alternative.
- max_features_ : int
- The inferred value of max_features.
- n_classes_ : int or list of int
- The number of classes (for single output problems),
- or a list containing the number of classes for each
- output (for multi-output problems).
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- n_outputs_ : int
- The number of outputs when ``fit`` is performed.
- tree_ : Tree instance
- The underlying Tree object. Please refer to
- ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and
- :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`
- for basic usage of these attributes.
- See Also
- --------
- DecisionTreeRegressor : A decision tree regressor.
- Notes
- -----
- The default values for the parameters controlling the size of the trees
- (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
- unpruned trees which can potentially be very large on some data sets. To
- reduce memory consumption, the complexity and size of the trees should be
- controlled by setting those parameter values.
- The :meth:`predict` method operates using the :func:`numpy.argmax`
- function on the outputs of :meth:`predict_proba`. This means that in
- case the highest predicted probabilities are tied, the classifier will
- predict the tied class with the lowest index in :term:`classes_`.
- References
- ----------
- .. [1] https://en.wikipedia.org/wiki/Decision_tree_learning
- .. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification
- and Regression Trees", Wadsworth, Belmont, CA, 1984.
- .. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical
- Learning", Springer, 2009.
- .. [4] L. Breiman, and A. Cutler, "Random Forests",
- https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm
- Examples
- --------
- >>> from sklearn.datasets import load_iris
- >>> from sklearn.model_selection import cross_val_score
- >>> from sklearn.tree import DecisionTreeClassifier
- >>> clf = DecisionTreeClassifier(random_state=0)
- >>> iris = load_iris()
- >>> cross_val_score(clf, iris.data, iris.target, cv=10)
- ... # doctest: +SKIP
- ...
- array([ 1. , 0.93..., 0.86..., 0.93..., 0.93...,
- 0.93..., 0.93..., 1. , 0.93..., 1. ])
- """
- _parameter_constraints: dict = {
- **BaseDecisionTree._parameter_constraints,
- "criterion": [StrOptions({"gini", "entropy", "log_loss"}), Hidden(Criterion)],
- "class_weight": [dict, list, StrOptions({"balanced"}), None],
- }
- def __init__(
- self,
- *,
- criterion="gini",
- splitter="best",
- max_depth=None,
- min_samples_split=2,
- min_samples_leaf=1,
- min_weight_fraction_leaf=0.0,
- max_features=None,
- random_state=None,
- max_leaf_nodes=None,
- min_impurity_decrease=0.0,
- class_weight=None,
- ccp_alpha=0.0,
- ):
- super().__init__(
- criterion=criterion,
- splitter=splitter,
- max_depth=max_depth,
- min_samples_split=min_samples_split,
- min_samples_leaf=min_samples_leaf,
- min_weight_fraction_leaf=min_weight_fraction_leaf,
- max_features=max_features,
- max_leaf_nodes=max_leaf_nodes,
- class_weight=class_weight,
- random_state=random_state,
- min_impurity_decrease=min_impurity_decrease,
- ccp_alpha=ccp_alpha,
- )
- @_fit_context(prefer_skip_nested_validation=True)
- def fit(self, X, y, sample_weight=None, check_input=True):
- """Build a decision tree classifier from the training set (X, y).
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The training input samples. Internally, it will be converted to
- ``dtype=np.float32`` and if a sparse matrix is provided
- to a sparse ``csc_matrix``.
- y : array-like of shape (n_samples,) or (n_samples, n_outputs)
- The target values (class labels) as integers or strings.
- sample_weight : array-like of shape (n_samples,), default=None
- Sample weights. If None, then samples are equally weighted. Splits
- that would create child nodes with net zero or negative weight are
- ignored while searching for a split in each node. Splits are also
- ignored if they would result in any single class carrying a
- negative weight in either child node.
- check_input : bool, default=True
- Allow to bypass several input checking.
- Don't use this parameter unless you know what you're doing.
- Returns
- -------
- self : DecisionTreeClassifier
- Fitted estimator.
- """
- super()._fit(
- X,
- y,
- sample_weight=sample_weight,
- check_input=check_input,
- )
- return self
- def predict_proba(self, X, check_input=True):
- """Predict class probabilities of the input samples X.
- The predicted class probability is the fraction of samples of the same
- class in a leaf.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Internally, it will be converted to
- ``dtype=np.float32`` and if a sparse matrix is provided
- to a sparse ``csr_matrix``.
- check_input : bool, default=True
- Allow to bypass several input checking.
- Don't use this parameter unless you know what you're doing.
- Returns
- -------
- proba : ndarray of shape (n_samples, n_classes) or list of n_outputs \
- such arrays if n_outputs > 1
- The class probabilities of the input samples. The order of the
- classes corresponds to that in the attribute :term:`classes_`.
- """
- check_is_fitted(self)
- X = self._validate_X_predict(X, check_input)
- proba = self.tree_.predict(X)
- if self.n_outputs_ == 1:
- proba = proba[:, : self.n_classes_]
- normalizer = proba.sum(axis=1)[:, np.newaxis]
- normalizer[normalizer == 0.0] = 1.0
- proba /= normalizer
- return proba
- else:
- all_proba = []
- for k in range(self.n_outputs_):
- proba_k = proba[:, k, : self.n_classes_[k]]
- normalizer = proba_k.sum(axis=1)[:, np.newaxis]
- normalizer[normalizer == 0.0] = 1.0
- proba_k /= normalizer
- all_proba.append(proba_k)
- return all_proba
- def predict_log_proba(self, X):
- """Predict class log-probabilities of the input samples X.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The input samples. Internally, it will be converted to
- ``dtype=np.float32`` and if a sparse matrix is provided
- to a sparse ``csr_matrix``.
- Returns
- -------
- proba : ndarray of shape (n_samples, n_classes) or list of n_outputs \
- such arrays if n_outputs > 1
- The class log-probabilities of the input samples. The order of the
- classes corresponds to that in the attribute :term:`classes_`.
- """
- proba = self.predict_proba(X)
- if self.n_outputs_ == 1:
- return np.log(proba)
- else:
- for k in range(self.n_outputs_):
- proba[k] = np.log(proba[k])
- return proba
- def _more_tags(self):
- # XXX: nan is only support for dense arrays, but we set this for common test to
- # pass, specifically: check_estimators_nan_inf
- allow_nan = self.splitter == "best" and self.criterion in {
- "gini",
- "log_loss",
- "entropy",
- }
- return {"multilabel": True, "allow_nan": allow_nan}
- class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree):
- """A decision tree regressor.
- Read more in the :ref:`User Guide <tree>`.
- Parameters
- ----------
- criterion : {"squared_error", "friedman_mse", "absolute_error", \
- "poisson"}, default="squared_error"
- The function to measure the quality of a split. Supported criteria
- are "squared_error" for the mean squared error, which is equal to
- variance reduction as feature selection criterion and minimizes the L2
- loss using the mean of each terminal node, "friedman_mse", which uses
- mean squared error with Friedman's improvement score for potential
- splits, "absolute_error" for the mean absolute error, which minimizes
- the L1 loss using the median of each terminal node, and "poisson" which
- uses reduction in Poisson deviance to find splits.
- .. versionadded:: 0.18
- Mean Absolute Error (MAE) criterion.
- .. versionadded:: 0.24
- Poisson deviance criterion.
- splitter : {"best", "random"}, default="best"
- The strategy used to choose the split at each node. Supported
- strategies are "best" to choose the best split and "random" to choose
- the best random split.
- max_depth : int, default=None
- The maximum depth of the tree. If None, then nodes are expanded until
- all leaves are pure or until all leaves contain less than
- min_samples_split samples.
- min_samples_split : int or float, default=2
- The minimum number of samples required to split an internal node:
- - If int, then consider `min_samples_split` as the minimum number.
- - If float, then `min_samples_split` is a fraction and
- `ceil(min_samples_split * n_samples)` are the minimum
- number of samples for each split.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_samples_leaf : int or float, default=1
- The minimum number of samples required to be at a leaf node.
- A split point at any depth will only be considered if it leaves at
- least ``min_samples_leaf`` training samples in each of the left and
- right branches. This may have the effect of smoothing the model,
- especially in regression.
- - If int, then consider `min_samples_leaf` as the minimum number.
- - If float, then `min_samples_leaf` is a fraction and
- `ceil(min_samples_leaf * n_samples)` are the minimum
- number of samples for each node.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_weight_fraction_leaf : float, default=0.0
- The minimum weighted fraction of the sum total of weights (of all
- the input samples) required to be at a leaf node. Samples have
- equal weight when sample_weight is not provided.
- max_features : int, float or {"auto", "sqrt", "log2"}, default=None
- The number of features to consider when looking for the best split:
- - If int, then consider `max_features` features at each split.
- - If float, then `max_features` is a fraction and
- `max(1, int(max_features * n_features_in_))` features are considered at each
- split.
- - If "sqrt", then `max_features=sqrt(n_features)`.
- - If "log2", then `max_features=log2(n_features)`.
- - If None, then `max_features=n_features`.
- Note: the search for a split does not stop until at least one
- valid partition of the node samples is found, even if it requires to
- effectively inspect more than ``max_features`` features.
- random_state : int, RandomState instance or None, default=None
- Controls the randomness of the estimator. The features are always
- randomly permuted at each split, even if ``splitter`` is set to
- ``"best"``. When ``max_features < n_features``, the algorithm will
- select ``max_features`` at random at each split before finding the best
- split among them. But the best found split may vary across different
- runs, even if ``max_features=n_features``. That is the case, if the
- improvement of the criterion is identical for several splits and one
- split has to be selected at random. To obtain a deterministic behaviour
- during fitting, ``random_state`` has to be fixed to an integer.
- See :term:`Glossary <random_state>` for details.
- max_leaf_nodes : int, default=None
- Grow a tree with ``max_leaf_nodes`` in best-first fashion.
- Best nodes are defined as relative reduction in impurity.
- If None then unlimited number of leaf nodes.
- min_impurity_decrease : float, default=0.0
- A node will be split if this split induces a decrease of the impurity
- greater than or equal to this value.
- The weighted impurity decrease equation is the following::
- N_t / N * (impurity - N_t_R / N_t * right_impurity
- - N_t_L / N_t * left_impurity)
- where ``N`` is the total number of samples, ``N_t`` is the number of
- samples at the current node, ``N_t_L`` is the number of samples in the
- left child, and ``N_t_R`` is the number of samples in the right child.
- ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
- if ``sample_weight`` is passed.
- .. versionadded:: 0.19
- ccp_alpha : non-negative float, default=0.0
- Complexity parameter used for Minimal Cost-Complexity Pruning. The
- subtree with the largest cost complexity that is smaller than
- ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
- :ref:`minimal_cost_complexity_pruning` for details.
- .. versionadded:: 0.22
- Attributes
- ----------
- feature_importances_ : ndarray of shape (n_features,)
- The feature importances.
- The higher, the more important the feature.
- The importance of a feature is computed as the
- (normalized) total reduction of the criterion brought
- by that feature. It is also known as the Gini importance [4]_.
- Warning: impurity-based feature importances can be misleading for
- high cardinality features (many unique values). See
- :func:`sklearn.inspection.permutation_importance` as an alternative.
- max_features_ : int
- The inferred value of max_features.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- n_outputs_ : int
- The number of outputs when ``fit`` is performed.
- tree_ : Tree instance
- The underlying Tree object. Please refer to
- ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and
- :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`
- for basic usage of these attributes.
- See Also
- --------
- DecisionTreeClassifier : A decision tree classifier.
- Notes
- -----
- The default values for the parameters controlling the size of the trees
- (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
- unpruned trees which can potentially be very large on some data sets. To
- reduce memory consumption, the complexity and size of the trees should be
- controlled by setting those parameter values.
- References
- ----------
- .. [1] https://en.wikipedia.org/wiki/Decision_tree_learning
- .. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification
- and Regression Trees", Wadsworth, Belmont, CA, 1984.
- .. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical
- Learning", Springer, 2009.
- .. [4] L. Breiman, and A. Cutler, "Random Forests",
- https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm
- Examples
- --------
- >>> from sklearn.datasets import load_diabetes
- >>> from sklearn.model_selection import cross_val_score
- >>> from sklearn.tree import DecisionTreeRegressor
- >>> X, y = load_diabetes(return_X_y=True)
- >>> regressor = DecisionTreeRegressor(random_state=0)
- >>> cross_val_score(regressor, X, y, cv=10)
- ... # doctest: +SKIP
- ...
- array([-0.39..., -0.46..., 0.02..., 0.06..., -0.50...,
- 0.16..., 0.11..., -0.73..., -0.30..., -0.00...])
- """
- _parameter_constraints: dict = {
- **BaseDecisionTree._parameter_constraints,
- "criterion": [
- StrOptions({"squared_error", "friedman_mse", "absolute_error", "poisson"}),
- Hidden(Criterion),
- ],
- }
- def __init__(
- self,
- *,
- criterion="squared_error",
- splitter="best",
- max_depth=None,
- min_samples_split=2,
- min_samples_leaf=1,
- min_weight_fraction_leaf=0.0,
- max_features=None,
- random_state=None,
- max_leaf_nodes=None,
- min_impurity_decrease=0.0,
- ccp_alpha=0.0,
- ):
- super().__init__(
- criterion=criterion,
- splitter=splitter,
- max_depth=max_depth,
- min_samples_split=min_samples_split,
- min_samples_leaf=min_samples_leaf,
- min_weight_fraction_leaf=min_weight_fraction_leaf,
- max_features=max_features,
- max_leaf_nodes=max_leaf_nodes,
- random_state=random_state,
- min_impurity_decrease=min_impurity_decrease,
- ccp_alpha=ccp_alpha,
- )
- @_fit_context(prefer_skip_nested_validation=True)
- def fit(self, X, y, sample_weight=None, check_input=True):
- """Build a decision tree regressor from the training set (X, y).
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- The training input samples. Internally, it will be converted to
- ``dtype=np.float32`` and if a sparse matrix is provided
- to a sparse ``csc_matrix``.
- y : array-like of shape (n_samples,) or (n_samples, n_outputs)
- The target values (real numbers). Use ``dtype=np.float64`` and
- ``order='C'`` for maximum efficiency.
- sample_weight : array-like of shape (n_samples,), default=None
- Sample weights. If None, then samples are equally weighted. Splits
- that would create child nodes with net zero or negative weight are
- ignored while searching for a split in each node.
- check_input : bool, default=True
- Allow to bypass several input checking.
- Don't use this parameter unless you know what you're doing.
- Returns
- -------
- self : DecisionTreeRegressor
- Fitted estimator.
- """
- super()._fit(
- X,
- y,
- sample_weight=sample_weight,
- check_input=check_input,
- )
- return self
- def _compute_partial_dependence_recursion(self, grid, target_features):
- """Fast partial dependence computation.
- Parameters
- ----------
- grid : ndarray of shape (n_samples, n_target_features)
- The grid points on which the partial dependence should be
- evaluated.
- target_features : ndarray of shape (n_target_features)
- The set of target features for which the partial dependence
- should be evaluated.
- Returns
- -------
- averaged_predictions : ndarray of shape (n_samples,)
- The value of the partial dependence function on each grid point.
- """
- grid = np.asarray(grid, dtype=DTYPE, order="C")
- averaged_predictions = np.zeros(
- shape=grid.shape[0], dtype=np.float64, order="C"
- )
- self.tree_.compute_partial_dependence(
- grid, target_features, averaged_predictions
- )
- return averaged_predictions
- def _more_tags(self):
- # XXX: nan is only support for dense arrays, but we set this for common test to
- # pass, specifically: check_estimators_nan_inf
- allow_nan = self.splitter == "best" and self.criterion in {
- "squared_error",
- "friedman_mse",
- "poisson",
- }
- return {"allow_nan": allow_nan}
- class ExtraTreeClassifier(DecisionTreeClassifier):
- """An extremely randomized tree classifier.
- Extra-trees differ from classic decision trees in the way they are built.
- When looking for the best split to separate the samples of a node into two
- groups, random splits are drawn for each of the `max_features` randomly
- selected features and the best split among those is chosen. When
- `max_features` is set 1, this amounts to building a totally random
- decision tree.
- Warning: Extra-trees should only be used within ensemble methods.
- Read more in the :ref:`User Guide <tree>`.
- Parameters
- ----------
- criterion : {"gini", "entropy", "log_loss"}, default="gini"
- The function to measure the quality of a split. Supported criteria are
- "gini" for the Gini impurity and "log_loss" and "entropy" both for the
- Shannon information gain, see :ref:`tree_mathematical_formulation`.
- splitter : {"random", "best"}, default="random"
- The strategy used to choose the split at each node. Supported
- strategies are "best" to choose the best split and "random" to choose
- the best random split.
- max_depth : int, default=None
- The maximum depth of the tree. If None, then nodes are expanded until
- all leaves are pure or until all leaves contain less than
- min_samples_split samples.
- min_samples_split : int or float, default=2
- The minimum number of samples required to split an internal node:
- - If int, then consider `min_samples_split` as the minimum number.
- - If float, then `min_samples_split` is a fraction and
- `ceil(min_samples_split * n_samples)` are the minimum
- number of samples for each split.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_samples_leaf : int or float, default=1
- The minimum number of samples required to be at a leaf node.
- A split point at any depth will only be considered if it leaves at
- least ``min_samples_leaf`` training samples in each of the left and
- right branches. This may have the effect of smoothing the model,
- especially in regression.
- - If int, then consider `min_samples_leaf` as the minimum number.
- - If float, then `min_samples_leaf` is a fraction and
- `ceil(min_samples_leaf * n_samples)` are the minimum
- number of samples for each node.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_weight_fraction_leaf : float, default=0.0
- The minimum weighted fraction of the sum total of weights (of all
- the input samples) required to be at a leaf node. Samples have
- equal weight when sample_weight is not provided.
- max_features : int, float, {"auto", "sqrt", "log2"} or None, default="sqrt"
- The number of features to consider when looking for the best split:
- - If int, then consider `max_features` features at each split.
- - If float, then `max_features` is a fraction and
- `max(1, int(max_features * n_features_in_))` features are considered at
- each split.
- - If "sqrt", then `max_features=sqrt(n_features)`.
- - If "log2", then `max_features=log2(n_features)`.
- - If None, then `max_features=n_features`.
- .. versionchanged:: 1.1
- The default of `max_features` changed from `"auto"` to `"sqrt"`.
- Note: the search for a split does not stop until at least one
- valid partition of the node samples is found, even if it requires to
- effectively inspect more than ``max_features`` features.
- random_state : int, RandomState instance or None, default=None
- Used to pick randomly the `max_features` used at each split.
- See :term:`Glossary <random_state>` for details.
- max_leaf_nodes : int, default=None
- Grow a tree with ``max_leaf_nodes`` in best-first fashion.
- Best nodes are defined as relative reduction in impurity.
- If None then unlimited number of leaf nodes.
- min_impurity_decrease : float, default=0.0
- A node will be split if this split induces a decrease of the impurity
- greater than or equal to this value.
- The weighted impurity decrease equation is the following::
- N_t / N * (impurity - N_t_R / N_t * right_impurity
- - N_t_L / N_t * left_impurity)
- where ``N`` is the total number of samples, ``N_t`` is the number of
- samples at the current node, ``N_t_L`` is the number of samples in the
- left child, and ``N_t_R`` is the number of samples in the right child.
- ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
- if ``sample_weight`` is passed.
- .. versionadded:: 0.19
- class_weight : dict, list of dict or "balanced", default=None
- Weights associated with classes in the form ``{class_label: weight}``.
- If None, all classes are supposed to have weight one. For
- multi-output problems, a list of dicts can be provided in the same
- order as the columns of y.
- Note that for multioutput (including multilabel) weights should be
- defined for each class of every column in its own dict. For example,
- for four-class multilabel classification weights should be
- [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
- [{1:1}, {2:5}, {3:1}, {4:1}].
- The "balanced" mode uses the values of y to automatically adjust
- weights inversely proportional to class frequencies in the input data
- as ``n_samples / (n_classes * np.bincount(y))``
- For multi-output, the weights of each column of y will be multiplied.
- Note that these weights will be multiplied with sample_weight (passed
- through the fit method) if sample_weight is specified.
- ccp_alpha : non-negative float, default=0.0
- Complexity parameter used for Minimal Cost-Complexity Pruning. The
- subtree with the largest cost complexity that is smaller than
- ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
- :ref:`minimal_cost_complexity_pruning` for details.
- .. versionadded:: 0.22
- Attributes
- ----------
- classes_ : ndarray of shape (n_classes,) or list of ndarray
- The classes labels (single output problem),
- or a list of arrays of class labels (multi-output problem).
- max_features_ : int
- The inferred value of max_features.
- n_classes_ : int or list of int
- The number of classes (for single output problems),
- or a list containing the number of classes for each
- output (for multi-output problems).
- feature_importances_ : ndarray of shape (n_features,)
- The impurity-based feature importances.
- The higher, the more important the feature.
- The importance of a feature is computed as the (normalized)
- total reduction of the criterion brought by that feature. It is also
- known as the Gini importance.
- Warning: impurity-based feature importances can be misleading for
- high cardinality features (many unique values). See
- :func:`sklearn.inspection.permutation_importance` as an alternative.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- n_outputs_ : int
- The number of outputs when ``fit`` is performed.
- tree_ : Tree instance
- The underlying Tree object. Please refer to
- ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and
- :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`
- for basic usage of these attributes.
- See Also
- --------
- ExtraTreeRegressor : An extremely randomized tree regressor.
- sklearn.ensemble.ExtraTreesClassifier : An extra-trees classifier.
- sklearn.ensemble.ExtraTreesRegressor : An extra-trees regressor.
- sklearn.ensemble.RandomForestClassifier : A random forest classifier.
- sklearn.ensemble.RandomForestRegressor : A random forest regressor.
- sklearn.ensemble.RandomTreesEmbedding : An ensemble of
- totally random trees.
- Notes
- -----
- The default values for the parameters controlling the size of the trees
- (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
- unpruned trees which can potentially be very large on some data sets. To
- reduce memory consumption, the complexity and size of the trees should be
- controlled by setting those parameter values.
- References
- ----------
- .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
- Machine Learning, 63(1), 3-42, 2006.
- Examples
- --------
- >>> from sklearn.datasets import load_iris
- >>> from sklearn.model_selection import train_test_split
- >>> from sklearn.ensemble import BaggingClassifier
- >>> from sklearn.tree import ExtraTreeClassifier
- >>> X, y = load_iris(return_X_y=True)
- >>> X_train, X_test, y_train, y_test = train_test_split(
- ... X, y, random_state=0)
- >>> extra_tree = ExtraTreeClassifier(random_state=0)
- >>> cls = BaggingClassifier(extra_tree, random_state=0).fit(
- ... X_train, y_train)
- >>> cls.score(X_test, y_test)
- 0.8947...
- """
- def __init__(
- self,
- *,
- criterion="gini",
- splitter="random",
- max_depth=None,
- min_samples_split=2,
- min_samples_leaf=1,
- min_weight_fraction_leaf=0.0,
- max_features="sqrt",
- random_state=None,
- max_leaf_nodes=None,
- min_impurity_decrease=0.0,
- class_weight=None,
- ccp_alpha=0.0,
- ):
- super().__init__(
- criterion=criterion,
- splitter=splitter,
- max_depth=max_depth,
- min_samples_split=min_samples_split,
- min_samples_leaf=min_samples_leaf,
- min_weight_fraction_leaf=min_weight_fraction_leaf,
- max_features=max_features,
- max_leaf_nodes=max_leaf_nodes,
- class_weight=class_weight,
- min_impurity_decrease=min_impurity_decrease,
- random_state=random_state,
- ccp_alpha=ccp_alpha,
- )
- class ExtraTreeRegressor(DecisionTreeRegressor):
- """An extremely randomized tree regressor.
- Extra-trees differ from classic decision trees in the way they are built.
- When looking for the best split to separate the samples of a node into two
- groups, random splits are drawn for each of the `max_features` randomly
- selected features and the best split among those is chosen. When
- `max_features` is set 1, this amounts to building a totally random
- decision tree.
- Warning: Extra-trees should only be used within ensemble methods.
- Read more in the :ref:`User Guide <tree>`.
- Parameters
- ----------
- criterion : {"squared_error", "friedman_mse", "absolute_error", "poisson"}, \
- default="squared_error"
- The function to measure the quality of a split. Supported criteria
- are "squared_error" for the mean squared error, which is equal to
- variance reduction as feature selection criterion and minimizes the L2
- loss using the mean of each terminal node, "friedman_mse", which uses
- mean squared error with Friedman's improvement score for potential
- splits, "absolute_error" for the mean absolute error, which minimizes
- the L1 loss using the median of each terminal node, and "poisson" which
- uses reduction in Poisson deviance to find splits.
- .. versionadded:: 0.18
- Mean Absolute Error (MAE) criterion.
- .. versionadded:: 0.24
- Poisson deviance criterion.
- splitter : {"random", "best"}, default="random"
- The strategy used to choose the split at each node. Supported
- strategies are "best" to choose the best split and "random" to choose
- the best random split.
- max_depth : int, default=None
- The maximum depth of the tree. If None, then nodes are expanded until
- all leaves are pure or until all leaves contain less than
- min_samples_split samples.
- min_samples_split : int or float, default=2
- The minimum number of samples required to split an internal node:
- - If int, then consider `min_samples_split` as the minimum number.
- - If float, then `min_samples_split` is a fraction and
- `ceil(min_samples_split * n_samples)` are the minimum
- number of samples for each split.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_samples_leaf : int or float, default=1
- The minimum number of samples required to be at a leaf node.
- A split point at any depth will only be considered if it leaves at
- least ``min_samples_leaf`` training samples in each of the left and
- right branches. This may have the effect of smoothing the model,
- especially in regression.
- - If int, then consider `min_samples_leaf` as the minimum number.
- - If float, then `min_samples_leaf` is a fraction and
- `ceil(min_samples_leaf * n_samples)` are the minimum
- number of samples for each node.
- .. versionchanged:: 0.18
- Added float values for fractions.
- min_weight_fraction_leaf : float, default=0.0
- The minimum weighted fraction of the sum total of weights (of all
- the input samples) required to be at a leaf node. Samples have
- equal weight when sample_weight is not provided.
- max_features : int, float, {"auto", "sqrt", "log2"} or None, default=1.0
- The number of features to consider when looking for the best split:
- - If int, then consider `max_features` features at each split.
- - If float, then `max_features` is a fraction and
- `max(1, int(max_features * n_features_in_))` features are considered at each
- split.
- - If "sqrt", then `max_features=sqrt(n_features)`.
- - If "log2", then `max_features=log2(n_features)`.
- - If None, then `max_features=n_features`.
- .. versionchanged:: 1.1
- The default of `max_features` changed from `"auto"` to `1.0`.
- Note: the search for a split does not stop until at least one
- valid partition of the node samples is found, even if it requires to
- effectively inspect more than ``max_features`` features.
- random_state : int, RandomState instance or None, default=None
- Used to pick randomly the `max_features` used at each split.
- See :term:`Glossary <random_state>` for details.
- min_impurity_decrease : float, default=0.0
- A node will be split if this split induces a decrease of the impurity
- greater than or equal to this value.
- The weighted impurity decrease equation is the following::
- N_t / N * (impurity - N_t_R / N_t * right_impurity
- - N_t_L / N_t * left_impurity)
- where ``N`` is the total number of samples, ``N_t`` is the number of
- samples at the current node, ``N_t_L`` is the number of samples in the
- left child, and ``N_t_R`` is the number of samples in the right child.
- ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
- if ``sample_weight`` is passed.
- .. versionadded:: 0.19
- max_leaf_nodes : int, default=None
- Grow a tree with ``max_leaf_nodes`` in best-first fashion.
- Best nodes are defined as relative reduction in impurity.
- If None then unlimited number of leaf nodes.
- ccp_alpha : non-negative float, default=0.0
- Complexity parameter used for Minimal Cost-Complexity Pruning. The
- subtree with the largest cost complexity that is smaller than
- ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
- :ref:`minimal_cost_complexity_pruning` for details.
- .. versionadded:: 0.22
- Attributes
- ----------
- max_features_ : int
- The inferred value of max_features.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- feature_importances_ : ndarray of shape (n_features,)
- Return impurity-based feature importances (the higher, the more
- important the feature).
- Warning: impurity-based feature importances can be misleading for
- high cardinality features (many unique values). See
- :func:`sklearn.inspection.permutation_importance` as an alternative.
- n_outputs_ : int
- The number of outputs when ``fit`` is performed.
- tree_ : Tree instance
- The underlying Tree object. Please refer to
- ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and
- :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`
- for basic usage of these attributes.
- See Also
- --------
- ExtraTreeClassifier : An extremely randomized tree classifier.
- sklearn.ensemble.ExtraTreesClassifier : An extra-trees classifier.
- sklearn.ensemble.ExtraTreesRegressor : An extra-trees regressor.
- Notes
- -----
- The default values for the parameters controlling the size of the trees
- (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
- unpruned trees which can potentially be very large on some data sets. To
- reduce memory consumption, the complexity and size of the trees should be
- controlled by setting those parameter values.
- References
- ----------
- .. [1] P. Geurts, D. Ernst., and L. Wehenkel, "Extremely randomized trees",
- Machine Learning, 63(1), 3-42, 2006.
- Examples
- --------
- >>> from sklearn.datasets import load_diabetes
- >>> from sklearn.model_selection import train_test_split
- >>> from sklearn.ensemble import BaggingRegressor
- >>> from sklearn.tree import ExtraTreeRegressor
- >>> X, y = load_diabetes(return_X_y=True)
- >>> X_train, X_test, y_train, y_test = train_test_split(
- ... X, y, random_state=0)
- >>> extra_tree = ExtraTreeRegressor(random_state=0)
- >>> reg = BaggingRegressor(extra_tree, random_state=0).fit(
- ... X_train, y_train)
- >>> reg.score(X_test, y_test)
- 0.33...
- """
- def __init__(
- self,
- *,
- criterion="squared_error",
- splitter="random",
- max_depth=None,
- min_samples_split=2,
- min_samples_leaf=1,
- min_weight_fraction_leaf=0.0,
- max_features=1.0,
- random_state=None,
- min_impurity_decrease=0.0,
- max_leaf_nodes=None,
- ccp_alpha=0.0,
- ):
- super().__init__(
- criterion=criterion,
- splitter=splitter,
- max_depth=max_depth,
- min_samples_split=min_samples_split,
- min_samples_leaf=min_samples_leaf,
- min_weight_fraction_leaf=min_weight_fraction_leaf,
- max_features=max_features,
- max_leaf_nodes=max_leaf_nodes,
- min_impurity_decrease=min_impurity_decrease,
- random_state=random_state,
- ccp_alpha=ccp_alpha,
- )
|