| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984198519861987198819891990199119921993199419951996199719981999200020012002200320042005200620072008200920102011201220132014201520162017201820192020202120222023202420252026202720282029203020312032203320342035203620372038203920402041204220432044204520462047204820492050205120522053205420552056205720582059206020612062206320642065206620672068206920702071207220732074207520762077207820792080208120822083208420852086208720882089209020912092209320942095209620972098209921002101210221032104210521062107210821092110211121122113211421152116211721182119212021212122212321242125212621272128212921302131213221332134213521362137213821392140214121422143214421452146214721482149215021512152215321542155215621572158215921602161216221632164216521662167216821692170217121722173217421752176217721782179218021812182218321842185218621872188218921902191219221932194219521962197219821992200220122022203220422052206220722082209221022112212221322142215221622172218221922202221222222232224222522262227222822292230223122322233223422352236223722382239224022412242224322442245224622472248224922502251225222532254225522562257225822592260226122622263226422652266226722682269227022712272227322742275227622772278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641 |
- """
- Testing for the tree module (sklearn.tree).
- """
- import copy
- import copyreg
- import io
- import pickle
- import struct
- from itertools import chain, product
- import joblib
- import numpy as np
- import pytest
- from joblib.numpy_pickle import NumpyPickler
- from numpy.testing import assert_allclose
- from scipy.sparse import coo_matrix, csc_matrix, csr_matrix
- from sklearn import datasets, tree
- from sklearn.dummy import DummyRegressor
- from sklearn.exceptions import NotFittedError
- from sklearn.metrics import accuracy_score, mean_poisson_deviance, mean_squared_error
- from sklearn.model_selection import train_test_split
- from sklearn.random_projection import _sparse_random_matrix
- from sklearn.tree import (
- DecisionTreeClassifier,
- DecisionTreeRegressor,
- ExtraTreeClassifier,
- ExtraTreeRegressor,
- )
- from sklearn.tree._classes import (
- CRITERIA_CLF,
- CRITERIA_REG,
- DENSE_SPLITTERS,
- SPARSE_SPLITTERS,
- )
- from sklearn.tree._tree import (
- NODE_DTYPE,
- TREE_LEAF,
- TREE_UNDEFINED,
- _check_n_classes,
- _check_node_ndarray,
- _check_value_ndarray,
- )
- from sklearn.tree._tree import Tree as CythonTree
- from sklearn.utils import _IS_32BIT, compute_sample_weight
- from sklearn.utils._testing import (
- assert_almost_equal,
- assert_array_almost_equal,
- assert_array_equal,
- create_memmap_backed_data,
- ignore_warnings,
- skip_if_32bit,
- )
- from sklearn.utils.estimator_checks import check_sample_weights_invariance
- from sklearn.utils.validation import check_random_state
- CLF_CRITERIONS = ("gini", "log_loss")
- REG_CRITERIONS = ("squared_error", "absolute_error", "friedman_mse", "poisson")
- CLF_TREES = {
- "DecisionTreeClassifier": DecisionTreeClassifier,
- "ExtraTreeClassifier": ExtraTreeClassifier,
- }
- REG_TREES = {
- "DecisionTreeRegressor": DecisionTreeRegressor,
- "ExtraTreeRegressor": ExtraTreeRegressor,
- }
- ALL_TREES: dict = dict()
- ALL_TREES.update(CLF_TREES)
- ALL_TREES.update(REG_TREES)
- SPARSE_TREES = [
- "DecisionTreeClassifier",
- "DecisionTreeRegressor",
- "ExtraTreeClassifier",
- "ExtraTreeRegressor",
- ]
- X_small = np.array(
- [
- [0, 0, 4, 0, 0, 0, 1, -14, 0, -4, 0, 0, 0, 0],
- [0, 0, 5, 3, 0, -4, 0, 0, 1, -5, 0.2, 0, 4, 1],
- [-1, -1, 0, 0, -4.5, 0, 0, 2.1, 1, 0, 0, -4.5, 0, 1],
- [-1, -1, 0, -1.2, 0, 0, 0, 0, 0, 0, 0.2, 0, 0, 1],
- [-1, -1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 1],
- [-1, -2, 0, 4, -3, 10, 4, 0, -3.2, 0, 4, 3, -4, 1],
- [2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -3, 1],
- [2.11, 0, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1],
- [2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0, 0, -2, 1],
- [2.11, 8, -6, -0.5, 0, 11, 0, 0, -3.2, 6, 0.5, 0, -1, 0],
- [2, 8, 5, 1, 0.5, -4, 10, 0, 1, -5, 3, 0, 2, 0],
- [2, 0, 1, 1, 1, -1, 1, 0, 0, -2, 3, 0, 1, 0],
- [2, 0, 1, 2, 3, -1, 10, 2, 0, -1, 1, 2, 2, 0],
- [1, 1, 0, 2, 2, -1, 1, 2, 0, -5, 1, 2, 3, 0],
- [3, 1, 0, 3, 0, -4, 10, 0, 1, -5, 3, 0, 3, 1],
- [2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 0.5, 0, -3, 1],
- [2.11, 8, -6, -0.5, 0, 1, 0, 0, -3.2, 6, 1.5, 1, -1, -1],
- [2.11, 8, -6, -0.5, 0, 10, 0, 0, -3.2, 6, 0.5, 0, -1, -1],
- [2, 0, 5, 1, 0.5, -2, 10, 0, 1, -5, 3, 1, 0, -1],
- [2, 0, 1, 1, 1, -2, 1, 0, 0, -2, 0, 0, 0, 1],
- [2, 1, 1, 1, 2, -1, 10, 2, 0, -1, 0, 2, 1, 1],
- [1, 1, 0, 0, 1, -3, 1, 2, 0, -5, 1, 2, 1, 1],
- [3, 1, 0, 1, 0, -4, 1, 0, 1, -2, 0, 0, 1, 0],
- ]
- )
- y_small = [1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]
- y_small_reg = [
- 1.0,
- 2.1,
- 1.2,
- 0.05,
- 10,
- 2.4,
- 3.1,
- 1.01,
- 0.01,
- 2.98,
- 3.1,
- 1.1,
- 0.0,
- 1.2,
- 2,
- 11,
- 0,
- 0,
- 4.5,
- 0.201,
- 1.06,
- 0.9,
- 0,
- ]
- # toy sample
- X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
- y = [-1, -1, -1, 1, 1, 1]
- T = [[-1, -1], [2, 2], [3, 2]]
- true_result = [-1, 1, 1]
- # also load the iris dataset
- # and randomly permute it
- iris = datasets.load_iris()
- rng = np.random.RandomState(1)
- perm = rng.permutation(iris.target.size)
- iris.data = iris.data[perm]
- iris.target = iris.target[perm]
- # also load the diabetes dataset
- # and randomly permute it
- diabetes = datasets.load_diabetes()
- perm = rng.permutation(diabetes.target.size)
- diabetes.data = diabetes.data[perm]
- diabetes.target = diabetes.target[perm]
- digits = datasets.load_digits()
- perm = rng.permutation(digits.target.size)
- digits.data = digits.data[perm]
- digits.target = digits.target[perm]
- random_state = check_random_state(0)
- X_multilabel, y_multilabel = datasets.make_multilabel_classification(
- random_state=0, n_samples=30, n_features=10
- )
- # NB: despite their names X_sparse_* are numpy arrays (and not sparse matrices)
- X_sparse_pos = random_state.uniform(size=(20, 5))
- X_sparse_pos[X_sparse_pos <= 0.8] = 0.0
- y_random = random_state.randint(0, 4, size=(20,))
- X_sparse_mix = _sparse_random_matrix(20, 10, density=0.25, random_state=0).toarray()
- DATASETS = {
- "iris": {"X": iris.data, "y": iris.target},
- "diabetes": {"X": diabetes.data, "y": diabetes.target},
- "digits": {"X": digits.data, "y": digits.target},
- "toy": {"X": X, "y": y},
- "clf_small": {"X": X_small, "y": y_small},
- "reg_small": {"X": X_small, "y": y_small_reg},
- "multilabel": {"X": X_multilabel, "y": y_multilabel},
- "sparse-pos": {"X": X_sparse_pos, "y": y_random},
- "sparse-neg": {"X": -X_sparse_pos, "y": y_random},
- "sparse-mix": {"X": X_sparse_mix, "y": y_random},
- "zeros": {"X": np.zeros((20, 3)), "y": y_random},
- }
- for name in DATASETS:
- DATASETS[name]["X_sparse"] = csc_matrix(DATASETS[name]["X"])
- def assert_tree_equal(d, s, message):
- assert (
- s.node_count == d.node_count
- ), "{0}: inequal number of node ({1} != {2})".format(
- message, s.node_count, d.node_count
- )
- assert_array_equal(
- d.children_right, s.children_right, message + ": inequal children_right"
- )
- assert_array_equal(
- d.children_left, s.children_left, message + ": inequal children_left"
- )
- external = d.children_right == TREE_LEAF
- internal = np.logical_not(external)
- assert_array_equal(
- d.feature[internal], s.feature[internal], message + ": inequal features"
- )
- assert_array_equal(
- d.threshold[internal], s.threshold[internal], message + ": inequal threshold"
- )
- assert_array_equal(
- d.n_node_samples.sum(),
- s.n_node_samples.sum(),
- message + ": inequal sum(n_node_samples)",
- )
- assert_array_equal(
- d.n_node_samples, s.n_node_samples, message + ": inequal n_node_samples"
- )
- assert_almost_equal(d.impurity, s.impurity, err_msg=message + ": inequal impurity")
- assert_array_almost_equal(
- d.value[external], s.value[external], err_msg=message + ": inequal value"
- )
- def test_classification_toy():
- # Check classification on a toy dataset.
- for name, Tree in CLF_TREES.items():
- clf = Tree(random_state=0)
- clf.fit(X, y)
- assert_array_equal(clf.predict(T), true_result, "Failed with {0}".format(name))
- clf = Tree(max_features=1, random_state=1)
- clf.fit(X, y)
- assert_array_equal(clf.predict(T), true_result, "Failed with {0}".format(name))
- def test_weighted_classification_toy():
- # Check classification on a weighted toy dataset.
- for name, Tree in CLF_TREES.items():
- clf = Tree(random_state=0)
- clf.fit(X, y, sample_weight=np.ones(len(X)))
- assert_array_equal(clf.predict(T), true_result, "Failed with {0}".format(name))
- clf.fit(X, y, sample_weight=np.full(len(X), 0.5))
- assert_array_equal(clf.predict(T), true_result, "Failed with {0}".format(name))
- @pytest.mark.parametrize("Tree", REG_TREES.values())
- @pytest.mark.parametrize("criterion", REG_CRITERIONS)
- def test_regression_toy(Tree, criterion):
- # Check regression on a toy dataset.
- if criterion == "poisson":
- # make target positive while not touching the original y and
- # true_result
- a = np.abs(np.min(y)) + 1
- y_train = np.array(y) + a
- y_test = np.array(true_result) + a
- else:
- y_train = y
- y_test = true_result
- reg = Tree(criterion=criterion, random_state=1)
- reg.fit(X, y_train)
- assert_allclose(reg.predict(T), y_test)
- clf = Tree(criterion=criterion, max_features=1, random_state=1)
- clf.fit(X, y_train)
- assert_allclose(reg.predict(T), y_test)
- def test_xor():
- # Check on a XOR problem
- y = np.zeros((10, 10))
- y[:5, :5] = 1
- y[5:, 5:] = 1
- gridx, gridy = np.indices(y.shape)
- X = np.vstack([gridx.ravel(), gridy.ravel()]).T
- y = y.ravel()
- for name, Tree in CLF_TREES.items():
- clf = Tree(random_state=0)
- clf.fit(X, y)
- assert clf.score(X, y) == 1.0, "Failed with {0}".format(name)
- clf = Tree(random_state=0, max_features=1)
- clf.fit(X, y)
- assert clf.score(X, y) == 1.0, "Failed with {0}".format(name)
- def test_iris():
- # Check consistency on dataset iris.
- for (name, Tree), criterion in product(CLF_TREES.items(), CLF_CRITERIONS):
- clf = Tree(criterion=criterion, random_state=0)
- clf.fit(iris.data, iris.target)
- score = accuracy_score(clf.predict(iris.data), iris.target)
- assert score > 0.9, "Failed with {0}, criterion = {1} and score = {2}".format(
- name, criterion, score
- )
- clf = Tree(criterion=criterion, max_features=2, random_state=0)
- clf.fit(iris.data, iris.target)
- score = accuracy_score(clf.predict(iris.data), iris.target)
- assert score > 0.5, "Failed with {0}, criterion = {1} and score = {2}".format(
- name, criterion, score
- )
- @pytest.mark.parametrize("name, Tree", REG_TREES.items())
- @pytest.mark.parametrize("criterion", REG_CRITERIONS)
- def test_diabetes_overfit(name, Tree, criterion):
- # check consistency of overfitted trees on the diabetes dataset
- # since the trees will overfit, we expect an MSE of 0
- reg = Tree(criterion=criterion, random_state=0)
- reg.fit(diabetes.data, diabetes.target)
- score = mean_squared_error(diabetes.target, reg.predict(diabetes.data))
- assert score == pytest.approx(
- 0
- ), f"Failed with {name}, criterion = {criterion} and score = {score}"
- @skip_if_32bit
- @pytest.mark.parametrize("name, Tree", REG_TREES.items())
- @pytest.mark.parametrize(
- "criterion, max_depth, metric, max_loss",
- [
- ("squared_error", 15, mean_squared_error, 60),
- ("absolute_error", 20, mean_squared_error, 60),
- ("friedman_mse", 15, mean_squared_error, 60),
- ("poisson", 15, mean_poisson_deviance, 30),
- ],
- )
- def test_diabetes_underfit(name, Tree, criterion, max_depth, metric, max_loss):
- # check consistency of trees when the depth and the number of features are
- # limited
- reg = Tree(criterion=criterion, max_depth=max_depth, max_features=6, random_state=0)
- reg.fit(diabetes.data, diabetes.target)
- loss = metric(diabetes.target, reg.predict(diabetes.data))
- assert 0 < loss < max_loss
- def test_probability():
- # Predict probabilities using DecisionTreeClassifier.
- for name, Tree in CLF_TREES.items():
- clf = Tree(max_depth=1, max_features=1, random_state=42)
- clf.fit(iris.data, iris.target)
- prob_predict = clf.predict_proba(iris.data)
- assert_array_almost_equal(
- np.sum(prob_predict, 1),
- np.ones(iris.data.shape[0]),
- err_msg="Failed with {0}".format(name),
- )
- assert_array_equal(
- np.argmax(prob_predict, 1),
- clf.predict(iris.data),
- err_msg="Failed with {0}".format(name),
- )
- assert_almost_equal(
- clf.predict_proba(iris.data),
- np.exp(clf.predict_log_proba(iris.data)),
- 8,
- err_msg="Failed with {0}".format(name),
- )
- def test_arrayrepr():
- # Check the array representation.
- # Check resize
- X = np.arange(10000)[:, np.newaxis]
- y = np.arange(10000)
- for name, Tree in REG_TREES.items():
- reg = Tree(max_depth=None, random_state=0)
- reg.fit(X, y)
- def test_pure_set():
- # Check when y is pure.
- X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
- y = [1, 1, 1, 1, 1, 1]
- for name, TreeClassifier in CLF_TREES.items():
- clf = TreeClassifier(random_state=0)
- clf.fit(X, y)
- assert_array_equal(clf.predict(X), y, err_msg="Failed with {0}".format(name))
- for name, TreeRegressor in REG_TREES.items():
- reg = TreeRegressor(random_state=0)
- reg.fit(X, y)
- assert_almost_equal(reg.predict(X), y, err_msg="Failed with {0}".format(name))
- def test_numerical_stability():
- # Check numerical stability.
- X = np.array(
- [
- [152.08097839, 140.40744019, 129.75102234, 159.90493774],
- [142.50700378, 135.81935120, 117.82884979, 162.75781250],
- [127.28772736, 140.40744019, 129.75102234, 159.90493774],
- [132.37025452, 143.71923828, 138.35694885, 157.84558105],
- [103.10237122, 143.71928406, 138.35696411, 157.84559631],
- [127.71276855, 143.71923828, 138.35694885, 157.84558105],
- [120.91514587, 140.40744019, 129.75102234, 159.90493774],
- ]
- )
- y = np.array([1.0, 0.70209277, 0.53896582, 0.0, 0.90914464, 0.48026916, 0.49622521])
- with np.errstate(all="raise"):
- for name, Tree in REG_TREES.items():
- reg = Tree(random_state=0)
- reg.fit(X, y)
- reg.fit(X, -y)
- reg.fit(-X, y)
- reg.fit(-X, -y)
- def test_importances():
- # Check variable importances.
- X, y = datasets.make_classification(
- n_samples=5000,
- n_features=10,
- n_informative=3,
- n_redundant=0,
- n_repeated=0,
- shuffle=False,
- random_state=0,
- )
- for name, Tree in CLF_TREES.items():
- clf = Tree(random_state=0)
- clf.fit(X, y)
- importances = clf.feature_importances_
- n_important = np.sum(importances > 0.1)
- assert importances.shape[0] == 10, "Failed with {0}".format(name)
- assert n_important == 3, "Failed with {0}".format(name)
- # Check on iris that importances are the same for all builders
- clf = DecisionTreeClassifier(random_state=0)
- clf.fit(iris.data, iris.target)
- clf2 = DecisionTreeClassifier(random_state=0, max_leaf_nodes=len(iris.data))
- clf2.fit(iris.data, iris.target)
- assert_array_equal(clf.feature_importances_, clf2.feature_importances_)
- def test_importances_raises():
- # Check if variable importance before fit raises ValueError.
- clf = DecisionTreeClassifier()
- with pytest.raises(ValueError):
- getattr(clf, "feature_importances_")
- def test_importances_gini_equal_squared_error():
- # Check that gini is equivalent to squared_error for binary output variable
- X, y = datasets.make_classification(
- n_samples=2000,
- n_features=10,
- n_informative=3,
- n_redundant=0,
- n_repeated=0,
- shuffle=False,
- random_state=0,
- )
- # The gini index and the mean square error (variance) might differ due
- # to numerical instability. Since those instabilities mainly occurs at
- # high tree depth, we restrict this maximal depth.
- clf = DecisionTreeClassifier(criterion="gini", max_depth=5, random_state=0).fit(
- X, y
- )
- reg = DecisionTreeRegressor(
- criterion="squared_error", max_depth=5, random_state=0
- ).fit(X, y)
- assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
- assert_array_equal(clf.tree_.feature, reg.tree_.feature)
- assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
- assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
- assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples)
- def test_max_features():
- # Check max_features.
- for name, TreeEstimator in ALL_TREES.items():
- est = TreeEstimator(max_features="sqrt")
- est.fit(iris.data, iris.target)
- assert est.max_features_ == int(np.sqrt(iris.data.shape[1]))
- est = TreeEstimator(max_features="log2")
- est.fit(iris.data, iris.target)
- assert est.max_features_ == int(np.log2(iris.data.shape[1]))
- est = TreeEstimator(max_features=1)
- est.fit(iris.data, iris.target)
- assert est.max_features_ == 1
- est = TreeEstimator(max_features=3)
- est.fit(iris.data, iris.target)
- assert est.max_features_ == 3
- est = TreeEstimator(max_features=0.01)
- est.fit(iris.data, iris.target)
- assert est.max_features_ == 1
- est = TreeEstimator(max_features=0.5)
- est.fit(iris.data, iris.target)
- assert est.max_features_ == int(0.5 * iris.data.shape[1])
- est = TreeEstimator(max_features=1.0)
- est.fit(iris.data, iris.target)
- assert est.max_features_ == iris.data.shape[1]
- est = TreeEstimator(max_features=None)
- est.fit(iris.data, iris.target)
- assert est.max_features_ == iris.data.shape[1]
- def test_error():
- # Test that it gives proper exception on deficient input.
- for name, TreeEstimator in CLF_TREES.items():
- # predict before fit
- est = TreeEstimator()
- with pytest.raises(NotFittedError):
- est.predict_proba(X)
- est.fit(X, y)
- X2 = [[-2, -1, 1]] # wrong feature shape for sample
- with pytest.raises(ValueError):
- est.predict_proba(X2)
- # Wrong dimensions
- est = TreeEstimator()
- y2 = y[:-1]
- with pytest.raises(ValueError):
- est.fit(X, y2)
- # Test with arrays that are non-contiguous.
- Xf = np.asfortranarray(X)
- est = TreeEstimator()
- est.fit(Xf, y)
- assert_almost_equal(est.predict(T), true_result)
- # predict before fitting
- est = TreeEstimator()
- with pytest.raises(NotFittedError):
- est.predict(T)
- # predict on vector with different dims
- est.fit(X, y)
- t = np.asarray(T)
- with pytest.raises(ValueError):
- est.predict(t[:, 1:])
- # wrong sample shape
- Xt = np.array(X).T
- est = TreeEstimator()
- est.fit(np.dot(X, Xt), y)
- with pytest.raises(ValueError):
- est.predict(X)
- with pytest.raises(ValueError):
- est.apply(X)
- clf = TreeEstimator()
- clf.fit(X, y)
- with pytest.raises(ValueError):
- clf.predict(Xt)
- with pytest.raises(ValueError):
- clf.apply(Xt)
- # apply before fitting
- est = TreeEstimator()
- with pytest.raises(NotFittedError):
- est.apply(T)
- # non positive target for Poisson splitting Criterion
- est = DecisionTreeRegressor(criterion="poisson")
- with pytest.raises(ValueError, match="y is not positive.*Poisson"):
- est.fit([[0, 1, 2]], [0, 0, 0])
- with pytest.raises(ValueError, match="Some.*y are negative.*Poisson"):
- est.fit([[0, 1, 2]], [5, -0.1, 2])
- def test_min_samples_split():
- """Test min_samples_split parameter"""
- X = np.asfortranarray(iris.data, dtype=tree._tree.DTYPE)
- y = iris.target
- # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
- # by setting max_leaf_nodes
- for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()):
- TreeEstimator = ALL_TREES[name]
- # test for integer parameter
- est = TreeEstimator(
- min_samples_split=10, max_leaf_nodes=max_leaf_nodes, random_state=0
- )
- est.fit(X, y)
- # count samples on nodes, -1 means it is a leaf
- node_samples = est.tree_.n_node_samples[est.tree_.children_left != -1]
- assert np.min(node_samples) > 9, "Failed with {0}".format(name)
- # test for float parameter
- est = TreeEstimator(
- min_samples_split=0.2, max_leaf_nodes=max_leaf_nodes, random_state=0
- )
- est.fit(X, y)
- # count samples on nodes, -1 means it is a leaf
- node_samples = est.tree_.n_node_samples[est.tree_.children_left != -1]
- assert np.min(node_samples) > 9, "Failed with {0}".format(name)
- def test_min_samples_leaf():
- # Test if leaves contain more than leaf_count training examples
- X = np.asfortranarray(iris.data, dtype=tree._tree.DTYPE)
- y = iris.target
- # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
- # by setting max_leaf_nodes
- for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()):
- TreeEstimator = ALL_TREES[name]
- # test integer parameter
- est = TreeEstimator(
- min_samples_leaf=5, max_leaf_nodes=max_leaf_nodes, random_state=0
- )
- est.fit(X, y)
- out = est.tree_.apply(X)
- node_counts = np.bincount(out)
- # drop inner nodes
- leaf_count = node_counts[node_counts != 0]
- assert np.min(leaf_count) > 4, "Failed with {0}".format(name)
- # test float parameter
- est = TreeEstimator(
- min_samples_leaf=0.1, max_leaf_nodes=max_leaf_nodes, random_state=0
- )
- est.fit(X, y)
- out = est.tree_.apply(X)
- node_counts = np.bincount(out)
- # drop inner nodes
- leaf_count = node_counts[node_counts != 0]
- assert np.min(leaf_count) > 4, "Failed with {0}".format(name)
- def check_min_weight_fraction_leaf(name, datasets, sparse=False):
- """Test if leaves contain at least min_weight_fraction_leaf of the
- training set"""
- if sparse:
- X = DATASETS[datasets]["X_sparse"].astype(np.float32)
- else:
- X = DATASETS[datasets]["X"].astype(np.float32)
- y = DATASETS[datasets]["y"]
- weights = rng.rand(X.shape[0])
- total_weight = np.sum(weights)
- TreeEstimator = ALL_TREES[name]
- # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
- # by setting max_leaf_nodes
- for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 6)):
- est = TreeEstimator(
- min_weight_fraction_leaf=frac, max_leaf_nodes=max_leaf_nodes, random_state=0
- )
- est.fit(X, y, sample_weight=weights)
- if sparse:
- out = est.tree_.apply(X.tocsr())
- else:
- out = est.tree_.apply(X)
- node_weights = np.bincount(out, weights=weights)
- # drop inner nodes
- leaf_weights = node_weights[node_weights != 0]
- assert (
- np.min(leaf_weights) >= total_weight * est.min_weight_fraction_leaf
- ), "Failed with {0} min_weight_fraction_leaf={1}".format(
- name, est.min_weight_fraction_leaf
- )
- # test case with no weights passed in
- total_weight = X.shape[0]
- for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 6)):
- est = TreeEstimator(
- min_weight_fraction_leaf=frac, max_leaf_nodes=max_leaf_nodes, random_state=0
- )
- est.fit(X, y)
- if sparse:
- out = est.tree_.apply(X.tocsr())
- else:
- out = est.tree_.apply(X)
- node_weights = np.bincount(out)
- # drop inner nodes
- leaf_weights = node_weights[node_weights != 0]
- assert (
- np.min(leaf_weights) >= total_weight * est.min_weight_fraction_leaf
- ), "Failed with {0} min_weight_fraction_leaf={1}".format(
- name, est.min_weight_fraction_leaf
- )
- @pytest.mark.parametrize("name", ALL_TREES)
- def test_min_weight_fraction_leaf_on_dense_input(name):
- check_min_weight_fraction_leaf(name, "iris")
- @pytest.mark.parametrize("name", SPARSE_TREES)
- def test_min_weight_fraction_leaf_on_sparse_input(name):
- check_min_weight_fraction_leaf(name, "multilabel", True)
- def check_min_weight_fraction_leaf_with_min_samples_leaf(name, datasets, sparse=False):
- """Test the interaction between min_weight_fraction_leaf and
- min_samples_leaf when sample_weights is not provided in fit."""
- if sparse:
- X = DATASETS[datasets]["X_sparse"].astype(np.float32)
- else:
- X = DATASETS[datasets]["X"].astype(np.float32)
- y = DATASETS[datasets]["y"]
- total_weight = X.shape[0]
- TreeEstimator = ALL_TREES[name]
- for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 3)):
- # test integer min_samples_leaf
- est = TreeEstimator(
- min_weight_fraction_leaf=frac,
- max_leaf_nodes=max_leaf_nodes,
- min_samples_leaf=5,
- random_state=0,
- )
- est.fit(X, y)
- if sparse:
- out = est.tree_.apply(X.tocsr())
- else:
- out = est.tree_.apply(X)
- node_weights = np.bincount(out)
- # drop inner nodes
- leaf_weights = node_weights[node_weights != 0]
- assert np.min(leaf_weights) >= max(
- (total_weight * est.min_weight_fraction_leaf), 5
- ), "Failed with {0} min_weight_fraction_leaf={1}, min_samples_leaf={2}".format(
- name, est.min_weight_fraction_leaf, est.min_samples_leaf
- )
- for max_leaf_nodes, frac in product((None, 1000), np.linspace(0, 0.5, 3)):
- # test float min_samples_leaf
- est = TreeEstimator(
- min_weight_fraction_leaf=frac,
- max_leaf_nodes=max_leaf_nodes,
- min_samples_leaf=0.1,
- random_state=0,
- )
- est.fit(X, y)
- if sparse:
- out = est.tree_.apply(X.tocsr())
- else:
- out = est.tree_.apply(X)
- node_weights = np.bincount(out)
- # drop inner nodes
- leaf_weights = node_weights[node_weights != 0]
- assert np.min(leaf_weights) >= max(
- (total_weight * est.min_weight_fraction_leaf),
- (total_weight * est.min_samples_leaf),
- ), "Failed with {0} min_weight_fraction_leaf={1}, min_samples_leaf={2}".format(
- name, est.min_weight_fraction_leaf, est.min_samples_leaf
- )
- @pytest.mark.parametrize("name", ALL_TREES)
- def test_min_weight_fraction_leaf_with_min_samples_leaf_on_dense_input(name):
- check_min_weight_fraction_leaf_with_min_samples_leaf(name, "iris")
- @pytest.mark.parametrize("name", SPARSE_TREES)
- def test_min_weight_fraction_leaf_with_min_samples_leaf_on_sparse_input(name):
- check_min_weight_fraction_leaf_with_min_samples_leaf(name, "multilabel", True)
- def test_min_impurity_decrease(global_random_seed):
- # test if min_impurity_decrease ensure that a split is made only if
- # if the impurity decrease is at least that value
- X, y = datasets.make_classification(n_samples=100, random_state=global_random_seed)
- # test both DepthFirstTreeBuilder and BestFirstTreeBuilder
- # by setting max_leaf_nodes
- for max_leaf_nodes, name in product((None, 1000), ALL_TREES.keys()):
- TreeEstimator = ALL_TREES[name]
- # Check default value of min_impurity_decrease, 1e-7
- est1 = TreeEstimator(max_leaf_nodes=max_leaf_nodes, random_state=0)
- # Check with explicit value of 0.05
- est2 = TreeEstimator(
- max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.05, random_state=0
- )
- # Check with a much lower value of 0.0001
- est3 = TreeEstimator(
- max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.0001, random_state=0
- )
- # Check with a much lower value of 0.1
- est4 = TreeEstimator(
- max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=0.1, random_state=0
- )
- for est, expected_decrease in (
- (est1, 1e-7),
- (est2, 0.05),
- (est3, 0.0001),
- (est4, 0.1),
- ):
- assert (
- est.min_impurity_decrease <= expected_decrease
- ), "Failed, min_impurity_decrease = {0} > {1}".format(
- est.min_impurity_decrease, expected_decrease
- )
- est.fit(X, y)
- for node in range(est.tree_.node_count):
- # If current node is a not leaf node, check if the split was
- # justified w.r.t the min_impurity_decrease
- if est.tree_.children_left[node] != TREE_LEAF:
- imp_parent = est.tree_.impurity[node]
- wtd_n_node = est.tree_.weighted_n_node_samples[node]
- left = est.tree_.children_left[node]
- wtd_n_left = est.tree_.weighted_n_node_samples[left]
- imp_left = est.tree_.impurity[left]
- wtd_imp_left = wtd_n_left * imp_left
- right = est.tree_.children_right[node]
- wtd_n_right = est.tree_.weighted_n_node_samples[right]
- imp_right = est.tree_.impurity[right]
- wtd_imp_right = wtd_n_right * imp_right
- wtd_avg_left_right_imp = wtd_imp_right + wtd_imp_left
- wtd_avg_left_right_imp /= wtd_n_node
- fractional_node_weight = (
- est.tree_.weighted_n_node_samples[node] / X.shape[0]
- )
- actual_decrease = fractional_node_weight * (
- imp_parent - wtd_avg_left_right_imp
- )
- assert (
- actual_decrease >= expected_decrease
- ), "Failed with {0} expected min_impurity_decrease={1}".format(
- actual_decrease, expected_decrease
- )
- def test_pickle():
- """Test pickling preserves Tree properties and performance."""
- for name, TreeEstimator in ALL_TREES.items():
- if "Classifier" in name:
- X, y = iris.data, iris.target
- else:
- X, y = diabetes.data, diabetes.target
- est = TreeEstimator(random_state=0)
- est.fit(X, y)
- score = est.score(X, y)
- # test that all class properties are maintained
- attributes = [
- "max_depth",
- "node_count",
- "capacity",
- "n_classes",
- "children_left",
- "children_right",
- "n_leaves",
- "feature",
- "threshold",
- "impurity",
- "n_node_samples",
- "weighted_n_node_samples",
- "value",
- ]
- fitted_attribute = {
- attribute: getattr(est.tree_, attribute) for attribute in attributes
- }
- serialized_object = pickle.dumps(est)
- est2 = pickle.loads(serialized_object)
- assert type(est2) == est.__class__
- score2 = est2.score(X, y)
- assert (
- score == score2
- ), "Failed to generate same score after pickling with {0}".format(name)
- for attribute in fitted_attribute:
- assert_array_equal(
- getattr(est2.tree_, attribute),
- fitted_attribute[attribute],
- err_msg=(
- f"Failed to generate same attribute {attribute} after pickling with"
- f" {name}"
- ),
- )
- def test_multioutput():
- # Check estimators on multi-output problems.
- X = [
- [-2, -1],
- [-1, -1],
- [-1, -2],
- [1, 1],
- [1, 2],
- [2, 1],
- [-2, 1],
- [-1, 1],
- [-1, 2],
- [2, -1],
- [1, -1],
- [1, -2],
- ]
- y = [
- [-1, 0],
- [-1, 0],
- [-1, 0],
- [1, 1],
- [1, 1],
- [1, 1],
- [-1, 2],
- [-1, 2],
- [-1, 2],
- [1, 3],
- [1, 3],
- [1, 3],
- ]
- T = [[-1, -1], [1, 1], [-1, 1], [1, -1]]
- y_true = [[-1, 0], [1, 1], [-1, 2], [1, 3]]
- # toy classification problem
- for name, TreeClassifier in CLF_TREES.items():
- clf = TreeClassifier(random_state=0)
- y_hat = clf.fit(X, y).predict(T)
- assert_array_equal(y_hat, y_true)
- assert y_hat.shape == (4, 2)
- proba = clf.predict_proba(T)
- assert len(proba) == 2
- assert proba[0].shape == (4, 2)
- assert proba[1].shape == (4, 4)
- log_proba = clf.predict_log_proba(T)
- assert len(log_proba) == 2
- assert log_proba[0].shape == (4, 2)
- assert log_proba[1].shape == (4, 4)
- # toy regression problem
- for name, TreeRegressor in REG_TREES.items():
- reg = TreeRegressor(random_state=0)
- y_hat = reg.fit(X, y).predict(T)
- assert_almost_equal(y_hat, y_true)
- assert y_hat.shape == (4, 2)
- def test_classes_shape():
- # Test that n_classes_ and classes_ have proper shape.
- for name, TreeClassifier in CLF_TREES.items():
- # Classification, single output
- clf = TreeClassifier(random_state=0)
- clf.fit(X, y)
- assert clf.n_classes_ == 2
- assert_array_equal(clf.classes_, [-1, 1])
- # Classification, multi-output
- _y = np.vstack((y, np.array(y) * 2)).T
- clf = TreeClassifier(random_state=0)
- clf.fit(X, _y)
- assert len(clf.n_classes_) == 2
- assert len(clf.classes_) == 2
- assert_array_equal(clf.n_classes_, [2, 2])
- assert_array_equal(clf.classes_, [[-1, 1], [-2, 2]])
- def test_unbalanced_iris():
- # Check class rebalancing.
- unbalanced_X = iris.data[:125]
- unbalanced_y = iris.target[:125]
- sample_weight = compute_sample_weight("balanced", unbalanced_y)
- for name, TreeClassifier in CLF_TREES.items():
- clf = TreeClassifier(random_state=0)
- clf.fit(unbalanced_X, unbalanced_y, sample_weight=sample_weight)
- assert_almost_equal(clf.predict(unbalanced_X), unbalanced_y)
- def test_memory_layout():
- # Check that it works no matter the memory layout
- for (name, TreeEstimator), dtype in product(
- ALL_TREES.items(), [np.float64, np.float32]
- ):
- est = TreeEstimator(random_state=0)
- # Nothing
- X = np.asarray(iris.data, dtype=dtype)
- y = iris.target
- assert_array_equal(est.fit(X, y).predict(X), y)
- # C-order
- X = np.asarray(iris.data, order="C", dtype=dtype)
- y = iris.target
- assert_array_equal(est.fit(X, y).predict(X), y)
- # F-order
- X = np.asarray(iris.data, order="F", dtype=dtype)
- y = iris.target
- assert_array_equal(est.fit(X, y).predict(X), y)
- # Contiguous
- X = np.ascontiguousarray(iris.data, dtype=dtype)
- y = iris.target
- assert_array_equal(est.fit(X, y).predict(X), y)
- # csr matrix
- X = csr_matrix(iris.data, dtype=dtype)
- y = iris.target
- assert_array_equal(est.fit(X, y).predict(X), y)
- # csc_matrix
- X = csc_matrix(iris.data, dtype=dtype)
- y = iris.target
- assert_array_equal(est.fit(X, y).predict(X), y)
- # Strided
- X = np.asarray(iris.data[::3], dtype=dtype)
- y = iris.target[::3]
- assert_array_equal(est.fit(X, y).predict(X), y)
- def test_sample_weight():
- # Check sample weighting.
- # Test that zero-weighted samples are not taken into account
- X = np.arange(100)[:, np.newaxis]
- y = np.ones(100)
- y[:50] = 0.0
- sample_weight = np.ones(100)
- sample_weight[y == 0] = 0.0
- clf = DecisionTreeClassifier(random_state=0)
- clf.fit(X, y, sample_weight=sample_weight)
- assert_array_equal(clf.predict(X), np.ones(100))
- # Test that low weighted samples are not taken into account at low depth
- X = np.arange(200)[:, np.newaxis]
- y = np.zeros(200)
- y[50:100] = 1
- y[100:200] = 2
- X[100:200, 0] = 200
- sample_weight = np.ones(200)
- sample_weight[y == 2] = 0.51 # Samples of class '2' are still weightier
- clf = DecisionTreeClassifier(max_depth=1, random_state=0)
- clf.fit(X, y, sample_weight=sample_weight)
- assert clf.tree_.threshold[0] == 149.5
- sample_weight[y == 2] = 0.5 # Samples of class '2' are no longer weightier
- clf = DecisionTreeClassifier(max_depth=1, random_state=0)
- clf.fit(X, y, sample_weight=sample_weight)
- assert clf.tree_.threshold[0] == 49.5 # Threshold should have moved
- # Test that sample weighting is the same as having duplicates
- X = iris.data
- y = iris.target
- duplicates = rng.randint(0, X.shape[0], 100)
- clf = DecisionTreeClassifier(random_state=1)
- clf.fit(X[duplicates], y[duplicates])
- sample_weight = np.bincount(duplicates, minlength=X.shape[0])
- clf2 = DecisionTreeClassifier(random_state=1)
- clf2.fit(X, y, sample_weight=sample_weight)
- internal = clf.tree_.children_left != tree._tree.TREE_LEAF
- assert_array_almost_equal(
- clf.tree_.threshold[internal], clf2.tree_.threshold[internal]
- )
- def test_sample_weight_invalid():
- # Check sample weighting raises errors.
- X = np.arange(100)[:, np.newaxis]
- y = np.ones(100)
- y[:50] = 0.0
- clf = DecisionTreeClassifier(random_state=0)
- sample_weight = np.random.rand(100, 1)
- with pytest.raises(ValueError):
- clf.fit(X, y, sample_weight=sample_weight)
- sample_weight = np.array(0)
- expected_err = r"Singleton.* cannot be considered a valid collection"
- with pytest.raises(TypeError, match=expected_err):
- clf.fit(X, y, sample_weight=sample_weight)
- def check_class_weights(name):
- """Check class_weights resemble sample_weights behavior."""
- TreeClassifier = CLF_TREES[name]
- # Iris is balanced, so no effect expected for using 'balanced' weights
- clf1 = TreeClassifier(random_state=0)
- clf1.fit(iris.data, iris.target)
- clf2 = TreeClassifier(class_weight="balanced", random_state=0)
- clf2.fit(iris.data, iris.target)
- assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
- # Make a multi-output problem with three copies of Iris
- iris_multi = np.vstack((iris.target, iris.target, iris.target)).T
- # Create user-defined weights that should balance over the outputs
- clf3 = TreeClassifier(
- class_weight=[
- {0: 2.0, 1: 2.0, 2: 1.0},
- {0: 2.0, 1: 1.0, 2: 2.0},
- {0: 1.0, 1: 2.0, 2: 2.0},
- ],
- random_state=0,
- )
- clf3.fit(iris.data, iris_multi)
- assert_almost_equal(clf2.feature_importances_, clf3.feature_importances_)
- # Check against multi-output "auto" which should also have no effect
- clf4 = TreeClassifier(class_weight="balanced", random_state=0)
- clf4.fit(iris.data, iris_multi)
- assert_almost_equal(clf3.feature_importances_, clf4.feature_importances_)
- # Inflate importance of class 1, check against user-defined weights
- sample_weight = np.ones(iris.target.shape)
- sample_weight[iris.target == 1] *= 100
- class_weight = {0: 1.0, 1: 100.0, 2: 1.0}
- clf1 = TreeClassifier(random_state=0)
- clf1.fit(iris.data, iris.target, sample_weight)
- clf2 = TreeClassifier(class_weight=class_weight, random_state=0)
- clf2.fit(iris.data, iris.target)
- assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
- # Check that sample_weight and class_weight are multiplicative
- clf1 = TreeClassifier(random_state=0)
- clf1.fit(iris.data, iris.target, sample_weight**2)
- clf2 = TreeClassifier(class_weight=class_weight, random_state=0)
- clf2.fit(iris.data, iris.target, sample_weight)
- assert_almost_equal(clf1.feature_importances_, clf2.feature_importances_)
- @pytest.mark.parametrize("name", CLF_TREES)
- def test_class_weights(name):
- check_class_weights(name)
- def check_class_weight_errors(name):
- # Test if class_weight raises errors and warnings when expected.
- TreeClassifier = CLF_TREES[name]
- _y = np.vstack((y, np.array(y) * 2)).T
- # Incorrect length list for multi-output
- clf = TreeClassifier(class_weight=[{-1: 0.5, 1: 1.0}], random_state=0)
- err_msg = "number of elements in class_weight should match number of outputs."
- with pytest.raises(ValueError, match=err_msg):
- clf.fit(X, _y)
- @pytest.mark.parametrize("name", CLF_TREES)
- def test_class_weight_errors(name):
- check_class_weight_errors(name)
- def test_max_leaf_nodes():
- # Test greedy trees with max_depth + 1 leafs.
- X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
- k = 4
- for name, TreeEstimator in ALL_TREES.items():
- est = TreeEstimator(max_depth=None, max_leaf_nodes=k + 1).fit(X, y)
- assert est.get_n_leaves() == k + 1
- def test_max_leaf_nodes_max_depth():
- # Test precedence of max_leaf_nodes over max_depth.
- X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
- k = 4
- for name, TreeEstimator in ALL_TREES.items():
- est = TreeEstimator(max_depth=1, max_leaf_nodes=k).fit(X, y)
- assert est.get_depth() == 1
- def test_arrays_persist():
- # Ensure property arrays' memory stays alive when tree disappears
- # non-regression for #2726
- for attr in [
- "n_classes",
- "value",
- "children_left",
- "children_right",
- "threshold",
- "impurity",
- "feature",
- "n_node_samples",
- ]:
- value = getattr(DecisionTreeClassifier().fit([[0], [1]], [0, 1]).tree_, attr)
- # if pointing to freed memory, contents may be arbitrary
- assert -3 <= value.flat[0] < 3, "Array points to arbitrary memory"
- def test_only_constant_features():
- random_state = check_random_state(0)
- X = np.zeros((10, 20))
- y = random_state.randint(0, 2, (10,))
- for name, TreeEstimator in ALL_TREES.items():
- est = TreeEstimator(random_state=0)
- est.fit(X, y)
- assert est.tree_.max_depth == 0
- def test_behaviour_constant_feature_after_splits():
- X = np.transpose(
- np.vstack(([[0, 0, 0, 0, 0, 1, 2, 4, 5, 6, 7]], np.zeros((4, 11))))
- )
- y = [0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 3]
- for name, TreeEstimator in ALL_TREES.items():
- # do not check extra random trees
- if "ExtraTree" not in name:
- est = TreeEstimator(random_state=0, max_features=1)
- est.fit(X, y)
- assert est.tree_.max_depth == 2
- assert est.tree_.node_count == 5
- def test_with_only_one_non_constant_features():
- X = np.hstack([np.array([[1.0], [1.0], [0.0], [0.0]]), np.zeros((4, 1000))])
- y = np.array([0.0, 1.0, 0.0, 1.0])
- for name, TreeEstimator in CLF_TREES.items():
- est = TreeEstimator(random_state=0, max_features=1)
- est.fit(X, y)
- assert est.tree_.max_depth == 1
- assert_array_equal(est.predict_proba(X), np.full((4, 2), 0.5))
- for name, TreeEstimator in REG_TREES.items():
- est = TreeEstimator(random_state=0, max_features=1)
- est.fit(X, y)
- assert est.tree_.max_depth == 1
- assert_array_equal(est.predict(X), np.full((4,), 0.5))
- def test_big_input():
- # Test if the warning for too large inputs is appropriate.
- X = np.repeat(10**40.0, 4).astype(np.float64).reshape(-1, 1)
- clf = DecisionTreeClassifier()
- with pytest.raises(ValueError, match="float32"):
- clf.fit(X, [0, 1, 0, 1])
- def test_realloc():
- from sklearn.tree._utils import _realloc_test
- with pytest.raises(MemoryError):
- _realloc_test()
- def test_huge_allocations():
- n_bits = 8 * struct.calcsize("P")
- X = np.random.randn(10, 2)
- y = np.random.randint(0, 2, 10)
- # Sanity check: we cannot request more memory than the size of the address
- # space. Currently raises OverflowError.
- huge = 2 ** (n_bits + 1)
- clf = DecisionTreeClassifier(splitter="best", max_leaf_nodes=huge)
- with pytest.raises(Exception):
- clf.fit(X, y)
- # Non-regression test: MemoryError used to be dropped by Cython
- # because of missing "except *".
- huge = 2 ** (n_bits - 1) - 1
- clf = DecisionTreeClassifier(splitter="best", max_leaf_nodes=huge)
- with pytest.raises(MemoryError):
- clf.fit(X, y)
- def check_sparse_input(tree, dataset, max_depth=None):
- TreeEstimator = ALL_TREES[tree]
- X = DATASETS[dataset]["X"]
- X_sparse = DATASETS[dataset]["X_sparse"]
- y = DATASETS[dataset]["y"]
- # Gain testing time
- if dataset in ["digits", "diabetes"]:
- n_samples = X.shape[0] // 5
- X = X[:n_samples]
- X_sparse = X_sparse[:n_samples]
- y = y[:n_samples]
- for sparse_format in (csr_matrix, csc_matrix, coo_matrix):
- X_sparse = sparse_format(X_sparse)
- # Check the default (depth first search)
- d = TreeEstimator(random_state=0, max_depth=max_depth).fit(X, y)
- s = TreeEstimator(random_state=0, max_depth=max_depth).fit(X_sparse, y)
- assert_tree_equal(
- d.tree_,
- s.tree_,
- "{0} with dense and sparse format gave different trees".format(tree),
- )
- y_pred = d.predict(X)
- if tree in CLF_TREES:
- y_proba = d.predict_proba(X)
- y_log_proba = d.predict_log_proba(X)
- for sparse_matrix in (csr_matrix, csc_matrix, coo_matrix):
- X_sparse_test = sparse_matrix(X_sparse, dtype=np.float32)
- assert_array_almost_equal(s.predict(X_sparse_test), y_pred)
- if tree in CLF_TREES:
- assert_array_almost_equal(s.predict_proba(X_sparse_test), y_proba)
- assert_array_almost_equal(
- s.predict_log_proba(X_sparse_test), y_log_proba
- )
- @pytest.mark.parametrize("tree_type", SPARSE_TREES)
- @pytest.mark.parametrize(
- "dataset",
- (
- "clf_small",
- "toy",
- "digits",
- "multilabel",
- "sparse-pos",
- "sparse-neg",
- "sparse-mix",
- "zeros",
- ),
- )
- def test_sparse_input(tree_type, dataset):
- max_depth = 3 if dataset == "digits" else None
- check_sparse_input(tree_type, dataset, max_depth)
- @pytest.mark.parametrize("tree_type", sorted(set(SPARSE_TREES).intersection(REG_TREES)))
- @pytest.mark.parametrize("dataset", ["diabetes", "reg_small"])
- def test_sparse_input_reg_trees(tree_type, dataset):
- # Due to numerical instability of MSE and too strict test, we limit the
- # maximal depth
- check_sparse_input(tree_type, dataset, 2)
- def check_sparse_parameters(tree, dataset):
- TreeEstimator = ALL_TREES[tree]
- X = DATASETS[dataset]["X"]
- X_sparse = DATASETS[dataset]["X_sparse"]
- y = DATASETS[dataset]["y"]
- # Check max_features
- d = TreeEstimator(random_state=0, max_features=1, max_depth=2).fit(X, y)
- s = TreeEstimator(random_state=0, max_features=1, max_depth=2).fit(X_sparse, y)
- assert_tree_equal(
- d.tree_,
- s.tree_,
- "{0} with dense and sparse format gave different trees".format(tree),
- )
- assert_array_almost_equal(s.predict(X), d.predict(X))
- # Check min_samples_split
- d = TreeEstimator(random_state=0, max_features=1, min_samples_split=10).fit(X, y)
- s = TreeEstimator(random_state=0, max_features=1, min_samples_split=10).fit(
- X_sparse, y
- )
- assert_tree_equal(
- d.tree_,
- s.tree_,
- "{0} with dense and sparse format gave different trees".format(tree),
- )
- assert_array_almost_equal(s.predict(X), d.predict(X))
- # Check min_samples_leaf
- d = TreeEstimator(random_state=0, min_samples_leaf=X_sparse.shape[0] // 2).fit(X, y)
- s = TreeEstimator(random_state=0, min_samples_leaf=X_sparse.shape[0] // 2).fit(
- X_sparse, y
- )
- assert_tree_equal(
- d.tree_,
- s.tree_,
- "{0} with dense and sparse format gave different trees".format(tree),
- )
- assert_array_almost_equal(s.predict(X), d.predict(X))
- # Check best-first search
- d = TreeEstimator(random_state=0, max_leaf_nodes=3).fit(X, y)
- s = TreeEstimator(random_state=0, max_leaf_nodes=3).fit(X_sparse, y)
- assert_tree_equal(
- d.tree_,
- s.tree_,
- "{0} with dense and sparse format gave different trees".format(tree),
- )
- assert_array_almost_equal(s.predict(X), d.predict(X))
- def check_sparse_criterion(tree, dataset):
- TreeEstimator = ALL_TREES[tree]
- X = DATASETS[dataset]["X"]
- X_sparse = DATASETS[dataset]["X_sparse"]
- y = DATASETS[dataset]["y"]
- # Check various criterion
- CRITERIONS = REG_CRITERIONS if tree in REG_TREES else CLF_CRITERIONS
- for criterion in CRITERIONS:
- d = TreeEstimator(random_state=0, max_depth=3, criterion=criterion).fit(X, y)
- s = TreeEstimator(random_state=0, max_depth=3, criterion=criterion).fit(
- X_sparse, y
- )
- assert_tree_equal(
- d.tree_,
- s.tree_,
- "{0} with dense and sparse format gave different trees".format(tree),
- )
- assert_array_almost_equal(s.predict(X), d.predict(X))
- @pytest.mark.parametrize("tree_type", SPARSE_TREES)
- @pytest.mark.parametrize("dataset", ["sparse-pos", "sparse-neg", "sparse-mix", "zeros"])
- @pytest.mark.parametrize("check", [check_sparse_parameters, check_sparse_criterion])
- def test_sparse(tree_type, dataset, check):
- check(tree_type, dataset)
- def check_explicit_sparse_zeros(tree, max_depth=3, n_features=10):
- TreeEstimator = ALL_TREES[tree]
- # n_samples set n_feature to ease construction of a simultaneous
- # construction of a csr and csc matrix
- n_samples = n_features
- samples = np.arange(n_samples)
- # Generate X, y
- random_state = check_random_state(0)
- indices = []
- data = []
- offset = 0
- indptr = [offset]
- for i in range(n_features):
- n_nonzero_i = random_state.binomial(n_samples, 0.5)
- indices_i = random_state.permutation(samples)[:n_nonzero_i]
- indices.append(indices_i)
- data_i = random_state.binomial(3, 0.5, size=(n_nonzero_i,)) - 1
- data.append(data_i)
- offset += n_nonzero_i
- indptr.append(offset)
- indices = np.concatenate(indices)
- data = np.array(np.concatenate(data), dtype=np.float32)
- X_sparse = csc_matrix((data, indices, indptr), shape=(n_samples, n_features))
- X = X_sparse.toarray()
- X_sparse_test = csr_matrix((data, indices, indptr), shape=(n_samples, n_features))
- X_test = X_sparse_test.toarray()
- y = random_state.randint(0, 3, size=(n_samples,))
- # Ensure that X_sparse_test owns its data, indices and indptr array
- X_sparse_test = X_sparse_test.copy()
- # Ensure that we have explicit zeros
- assert (X_sparse.data == 0.0).sum() > 0
- assert (X_sparse_test.data == 0.0).sum() > 0
- # Perform the comparison
- d = TreeEstimator(random_state=0, max_depth=max_depth).fit(X, y)
- s = TreeEstimator(random_state=0, max_depth=max_depth).fit(X_sparse, y)
- assert_tree_equal(
- d.tree_,
- s.tree_,
- "{0} with dense and sparse format gave different trees".format(tree),
- )
- Xs = (X_test, X_sparse_test)
- for X1, X2 in product(Xs, Xs):
- assert_array_almost_equal(s.tree_.apply(X1), d.tree_.apply(X2))
- assert_array_almost_equal(s.apply(X1), d.apply(X2))
- assert_array_almost_equal(s.apply(X1), s.tree_.apply(X1))
- assert_array_almost_equal(
- s.tree_.decision_path(X1).toarray(), d.tree_.decision_path(X2).toarray()
- )
- assert_array_almost_equal(
- s.decision_path(X1).toarray(), d.decision_path(X2).toarray()
- )
- assert_array_almost_equal(
- s.decision_path(X1).toarray(), s.tree_.decision_path(X1).toarray()
- )
- assert_array_almost_equal(s.predict(X1), d.predict(X2))
- if tree in CLF_TREES:
- assert_array_almost_equal(s.predict_proba(X1), d.predict_proba(X2))
- @pytest.mark.parametrize("tree_type", SPARSE_TREES)
- def test_explicit_sparse_zeros(tree_type):
- check_explicit_sparse_zeros(tree_type)
- @ignore_warnings
- def check_raise_error_on_1d_input(name):
- TreeEstimator = ALL_TREES[name]
- X = iris.data[:, 0].ravel()
- X_2d = iris.data[:, 0].reshape((-1, 1))
- y = iris.target
- with pytest.raises(ValueError):
- TreeEstimator(random_state=0).fit(X, y)
- est = TreeEstimator(random_state=0)
- est.fit(X_2d, y)
- with pytest.raises(ValueError):
- est.predict([X])
- @pytest.mark.parametrize("name", ALL_TREES)
- def test_1d_input(name):
- with ignore_warnings():
- check_raise_error_on_1d_input(name)
- def _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight):
- est = TreeEstimator(random_state=0)
- est.fit(X, y, sample_weight=sample_weight)
- assert est.tree_.max_depth == 1
- est = TreeEstimator(random_state=0, min_weight_fraction_leaf=0.4)
- est.fit(X, y, sample_weight=sample_weight)
- assert est.tree_.max_depth == 0
- def check_min_weight_leaf_split_level(name):
- TreeEstimator = ALL_TREES[name]
- X = np.array([[0], [0], [0], [0], [1]])
- y = [0, 0, 0, 0, 1]
- sample_weight = [0.2, 0.2, 0.2, 0.2, 0.2]
- _check_min_weight_leaf_split_level(TreeEstimator, X, y, sample_weight)
- _check_min_weight_leaf_split_level(TreeEstimator, csc_matrix(X), y, sample_weight)
- @pytest.mark.parametrize("name", ALL_TREES)
- def test_min_weight_leaf_split_level(name):
- check_min_weight_leaf_split_level(name)
- def check_public_apply(name):
- X_small32 = X_small.astype(tree._tree.DTYPE, copy=False)
- est = ALL_TREES[name]()
- est.fit(X_small, y_small)
- assert_array_equal(est.apply(X_small), est.tree_.apply(X_small32))
- def check_public_apply_sparse(name):
- X_small32 = csr_matrix(X_small.astype(tree._tree.DTYPE, copy=False))
- est = ALL_TREES[name]()
- est.fit(X_small, y_small)
- assert_array_equal(est.apply(X_small), est.tree_.apply(X_small32))
- @pytest.mark.parametrize("name", ALL_TREES)
- def test_public_apply_all_trees(name):
- check_public_apply(name)
- @pytest.mark.parametrize("name", SPARSE_TREES)
- def test_public_apply_sparse_trees(name):
- check_public_apply_sparse(name)
- def test_decision_path_hardcoded():
- X = iris.data
- y = iris.target
- est = DecisionTreeClassifier(random_state=0, max_depth=1).fit(X, y)
- node_indicator = est.decision_path(X[:2]).toarray()
- assert_array_equal(node_indicator, [[1, 1, 0], [1, 0, 1]])
- def check_decision_path(name):
- X = iris.data
- y = iris.target
- n_samples = X.shape[0]
- TreeEstimator = ALL_TREES[name]
- est = TreeEstimator(random_state=0, max_depth=2)
- est.fit(X, y)
- node_indicator_csr = est.decision_path(X)
- node_indicator = node_indicator_csr.toarray()
- assert node_indicator.shape == (n_samples, est.tree_.node_count)
- # Assert that leaves index are correct
- leaves = est.apply(X)
- leave_indicator = [node_indicator[i, j] for i, j in enumerate(leaves)]
- assert_array_almost_equal(leave_indicator, np.ones(shape=n_samples))
- # Ensure only one leave node per sample
- all_leaves = est.tree_.children_left == TREE_LEAF
- assert_array_almost_equal(
- np.dot(node_indicator, all_leaves), np.ones(shape=n_samples)
- )
- # Ensure max depth is consistent with sum of indicator
- max_depth = node_indicator.sum(axis=1).max()
- assert est.tree_.max_depth <= max_depth
- @pytest.mark.parametrize("name", ALL_TREES)
- def test_decision_path(name):
- check_decision_path(name)
- def check_no_sparse_y_support(name):
- X, y = X_multilabel, csr_matrix(y_multilabel)
- TreeEstimator = ALL_TREES[name]
- with pytest.raises(TypeError):
- TreeEstimator(random_state=0).fit(X, y)
- @pytest.mark.parametrize("name", ALL_TREES)
- def test_no_sparse_y_support(name):
- # Currently we don't support sparse y
- check_no_sparse_y_support(name)
- def test_mae():
- """Check MAE criterion produces correct results on small toy dataset:
- ------------------
- | X | y | weight |
- ------------------
- | 3 | 3 | 0.1 |
- | 5 | 3 | 0.3 |
- | 8 | 4 | 1.0 |
- | 3 | 6 | 0.6 |
- | 5 | 7 | 0.3 |
- ------------------
- |sum wt:| 2.3 |
- ------------------
- Because we are dealing with sample weights, we cannot find the median by
- simply choosing/averaging the centre value(s), instead we consider the
- median where 50% of the cumulative weight is found (in a y sorted data set)
- . Therefore with regards to this test data, the cumulative weight is >= 50%
- when y = 4. Therefore:
- Median = 4
- For all the samples, we can get the total error by summing:
- Absolute(Median - y) * weight
- I.e., total error = (Absolute(4 - 3) * 0.1)
- + (Absolute(4 - 3) * 0.3)
- + (Absolute(4 - 4) * 1.0)
- + (Absolute(4 - 6) * 0.6)
- + (Absolute(4 - 7) * 0.3)
- = 2.5
- Impurity = Total error / total weight
- = 2.5 / 2.3
- = 1.08695652173913
- ------------------
- From this root node, the next best split is between X values of 3 and 5.
- Thus, we have left and right child nodes:
- LEFT RIGHT
- ------------------ ------------------
- | X | y | weight | | X | y | weight |
- ------------------ ------------------
- | 3 | 3 | 0.1 | | 5 | 3 | 0.3 |
- | 3 | 6 | 0.6 | | 8 | 4 | 1.0 |
- ------------------ | 5 | 7 | 0.3 |
- |sum wt:| 0.7 | ------------------
- ------------------ |sum wt:| 1.6 |
- ------------------
- Impurity is found in the same way:
- Left node Median = 6
- Total error = (Absolute(6 - 3) * 0.1)
- + (Absolute(6 - 6) * 0.6)
- = 0.3
- Left Impurity = Total error / total weight
- = 0.3 / 0.7
- = 0.428571428571429
- -------------------
- Likewise for Right node:
- Right node Median = 4
- Total error = (Absolute(4 - 3) * 0.3)
- + (Absolute(4 - 4) * 1.0)
- + (Absolute(4 - 7) * 0.3)
- = 1.2
- Right Impurity = Total error / total weight
- = 1.2 / 1.6
- = 0.75
- ------
- """
- dt_mae = DecisionTreeRegressor(
- random_state=0, criterion="absolute_error", max_leaf_nodes=2
- )
- # Test MAE where sample weights are non-uniform (as illustrated above):
- dt_mae.fit(
- X=[[3], [5], [3], [8], [5]],
- y=[6, 7, 3, 4, 3],
- sample_weight=[0.6, 0.3, 0.1, 1.0, 0.3],
- )
- assert_allclose(dt_mae.tree_.impurity, [2.5 / 2.3, 0.3 / 0.7, 1.2 / 1.6])
- assert_array_equal(dt_mae.tree_.value.flat, [4.0, 6.0, 4.0])
- # Test MAE where all sample weights are uniform:
- dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3], sample_weight=np.ones(5))
- assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0])
- assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0])
- # Test MAE where a `sample_weight` is not explicitly provided.
- # This is equivalent to providing uniform sample weights, though
- # the internal logic is different:
- dt_mae.fit(X=[[3], [5], [3], [8], [5]], y=[6, 7, 3, 4, 3])
- assert_array_equal(dt_mae.tree_.impurity, [1.4, 1.5, 4.0 / 3.0])
- assert_array_equal(dt_mae.tree_.value.flat, [4, 4.5, 4.0])
- def test_criterion_copy():
- # Let's check whether copy of our criterion has the same type
- # and properties as original
- n_outputs = 3
- n_classes = np.arange(3, dtype=np.intp)
- n_samples = 100
- def _pickle_copy(obj):
- return pickle.loads(pickle.dumps(obj))
- for copy_func in [copy.copy, copy.deepcopy, _pickle_copy]:
- for _, typename in CRITERIA_CLF.items():
- criteria = typename(n_outputs, n_classes)
- result = copy_func(criteria).__reduce__()
- typename_, (n_outputs_, n_classes_), _ = result
- assert typename == typename_
- assert n_outputs == n_outputs_
- assert_array_equal(n_classes, n_classes_)
- for _, typename in CRITERIA_REG.items():
- criteria = typename(n_outputs, n_samples)
- result = copy_func(criteria).__reduce__()
- typename_, (n_outputs_, n_samples_), _ = result
- assert typename == typename_
- assert n_outputs == n_outputs_
- assert n_samples == n_samples_
- def test_empty_leaf_infinite_threshold():
- # try to make empty leaf by using near infinite value.
- data = np.random.RandomState(0).randn(100, 11) * 2e38
- data = np.nan_to_num(data.astype("float32"))
- X_full = data[:, :-1]
- X_sparse = csc_matrix(X_full)
- y = data[:, -1]
- for X in [X_full, X_sparse]:
- tree = DecisionTreeRegressor(random_state=0).fit(X, y)
- terminal_regions = tree.apply(X)
- left_leaf = set(np.where(tree.tree_.children_left == TREE_LEAF)[0])
- empty_leaf = left_leaf.difference(terminal_regions)
- infinite_threshold = np.where(~np.isfinite(tree.tree_.threshold))[0]
- assert len(infinite_threshold) == 0
- assert len(empty_leaf) == 0
- @pytest.mark.parametrize("criterion", CLF_CRITERIONS)
- @pytest.mark.parametrize(
- "dataset", sorted(set(DATASETS.keys()) - {"reg_small", "diabetes"})
- )
- @pytest.mark.parametrize("tree_cls", [DecisionTreeClassifier, ExtraTreeClassifier])
- def test_prune_tree_classifier_are_subtrees(criterion, dataset, tree_cls):
- dataset = DATASETS[dataset]
- X, y = dataset["X"], dataset["y"]
- est = tree_cls(max_leaf_nodes=20, random_state=0)
- info = est.cost_complexity_pruning_path(X, y)
- pruning_path = info.ccp_alphas
- impurities = info.impurities
- assert np.all(np.diff(pruning_path) >= 0)
- assert np.all(np.diff(impurities) >= 0)
- assert_pruning_creates_subtree(tree_cls, X, y, pruning_path)
- @pytest.mark.parametrize("criterion", REG_CRITERIONS)
- @pytest.mark.parametrize("dataset", DATASETS.keys())
- @pytest.mark.parametrize("tree_cls", [DecisionTreeRegressor, ExtraTreeRegressor])
- def test_prune_tree_regression_are_subtrees(criterion, dataset, tree_cls):
- dataset = DATASETS[dataset]
- X, y = dataset["X"], dataset["y"]
- est = tree_cls(max_leaf_nodes=20, random_state=0)
- info = est.cost_complexity_pruning_path(X, y)
- pruning_path = info.ccp_alphas
- impurities = info.impurities
- assert np.all(np.diff(pruning_path) >= 0)
- assert np.all(np.diff(impurities) >= 0)
- assert_pruning_creates_subtree(tree_cls, X, y, pruning_path)
- def test_prune_single_node_tree():
- # single node tree
- clf1 = DecisionTreeClassifier(random_state=0)
- clf1.fit([[0], [1]], [0, 0])
- # pruned single node tree
- clf2 = DecisionTreeClassifier(random_state=0, ccp_alpha=10)
- clf2.fit([[0], [1]], [0, 0])
- assert_is_subtree(clf1.tree_, clf2.tree_)
- def assert_pruning_creates_subtree(estimator_cls, X, y, pruning_path):
- # generate trees with increasing alphas
- estimators = []
- for ccp_alpha in pruning_path:
- est = estimator_cls(max_leaf_nodes=20, ccp_alpha=ccp_alpha, random_state=0).fit(
- X, y
- )
- estimators.append(est)
- # A pruned tree must be a subtree of the previous tree (which had a
- # smaller ccp_alpha)
- for prev_est, next_est in zip(estimators, estimators[1:]):
- assert_is_subtree(prev_est.tree_, next_est.tree_)
- def assert_is_subtree(tree, subtree):
- assert tree.node_count >= subtree.node_count
- assert tree.max_depth >= subtree.max_depth
- tree_c_left = tree.children_left
- tree_c_right = tree.children_right
- subtree_c_left = subtree.children_left
- subtree_c_right = subtree.children_right
- stack = [(0, 0)]
- while stack:
- tree_node_idx, subtree_node_idx = stack.pop()
- assert_array_almost_equal(
- tree.value[tree_node_idx], subtree.value[subtree_node_idx]
- )
- assert_almost_equal(
- tree.impurity[tree_node_idx], subtree.impurity[subtree_node_idx]
- )
- assert_almost_equal(
- tree.n_node_samples[tree_node_idx], subtree.n_node_samples[subtree_node_idx]
- )
- assert_almost_equal(
- tree.weighted_n_node_samples[tree_node_idx],
- subtree.weighted_n_node_samples[subtree_node_idx],
- )
- if subtree_c_left[subtree_node_idx] == subtree_c_right[subtree_node_idx]:
- # is a leaf
- assert_almost_equal(TREE_UNDEFINED, subtree.threshold[subtree_node_idx])
- else:
- # not a leaf
- assert_almost_equal(
- tree.threshold[tree_node_idx], subtree.threshold[subtree_node_idx]
- )
- stack.append((tree_c_left[tree_node_idx], subtree_c_left[subtree_node_idx]))
- stack.append(
- (tree_c_right[tree_node_idx], subtree_c_right[subtree_node_idx])
- )
- @pytest.mark.parametrize("name", ALL_TREES)
- @pytest.mark.parametrize("splitter", ["best", "random"])
- @pytest.mark.parametrize("X_format", ["dense", "csr", "csc"])
- def test_apply_path_readonly_all_trees(name, splitter, X_format):
- dataset = DATASETS["clf_small"]
- X_small = dataset["X"].astype(tree._tree.DTYPE, copy=False)
- if X_format == "dense":
- X_readonly = create_memmap_backed_data(X_small)
- else:
- X_readonly = dataset["X_sparse"] # CSR
- if X_format == "csc":
- # Cheap CSR to CSC conversion
- X_readonly = X_readonly.tocsc()
- X_readonly.data = np.array(X_readonly.data, dtype=tree._tree.DTYPE)
- (
- X_readonly.data,
- X_readonly.indices,
- X_readonly.indptr,
- ) = create_memmap_backed_data(
- (X_readonly.data, X_readonly.indices, X_readonly.indptr)
- )
- y_readonly = create_memmap_backed_data(np.array(y_small, dtype=tree._tree.DTYPE))
- est = ALL_TREES[name](splitter=splitter)
- est.fit(X_readonly, y_readonly)
- assert_array_equal(est.predict(X_readonly), est.predict(X_small))
- assert_array_equal(
- est.decision_path(X_readonly).todense(), est.decision_path(X_small).todense()
- )
- @pytest.mark.parametrize("criterion", ["squared_error", "friedman_mse", "poisson"])
- @pytest.mark.parametrize("Tree", REG_TREES.values())
- def test_balance_property(criterion, Tree):
- # Test that sum(y_pred)=sum(y_true) on training set.
- # This works if the mean is predicted (should even be true for each leaf).
- # MAE predicts the median and is therefore excluded from this test.
- # Choose a training set with non-negative targets (for poisson)
- X, y = diabetes.data, diabetes.target
- reg = Tree(criterion=criterion)
- reg.fit(X, y)
- assert np.sum(reg.predict(X)) == pytest.approx(np.sum(y))
- @pytest.mark.parametrize("seed", range(3))
- def test_poisson_zero_nodes(seed):
- # Test that sum(y)=0 and therefore y_pred=0 is forbidden on nodes.
- X = [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0], [1, 2], [1, 2], [1, 3]]
- y = [0, 0, 0, 0, 1, 2, 3, 4]
- # Note that X[:, 0] == 0 is a 100% indicator for y == 0. The tree can
- # easily learn that:
- reg = DecisionTreeRegressor(criterion="squared_error", random_state=seed)
- reg.fit(X, y)
- assert np.amin(reg.predict(X)) == 0
- # whereas Poisson must predict strictly positive numbers
- reg = DecisionTreeRegressor(criterion="poisson", random_state=seed)
- reg.fit(X, y)
- assert np.all(reg.predict(X) > 0)
- # Test additional dataset where something could go wrong.
- n_features = 10
- X, y = datasets.make_regression(
- effective_rank=n_features * 2 // 3,
- tail_strength=0.6,
- n_samples=1_000,
- n_features=n_features,
- n_informative=n_features * 2 // 3,
- random_state=seed,
- )
- # some excess zeros
- y[(-1 < y) & (y < 0)] = 0
- # make sure the target is positive
- y = np.abs(y)
- reg = DecisionTreeRegressor(criterion="poisson", random_state=seed)
- reg.fit(X, y)
- assert np.all(reg.predict(X) > 0)
- def test_poisson_vs_mse():
- # For a Poisson distributed target, Poisson loss should give better results
- # than squared error measured in Poisson deviance as metric.
- # We have a similar test, test_poisson(), in
- # sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py
- rng = np.random.RandomState(42)
- n_train, n_test, n_features = 500, 500, 10
- X = datasets.make_low_rank_matrix(
- n_samples=n_train + n_test, n_features=n_features, random_state=rng
- )
- # We create a log-linear Poisson model and downscale coef as it will get
- # exponentiated.
- coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0)
- y = rng.poisson(lam=np.exp(X @ coef))
- X_train, X_test, y_train, y_test = train_test_split(
- X, y, test_size=n_test, random_state=rng
- )
- # We prevent some overfitting by setting min_samples_split=10.
- tree_poi = DecisionTreeRegressor(
- criterion="poisson", min_samples_split=10, random_state=rng
- )
- tree_mse = DecisionTreeRegressor(
- criterion="squared_error", min_samples_split=10, random_state=rng
- )
- tree_poi.fit(X_train, y_train)
- tree_mse.fit(X_train, y_train)
- dummy = DummyRegressor(strategy="mean").fit(X_train, y_train)
- for X, y, val in [(X_train, y_train, "train"), (X_test, y_test, "test")]:
- metric_poi = mean_poisson_deviance(y, tree_poi.predict(X))
- # squared_error might produce non-positive predictions => clip
- metric_mse = mean_poisson_deviance(y, np.clip(tree_mse.predict(X), 1e-15, None))
- metric_dummy = mean_poisson_deviance(y, dummy.predict(X))
- # As squared_error might correctly predict 0 in train set, its train
- # score can be better than Poisson. This is no longer the case for the
- # test set.
- if val == "test":
- assert metric_poi < 0.5 * metric_mse
- assert metric_poi < 0.75 * metric_dummy
- @pytest.mark.parametrize("criterion", REG_CRITERIONS)
- def test_decision_tree_regressor_sample_weight_consistency(criterion):
- """Test that the impact of sample_weight is consistent."""
- tree_params = dict(criterion=criterion)
- tree = DecisionTreeRegressor(**tree_params, random_state=42)
- for kind in ["zeros", "ones"]:
- check_sample_weights_invariance(
- "DecisionTreeRegressor_" + criterion, tree, kind="zeros"
- )
- rng = np.random.RandomState(0)
- n_samples, n_features = 10, 5
- X = rng.rand(n_samples, n_features)
- y = np.mean(X, axis=1) + rng.rand(n_samples)
- # make it positive in order to work also for poisson criterion
- y += np.min(y) + 0.1
- # check that multiplying sample_weight by 2 is equivalent
- # to repeating corresponding samples twice
- X2 = np.concatenate([X, X[: n_samples // 2]], axis=0)
- y2 = np.concatenate([y, y[: n_samples // 2]])
- sample_weight_1 = np.ones(len(y))
- sample_weight_1[: n_samples // 2] = 2
- tree1 = DecisionTreeRegressor(**tree_params).fit(
- X, y, sample_weight=sample_weight_1
- )
- tree2 = DecisionTreeRegressor(**tree_params).fit(X2, y2, sample_weight=None)
- assert tree1.tree_.node_count == tree2.tree_.node_count
- # Thresholds, tree.tree_.threshold, and values, tree.tree_.value, are not
- # exactly the same, but on the training set, those differences do not
- # matter and thus predictions are the same.
- assert_allclose(tree1.predict(X), tree2.predict(X))
- @pytest.mark.parametrize("Tree", [DecisionTreeClassifier, ExtraTreeClassifier])
- @pytest.mark.parametrize("n_classes", [2, 4])
- def test_criterion_entropy_same_as_log_loss(Tree, n_classes):
- """Test that criterion=entropy gives same as log_loss."""
- n_samples, n_features = 50, 5
- X, y = datasets.make_classification(
- n_classes=n_classes,
- n_samples=n_samples,
- n_features=n_features,
- n_informative=n_features,
- n_redundant=0,
- random_state=42,
- )
- tree_log_loss = Tree(criterion="log_loss", random_state=43).fit(X, y)
- tree_entropy = Tree(criterion="entropy", random_state=43).fit(X, y)
- assert_tree_equal(
- tree_log_loss.tree_,
- tree_entropy.tree_,
- f"{Tree!r} with criterion 'entropy' and 'log_loss' gave different trees.",
- )
- assert_allclose(tree_log_loss.predict(X), tree_entropy.predict(X))
- def test_different_endianness_pickle():
- X, y = datasets.make_classification(random_state=0)
- clf = DecisionTreeClassifier(random_state=0, max_depth=3)
- clf.fit(X, y)
- score = clf.score(X, y)
- def reduce_ndarray(arr):
- return arr.byteswap().newbyteorder().__reduce__()
- def get_pickle_non_native_endianness():
- f = io.BytesIO()
- p = pickle.Pickler(f)
- p.dispatch_table = copyreg.dispatch_table.copy()
- p.dispatch_table[np.ndarray] = reduce_ndarray
- p.dump(clf)
- f.seek(0)
- return f
- new_clf = pickle.load(get_pickle_non_native_endianness())
- new_score = new_clf.score(X, y)
- assert np.isclose(score, new_score)
- def test_different_endianness_joblib_pickle():
- X, y = datasets.make_classification(random_state=0)
- clf = DecisionTreeClassifier(random_state=0, max_depth=3)
- clf.fit(X, y)
- score = clf.score(X, y)
- class NonNativeEndiannessNumpyPickler(NumpyPickler):
- def save(self, obj):
- if isinstance(obj, np.ndarray):
- obj = obj.byteswap().newbyteorder()
- super().save(obj)
- def get_joblib_pickle_non_native_endianness():
- f = io.BytesIO()
- p = NonNativeEndiannessNumpyPickler(f)
- p.dump(clf)
- f.seek(0)
- return f
- new_clf = joblib.load(get_joblib_pickle_non_native_endianness())
- new_score = new_clf.score(X, y)
- assert np.isclose(score, new_score)
- def get_different_bitness_node_ndarray(node_ndarray):
- new_dtype_for_indexing_fields = np.int64 if _IS_32BIT else np.int32
- # field names in Node struct with SIZE_t types (see sklearn/tree/_tree.pxd)
- indexing_field_names = ["left_child", "right_child", "feature", "n_node_samples"]
- new_dtype_dict = {
- name: dtype for name, (dtype, _) in node_ndarray.dtype.fields.items()
- }
- for name in indexing_field_names:
- new_dtype_dict[name] = new_dtype_for_indexing_fields
- new_dtype = np.dtype(
- {"names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values())}
- )
- return node_ndarray.astype(new_dtype, casting="same_kind")
- def get_different_alignment_node_ndarray(node_ndarray):
- new_dtype_dict = {
- name: dtype for name, (dtype, _) in node_ndarray.dtype.fields.items()
- }
- offsets = [offset for dtype, offset in node_ndarray.dtype.fields.values()]
- shifted_offsets = [8 + offset for offset in offsets]
- new_dtype = np.dtype(
- {
- "names": list(new_dtype_dict.keys()),
- "formats": list(new_dtype_dict.values()),
- "offsets": shifted_offsets,
- }
- )
- return node_ndarray.astype(new_dtype, casting="same_kind")
- def reduce_tree_with_different_bitness(tree):
- new_dtype = np.int64 if _IS_32BIT else np.int32
- tree_cls, (n_features, n_classes, n_outputs), state = tree.__reduce__()
- new_n_classes = n_classes.astype(new_dtype, casting="same_kind")
- new_state = state.copy()
- new_state["nodes"] = get_different_bitness_node_ndarray(new_state["nodes"])
- return (tree_cls, (n_features, new_n_classes, n_outputs), new_state)
- def test_different_bitness_pickle():
- X, y = datasets.make_classification(random_state=0)
- clf = DecisionTreeClassifier(random_state=0, max_depth=3)
- clf.fit(X, y)
- score = clf.score(X, y)
- def pickle_dump_with_different_bitness():
- f = io.BytesIO()
- p = pickle.Pickler(f)
- p.dispatch_table = copyreg.dispatch_table.copy()
- p.dispatch_table[CythonTree] = reduce_tree_with_different_bitness
- p.dump(clf)
- f.seek(0)
- return f
- new_clf = pickle.load(pickle_dump_with_different_bitness())
- new_score = new_clf.score(X, y)
- assert score == pytest.approx(new_score)
- def test_different_bitness_joblib_pickle():
- # Make sure that a platform specific pickle generated on a 64 bit
- # platform can be converted at pickle load time into an estimator
- # with Cython code that works with the host's native integer precision
- # to index nodes in the tree data structure when the host is a 32 bit
- # platform (and vice versa).
- X, y = datasets.make_classification(random_state=0)
- clf = DecisionTreeClassifier(random_state=0, max_depth=3)
- clf.fit(X, y)
- score = clf.score(X, y)
- def joblib_dump_with_different_bitness():
- f = io.BytesIO()
- p = NumpyPickler(f)
- p.dispatch_table = copyreg.dispatch_table.copy()
- p.dispatch_table[CythonTree] = reduce_tree_with_different_bitness
- p.dump(clf)
- f.seek(0)
- return f
- new_clf = joblib.load(joblib_dump_with_different_bitness())
- new_score = new_clf.score(X, y)
- assert score == pytest.approx(new_score)
- def test_check_n_classes():
- expected_dtype = np.dtype(np.int32) if _IS_32BIT else np.dtype(np.int64)
- allowed_dtypes = [np.dtype(np.int32), np.dtype(np.int64)]
- allowed_dtypes += [dt.newbyteorder() for dt in allowed_dtypes]
- n_classes = np.array([0, 1], dtype=expected_dtype)
- for dt in allowed_dtypes:
- _check_n_classes(n_classes.astype(dt), expected_dtype)
- with pytest.raises(ValueError, match="Wrong dimensions.+n_classes"):
- wrong_dim_n_classes = np.array([[0, 1]], dtype=expected_dtype)
- _check_n_classes(wrong_dim_n_classes, expected_dtype)
- with pytest.raises(ValueError, match="n_classes.+incompatible dtype"):
- wrong_dtype_n_classes = n_classes.astype(np.float64)
- _check_n_classes(wrong_dtype_n_classes, expected_dtype)
- def test_check_value_ndarray():
- expected_dtype = np.dtype(np.float64)
- expected_shape = (5, 1, 2)
- value_ndarray = np.zeros(expected_shape, dtype=expected_dtype)
- allowed_dtypes = [expected_dtype, expected_dtype.newbyteorder()]
- for dt in allowed_dtypes:
- _check_value_ndarray(
- value_ndarray, expected_dtype=dt, expected_shape=expected_shape
- )
- with pytest.raises(ValueError, match="Wrong shape.+value array"):
- _check_value_ndarray(
- value_ndarray, expected_dtype=expected_dtype, expected_shape=(1, 2)
- )
- for problematic_arr in [value_ndarray[:, :, :1], np.asfortranarray(value_ndarray)]:
- with pytest.raises(ValueError, match="value array.+C-contiguous"):
- _check_value_ndarray(
- problematic_arr,
- expected_dtype=expected_dtype,
- expected_shape=problematic_arr.shape,
- )
- with pytest.raises(ValueError, match="value array.+incompatible dtype"):
- _check_value_ndarray(
- value_ndarray.astype(np.float32),
- expected_dtype=expected_dtype,
- expected_shape=expected_shape,
- )
- def test_check_node_ndarray():
- expected_dtype = NODE_DTYPE
- node_ndarray = np.zeros((5,), dtype=expected_dtype)
- valid_node_ndarrays = [
- node_ndarray,
- get_different_bitness_node_ndarray(node_ndarray),
- get_different_alignment_node_ndarray(node_ndarray),
- ]
- valid_node_ndarrays += [
- arr.astype(arr.dtype.newbyteorder()) for arr in valid_node_ndarrays
- ]
- for arr in valid_node_ndarrays:
- _check_node_ndarray(node_ndarray, expected_dtype=expected_dtype)
- with pytest.raises(ValueError, match="Wrong dimensions.+node array"):
- problematic_node_ndarray = np.zeros((5, 2), dtype=expected_dtype)
- _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype)
- with pytest.raises(ValueError, match="node array.+C-contiguous"):
- problematic_node_ndarray = node_ndarray[::2]
- _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype)
- dtype_dict = {name: dtype for name, (dtype, _) in node_ndarray.dtype.fields.items()}
- # array with wrong 'threshold' field dtype (int64 rather than float64)
- new_dtype_dict = dtype_dict.copy()
- new_dtype_dict["threshold"] = np.int64
- new_dtype = np.dtype(
- {"names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values())}
- )
- problematic_node_ndarray = node_ndarray.astype(new_dtype)
- with pytest.raises(ValueError, match="node array.+incompatible dtype"):
- _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype)
- # array with wrong 'left_child' field dtype (float64 rather than int64 or int32)
- new_dtype_dict = dtype_dict.copy()
- new_dtype_dict["left_child"] = np.float64
- new_dtype = np.dtype(
- {"names": list(new_dtype_dict.keys()), "formats": list(new_dtype_dict.values())}
- )
- problematic_node_ndarray = node_ndarray.astype(new_dtype)
- with pytest.raises(ValueError, match="node array.+incompatible dtype"):
- _check_node_ndarray(problematic_node_ndarray, expected_dtype=expected_dtype)
- @pytest.mark.parametrize(
- "Splitter", chain(DENSE_SPLITTERS.values(), SPARSE_SPLITTERS.values())
- )
- def test_splitter_serializable(Splitter):
- """Check that splitters are serializable."""
- rng = np.random.RandomState(42)
- max_features = 10
- n_outputs, n_classes = 2, np.array([3, 2], dtype=np.intp)
- criterion = CRITERIA_CLF["gini"](n_outputs, n_classes)
- splitter = Splitter(criterion, max_features, 5, 0.5, rng)
- splitter_serialize = pickle.dumps(splitter)
- splitter_back = pickle.loads(splitter_serialize)
- assert splitter_back.max_features == max_features
- assert isinstance(splitter_back, Splitter)
- def test_tree_deserialization_from_read_only_buffer(tmpdir):
- """Check that Trees can be deserialized with read only buffers.
- Non-regression test for gh-25584.
- """
- pickle_path = str(tmpdir.join("clf.joblib"))
- clf = DecisionTreeClassifier(random_state=0)
- clf.fit(X_small, y_small)
- joblib.dump(clf, pickle_path)
- loaded_clf = joblib.load(pickle_path, mmap_mode="r")
- assert_tree_equal(
- loaded_clf.tree_,
- clf.tree_,
- "The trees of the original and loaded classifiers are not equal.",
- )
- @pytest.mark.parametrize("Tree", ALL_TREES.values())
- def test_min_sample_split_1_error(Tree):
- """Check that an error is raised when min_sample_split=1.
- non-regression test for issue gh-25481.
- """
- X = np.array([[0, 0], [1, 1]])
- y = np.array([0, 1])
- # min_samples_split=1.0 is valid
- Tree(min_samples_split=1.0).fit(X, y)
- # min_samples_split=1 is invalid
- tree = Tree(min_samples_split=1)
- msg = (
- r"'min_samples_split' .* must be an int in the range \[2, inf\) "
- r"or a float in the range \(0.0, 1.0\]"
- )
- with pytest.raises(ValueError, match=msg):
- tree.fit(X, y)
- @pytest.mark.parametrize("criterion", ["squared_error", "friedman_mse"])
- def test_missing_values_on_equal_nodes_no_missing(criterion):
- """Check missing values goes to correct node during predictions"""
- X = np.array([[0, 1, 2, 3, 8, 9, 11, 12, 15]]).T
- y = np.array([0.1, 0.2, 0.3, 0.2, 1.4, 1.4, 1.5, 1.6, 2.6])
- dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion)
- dtc.fit(X, y)
- # Goes to right node because it has the most data points
- y_pred = dtc.predict([[np.nan]])
- assert_allclose(y_pred, [np.mean(y[-5:])])
- # equal number of elements in both nodes
- X_equal = X[:-1]
- y_equal = y[:-1]
- dtc = DecisionTreeRegressor(random_state=42, max_depth=1, criterion=criterion)
- dtc.fit(X_equal, y_equal)
- # Goes to right node because the implementation sets:
- # missing_go_to_left = n_left > n_right, which is False
- y_pred = dtc.predict([[np.nan]])
- assert_allclose(y_pred, [np.mean(y_equal[-4:])])
- @pytest.mark.parametrize("criterion", ["entropy", "gini"])
- def test_missing_values_best_splitter_three_classes(criterion):
- """Test when missing values are uniquely present in a class among 3 classes."""
- missing_values_class = 0
- X = np.array([[np.nan] * 4 + [0, 1, 2, 3, 8, 9, 11, 12]]).T
- y = np.array([missing_values_class] * 4 + [1] * 4 + [2] * 4)
- dtc = DecisionTreeClassifier(random_state=42, max_depth=2, criterion=criterion)
- dtc.fit(X, y)
- X_test = np.array([[np.nan, 3, 12]]).T
- y_nan_pred = dtc.predict(X_test)
- # Missing values necessarily are associated to the observed class.
- assert_array_equal(y_nan_pred, [missing_values_class, 1, 2])
- @pytest.mark.parametrize("criterion", ["entropy", "gini"])
- def test_missing_values_best_splitter_to_left(criterion):
- """Missing values spanning only one class at fit-time must make missing
- values at predict-time be classified has belonging to this class."""
- X = np.array([[np.nan] * 4 + [0, 1, 2, 3, 4, 5]]).T
- y = np.array([0] * 4 + [1] * 6)
- dtc = DecisionTreeClassifier(random_state=42, max_depth=2, criterion=criterion)
- dtc.fit(X, y)
- X_test = np.array([[np.nan, 5, np.nan]]).T
- y_pred = dtc.predict(X_test)
- assert_array_equal(y_pred, [0, 1, 0])
- @pytest.mark.parametrize("criterion", ["entropy", "gini"])
- def test_missing_values_best_splitter_to_right(criterion):
- """Missing values and non-missing values sharing one class at fit-time
- must make missing values at predict-time be classified has belonging
- to this class."""
- X = np.array([[np.nan] * 4 + [0, 1, 2, 3, 4, 5]]).T
- y = np.array([1] * 4 + [0] * 4 + [1] * 2)
- dtc = DecisionTreeClassifier(random_state=42, max_depth=2, criterion=criterion)
- dtc.fit(X, y)
- X_test = np.array([[np.nan, 1.2, 4.8]]).T
- y_pred = dtc.predict(X_test)
- assert_array_equal(y_pred, [1, 0, 1])
- @pytest.mark.parametrize("criterion", ["entropy", "gini"])
- def test_missing_values_missing_both_classes_has_nan(criterion):
- """Check behavior of missing value when there is one missing value in each class."""
- X = np.array([[1, 2, 3, 5, np.nan, 10, 20, 30, 60, np.nan]]).T
- y = np.array([0] * 5 + [1] * 5)
- dtc = DecisionTreeClassifier(random_state=42, max_depth=1, criterion=criterion)
- dtc.fit(X, y)
- X_test = np.array([[np.nan, 2.3, 34.2]]).T
- y_pred = dtc.predict(X_test)
- # Missing value goes to the class at the right (here 1) because the implementation
- # searches right first.
- assert_array_equal(y_pred, [1, 0, 1])
- @pytest.mark.parametrize("is_sparse", [True, False])
- @pytest.mark.parametrize(
- "tree",
- [
- DecisionTreeClassifier(splitter="random"),
- DecisionTreeRegressor(criterion="absolute_error"),
- ],
- )
- def test_missing_value_errors(is_sparse, tree):
- """Check unsupported configurations for missing values."""
- X = np.array([[1, 2, 3, 5, np.nan, 10, 20, 30, 60, np.nan]]).T
- y = np.array([0] * 5 + [1] * 5)
- if is_sparse:
- X = csr_matrix(X)
- with pytest.raises(ValueError, match="Input X contains NaN"):
- tree.fit(X, y)
- def test_missing_values_poisson():
- """Smoke test for poisson regression and missing values."""
- X, y = diabetes.data.copy(), diabetes.target
- # Set some values missing
- X[::5, 0] = np.nan
- X[::6, -1] = np.nan
- reg = DecisionTreeRegressor(criterion="poisson", random_state=42)
- reg.fit(X, y)
- y_pred = reg.predict(X)
- assert (y_pred >= 0.0).all()
- @pytest.mark.parametrize(
- "make_data, Tree",
- [
- (datasets.make_regression, DecisionTreeRegressor),
- (datasets.make_classification, DecisionTreeClassifier),
- ],
- )
- @pytest.mark.parametrize("sample_weight_train", [None, "ones"])
- def test_missing_values_is_resilience(make_data, Tree, sample_weight_train):
- """Check that trees can deal with missing values and have decent performance."""
- rng = np.random.RandomState(0)
- n_samples, n_features = 1000, 50
- X, y = make_data(n_samples=n_samples, n_features=n_features, random_state=rng)
- # Create dataset with missing values
- X_missing = X.copy()
- X_missing[rng.choice([False, True], size=X.shape, p=[0.9, 0.1])] = np.nan
- X_missing_train, X_missing_test, y_train, y_test = train_test_split(
- X_missing, y, random_state=0
- )
- if sample_weight_train == "ones":
- sample_weight_train = np.ones(X_missing_train.shape[0])
- # Train tree with missing values
- tree_with_missing = Tree(random_state=rng)
- tree_with_missing.fit(X_missing_train, y_train, sample_weight=sample_weight_train)
- score_with_missing = tree_with_missing.score(X_missing_test, y_test)
- # Train tree without missing values
- X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
- tree = Tree(random_state=rng)
- tree.fit(X_train, y_train, sample_weight=sample_weight_train)
- score_without_missing = tree.score(X_test, y_test)
- # Score is still 90 percent of the tree's score that had no missing values
- assert score_with_missing >= 0.9 * score_without_missing
- def test_missing_value_is_predictive():
- """Check the tree learns when only the missing value is predictive."""
- rng = np.random.RandomState(0)
- n_samples = 1000
- X = rng.standard_normal(size=(n_samples, 10))
- y = rng.randint(0, high=2, size=n_samples)
- # Create a predictive feature using `y` and with some noise
- X_random_mask = rng.choice([False, True], size=n_samples, p=[0.95, 0.05])
- y_mask = y.copy().astype(bool)
- y_mask[X_random_mask] = ~y_mask[X_random_mask]
- X_predictive = rng.standard_normal(size=n_samples)
- X_predictive[y_mask] = np.nan
- X[:, 5] = X_predictive
- X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)
- tree = DecisionTreeClassifier(random_state=rng).fit(X_train, y_train)
- assert tree.score(X_train, y_train) >= 0.85
- assert tree.score(X_test, y_test) >= 0.85
- @pytest.mark.parametrize(
- "make_data, Tree",
- [
- (datasets.make_regression, DecisionTreeRegressor),
- (datasets.make_classification, DecisionTreeClassifier),
- ],
- )
- def test_sample_weight_non_uniform(make_data, Tree):
- """Check sample weight is correctly handled with missing values."""
- rng = np.random.RandomState(0)
- n_samples, n_features = 1000, 10
- X, y = make_data(n_samples=n_samples, n_features=n_features, random_state=rng)
- # Create dataset with missing values
- X[rng.choice([False, True], size=X.shape, p=[0.9, 0.1])] = np.nan
- # Zero sample weight is the same as removing the sample
- sample_weight = np.ones(X.shape[0])
- sample_weight[::2] = 0.0
- tree_with_sw = Tree(random_state=0)
- tree_with_sw.fit(X, y, sample_weight=sample_weight)
- tree_samples_removed = Tree(random_state=0)
- tree_samples_removed.fit(X[1::2, :], y[1::2])
- assert_allclose(tree_samples_removed.predict(X), tree_with_sw.predict(X))
- def test_deterministic_pickle():
- # Non-regression test for:
- # https://github.com/scikit-learn/scikit-learn/issues/27268
- # Uninitialised memory would lead to the two pickle strings being different.
- tree1 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target)
- tree2 = DecisionTreeClassifier(random_state=0).fit(iris.data, iris.target)
- pickle1 = pickle.dumps(tree1)
- pickle2 = pickle.dumps(tree2)
- assert pickle1 == pickle2
|