- # Authors:
- #
- # Giorgio Patrini
- #
- # License: BSD 3 clause
- import itertools
- import re
- import warnings
- import numpy as np
- import numpy.linalg as la
- import pytest
- from scipy import sparse, stats
- from sklearn import datasets
- from sklearn.base import clone
- from sklearn.exceptions import NotFittedError
- from sklearn.metrics.pairwise import linear_kernel
- from sklearn.model_selection import cross_val_predict
- from sklearn.pipeline import Pipeline
- from sklearn.preprocessing import (
- Binarizer,
- KernelCenterer,
- MaxAbsScaler,
- MinMaxScaler,
- Normalizer,
- PowerTransformer,
- QuantileTransformer,
- RobustScaler,
- StandardScaler,
- add_dummy_feature,
- maxabs_scale,
- minmax_scale,
- normalize,
- power_transform,
- quantile_transform,
- robust_scale,
- scale,
- )
- from sklearn.preprocessing._data import BOUNDS_THRESHOLD, _handle_zeros_in_scale
- from sklearn.svm import SVR
- from sklearn.utils import gen_batches, shuffle
- from sklearn.utils._testing import (
- _convert_container,
- assert_allclose,
- assert_allclose_dense_sparse,
- assert_almost_equal,
- assert_array_almost_equal,
- assert_array_equal,
- assert_array_less,
- skip_if_32bit,
- )
- from sklearn.utils.sparsefuncs import mean_variance_axis
- iris = datasets.load_iris()
- # Make some data to be used many times
- rng = np.random.RandomState(0)
- n_features = 30
- n_samples = 1000
- offsets = rng.uniform(-1, 1, size=n_features)
- scales = rng.uniform(1, 10, size=n_features)
- X_2d = rng.randn(n_samples, n_features) * scales + offsets
- X_1row = X_2d[0, :].reshape(1, n_features)
- X_1col = X_2d[:, 0].reshape(n_samples, 1)
- X_list_1row = X_1row.tolist()
- X_list_1col = X_1col.tolist()
- def toarray(a):
- if hasattr(a, "toarray"):
- a = a.toarray()
- return a
- def _check_dim_1axis(a):
- return np.asarray(a).shape[0]
- def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen):
- if batch_stop != n:
- assert (i + 1) * chunk_size == n_samples_seen
- else:
- assert i * chunk_size + (batch_stop - batch_start) == n_samples_seen
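- # Illustrative note (not part of the original assertions): the helper above
- # relies on sklearn.utils.gen_batches yielding contiguous slices whose last
- # chunk may be shorter than chunk_size, e.g.:
- #     list(gen_batches(7, 3)) == [slice(0, 3), slice(3, 6), slice(6, 7)]
- # so n_samples_seen_ grows by chunk_size per full batch and by the remainder
- # on the final one.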
- def test_raises_value_error_if_sample_weights_greater_than_1d():
- # Sample weights must be either scalar or 1D
- n_sampless = [2, 3]
- n_featuress = [3, 2]
- for n_samples, n_features in zip(n_sampless, n_featuress):
- X = rng.randn(n_samples, n_features)
- y = rng.randn(n_samples)
- scaler = StandardScaler()
- # make sure an error is raised when sample weights are more than 1-dimensional
- sample_weight_notOK = rng.randn(n_samples, 1) ** 2
- with pytest.raises(ValueError):
- scaler.fit(X, y, sample_weight=sample_weight_notOK)
- @pytest.mark.parametrize(
- ["Xw", "X", "sample_weight"],
- [
- ([[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [1, 2, 3], [4, 5, 6]], [2.0, 1.0]),
- (
- [[1, 0, 1], [0, 0, 1]],
- [[1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]],
- np.array([1, 3]),
- ),
- (
- [[1, np.nan, 1], [np.nan, np.nan, 1]],
- [
- [1, np.nan, 1],
- [np.nan, np.nan, 1],
- [np.nan, np.nan, 1],
- [np.nan, np.nan, 1],
- ],
- np.array([1, 3]),
- ),
- ],
- )
- @pytest.mark.parametrize("array_constructor", ["array", "sparse_csr", "sparse_csc"])
- def test_standard_scaler_sample_weight(Xw, X, sample_weight, array_constructor):
- with_mean = not array_constructor.startswith("sparse")
- X = _convert_container(X, array_constructor)
- Xw = _convert_container(Xw, array_constructor)
- # weighted StandardScaler
- yw = np.ones(Xw.shape[0])
- scaler_w = StandardScaler(with_mean=with_mean)
- scaler_w.fit(Xw, yw, sample_weight=sample_weight)
- # unweighted, but with repeated samples
- y = np.ones(X.shape[0])
- scaler = StandardScaler(with_mean=with_mean)
- scaler.fit(X, y)
- X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
- assert_almost_equal(scaler.mean_, scaler_w.mean_)
- assert_almost_equal(scaler.var_, scaler_w.var_)
- assert_almost_equal(scaler.transform(X_test), scaler_w.transform(X_test))
- def test_standard_scaler_1d():
- # Test scaling of dataset along single axis
- for X in [X_1row, X_1col, X_list_1row, X_list_1col]:
- scaler = StandardScaler()
- X_scaled = scaler.fit(X).transform(X, copy=True)
- if isinstance(X, list):
- X = np.array(X) # cast only after scaling done
- if _check_dim_1axis(X) == 1:
- assert_almost_equal(scaler.mean_, X.ravel())
- assert_almost_equal(scaler.scale_, np.ones(n_features))
- assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features))
- assert_array_almost_equal(X_scaled.std(axis=0), np.zeros_like(n_features))
- else:
- assert_almost_equal(scaler.mean_, X.mean())
- assert_almost_equal(scaler.scale_, X.std())
- assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features))
- assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
- assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
- assert scaler.n_samples_seen_ == X.shape[0]
- # check inverse transform
- X_scaled_back = scaler.inverse_transform(X_scaled)
- assert_array_almost_equal(X_scaled_back, X)
- # Constant feature
- X = np.ones((5, 1))
- scaler = StandardScaler()
- X_scaled = scaler.fit(X).transform(X, copy=True)
- assert_almost_equal(scaler.mean_, 1.0)
- assert_almost_equal(scaler.scale_, 1.0)
- assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
- assert_array_almost_equal(X_scaled.std(axis=0), 0.0)
- assert scaler.n_samples_seen_ == X.shape[0]
- @pytest.mark.parametrize(
- "sparse_constructor", [None, sparse.csc_matrix, sparse.csr_matrix]
- )
- @pytest.mark.parametrize("add_sample_weight", [False, True])
- def test_standard_scaler_dtype(add_sample_weight, sparse_constructor):
- # Ensure scaling does not affect dtype
- rng = np.random.RandomState(0)
- n_samples = 10
- n_features = 3
- if add_sample_weight:
- sample_weight = np.ones(n_samples)
- else:
- sample_weight = None
- with_mean = True
- for dtype in [np.float16, np.float32, np.float64]:
- X = rng.randn(n_samples, n_features).astype(dtype)
- if sparse_constructor is not None:
- X = sparse_constructor(X)
- with_mean = False
- scaler = StandardScaler(with_mean=with_mean)
- X_scaled = scaler.fit(X, sample_weight=sample_weight).transform(X)
- assert X.dtype == X_scaled.dtype
- assert scaler.mean_.dtype == np.float64
- assert scaler.scale_.dtype == np.float64
- @pytest.mark.parametrize(
- "scaler",
- [
- StandardScaler(with_mean=False),
- RobustScaler(with_centering=False),
- ],
- )
- @pytest.mark.parametrize(
- "sparse_constructor", [np.asarray, sparse.csc_matrix, sparse.csr_matrix]
- )
- @pytest.mark.parametrize("add_sample_weight", [False, True])
- @pytest.mark.parametrize("dtype", [np.float32, np.float64])
- @pytest.mark.parametrize("constant", [0, 1.0, 100.0])
- def test_standard_scaler_constant_features(
- scaler, add_sample_weight, sparse_constructor, dtype, constant
- ):
- if isinstance(scaler, RobustScaler) and add_sample_weight:
- pytest.skip(f"{scaler.__class__.__name__} does not yet support sample_weight")
- rng = np.random.RandomState(0)
- n_samples = 100
- n_features = 1
- if add_sample_weight:
- fit_params = dict(sample_weight=rng.uniform(size=n_samples) * 2)
- else:
- fit_params = {}
- X_array = np.full(shape=(n_samples, n_features), fill_value=constant, dtype=dtype)
- X = sparse_constructor(X_array)
- X_scaled = scaler.fit(X, **fit_params).transform(X)
- if isinstance(scaler, StandardScaler):
- # The variance info should be close to zero for constant features.
- assert_allclose(scaler.var_, np.zeros(X.shape[1]), atol=1e-7)
- # Constant features should not be scaled (scale of 1.):
- assert_allclose(scaler.scale_, np.ones(X.shape[1]))
- if hasattr(X_scaled, "toarray"):
- assert_allclose(X_scaled.toarray(), X_array)
- else:
- assert_allclose(X_scaled, X)
- if isinstance(scaler, StandardScaler) and not add_sample_weight:
- # Also check consistency with the standard scale function.
- X_scaled_2 = scale(X, with_mean=scaler.with_mean)
- if hasattr(X_scaled_2, "toarray"):
- assert_allclose(X_scaled_2.toarray(), X_scaled.toarray())
- else:
- assert_allclose(X_scaled_2, X_scaled)
- @pytest.mark.parametrize("n_samples", [10, 100, 10_000])
- @pytest.mark.parametrize("average", [1e-10, 1, 1e10])
- @pytest.mark.parametrize("dtype", [np.float32, np.float64])
- @pytest.mark.parametrize(
- "array_constructor", [np.asarray, sparse.csc_matrix, sparse.csr_matrix]
- )
- def test_standard_scaler_near_constant_features(
- n_samples, array_constructor, average, dtype
- ):
- # Check that when the variance is too small (var << mean**2) the feature
- # is considered constant and not scaled.
- scale_min, scale_max = -30, 19
- scales = np.array([10**i for i in range(scale_min, scale_max + 1)], dtype=dtype)
- n_features = scales.shape[0]
- X = np.empty((n_samples, n_features), dtype=dtype)
- # Make a dataset of known var = scales**2 and mean = average (see the
- # worked check after this test)
- X[: n_samples // 2, :] = average + scales
- X[n_samples // 2 :, :] = average - scales
- X_array = array_constructor(X)
- scaler = StandardScaler(with_mean=False).fit(X_array)
- # StandardScaler uses float64 accumulators even if the data has a float32
- # dtype.
- eps = np.finfo(np.float64).eps
- # if var < bound = N * eps * var + N**2 * eps**2 * mean**2, the feature is
- # considered constant and the scale_ attribute is set to 1.
- bounds = n_samples * eps * scales**2 + n_samples**2 * eps**2 * average**2
- within_bounds = scales**2 <= bounds
- # Check that scale_min is small enough to have some scales below the
- # bound and therefore detected as constant:
- assert np.any(within_bounds)
- # Check that such features are actually treated as constant by the scaler:
- assert all(scaler.var_[within_bounds] <= bounds[within_bounds])
- assert_allclose(scaler.scale_[within_bounds], 1.0)
- # Depending on the dtype of X, some features might not actually be
- # representable as non constant for small scales (even if above the
- # precision bound of the float64 variance estimate). Such features should
- # be correctly detected as constant, with 0 variance, by StandardScaler.
- representable_diff = X[0, :] - X[-1, :] != 0
- assert_allclose(scaler.var_[np.logical_not(representable_diff)], 0)
- assert_allclose(scaler.scale_[np.logical_not(representable_diff)], 1)
- # The other features are scaled and scale_ is equal to sqrt(var_) assuming
- # that scales are large enough for average + scale and average - scale to
- # be distinct in X (depending on X's dtype).
- common_mask = np.logical_and(scales**2 > bounds, representable_diff)
- assert_allclose(scaler.scale_[common_mask], np.sqrt(scaler.var_)[common_mask])
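- # Worked check for the construction used above: with half the samples at
- # average + scale and half at average - scale, each feature has
- #     mean = ((average + scale) + (average - scale)) / 2 = average
- #     var  = mean((x - average)**2) = scale**2
- # so scales**2 is the exact population variance compared against var_.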
- def test_scale_1d():
- # 1-d inputs
- X_list = [1.0, 3.0, 5.0, 0.0]
- X_arr = np.array(X_list)
- for X in [X_list, X_arr]:
- X_scaled = scale(X)
- assert_array_almost_equal(X_scaled.mean(), 0.0)
- assert_array_almost_equal(X_scaled.std(), 1.0)
- assert_array_equal(scale(X, with_mean=False, with_std=False), X)
- @skip_if_32bit
- def test_standard_scaler_numerical_stability():
- # Test numerical stability of scaling
- # np.log(1e-5) is used because its floating point representation was
- # empirically found to cause numerical problems with np.mean & np.std.
- x = np.full(8, np.log(1e-5), dtype=np.float64)
- # This does not raise a warning as the number of samples is too low
- # to trigger the problem in recent numpy
- with warnings.catch_warnings():
- warnings.simplefilter("error", UserWarning)
- scale(x)
- assert_array_almost_equal(scale(x), np.zeros(8))
- # with 2 more samples, the std computation runs into numerical issues:
- x = np.full(10, np.log(1e-5), dtype=np.float64)
- warning_message = "standard deviation of the data is probably very close to 0"
- with pytest.warns(UserWarning, match=warning_message):
- x_scaled = scale(x)
- assert_array_almost_equal(x_scaled, np.zeros(10))
- x = np.full(10, 1e-100, dtype=np.float64)
- with warnings.catch_warnings():
- warnings.simplefilter("error", UserWarning)
- x_small_scaled = scale(x)
- assert_array_almost_equal(x_small_scaled, np.zeros(10))
- # Large values can cause (often recoverable) numerical stability issues:
- x_big = np.full(10, 1e100, dtype=np.float64)
- warning_message = "Dataset may contain too large values"
- with pytest.warns(UserWarning, match=warning_message):
- x_big_scaled = scale(x_big)
- assert_array_almost_equal(x_big_scaled, np.zeros(10))
- assert_array_almost_equal(x_big_scaled, x_small_scaled)
- with pytest.warns(UserWarning, match=warning_message):
- x_big_centered = scale(x_big, with_std=False)
- assert_array_almost_equal(x_big_centered, np.zeros(10))
- assert_array_almost_equal(x_big_centered, x_small_scaled)
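- # Minimal sketch of the instability exercised above: for x filled with a
- # single repeated value, the float64 mean can differ from that value by a
- # few ulps, so the computed deviations (and hence the std) are rounding
- # noise rather than exactly zero; scale() then divides by that spurious,
- # near-zero std, which is what the UserWarning guards against.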
- def test_scaler_2d_arrays():
- # Test scaling of 2d array along first axis
- rng = np.random.RandomState(0)
- n_features = 5
- n_samples = 4
- X = rng.randn(n_samples, n_features)
- X[:, 0] = 0.0 # first feature is always zero
- scaler = StandardScaler()
- X_scaled = scaler.fit(X).transform(X, copy=True)
- assert not np.any(np.isnan(X_scaled))
- assert scaler.n_samples_seen_ == n_samples
- assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
- assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
- # Check that X has been copied
- assert X_scaled is not X
- # check inverse transform
- X_scaled_back = scaler.inverse_transform(X_scaled)
- assert X_scaled_back is not X
- assert X_scaled_back is not X_scaled
- assert_array_almost_equal(X_scaled_back, X)
- X_scaled = scale(X, axis=1, with_std=False)
- assert not np.any(np.isnan(X_scaled))
- assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0])
- X_scaled = scale(X, axis=1, with_std=True)
- assert not np.any(np.isnan(X_scaled))
- assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0])
- assert_array_almost_equal(X_scaled.std(axis=1), n_samples * [1.0])
- # Check that the data hasn't been modified
- assert X_scaled is not X
- X_scaled = scaler.fit(X).transform(X, copy=False)
- assert not np.any(np.isnan(X_scaled))
- assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
- assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
- # Check that X has not been copied
- assert X_scaled is X
- X = rng.randn(4, 5)
- X[:, 0] = 1.0 # first feature is a constant, non zero feature
- scaler = StandardScaler()
- X_scaled = scaler.fit(X).transform(X, copy=True)
- assert not np.any(np.isnan(X_scaled))
- assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
- assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
- # Check that X has not been copied
- assert X_scaled is not X
- def test_scaler_float16_overflow():
- # Test that the scaler does not overflow on float16 numpy arrays
- rng = np.random.RandomState(0)
- # float16 has a maximum of 65504.0. In the worst case the accumulated sum
- # is 5 * 200000 = 1,000,000, which is enough to overflow the data type
- # (see the reference note after this test)
- X = rng.uniform(5, 10, [200000, 1]).astype(np.float16)
- with np.errstate(over="raise"):
- scaler = StandardScaler().fit(X)
- X_scaled = scaler.transform(X)
- # Calculate the float64 equivalent to verify result
- X_scaled_f64 = StandardScaler().fit_transform(X.astype(np.float64))
- # Overflow calculations may cause -inf, inf, or nan. Since there is no nan
- # input, all of the outputs should be finite. This may be redundant since a
- # FloatingPointError exception will be thrown on overflow above.
- assert np.all(np.isfinite(X_scaled))
- # The normal distribution is very unlikely to go above 4. At 4.0-8.0 the
- # float16 precision is 2^-8 which is around 0.004. Thus only 2 decimals are
- # checked to account for precision differences.
- assert_array_almost_equal(X_scaled, X_scaled_f64, decimal=2)
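- # For reference, the float16 limit relied on above can be queried directly
- # with NumPy:
- #     np.finfo(np.float16).max  # 65504.0
- # so any intermediate sum over the 200000 samples in [5, 10] overflows it.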
- def test_handle_zeros_in_scale():
- s1 = np.array([0, 1e-16, 1, 2, 3])
- s2 = _handle_zeros_in_scale(s1, copy=True)
- assert_allclose(s1, np.array([0, 1e-16, 1, 2, 3]))
- assert_allclose(s2, np.array([1, 1, 1, 2, 3]))
- def test_minmax_scaler_partial_fit():
- # Test that partial_fit run over many batches of various sizes
- # gives the same results as fit
- X = X_2d
- n = X.shape[0]
- for chunk_size in [1, 2, 50, n, n + 42]:
- # Test mean at the end of the process
- scaler_batch = MinMaxScaler().fit(X)
- scaler_incr = MinMaxScaler()
- for batch in gen_batches(n_samples, chunk_size):
- scaler_incr = scaler_incr.partial_fit(X[batch])
- assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_)
- assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_)
- assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
- assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_)
- assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
- assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)
- # Test the fitted attributes after 1 step
- batch0 = slice(0, chunk_size)
- scaler_batch = MinMaxScaler().fit(X[batch0])
- scaler_incr = MinMaxScaler().partial_fit(X[batch0])
- assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_)
- assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_)
- assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
- assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_)
- assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
- assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)
- # Test n_samples_seen_ until the end of the partial fits
- scaler_batch = MinMaxScaler().fit(X)
- scaler_incr = MinMaxScaler() # Clean estimator
- for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
- scaler_incr = scaler_incr.partial_fit(X[batch])
- assert_correct_incr(
- i,
- batch_start=batch.start,
- batch_stop=batch.stop,
- n=n,
- chunk_size=chunk_size,
- n_samples_seen=scaler_incr.n_samples_seen_,
- )
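- # For reference, the attributes compared above are tied together by the
- # documented MinMaxScaler relations, for feature_range=(lo, hi):
- #     scale_ = (hi - lo) / (data_max_ - data_min_)
- #     min_   = lo - data_min_ * scale_
- # so agreement on data_min_/data_max_ implies agreement on scale_ and min_.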
- def test_standard_scaler_partial_fit():
- # Test that partial_fit run over many batches of various sizes
- # gives the same results as fit
- X = X_2d
- n = X.shape[0]
- for chunk_size in [1, 2, 50, n, n + 42]:
- # Test mean at the end of the process
- scaler_batch = StandardScaler(with_std=False).fit(X)
- scaler_incr = StandardScaler(with_std=False)
- for batch in gen_batches(n_samples, chunk_size):
- scaler_incr = scaler_incr.partial_fit(X[batch])
- assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
- assert scaler_batch.var_ == scaler_incr.var_ # Nones
- assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
- # Test std after 1 step
- batch0 = slice(0, chunk_size)
- scaler_incr = StandardScaler().partial_fit(X[batch0])
- if chunk_size == 1:
- assert_array_almost_equal(
- np.zeros(n_features, dtype=np.float64), scaler_incr.var_
- )
- assert_array_almost_equal(
- np.ones(n_features, dtype=np.float64), scaler_incr.scale_
- )
- else:
- assert_array_almost_equal(np.var(X[batch0], axis=0), scaler_incr.var_)
- assert_array_almost_equal(
- np.std(X[batch0], axis=0), scaler_incr.scale_
- ) # no constants
- # Test std until the end of the partial fits
- scaler_batch = StandardScaler().fit(X)
- scaler_incr = StandardScaler() # Clean estimator
- for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
- scaler_incr = scaler_incr.partial_fit(X[batch])
- assert_correct_incr(
- i,
- batch_start=batch.start,
- batch_stop=batch.stop,
- n=n,
- chunk_size=chunk_size,
- n_samples_seen=scaler_incr.n_samples_seen_,
- )
- assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
- assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
- def test_standard_scaler_partial_fit_numerical_stability():
- # Test that the incremental computation does not introduce significant
- # errors for large datasets with values of large magnitude
- rng = np.random.RandomState(0)
- n_features = 2
- n_samples = 100
- offsets = rng.uniform(-1e15, 1e15, size=n_features)
- scales = rng.uniform(1e3, 1e6, size=n_features)
- X = rng.randn(n_samples, n_features) * scales + offsets
- scaler_batch = StandardScaler().fit(X)
- scaler_incr = StandardScaler()
- for chunk in X:
- scaler_incr = scaler_incr.partial_fit(chunk.reshape(1, n_features))
- # Regardless of absolute values, they must not differ by more than 6
- # significant digits
- tol = 10 ** (-6)
- assert_allclose(scaler_incr.mean_, scaler_batch.mean_, rtol=tol)
- assert_allclose(scaler_incr.var_, scaler_batch.var_, rtol=tol)
- assert_allclose(scaler_incr.scale_, scaler_batch.scale_, rtol=tol)
- # NOTE: for much larger offsets the std is very unstable (last assert),
- # while the mean is OK.
- # Sparse input
- size = (100, 3)
- scale = 1e20
- X = rng.randint(0, 2, size).astype(np.float64) * scale
- X_csr = sparse.csr_matrix(X)
- X_csc = sparse.csc_matrix(X)
- for X in [X_csr, X_csc]:
- # with_mean=False is required with sparse input
- scaler = StandardScaler(with_mean=False).fit(X)
- scaler_incr = StandardScaler(with_mean=False)
- for chunk in X:
- scaler_incr = scaler_incr.partial_fit(chunk)
- # Regardless of magnitude, they must not differ by more than 6 significant
- # digits
- tol = 10 ** (-6)
- assert scaler.mean_ is not None
- assert_allclose(scaler_incr.var_, scaler.var_, rtol=tol)
- assert_allclose(scaler_incr.scale_, scaler.scale_, rtol=tol)
- @pytest.mark.parametrize("sample_weight", [True, None])
- def test_partial_fit_sparse_input(sample_weight):
- # Check that sparsity is not destroyed
- X = np.array([[1.0], [0.0], [0.0], [5.0]])
- X_csr = sparse.csr_matrix(X)
- X_csc = sparse.csc_matrix(X)
- if sample_weight:
- sample_weight = rng.rand(X_csc.shape[0])
- null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
- for X in [X_csr, X_csc]:
- X_null = null_transform.partial_fit(X, sample_weight=sample_weight).transform(X)
- assert_array_equal(X_null.toarray(), X.toarray())
- X_orig = null_transform.inverse_transform(X_null)
- assert_array_equal(X_orig.toarray(), X_null.toarray())
- assert_array_equal(X_orig.toarray(), X.toarray())
- @pytest.mark.parametrize("sample_weight", [True, None])
- def test_standard_scaler_transform_with_partial_fit(sample_weight):
- # Check some postconditions after applying partial_fit and transform
- X = X_2d[:100, :]
- if sample_weight:
- sample_weight = rng.rand(X.shape[0])
- scaler_incr = StandardScaler()
- for i, batch in enumerate(gen_batches(X.shape[0], 1)):
- X_sofar = X[: (i + 1), :]
- chunks_copy = X_sofar.copy()
- if sample_weight is None:
- scaled_batch = StandardScaler().fit_transform(X_sofar)
- scaler_incr = scaler_incr.partial_fit(X[batch])
- else:
- scaled_batch = StandardScaler().fit_transform(
- X_sofar, sample_weight=sample_weight[: i + 1]
- )
- scaler_incr = scaler_incr.partial_fit(
- X[batch], sample_weight=sample_weight[batch]
- )
- scaled_incr = scaler_incr.transform(X_sofar)
- assert_array_almost_equal(scaled_batch, scaled_incr)
- assert_array_almost_equal(X_sofar, chunks_copy) # No change
- right_input = scaler_incr.inverse_transform(scaled_incr)
- assert_array_almost_equal(X_sofar, right_input)
- zero = np.zeros(X.shape[1])
- epsilon = np.finfo(float).eps
- assert_array_less(zero, scaler_incr.var_ + epsilon) # effectively a <= check
- assert_array_less(zero, scaler_incr.scale_ + epsilon)
- if sample_weight is None:
- # (i+1) because the scaler has already been fitted
- assert (i + 1) == scaler_incr.n_samples_seen_
- else:
- assert np.sum(sample_weight[: i + 1]) == pytest.approx(
- scaler_incr.n_samples_seen_
- )
- def test_standard_check_array_of_inverse_transform():
- # Check that StandardScaler's inverse_transform converts
- # an integer array to float
- x = np.array(
- [
- [1, 1, 1, 0, 1, 0],
- [1, 1, 1, 0, 1, 0],
- [0, 8, 0, 1, 0, 0],
- [1, 4, 1, 1, 0, 0],
- [0, 1, 0, 0, 1, 0],
- [0, 4, 0, 1, 0, 1],
- ],
- dtype=np.int32,
- )
- scaler = StandardScaler()
- scaler.fit(x)
- # The input of inverse_transform should be converted
- # to a float array.
- # If not, X *= self.scale_ will fail.
- scaler.inverse_transform(x)
- def test_min_max_scaler_iris():
- X = iris.data
- scaler = MinMaxScaler()
- # default params
- X_trans = scaler.fit_transform(X)
- assert_array_almost_equal(X_trans.min(axis=0), 0)
- assert_array_almost_equal(X_trans.max(axis=0), 1)
- X_trans_inv = scaler.inverse_transform(X_trans)
- assert_array_almost_equal(X, X_trans_inv)
- # not default params: min=1, max=2
- scaler = MinMaxScaler(feature_range=(1, 2))
- X_trans = scaler.fit_transform(X)
- assert_array_almost_equal(X_trans.min(axis=0), 1)
- assert_array_almost_equal(X_trans.max(axis=0), 2)
- X_trans_inv = scaler.inverse_transform(X_trans)
- assert_array_almost_equal(X, X_trans_inv)
- # min=-.5, max=.6
- scaler = MinMaxScaler(feature_range=(-0.5, 0.6))
- X_trans = scaler.fit_transform(X)
- assert_array_almost_equal(X_trans.min(axis=0), -0.5)
- assert_array_almost_equal(X_trans.max(axis=0), 0.6)
- X_trans_inv = scaler.inverse_transform(X_trans)
- assert_array_almost_equal(X, X_trans_inv)
- # raises on invalid range
- scaler = MinMaxScaler(feature_range=(2, 1))
- with pytest.raises(ValueError):
- scaler.fit(X)
- def test_min_max_scaler_zero_variance_features():
- # Check min max scaler on toy data with zero variance features
- X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]]
- X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]]
- # default params
- scaler = MinMaxScaler()
- X_trans = scaler.fit_transform(X)
- X_expected_0_1 = [[0.0, 0.0, 0.5], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0]]
- assert_array_almost_equal(X_trans, X_expected_0_1)
- X_trans_inv = scaler.inverse_transform(X_trans)
- assert_array_almost_equal(X, X_trans_inv)
- X_trans_new = scaler.transform(X_new)
- X_expected_0_1_new = [[+0.0, 1.0, 0.500], [-1.0, 0.0, 0.083], [+0.0, 0.0, 1.333]]
- assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2)
- # not default params
- scaler = MinMaxScaler(feature_range=(1, 2))
- X_trans = scaler.fit_transform(X)
- X_expected_1_2 = [[1.0, 1.0, 1.5], [1.0, 1.0, 1.0], [1.0, 1.0, 2.0]]
- assert_array_almost_equal(X_trans, X_expected_1_2)
- # function interface
- X_trans = minmax_scale(X)
- assert_array_almost_equal(X_trans, X_expected_0_1)
- X_trans = minmax_scale(X, feature_range=(1, 2))
- assert_array_almost_equal(X_trans, X_expected_1_2)
- def test_minmax_scale_axis1():
- X = iris.data
- X_trans = minmax_scale(X, axis=1)
- assert_array_almost_equal(np.min(X_trans, axis=1), 0)
- assert_array_almost_equal(np.max(X_trans, axis=1), 1)
- def test_min_max_scaler_1d():
- # Test scaling of dataset along single axis
- for X in [X_1row, X_1col, X_list_1row, X_list_1col]:
- scaler = MinMaxScaler(copy=True)
- X_scaled = scaler.fit(X).transform(X)
- if isinstance(X, list):
- X = np.array(X) # cast only after scaling done
- if _check_dim_1axis(X) == 1:
- assert_array_almost_equal(X_scaled.min(axis=0), np.zeros(n_features))
- assert_array_almost_equal(X_scaled.max(axis=0), np.zeros(n_features))
- else:
- assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
- assert_array_almost_equal(X_scaled.max(axis=0), 1.0)
- assert scaler.n_samples_seen_ == X.shape[0]
- # check inverse transform
- X_scaled_back = scaler.inverse_transform(X_scaled)
- assert_array_almost_equal(X_scaled_back, X)
- # Constant feature
- X = np.ones((5, 1))
- scaler = MinMaxScaler()
- X_scaled = scaler.fit(X).transform(X)
- assert X_scaled.min() >= 0.0
- assert X_scaled.max() <= 1.0
- assert scaler.n_samples_seen_ == X.shape[0]
- # Function interface
- X_1d = X_1row.ravel()
- min_ = X_1d.min()
- max_ = X_1d.max()
- assert_array_almost_equal(
- (X_1d - min_) / (max_ - min_), minmax_scale(X_1d, copy=True)
- )
- @pytest.mark.parametrize("sample_weight", [True, None])
- def test_scaler_without_centering(sample_weight):
- rng = np.random.RandomState(42)
- X = rng.randn(4, 5)
- X[:, 0] = 0.0 # first feature is always zero
- X_csr = sparse.csr_matrix(X)
- X_csc = sparse.csc_matrix(X)
- if sample_weight:
- sample_weight = rng.rand(X.shape[0])
- with pytest.raises(ValueError):
- StandardScaler().fit(X_csr)
- with pytest.raises(ValueError):
- StandardScaler().fit(X_csc)
- null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
- X_null = null_transform.fit_transform(X_csr)
- assert_array_equal(X_null.data, X_csr.data)
- X_orig = null_transform.inverse_transform(X_null)
- assert_array_equal(X_orig.data, X_csr.data)
- scaler = StandardScaler(with_mean=False).fit(X, sample_weight=sample_weight)
- X_scaled = scaler.transform(X, copy=True)
- assert not np.any(np.isnan(X_scaled))
- scaler_csr = StandardScaler(with_mean=False).fit(X_csr, sample_weight=sample_weight)
- X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
- assert not np.any(np.isnan(X_csr_scaled.data))
- scaler_csc = StandardScaler(with_mean=False).fit(X_csc, sample_weight=sample_weight)
- X_csc_scaled = scaler_csc.transform(X_csc, copy=True)
- assert not np.any(np.isnan(X_csc_scaled.data))
- assert_array_almost_equal(scaler.mean_, scaler_csr.mean_)
- assert_array_almost_equal(scaler.var_, scaler_csr.var_)
- assert_array_almost_equal(scaler.scale_, scaler_csr.scale_)
- assert_array_almost_equal(scaler.n_samples_seen_, scaler_csr.n_samples_seen_)
- assert_array_almost_equal(scaler.mean_, scaler_csc.mean_)
- assert_array_almost_equal(scaler.var_, scaler_csc.var_)
- assert_array_almost_equal(scaler.scale_, scaler_csc.scale_)
- assert_array_almost_equal(scaler.n_samples_seen_, scaler_csc.n_samples_seen_)
- if sample_weight is None:
- assert_array_almost_equal(
- X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2
- )
- assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
- X_csr_scaled_mean, X_csr_scaled_var = mean_variance_axis(X_csr_scaled, 0)
- assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
- assert_array_almost_equal(X_csr_scaled_var, X_scaled.var(axis=0))
- # Check that X has not been modified (copy)
- assert X_scaled is not X
- assert X_csr_scaled is not X_csr
- X_scaled_back = scaler.inverse_transform(X_scaled)
- assert X_scaled_back is not X
- assert X_scaled_back is not X_scaled
- assert_array_almost_equal(X_scaled_back, X)
- X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
- assert X_csr_scaled_back is not X_csr
- assert X_csr_scaled_back is not X_csr_scaled
- assert_array_almost_equal(X_csr_scaled_back.toarray(), X)
- X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc())
- assert X_csc_scaled_back is not X_csc
- assert X_csc_scaled_back is not X_csc_scaled
- assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
- @pytest.mark.parametrize("with_mean", [True, False])
- @pytest.mark.parametrize("with_std", [True, False])
- @pytest.mark.parametrize(
- "array_constructor", [np.asarray, sparse.csc_matrix, sparse.csr_matrix]
- )
- def test_scaler_n_samples_seen_with_nan(with_mean, with_std, array_constructor):
- X = np.array(
- [[0, 1, 3], [np.nan, 6, 10], [5, 4, np.nan], [8, 0, np.nan]], dtype=np.float64
- )
- X = array_constructor(X)
- if sparse.issparse(X) and with_mean:
- pytest.skip("'with_mean=True' cannot be used with sparse matrix.")
- transformer = StandardScaler(with_mean=with_mean, with_std=with_std)
- transformer.fit(X)
- assert_array_equal(transformer.n_samples_seen_, np.array([3, 4, 2]))
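- # Sanity check on the expectation above: NaNs are ignored per feature, so
- # column 0 ([0, nan, 5, 8]) contributes 3 samples, column 1 ([1, 6, 4, 0])
- # contributes 4, and column 2 ([3, 10, nan, nan]) contributes 2, matching
- # n_samples_seen_ == [3, 4, 2].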
- def _check_identity_scalers_attributes(scaler_1, scaler_2):
- assert scaler_1.mean_ is scaler_2.mean_ is None
- assert scaler_1.var_ is scaler_2.var_ is None
- assert scaler_1.scale_ is scaler_2.scale_ is None
- assert scaler_1.n_samples_seen_ == scaler_2.n_samples_seen_
- def test_scaler_return_identity():
- # test that the scaler returns the identity when with_mean and with_std
- # are False
- X_dense = np.array([[0, 1, 3], [5, 6, 0], [8, 0, 10]], dtype=np.float64)
- X_csr = sparse.csr_matrix(X_dense)
- X_csc = X_csr.tocsc()
- transformer_dense = StandardScaler(with_mean=False, with_std=False)
- X_trans_dense = transformer_dense.fit_transform(X_dense)
- transformer_csr = clone(transformer_dense)
- X_trans_csr = transformer_csr.fit_transform(X_csr)
- transformer_csc = clone(transformer_dense)
- X_trans_csc = transformer_csc.fit_transform(X_csc)
- assert_allclose_dense_sparse(X_trans_csr, X_csr)
- assert_allclose_dense_sparse(X_trans_csc, X_csc)
- assert_allclose(X_trans_dense, X_dense)
- for trans_1, trans_2 in itertools.combinations(
- [transformer_dense, transformer_csr, transformer_csc], 2
- ):
- _check_identity_scalers_attributes(trans_1, trans_2)
- transformer_dense.partial_fit(X_dense)
- transformer_csr.partial_fit(X_csr)
- transformer_csc.partial_fit(X_csc)
- for trans_1, trans_2 in itertools.combinations(
- [transformer_dense, transformer_csr, transformer_csc], 2
- ):
- _check_identity_scalers_attributes(trans_1, trans_2)
- transformer_dense.fit(X_dense)
- transformer_csr.fit(X_csr)
- transformer_csc.fit(X_csc)
- for trans_1, trans_2 in itertools.combinations(
- [transformer_dense, transformer_csr, transformer_csc], 2
- ):
- _check_identity_scalers_attributes(trans_1, trans_2)
- def test_scaler_int():
- # test that the scaler converts integer input to floating point
- # for both sparse and dense matrices
- rng = np.random.RandomState(42)
- X = rng.randint(20, size=(4, 5))
- X[:, 0] = 0 # first feature is always zero
- X_csr = sparse.csr_matrix(X)
- X_csc = sparse.csc_matrix(X)
- null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
- with warnings.catch_warnings(record=True):
- X_null = null_transform.fit_transform(X_csr)
- assert_array_equal(X_null.data, X_csr.data)
- X_orig = null_transform.inverse_transform(X_null)
- assert_array_equal(X_orig.data, X_csr.data)
- with warnings.catch_warnings(record=True):
- scaler = StandardScaler(with_mean=False).fit(X)
- X_scaled = scaler.transform(X, copy=True)
- assert not np.any(np.isnan(X_scaled))
- with warnings.catch_warnings(record=True):
- scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
- X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
- assert not np.any(np.isnan(X_csr_scaled.data))
- with warnings.catch_warnings(record=True):
- scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
- X_csc_scaled = scaler_csc.transform(X_csc, copy=True)
- assert not np.any(np.isnan(X_csc_scaled.data))
- assert_array_almost_equal(scaler.mean_, scaler_csr.mean_)
- assert_array_almost_equal(scaler.var_, scaler_csr.var_)
- assert_array_almost_equal(scaler.scale_, scaler_csr.scale_)
- assert_array_almost_equal(scaler.mean_, scaler_csc.mean_)
- assert_array_almost_equal(scaler.var_, scaler_csc.var_)
- assert_array_almost_equal(scaler.scale_, scaler_csc.scale_)
- assert_array_almost_equal(
- X_scaled.mean(axis=0), [0.0, 1.109, 1.856, 21.0, 1.559], 2
- )
- assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
- X_csr_scaled_mean, X_csr_scaled_var = mean_variance_axis(
- X_csr_scaled.astype(float), 0
- )
- assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
- assert_array_almost_equal(X_csr_scaled_var, X_scaled.var(axis=0))
- # Check that X has not been modified (copy)
- assert X_scaled is not X
- assert X_csr_scaled is not X_csr
- X_scaled_back = scaler.inverse_transform(X_scaled)
- assert X_scaled_back is not X
- assert X_scaled_back is not X_scaled
- assert_array_almost_equal(X_scaled_back, X)
- X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
- assert X_csr_scaled_back is not X_csr
- assert X_csr_scaled_back is not X_csr_scaled
- assert_array_almost_equal(X_csr_scaled_back.toarray(), X)
- X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc())
- assert X_csc_scaled_back is not X_csc
- assert X_csc_scaled_back is not X_csc_scaled
- assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
- def test_scaler_without_copy():
- # Check that StandardScaler.fit does not change input
- rng = np.random.RandomState(42)
- X = rng.randn(4, 5)
- X[:, 0] = 0.0 # first feature is always zero
- X_csr = sparse.csr_matrix(X)
- X_csc = sparse.csc_matrix(X)
- X_copy = X.copy()
- StandardScaler(copy=False).fit(X)
- assert_array_equal(X, X_copy)
- X_csr_copy = X_csr.copy()
- StandardScaler(with_mean=False, copy=False).fit(X_csr)
- assert_array_equal(X_csr.toarray(), X_csr_copy.toarray())
- X_csc_copy = X_csc.copy()
- StandardScaler(with_mean=False, copy=False).fit(X_csc)
- assert_array_equal(X_csc.toarray(), X_csc_copy.toarray())
- def test_scale_sparse_with_mean_raise_exception():
- rng = np.random.RandomState(42)
- X = rng.randn(4, 5)
- X_csr = sparse.csr_matrix(X)
- X_csc = sparse.csc_matrix(X)
- # check scaling and fit with direct calls on sparse data
- with pytest.raises(ValueError):
- scale(X_csr, with_mean=True)
- with pytest.raises(ValueError):
- StandardScaler(with_mean=True).fit(X_csr)
- with pytest.raises(ValueError):
- scale(X_csc, with_mean=True)
- with pytest.raises(ValueError):
- StandardScaler(with_mean=True).fit(X_csc)
- # check transform and inverse_transform after a fit on a dense array
- scaler = StandardScaler(with_mean=True).fit(X)
- with pytest.raises(ValueError):
- scaler.transform(X_csr)
- with pytest.raises(ValueError):
- scaler.transform(X_csc)
- X_transformed_csr = sparse.csr_matrix(scaler.transform(X))
- with pytest.raises(ValueError):
- scaler.inverse_transform(X_transformed_csr)
- X_transformed_csc = sparse.csc_matrix(scaler.transform(X))
- with pytest.raises(ValueError):
- scaler.inverse_transform(X_transformed_csc)
- def test_scale_input_finiteness_validation():
- # Check that non-finite inputs raise a ValueError
- X = [[np.inf, 5, 6, 7, 8]]
- with pytest.raises(
- ValueError, match="Input contains infinity or a value too large"
- ):
- scale(X)
- def test_robust_scaler_error_sparse():
- X_sparse = sparse.rand(1000, 10)
- scaler = RobustScaler(with_centering=True)
- err_msg = "Cannot center sparse matrices"
- with pytest.raises(ValueError, match=err_msg):
- scaler.fit(X_sparse)
- @pytest.mark.parametrize("with_centering", [True, False])
- @pytest.mark.parametrize("with_scaling", [True, False])
- @pytest.mark.parametrize("X", [np.random.randn(10, 3), sparse.rand(10, 3, density=0.5)])
- def test_robust_scaler_attributes(X, with_centering, with_scaling):
- # check consistent type of attributes
- if with_centering and sparse.issparse(X):
- pytest.skip("RobustScaler cannot center sparse matrix")
- scaler = RobustScaler(with_centering=with_centering, with_scaling=with_scaling)
- scaler.fit(X)
- if with_centering:
- assert isinstance(scaler.center_, np.ndarray)
- else:
- assert scaler.center_ is None
- if with_scaling:
- assert isinstance(scaler.scale_, np.ndarray)
- else:
- assert scaler.scale_ is None
- def test_robust_scaler_col_zero_sparse():
- # check that the scaler works when no data is materialized in a column
- # of a sparse matrix
- X = np.random.randn(10, 5)
- X[:, 0] = 0
- X = sparse.csr_matrix(X)
- scaler = RobustScaler(with_centering=False)
- scaler.fit(X)
- assert scaler.scale_[0] == pytest.approx(1)
- X_trans = scaler.transform(X)
- assert_allclose(X[:, 0].toarray(), X_trans[:, 0].toarray())
- def test_robust_scaler_2d_arrays():
- # Test robust scaling of 2d array along first axis
- rng = np.random.RandomState(0)
- X = rng.randn(4, 5)
- X[:, 0] = 0.0 # first feature is always zero
- scaler = RobustScaler()
- X_scaled = scaler.fit(X).transform(X)
- assert_array_almost_equal(np.median(X_scaled, axis=0), 5 * [0.0])
- assert_array_almost_equal(X_scaled.std(axis=0)[0], 0)
- @pytest.mark.parametrize("density", [0, 0.05, 0.1, 0.5, 1])
- @pytest.mark.parametrize("strictly_signed", ["positive", "negative", "zeros", None])
- def test_robust_scaler_equivalence_dense_sparse(density, strictly_signed):
- # Check the equivalence of the fitting with dense and sparse matrices
- X_sparse = sparse.rand(1000, 5, density=density).tocsc()
- if strictly_signed == "positive":
- X_sparse.data = np.abs(X_sparse.data)
- elif strictly_signed == "negative":
- X_sparse.data = -np.abs(X_sparse.data)
- elif strictly_signed == "zeros":
- X_sparse.data = np.zeros(X_sparse.data.shape, dtype=np.float64)
- X_dense = X_sparse.toarray()
- scaler_sparse = RobustScaler(with_centering=False)
- scaler_dense = RobustScaler(with_centering=False)
- scaler_sparse.fit(X_sparse)
- scaler_dense.fit(X_dense)
- assert_allclose(scaler_sparse.scale_, scaler_dense.scale_)
- def test_robust_scaler_transform_one_row_csr():
- # Check RobustScaler on transforming a csr matrix with one row
- rng = np.random.RandomState(0)
- X = rng.randn(4, 5)
- single_row = np.array([[0.1, 1.0, 2.0, 0.0, -1.0]])
- scaler = RobustScaler(with_centering=False)
- scaler = scaler.fit(X)
- row_trans = scaler.transform(sparse.csr_matrix(single_row))
- row_expected = single_row / scaler.scale_
- assert_array_almost_equal(row_trans.toarray(), row_expected)
- row_scaled_back = scaler.inverse_transform(row_trans)
- assert_array_almost_equal(single_row, row_scaled_back.toarray())
- def test_robust_scaler_iris():
- X = iris.data
- scaler = RobustScaler()
- X_trans = scaler.fit_transform(X)
- assert_array_almost_equal(np.median(X_trans, axis=0), 0)
- X_trans_inv = scaler.inverse_transform(X_trans)
- assert_array_almost_equal(X, X_trans_inv)
- q = np.percentile(X_trans, q=(25, 75), axis=0)
- iqr = q[1] - q[0]
- assert_array_almost_equal(iqr, 1)
- def test_robust_scaler_iris_quantiles():
- X = iris.data
- scaler = RobustScaler(quantile_range=(10, 90))
- X_trans = scaler.fit_transform(X)
- assert_array_almost_equal(np.median(X_trans, axis=0), 0)
- X_trans_inv = scaler.inverse_transform(X_trans)
- assert_array_almost_equal(X, X_trans_inv)
- q = np.percentile(X_trans, q=(10, 90), axis=0)
- q_range = q[1] - q[0]
- assert_array_almost_equal(q_range, 1)
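- # In both iris tests above, the assertion on the quantile range follows
- # from RobustScaler dividing by (q_max - q_min) of the training data: after
- # the transform, that same quantile range of the output collapses to 1.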
- def test_quantile_transform_iris():
- X = iris.data
- # uniform output distribution
- transformer = QuantileTransformer(n_quantiles=30)
- X_trans = transformer.fit_transform(X)
- X_trans_inv = transformer.inverse_transform(X_trans)
- assert_array_almost_equal(X, X_trans_inv)
- # normal output distribution
- transformer = QuantileTransformer(n_quantiles=30, output_distribution="normal")
- X_trans = transformer.fit_transform(X)
- X_trans_inv = transformer.inverse_transform(X_trans)
- assert_array_almost_equal(X, X_trans_inv)
- # make sure it is possible to take the inverse of a sparse matrix
- # which contains negative values; this is the case for the iris dataset
- X_sparse = sparse.csc_matrix(X)
- X_sparse_tran = transformer.fit_transform(X_sparse)
- X_sparse_tran_inv = transformer.inverse_transform(X_sparse_tran)
- assert_array_almost_equal(X_sparse.A, X_sparse_tran_inv.A)
- def test_quantile_transform_check_error():
- X = np.transpose(
- [
- [0, 25, 50, 0, 0, 0, 75, 0, 0, 100],
- [2, 4, 0, 0, 6, 8, 0, 10, 0, 0],
- [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1],
- ]
- )
- X = sparse.csc_matrix(X)
- X_neg = np.transpose(
- [
- [0, 25, 50, 0, 0, 0, 75, 0, 0, 100],
- [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0],
- [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1],
- ]
- )
- X_neg = sparse.csc_matrix(X_neg)
- err_msg = (
- "The number of quantiles cannot be greater than "
- "the number of samples used. Got 1000 quantiles "
- "and 10 samples."
- )
- with pytest.raises(ValueError, match=err_msg):
- QuantileTransformer(subsample=10).fit(X)
- transformer = QuantileTransformer(n_quantiles=10)
- err_msg = "QuantileTransformer only accepts non-negative sparse matrices."
- with pytest.raises(ValueError, match=err_msg):
- transformer.fit(X_neg)
- transformer.fit(X)
- err_msg = "QuantileTransformer only accepts non-negative sparse matrices."
- with pytest.raises(ValueError, match=err_msg):
- transformer.transform(X_neg)
- X_bad_feat = np.transpose(
- [[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]
- )
- err_msg = (
- "X has 2 features, but QuantileTransformer is expecting 3 features as input."
- )
- with pytest.raises(ValueError, match=err_msg):
- transformer.inverse_transform(X_bad_feat)
- transformer = QuantileTransformer(n_quantiles=10).fit(X)
- # check that an error is raised if input is scalar
- with pytest.raises(ValueError, match="Expected 2D array, got scalar array instead"):
- transformer.transform(10)
- # check that a warning is raised if n_quantiles > n_samples
- transformer = QuantileTransformer(n_quantiles=100)
- warn_msg = "n_quantiles is set to n_samples"
- with pytest.warns(UserWarning, match=warn_msg) as record:
- transformer.fit(X)
- assert len(record) == 1
- assert transformer.n_quantiles_ == X.shape[0]
- def test_quantile_transform_sparse_ignore_zeros():
- X = np.array([[0, 1], [0, 0], [0, 2], [0, 2], [0, 1]])
- X_sparse = sparse.csc_matrix(X)
- transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5)
- # dense case -> warning raised
- warning_message = (
- "'ignore_implicit_zeros' takes effect"
- " only with sparse matrix. This parameter has no"
- " effect."
- )
- with pytest.warns(UserWarning, match=warning_message):
- transformer.fit(X)
- X_expected = np.array([[0, 0], [0, 0], [0, 1], [0, 1], [0, 0]])
- X_trans = transformer.fit_transform(X_sparse)
- assert_almost_equal(X_expected, X_trans.A)
- # consider the case where sparse entries are missing values and user-given
- # zeros are to be considered
- X_data = np.array([0, 0, 1, 0, 2, 2, 1, 0, 1, 2, 0])
- X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])
- X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8])
- X_sparse = sparse.csc_matrix((X_data, (X_row, X_col)))
- X_trans = transformer.fit_transform(X_sparse)
- X_expected = np.array(
- [
- [0.0, 0.5],
- [0.0, 0.0],
- [0.0, 1.0],
- [0.0, 1.0],
- [0.0, 0.5],
- [0.0, 0.0],
- [0.0, 0.5],
- [0.0, 1.0],
- [0.0, 0.0],
- ]
- )
- assert_almost_equal(X_expected, X_trans.A)
- transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5)
- X_data = np.array([-1, -1, 1, 0, 0, 0, 1, -1, 1])
- X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1])
- X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6])
- X_sparse = sparse.csc_matrix((X_data, (X_row, X_col)))
- X_trans = transformer.fit_transform(X_sparse)
- X_expected = np.array(
- [[0, 1], [0, 0.375], [0, 0.375], [0, 0.375], [0, 1], [0, 0], [0, 1]]
- )
- assert_almost_equal(X_expected, X_trans.A)
- assert_almost_equal(X_sparse.A, transformer.inverse_transform(X_trans).A)
- # check in conjunction with subsampling
- transformer = QuantileTransformer(
- ignore_implicit_zeros=True, n_quantiles=5, subsample=8, random_state=0
- )
- X_trans = transformer.fit_transform(X_sparse)
- assert_almost_equal(X_expected, X_trans.A)
- assert_almost_equal(X_sparse.A, transformer.inverse_transform(X_trans).A)
- def test_quantile_transform_dense_toy():
- X = np.array(
- [[0, 2, 2.6], [25, 4, 4.1], [50, 6, 2.3], [75, 8, 9.5], [100, 10, 0.1]]
- )
- transformer = QuantileTransformer(n_quantiles=5)
- transformer.fit(X)
- # using a uniform output, each entry of X should be mapped between 0 and 1
- # and equally spaced (see the worked example after this test)
- X_trans = transformer.fit_transform(X)
- X_expected = np.tile(np.linspace(0, 1, num=5), (3, 1)).T
- assert_almost_equal(np.sort(X_trans, axis=0), X_expected)
- X_test = np.array(
- [
- [-1, 1, 0],
- [101, 11, 10],
- ]
- )
- X_expected = np.array(
- [
- [0, 0, 0],
- [1, 1, 1],
- ]
- )
- assert_array_almost_equal(transformer.transform(X_test), X_expected)
- X_trans_inv = transformer.inverse_transform(X_trans)
- assert_array_almost_equal(X, X_trans_inv)
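- # Worked example for the uniform mapping above: with 5 training samples per
- # feature, the empirical CDF places the sorted values at
- # np.linspace(0, 1, num=5) == [0.0, 0.25, 0.5, 0.75, 1.0], which is exactly
- # the X_expected grid, and out-of-range test values saturate at 0 or 1.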
- def test_quantile_transform_subsampling():
- # Test that subsampling the input yields consistent results. We check that
- # the computed quantiles almost map to a [0, 1] vector of equally spaced
- # values. The infinity norm is checked to be smaller than a given
- # threshold. This is repeated 5 times (see the note after this test).
- # dense support
- n_samples = 1000000
- n_quantiles = 1000
- X = np.sort(np.random.sample((n_samples, 1)), axis=0)
- ROUND = 5
- inf_norm_arr = []
- for random_state in range(ROUND):
- transformer = QuantileTransformer(
- random_state=random_state,
- n_quantiles=n_quantiles,
- subsample=n_samples // 10,
- )
- transformer.fit(X)
- diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_)
- inf_norm = np.max(np.abs(diff))
- assert inf_norm < 1e-2
- inf_norm_arr.append(inf_norm)
- # each random subsampling yields a unique approximation to the expected
- # linspace CDF
- assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr)
- # sparse support
- X = sparse.rand(n_samples, 1, density=0.99, format="csc", random_state=0)
- inf_norm_arr = []
- for random_state in range(ROUND):
- transformer = QuantileTransformer(
- random_state=random_state,
- n_quantiles=n_quantiles,
- subsample=n_samples // 10,
- )
- transformer.fit(X)
- diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_)
- inf_norm = np.max(np.abs(diff))
- assert inf_norm < 1e-1
- inf_norm_arr.append(inf_norm)
- # each random subsampling yields a unique approximation to the expected
- # linspace CDF
- assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr)
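- # NOTE: helper sketch (editor's illustration, not part of the suite)
- # restating the subsampling check above: for sorted uniform data the
- # learned quantiles should stay close, in infinity norm, to an equally
- # spaced grid on [0, 1].
- def _quantile_inf_norm_sketch(transformer, n_quantiles):
-     expected = np.linspace(0, 1, n_quantiles)
-     return np.max(np.abs(expected - np.ravel(transformer.quantiles_)))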
- def test_quantile_transform_sparse_toy():
- X = np.array(
- [
- [0.0, 2.0, 0.0],
- [25.0, 4.0, 0.0],
- [50.0, 0.0, 2.6],
- [0.0, 0.0, 4.1],
- [0.0, 6.0, 0.0],
- [0.0, 8.0, 0.0],
- [75.0, 0.0, 2.3],
- [0.0, 10.0, 0.0],
- [0.0, 0.0, 9.5],
- [100.0, 0.0, 0.1],
- ]
- )
- X = sparse.csc_matrix(X)
- transformer = QuantileTransformer(n_quantiles=10)
- transformer.fit(X)
- X_trans = transformer.fit_transform(X)
- assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0)
- assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0)
- X_trans_inv = transformer.inverse_transform(X_trans)
- assert_array_almost_equal(X.toarray(), X_trans_inv.toarray())
- transformer_dense = QuantileTransformer(n_quantiles=10).fit(X.toarray())
- X_trans = transformer_dense.transform(X)
- assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0)
- assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0)
- X_trans_inv = transformer_dense.inverse_transform(X_trans)
- assert_array_almost_equal(X.toarray(), X_trans_inv.toarray())
- def test_quantile_transform_axis1():
- X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]])
- X_trans_a0 = quantile_transform(X.T, axis=0, n_quantiles=5)
- X_trans_a1 = quantile_transform(X, axis=1, n_quantiles=5)
- assert_array_almost_equal(X_trans_a0, X_trans_a1.T)
- def test_quantile_transform_bounds():
- # Lower and upper bounds are manually mapped. We check that, in the case
- # of a constant feature and of a binary feature, the bounds are properly
- # mapped.
- X_dense = np.array([[0, 0], [0, 0], [1, 0]])
- X_sparse = sparse.csc_matrix(X_dense)
- # check sparse and dense are consistent
- X_trans = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(X_dense)
- assert_array_almost_equal(X_trans, X_dense)
- X_trans_sp = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(
- X_sparse
- )
- assert_array_almost_equal(X_trans_sp.A, X_dense)
- assert_array_almost_equal(X_trans, X_trans_sp.A)
- # check the consistency of the bounds by learning on 1 matrix
- # and transforming another
- X = np.array([[0, 1], [0, 0.5], [1, 0]])
- X1 = np.array([[0, 0.1], [0, 0.5], [1, 0.1]])
- transformer = QuantileTransformer(n_quantiles=3).fit(X)
- X_trans = transformer.transform(X1)
- assert_array_almost_equal(X_trans, X1)
- # check that values outside of the range learned will be mapped properly.
- X = np.random.random((1000, 1))
- transformer = QuantileTransformer()
- transformer.fit(X)
- assert transformer.transform([[-10]]) == transformer.transform([[np.min(X)]])
- assert transformer.transform([[10]]) == transformer.transform([[np.max(X)]])
- assert transformer.inverse_transform([[-10]]) == transformer.inverse_transform(
- [[np.min(transformer.references_)]]
- )
- assert transformer.inverse_transform([[10]]) == transformer.inverse_transform(
- [[np.max(transformer.references_)]]
- )
- def test_quantile_transform_and_inverse():
- X_1 = iris.data
- X_2 = np.array([[0.0], [BOUNDS_THRESHOLD / 10], [1.5], [2], [3], [3], [4]])
- for X in [X_1, X_2]:
- transformer = QuantileTransformer(n_quantiles=1000, random_state=0)
- X_trans = transformer.fit_transform(X)
- X_trans_inv = transformer.inverse_transform(X_trans)
- assert_array_almost_equal(X, X_trans_inv, decimal=9)
- def test_quantile_transform_nan():
- X = np.array([[np.nan, 0, 0, 1], [np.nan, np.nan, 0, 0.5], [np.nan, 1, 1, 0]])
- transformer = QuantileTransformer(n_quantiles=10, random_state=42)
- transformer.fit_transform(X)
- # check that the quantiles of the first column are all NaN
- assert np.isnan(transformer.quantiles_[:, 0]).all()
- # all other columns should not contain NaN
- assert not np.isnan(transformer.quantiles_[:, 1:]).any()
- @pytest.mark.parametrize("array_type", ["array", "sparse"])
- def test_quantile_transformer_sorted_quantiles(array_type):
- # Non-regression test for:
- # https://github.com/scikit-learn/scikit-learn/issues/15733
- # Taken from upstream bug report:
- # https://github.com/numpy/numpy/issues/14685
- X = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10)
- X = 0.1 * X.reshape(-1, 1)
- X = _convert_container(X, array_type)
- n_quantiles = 100
- qt = QuantileTransformer(n_quantiles=n_quantiles).fit(X)
- # Check that the estimated quantile thresholds are monotonically
- # increasing (see the sketch after this test):
- quantiles = qt.quantiles_[:, 0]
- assert len(quantiles) == 100
- assert all(np.diff(quantiles) >= 0)
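- # NOTE: an editor's sketch of one way to enforce the monotonicity asserted
- # above: np.maximum.accumulate turns a nearly sorted quantile array into a
- # non-decreasing one. Whether the library applies exactly this repair is an
- # assumption made here for illustration.
- def _enforce_monotonic_quantiles_sketch(quantiles):
-     return np.maximum.accumulate(quantiles)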
- def test_robust_scaler_invalid_range():
- for range_ in [
- (-1, 90),
- (-2, -3),
- (10, 101),
- (100.5, 101),
- (90, 50),
- ]:
- scaler = RobustScaler(quantile_range=range_)
- with pytest.raises(ValueError, match=r"Invalid quantile range: \("):
- scaler.fit(iris.data)
- def test_scale_function_without_centering():
- rng = np.random.RandomState(42)
- X = rng.randn(4, 5)
- X[:, 0] = 0.0 # first feature is always zero
- X_csr = sparse.csr_matrix(X)
- X_scaled = scale(X, with_mean=False)
- assert not np.any(np.isnan(X_scaled))
- X_csr_scaled = scale(X_csr, with_mean=False)
- assert not np.any(np.isnan(X_csr_scaled.data))
- # test csc has same outcome
- X_csc_scaled = scale(X_csr.tocsc(), with_mean=False)
- assert_array_almost_equal(X_scaled, X_csc_scaled.toarray())
- # a ValueError is raised on axis != 0
- with pytest.raises(ValueError):
- scale(X_csr, with_mean=False, axis=1)
- assert_array_almost_equal(
- X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2
- )
- assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
- # Check that X has not been copied
- assert X_scaled is not X
- X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0)
- assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
- assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
- # null scale
- X_csr_scaled = scale(X_csr, with_mean=False, with_std=False, copy=True)
- assert_array_almost_equal(X_csr.toarray(), X_csr_scaled.toarray())
- def test_robust_scale_axis1():
- X = iris.data
- X_trans = robust_scale(X, axis=1)
- assert_array_almost_equal(np.median(X_trans, axis=1), 0)
- q = np.percentile(X_trans, q=(25, 75), axis=1)
- iqr = q[1] - q[0]
- assert_array_almost_equal(iqr, 1)
- def test_robust_scale_1d_array():
- X = iris.data[:, 1]
- X_trans = robust_scale(X)
- assert_array_almost_equal(np.median(X_trans), 0)
- q = np.percentile(X_trans, q=(25, 75))
- iqr = q[1] - q[0]
- assert_array_almost_equal(iqr, 1)
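- # NOTE: minimal editor's sketch of the identity the two tests above rely
- # on: robust scaling centers on the median and divides by the interquartile
- # range. Simplified 1D illustration only (no axis/copy/quantile_range
- # handling).
- def _robust_scale_sketch(x):
-     q25, q75 = np.percentile(x, (25, 75))
-     return (x - np.median(x)) / (q75 - q25)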
- def test_robust_scaler_zero_variance_features():
- # Check RobustScaler on toy data with zero variance features
- X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]]
- scaler = RobustScaler()
- X_trans = scaler.fit_transform(X)
- # NOTE: for such a small sample size, what we expect in the third column
- # depends HEAVILY on the method used to calculate quantiles. The values
- # here were calculated to fit the quantiles produced by np.percentile
- # using numpy 1.9. Calculating quantiles with
- # scipy.stats.mstats.scoreatquantile or scipy.stats.mstats.mquantiles
- # would yield very different results!
- X_expected = [[0.0, 0.0, +0.0], [0.0, 0.0, -1.0], [0.0, 0.0, +1.0]]
- assert_array_almost_equal(X_trans, X_expected)
- X_trans_inv = scaler.inverse_transform(X_trans)
- assert_array_almost_equal(X, X_trans_inv)
- # make sure new data gets transformed correctly
- X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]]
- X_trans_new = scaler.transform(X_new)
- X_expected_new = [[+0.0, 1.0, +0.0], [-1.0, 0.0, -0.83333], [+0.0, 0.0, +1.66667]]
- assert_array_almost_equal(X_trans_new, X_expected_new, decimal=3)
- def test_robust_scaler_unit_variance():
- # Check RobustScaler with unit_variance=True on standard normal data with
- # outliers
- rng = np.random.RandomState(42)
- X = rng.randn(1000000, 1)
- X_with_outliers = np.vstack([X, np.ones((100, 1)) * 100, np.ones((100, 1)) * -100])
- quantile_range = (1, 99)
- robust_scaler = RobustScaler(quantile_range=quantile_range, unit_variance=True).fit(
- X_with_outliers
- )
- X_trans = robust_scaler.transform(X)
- assert robust_scaler.center_ == pytest.approx(0, abs=1e-3)
- assert robust_scaler.scale_ == pytest.approx(1, abs=1e-2)
- assert X_trans.std() == pytest.approx(1, abs=1e-2)
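- # NOTE: sketch of the unit_variance correction checked above, assuming the
- # scale is divided by the width of the same quantile range under a standard
- # normal distribution; an editor's illustration, not the guaranteed
- # internal formula.
- def _unit_variance_adjustment_sketch(q_min=1.0, q_max=99.0):
-     from scipy import stats
-     return stats.norm.ppf(q_max / 100.0) - stats.norm.ppf(q_min / 100.0)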
- def test_maxabs_scaler_zero_variance_features():
- # Check MaxAbsScaler on toy data with zero variance features
- X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.3], [0.0, 1.0, +1.5], [0.0, 0.0, +0.0]]
- scaler = MaxAbsScaler()
- X_trans = scaler.fit_transform(X)
- X_expected = [
- [0.0, 1.0, 1.0 / 3.0],
- [0.0, 1.0, -0.2],
- [0.0, 1.0, 1.0],
- [0.0, 0.0, 0.0],
- ]
- assert_array_almost_equal(X_trans, X_expected)
- X_trans_inv = scaler.inverse_transform(X_trans)
- assert_array_almost_equal(X, X_trans_inv)
- # make sure new data gets transformed correctly
- X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]]
- X_trans_new = scaler.transform(X_new)
- X_expected_new = [[+0.0, 2.0, 1.0 / 3.0], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.0]]
- assert_array_almost_equal(X_trans_new, X_expected_new, decimal=2)
- # function interface
- X_trans = maxabs_scale(X)
- assert_array_almost_equal(X_trans, X_expected)
- # sparse data
- X_csr = sparse.csr_matrix(X)
- X_csc = sparse.csc_matrix(X)
- X_trans_csr = scaler.fit_transform(X_csr)
- X_trans_csc = scaler.fit_transform(X_csc)
- X_expected = [
- [0.0, 1.0, 1.0 / 3.0],
- [0.0, 1.0, -0.2],
- [0.0, 1.0, 1.0],
- [0.0, 0.0, 0.0],
- ]
- assert_array_almost_equal(X_trans_csr.A, X_expected)
- assert_array_almost_equal(X_trans_csc.A, X_expected)
- X_trans_csr_inv = scaler.inverse_transform(X_trans_csr)
- X_trans_csc_inv = scaler.inverse_transform(X_trans_csc)
- assert_array_almost_equal(X, X_trans_csr_inv.A)
- assert_array_almost_equal(X, X_trans_csc_inv.A)
- def test_maxabs_scaler_large_negative_value():
- # Check MaxAbsScaler on toy data with a large negative value
- X = [
- [0.0, 1.0, +0.5, -1.0],
- [0.0, 1.0, -0.3, -0.5],
- [0.0, 1.0, -100.0, 0.0],
- [0.0, 0.0, +0.0, -2.0],
- ]
- scaler = MaxAbsScaler()
- X_trans = scaler.fit_transform(X)
- X_expected = [
- [0.0, 1.0, 0.005, -0.5],
- [0.0, 1.0, -0.003, -0.25],
- [0.0, 1.0, -1.0, 0.0],
- [0.0, 0.0, 0.0, -1.0],
- ]
- assert_array_almost_equal(X_trans, X_expected)
- def test_maxabs_scaler_transform_one_row_csr():
- # Check MaxAbsScaler on transforming csr matrix with one row
- X = sparse.csr_matrix([[0.5, 1.0, 1.0]])
- scaler = MaxAbsScaler()
- scaler = scaler.fit(X)
- X_trans = scaler.transform(X)
- X_expected = sparse.csr_matrix([[1.0, 1.0, 1.0]])
- assert_array_almost_equal(X_trans.toarray(), X_expected.toarray())
- X_scaled_back = scaler.inverse_transform(X_trans)
- assert_array_almost_equal(X.toarray(), X_scaled_back.toarray())
- def test_maxabs_scaler_1d():
- # Test scaling of dataset along single axis
- for X in [X_1row, X_1col, X_list_1row, X_list_1col]:
- scaler = MaxAbsScaler(copy=True)
- X_scaled = scaler.fit(X).transform(X)
- if isinstance(X, list):
- X = np.array(X) # cast only after scaling done
- if _check_dim_1axis(X) == 1:
- assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), np.ones(n_features))
- else:
- assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0)
- assert scaler.n_samples_seen_ == X.shape[0]
- # check inverse transform
- X_scaled_back = scaler.inverse_transform(X_scaled)
- assert_array_almost_equal(X_scaled_back, X)
- # Constant feature
- X = np.ones((5, 1))
- scaler = MaxAbsScaler()
- X_scaled = scaler.fit(X).transform(X)
- assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0)
- assert scaler.n_samples_seen_ == X.shape[0]
- # function interface
- X_1d = X_1row.ravel()
- max_abs = np.abs(X_1d).max()
- assert_array_almost_equal(X_1d / max_abs, maxabs_scale(X_1d, copy=True))
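- # NOTE: the 1D identity used by the function-interface check above, written
- # out as an editor's sketch: MaxAbs scaling divides by the maximum absolute
- # value, leaving signs intact. Illustrative helper, not the implementation.
- def _maxabs_sketch(x):
-     return x / np.abs(x).max()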
- def test_maxabs_scaler_partial_fit():
- # Test that partial_fit run over batches of several sizes gives the same
- # results as fit (see the running-max sketch after this test)
- X = X_2d[:100, :]
- n = X.shape[0]
- for chunk_size in [1, 2, 50, n, n + 42]:
- # Test max_abs_ at the end of the process
- scaler_batch = MaxAbsScaler().fit(X)
- scaler_incr = MaxAbsScaler()
- scaler_incr_csr = MaxAbsScaler()
- scaler_incr_csc = MaxAbsScaler()
- for batch in gen_batches(n, chunk_size):
- scaler_incr = scaler_incr.partial_fit(X[batch])
- X_csr = sparse.csr_matrix(X[batch])
- scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr)
- X_csc = sparse.csc_matrix(X[batch])
- scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc)
- assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
- assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csr.max_abs_)
- assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csc.max_abs_)
- assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
- assert scaler_batch.n_samples_seen_ == scaler_incr_csr.n_samples_seen_
- assert scaler_batch.n_samples_seen_ == scaler_incr_csc.n_samples_seen_
- assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
- assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_)
- assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_)
- assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X))
- # Test max_abs_ after 1 step
- batch0 = slice(0, chunk_size)
- scaler_batch = MaxAbsScaler().fit(X[batch0])
- scaler_incr = MaxAbsScaler().partial_fit(X[batch0])
- assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
- assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
- assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
- assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X))
- # Test max_abs_ until the end of the partial fits
- scaler_batch = MaxAbsScaler().fit(X)
- scaler_incr = MaxAbsScaler() # Clean estimator
- for i, batch in enumerate(gen_batches(n, chunk_size)):
- scaler_incr = scaler_incr.partial_fit(X[batch])
- assert_correct_incr(
- i,
- batch_start=batch.start,
- batch_stop=batch.stop,
- n=n,
- chunk_size=chunk_size,
- n_samples_seen=scaler_incr.n_samples_seen_,
- )
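- # NOTE: illustrative sketch (editor's helper, not library code) of why
- # batched partial_fit can match a single fit for MaxAbsScaler: the running
- # per-feature maximum of |X| is associative across batches.
- def _running_max_abs_sketch(batches):
-     running = None
-     for batch in batches:
-         batch_max = np.abs(batch).max(axis=0)
-         running = batch_max if running is None else np.maximum(running, batch_max)
-     return running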
- def test_normalizer_l1():
- rng = np.random.RandomState(0)
- X_dense = rng.randn(4, 5)
- X_sparse_unpruned = sparse.csr_matrix(X_dense)
- # set the row number 3 to zero
- X_dense[3, :] = 0.0
- # set the row number 3 to zero without pruning (can happen in real life)
- indptr_3 = X_sparse_unpruned.indptr[3]
- indptr_4 = X_sparse_unpruned.indptr[4]
- X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0
- # build the pruned variant using the regular constructor
- X_sparse_pruned = sparse.csr_matrix(X_dense)
- # check inputs that support the no-copy optim
- for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):
- normalizer = Normalizer(norm="l1", copy=True)
- X_norm = normalizer.transform(X)
- assert X_norm is not X
- X_norm1 = toarray(X_norm)
- normalizer = Normalizer(norm="l1", copy=False)
- X_norm = normalizer.transform(X)
- assert X_norm is X
- X_norm2 = toarray(X_norm)
- for X_norm in (X_norm1, X_norm2):
- row_sums = np.abs(X_norm).sum(axis=1)
- for i in range(3):
- assert_almost_equal(row_sums[i], 1.0)
- assert_almost_equal(row_sums[3], 0.0)
- # check input for which copy=False won't prevent a copy
- for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix):
- X = init(X_dense)
- X_norm = normalizer = Normalizer(norm="l2", copy=False).transform(X)
- assert X_norm is not X
- assert sparse.issparse(X_norm) and X_norm.format == "csr"
- X_norm = toarray(X_norm)
- for i in range(3):
- assert_almost_equal(row_sums[i], 1.0)
- assert_almost_equal(la.norm(X_norm[3]), 0.0)
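- # NOTE: minimal editor's sketch of the row-wise L1 normalization asserted
- # above; all-zero rows are left untouched to avoid division by zero,
- # matching the expectation for row 3. Illustrative helper, not the
- # implementation.
- def _l1_row_normalize_sketch(X):
-     norms = np.abs(X).sum(axis=1, keepdims=True)
-     norms[norms == 0] = 1.0
-     return X / norms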
- def test_normalizer_l2():
- rng = np.random.RandomState(0)
- X_dense = rng.randn(4, 5)
- X_sparse_unpruned = sparse.csr_matrix(X_dense)
- # set the row number 3 to zero
- X_dense[3, :] = 0.0
- # set the row number 3 to zero without pruning (can happen in real life)
- indptr_3 = X_sparse_unpruned.indptr[3]
- indptr_4 = X_sparse_unpruned.indptr[4]
- X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0
- # build the pruned variant using the regular constructor
- X_sparse_pruned = sparse.csr_matrix(X_dense)
- # check inputs that support the no-copy optim
- for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):
- normalizer = Normalizer(norm="l2", copy=True)
- X_norm1 = normalizer.transform(X)
- assert X_norm1 is not X
- X_norm1 = toarray(X_norm1)
- normalizer = Normalizer(norm="l2", copy=False)
- X_norm2 = normalizer.transform(X)
- assert X_norm2 is X
- X_norm2 = toarray(X_norm2)
- for X_norm in (X_norm1, X_norm2):
- for i in range(3):
- assert_almost_equal(la.norm(X_norm[i]), 1.0)
- assert_almost_equal(la.norm(X_norm[3]), 0.0)
- # check input for which copy=False won't prevent a copy
- for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix):
- X = init(X_dense)
- X_norm = normalizer = Normalizer(norm="l2", copy=False).transform(X)
- assert X_norm is not X
- assert sparse.issparse(X_norm) and X_norm.format == "csr"
- X_norm = toarray(X_norm)
- for i in range(3):
- assert_almost_equal(la.norm(X_norm[i]), 1.0)
- assert_almost_equal(la.norm(X_norm[3]), 0.0)
- def test_normalizer_max():
- rng = np.random.RandomState(0)
- X_dense = rng.randn(4, 5)
- X_sparse_unpruned = sparse.csr_matrix(X_dense)
- # set the row number 3 to zero
- X_dense[3, :] = 0.0
- # set the row number 3 to zero without pruning (can happen in real life)
- indptr_3 = X_sparse_unpruned.indptr[3]
- indptr_4 = X_sparse_unpruned.indptr[4]
- X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0
- # build the pruned variant using the regular constructor
- X_sparse_pruned = sparse.csr_matrix(X_dense)
- # check inputs that support the no-copy optim
- for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):
- normalizer = Normalizer(norm="max", copy=True)
- X_norm1 = normalizer.transform(X)
- assert X_norm1 is not X
- X_norm1 = toarray(X_norm1)
- normalizer = Normalizer(norm="max", copy=False)
- X_norm2 = normalizer.transform(X)
- assert X_norm2 is X
- X_norm2 = toarray(X_norm2)
- for X_norm in (X_norm1, X_norm2):
- row_maxs = abs(X_norm).max(axis=1)
- for i in range(3):
- assert_almost_equal(row_maxs[i], 1.0)
- assert_almost_equal(row_maxs[3], 0.0)
- # check input for which copy=False won't prevent a copy
- for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix):
- X = init(X_dense)
- X_norm = normalizer = Normalizer(norm="l2", copy=False).transform(X)
- assert X_norm is not X
- assert sparse.issparse(X_norm) and X_norm.format == "csr"
- X_norm = toarray(X_norm)
- for i in range(3):
- assert_almost_equal(row_maxs[i], 1.0)
- assert_almost_equal(la.norm(X_norm[3]), 0.0)
- def test_normalizer_max_sign():
- # check that we normalize by a positive number even for negative data
- rng = np.random.RandomState(0)
- X_dense = rng.randn(4, 5)
- # set the row number 3 to zero
- X_dense[3, :] = 0.0
- # check for mixed data where the value with
- # largest magnitude is negative
- X_dense[2, abs(X_dense[2, :]).argmax()] *= -1
- X_all_neg = -np.abs(X_dense)
- X_all_neg_sparse = sparse.csr_matrix(X_all_neg)
- for X in (X_dense, X_all_neg, X_all_neg_sparse):
- normalizer = Normalizer(norm="max")
- X_norm = normalizer.transform(X)
- assert X_norm is not X
- X_norm = toarray(X_norm)
- assert_array_equal(np.sign(X_norm), np.sign(toarray(X)))
- def test_normalize():
- # Test normalize function
- # Only tests functionality not used by the tests for Normalizer.
- X = np.random.RandomState(37).randn(3, 2)
- assert_array_equal(normalize(X, copy=False), normalize(X.T, axis=0, copy=False).T)
- rs = np.random.RandomState(0)
- X_dense = rs.randn(10, 5)
- X_sparse = sparse.csr_matrix(X_dense)
- ones = np.ones((10))
- for X in (X_dense, X_sparse):
- for dtype in (np.float32, np.float64):
- for norm in ("l1", "l2"):
- X = X.astype(dtype)
- X_norm = normalize(X, norm=norm)
- assert X_norm.dtype == dtype
- X_norm = toarray(X_norm)
- if norm == "l1":
- row_sums = np.abs(X_norm).sum(axis=1)
- else:
- X_norm_squared = X_norm**2
- row_sums = X_norm_squared.sum(axis=1)
- assert_array_almost_equal(row_sums, ones)
- # Test return_norm
- X_dense = np.array([[3.0, 0, 4.0], [1.0, 0.0, 0.0], [2.0, 3.0, 0.0]])
- for norm in ("l1", "l2", "max"):
- _, norms = normalize(X_dense, norm=norm, return_norm=True)
- if norm == "l1":
- assert_array_almost_equal(norms, np.array([7.0, 1.0, 5.0]))
- elif norm == "l2":
- assert_array_almost_equal(norms, np.array([5.0, 1.0, 3.60555127]))
- else:
- assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0]))
- X_sparse = sparse.csr_matrix(X_dense)
- for norm in ("l1", "l2"):
- with pytest.raises(NotImplementedError):
- normalize(X_sparse, norm=norm, return_norm=True)
- _, norms = normalize(X_sparse, norm="max", return_norm=True)
- assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0]))
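- # NOTE: an editor's sketch of the per-row norms behind the return_norm
- # values checked above for the dense toy matrix; plain NumPy arithmetic,
- # shown for illustration only.
- def _row_norms_sketch(X):
-     return {
-         "l1": np.abs(X).sum(axis=1),
-         "l2": np.sqrt((X**2).sum(axis=1)),
-         "max": np.abs(X).max(axis=1),
-     }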
- def test_binarizer():
- X_ = np.array([[1, 0, 5], [2, 3, -1]])
- for init in (np.array, list, sparse.csr_matrix, sparse.csc_matrix):
- X = init(X_.copy())
- binarizer = Binarizer(threshold=2.0, copy=True)
- X_bin = toarray(binarizer.transform(X))
- assert np.sum(X_bin == 0) == 4
- assert np.sum(X_bin == 1) == 2
- X_bin = binarizer.transform(X)
- assert sparse.issparse(X) == sparse.issparse(X_bin)
- binarizer = Binarizer(copy=True).fit(X)
- X_bin = toarray(binarizer.transform(X))
- assert X_bin is not X
- assert np.sum(X_bin == 0) == 2
- assert np.sum(X_bin == 1) == 4
- binarizer = Binarizer(copy=True)
- X_bin = binarizer.transform(X)
- assert X_bin is not X
- X_bin = toarray(X_bin)
- assert np.sum(X_bin == 0) == 2
- assert np.sum(X_bin == 1) == 4
- binarizer = Binarizer(copy=False)
- X_bin = binarizer.transform(X)
- if init is not list:
- assert X_bin is X
- binarizer = Binarizer(copy=False)
- X_float = np.array([[1, 0, 5], [2, 3, -1]], dtype=np.float64)
- X_bin = binarizer.transform(X_float)
- if init is not list:
- assert X_bin is X_float
- X_bin = toarray(X_bin)
- assert np.sum(X_bin == 0) == 2
- assert np.sum(X_bin == 1) == 4
- binarizer = Binarizer(threshold=-0.5, copy=True)
- for init in (np.array, list):
- X = init(X_.copy())
- X_bin = toarray(binarizer.transform(X))
- assert np.sum(X_bin == 0) == 1
- assert np.sum(X_bin == 1) == 5
- X_bin = binarizer.transform(X)
- # Cannot use threshold < 0 for sparse
- with pytest.raises(ValueError):
- binarizer.transform(sparse.csc_matrix(X))
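- # NOTE: the thresholding rule the counts above depend on, as a one-line
- # editor's sketch: entries strictly greater than the threshold map to 1,
- # everything else to 0. Illustrative, not the library's implementation.
- def _binarize_sketch(X, threshold=0.0):
-     return (np.asarray(X) > threshold).astype(np.int64)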
- def test_center_kernel():
- # Test that KernelCenterer is equivalent to StandardScaler
- # in feature space
- rng = np.random.RandomState(0)
- X_fit = rng.random_sample((5, 4))
- scaler = StandardScaler(with_std=False)
- scaler.fit(X_fit)
- X_fit_centered = scaler.transform(X_fit)
- K_fit = np.dot(X_fit, X_fit.T)
- # center fit time matrix
- centerer = KernelCenterer()
- K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T)
- K_fit_centered2 = centerer.fit_transform(K_fit)
- assert_array_almost_equal(K_fit_centered, K_fit_centered2)
- # center predict time matrix
- X_pred = rng.random_sample((2, 4))
- K_pred = np.dot(X_pred, X_fit.T)
- X_pred_centered = scaler.transform(X_pred)
- K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T)
- K_pred_centered2 = centerer.transform(K_pred)
- assert_array_almost_equal(K_pred_centered, K_pred_centered2)
- # check the coherence of the results with the method proposed in:
- # B. Schölkopf, A. Smola, and K.R. Müller,
- # "Nonlinear component analysis as a kernel eigenvalue problem"
- # equation (B.3)
- # K_centered3 = (I - 1_M) K (I - 1_M)
- # = K - 1_M K - K 1_M + 1_M K 1_M
- ones_M = np.ones_like(K_fit) / K_fit.shape[0]
- K_fit_centered3 = K_fit - ones_M @ K_fit - K_fit @ ones_M + ones_M @ K_fit @ ones_M
- assert_allclose(K_fit_centered, K_fit_centered3)
- # K_test_centered3 = (K_test - 1'_M K)(I - 1_M)
- # = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
- ones_prime_M = np.ones_like(K_pred) / K_fit.shape[0]
- K_pred_centered3 = (
- K_pred - ones_prime_M @ K_fit - K_pred @ ones_M + ones_prime_M @ K_fit @ ones_M
- )
- assert_allclose(K_pred_centered, K_pred_centered3)
- def test_kernelcenterer_non_linear_kernel():
- """Check kernel centering for non-linear kernel."""
- rng = np.random.RandomState(0)
- X, X_test = rng.randn(100, 50), rng.randn(20, 50)
- def phi(X):
- """Our mapping function phi."""
- return np.vstack(
- [
- np.clip(X, a_min=0, a_max=None),
- -np.clip(X, a_min=None, a_max=0),
- ]
- )
- phi_X = phi(X)
- phi_X_test = phi(X_test)
- # center the projection
- scaler = StandardScaler(with_std=False)
- phi_X_center = scaler.fit_transform(phi_X)
- phi_X_test_center = scaler.transform(phi_X_test)
- # create the different kernels
- K = phi_X @ phi_X.T
- K_test = phi_X_test @ phi_X.T
- K_center = phi_X_center @ phi_X_center.T
- K_test_center = phi_X_test_center @ phi_X_center.T
- kernel_centerer = KernelCenterer()
- kernel_centerer.fit(K)
- assert_allclose(kernel_centerer.transform(K), K_center)
- assert_allclose(kernel_centerer.transform(K_test), K_test_center)
- # check the coherence of the results with the method proposed in:
- # B. Schölkopf, A. Smola, and K.R. Müller,
- # "Nonlinear component analysis as a kernel eigenvalue problem"
- # equation (B.3)
- # K_centered = (I - 1_M) K (I - 1_M)
- # = K - 1_M K - K 1_M + 1_M K 1_M
- ones_M = np.ones_like(K) / K.shape[0]
- K_centered = K - ones_M @ K - K @ ones_M + ones_M @ K @ ones_M
- assert_allclose(kernel_centerer.transform(K), K_centered)
- # K_test_centered = (K_test - 1'_M K)(I - 1_M)
- # = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
- ones_prime_M = np.ones_like(K_test) / K.shape[0]
- K_test_centered = (
- K_test - ones_prime_M @ K - K_test @ ones_M + ones_prime_M @ K @ ones_M
- )
- assert_allclose(kernel_centerer.transform(K_test), K_test_centered)
- def test_cv_pipeline_precomputed():
- # Cross-validate a regression on four coplanar points with the same
- # value. Use precomputed kernel to ensure Pipeline with KernelCenterer
- # is treated as a pairwise operation.
- X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]])
- y_true = np.ones((4,))
- K = X.dot(X.T)
- kcent = KernelCenterer()
- pipeline = Pipeline([("kernel_centerer", kcent), ("svr", SVR())])
- # did the pipeline set the pairwise attribute?
- assert pipeline._get_tags()["pairwise"]
- # test cross-validation, score should be almost perfect
- # NB: this test is pretty vacuous -- it's mainly to test integration
- # of Pipeline and KernelCenterer
- y_pred = cross_val_predict(pipeline, K, y_true, cv=2)
- assert_array_almost_equal(y_true, y_pred)
- def test_fit_transform():
- rng = np.random.RandomState(0)
- X = rng.random_sample((5, 4))
- for obj in (StandardScaler(), Normalizer(), Binarizer()):
- X_transformed = obj.fit(X).transform(X)
- X_transformed2 = obj.fit_transform(X)
- assert_array_equal(X_transformed, X_transformed2)
- def test_add_dummy_feature():
- X = [[1, 0], [0, 1], [0, 1]]
- X = add_dummy_feature(X)
- assert_array_equal(X, [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
- def test_add_dummy_feature_coo():
- X = sparse.coo_matrix([[1, 0], [0, 1], [0, 1]])
- X = add_dummy_feature(X)
- assert sparse.issparse(X) and X.format == "coo", X
- assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
- def test_add_dummy_feature_csc():
- X = sparse.csc_matrix([[1, 0], [0, 1], [0, 1]])
- X = add_dummy_feature(X)
- assert sparse.issparse(X) and X.format == "csc", X
- assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
- def test_add_dummy_feature_csr():
- X = sparse.csr_matrix([[1, 0], [0, 1], [0, 1]])
- X = add_dummy_feature(X)
- assert sparse.issparse(X) and X.format == "csr", X
- assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
- def test_fit_cold_start():
- X = iris.data
- X_2d = X[:, :2]
- # Scalers that have a partial_fit method
- scalers = [
- StandardScaler(with_mean=False, with_std=False),
- MinMaxScaler(),
- MaxAbsScaler(),
- ]
- for scaler in scalers:
- scaler.fit_transform(X)
- # with a different shape, this may break the scaler unless the internal
- # state is reset
- scaler.fit_transform(X_2d)
- @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
- def test_power_transformer_notfitted(method):
- pt = PowerTransformer(method=method)
- X = np.abs(X_1col)
- with pytest.raises(NotFittedError):
- pt.transform(X)
- with pytest.raises(NotFittedError):
- pt.inverse_transform(X)
- @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
- @pytest.mark.parametrize("standardize", [True, False])
- @pytest.mark.parametrize("X", [X_1col, X_2d])
- def test_power_transformer_inverse(method, standardize, X):
- # Make sure we get the original input when applying transform and then
- # inverse transform
- X = np.abs(X) if method == "box-cox" else X
- pt = PowerTransformer(method=method, standardize=standardize)
- X_trans = pt.fit_transform(X)
- assert_almost_equal(X, pt.inverse_transform(X_trans))
- def test_power_transformer_1d():
- X = np.abs(X_1col)
- for standardize in [True, False]:
- pt = PowerTransformer(method="box-cox", standardize=standardize)
- X_trans = pt.fit_transform(X)
- X_trans_func = power_transform(X, method="box-cox", standardize=standardize)
- X_expected, lambda_expected = stats.boxcox(X.flatten())
- if standardize:
- X_expected = scale(X_expected)
- assert_almost_equal(X_expected.reshape(-1, 1), X_trans)
- assert_almost_equal(X_expected.reshape(-1, 1), X_trans_func)
- assert_almost_equal(X, pt.inverse_transform(X_trans))
- assert_almost_equal(lambda_expected, pt.lambdas_[0])
- assert len(pt.lambdas_) == X.shape[1]
- assert isinstance(pt.lambdas_, np.ndarray)
- def test_power_transformer_2d():
- X = np.abs(X_2d)
- for standardize in [True, False]:
- pt = PowerTransformer(method="box-cox", standardize=standardize)
- X_trans_class = pt.fit_transform(X)
- X_trans_func = power_transform(X, method="box-cox", standardize=standardize)
- for X_trans in [X_trans_class, X_trans_func]:
- for j in range(X_trans.shape[1]):
- X_expected, lmbda = stats.boxcox(X[:, j].flatten())
- if standardize:
- X_expected = scale(X_expected)
- assert_almost_equal(X_trans[:, j], X_expected)
- assert_almost_equal(lmbda, pt.lambdas_[j])
- # Test inverse transformation
- X_inv = pt.inverse_transform(X_trans)
- assert_array_almost_equal(X_inv, X)
- assert len(pt.lambdas_) == X.shape[1]
- assert isinstance(pt.lambdas_, np.ndarray)
- def test_power_transformer_boxcox_strictly_positive_exception():
- # Exceptions should be raised for negative arrays and zero arrays when
- # method is box-cox
- pt = PowerTransformer(method="box-cox")
- pt.fit(np.abs(X_2d))
- X_with_negatives = X_2d
- not_positive_message = "strictly positive"
- with pytest.raises(ValueError, match=not_positive_message):
- pt.transform(X_with_negatives)
- with pytest.raises(ValueError, match=not_positive_message):
- pt.fit(X_with_negatives)
- with pytest.raises(ValueError, match=not_positive_message):
- power_transform(X_with_negatives, method="box-cox")
- with pytest.raises(ValueError, match=not_positive_message):
- pt.transform(np.zeros(X_2d.shape))
- with pytest.raises(ValueError, match=not_positive_message):
- pt.fit(np.zeros(X_2d.shape))
- with pytest.raises(ValueError, match=not_positive_message):
- power_transform(np.zeros(X_2d.shape), method="box-cox")
- @pytest.mark.parametrize("X", [X_2d, np.abs(X_2d), -np.abs(X_2d), np.zeros(X_2d.shape)])
- def test_power_transformer_yeojohnson_any_input(X):
- # Yeo-Johnson method should support any kind of input
- power_transform(X, method="yeo-johnson")
- @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
- def test_power_transformer_shape_exception(method):
- pt = PowerTransformer(method=method)
- X = np.abs(X_2d)
- pt.fit(X)
- # Exceptions should be raised for arrays with different num_columns
- # than during fitting
- wrong_shape_message = (
- r"X has \d+ features, but PowerTransformer is " r"expecting \d+ features"
- )
- with pytest.raises(ValueError, match=wrong_shape_message):
- pt.transform(X[:, 0:1])
- with pytest.raises(ValueError, match=wrong_shape_message):
- pt.inverse_transform(X[:, 0:1])
- def test_power_transformer_lambda_zero():
- pt = PowerTransformer(method="box-cox", standardize=False)
- X = np.abs(X_2d)[:, 0:1]
- # Test the lambda = 0 case
- pt.lambdas_ = np.array([0])
- X_trans = pt.transform(X)
- assert_array_almost_equal(pt.inverse_transform(X_trans), X)
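- # NOTE: a sketch of the textbook Box-Cox definition behind the lambda = 0
- # special case above: the transform reduces to log(x) when lambda is zero.
- # Editor's illustration, not the library's vectorized implementation.
- def _boxcox_sketch(x, lmbda):
-     return np.log(x) if lmbda == 0 else (x**lmbda - 1) / lmbda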
- def test_power_transformer_lambda_one():
- # Make sure lambda = 1 corresponds to the identity for yeo-johnson
- pt = PowerTransformer(method="yeo-johnson", standardize=False)
- X = np.abs(X_2d)[:, 0:1]
- pt.lambdas_ = np.array([1])
- X_trans = pt.transform(X)
- assert_array_almost_equal(X_trans, X)
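- # NOTE: editor's sketch of the Yeo-Johnson transform restricted to
- # non-negative inputs, enough to see that lambda = 1 is the identity
- # checked above; the branches for negative x are omitted in this
- # illustration.
- def _yeo_johnson_nonnegative_sketch(x, lmbda):
-     if lmbda == 0:
-         return np.log1p(x)
-     return ((1 + x) ** lmbda - 1) / lmbda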
- @pytest.mark.parametrize(
- "method, lmbda",
- [
- ("box-cox", 0.1),
- ("box-cox", 0.5),
- ("yeo-johnson", 0.1),
- ("yeo-johnson", 0.5),
- ("yeo-johnson", 1.0),
- ],
- )
- def test_optimization_power_transformer(method, lmbda):
- # Test the optimization procedure:
- # - set a predefined value for lambda
- # - apply inverse_transform to a normal dist (we get X_inv)
- # - apply fit_transform to X_inv (we get X_inv_trans)
- # - check that X_inv_trans is roughly equal to X
- rng = np.random.RandomState(0)
- n_samples = 20000
- X = rng.normal(loc=0, scale=1, size=(n_samples, 1))
- pt = PowerTransformer(method=method, standardize=False)
- pt.lambdas_ = [lmbda]
- X_inv = pt.inverse_transform(X)
- pt = PowerTransformer(method=method, standardize=False)
- X_inv_trans = pt.fit_transform(X_inv)
- assert_almost_equal(0, np.linalg.norm(X - X_inv_trans) / n_samples, decimal=2)
- assert_almost_equal(0, X_inv_trans.mean(), decimal=1)
- assert_almost_equal(1, X_inv_trans.std(), decimal=1)
- def test_yeo_johnson_darwin_example():
- # Test from the original paper "A new family of power transformations to
- # improve normality or symmetry" by Yeo and Johnson.
- X = [6.1, -8.4, 1.0, 2.0, 0.7, 2.9, 3.5, 5.1, 1.8, 3.6, 7.0, 3.0, 9.3, 7.5, -6.0]
- X = np.array(X).reshape(-1, 1)
- lmbda = PowerTransformer(method="yeo-johnson").fit(X).lambdas_
- assert np.allclose(lmbda, 1.305, atol=1e-3)
- @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
- def test_power_transformer_nans(method):
- # Make sure lambda estimation is not influenced by NaN values
- # and that transform() supports NaN silently
- X = np.abs(X_1col)
- pt = PowerTransformer(method=method)
- pt.fit(X)
- lmbda_no_nans = pt.lambdas_[0]
- # concat nans at the end and check lambda stays the same
- X = np.concatenate([X, np.full_like(X, np.nan)])
- X = shuffle(X, random_state=0)
- pt.fit(X)
- lmbda_nans = pt.lambdas_[0]
- assert_almost_equal(lmbda_no_nans, lmbda_nans, decimal=5)
- X_trans = pt.transform(X)
- assert_array_equal(np.isnan(X_trans), np.isnan(X))
- @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
- @pytest.mark.parametrize("standardize", [True, False])
- def test_power_transformer_fit_transform(method, standardize):
- # check that fit_transform() and fit().transform() return the same values
- X = X_1col
- if method == "box-cox":
- X = np.abs(X)
- pt = PowerTransformer(method, standardize=standardize)
- assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X))
- @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
- @pytest.mark.parametrize("standardize", [True, False])
- def test_power_transformer_copy_True(method, standardize):
- # Check that neither fit, transform, fit_transform nor inverse_transform
- # modify X inplace when copy=True
- X = X_1col
- if method == "box-cox":
- X = np.abs(X)
- X_original = X.copy()
- assert X is not X_original # sanity checks
- assert_array_almost_equal(X, X_original)
- pt = PowerTransformer(method, standardize=standardize, copy=True)
- pt.fit(X)
- assert_array_almost_equal(X, X_original)
- X_trans = pt.transform(X)
- assert X_trans is not X
- X_trans = pt.fit_transform(X)
- assert_array_almost_equal(X, X_original)
- assert X_trans is not X
- X_inv_trans = pt.inverse_transform(X_trans)
- assert X_trans is not X_inv_trans
- @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
- @pytest.mark.parametrize("standardize", [True, False])
- def test_power_transformer_copy_False(method, standardize):
- # check that, when copy=False, fit doesn't change X inplace, but transform,
- # fit_transform and inverse_transform do.
- X = X_1col
- if method == "box-cox":
- X = np.abs(X)
- X_original = X.copy()
- assert X is not X_original # sanity checks
- assert_array_almost_equal(X, X_original)
- pt = PowerTransformer(method, standardize=standardize, copy=False)
- pt.fit(X)
- assert_array_almost_equal(X, X_original) # fit didn't change X
- X_trans = pt.transform(X)
- assert X_trans is X
- if method == "box-cox":
- X = np.abs(X)
- X_trans = pt.fit_transform(X)
- assert X_trans is X
- X_inv_trans = pt.inverse_transform(X_trans)
- assert X_trans is X_inv_trans
- def test_power_transformer_box_cox_raise_all_nans_col():
- """Check that box-cox raises informative when a column contains all nans.
- Non-regression test for gh-26303
- """
- X = rng.random_sample((4, 5))
- X[:, 0] = np.nan
- err_msg = "Column must not be all nan."
- pt = PowerTransformer(method="box-cox")
- with pytest.raises(ValueError, match=err_msg):
- pt.fit_transform(X)
- @pytest.mark.parametrize(
- "X_2",
- [
- sparse.random(10, 1, density=0.8, random_state=0),
- sparse.csr_matrix(np.full((10, 1), fill_value=np.nan)),
- ],
- )
- def test_standard_scaler_sparse_partial_fit_finite_variance(X_2):
- # non-regression test for:
- # https://github.com/scikit-learn/scikit-learn/issues/16448
- X_1 = sparse.random(5, 1, density=0.8)
- scaler = StandardScaler(with_mean=False)
- scaler.fit(X_1).partial_fit(X_2)
- assert np.isfinite(scaler.var_[0])
- @pytest.mark.parametrize("feature_range", [(0, 1), (-10, 10)])
- def test_minmax_scaler_clip(feature_range):
- # test behaviour of the parameter 'clip' in MinMaxScaler
- X = iris.data
- scaler = MinMaxScaler(feature_range=feature_range, clip=True).fit(X)
- X_min, X_max = np.min(X, axis=0), np.max(X, axis=0)
- X_test = [np.r_[X_min[:2] - 10, X_max[2:] + 10]]
- X_transformed = scaler.transform(X_test)
- assert_allclose(
- X_transformed,
- [[feature_range[0], feature_range[0], feature_range[1], feature_range[1]]],
- )
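- # NOTE: an editor's sketch of the clip=True behaviour asserted above: after
- # the affine min-max mapping, out-of-range outputs are clipped into
- # feature_range. Simplified, per-feature illustration only.
- def _minmax_clip_sketch(x, x_min, x_max, feature_range=(0, 1)):
-     lo, hi = feature_range
-     scaled = lo + (x - x_min) * (hi - lo) / (x_max - x_min)
-     return np.clip(scaled, lo, hi)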
- def test_standard_scaler_raise_error_for_1d_input():
- """Check that `inverse_transform` from `StandardScaler` raises an error
- with 1D array.
- Non-regression test for:
- https://github.com/scikit-learn/scikit-learn/issues/19518
- """
- scaler = StandardScaler().fit(X_2d)
- err_msg = "Expected 2D array, got 1D array instead"
- with pytest.raises(ValueError, match=err_msg):
- scaler.inverse_transform(X_2d[:, 0])
- def test_power_transformer_significantly_non_gaussian():
- """Check that significantly non-Gaussian data before transforms correctly.
- For some explored lambdas, the transformed data may be constant and will
- be rejected. Non-regression test for
- https://github.com/scikit-learn/scikit-learn/issues/14959
- """
- X_non_gaussian = 1e6 * np.array(
- [0.6, 2.0, 3.0, 4.0] * 4 + [11, 12, 12, 16, 17, 20, 85, 90], dtype=np.float64
- ).reshape(-1, 1)
- pt = PowerTransformer()
- with warnings.catch_warnings():
- warnings.simplefilter("error", RuntimeWarning)
- X_trans = pt.fit_transform(X_non_gaussian)
- assert not np.any(np.isnan(X_trans))
- assert X_trans.mean() == pytest.approx(0.0)
- assert X_trans.std() == pytest.approx(1.0)
- assert X_trans.min() > -2
- assert X_trans.max() < 2
- @pytest.mark.parametrize(
- "Transformer",
- [
- MinMaxScaler,
- MaxAbsScaler,
- RobustScaler,
- StandardScaler,
- QuantileTransformer,
- PowerTransformer,
- ],
- )
- def test_one_to_one_features(Transformer):
- """Check one-to-one transformers give correct feature names."""
- tr = Transformer().fit(iris.data)
- names_out = tr.get_feature_names_out(iris.feature_names)
- assert_array_equal(names_out, iris.feature_names)
- @pytest.mark.parametrize(
- "Transformer",
- [
- MinMaxScaler,
- MaxAbsScaler,
- RobustScaler,
- StandardScaler,
- QuantileTransformer,
- PowerTransformer,
- Normalizer,
- Binarizer,
- ],
- )
- def test_one_to_one_features_pandas(Transformer):
- """Check one-to-one transformers give correct feature names."""
- pd = pytest.importorskip("pandas")
- df = pd.DataFrame(iris.data, columns=iris.feature_names)
- tr = Transformer().fit(df)
- names_out_df_default = tr.get_feature_names_out()
- assert_array_equal(names_out_df_default, iris.feature_names)
- names_out_df_valid_in = tr.get_feature_names_out(iris.feature_names)
- assert_array_equal(names_out_df_valid_in, iris.feature_names)
- msg = re.escape("input_features is not equal to feature_names_in_")
- with pytest.raises(ValueError, match=msg):
- invalid_names = list("abcd")
- tr.get_feature_names_out(invalid_names)
- def test_kernel_centerer_feature_names_out():
- """Test that kernel centerer `feature_names_out`."""
- rng = np.random.RandomState(0)
- X = rng.random_sample((6, 4))
- X_pairwise = linear_kernel(X)
- centerer = KernelCenterer().fit(X_pairwise)
- names_out = centerer.get_feature_names_out()
- samples_out2 = X_pairwise.shape[1]
- assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)])
- @pytest.mark.parametrize("standardize", [True, False])
- def test_power_transformer_constant_feature(standardize):
- """Check that PowerTransfomer leaves constant features unchanged."""
- X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]]
- pt = PowerTransformer(method="yeo-johnson", standardize=standardize).fit(X)
- assert_allclose(pt.lambdas_, [1, 1, 1])
- Xft = pt.fit_transform(X)
- Xt = pt.transform(X)
- for Xt_ in [Xft, Xt]:
- if standardize:
- assert_allclose(Xt_, np.zeros_like(X))
- else:
- assert_allclose(Xt_, X)