test_data.py 95 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685
  1. # Authors:
  2. #
  3. # Giorgio Patrini
  4. #
  5. # License: BSD 3 clause
  6. import itertools
  7. import re
  8. import warnings
  9. import numpy as np
  10. import numpy.linalg as la
  11. import pytest
  12. from scipy import sparse, stats
  13. from sklearn import datasets
  14. from sklearn.base import clone
  15. from sklearn.exceptions import NotFittedError
  16. from sklearn.metrics.pairwise import linear_kernel
  17. from sklearn.model_selection import cross_val_predict
  18. from sklearn.pipeline import Pipeline
  19. from sklearn.preprocessing import (
  20. Binarizer,
  21. KernelCenterer,
  22. MaxAbsScaler,
  23. MinMaxScaler,
  24. Normalizer,
  25. PowerTransformer,
  26. QuantileTransformer,
  27. RobustScaler,
  28. StandardScaler,
  29. add_dummy_feature,
  30. maxabs_scale,
  31. minmax_scale,
  32. normalize,
  33. power_transform,
  34. quantile_transform,
  35. robust_scale,
  36. scale,
  37. )
  38. from sklearn.preprocessing._data import BOUNDS_THRESHOLD, _handle_zeros_in_scale
  39. from sklearn.svm import SVR
  40. from sklearn.utils import gen_batches, shuffle
  41. from sklearn.utils._testing import (
  42. _convert_container,
  43. assert_allclose,
  44. assert_allclose_dense_sparse,
  45. assert_almost_equal,
  46. assert_array_almost_equal,
  47. assert_array_equal,
  48. assert_array_less,
  49. skip_if_32bit,
  50. )
  51. from sklearn.utils.sparsefuncs import mean_variance_axis
  52. iris = datasets.load_iris()
  53. # Make some data to be used many times
  54. rng = np.random.RandomState(0)
  55. n_features = 30
  56. n_samples = 1000
  57. offsets = rng.uniform(-1, 1, size=n_features)
  58. scales = rng.uniform(1, 10, size=n_features)
  59. X_2d = rng.randn(n_samples, n_features) * scales + offsets
  60. X_1row = X_2d[0, :].reshape(1, n_features)
  61. X_1col = X_2d[:, 0].reshape(n_samples, 1)
  62. X_list_1row = X_1row.tolist()
  63. X_list_1col = X_1col.tolist()
  64. def toarray(a):
  65. if hasattr(a, "toarray"):
  66. a = a.toarray()
  67. return a
  68. def _check_dim_1axis(a):
  69. return np.asarray(a).shape[0]
  70. def assert_correct_incr(i, batch_start, batch_stop, n, chunk_size, n_samples_seen):
  71. if batch_stop != n:
  72. assert (i + 1) * chunk_size == n_samples_seen
  73. else:
  74. assert i * chunk_size + (batch_stop - batch_start) == n_samples_seen
  75. def test_raises_value_error_if_sample_weights_greater_than_1d():
  76. # Sample weights must be either scalar or 1D
  77. n_sampless = [2, 3]
  78. n_featuress = [3, 2]
  79. for n_samples, n_features in zip(n_sampless, n_featuress):
  80. X = rng.randn(n_samples, n_features)
  81. y = rng.randn(n_samples)
  82. scaler = StandardScaler()
  83. # make sure Error is raised the sample weights greater than 1d
  84. sample_weight_notOK = rng.randn(n_samples, 1) ** 2
  85. with pytest.raises(ValueError):
  86. scaler.fit(X, y, sample_weight=sample_weight_notOK)
  87. @pytest.mark.parametrize(
  88. ["Xw", "X", "sample_weight"],
  89. [
  90. ([[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [1, 2, 3], [4, 5, 6]], [2.0, 1.0]),
  91. (
  92. [[1, 0, 1], [0, 0, 1]],
  93. [[1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]],
  94. np.array([1, 3]),
  95. ),
  96. (
  97. [[1, np.nan, 1], [np.nan, np.nan, 1]],
  98. [
  99. [1, np.nan, 1],
  100. [np.nan, np.nan, 1],
  101. [np.nan, np.nan, 1],
  102. [np.nan, np.nan, 1],
  103. ],
  104. np.array([1, 3]),
  105. ),
  106. ],
  107. )
  108. @pytest.mark.parametrize("array_constructor", ["array", "sparse_csr", "sparse_csc"])
  109. def test_standard_scaler_sample_weight(Xw, X, sample_weight, array_constructor):
  110. with_mean = not array_constructor.startswith("sparse")
  111. X = _convert_container(X, array_constructor)
  112. Xw = _convert_container(Xw, array_constructor)
  113. # weighted StandardScaler
  114. yw = np.ones(Xw.shape[0])
  115. scaler_w = StandardScaler(with_mean=with_mean)
  116. scaler_w.fit(Xw, yw, sample_weight=sample_weight)
  117. # unweighted, but with repeated samples
  118. y = np.ones(X.shape[0])
  119. scaler = StandardScaler(with_mean=with_mean)
  120. scaler.fit(X, y)
  121. X_test = [[1.5, 2.5, 3.5], [3.5, 4.5, 5.5]]
  122. assert_almost_equal(scaler.mean_, scaler_w.mean_)
  123. assert_almost_equal(scaler.var_, scaler_w.var_)
  124. assert_almost_equal(scaler.transform(X_test), scaler_w.transform(X_test))
  125. def test_standard_scaler_1d():
  126. # Test scaling of dataset along single axis
  127. for X in [X_1row, X_1col, X_list_1row, X_list_1row]:
  128. scaler = StandardScaler()
  129. X_scaled = scaler.fit(X).transform(X, copy=True)
  130. if isinstance(X, list):
  131. X = np.array(X) # cast only after scaling done
  132. if _check_dim_1axis(X) == 1:
  133. assert_almost_equal(scaler.mean_, X.ravel())
  134. assert_almost_equal(scaler.scale_, np.ones(n_features))
  135. assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features))
  136. assert_array_almost_equal(X_scaled.std(axis=0), np.zeros_like(n_features))
  137. else:
  138. assert_almost_equal(scaler.mean_, X.mean())
  139. assert_almost_equal(scaler.scale_, X.std())
  140. assert_array_almost_equal(X_scaled.mean(axis=0), np.zeros_like(n_features))
  141. assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
  142. assert_array_almost_equal(X_scaled.std(axis=0), 1.0)
  143. assert scaler.n_samples_seen_ == X.shape[0]
  144. # check inverse transform
  145. X_scaled_back = scaler.inverse_transform(X_scaled)
  146. assert_array_almost_equal(X_scaled_back, X)
  147. # Constant feature
  148. X = np.ones((5, 1))
  149. scaler = StandardScaler()
  150. X_scaled = scaler.fit(X).transform(X, copy=True)
  151. assert_almost_equal(scaler.mean_, 1.0)
  152. assert_almost_equal(scaler.scale_, 1.0)
  153. assert_array_almost_equal(X_scaled.mean(axis=0), 0.0)
  154. assert_array_almost_equal(X_scaled.std(axis=0), 0.0)
  155. assert scaler.n_samples_seen_ == X.shape[0]
  156. @pytest.mark.parametrize(
  157. "sparse_constructor", [None, sparse.csc_matrix, sparse.csr_matrix]
  158. )
  159. @pytest.mark.parametrize("add_sample_weight", [False, True])
  160. def test_standard_scaler_dtype(add_sample_weight, sparse_constructor):
  161. # Ensure scaling does not affect dtype
  162. rng = np.random.RandomState(0)
  163. n_samples = 10
  164. n_features = 3
  165. if add_sample_weight:
  166. sample_weight = np.ones(n_samples)
  167. else:
  168. sample_weight = None
  169. with_mean = True
  170. for dtype in [np.float16, np.float32, np.float64]:
  171. X = rng.randn(n_samples, n_features).astype(dtype)
  172. if sparse_constructor is not None:
  173. X = sparse_constructor(X)
  174. with_mean = False
  175. scaler = StandardScaler(with_mean=with_mean)
  176. X_scaled = scaler.fit(X, sample_weight=sample_weight).transform(X)
  177. assert X.dtype == X_scaled.dtype
  178. assert scaler.mean_.dtype == np.float64
  179. assert scaler.scale_.dtype == np.float64
  180. @pytest.mark.parametrize(
  181. "scaler",
  182. [
  183. StandardScaler(with_mean=False),
  184. RobustScaler(with_centering=False),
  185. ],
  186. )
  187. @pytest.mark.parametrize(
  188. "sparse_constructor", [np.asarray, sparse.csc_matrix, sparse.csr_matrix]
  189. )
  190. @pytest.mark.parametrize("add_sample_weight", [False, True])
  191. @pytest.mark.parametrize("dtype", [np.float32, np.float64])
  192. @pytest.mark.parametrize("constant", [0, 1.0, 100.0])
  193. def test_standard_scaler_constant_features(
  194. scaler, add_sample_weight, sparse_constructor, dtype, constant
  195. ):
  196. if isinstance(scaler, RobustScaler) and add_sample_weight:
  197. pytest.skip(f"{scaler.__class__.__name__} does not yet support sample_weight")
  198. rng = np.random.RandomState(0)
  199. n_samples = 100
  200. n_features = 1
  201. if add_sample_weight:
  202. fit_params = dict(sample_weight=rng.uniform(size=n_samples) * 2)
  203. else:
  204. fit_params = {}
  205. X_array = np.full(shape=(n_samples, n_features), fill_value=constant, dtype=dtype)
  206. X = sparse_constructor(X_array)
  207. X_scaled = scaler.fit(X, **fit_params).transform(X)
  208. if isinstance(scaler, StandardScaler):
  209. # The variance info should be close to zero for constant features.
  210. assert_allclose(scaler.var_, np.zeros(X.shape[1]), atol=1e-7)
  211. # Constant features should not be scaled (scale of 1.):
  212. assert_allclose(scaler.scale_, np.ones(X.shape[1]))
  213. if hasattr(X_scaled, "toarray"):
  214. assert_allclose(X_scaled.toarray(), X_array)
  215. else:
  216. assert_allclose(X_scaled, X)
  217. if isinstance(scaler, StandardScaler) and not add_sample_weight:
  218. # Also check consistency with the standard scale function.
  219. X_scaled_2 = scale(X, with_mean=scaler.with_mean)
  220. if hasattr(X_scaled_2, "toarray"):
  221. assert_allclose(X_scaled_2.toarray(), X_scaled_2.toarray())
  222. else:
  223. assert_allclose(X_scaled_2, X_scaled_2)
  224. @pytest.mark.parametrize("n_samples", [10, 100, 10_000])
  225. @pytest.mark.parametrize("average", [1e-10, 1, 1e10])
  226. @pytest.mark.parametrize("dtype", [np.float32, np.float64])
  227. @pytest.mark.parametrize(
  228. "array_constructor", [np.asarray, sparse.csc_matrix, sparse.csr_matrix]
  229. )
  230. def test_standard_scaler_near_constant_features(
  231. n_samples, array_constructor, average, dtype
  232. ):
  233. # Check that when the variance is too small (var << mean**2) the feature
  234. # is considered constant and not scaled.
  235. scale_min, scale_max = -30, 19
  236. scales = np.array([10**i for i in range(scale_min, scale_max + 1)], dtype=dtype)
  237. n_features = scales.shape[0]
  238. X = np.empty((n_samples, n_features), dtype=dtype)
  239. # Make a dataset of known var = scales**2 and mean = average
  240. X[: n_samples // 2, :] = average + scales
  241. X[n_samples // 2 :, :] = average - scales
  242. X_array = array_constructor(X)
  243. scaler = StandardScaler(with_mean=False).fit(X_array)
  244. # StandardScaler uses float64 accumulators even if the data has a float32
  245. # dtype.
  246. eps = np.finfo(np.float64).eps
  247. # if var < bound = N.eps.var + N².eps².mean², the feature is considered
  248. # constant and the scale_ attribute is set to 1.
  249. bounds = n_samples * eps * scales**2 + n_samples**2 * eps**2 * average**2
  250. within_bounds = scales**2 <= bounds
  251. # Check that scale_min is small enough to have some scales below the
  252. # bound and therefore detected as constant:
  253. assert np.any(within_bounds)
  254. # Check that such features are actually treated as constant by the scaler:
  255. assert all(scaler.var_[within_bounds] <= bounds[within_bounds])
  256. assert_allclose(scaler.scale_[within_bounds], 1.0)
  257. # Depending the on the dtype of X, some features might not actually be
  258. # representable as non constant for small scales (even if above the
  259. # precision bound of the float64 variance estimate). Such feature should
  260. # be correctly detected as constants with 0 variance by StandardScaler.
  261. representable_diff = X[0, :] - X[-1, :] != 0
  262. assert_allclose(scaler.var_[np.logical_not(representable_diff)], 0)
  263. assert_allclose(scaler.scale_[np.logical_not(representable_diff)], 1)
  264. # The other features are scaled and scale_ is equal to sqrt(var_) assuming
  265. # that scales are large enough for average + scale and average - scale to
  266. # be distinct in X (depending on X's dtype).
  267. common_mask = np.logical_and(scales**2 > bounds, representable_diff)
  268. assert_allclose(scaler.scale_[common_mask], np.sqrt(scaler.var_)[common_mask])
  269. def test_scale_1d():
  270. # 1-d inputs
  271. X_list = [1.0, 3.0, 5.0, 0.0]
  272. X_arr = np.array(X_list)
  273. for X in [X_list, X_arr]:
  274. X_scaled = scale(X)
  275. assert_array_almost_equal(X_scaled.mean(), 0.0)
  276. assert_array_almost_equal(X_scaled.std(), 1.0)
  277. assert_array_equal(scale(X, with_mean=False, with_std=False), X)
  278. @skip_if_32bit
  279. def test_standard_scaler_numerical_stability():
  280. # Test numerical stability of scaling
  281. # np.log(1e-5) is taken because of its floating point representation
  282. # was empirically found to cause numerical problems with np.mean & np.std.
  283. x = np.full(8, np.log(1e-5), dtype=np.float64)
  284. # This does not raise a warning as the number of samples is too low
  285. # to trigger the problem in recent numpy
  286. with warnings.catch_warnings():
  287. warnings.simplefilter("error", UserWarning)
  288. scale(x)
  289. assert_array_almost_equal(scale(x), np.zeros(8))
  290. # with 2 more samples, the std computation run into numerical issues:
  291. x = np.full(10, np.log(1e-5), dtype=np.float64)
  292. warning_message = "standard deviation of the data is probably very close to 0"
  293. with pytest.warns(UserWarning, match=warning_message):
  294. x_scaled = scale(x)
  295. assert_array_almost_equal(x_scaled, np.zeros(10))
  296. x = np.full(10, 1e-100, dtype=np.float64)
  297. with warnings.catch_warnings():
  298. warnings.simplefilter("error", UserWarning)
  299. x_small_scaled = scale(x)
  300. assert_array_almost_equal(x_small_scaled, np.zeros(10))
  301. # Large values can cause (often recoverable) numerical stability issues:
  302. x_big = np.full(10, 1e100, dtype=np.float64)
  303. warning_message = "Dataset may contain too large values"
  304. with pytest.warns(UserWarning, match=warning_message):
  305. x_big_scaled = scale(x_big)
  306. assert_array_almost_equal(x_big_scaled, np.zeros(10))
  307. assert_array_almost_equal(x_big_scaled, x_small_scaled)
  308. with pytest.warns(UserWarning, match=warning_message):
  309. x_big_centered = scale(x_big, with_std=False)
  310. assert_array_almost_equal(x_big_centered, np.zeros(10))
  311. assert_array_almost_equal(x_big_centered, x_small_scaled)
  312. def test_scaler_2d_arrays():
  313. # Test scaling of 2d array along first axis
  314. rng = np.random.RandomState(0)
  315. n_features = 5
  316. n_samples = 4
  317. X = rng.randn(n_samples, n_features)
  318. X[:, 0] = 0.0 # first feature is always of zero
  319. scaler = StandardScaler()
  320. X_scaled = scaler.fit(X).transform(X, copy=True)
  321. assert not np.any(np.isnan(X_scaled))
  322. assert scaler.n_samples_seen_ == n_samples
  323. assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
  324. assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
  325. # Check that X has been copied
  326. assert X_scaled is not X
  327. # check inverse transform
  328. X_scaled_back = scaler.inverse_transform(X_scaled)
  329. assert X_scaled_back is not X
  330. assert X_scaled_back is not X_scaled
  331. assert_array_almost_equal(X_scaled_back, X)
  332. X_scaled = scale(X, axis=1, with_std=False)
  333. assert not np.any(np.isnan(X_scaled))
  334. assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0])
  335. X_scaled = scale(X, axis=1, with_std=True)
  336. assert not np.any(np.isnan(X_scaled))
  337. assert_array_almost_equal(X_scaled.mean(axis=1), n_samples * [0.0])
  338. assert_array_almost_equal(X_scaled.std(axis=1), n_samples * [1.0])
  339. # Check that the data hasn't been modified
  340. assert X_scaled is not X
  341. X_scaled = scaler.fit(X).transform(X, copy=False)
  342. assert not np.any(np.isnan(X_scaled))
  343. assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
  344. assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
  345. # Check that X has not been copied
  346. assert X_scaled is X
  347. X = rng.randn(4, 5)
  348. X[:, 0] = 1.0 # first feature is a constant, non zero feature
  349. scaler = StandardScaler()
  350. X_scaled = scaler.fit(X).transform(X, copy=True)
  351. assert not np.any(np.isnan(X_scaled))
  352. assert_array_almost_equal(X_scaled.mean(axis=0), n_features * [0.0])
  353. assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
  354. # Check that X has not been copied
  355. assert X_scaled is not X
  356. def test_scaler_float16_overflow():
  357. # Test if the scaler will not overflow on float16 numpy arrays
  358. rng = np.random.RandomState(0)
  359. # float16 has a maximum of 65500.0. On the worst case 5 * 200000 is 100000
  360. # which is enough to overflow the data type
  361. X = rng.uniform(5, 10, [200000, 1]).astype(np.float16)
  362. with np.errstate(over="raise"):
  363. scaler = StandardScaler().fit(X)
  364. X_scaled = scaler.transform(X)
  365. # Calculate the float64 equivalent to verify result
  366. X_scaled_f64 = StandardScaler().fit_transform(X.astype(np.float64))
  367. # Overflow calculations may cause -inf, inf, or nan. Since there is no nan
  368. # input, all of the outputs should be finite. This may be redundant since a
  369. # FloatingPointError exception will be thrown on overflow above.
  370. assert np.all(np.isfinite(X_scaled))
  371. # The normal distribution is very unlikely to go above 4. At 4.0-8.0 the
  372. # float16 precision is 2^-8 which is around 0.004. Thus only 2 decimals are
  373. # checked to account for precision differences.
  374. assert_array_almost_equal(X_scaled, X_scaled_f64, decimal=2)
  375. def test_handle_zeros_in_scale():
  376. s1 = np.array([0, 1e-16, 1, 2, 3])
  377. s2 = _handle_zeros_in_scale(s1, copy=True)
  378. assert_allclose(s1, np.array([0, 1e-16, 1, 2, 3]))
  379. assert_allclose(s2, np.array([1, 1, 1, 2, 3]))
  380. def test_minmax_scaler_partial_fit():
  381. # Test if partial_fit run over many batches of size 1 and 50
  382. # gives the same results as fit
  383. X = X_2d
  384. n = X.shape[0]
  385. for chunk_size in [1, 2, 50, n, n + 42]:
  386. # Test mean at the end of the process
  387. scaler_batch = MinMaxScaler().fit(X)
  388. scaler_incr = MinMaxScaler()
  389. for batch in gen_batches(n_samples, chunk_size):
  390. scaler_incr = scaler_incr.partial_fit(X[batch])
  391. assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_)
  392. assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_)
  393. assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
  394. assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_)
  395. assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
  396. assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)
  397. # Test std after 1 step
  398. batch0 = slice(0, chunk_size)
  399. scaler_batch = MinMaxScaler().fit(X[batch0])
  400. scaler_incr = MinMaxScaler().partial_fit(X[batch0])
  401. assert_array_almost_equal(scaler_batch.data_min_, scaler_incr.data_min_)
  402. assert_array_almost_equal(scaler_batch.data_max_, scaler_incr.data_max_)
  403. assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
  404. assert_array_almost_equal(scaler_batch.data_range_, scaler_incr.data_range_)
  405. assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
  406. assert_array_almost_equal(scaler_batch.min_, scaler_incr.min_)
  407. # Test std until the end of partial fits, and
  408. scaler_batch = MinMaxScaler().fit(X)
  409. scaler_incr = MinMaxScaler() # Clean estimator
  410. for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
  411. scaler_incr = scaler_incr.partial_fit(X[batch])
  412. assert_correct_incr(
  413. i,
  414. batch_start=batch.start,
  415. batch_stop=batch.stop,
  416. n=n,
  417. chunk_size=chunk_size,
  418. n_samples_seen=scaler_incr.n_samples_seen_,
  419. )
  420. def test_standard_scaler_partial_fit():
  421. # Test if partial_fit run over many batches of size 1 and 50
  422. # gives the same results as fit
  423. X = X_2d
  424. n = X.shape[0]
  425. for chunk_size in [1, 2, 50, n, n + 42]:
  426. # Test mean at the end of the process
  427. scaler_batch = StandardScaler(with_std=False).fit(X)
  428. scaler_incr = StandardScaler(with_std=False)
  429. for batch in gen_batches(n_samples, chunk_size):
  430. scaler_incr = scaler_incr.partial_fit(X[batch])
  431. assert_array_almost_equal(scaler_batch.mean_, scaler_incr.mean_)
  432. assert scaler_batch.var_ == scaler_incr.var_ # Nones
  433. assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
  434. # Test std after 1 step
  435. batch0 = slice(0, chunk_size)
  436. scaler_incr = StandardScaler().partial_fit(X[batch0])
  437. if chunk_size == 1:
  438. assert_array_almost_equal(
  439. np.zeros(n_features, dtype=np.float64), scaler_incr.var_
  440. )
  441. assert_array_almost_equal(
  442. np.ones(n_features, dtype=np.float64), scaler_incr.scale_
  443. )
  444. else:
  445. assert_array_almost_equal(np.var(X[batch0], axis=0), scaler_incr.var_)
  446. assert_array_almost_equal(
  447. np.std(X[batch0], axis=0), scaler_incr.scale_
  448. ) # no constants
  449. # Test std until the end of partial fits, and
  450. scaler_batch = StandardScaler().fit(X)
  451. scaler_incr = StandardScaler() # Clean estimator
  452. for i, batch in enumerate(gen_batches(n_samples, chunk_size)):
  453. scaler_incr = scaler_incr.partial_fit(X[batch])
  454. assert_correct_incr(
  455. i,
  456. batch_start=batch.start,
  457. batch_stop=batch.stop,
  458. n=n,
  459. chunk_size=chunk_size,
  460. n_samples_seen=scaler_incr.n_samples_seen_,
  461. )
  462. assert_array_almost_equal(scaler_batch.var_, scaler_incr.var_)
  463. assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
  464. def test_standard_scaler_partial_fit_numerical_stability():
  465. # Test if the incremental computation introduces significative errors
  466. # for large datasets with values of large magniture
  467. rng = np.random.RandomState(0)
  468. n_features = 2
  469. n_samples = 100
  470. offsets = rng.uniform(-1e15, 1e15, size=n_features)
  471. scales = rng.uniform(1e3, 1e6, size=n_features)
  472. X = rng.randn(n_samples, n_features) * scales + offsets
  473. scaler_batch = StandardScaler().fit(X)
  474. scaler_incr = StandardScaler()
  475. for chunk in X:
  476. scaler_incr = scaler_incr.partial_fit(chunk.reshape(1, n_features))
  477. # Regardless of abs values, they must not be more diff 6 significant digits
  478. tol = 10 ** (-6)
  479. assert_allclose(scaler_incr.mean_, scaler_batch.mean_, rtol=tol)
  480. assert_allclose(scaler_incr.var_, scaler_batch.var_, rtol=tol)
  481. assert_allclose(scaler_incr.scale_, scaler_batch.scale_, rtol=tol)
  482. # NOTE Be aware that for much larger offsets std is very unstable (last
  483. # assert) while mean is OK.
  484. # Sparse input
  485. size = (100, 3)
  486. scale = 1e20
  487. X = rng.randint(0, 2, size).astype(np.float64) * scale
  488. X_csr = sparse.csr_matrix(X)
  489. X_csc = sparse.csc_matrix(X)
  490. for X in [X_csr, X_csc]:
  491. # with_mean=False is required with sparse input
  492. scaler = StandardScaler(with_mean=False).fit(X)
  493. scaler_incr = StandardScaler(with_mean=False)
  494. for chunk in X:
  495. # chunk = sparse.csr_matrix(data_chunks)
  496. scaler_incr = scaler_incr.partial_fit(chunk)
  497. # Regardless of magnitude, they must not differ more than of 6 digits
  498. tol = 10 ** (-6)
  499. assert scaler.mean_ is not None
  500. assert_allclose(scaler_incr.var_, scaler.var_, rtol=tol)
  501. assert_allclose(scaler_incr.scale_, scaler.scale_, rtol=tol)
  502. @pytest.mark.parametrize("sample_weight", [True, None])
  503. def test_partial_fit_sparse_input(sample_weight):
  504. # Check that sparsity is not destroyed
  505. X = np.array([[1.0], [0.0], [0.0], [5.0]])
  506. X_csr = sparse.csr_matrix(X)
  507. X_csc = sparse.csc_matrix(X)
  508. if sample_weight:
  509. sample_weight = rng.rand(X_csc.shape[0])
  510. null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
  511. for X in [X_csr, X_csc]:
  512. X_null = null_transform.partial_fit(X, sample_weight=sample_weight).transform(X)
  513. assert_array_equal(X_null.toarray(), X.toarray())
  514. X_orig = null_transform.inverse_transform(X_null)
  515. assert_array_equal(X_orig.toarray(), X_null.toarray())
  516. assert_array_equal(X_orig.toarray(), X.toarray())
  517. @pytest.mark.parametrize("sample_weight", [True, None])
  518. def test_standard_scaler_trasform_with_partial_fit(sample_weight):
  519. # Check some postconditions after applying partial_fit and transform
  520. X = X_2d[:100, :]
  521. if sample_weight:
  522. sample_weight = rng.rand(X.shape[0])
  523. scaler_incr = StandardScaler()
  524. for i, batch in enumerate(gen_batches(X.shape[0], 1)):
  525. X_sofar = X[: (i + 1), :]
  526. chunks_copy = X_sofar.copy()
  527. if sample_weight is None:
  528. scaled_batch = StandardScaler().fit_transform(X_sofar)
  529. scaler_incr = scaler_incr.partial_fit(X[batch])
  530. else:
  531. scaled_batch = StandardScaler().fit_transform(
  532. X_sofar, sample_weight=sample_weight[: i + 1]
  533. )
  534. scaler_incr = scaler_incr.partial_fit(
  535. X[batch], sample_weight=sample_weight[batch]
  536. )
  537. scaled_incr = scaler_incr.transform(X_sofar)
  538. assert_array_almost_equal(scaled_batch, scaled_incr)
  539. assert_array_almost_equal(X_sofar, chunks_copy) # No change
  540. right_input = scaler_incr.inverse_transform(scaled_incr)
  541. assert_array_almost_equal(X_sofar, right_input)
  542. zero = np.zeros(X.shape[1])
  543. epsilon = np.finfo(float).eps
  544. assert_array_less(zero, scaler_incr.var_ + epsilon) # as less or equal
  545. assert_array_less(zero, scaler_incr.scale_ + epsilon)
  546. if sample_weight is None:
  547. # (i+1) because the Scaler has been already fitted
  548. assert (i + 1) == scaler_incr.n_samples_seen_
  549. else:
  550. assert np.sum(sample_weight[: i + 1]) == pytest.approx(
  551. scaler_incr.n_samples_seen_
  552. )
  553. def test_standard_check_array_of_inverse_transform():
  554. # Check if StandardScaler inverse_transform is
  555. # converting the integer array to float
  556. x = np.array(
  557. [
  558. [1, 1, 1, 0, 1, 0],
  559. [1, 1, 1, 0, 1, 0],
  560. [0, 8, 0, 1, 0, 0],
  561. [1, 4, 1, 1, 0, 0],
  562. [0, 1, 0, 0, 1, 0],
  563. [0, 4, 0, 1, 0, 1],
  564. ],
  565. dtype=np.int32,
  566. )
  567. scaler = StandardScaler()
  568. scaler.fit(x)
  569. # The of inverse_transform should be converted
  570. # to a float array.
  571. # If not X *= self.scale_ will fail.
  572. scaler.inverse_transform(x)
  573. def test_min_max_scaler_iris():
  574. X = iris.data
  575. scaler = MinMaxScaler()
  576. # default params
  577. X_trans = scaler.fit_transform(X)
  578. assert_array_almost_equal(X_trans.min(axis=0), 0)
  579. assert_array_almost_equal(X_trans.max(axis=0), 1)
  580. X_trans_inv = scaler.inverse_transform(X_trans)
  581. assert_array_almost_equal(X, X_trans_inv)
  582. # not default params: min=1, max=2
  583. scaler = MinMaxScaler(feature_range=(1, 2))
  584. X_trans = scaler.fit_transform(X)
  585. assert_array_almost_equal(X_trans.min(axis=0), 1)
  586. assert_array_almost_equal(X_trans.max(axis=0), 2)
  587. X_trans_inv = scaler.inverse_transform(X_trans)
  588. assert_array_almost_equal(X, X_trans_inv)
  589. # min=-.5, max=.6
  590. scaler = MinMaxScaler(feature_range=(-0.5, 0.6))
  591. X_trans = scaler.fit_transform(X)
  592. assert_array_almost_equal(X_trans.min(axis=0), -0.5)
  593. assert_array_almost_equal(X_trans.max(axis=0), 0.6)
  594. X_trans_inv = scaler.inverse_transform(X_trans)
  595. assert_array_almost_equal(X, X_trans_inv)
  596. # raises on invalid range
  597. scaler = MinMaxScaler(feature_range=(2, 1))
  598. with pytest.raises(ValueError):
  599. scaler.fit(X)
  600. def test_min_max_scaler_zero_variance_features():
  601. # Check min max scaler on toy data with zero variance features
  602. X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]]
  603. X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]]
  604. # default params
  605. scaler = MinMaxScaler()
  606. X_trans = scaler.fit_transform(X)
  607. X_expected_0_1 = [[0.0, 0.0, 0.5], [0.0, 0.0, 0.0], [0.0, 0.0, 1.0]]
  608. assert_array_almost_equal(X_trans, X_expected_0_1)
  609. X_trans_inv = scaler.inverse_transform(X_trans)
  610. assert_array_almost_equal(X, X_trans_inv)
  611. X_trans_new = scaler.transform(X_new)
  612. X_expected_0_1_new = [[+0.0, 1.0, 0.500], [-1.0, 0.0, 0.083], [+0.0, 0.0, 1.333]]
  613. assert_array_almost_equal(X_trans_new, X_expected_0_1_new, decimal=2)
  614. # not default params
  615. scaler = MinMaxScaler(feature_range=(1, 2))
  616. X_trans = scaler.fit_transform(X)
  617. X_expected_1_2 = [[1.0, 1.0, 1.5], [1.0, 1.0, 1.0], [1.0, 1.0, 2.0]]
  618. assert_array_almost_equal(X_trans, X_expected_1_2)
  619. # function interface
  620. X_trans = minmax_scale(X)
  621. assert_array_almost_equal(X_trans, X_expected_0_1)
  622. X_trans = minmax_scale(X, feature_range=(1, 2))
  623. assert_array_almost_equal(X_trans, X_expected_1_2)
  624. def test_minmax_scale_axis1():
  625. X = iris.data
  626. X_trans = minmax_scale(X, axis=1)
  627. assert_array_almost_equal(np.min(X_trans, axis=1), 0)
  628. assert_array_almost_equal(np.max(X_trans, axis=1), 1)
  629. def test_min_max_scaler_1d():
  630. # Test scaling of dataset along single axis
  631. for X in [X_1row, X_1col, X_list_1row, X_list_1row]:
  632. scaler = MinMaxScaler(copy=True)
  633. X_scaled = scaler.fit(X).transform(X)
  634. if isinstance(X, list):
  635. X = np.array(X) # cast only after scaling done
  636. if _check_dim_1axis(X) == 1:
  637. assert_array_almost_equal(X_scaled.min(axis=0), np.zeros(n_features))
  638. assert_array_almost_equal(X_scaled.max(axis=0), np.zeros(n_features))
  639. else:
  640. assert_array_almost_equal(X_scaled.min(axis=0), 0.0)
  641. assert_array_almost_equal(X_scaled.max(axis=0), 1.0)
  642. assert scaler.n_samples_seen_ == X.shape[0]
  643. # check inverse transform
  644. X_scaled_back = scaler.inverse_transform(X_scaled)
  645. assert_array_almost_equal(X_scaled_back, X)
  646. # Constant feature
  647. X = np.ones((5, 1))
  648. scaler = MinMaxScaler()
  649. X_scaled = scaler.fit(X).transform(X)
  650. assert X_scaled.min() >= 0.0
  651. assert X_scaled.max() <= 1.0
  652. assert scaler.n_samples_seen_ == X.shape[0]
  653. # Function interface
  654. X_1d = X_1row.ravel()
  655. min_ = X_1d.min()
  656. max_ = X_1d.max()
  657. assert_array_almost_equal(
  658. (X_1d - min_) / (max_ - min_), minmax_scale(X_1d, copy=True)
  659. )
  660. @pytest.mark.parametrize("sample_weight", [True, None])
  661. def test_scaler_without_centering(sample_weight):
  662. rng = np.random.RandomState(42)
  663. X = rng.randn(4, 5)
  664. X[:, 0] = 0.0 # first feature is always of zero
  665. X_csr = sparse.csr_matrix(X)
  666. X_csc = sparse.csc_matrix(X)
  667. if sample_weight:
  668. sample_weight = rng.rand(X.shape[0])
  669. with pytest.raises(ValueError):
  670. StandardScaler().fit(X_csr)
  671. with pytest.raises(ValueError):
  672. StandardScaler().fit(X_csc)
  673. null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
  674. X_null = null_transform.fit_transform(X_csr)
  675. assert_array_equal(X_null.data, X_csr.data)
  676. X_orig = null_transform.inverse_transform(X_null)
  677. assert_array_equal(X_orig.data, X_csr.data)
  678. scaler = StandardScaler(with_mean=False).fit(X, sample_weight=sample_weight)
  679. X_scaled = scaler.transform(X, copy=True)
  680. assert not np.any(np.isnan(X_scaled))
  681. scaler_csr = StandardScaler(with_mean=False).fit(X_csr, sample_weight=sample_weight)
  682. X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
  683. assert not np.any(np.isnan(X_csr_scaled.data))
  684. scaler_csc = StandardScaler(with_mean=False).fit(X_csc, sample_weight=sample_weight)
  685. X_csc_scaled = scaler_csc.transform(X_csc, copy=True)
  686. assert not np.any(np.isnan(X_csc_scaled.data))
  687. assert_array_almost_equal(scaler.mean_, scaler_csr.mean_)
  688. assert_array_almost_equal(scaler.var_, scaler_csr.var_)
  689. assert_array_almost_equal(scaler.scale_, scaler_csr.scale_)
  690. assert_array_almost_equal(scaler.n_samples_seen_, scaler_csr.n_samples_seen_)
  691. assert_array_almost_equal(scaler.mean_, scaler_csc.mean_)
  692. assert_array_almost_equal(scaler.var_, scaler_csc.var_)
  693. assert_array_almost_equal(scaler.scale_, scaler_csc.scale_)
  694. assert_array_almost_equal(scaler.n_samples_seen_, scaler_csc.n_samples_seen_)
  695. if sample_weight is None:
  696. assert_array_almost_equal(
  697. X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2
  698. )
  699. assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
  700. X_csr_scaled_mean, X_csr_scaled_var = mean_variance_axis(X_csr_scaled, 0)
  701. assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
  702. assert_array_almost_equal(X_csr_scaled_var, X_scaled.var(axis=0))
  703. # Check that X has not been modified (copy)
  704. assert X_scaled is not X
  705. assert X_csr_scaled is not X_csr
  706. X_scaled_back = scaler.inverse_transform(X_scaled)
  707. assert X_scaled_back is not X
  708. assert X_scaled_back is not X_scaled
  709. assert_array_almost_equal(X_scaled_back, X)
  710. X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
  711. assert X_csr_scaled_back is not X_csr
  712. assert X_csr_scaled_back is not X_csr_scaled
  713. assert_array_almost_equal(X_csr_scaled_back.toarray(), X)
  714. X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc())
  715. assert X_csc_scaled_back is not X_csc
  716. assert X_csc_scaled_back is not X_csc_scaled
  717. assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
  718. @pytest.mark.parametrize("with_mean", [True, False])
  719. @pytest.mark.parametrize("with_std", [True, False])
  720. @pytest.mark.parametrize(
  721. "array_constructor", [np.asarray, sparse.csc_matrix, sparse.csr_matrix]
  722. )
  723. def test_scaler_n_samples_seen_with_nan(with_mean, with_std, array_constructor):
  724. X = np.array(
  725. [[0, 1, 3], [np.nan, 6, 10], [5, 4, np.nan], [8, 0, np.nan]], dtype=np.float64
  726. )
  727. X = array_constructor(X)
  728. if sparse.issparse(X) and with_mean:
  729. pytest.skip("'with_mean=True' cannot be used with sparse matrix.")
  730. transformer = StandardScaler(with_mean=with_mean, with_std=with_std)
  731. transformer.fit(X)
  732. assert_array_equal(transformer.n_samples_seen_, np.array([3, 4, 2]))
  733. def _check_identity_scalers_attributes(scaler_1, scaler_2):
  734. assert scaler_1.mean_ is scaler_2.mean_ is None
  735. assert scaler_1.var_ is scaler_2.var_ is None
  736. assert scaler_1.scale_ is scaler_2.scale_ is None
  737. assert scaler_1.n_samples_seen_ == scaler_2.n_samples_seen_
  738. def test_scaler_return_identity():
  739. # test that the scaler return identity when with_mean and with_std are
  740. # False
  741. X_dense = np.array([[0, 1, 3], [5, 6, 0], [8, 0, 10]], dtype=np.float64)
  742. X_csr = sparse.csr_matrix(X_dense)
  743. X_csc = X_csr.tocsc()
  744. transformer_dense = StandardScaler(with_mean=False, with_std=False)
  745. X_trans_dense = transformer_dense.fit_transform(X_dense)
  746. transformer_csr = clone(transformer_dense)
  747. X_trans_csr = transformer_csr.fit_transform(X_csr)
  748. transformer_csc = clone(transformer_dense)
  749. X_trans_csc = transformer_csc.fit_transform(X_csc)
  750. assert_allclose_dense_sparse(X_trans_csr, X_csr)
  751. assert_allclose_dense_sparse(X_trans_csc, X_csc)
  752. assert_allclose(X_trans_dense, X_dense)
  753. for trans_1, trans_2 in itertools.combinations(
  754. [transformer_dense, transformer_csr, transformer_csc], 2
  755. ):
  756. _check_identity_scalers_attributes(trans_1, trans_2)
  757. transformer_dense.partial_fit(X_dense)
  758. transformer_csr.partial_fit(X_csr)
  759. transformer_csc.partial_fit(X_csc)
  760. for trans_1, trans_2 in itertools.combinations(
  761. [transformer_dense, transformer_csr, transformer_csc], 2
  762. ):
  763. _check_identity_scalers_attributes(trans_1, trans_2)
  764. transformer_dense.fit(X_dense)
  765. transformer_csr.fit(X_csr)
  766. transformer_csc.fit(X_csc)
  767. for trans_1, trans_2 in itertools.combinations(
  768. [transformer_dense, transformer_csr, transformer_csc], 2
  769. ):
  770. _check_identity_scalers_attributes(trans_1, trans_2)
  771. def test_scaler_int():
  772. # test that scaler converts integer input to floating
  773. # for both sparse and dense matrices
  774. rng = np.random.RandomState(42)
  775. X = rng.randint(20, size=(4, 5))
  776. X[:, 0] = 0 # first feature is always of zero
  777. X_csr = sparse.csr_matrix(X)
  778. X_csc = sparse.csc_matrix(X)
  779. null_transform = StandardScaler(with_mean=False, with_std=False, copy=True)
  780. with warnings.catch_warnings(record=True):
  781. X_null = null_transform.fit_transform(X_csr)
  782. assert_array_equal(X_null.data, X_csr.data)
  783. X_orig = null_transform.inverse_transform(X_null)
  784. assert_array_equal(X_orig.data, X_csr.data)
  785. with warnings.catch_warnings(record=True):
  786. scaler = StandardScaler(with_mean=False).fit(X)
  787. X_scaled = scaler.transform(X, copy=True)
  788. assert not np.any(np.isnan(X_scaled))
  789. with warnings.catch_warnings(record=True):
  790. scaler_csr = StandardScaler(with_mean=False).fit(X_csr)
  791. X_csr_scaled = scaler_csr.transform(X_csr, copy=True)
  792. assert not np.any(np.isnan(X_csr_scaled.data))
  793. with warnings.catch_warnings(record=True):
  794. scaler_csc = StandardScaler(with_mean=False).fit(X_csc)
  795. X_csc_scaled = scaler_csc.transform(X_csc, copy=True)
  796. assert not np.any(np.isnan(X_csc_scaled.data))
  797. assert_array_almost_equal(scaler.mean_, scaler_csr.mean_)
  798. assert_array_almost_equal(scaler.var_, scaler_csr.var_)
  799. assert_array_almost_equal(scaler.scale_, scaler_csr.scale_)
  800. assert_array_almost_equal(scaler.mean_, scaler_csc.mean_)
  801. assert_array_almost_equal(scaler.var_, scaler_csc.var_)
  802. assert_array_almost_equal(scaler.scale_, scaler_csc.scale_)
  803. assert_array_almost_equal(
  804. X_scaled.mean(axis=0), [0.0, 1.109, 1.856, 21.0, 1.559], 2
  805. )
  806. assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
  807. X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(
  808. X_csr_scaled.astype(float), 0
  809. )
  810. assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
  811. assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
  812. # Check that X has not been modified (copy)
  813. assert X_scaled is not X
  814. assert X_csr_scaled is not X_csr
  815. X_scaled_back = scaler.inverse_transform(X_scaled)
  816. assert X_scaled_back is not X
  817. assert X_scaled_back is not X_scaled
  818. assert_array_almost_equal(X_scaled_back, X)
  819. X_csr_scaled_back = scaler_csr.inverse_transform(X_csr_scaled)
  820. assert X_csr_scaled_back is not X_csr
  821. assert X_csr_scaled_back is not X_csr_scaled
  822. assert_array_almost_equal(X_csr_scaled_back.toarray(), X)
  823. X_csc_scaled_back = scaler_csr.inverse_transform(X_csc_scaled.tocsc())
  824. assert X_csc_scaled_back is not X_csc
  825. assert X_csc_scaled_back is not X_csc_scaled
  826. assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
  827. def test_scaler_without_copy():
  828. # Check that StandardScaler.fit does not change input
  829. rng = np.random.RandomState(42)
  830. X = rng.randn(4, 5)
  831. X[:, 0] = 0.0 # first feature is always of zero
  832. X_csr = sparse.csr_matrix(X)
  833. X_csc = sparse.csc_matrix(X)
  834. X_copy = X.copy()
  835. StandardScaler(copy=False).fit(X)
  836. assert_array_equal(X, X_copy)
  837. X_csr_copy = X_csr.copy()
  838. StandardScaler(with_mean=False, copy=False).fit(X_csr)
  839. assert_array_equal(X_csr.toarray(), X_csr_copy.toarray())
  840. X_csc_copy = X_csc.copy()
  841. StandardScaler(with_mean=False, copy=False).fit(X_csc)
  842. assert_array_equal(X_csc.toarray(), X_csc_copy.toarray())
  843. def test_scale_sparse_with_mean_raise_exception():
  844. rng = np.random.RandomState(42)
  845. X = rng.randn(4, 5)
  846. X_csr = sparse.csr_matrix(X)
  847. X_csc = sparse.csc_matrix(X)
  848. # check scaling and fit with direct calls on sparse data
  849. with pytest.raises(ValueError):
  850. scale(X_csr, with_mean=True)
  851. with pytest.raises(ValueError):
  852. StandardScaler(with_mean=True).fit(X_csr)
  853. with pytest.raises(ValueError):
  854. scale(X_csc, with_mean=True)
  855. with pytest.raises(ValueError):
  856. StandardScaler(with_mean=True).fit(X_csc)
  857. # check transform and inverse_transform after a fit on a dense array
  858. scaler = StandardScaler(with_mean=True).fit(X)
  859. with pytest.raises(ValueError):
  860. scaler.transform(X_csr)
  861. with pytest.raises(ValueError):
  862. scaler.transform(X_csc)
  863. X_transformed_csr = sparse.csr_matrix(scaler.transform(X))
  864. with pytest.raises(ValueError):
  865. scaler.inverse_transform(X_transformed_csr)
  866. X_transformed_csc = sparse.csc_matrix(scaler.transform(X))
  867. with pytest.raises(ValueError):
  868. scaler.inverse_transform(X_transformed_csc)
  869. def test_scale_input_finiteness_validation():
  870. # Check if non finite inputs raise ValueError
  871. X = [[np.inf, 5, 6, 7, 8]]
  872. with pytest.raises(
  873. ValueError, match="Input contains infinity or a value too large"
  874. ):
  875. scale(X)
  876. def test_robust_scaler_error_sparse():
  877. X_sparse = sparse.rand(1000, 10)
  878. scaler = RobustScaler(with_centering=True)
  879. err_msg = "Cannot center sparse matrices"
  880. with pytest.raises(ValueError, match=err_msg):
  881. scaler.fit(X_sparse)
  882. @pytest.mark.parametrize("with_centering", [True, False])
  883. @pytest.mark.parametrize("with_scaling", [True, False])
  884. @pytest.mark.parametrize("X", [np.random.randn(10, 3), sparse.rand(10, 3, density=0.5)])
  885. def test_robust_scaler_attributes(X, with_centering, with_scaling):
  886. # check consistent type of attributes
  887. if with_centering and sparse.issparse(X):
  888. pytest.skip("RobustScaler cannot center sparse matrix")
  889. scaler = RobustScaler(with_centering=with_centering, with_scaling=with_scaling)
  890. scaler.fit(X)
  891. if with_centering:
  892. assert isinstance(scaler.center_, np.ndarray)
  893. else:
  894. assert scaler.center_ is None
  895. if with_scaling:
  896. assert isinstance(scaler.scale_, np.ndarray)
  897. else:
  898. assert scaler.scale_ is None
  899. def test_robust_scaler_col_zero_sparse():
  900. # check that the scaler is working when there is not data materialized in a
  901. # column of a sparse matrix
  902. X = np.random.randn(10, 5)
  903. X[:, 0] = 0
  904. X = sparse.csr_matrix(X)
  905. scaler = RobustScaler(with_centering=False)
  906. scaler.fit(X)
  907. assert scaler.scale_[0] == pytest.approx(1)
  908. X_trans = scaler.transform(X)
  909. assert_allclose(X[:, 0].toarray(), X_trans[:, 0].toarray())
  910. def test_robust_scaler_2d_arrays():
  911. # Test robust scaling of 2d array along first axis
  912. rng = np.random.RandomState(0)
  913. X = rng.randn(4, 5)
  914. X[:, 0] = 0.0 # first feature is always of zero
  915. scaler = RobustScaler()
  916. X_scaled = scaler.fit(X).transform(X)
  917. assert_array_almost_equal(np.median(X_scaled, axis=0), 5 * [0.0])
  918. assert_array_almost_equal(X_scaled.std(axis=0)[0], 0)
  919. @pytest.mark.parametrize("density", [0, 0.05, 0.1, 0.5, 1])
  920. @pytest.mark.parametrize("strictly_signed", ["positive", "negative", "zeros", None])
  921. def test_robust_scaler_equivalence_dense_sparse(density, strictly_signed):
  922. # Check the equivalence of the fitting with dense and sparse matrices
  923. X_sparse = sparse.rand(1000, 5, density=density).tocsc()
  924. if strictly_signed == "positive":
  925. X_sparse.data = np.abs(X_sparse.data)
  926. elif strictly_signed == "negative":
  927. X_sparse.data = -np.abs(X_sparse.data)
  928. elif strictly_signed == "zeros":
  929. X_sparse.data = np.zeros(X_sparse.data.shape, dtype=np.float64)
  930. X_dense = X_sparse.toarray()
  931. scaler_sparse = RobustScaler(with_centering=False)
  932. scaler_dense = RobustScaler(with_centering=False)
  933. scaler_sparse.fit(X_sparse)
  934. scaler_dense.fit(X_dense)
  935. assert_allclose(scaler_sparse.scale_, scaler_dense.scale_)
  936. def test_robust_scaler_transform_one_row_csr():
  937. # Check RobustScaler on transforming csr matrix with one row
  938. rng = np.random.RandomState(0)
  939. X = rng.randn(4, 5)
  940. single_row = np.array([[0.1, 1.0, 2.0, 0.0, -1.0]])
  941. scaler = RobustScaler(with_centering=False)
  942. scaler = scaler.fit(X)
  943. row_trans = scaler.transform(sparse.csr_matrix(single_row))
  944. row_expected = single_row / scaler.scale_
  945. assert_array_almost_equal(row_trans.toarray(), row_expected)
  946. row_scaled_back = scaler.inverse_transform(row_trans)
  947. assert_array_almost_equal(single_row, row_scaled_back.toarray())
  948. def test_robust_scaler_iris():
  949. X = iris.data
  950. scaler = RobustScaler()
  951. X_trans = scaler.fit_transform(X)
  952. assert_array_almost_equal(np.median(X_trans, axis=0), 0)
  953. X_trans_inv = scaler.inverse_transform(X_trans)
  954. assert_array_almost_equal(X, X_trans_inv)
  955. q = np.percentile(X_trans, q=(25, 75), axis=0)
  956. iqr = q[1] - q[0]
  957. assert_array_almost_equal(iqr, 1)
  958. def test_robust_scaler_iris_quantiles():
  959. X = iris.data
  960. scaler = RobustScaler(quantile_range=(10, 90))
  961. X_trans = scaler.fit_transform(X)
  962. assert_array_almost_equal(np.median(X_trans, axis=0), 0)
  963. X_trans_inv = scaler.inverse_transform(X_trans)
  964. assert_array_almost_equal(X, X_trans_inv)
  965. q = np.percentile(X_trans, q=(10, 90), axis=0)
  966. q_range = q[1] - q[0]
  967. assert_array_almost_equal(q_range, 1)
  968. def test_quantile_transform_iris():
  969. X = iris.data
  970. # uniform output distribution
  971. transformer = QuantileTransformer(n_quantiles=30)
  972. X_trans = transformer.fit_transform(X)
  973. X_trans_inv = transformer.inverse_transform(X_trans)
  974. assert_array_almost_equal(X, X_trans_inv)
  975. # normal output distribution
  976. transformer = QuantileTransformer(n_quantiles=30, output_distribution="normal")
  977. X_trans = transformer.fit_transform(X)
  978. X_trans_inv = transformer.inverse_transform(X_trans)
  979. assert_array_almost_equal(X, X_trans_inv)
  980. # make sure it is possible to take the inverse of a sparse matrix
  981. # which contain negative value; this is the case in the iris dataset
  982. X_sparse = sparse.csc_matrix(X)
  983. X_sparse_tran = transformer.fit_transform(X_sparse)
  984. X_sparse_tran_inv = transformer.inverse_transform(X_sparse_tran)
  985. assert_array_almost_equal(X_sparse.A, X_sparse_tran_inv.A)
  986. def test_quantile_transform_check_error():
  987. X = np.transpose(
  988. [
  989. [0, 25, 50, 0, 0, 0, 75, 0, 0, 100],
  990. [2, 4, 0, 0, 6, 8, 0, 10, 0, 0],
  991. [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1],
  992. ]
  993. )
  994. X = sparse.csc_matrix(X)
  995. X_neg = np.transpose(
  996. [
  997. [0, 25, 50, 0, 0, 0, 75, 0, 0, 100],
  998. [-2, 4, 0, 0, 6, 8, 0, 10, 0, 0],
  999. [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1],
  1000. ]
  1001. )
  1002. X_neg = sparse.csc_matrix(X_neg)
  1003. err_msg = (
  1004. "The number of quantiles cannot be greater than "
  1005. "the number of samples used. Got 1000 quantiles "
  1006. "and 10 samples."
  1007. )
  1008. with pytest.raises(ValueError, match=err_msg):
  1009. QuantileTransformer(subsample=10).fit(X)
  1010. transformer = QuantileTransformer(n_quantiles=10)
  1011. err_msg = "QuantileTransformer only accepts non-negative sparse matrices."
  1012. with pytest.raises(ValueError, match=err_msg):
  1013. transformer.fit(X_neg)
  1014. transformer.fit(X)
  1015. err_msg = "QuantileTransformer only accepts non-negative sparse matrices."
  1016. with pytest.raises(ValueError, match=err_msg):
  1017. transformer.transform(X_neg)
  1018. X_bad_feat = np.transpose(
  1019. [[0, 25, 50, 0, 0, 0, 75, 0, 0, 100], [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]]
  1020. )
  1021. err_msg = (
  1022. "X has 2 features, but QuantileTransformer is expecting 3 features as input."
  1023. )
  1024. with pytest.raises(ValueError, match=err_msg):
  1025. transformer.inverse_transform(X_bad_feat)
  1026. transformer = QuantileTransformer(n_quantiles=10).fit(X)
  1027. # check that an error is raised if input is scalar
  1028. with pytest.raises(ValueError, match="Expected 2D array, got scalar array instead"):
  1029. transformer.transform(10)
  1030. # check that a warning is raised is n_quantiles > n_samples
  1031. transformer = QuantileTransformer(n_quantiles=100)
  1032. warn_msg = "n_quantiles is set to n_samples"
  1033. with pytest.warns(UserWarning, match=warn_msg) as record:
  1034. transformer.fit(X)
  1035. assert len(record) == 1
  1036. assert transformer.n_quantiles_ == X.shape[0]
  1037. def test_quantile_transform_sparse_ignore_zeros():
  1038. X = np.array([[0, 1], [0, 0], [0, 2], [0, 2], [0, 1]])
  1039. X_sparse = sparse.csc_matrix(X)
  1040. transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5)
  1041. # dense case -> warning raise
  1042. warning_message = (
  1043. "'ignore_implicit_zeros' takes effect"
  1044. " only with sparse matrix. This parameter has no"
  1045. " effect."
  1046. )
  1047. with pytest.warns(UserWarning, match=warning_message):
  1048. transformer.fit(X)
  1049. X_expected = np.array([[0, 0], [0, 0], [0, 1], [0, 1], [0, 0]])
  1050. X_trans = transformer.fit_transform(X_sparse)
  1051. assert_almost_equal(X_expected, X_trans.A)
  1052. # consider the case where sparse entries are missing values and user-given
  1053. # zeros are to be considered
  1054. X_data = np.array([0, 0, 1, 0, 2, 2, 1, 0, 1, 2, 0])
  1055. X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])
  1056. X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6, 7, 8])
  1057. X_sparse = sparse.csc_matrix((X_data, (X_row, X_col)))
  1058. X_trans = transformer.fit_transform(X_sparse)
  1059. X_expected = np.array(
  1060. [
  1061. [0.0, 0.5],
  1062. [0.0, 0.0],
  1063. [0.0, 1.0],
  1064. [0.0, 1.0],
  1065. [0.0, 0.5],
  1066. [0.0, 0.0],
  1067. [0.0, 0.5],
  1068. [0.0, 1.0],
  1069. [0.0, 0.0],
  1070. ]
  1071. )
  1072. assert_almost_equal(X_expected, X_trans.A)
  1073. transformer = QuantileTransformer(ignore_implicit_zeros=True, n_quantiles=5)
  1074. X_data = np.array([-1, -1, 1, 0, 0, 0, 1, -1, 1])
  1075. X_col = np.array([0, 0, 1, 1, 1, 1, 1, 1, 1])
  1076. X_row = np.array([0, 4, 0, 1, 2, 3, 4, 5, 6])
  1077. X_sparse = sparse.csc_matrix((X_data, (X_row, X_col)))
  1078. X_trans = transformer.fit_transform(X_sparse)
  1079. X_expected = np.array(
  1080. [[0, 1], [0, 0.375], [0, 0.375], [0, 0.375], [0, 1], [0, 0], [0, 1]]
  1081. )
  1082. assert_almost_equal(X_expected, X_trans.A)
  1083. assert_almost_equal(X_sparse.A, transformer.inverse_transform(X_trans).A)
  1084. # check in conjunction with subsampling
  1085. transformer = QuantileTransformer(
  1086. ignore_implicit_zeros=True, n_quantiles=5, subsample=8, random_state=0
  1087. )
  1088. X_trans = transformer.fit_transform(X_sparse)
  1089. assert_almost_equal(X_expected, X_trans.A)
  1090. assert_almost_equal(X_sparse.A, transformer.inverse_transform(X_trans).A)
  1091. def test_quantile_transform_dense_toy():
  1092. X = np.array(
  1093. [[0, 2, 2.6], [25, 4, 4.1], [50, 6, 2.3], [75, 8, 9.5], [100, 10, 0.1]]
  1094. )
  1095. transformer = QuantileTransformer(n_quantiles=5)
  1096. transformer.fit(X)
  1097. # using a uniform output, each entry of X should be map between 0 and 1
  1098. # and equally spaced
  1099. X_trans = transformer.fit_transform(X)
  1100. X_expected = np.tile(np.linspace(0, 1, num=5), (3, 1)).T
  1101. assert_almost_equal(np.sort(X_trans, axis=0), X_expected)
  1102. X_test = np.array(
  1103. [
  1104. [-1, 1, 0],
  1105. [101, 11, 10],
  1106. ]
  1107. )
  1108. X_expected = np.array(
  1109. [
  1110. [0, 0, 0],
  1111. [1, 1, 1],
  1112. ]
  1113. )
  1114. assert_array_almost_equal(transformer.transform(X_test), X_expected)
  1115. X_trans_inv = transformer.inverse_transform(X_trans)
  1116. assert_array_almost_equal(X, X_trans_inv)
  1117. def test_quantile_transform_subsampling():
  1118. # Test that subsampling the input yield to a consistent results We check
  1119. # that the computed quantiles are almost mapped to a [0, 1] vector where
  1120. # values are equally spaced. The infinite norm is checked to be smaller
  1121. # than a given threshold. This is repeated 5 times.
  1122. # dense support
  1123. n_samples = 1000000
  1124. n_quantiles = 1000
  1125. X = np.sort(np.random.sample((n_samples, 1)), axis=0)
  1126. ROUND = 5
  1127. inf_norm_arr = []
  1128. for random_state in range(ROUND):
  1129. transformer = QuantileTransformer(
  1130. random_state=random_state,
  1131. n_quantiles=n_quantiles,
  1132. subsample=n_samples // 10,
  1133. )
  1134. transformer.fit(X)
  1135. diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_)
  1136. inf_norm = np.max(np.abs(diff))
  1137. assert inf_norm < 1e-2
  1138. inf_norm_arr.append(inf_norm)
  1139. # each random subsampling yield a unique approximation to the expected
  1140. # linspace CDF
  1141. assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr)
  1142. # sparse support
  1143. X = sparse.rand(n_samples, 1, density=0.99, format="csc", random_state=0)
  1144. inf_norm_arr = []
  1145. for random_state in range(ROUND):
  1146. transformer = QuantileTransformer(
  1147. random_state=random_state,
  1148. n_quantiles=n_quantiles,
  1149. subsample=n_samples // 10,
  1150. )
  1151. transformer.fit(X)
  1152. diff = np.linspace(0, 1, n_quantiles) - np.ravel(transformer.quantiles_)
  1153. inf_norm = np.max(np.abs(diff))
  1154. assert inf_norm < 1e-1
  1155. inf_norm_arr.append(inf_norm)
  1156. # each random subsampling yield a unique approximation to the expected
  1157. # linspace CDF
  1158. assert len(np.unique(inf_norm_arr)) == len(inf_norm_arr)
  1159. def test_quantile_transform_sparse_toy():
  1160. X = np.array(
  1161. [
  1162. [0.0, 2.0, 0.0],
  1163. [25.0, 4.0, 0.0],
  1164. [50.0, 0.0, 2.6],
  1165. [0.0, 0.0, 4.1],
  1166. [0.0, 6.0, 0.0],
  1167. [0.0, 8.0, 0.0],
  1168. [75.0, 0.0, 2.3],
  1169. [0.0, 10.0, 0.0],
  1170. [0.0, 0.0, 9.5],
  1171. [100.0, 0.0, 0.1],
  1172. ]
  1173. )
  1174. X = sparse.csc_matrix(X)
  1175. transformer = QuantileTransformer(n_quantiles=10)
  1176. transformer.fit(X)
  1177. X_trans = transformer.fit_transform(X)
  1178. assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0)
  1179. assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0)
  1180. X_trans_inv = transformer.inverse_transform(X_trans)
  1181. assert_array_almost_equal(X.toarray(), X_trans_inv.toarray())
  1182. transformer_dense = QuantileTransformer(n_quantiles=10).fit(X.toarray())
  1183. X_trans = transformer_dense.transform(X)
  1184. assert_array_almost_equal(np.min(X_trans.toarray(), axis=0), 0.0)
  1185. assert_array_almost_equal(np.max(X_trans.toarray(), axis=0), 1.0)
  1186. X_trans_inv = transformer_dense.inverse_transform(X_trans)
  1187. assert_array_almost_equal(X.toarray(), X_trans_inv.toarray())
  1188. def test_quantile_transform_axis1():
  1189. X = np.array([[0, 25, 50, 75, 100], [2, 4, 6, 8, 10], [2.6, 4.1, 2.3, 9.5, 0.1]])
  1190. X_trans_a0 = quantile_transform(X.T, axis=0, n_quantiles=5)
  1191. X_trans_a1 = quantile_transform(X, axis=1, n_quantiles=5)
  1192. assert_array_almost_equal(X_trans_a0, X_trans_a1.T)
  1193. def test_quantile_transform_bounds():
  1194. # Lower and upper bounds are manually mapped. We checked that in the case
  1195. # of a constant feature and binary feature, the bounds are properly mapped.
  1196. X_dense = np.array([[0, 0], [0, 0], [1, 0]])
  1197. X_sparse = sparse.csc_matrix(X_dense)
  1198. # check sparse and dense are consistent
  1199. X_trans = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(X_dense)
  1200. assert_array_almost_equal(X_trans, X_dense)
  1201. X_trans_sp = QuantileTransformer(n_quantiles=3, random_state=0).fit_transform(
  1202. X_sparse
  1203. )
  1204. assert_array_almost_equal(X_trans_sp.A, X_dense)
  1205. assert_array_almost_equal(X_trans, X_trans_sp.A)
  1206. # check the consistency of the bounds by learning on 1 matrix
  1207. # and transforming another
  1208. X = np.array([[0, 1], [0, 0.5], [1, 0]])
  1209. X1 = np.array([[0, 0.1], [0, 0.5], [1, 0.1]])
  1210. transformer = QuantileTransformer(n_quantiles=3).fit(X)
  1211. X_trans = transformer.transform(X1)
  1212. assert_array_almost_equal(X_trans, X1)
  1213. # check that values outside of the range learned will be mapped properly.
  1214. X = np.random.random((1000, 1))
  1215. transformer = QuantileTransformer()
  1216. transformer.fit(X)
  1217. assert transformer.transform([[-10]]) == transformer.transform([[np.min(X)]])
  1218. assert transformer.transform([[10]]) == transformer.transform([[np.max(X)]])
  1219. assert transformer.inverse_transform([[-10]]) == transformer.inverse_transform(
  1220. [[np.min(transformer.references_)]]
  1221. )
  1222. assert transformer.inverse_transform([[10]]) == transformer.inverse_transform(
  1223. [[np.max(transformer.references_)]]
  1224. )
  1225. def test_quantile_transform_and_inverse():
  1226. X_1 = iris.data
  1227. X_2 = np.array([[0.0], [BOUNDS_THRESHOLD / 10], [1.5], [2], [3], [3], [4]])
  1228. for X in [X_1, X_2]:
  1229. transformer = QuantileTransformer(n_quantiles=1000, random_state=0)
  1230. X_trans = transformer.fit_transform(X)
  1231. X_trans_inv = transformer.inverse_transform(X_trans)
  1232. assert_array_almost_equal(X, X_trans_inv, decimal=9)
  1233. def test_quantile_transform_nan():
  1234. X = np.array([[np.nan, 0, 0, 1], [np.nan, np.nan, 0, 0.5], [np.nan, 1, 1, 0]])
  1235. transformer = QuantileTransformer(n_quantiles=10, random_state=42)
  1236. transformer.fit_transform(X)
  1237. # check that the quantile of the first column is all NaN
  1238. assert np.isnan(transformer.quantiles_[:, 0]).all()
  1239. # all other column should not contain NaN
  1240. assert not np.isnan(transformer.quantiles_[:, 1:]).any()
  1241. @pytest.mark.parametrize("array_type", ["array", "sparse"])
  1242. def test_quantile_transformer_sorted_quantiles(array_type):
  1243. # Non-regression test for:
  1244. # https://github.com/scikit-learn/scikit-learn/issues/15733
  1245. # Taken from upstream bug report:
  1246. # https://github.com/numpy/numpy/issues/14685
  1247. X = np.array([0, 1, 1, 2, 2, 3, 3, 4, 5, 5, 1, 1, 9, 9, 9, 8, 8, 7] * 10)
  1248. X = 0.1 * X.reshape(-1, 1)
  1249. X = _convert_container(X, array_type)
  1250. n_quantiles = 100
  1251. qt = QuantileTransformer(n_quantiles=n_quantiles).fit(X)
  1252. # Check that the estimated quantile thresholds are monotically
  1253. # increasing:
  1254. quantiles = qt.quantiles_[:, 0]
  1255. assert len(quantiles) == 100
  1256. assert all(np.diff(quantiles) >= 0)
  1257. def test_robust_scaler_invalid_range():
  1258. for range_ in [
  1259. (-1, 90),
  1260. (-2, -3),
  1261. (10, 101),
  1262. (100.5, 101),
  1263. (90, 50),
  1264. ]:
  1265. scaler = RobustScaler(quantile_range=range_)
  1266. with pytest.raises(ValueError, match=r"Invalid quantile range: \("):
  1267. scaler.fit(iris.data)
  1268. def test_scale_function_without_centering():
  1269. rng = np.random.RandomState(42)
  1270. X = rng.randn(4, 5)
  1271. X[:, 0] = 0.0 # first feature is always of zero
  1272. X_csr = sparse.csr_matrix(X)
  1273. X_scaled = scale(X, with_mean=False)
  1274. assert not np.any(np.isnan(X_scaled))
  1275. X_csr_scaled = scale(X_csr, with_mean=False)
  1276. assert not np.any(np.isnan(X_csr_scaled.data))
  1277. # test csc has same outcome
  1278. X_csc_scaled = scale(X_csr.tocsc(), with_mean=False)
  1279. assert_array_almost_equal(X_scaled, X_csc_scaled.toarray())
  1280. # raises value error on axis != 0
  1281. with pytest.raises(ValueError):
  1282. scale(X_csr, with_mean=False, axis=1)
  1283. assert_array_almost_equal(
  1284. X_scaled.mean(axis=0), [0.0, -0.01, 2.24, -0.35, -0.78], 2
  1285. )
  1286. assert_array_almost_equal(X_scaled.std(axis=0), [0.0, 1.0, 1.0, 1.0, 1.0])
  1287. # Check that X has not been copied
  1288. assert X_scaled is not X
  1289. X_csr_scaled_mean, X_csr_scaled_std = mean_variance_axis(X_csr_scaled, 0)
  1290. assert_array_almost_equal(X_csr_scaled_mean, X_scaled.mean(axis=0))
  1291. assert_array_almost_equal(X_csr_scaled_std, X_scaled.std(axis=0))
  1292. # null scale
  1293. X_csr_scaled = scale(X_csr, with_mean=False, with_std=False, copy=True)
  1294. assert_array_almost_equal(X_csr.toarray(), X_csr_scaled.toarray())
  1295. def test_robust_scale_axis1():
  1296. X = iris.data
  1297. X_trans = robust_scale(X, axis=1)
  1298. assert_array_almost_equal(np.median(X_trans, axis=1), 0)
  1299. q = np.percentile(X_trans, q=(25, 75), axis=1)
  1300. iqr = q[1] - q[0]
  1301. assert_array_almost_equal(iqr, 1)
  1302. def test_robust_scale_1d_array():
  1303. X = iris.data[:, 1]
  1304. X_trans = robust_scale(X)
  1305. assert_array_almost_equal(np.median(X_trans), 0)
  1306. q = np.percentile(X_trans, q=(25, 75))
  1307. iqr = q[1] - q[0]
  1308. assert_array_almost_equal(iqr, 1)
  1309. def test_robust_scaler_zero_variance_features():
  1310. # Check RobustScaler on toy data with zero variance features
  1311. X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.1], [0.0, 1.0, +1.1]]
  1312. scaler = RobustScaler()
  1313. X_trans = scaler.fit_transform(X)
  1314. # NOTE: for such a small sample size, what we expect in the third column
  1315. # depends HEAVILY on the method used to calculate quantiles. The values
  1316. # here were calculated to fit the quantiles produces by np.percentile
  1317. # using numpy 1.9 Calculating quantiles with
  1318. # scipy.stats.mstats.scoreatquantile or scipy.stats.mstats.mquantiles
  1319. # would yield very different results!
  1320. X_expected = [[0.0, 0.0, +0.0], [0.0, 0.0, -1.0], [0.0, 0.0, +1.0]]
  1321. assert_array_almost_equal(X_trans, X_expected)
  1322. X_trans_inv = scaler.inverse_transform(X_trans)
  1323. assert_array_almost_equal(X, X_trans_inv)
  1324. # make sure new data gets transformed correctly
  1325. X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]]
  1326. X_trans_new = scaler.transform(X_new)
  1327. X_expected_new = [[+0.0, 1.0, +0.0], [-1.0, 0.0, -0.83333], [+0.0, 0.0, +1.66667]]
  1328. assert_array_almost_equal(X_trans_new, X_expected_new, decimal=3)
  1329. def test_robust_scaler_unit_variance():
  1330. # Check RobustScaler with unit_variance=True on standard normal data with
  1331. # outliers
  1332. rng = np.random.RandomState(42)
  1333. X = rng.randn(1000000, 1)
  1334. X_with_outliers = np.vstack([X, np.ones((100, 1)) * 100, np.ones((100, 1)) * -100])
  1335. quantile_range = (1, 99)
  1336. robust_scaler = RobustScaler(quantile_range=quantile_range, unit_variance=True).fit(
  1337. X_with_outliers
  1338. )
  1339. X_trans = robust_scaler.transform(X)
  1340. assert robust_scaler.center_ == pytest.approx(0, abs=1e-3)
  1341. assert robust_scaler.scale_ == pytest.approx(1, abs=1e-2)
  1342. assert X_trans.std() == pytest.approx(1, abs=1e-2)
  1343. def test_maxabs_scaler_zero_variance_features():
  1344. # Check MaxAbsScaler on toy data with zero variance features
  1345. X = [[0.0, 1.0, +0.5], [0.0, 1.0, -0.3], [0.0, 1.0, +1.5], [0.0, 0.0, +0.0]]
  1346. scaler = MaxAbsScaler()
  1347. X_trans = scaler.fit_transform(X)
  1348. X_expected = [
  1349. [0.0, 1.0, 1.0 / 3.0],
  1350. [0.0, 1.0, -0.2],
  1351. [0.0, 1.0, 1.0],
  1352. [0.0, 0.0, 0.0],
  1353. ]
  1354. assert_array_almost_equal(X_trans, X_expected)
  1355. X_trans_inv = scaler.inverse_transform(X_trans)
  1356. assert_array_almost_equal(X, X_trans_inv)
  1357. # make sure new data gets transformed correctly
  1358. X_new = [[+0.0, 2.0, 0.5], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.5]]
  1359. X_trans_new = scaler.transform(X_new)
  1360. X_expected_new = [[+0.0, 2.0, 1.0 / 3.0], [-1.0, 1.0, 0.0], [+0.0, 1.0, 1.0]]
  1361. assert_array_almost_equal(X_trans_new, X_expected_new, decimal=2)
  1362. # function interface
  1363. X_trans = maxabs_scale(X)
  1364. assert_array_almost_equal(X_trans, X_expected)
  1365. # sparse data
  1366. X_csr = sparse.csr_matrix(X)
  1367. X_csc = sparse.csc_matrix(X)
  1368. X_trans_csr = scaler.fit_transform(X_csr)
  1369. X_trans_csc = scaler.fit_transform(X_csc)
  1370. X_expected = [
  1371. [0.0, 1.0, 1.0 / 3.0],
  1372. [0.0, 1.0, -0.2],
  1373. [0.0, 1.0, 1.0],
  1374. [0.0, 0.0, 0.0],
  1375. ]
  1376. assert_array_almost_equal(X_trans_csr.A, X_expected)
  1377. assert_array_almost_equal(X_trans_csc.A, X_expected)
  1378. X_trans_csr_inv = scaler.inverse_transform(X_trans_csr)
  1379. X_trans_csc_inv = scaler.inverse_transform(X_trans_csc)
  1380. assert_array_almost_equal(X, X_trans_csr_inv.A)
  1381. assert_array_almost_equal(X, X_trans_csc_inv.A)
  1382. def test_maxabs_scaler_large_negative_value():
  1383. # Check MaxAbsScaler on toy data with a large negative value
  1384. X = [
  1385. [0.0, 1.0, +0.5, -1.0],
  1386. [0.0, 1.0, -0.3, -0.5],
  1387. [0.0, 1.0, -100.0, 0.0],
  1388. [0.0, 0.0, +0.0, -2.0],
  1389. ]
  1390. scaler = MaxAbsScaler()
  1391. X_trans = scaler.fit_transform(X)
  1392. X_expected = [
  1393. [0.0, 1.0, 0.005, -0.5],
  1394. [0.0, 1.0, -0.003, -0.25],
  1395. [0.0, 1.0, -1.0, 0.0],
  1396. [0.0, 0.0, 0.0, -1.0],
  1397. ]
  1398. assert_array_almost_equal(X_trans, X_expected)
  1399. def test_maxabs_scaler_transform_one_row_csr():
  1400. # Check MaxAbsScaler on transforming csr matrix with one row
  1401. X = sparse.csr_matrix([[0.5, 1.0, 1.0]])
  1402. scaler = MaxAbsScaler()
  1403. scaler = scaler.fit(X)
  1404. X_trans = scaler.transform(X)
  1405. X_expected = sparse.csr_matrix([[1.0, 1.0, 1.0]])
  1406. assert_array_almost_equal(X_trans.toarray(), X_expected.toarray())
  1407. X_scaled_back = scaler.inverse_transform(X_trans)
  1408. assert_array_almost_equal(X.toarray(), X_scaled_back.toarray())
  1409. def test_maxabs_scaler_1d():
  1410. # Test scaling of dataset along single axis
  1411. for X in [X_1row, X_1col, X_list_1row, X_list_1row]:
  1412. scaler = MaxAbsScaler(copy=True)
  1413. X_scaled = scaler.fit(X).transform(X)
  1414. if isinstance(X, list):
  1415. X = np.array(X) # cast only after scaling done
  1416. if _check_dim_1axis(X) == 1:
  1417. assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), np.ones(n_features))
  1418. else:
  1419. assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0)
  1420. assert scaler.n_samples_seen_ == X.shape[0]
  1421. # check inverse transform
  1422. X_scaled_back = scaler.inverse_transform(X_scaled)
  1423. assert_array_almost_equal(X_scaled_back, X)
  1424. # Constant feature
  1425. X = np.ones((5, 1))
  1426. scaler = MaxAbsScaler()
  1427. X_scaled = scaler.fit(X).transform(X)
  1428. assert_array_almost_equal(np.abs(X_scaled.max(axis=0)), 1.0)
  1429. assert scaler.n_samples_seen_ == X.shape[0]
  1430. # function interface
  1431. X_1d = X_1row.ravel()
  1432. max_abs = np.abs(X_1d).max()
  1433. assert_array_almost_equal(X_1d / max_abs, maxabs_scale(X_1d, copy=True))
  1434. def test_maxabs_scaler_partial_fit():
  1435. # Test if partial_fit run over many batches of size 1 and 50
  1436. # gives the same results as fit
  1437. X = X_2d[:100, :]
  1438. n = X.shape[0]
  1439. for chunk_size in [1, 2, 50, n, n + 42]:
  1440. # Test mean at the end of the process
  1441. scaler_batch = MaxAbsScaler().fit(X)
  1442. scaler_incr = MaxAbsScaler()
  1443. scaler_incr_csr = MaxAbsScaler()
  1444. scaler_incr_csc = MaxAbsScaler()
  1445. for batch in gen_batches(n, chunk_size):
  1446. scaler_incr = scaler_incr.partial_fit(X[batch])
  1447. X_csr = sparse.csr_matrix(X[batch])
  1448. scaler_incr_csr = scaler_incr_csr.partial_fit(X_csr)
  1449. X_csc = sparse.csc_matrix(X[batch])
  1450. scaler_incr_csc = scaler_incr_csc.partial_fit(X_csc)
  1451. assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
  1452. assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csr.max_abs_)
  1453. assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr_csc.max_abs_)
  1454. assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
  1455. assert scaler_batch.n_samples_seen_ == scaler_incr_csr.n_samples_seen_
  1456. assert scaler_batch.n_samples_seen_ == scaler_incr_csc.n_samples_seen_
  1457. assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
  1458. assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csr.scale_)
  1459. assert_array_almost_equal(scaler_batch.scale_, scaler_incr_csc.scale_)
  1460. assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X))
  1461. # Test std after 1 step
  1462. batch0 = slice(0, chunk_size)
  1463. scaler_batch = MaxAbsScaler().fit(X[batch0])
  1464. scaler_incr = MaxAbsScaler().partial_fit(X[batch0])
  1465. assert_array_almost_equal(scaler_batch.max_abs_, scaler_incr.max_abs_)
  1466. assert scaler_batch.n_samples_seen_ == scaler_incr.n_samples_seen_
  1467. assert_array_almost_equal(scaler_batch.scale_, scaler_incr.scale_)
  1468. assert_array_almost_equal(scaler_batch.transform(X), scaler_incr.transform(X))
  1469. # Test std until the end of partial fits, and
  1470. scaler_batch = MaxAbsScaler().fit(X)
  1471. scaler_incr = MaxAbsScaler() # Clean estimator
  1472. for i, batch in enumerate(gen_batches(n, chunk_size)):
  1473. scaler_incr = scaler_incr.partial_fit(X[batch])
  1474. assert_correct_incr(
  1475. i,
  1476. batch_start=batch.start,
  1477. batch_stop=batch.stop,
  1478. n=n,
  1479. chunk_size=chunk_size,
  1480. n_samples_seen=scaler_incr.n_samples_seen_,
  1481. )
  1482. def test_normalizer_l1():
  1483. rng = np.random.RandomState(0)
  1484. X_dense = rng.randn(4, 5)
  1485. X_sparse_unpruned = sparse.csr_matrix(X_dense)
  1486. # set the row number 3 to zero
  1487. X_dense[3, :] = 0.0
  1488. # set the row number 3 to zero without pruning (can happen in real life)
  1489. indptr_3 = X_sparse_unpruned.indptr[3]
  1490. indptr_4 = X_sparse_unpruned.indptr[4]
  1491. X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0
  1492. # build the pruned variant using the regular constructor
  1493. X_sparse_pruned = sparse.csr_matrix(X_dense)
  1494. # check inputs that support the no-copy optim
  1495. for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):
  1496. normalizer = Normalizer(norm="l1", copy=True)
  1497. X_norm = normalizer.transform(X)
  1498. assert X_norm is not X
  1499. X_norm1 = toarray(X_norm)
  1500. normalizer = Normalizer(norm="l1", copy=False)
  1501. X_norm = normalizer.transform(X)
  1502. assert X_norm is X
  1503. X_norm2 = toarray(X_norm)
  1504. for X_norm in (X_norm1, X_norm2):
  1505. row_sums = np.abs(X_norm).sum(axis=1)
  1506. for i in range(3):
  1507. assert_almost_equal(row_sums[i], 1.0)
  1508. assert_almost_equal(row_sums[3], 0.0)
  1509. # check input for which copy=False won't prevent a copy
  1510. for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix):
  1511. X = init(X_dense)
  1512. X_norm = normalizer = Normalizer(norm="l2", copy=False).transform(X)
  1513. assert X_norm is not X
  1514. assert sparse.issparse(X_norm) and X_norm.format == "csr"
  1515. X_norm = toarray(X_norm)
  1516. for i in range(3):
  1517. assert_almost_equal(row_sums[i], 1.0)
  1518. assert_almost_equal(la.norm(X_norm[3]), 0.0)
  1519. def test_normalizer_l2():
  1520. rng = np.random.RandomState(0)
  1521. X_dense = rng.randn(4, 5)
  1522. X_sparse_unpruned = sparse.csr_matrix(X_dense)
  1523. # set the row number 3 to zero
  1524. X_dense[3, :] = 0.0
  1525. # set the row number 3 to zero without pruning (can happen in real life)
  1526. indptr_3 = X_sparse_unpruned.indptr[3]
  1527. indptr_4 = X_sparse_unpruned.indptr[4]
  1528. X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0
  1529. # build the pruned variant using the regular constructor
  1530. X_sparse_pruned = sparse.csr_matrix(X_dense)
  1531. # check inputs that support the no-copy optim
  1532. for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):
  1533. normalizer = Normalizer(norm="l2", copy=True)
  1534. X_norm1 = normalizer.transform(X)
  1535. assert X_norm1 is not X
  1536. X_norm1 = toarray(X_norm1)
  1537. normalizer = Normalizer(norm="l2", copy=False)
  1538. X_norm2 = normalizer.transform(X)
  1539. assert X_norm2 is X
  1540. X_norm2 = toarray(X_norm2)
  1541. for X_norm in (X_norm1, X_norm2):
  1542. for i in range(3):
  1543. assert_almost_equal(la.norm(X_norm[i]), 1.0)
  1544. assert_almost_equal(la.norm(X_norm[3]), 0.0)
  1545. # check input for which copy=False won't prevent a copy
  1546. for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix):
  1547. X = init(X_dense)
  1548. X_norm = normalizer = Normalizer(norm="l2", copy=False).transform(X)
  1549. assert X_norm is not X
  1550. assert sparse.issparse(X_norm) and X_norm.format == "csr"
  1551. X_norm = toarray(X_norm)
  1552. for i in range(3):
  1553. assert_almost_equal(la.norm(X_norm[i]), 1.0)
  1554. assert_almost_equal(la.norm(X_norm[3]), 0.0)
  1555. def test_normalizer_max():
  1556. rng = np.random.RandomState(0)
  1557. X_dense = rng.randn(4, 5)
  1558. X_sparse_unpruned = sparse.csr_matrix(X_dense)
  1559. # set the row number 3 to zero
  1560. X_dense[3, :] = 0.0
  1561. # set the row number 3 to zero without pruning (can happen in real life)
  1562. indptr_3 = X_sparse_unpruned.indptr[3]
  1563. indptr_4 = X_sparse_unpruned.indptr[4]
  1564. X_sparse_unpruned.data[indptr_3:indptr_4] = 0.0
  1565. # build the pruned variant using the regular constructor
  1566. X_sparse_pruned = sparse.csr_matrix(X_dense)
  1567. # check inputs that support the no-copy optim
  1568. for X in (X_dense, X_sparse_pruned, X_sparse_unpruned):
  1569. normalizer = Normalizer(norm="max", copy=True)
  1570. X_norm1 = normalizer.transform(X)
  1571. assert X_norm1 is not X
  1572. X_norm1 = toarray(X_norm1)
  1573. normalizer = Normalizer(norm="max", copy=False)
  1574. X_norm2 = normalizer.transform(X)
  1575. assert X_norm2 is X
  1576. X_norm2 = toarray(X_norm2)
  1577. for X_norm in (X_norm1, X_norm2):
  1578. row_maxs = abs(X_norm).max(axis=1)
  1579. for i in range(3):
  1580. assert_almost_equal(row_maxs[i], 1.0)
  1581. assert_almost_equal(row_maxs[3], 0.0)
  1582. # check input for which copy=False won't prevent a copy
  1583. for init in (sparse.coo_matrix, sparse.csc_matrix, sparse.lil_matrix):
  1584. X = init(X_dense)
  1585. X_norm = normalizer = Normalizer(norm="l2", copy=False).transform(X)
  1586. assert X_norm is not X
  1587. assert sparse.issparse(X_norm) and X_norm.format == "csr"
  1588. X_norm = toarray(X_norm)
  1589. for i in range(3):
  1590. assert_almost_equal(row_maxs[i], 1.0)
  1591. assert_almost_equal(la.norm(X_norm[3]), 0.0)
  1592. def test_normalizer_max_sign():
  1593. # check that we normalize by a positive number even for negative data
  1594. rng = np.random.RandomState(0)
  1595. X_dense = rng.randn(4, 5)
  1596. # set the row number 3 to zero
  1597. X_dense[3, :] = 0.0
  1598. # check for mixed data where the value with
  1599. # largest magnitude is negative
  1600. X_dense[2, abs(X_dense[2, :]).argmax()] *= -1
  1601. X_all_neg = -np.abs(X_dense)
  1602. X_all_neg_sparse = sparse.csr_matrix(X_all_neg)
  1603. for X in (X_dense, X_all_neg, X_all_neg_sparse):
  1604. normalizer = Normalizer(norm="max")
  1605. X_norm = normalizer.transform(X)
  1606. assert X_norm is not X
  1607. X_norm = toarray(X_norm)
  1608. assert_array_equal(np.sign(X_norm), np.sign(toarray(X)))
  1609. def test_normalize():
  1610. # Test normalize function
  1611. # Only tests functionality not used by the tests for Normalizer.
  1612. X = np.random.RandomState(37).randn(3, 2)
  1613. assert_array_equal(normalize(X, copy=False), normalize(X.T, axis=0, copy=False).T)
  1614. rs = np.random.RandomState(0)
  1615. X_dense = rs.randn(10, 5)
  1616. X_sparse = sparse.csr_matrix(X_dense)
  1617. ones = np.ones((10))
  1618. for X in (X_dense, X_sparse):
  1619. for dtype in (np.float32, np.float64):
  1620. for norm in ("l1", "l2"):
  1621. X = X.astype(dtype)
  1622. X_norm = normalize(X, norm=norm)
  1623. assert X_norm.dtype == dtype
  1624. X_norm = toarray(X_norm)
  1625. if norm == "l1":
  1626. row_sums = np.abs(X_norm).sum(axis=1)
  1627. else:
  1628. X_norm_squared = X_norm**2
  1629. row_sums = X_norm_squared.sum(axis=1)
  1630. assert_array_almost_equal(row_sums, ones)
  1631. # Test return_norm
  1632. X_dense = np.array([[3.0, 0, 4.0], [1.0, 0.0, 0.0], [2.0, 3.0, 0.0]])
  1633. for norm in ("l1", "l2", "max"):
  1634. _, norms = normalize(X_dense, norm=norm, return_norm=True)
  1635. if norm == "l1":
  1636. assert_array_almost_equal(norms, np.array([7.0, 1.0, 5.0]))
  1637. elif norm == "l2":
  1638. assert_array_almost_equal(norms, np.array([5.0, 1.0, 3.60555127]))
  1639. else:
  1640. assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0]))
  1641. X_sparse = sparse.csr_matrix(X_dense)
  1642. for norm in ("l1", "l2"):
  1643. with pytest.raises(NotImplementedError):
  1644. normalize(X_sparse, norm=norm, return_norm=True)
  1645. _, norms = normalize(X_sparse, norm="max", return_norm=True)
  1646. assert_array_almost_equal(norms, np.array([4.0, 1.0, 3.0]))
  1647. def test_binarizer():
  1648. X_ = np.array([[1, 0, 5], [2, 3, -1]])
  1649. for init in (np.array, list, sparse.csr_matrix, sparse.csc_matrix):
  1650. X = init(X_.copy())
  1651. binarizer = Binarizer(threshold=2.0, copy=True)
  1652. X_bin = toarray(binarizer.transform(X))
  1653. assert np.sum(X_bin == 0) == 4
  1654. assert np.sum(X_bin == 1) == 2
  1655. X_bin = binarizer.transform(X)
  1656. assert sparse.issparse(X) == sparse.issparse(X_bin)
  1657. binarizer = Binarizer(copy=True).fit(X)
  1658. X_bin = toarray(binarizer.transform(X))
  1659. assert X_bin is not X
  1660. assert np.sum(X_bin == 0) == 2
  1661. assert np.sum(X_bin == 1) == 4
  1662. binarizer = Binarizer(copy=True)
  1663. X_bin = binarizer.transform(X)
  1664. assert X_bin is not X
  1665. X_bin = toarray(X_bin)
  1666. assert np.sum(X_bin == 0) == 2
  1667. assert np.sum(X_bin == 1) == 4
  1668. binarizer = Binarizer(copy=False)
  1669. X_bin = binarizer.transform(X)
  1670. if init is not list:
  1671. assert X_bin is X
  1672. binarizer = Binarizer(copy=False)
  1673. X_float = np.array([[1, 0, 5], [2, 3, -1]], dtype=np.float64)
  1674. X_bin = binarizer.transform(X_float)
  1675. if init is not list:
  1676. assert X_bin is X_float
  1677. X_bin = toarray(X_bin)
  1678. assert np.sum(X_bin == 0) == 2
  1679. assert np.sum(X_bin == 1) == 4
  1680. binarizer = Binarizer(threshold=-0.5, copy=True)
  1681. for init in (np.array, list):
  1682. X = init(X_.copy())
  1683. X_bin = toarray(binarizer.transform(X))
  1684. assert np.sum(X_bin == 0) == 1
  1685. assert np.sum(X_bin == 1) == 5
  1686. X_bin = binarizer.transform(X)
  1687. # Cannot use threshold < 0 for sparse
  1688. with pytest.raises(ValueError):
  1689. binarizer.transform(sparse.csc_matrix(X))
  1690. def test_center_kernel():
  1691. # Test that KernelCenterer is equivalent to StandardScaler
  1692. # in feature space
  1693. rng = np.random.RandomState(0)
  1694. X_fit = rng.random_sample((5, 4))
  1695. scaler = StandardScaler(with_std=False)
  1696. scaler.fit(X_fit)
  1697. X_fit_centered = scaler.transform(X_fit)
  1698. K_fit = np.dot(X_fit, X_fit.T)
  1699. # center fit time matrix
  1700. centerer = KernelCenterer()
  1701. K_fit_centered = np.dot(X_fit_centered, X_fit_centered.T)
  1702. K_fit_centered2 = centerer.fit_transform(K_fit)
  1703. assert_array_almost_equal(K_fit_centered, K_fit_centered2)
  1704. # center predict time matrix
  1705. X_pred = rng.random_sample((2, 4))
  1706. K_pred = np.dot(X_pred, X_fit.T)
  1707. X_pred_centered = scaler.transform(X_pred)
  1708. K_pred_centered = np.dot(X_pred_centered, X_fit_centered.T)
  1709. K_pred_centered2 = centerer.transform(K_pred)
  1710. assert_array_almost_equal(K_pred_centered, K_pred_centered2)
  1711. # check the results coherence with the method proposed in:
  1712. # B. Schölkopf, A. Smola, and K.R. Müller,
  1713. # "Nonlinear component analysis as a kernel eigenvalue problem"
  1714. # equation (B.3)
  1715. # K_centered3 = (I - 1_M) K (I - 1_M)
  1716. # = K - 1_M K - K 1_M + 1_M K 1_M
  1717. ones_M = np.ones_like(K_fit) / K_fit.shape[0]
  1718. K_fit_centered3 = K_fit - ones_M @ K_fit - K_fit @ ones_M + ones_M @ K_fit @ ones_M
  1719. assert_allclose(K_fit_centered, K_fit_centered3)
  1720. # K_test_centered3 = (K_test - 1'_M K)(I - 1_M)
  1721. # = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
  1722. ones_prime_M = np.ones_like(K_pred) / K_fit.shape[0]
  1723. K_pred_centered3 = (
  1724. K_pred - ones_prime_M @ K_fit - K_pred @ ones_M + ones_prime_M @ K_fit @ ones_M
  1725. )
  1726. assert_allclose(K_pred_centered, K_pred_centered3)
  1727. def test_kernelcenterer_non_linear_kernel():
  1728. """Check kernel centering for non-linear kernel."""
  1729. rng = np.random.RandomState(0)
  1730. X, X_test = rng.randn(100, 50), rng.randn(20, 50)
  1731. def phi(X):
  1732. """Our mapping function phi."""
  1733. return np.vstack(
  1734. [
  1735. np.clip(X, a_min=0, a_max=None),
  1736. -np.clip(X, a_min=None, a_max=0),
  1737. ]
  1738. )
  1739. phi_X = phi(X)
  1740. phi_X_test = phi(X_test)
  1741. # centered the projection
  1742. scaler = StandardScaler(with_std=False)
  1743. phi_X_center = scaler.fit_transform(phi_X)
  1744. phi_X_test_center = scaler.transform(phi_X_test)
  1745. # create the different kernel
  1746. K = phi_X @ phi_X.T
  1747. K_test = phi_X_test @ phi_X.T
  1748. K_center = phi_X_center @ phi_X_center.T
  1749. K_test_center = phi_X_test_center @ phi_X_center.T
  1750. kernel_centerer = KernelCenterer()
  1751. kernel_centerer.fit(K)
  1752. assert_allclose(kernel_centerer.transform(K), K_center)
  1753. assert_allclose(kernel_centerer.transform(K_test), K_test_center)
  1754. # check the results coherence with the method proposed in:
  1755. # B. Schölkopf, A. Smola, and K.R. Müller,
  1756. # "Nonlinear component analysis as a kernel eigenvalue problem"
  1757. # equation (B.3)
  1758. # K_centered = (I - 1_M) K (I - 1_M)
  1759. # = K - 1_M K - K 1_M + 1_M K 1_M
  1760. ones_M = np.ones_like(K) / K.shape[0]
  1761. K_centered = K - ones_M @ K - K @ ones_M + ones_M @ K @ ones_M
  1762. assert_allclose(kernel_centerer.transform(K), K_centered)
  1763. # K_test_centered = (K_test - 1'_M K)(I - 1_M)
  1764. # = K_test - 1'_M K - K_test 1_M + 1'_M K 1_M
  1765. ones_prime_M = np.ones_like(K_test) / K.shape[0]
  1766. K_test_centered = (
  1767. K_test - ones_prime_M @ K - K_test @ ones_M + ones_prime_M @ K @ ones_M
  1768. )
  1769. assert_allclose(kernel_centerer.transform(K_test), K_test_centered)
  1770. def test_cv_pipeline_precomputed():
  1771. # Cross-validate a regression on four coplanar points with the same
  1772. # value. Use precomputed kernel to ensure Pipeline with KernelCenterer
  1773. # is treated as a pairwise operation.
  1774. X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]])
  1775. y_true = np.ones((4,))
  1776. K = X.dot(X.T)
  1777. kcent = KernelCenterer()
  1778. pipeline = Pipeline([("kernel_centerer", kcent), ("svr", SVR())])
  1779. # did the pipeline set the pairwise attribute?
  1780. assert pipeline._get_tags()["pairwise"]
  1781. # test cross-validation, score should be almost perfect
  1782. # NB: this test is pretty vacuous -- it's mainly to test integration
  1783. # of Pipeline and KernelCenterer
  1784. y_pred = cross_val_predict(pipeline, K, y_true, cv=2)
  1785. assert_array_almost_equal(y_true, y_pred)
  1786. def test_fit_transform():
  1787. rng = np.random.RandomState(0)
  1788. X = rng.random_sample((5, 4))
  1789. for obj in (StandardScaler(), Normalizer(), Binarizer()):
  1790. X_transformed = obj.fit(X).transform(X)
  1791. X_transformed2 = obj.fit_transform(X)
  1792. assert_array_equal(X_transformed, X_transformed2)
  1793. def test_add_dummy_feature():
  1794. X = [[1, 0], [0, 1], [0, 1]]
  1795. X = add_dummy_feature(X)
  1796. assert_array_equal(X, [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
  1797. def test_add_dummy_feature_coo():
  1798. X = sparse.coo_matrix([[1, 0], [0, 1], [0, 1]])
  1799. X = add_dummy_feature(X)
  1800. assert sparse.issparse(X) and X.format == "coo", X
  1801. assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
  1802. def test_add_dummy_feature_csc():
  1803. X = sparse.csc_matrix([[1, 0], [0, 1], [0, 1]])
  1804. X = add_dummy_feature(X)
  1805. assert sparse.issparse(X) and X.format == "csc", X
  1806. assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
  1807. def test_add_dummy_feature_csr():
  1808. X = sparse.csr_matrix([[1, 0], [0, 1], [0, 1]])
  1809. X = add_dummy_feature(X)
  1810. assert sparse.issparse(X) and X.format == "csr", X
  1811. assert_array_equal(X.toarray(), [[1, 1, 0], [1, 0, 1], [1, 0, 1]])
  1812. def test_fit_cold_start():
  1813. X = iris.data
  1814. X_2d = X[:, :2]
  1815. # Scalers that have a partial_fit method
  1816. scalers = [
  1817. StandardScaler(with_mean=False, with_std=False),
  1818. MinMaxScaler(),
  1819. MaxAbsScaler(),
  1820. ]
  1821. for scaler in scalers:
  1822. scaler.fit_transform(X)
  1823. # with a different shape, this may break the scaler unless the internal
  1824. # state is reset
  1825. scaler.fit_transform(X_2d)
  1826. @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
  1827. def test_power_transformer_notfitted(method):
  1828. pt = PowerTransformer(method=method)
  1829. X = np.abs(X_1col)
  1830. with pytest.raises(NotFittedError):
  1831. pt.transform(X)
  1832. with pytest.raises(NotFittedError):
  1833. pt.inverse_transform(X)
  1834. @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
  1835. @pytest.mark.parametrize("standardize", [True, False])
  1836. @pytest.mark.parametrize("X", [X_1col, X_2d])
  1837. def test_power_transformer_inverse(method, standardize, X):
  1838. # Make sure we get the original input when applying transform and then
  1839. # inverse transform
  1840. X = np.abs(X) if method == "box-cox" else X
  1841. pt = PowerTransformer(method=method, standardize=standardize)
  1842. X_trans = pt.fit_transform(X)
  1843. assert_almost_equal(X, pt.inverse_transform(X_trans))
  1844. def test_power_transformer_1d():
  1845. X = np.abs(X_1col)
  1846. for standardize in [True, False]:
  1847. pt = PowerTransformer(method="box-cox", standardize=standardize)
  1848. X_trans = pt.fit_transform(X)
  1849. X_trans_func = power_transform(X, method="box-cox", standardize=standardize)
  1850. X_expected, lambda_expected = stats.boxcox(X.flatten())
  1851. if standardize:
  1852. X_expected = scale(X_expected)
  1853. assert_almost_equal(X_expected.reshape(-1, 1), X_trans)
  1854. assert_almost_equal(X_expected.reshape(-1, 1), X_trans_func)
  1855. assert_almost_equal(X, pt.inverse_transform(X_trans))
  1856. assert_almost_equal(lambda_expected, pt.lambdas_[0])
  1857. assert len(pt.lambdas_) == X.shape[1]
  1858. assert isinstance(pt.lambdas_, np.ndarray)
  1859. def test_power_transformer_2d():
  1860. X = np.abs(X_2d)
  1861. for standardize in [True, False]:
  1862. pt = PowerTransformer(method="box-cox", standardize=standardize)
  1863. X_trans_class = pt.fit_transform(X)
  1864. X_trans_func = power_transform(X, method="box-cox", standardize=standardize)
  1865. for X_trans in [X_trans_class, X_trans_func]:
  1866. for j in range(X_trans.shape[1]):
  1867. X_expected, lmbda = stats.boxcox(X[:, j].flatten())
  1868. if standardize:
  1869. X_expected = scale(X_expected)
  1870. assert_almost_equal(X_trans[:, j], X_expected)
  1871. assert_almost_equal(lmbda, pt.lambdas_[j])
  1872. # Test inverse transformation
  1873. X_inv = pt.inverse_transform(X_trans)
  1874. assert_array_almost_equal(X_inv, X)
  1875. assert len(pt.lambdas_) == X.shape[1]
  1876. assert isinstance(pt.lambdas_, np.ndarray)
  1877. def test_power_transformer_boxcox_strictly_positive_exception():
  1878. # Exceptions should be raised for negative arrays and zero arrays when
  1879. # method is boxcox
  1880. pt = PowerTransformer(method="box-cox")
  1881. pt.fit(np.abs(X_2d))
  1882. X_with_negatives = X_2d
  1883. not_positive_message = "strictly positive"
  1884. with pytest.raises(ValueError, match=not_positive_message):
  1885. pt.transform(X_with_negatives)
  1886. with pytest.raises(ValueError, match=not_positive_message):
  1887. pt.fit(X_with_negatives)
  1888. with pytest.raises(ValueError, match=not_positive_message):
  1889. power_transform(X_with_negatives, method="box-cox")
  1890. with pytest.raises(ValueError, match=not_positive_message):
  1891. pt.transform(np.zeros(X_2d.shape))
  1892. with pytest.raises(ValueError, match=not_positive_message):
  1893. pt.fit(np.zeros(X_2d.shape))
  1894. with pytest.raises(ValueError, match=not_positive_message):
  1895. power_transform(np.zeros(X_2d.shape), method="box-cox")
  1896. @pytest.mark.parametrize("X", [X_2d, np.abs(X_2d), -np.abs(X_2d), np.zeros(X_2d.shape)])
  1897. def test_power_transformer_yeojohnson_any_input(X):
  1898. # Yeo-Johnson method should support any kind of input
  1899. power_transform(X, method="yeo-johnson")
  1900. @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
  1901. def test_power_transformer_shape_exception(method):
  1902. pt = PowerTransformer(method=method)
  1903. X = np.abs(X_2d)
  1904. pt.fit(X)
  1905. # Exceptions should be raised for arrays with different num_columns
  1906. # than during fitting
  1907. wrong_shape_message = (
  1908. r"X has \d+ features, but PowerTransformer is " r"expecting \d+ features"
  1909. )
  1910. with pytest.raises(ValueError, match=wrong_shape_message):
  1911. pt.transform(X[:, 0:1])
  1912. with pytest.raises(ValueError, match=wrong_shape_message):
  1913. pt.inverse_transform(X[:, 0:1])
  1914. def test_power_transformer_lambda_zero():
  1915. pt = PowerTransformer(method="box-cox", standardize=False)
  1916. X = np.abs(X_2d)[:, 0:1]
  1917. # Test the lambda = 0 case
  1918. pt.lambdas_ = np.array([0])
  1919. X_trans = pt.transform(X)
  1920. assert_array_almost_equal(pt.inverse_transform(X_trans), X)
  1921. def test_power_transformer_lambda_one():
  1922. # Make sure lambda = 1 corresponds to the identity for yeo-johnson
  1923. pt = PowerTransformer(method="yeo-johnson", standardize=False)
  1924. X = np.abs(X_2d)[:, 0:1]
  1925. pt.lambdas_ = np.array([1])
  1926. X_trans = pt.transform(X)
  1927. assert_array_almost_equal(X_trans, X)
  1928. @pytest.mark.parametrize(
  1929. "method, lmbda",
  1930. [
  1931. ("box-cox", 0.1),
  1932. ("box-cox", 0.5),
  1933. ("yeo-johnson", 0.1),
  1934. ("yeo-johnson", 0.5),
  1935. ("yeo-johnson", 1.0),
  1936. ],
  1937. )
  1938. def test_optimization_power_transformer(method, lmbda):
  1939. # Test the optimization procedure:
  1940. # - set a predefined value for lambda
  1941. # - apply inverse_transform to a normal dist (we get X_inv)
  1942. # - apply fit_transform to X_inv (we get X_inv_trans)
  1943. # - check that X_inv_trans is roughly equal to X
  1944. rng = np.random.RandomState(0)
  1945. n_samples = 20000
  1946. X = rng.normal(loc=0, scale=1, size=(n_samples, 1))
  1947. pt = PowerTransformer(method=method, standardize=False)
  1948. pt.lambdas_ = [lmbda]
  1949. X_inv = pt.inverse_transform(X)
  1950. pt = PowerTransformer(method=method, standardize=False)
  1951. X_inv_trans = pt.fit_transform(X_inv)
  1952. assert_almost_equal(0, np.linalg.norm(X - X_inv_trans) / n_samples, decimal=2)
  1953. assert_almost_equal(0, X_inv_trans.mean(), decimal=1)
  1954. assert_almost_equal(1, X_inv_trans.std(), decimal=1)
  1955. def test_yeo_johnson_darwin_example():
  1956. # test from original paper "A new family of power transformations to
  1957. # improve normality or symmetry" by Yeo and Johnson.
  1958. X = [6.1, -8.4, 1.0, 2.0, 0.7, 2.9, 3.5, 5.1, 1.8, 3.6, 7.0, 3.0, 9.3, 7.5, -6.0]
  1959. X = np.array(X).reshape(-1, 1)
  1960. lmbda = PowerTransformer(method="yeo-johnson").fit(X).lambdas_
  1961. assert np.allclose(lmbda, 1.305, atol=1e-3)
  1962. @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
  1963. def test_power_transformer_nans(method):
  1964. # Make sure lambda estimation is not influenced by NaN values
  1965. # and that transform() supports NaN silently
  1966. X = np.abs(X_1col)
  1967. pt = PowerTransformer(method=method)
  1968. pt.fit(X)
  1969. lmbda_no_nans = pt.lambdas_[0]
  1970. # concat nans at the end and check lambda stays the same
  1971. X = np.concatenate([X, np.full_like(X, np.nan)])
  1972. X = shuffle(X, random_state=0)
  1973. pt.fit(X)
  1974. lmbda_nans = pt.lambdas_[0]
  1975. assert_almost_equal(lmbda_no_nans, lmbda_nans, decimal=5)
  1976. X_trans = pt.transform(X)
  1977. assert_array_equal(np.isnan(X_trans), np.isnan(X))
  1978. @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
  1979. @pytest.mark.parametrize("standardize", [True, False])
  1980. def test_power_transformer_fit_transform(method, standardize):
  1981. # check that fit_transform() and fit().transform() return the same values
  1982. X = X_1col
  1983. if method == "box-cox":
  1984. X = np.abs(X)
  1985. pt = PowerTransformer(method, standardize=standardize)
  1986. assert_array_almost_equal(pt.fit(X).transform(X), pt.fit_transform(X))
  1987. @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
  1988. @pytest.mark.parametrize("standardize", [True, False])
  1989. def test_power_transformer_copy_True(method, standardize):
  1990. # Check that neither fit, transform, fit_transform nor inverse_transform
  1991. # modify X inplace when copy=True
  1992. X = X_1col
  1993. if method == "box-cox":
  1994. X = np.abs(X)
  1995. X_original = X.copy()
  1996. assert X is not X_original # sanity checks
  1997. assert_array_almost_equal(X, X_original)
  1998. pt = PowerTransformer(method, standardize=standardize, copy=True)
  1999. pt.fit(X)
  2000. assert_array_almost_equal(X, X_original)
  2001. X_trans = pt.transform(X)
  2002. assert X_trans is not X
  2003. X_trans = pt.fit_transform(X)
  2004. assert_array_almost_equal(X, X_original)
  2005. assert X_trans is not X
  2006. X_inv_trans = pt.inverse_transform(X_trans)
  2007. assert X_trans is not X_inv_trans
  2008. @pytest.mark.parametrize("method", ["box-cox", "yeo-johnson"])
  2009. @pytest.mark.parametrize("standardize", [True, False])
  2010. def test_power_transformer_copy_False(method, standardize):
  2011. # check that when copy=False fit doesn't change X inplace but transform,
  2012. # fit_transform and inverse_transform do.
  2013. X = X_1col
  2014. if method == "box-cox":
  2015. X = np.abs(X)
  2016. X_original = X.copy()
  2017. assert X is not X_original # sanity checks
  2018. assert_array_almost_equal(X, X_original)
  2019. pt = PowerTransformer(method, standardize=standardize, copy=False)
  2020. pt.fit(X)
  2021. assert_array_almost_equal(X, X_original) # fit didn't change X
  2022. X_trans = pt.transform(X)
  2023. assert X_trans is X
  2024. if method == "box-cox":
  2025. X = np.abs(X)
  2026. X_trans = pt.fit_transform(X)
  2027. assert X_trans is X
  2028. X_inv_trans = pt.inverse_transform(X_trans)
  2029. assert X_trans is X_inv_trans
  2030. def test_power_transformer_box_cox_raise_all_nans_col():
  2031. """Check that box-cox raises informative when a column contains all nans.
  2032. Non-regression test for gh-26303
  2033. """
  2034. X = rng.random_sample((4, 5))
  2035. X[:, 0] = np.nan
  2036. err_msg = "Column must not be all nan."
  2037. pt = PowerTransformer(method="box-cox")
  2038. with pytest.raises(ValueError, match=err_msg):
  2039. pt.fit_transform(X)
  2040. @pytest.mark.parametrize(
  2041. "X_2",
  2042. [
  2043. sparse.random(10, 1, density=0.8, random_state=0),
  2044. sparse.csr_matrix(np.full((10, 1), fill_value=np.nan)),
  2045. ],
  2046. )
  2047. def test_standard_scaler_sparse_partial_fit_finite_variance(X_2):
  2048. # non-regression test for:
  2049. # https://github.com/scikit-learn/scikit-learn/issues/16448
  2050. X_1 = sparse.random(5, 1, density=0.8)
  2051. scaler = StandardScaler(with_mean=False)
  2052. scaler.fit(X_1).partial_fit(X_2)
  2053. assert np.isfinite(scaler.var_[0])
  2054. @pytest.mark.parametrize("feature_range", [(0, 1), (-10, 10)])
  2055. def test_minmax_scaler_clip(feature_range):
  2056. # test behaviour of the parameter 'clip' in MinMaxScaler
  2057. X = iris.data
  2058. scaler = MinMaxScaler(feature_range=feature_range, clip=True).fit(X)
  2059. X_min, X_max = np.min(X, axis=0), np.max(X, axis=0)
  2060. X_test = [np.r_[X_min[:2] - 10, X_max[2:] + 10]]
  2061. X_transformed = scaler.transform(X_test)
  2062. assert_allclose(
  2063. X_transformed,
  2064. [[feature_range[0], feature_range[0], feature_range[1], feature_range[1]]],
  2065. )
  2066. def test_standard_scaler_raise_error_for_1d_input():
  2067. """Check that `inverse_transform` from `StandardScaler` raises an error
  2068. with 1D array.
  2069. Non-regression test for:
  2070. https://github.com/scikit-learn/scikit-learn/issues/19518
  2071. """
  2072. scaler = StandardScaler().fit(X_2d)
  2073. err_msg = "Expected 2D array, got 1D array instead"
  2074. with pytest.raises(ValueError, match=err_msg):
  2075. scaler.inverse_transform(X_2d[:, 0])
  2076. def test_power_transformer_significantly_non_gaussian():
  2077. """Check that significantly non-Gaussian data before transforms correctly.
  2078. For some explored lambdas, the transformed data may be constant and will
  2079. be rejected. Non-regression test for
  2080. https://github.com/scikit-learn/scikit-learn/issues/14959
  2081. """
  2082. X_non_gaussian = 1e6 * np.array(
  2083. [0.6, 2.0, 3.0, 4.0] * 4 + [11, 12, 12, 16, 17, 20, 85, 90], dtype=np.float64
  2084. ).reshape(-1, 1)
  2085. pt = PowerTransformer()
  2086. with warnings.catch_warnings():
  2087. warnings.simplefilter("error", RuntimeWarning)
  2088. X_trans = pt.fit_transform(X_non_gaussian)
  2089. assert not np.any(np.isnan(X_trans))
  2090. assert X_trans.mean() == pytest.approx(0.0)
  2091. assert X_trans.std() == pytest.approx(1.0)
  2092. assert X_trans.min() > -2
  2093. assert X_trans.max() < 2
  2094. @pytest.mark.parametrize(
  2095. "Transformer",
  2096. [
  2097. MinMaxScaler,
  2098. MaxAbsScaler,
  2099. RobustScaler,
  2100. StandardScaler,
  2101. QuantileTransformer,
  2102. PowerTransformer,
  2103. ],
  2104. )
  2105. def test_one_to_one_features(Transformer):
  2106. """Check one-to-one transformers give correct feature names."""
  2107. tr = Transformer().fit(iris.data)
  2108. names_out = tr.get_feature_names_out(iris.feature_names)
  2109. assert_array_equal(names_out, iris.feature_names)
  2110. @pytest.mark.parametrize(
  2111. "Transformer",
  2112. [
  2113. MinMaxScaler,
  2114. MaxAbsScaler,
  2115. RobustScaler,
  2116. StandardScaler,
  2117. QuantileTransformer,
  2118. PowerTransformer,
  2119. Normalizer,
  2120. Binarizer,
  2121. ],
  2122. )
  2123. def test_one_to_one_features_pandas(Transformer):
  2124. """Check one-to-one transformers give correct feature names."""
  2125. pd = pytest.importorskip("pandas")
  2126. df = pd.DataFrame(iris.data, columns=iris.feature_names)
  2127. tr = Transformer().fit(df)
  2128. names_out_df_default = tr.get_feature_names_out()
  2129. assert_array_equal(names_out_df_default, iris.feature_names)
  2130. names_out_df_valid_in = tr.get_feature_names_out(iris.feature_names)
  2131. assert_array_equal(names_out_df_valid_in, iris.feature_names)
  2132. msg = re.escape("input_features is not equal to feature_names_in_")
  2133. with pytest.raises(ValueError, match=msg):
  2134. invalid_names = list("abcd")
  2135. tr.get_feature_names_out(invalid_names)
  2136. def test_kernel_centerer_feature_names_out():
  2137. """Test that kernel centerer `feature_names_out`."""
  2138. rng = np.random.RandomState(0)
  2139. X = rng.random_sample((6, 4))
  2140. X_pairwise = linear_kernel(X)
  2141. centerer = KernelCenterer().fit(X_pairwise)
  2142. names_out = centerer.get_feature_names_out()
  2143. samples_out2 = X_pairwise.shape[1]
  2144. assert_array_equal(names_out, [f"kernelcenterer{i}" for i in range(samples_out2)])
  2145. @pytest.mark.parametrize("standardize", [True, False])
  2146. def test_power_transformer_constant_feature(standardize):
  2147. """Check that PowerTransfomer leaves constant features unchanged."""
  2148. X = [[-2, 0, 2], [-2, 0, 2], [-2, 0, 2]]
  2149. pt = PowerTransformer(method="yeo-johnson", standardize=standardize).fit(X)
  2150. assert_allclose(pt.lambdas_, [1, 1, 1])
  2151. Xft = pt.fit_transform(X)
  2152. Xt = pt.transform(X)
  2153. for Xt_ in [Xft, Xt]:
  2154. if standardize:
  2155. assert_allclose(Xt_, np.zeros_like(X))
  2156. else:
  2157. assert_allclose(Xt_, X)