# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr>
#          Mathieu Blondel <mathieu@mblondel.org>
#          Olivier Grisel <olivier.grisel@ensta.org>
#          Andreas Mueller <amueller@ais.uni-bonn.de>
#          Eric Martin <eric@ericmart.in>
#          Giorgio Patrini <giorgio.patrini@anu.edu.au>
#          Eric Chang <ericchang2017@u.northwestern.edu>
# License: BSD 3 clause


import warnings
from numbers import Integral, Real

import numpy as np
from scipy import optimize, sparse, stats
from scipy.special import boxcox

from ..base import (
    BaseEstimator,
    ClassNamePrefixFeaturesOutMixin,
    OneToOneFeatureMixin,
    TransformerMixin,
    _fit_context,
)
from ..utils import check_array
from ..utils._param_validation import Interval, Options, StrOptions, validate_params
from ..utils.extmath import _incremental_mean_and_var, row_norms
from ..utils.sparsefuncs import (
    incr_mean_variance_axis,
    inplace_column_scale,
    mean_variance_axis,
    min_max_axis,
)
from ..utils.sparsefuncs_fast import (
    inplace_csr_row_normalize_l1,
    inplace_csr_row_normalize_l2,
)
from ..utils.validation import (
    FLOAT_DTYPES,
    _check_sample_weight,
    check_is_fitted,
    check_random_state,
)
from ._encoders import OneHotEncoder

BOUNDS_THRESHOLD = 1e-7

__all__ = [
    "Binarizer",
    "KernelCenterer",
    "MinMaxScaler",
    "MaxAbsScaler",
    "Normalizer",
    "OneHotEncoder",
    "RobustScaler",
    "StandardScaler",
    "QuantileTransformer",
    "PowerTransformer",
    "add_dummy_feature",
    "binarize",
    "normalize",
    "scale",
    "robust_scale",
    "maxabs_scale",
    "minmax_scale",
    "quantile_transform",
    "power_transform",
]


def _is_constant_feature(var, mean, n_samples):
    """Detect if a feature is indistinguishable from a constant feature.

    The detection is based on its computed variance and on the theoretical
    error bounds of the '2 pass algorithm' for variance computation.

    See "Algorithms for computing the sample variance: analysis and
    recommendations", by Chan, Golub, and LeVeque.
    """
    # In scikit-learn, variance is always computed using float64 accumulators.
    eps = np.finfo(np.float64).eps

    upper_bound = n_samples * eps * var + (n_samples * mean * eps) ** 2
    return var <= upper_bound
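

# Illustrative sanity check for `_is_constant_feature` (a doctest-style
# sketch kept as a comment so it does not run at import time; the inputs
# below are made up for illustration):
#   >>> import numpy as np
#   >>> x = np.full(1000, 3.14)  # numerically constant feature
#   >>> bool(_is_constant_feature(x.var(), x.mean(), x.size))
#   True
#   >>> rng = np.random.RandomState(0)
#   >>> y = rng.rand(1000)  # genuinely varying feature
#   >>> bool(_is_constant_feature(y.var(), y.mean(), y.size))
#   False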


def _handle_zeros_in_scale(scale, copy=True, constant_mask=None):
    """Set scales of near constant features to 1.

    The goal is to avoid division by very small or zero values.

    Near constant features are detected automatically by identifying
    scales close to machine precision unless they are precomputed by
    the caller and passed with the `constant_mask` kwarg.

    Typically for standard scaling, the scales are the standard
    deviation while near constant features are better detected on the
    computed variances which are closer to machine precision by
    construction.
    """
    # if we are fitting on 1D arrays, scale might be a scalar
    if np.isscalar(scale):
        if scale == 0.0:
            scale = 1.0
        return scale
    elif isinstance(scale, np.ndarray):
        if constant_mask is None:
            # Detect near constant values to avoid dividing by a very small
            # value that could lead to surprising results and numerical
            # stability issues.
            constant_mask = scale < 10 * np.finfo(scale.dtype).eps

        if copy:
            # New array to avoid side-effects
            scale = scale.copy()
        scale[constant_mask] = 1.0
        return scale
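

# Illustrative behaviour of `_handle_zeros_in_scale` (a doctest-style
# sketch kept as a comment; values chosen only for illustration):
#   >>> import numpy as np
#   >>> _handle_zeros_in_scale(np.array([1e-20, 2.0, 0.0]))
#   array([1., 2., 1.])
#   >>> _handle_zeros_in_scale(0.0)  # scalar path, e.g. when fitting 1D data
#   1.0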


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "axis": [Options(Integral, {0, 1})],
        "with_mean": ["boolean"],
        "with_std": ["boolean"],
        "copy": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True):
    """Standardize a dataset along any axis.

    Center to the mean and component wise scale to unit variance.

    Read more in the :ref:`User Guide <preprocessing_scaler>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data to center and scale.

    axis : {0, 1}, default=0
        Axis used to compute the means and standard deviations along. If 0,
        independently standardize each feature, otherwise (if 1) standardize
        each sample.

    with_mean : bool, default=True
        If True, center the data before scaling.

    with_std : bool, default=True
        If True, scale the data to unit variance (or equivalently,
        unit standard deviation).

    copy : bool, default=True
        Set to False to perform inplace scaling and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSC matrix and if axis is 1).

    Returns
    -------
    X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
        The transformed data.

    See Also
    --------
    StandardScaler : Performs scaling to unit variance using the Transformer
        API (e.g. as part of a preprocessing
        :class:`~sklearn.pipeline.Pipeline`).

    Notes
    -----
    This implementation will refuse to center scipy.sparse matrices
    since it would make them non-sparse and would potentially crash the
    program with memory exhaustion problems.

    Instead the caller is expected to either set explicitly
    `with_mean=False` (in that case, only variance scaling will be
    performed on the features of the CSC matrix) or to call `X.toarray()`
    if the materialized dense array is expected to fit in memory.

    To avoid memory copy the caller should pass a CSC matrix.

    NaNs are treated as missing values: disregarded to compute the statistics,
    and maintained during the data transformation.

    We use a biased estimator for the standard deviation, equivalent to
    `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to
    affect model performance.

    For a comparison of the different scalers, transformers, and normalizers,
    see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.

    .. warning:: Risk of data leak

        Do not use :func:`~sklearn.preprocessing.scale` unless you know
        what you are doing. A common mistake is to apply it to the entire data
        *before* splitting into training and test sets. This will bias the
        model evaluation because information would have leaked from the test
        set to the training set.

        In general, we recommend using
        :class:`~sklearn.preprocessing.StandardScaler` within a
        :ref:`Pipeline <pipeline>` in order to prevent most risks of data
        leaking: `pipe = make_pipeline(StandardScaler(), LogisticRegression())`.
    """  # noqa
    X = check_array(
        X,
        accept_sparse="csc",
        copy=copy,
        ensure_2d=False,
        estimator="the scale function",
        dtype=FLOAT_DTYPES,
        force_all_finite="allow-nan",
    )
    if sparse.issparse(X):
        if with_mean:
            raise ValueError(
                "Cannot center sparse matrices: pass `with_mean=False` instead."
                " See docstring for motivation and alternatives."
            )
        if axis != 0:
            raise ValueError(
                "Can only scale sparse matrix on axis=0, got axis=%d" % axis
            )
        if with_std:
            _, var = mean_variance_axis(X, axis=0)
            var = _handle_zeros_in_scale(var, copy=False)
            inplace_column_scale(X, 1 / np.sqrt(var))
    else:
        X = np.asarray(X)
        if with_mean:
            mean_ = np.nanmean(X, axis)
        if with_std:
            scale_ = np.nanstd(X, axis)
        # Xr is a view on the original array that enables easy use of
        # broadcasting on the axis in which we are interested
        Xr = np.rollaxis(X, axis)
        if with_mean:
            Xr -= mean_
            mean_1 = np.nanmean(Xr, axis=0)
            # Verify that mean_1 is 'close to zero'. If X contains very
            # large values, mean_1 can also be very large, due to a lack of
            # precision of mean_. In this case, a pre-scaling of the
            # concerned feature is efficient, for instance by its mean or
            # maximum.
            if not np.allclose(mean_1, 0):
                warnings.warn(
                    "Numerical issues were encountered "
                    "when centering the data "
                    "and might not be solved. Dataset may "
                    "contain too large values. You may need "
                    "to prescale your features."
                )
                Xr -= mean_1
        if with_std:
            scale_ = _handle_zeros_in_scale(scale_, copy=False)
            Xr /= scale_
            if with_mean:
                mean_2 = np.nanmean(Xr, axis=0)
                # If mean_2 is not 'close to zero', it comes from the fact that
                # scale_ is very small so that mean_2 = mean_1/scale_ > 0, even
                # if mean_1 was close to zero. The problem is thus essentially
                # due to the lack of precision of mean_. A solution is then to
                # subtract the mean again:
                if not np.allclose(mean_2, 0):
                    warnings.warn(
                        "Numerical issues were encountered "
                        "when scaling the data "
                        "and might not be solved. The standard "
                        "deviation of the data is probably "
                        "very close to 0. "
                    )
                    Xr -= mean_2
    return X
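

# Usage sketch for `scale` (a doctest-style comment; the toy matrix is
# made up for illustration):
#   >>> import numpy as np
#   >>> from sklearn.preprocessing import scale
#   >>> X = np.array([[1.0, 2.0], [3.0, 2.0], [5.0, 8.0]])
#   >>> Xs = scale(X)  # standardize each column independently
#   >>> bool(np.allclose(Xs.mean(axis=0), 0.0))
#   True
#   >>> bool(np.allclose(Xs.std(axis=0), 1.0))
#   True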


class MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Transform features by scaling each feature to a given range.

    This estimator scales and translates each feature individually such
    that it is in the given range on the training set, e.g. between
    zero and one.

    The transformation is given by::

        X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
        X_scaled = X_std * (max - min) + min

    where min, max = feature_range.

    This transformation is often used as an alternative to zero mean,
    unit variance scaling.

    `MinMaxScaler` doesn't reduce the effect of outliers, but it linearly
    scales them down into a fixed range, where the largest occurring data point
    corresponds to the maximum value and the smallest one corresponds to the
    minimum value. For an example visualization, refer to :ref:`Compare
    MinMaxScaler with other scalers <plot_all_scaling_minmax_scaler_section>`.

    Read more in the :ref:`User Guide <preprocessing_scaler>`.

    Parameters
    ----------
    feature_range : tuple (min, max), default=(0, 1)
        Desired range of transformed data.

    copy : bool, default=True
        Set to False to perform inplace scaling and avoid a
        copy (if the input is already a numpy array).

    clip : bool, default=False
        Set to True to clip transformed values of held-out data to
        provided `feature_range`.

        .. versionadded:: 0.24

    Attributes
    ----------
    min_ : ndarray of shape (n_features,)
        Per feature adjustment for minimum. Equivalent to
        ``min - X.min(axis=0) * self.scale_``

    scale_ : ndarray of shape (n_features,)
        Per feature relative scaling of the data. Equivalent to
        ``(max - min) / (X.max(axis=0) - X.min(axis=0))``

        .. versionadded:: 0.17
           *scale_* attribute.

    data_min_ : ndarray of shape (n_features,)
        Per feature minimum seen in the data

        .. versionadded:: 0.17
           *data_min_*

    data_max_ : ndarray of shape (n_features,)
        Per feature maximum seen in the data

        .. versionadded:: 0.17
           *data_max_*

    data_range_ : ndarray of shape (n_features,)
        Per feature range ``(data_max_ - data_min_)`` seen in the data

        .. versionadded:: 0.17
           *data_range_*

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    n_samples_seen_ : int
        The number of samples processed by the estimator.
        It will be reset on new calls to fit, but increments across
        ``partial_fit`` calls.

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    minmax_scale : Equivalent function without the estimator API.

    Notes
    -----
    NaNs are treated as missing values: disregarded in fit, and maintained in
    transform.

    Examples
    --------
    >>> from sklearn.preprocessing import MinMaxScaler
    >>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
    >>> scaler = MinMaxScaler()
    >>> print(scaler.fit(data))
    MinMaxScaler()
    >>> print(scaler.data_max_)
    [ 1. 18.]
    >>> print(scaler.transform(data))
    [[0.   0.  ]
     [0.25 0.25]
     [0.5  0.5 ]
     [1.   1.  ]]
    >>> print(scaler.transform([[2, 2]]))
    [[1.5 0. ]]
    """

    _parameter_constraints: dict = {
        "feature_range": [tuple],
        "copy": ["boolean"],
        "clip": ["boolean"],
    }

    def __init__(self, feature_range=(0, 1), *, copy=True, clip=False):
        self.feature_range = feature_range
        self.copy = copy
        self.clip = clip

    def _reset(self):
        """Reset internal data-dependent state of the scaler, if necessary.

        __init__ parameters are not touched.
        """
        # Checking one attribute is enough, because they are all set together
        # in partial_fit
        if hasattr(self, "scale_"):
            del self.scale_
            del self.min_
            del self.n_samples_seen_
            del self.data_min_
            del self.data_max_
            del self.data_range_

    def fit(self, X, y=None):
        """Compute the minimum and maximum to be used for later scaling.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data used to compute the per-feature minimum and maximum
            used for later scaling along the features axis.

        y : None
            Ignored.

        Returns
        -------
        self : object
            Fitted scaler.
        """
        # Reset internal state before fitting
        self._reset()
        return self.partial_fit(X, y)

    @_fit_context(prefer_skip_nested_validation=True)
    def partial_fit(self, X, y=None):
        """Online computation of min and max on X for later scaling.

        All of X is processed as a single batch. This is intended for cases
        when :meth:`fit` is not feasible due to very large number of
        `n_samples` or because X is read from a continuous stream.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data used to compute the per-feature minimum and maximum
            used for later scaling along the features axis.

        y : None
            Ignored.

        Returns
        -------
        self : object
            Fitted scaler.
        """
        feature_range = self.feature_range
        if feature_range[0] >= feature_range[1]:
            raise ValueError(
                "Minimum of desired feature range must be smaller than maximum. Got %s."
                % str(feature_range)
            )

        if sparse.issparse(X):
            raise TypeError(
                "MinMaxScaler does not support sparse input. "
                "Consider using MaxAbsScaler instead."
            )

        first_pass = not hasattr(self, "n_samples_seen_")
        X = self._validate_data(
            X,
            reset=first_pass,
            dtype=FLOAT_DTYPES,
            force_all_finite="allow-nan",
        )

        data_min = np.nanmin(X, axis=0)
        data_max = np.nanmax(X, axis=0)

        if first_pass:
            self.n_samples_seen_ = X.shape[0]
        else:
            data_min = np.minimum(self.data_min_, data_min)
            data_max = np.maximum(self.data_max_, data_max)
            self.n_samples_seen_ += X.shape[0]

        data_range = data_max - data_min
        self.scale_ = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale(
            data_range, copy=True
        )
        self.min_ = feature_range[0] - data_min * self.scale_
        self.data_min_ = data_min
        self.data_max_ = data_max
        self.data_range_ = data_range
        return self

    def transform(self, X):
        """Scale features of X according to feature_range.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data that will be transformed.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_features)
            Transformed data.
        """
        check_is_fitted(self)

        X = self._validate_data(
            X,
            copy=self.copy,
            dtype=FLOAT_DTYPES,
            force_all_finite="allow-nan",
            reset=False,
        )

        X *= self.scale_
        X += self.min_
        if self.clip:
            np.clip(X, self.feature_range[0], self.feature_range[1], out=X)
        return X

    def inverse_transform(self, X):
        """Undo the scaling of X according to feature_range.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data that will be transformed. It cannot be sparse.

        Returns
        -------
        Xt : ndarray of shape (n_samples, n_features)
            Transformed data.
        """
        check_is_fitted(self)

        X = check_array(
            X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan"
        )

        X -= self.min_
        X /= self.scale_
        return X

    def _more_tags(self):
        return {"allow_nan": True}
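

# Usage sketch for `MinMaxScaler`: incremental fitting with `partial_fit`
# plus `clip=True` for held-out data (a doctest-style comment; the batches
# are made up for illustration):
#   >>> from sklearn.preprocessing import MinMaxScaler
#   >>> scaler = MinMaxScaler(clip=True)
#   >>> scaler = scaler.partial_fit([[0.0], [5.0]])
#   >>> scaler = scaler.partial_fit([[10.0]])  # widens the seen range
#   >>> scaler.data_max_
#   array([10.])
#   >>> scaler.transform([[20.0]])  # out-of-range input is clipped
#   array([[1.]])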


@validate_params(
    {
        "X": ["array-like"],
        "axis": [Options(Integral, {0, 1})],
    },
    prefer_skip_nested_validation=False,
)
def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True):
    """Transform features by scaling each feature to a given range.

    This estimator scales and translates each feature individually such
    that it is in the given range on the training set, e.g. between
    zero and one.

    The transformation is given by (when ``axis=0``)::

        X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
        X_scaled = X_std * (max - min) + min

    where min, max = feature_range.

    The transformation is calculated as (when ``axis=0``)::

        X_scaled = scale * X + min - X.min(axis=0) * scale
        where scale = (max - min) / (X.max(axis=0) - X.min(axis=0))

    This transformation is often used as an alternative to zero mean,
    unit variance scaling.

    Read more in the :ref:`User Guide <preprocessing_scaler>`.

    .. versionadded:: 0.17
       *minmax_scale* function interface
       to :class:`~sklearn.preprocessing.MinMaxScaler`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data.

    feature_range : tuple (min, max), default=(0, 1)
        Desired range of transformed data.

    axis : {0, 1}, default=0
        Axis used to scale along. If 0, independently scale each feature,
        otherwise (if 1) scale each sample.

    copy : bool, default=True
        Set to False to perform inplace scaling and avoid a copy (if the input
        is already a numpy array).

    Returns
    -------
    X_tr : ndarray of shape (n_samples, n_features)
        The transformed data.

    .. warning:: Risk of data leak

        Do not use :func:`~sklearn.preprocessing.minmax_scale` unless you know
        what you are doing. A common mistake is to apply it to the entire data
        *before* splitting into training and test sets. This will bias the
        model evaluation because information would have leaked from the test
        set to the training set.

        In general, we recommend using
        :class:`~sklearn.preprocessing.MinMaxScaler` within a
        :ref:`Pipeline <pipeline>` in order to prevent most risks of data
        leaking: `pipe = make_pipeline(MinMaxScaler(), LogisticRegression())`.

    See Also
    --------
    MinMaxScaler : Performs scaling to a given range using the Transformer
        API (e.g. as part of a preprocessing
        :class:`~sklearn.pipeline.Pipeline`).

    Notes
    -----
    For a comparison of the different scalers, transformers, and normalizers,
    see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
    """
    # Unlike the scaler object, this function allows 1d input.

    # If copy is required, it will be done inside the scaler object.
    X = check_array(
        X, copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, force_all_finite="allow-nan"
    )
    original_ndim = X.ndim

    if original_ndim == 1:
        X = X.reshape(X.shape[0], 1)

    s = MinMaxScaler(feature_range=feature_range, copy=copy)
    if axis == 0:
        X = s.fit_transform(X)
    else:
        X = s.fit_transform(X.T).T

    if original_ndim == 1:
        X = X.ravel()

    return X
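

# Usage sketch for `minmax_scale` (a doctest-style comment): unlike the
# scaler class, 1d input is accepted and a 1d array is returned:
#   >>> from sklearn.preprocessing import minmax_scale
#   >>> minmax_scale([1.0, 2.0, 3.0])
#   array([0. , 0.5, 1. ])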


class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Standardize features by removing the mean and scaling to unit variance.

    The standard score of a sample `x` is calculated as:

        z = (x - u) / s

    where `u` is the mean of the training samples or zero if `with_mean=False`,
    and `s` is the standard deviation of the training samples or one if
    `with_std=False`.

    Centering and scaling happen independently on each feature by computing
    the relevant statistics on the samples in the training set. Mean and
    standard deviation are then stored to be used on later data using
    :meth:`transform`.

    Standardization of a dataset is a common requirement for many
    machine learning estimators: they might behave badly if the
    individual features do not more or less look like standard normally
    distributed data (e.g. Gaussian with 0 mean and unit variance).

    For instance many elements used in the objective function of
    a learning algorithm (such as the RBF kernel of Support Vector
    Machines or the L1 and L2 regularizers of linear models) assume that
    all features are centered around 0 and have variance in the same
    order. If a feature has a variance that is orders of magnitude larger
    than others, it might dominate the objective function and make the
    estimator unable to learn from other features correctly as expected.

    `StandardScaler` is sensitive to outliers, and the features may scale
    differently from each other in the presence of outliers. For an example
    visualization, refer to :ref:`Compare StandardScaler with other scalers
    <plot_all_scaling_standard_scaler_section>`.

    This scaler can also be applied to sparse CSR or CSC matrices by passing
    `with_mean=False` to avoid breaking the sparsity structure of the data.

    Read more in the :ref:`User Guide <preprocessing_scaler>`.

    Parameters
    ----------
    copy : bool, default=True
        If False, try to avoid a copy and do inplace scaling instead.
        This is not guaranteed to always work inplace; e.g. if the data is
        not a NumPy array or scipy.sparse CSR matrix, a copy may still be
        returned.

    with_mean : bool, default=True
        If True, center the data before scaling.
        This does not work (and will raise an exception) when attempted on
        sparse matrices, because centering them entails building a dense
        matrix which in common use cases is likely to be too large to fit in
        memory.

    with_std : bool, default=True
        If True, scale the data to unit variance (or equivalently,
        unit standard deviation).

    Attributes
    ----------
    scale_ : ndarray of shape (n_features,) or None
        Per feature relative scaling of the data to achieve zero mean and unit
        variance. Generally this is calculated using `np.sqrt(var_)`. If a
        variance is zero, we can't achieve unit variance, and the data is left
        as-is, giving a scaling factor of 1. `scale_` is equal to `None`
        when `with_std=False`.

        .. versionadded:: 0.17
           *scale_*

    mean_ : ndarray of shape (n_features,) or None
        The mean value for each feature in the training set.
        Equal to ``None`` when ``with_mean=False`` and ``with_std=False``.

    var_ : ndarray of shape (n_features,) or None
        The variance for each feature in the training set. Used to compute
        `scale_`. Equal to ``None`` when ``with_mean=False`` and
        ``with_std=False``.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_samples_seen_ : int or ndarray of shape (n_features,)
        The number of samples processed by the estimator for each feature.
        If there are no missing samples, the ``n_samples_seen`` will be an
        integer, otherwise it will be an array of dtype int. If
        `sample_weights` are used it will be a float (if no missing data)
        or an array of dtype float that sums the weights seen so far.
        Will be reset on new calls to fit, but increments across
        ``partial_fit`` calls.

    See Also
    --------
    scale : Equivalent function without the estimator API.

    :class:`~sklearn.decomposition.PCA` : Further removes the linear
        correlation across features with 'whiten=True'.

    Notes
    -----
    NaNs are treated as missing values: disregarded in fit, and maintained in
    transform.

    We use a biased estimator for the standard deviation, equivalent to
    `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to
    affect model performance.

    Examples
    --------
    >>> from sklearn.preprocessing import StandardScaler
    >>> data = [[0, 0], [0, 0], [1, 1], [1, 1]]
    >>> scaler = StandardScaler()
    >>> print(scaler.fit(data))
    StandardScaler()
    >>> print(scaler.mean_)
    [0.5 0.5]
    >>> print(scaler.transform(data))
    [[-1. -1.]
     [-1. -1.]
     [ 1.  1.]
     [ 1.  1.]]
    >>> print(scaler.transform([[2, 2]]))
    [[3. 3.]]
    """

    _parameter_constraints: dict = {
        "copy": ["boolean"],
        "with_mean": ["boolean"],
        "with_std": ["boolean"],
    }

    def __init__(self, *, copy=True, with_mean=True, with_std=True):
        self.with_mean = with_mean
        self.with_std = with_std
        self.copy = copy

    def _reset(self):
        """Reset internal data-dependent state of the scaler, if necessary.

        __init__ parameters are not touched.
        """
        # Checking one attribute is enough, because they are all set together
        # in partial_fit
        if hasattr(self, "scale_"):
            del self.scale_
            del self.n_samples_seen_
            del self.mean_
            del self.var_

    def fit(self, X, y=None, sample_weight=None):
        """Compute the mean and std to be used for later scaling.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis.

        y : None
            Ignored.

        sample_weight : array-like of shape (n_samples,), default=None
            Individual weights for each sample.

            .. versionadded:: 0.24
               parameter *sample_weight* support to StandardScaler.

        Returns
        -------
        self : object
            Fitted scaler.
        """
        # Reset internal state before fitting
        self._reset()
        return self.partial_fit(X, y, sample_weight)

    @_fit_context(prefer_skip_nested_validation=True)
    def partial_fit(self, X, y=None, sample_weight=None):
        """Online computation of mean and std on X for later scaling.

        All of X is processed as a single batch. This is intended for cases
        when :meth:`fit` is not feasible due to very large number of
        `n_samples` or because X is read from a continuous stream.

        The algorithm for incremental mean and std is given in Equation 1.5a,b
        in Chan, Tony F., Gene H. Golub, and Randall J. LeVeque. "Algorithms
        for computing the sample variance: Analysis and recommendations."
        The American Statistician 37.3 (1983): 242-247.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to compute the mean and standard deviation
            used for later scaling along the features axis.

        y : None
            Ignored.

        sample_weight : array-like of shape (n_samples,), default=None
            Individual weights for each sample.

            .. versionadded:: 0.24
               parameter *sample_weight* support to StandardScaler.

        Returns
        -------
        self : object
            Fitted scaler.
        """
        first_call = not hasattr(self, "n_samples_seen_")
        X = self._validate_data(
            X,
            accept_sparse=("csr", "csc"),
            dtype=FLOAT_DTYPES,
            force_all_finite="allow-nan",
            reset=first_call,
        )
        n_features = X.shape[1]

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        # Even in the case of `with_mean=False`, we update the mean anyway
        # This is needed for the incremental computation of the var
        # See incr_mean_variance_axis and _incremental_mean_and_var

        # if n_samples_seen_ is an integer (i.e. no missing values), we need to
        # transform it to a NumPy array of shape (n_features,) required by
        # incr_mean_variance_axis and _incremental_mean_and_var
        dtype = np.int64 if sample_weight is None else X.dtype
        if not hasattr(self, "n_samples_seen_"):
            self.n_samples_seen_ = np.zeros(n_features, dtype=dtype)
        elif np.size(self.n_samples_seen_) == 1:
            self.n_samples_seen_ = np.repeat(self.n_samples_seen_, X.shape[1])
            self.n_samples_seen_ = self.n_samples_seen_.astype(dtype, copy=False)

        if sparse.issparse(X):
            if self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives."
                )
            sparse_constructor = (
                sparse.csr_matrix if X.format == "csr" else sparse.csc_matrix
            )

            if self.with_std:
                # First pass
                if not hasattr(self, "scale_"):
                    self.mean_, self.var_, self.n_samples_seen_ = mean_variance_axis(
                        X, axis=0, weights=sample_weight, return_sum_weights=True
                    )
                # Next passes
                else:
                    (
                        self.mean_,
                        self.var_,
                        self.n_samples_seen_,
                    ) = incr_mean_variance_axis(
                        X,
                        axis=0,
                        last_mean=self.mean_,
                        last_var=self.var_,
                        last_n=self.n_samples_seen_,
                        weights=sample_weight,
                    )
                # We force the mean and variance to float64 for large arrays
                # See https://github.com/scikit-learn/scikit-learn/pull/12338
                self.mean_ = self.mean_.astype(np.float64, copy=False)
                self.var_ = self.var_.astype(np.float64, copy=False)
            else:
                self.mean_ = None  # as with_mean must be False for sparse
                self.var_ = None
                weights = _check_sample_weight(sample_weight, X)
                sum_weights_nan = weights @ sparse_constructor(
                    (np.isnan(X.data), X.indices, X.indptr), shape=X.shape
                )
                self.n_samples_seen_ += (np.sum(weights) - sum_weights_nan).astype(
                    dtype
                )
        else:
            # First pass
            if not hasattr(self, "scale_"):
                self.mean_ = 0.0
                if self.with_std:
                    self.var_ = 0.0
                else:
                    self.var_ = None

            if not self.with_mean and not self.with_std:
                self.mean_ = None
                self.var_ = None
                self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)

            else:
                self.mean_, self.var_, self.n_samples_seen_ = _incremental_mean_and_var(
                    X,
                    self.mean_,
                    self.var_,
                    self.n_samples_seen_,
                    sample_weight=sample_weight,
                )

        # for backward-compatibility, reduce n_samples_seen_ to an integer
        # if the number of samples is the same for each feature (i.e. no
        # missing values)
        if np.ptp(self.n_samples_seen_) == 0:
            self.n_samples_seen_ = self.n_samples_seen_[0]

        if self.with_std:
            # Extract the list of near constant features on the raw variances,
            # before taking the square root.
            constant_mask = _is_constant_feature(
                self.var_, self.mean_, self.n_samples_seen_
            )
            self.scale_ = _handle_zeros_in_scale(
                np.sqrt(self.var_), copy=False, constant_mask=constant_mask
            )
        else:
            self.scale_ = None

        return self

    def transform(self, X, copy=None):
        """Perform standardization by centering and scaling.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to scale along the features axis.

        copy : bool, default=None
            Copy the input X or not.

        Returns
        -------
        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Transformed array.
        """
        check_is_fitted(self)

        copy = copy if copy is not None else self.copy
        X = self._validate_data(
            X,
            reset=False,
            accept_sparse="csr",
            copy=copy,
            dtype=FLOAT_DTYPES,
            force_all_finite="allow-nan",
        )

        if sparse.issparse(X):
            if self.with_mean:
                raise ValueError(
                    "Cannot center sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives."
                )
            if self.scale_ is not None:
                inplace_column_scale(X, 1 / self.scale_)
        else:
            if self.with_mean:
                X -= self.mean_
            if self.with_std:
                X /= self.scale_
        return X

    def inverse_transform(self, X, copy=None):
        """Scale back the data to the original representation.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to scale along the features axis.

        copy : bool, default=None
            Copy the input X or not.

        Returns
        -------
        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Transformed array.
        """
        check_is_fitted(self)

        copy = copy if copy is not None else self.copy
        X = check_array(
            X,
            accept_sparse="csr",
            copy=copy,
            dtype=FLOAT_DTYPES,
            force_all_finite="allow-nan",
        )

        if sparse.issparse(X):
            if self.with_mean:
                raise ValueError(
                    "Cannot uncenter sparse matrices: pass `with_mean=False` "
                    "instead. See docstring for motivation and alternatives."
                )
            if self.scale_ is not None:
                inplace_column_scale(X, self.scale_)
        else:
            if self.with_std:
                X *= self.scale_
            if self.with_mean:
                X += self.mean_
        return X

    def _more_tags(self):
        return {"allow_nan": True, "preserves_dtype": [np.float64, np.float32]}
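

# Usage sketch for `StandardScaler`: streaming statistics with
# `partial_fit` (a doctest-style comment; the two batches are made up):
#   >>> import numpy as np
#   >>> from sklearn.preprocessing import StandardScaler
#   >>> scaler = StandardScaler()
#   >>> for batch in (np.array([[0.0], [0.0]]), np.array([[1.0], [1.0]])):
#   ...     scaler = scaler.partial_fit(batch)
#   >>> scaler.mean_, scaler.var_  # same as fitting [0, 0, 1, 1] at once
#   (array([0.5]), array([0.25]))
#   >>> int(scaler.n_samples_seen_)
#   4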


class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Scale each feature by its maximum absolute value.

    This estimator scales and translates each feature individually such
    that the maximal absolute value of each feature in the
    training set will be 1.0. It does not shift/center the data, and
    thus does not destroy any sparsity.

    This scaler can also be applied to sparse CSR or CSC matrices.

    `MaxAbsScaler` doesn't reduce the effect of outliers; it only linearly
    scales them down. For an example visualization, refer to :ref:`Compare
    MaxAbsScaler with other scalers <plot_all_scaling_max_abs_scaler_section>`.

    .. versionadded:: 0.17

    Parameters
    ----------
    copy : bool, default=True
        Set to False to perform inplace scaling and avoid a copy (if the input
        is already a numpy array).

    Attributes
    ----------
    scale_ : ndarray of shape (n_features,)
        Per feature relative scaling of the data.

        .. versionadded:: 0.17
           *scale_* attribute.

    max_abs_ : ndarray of shape (n_features,)
        Per feature maximum absolute value.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_samples_seen_ : int
        The number of samples processed by the estimator. Will be reset on
        new calls to fit, but increments across ``partial_fit`` calls.

    See Also
    --------
    maxabs_scale : Equivalent function without the estimator API.

    Notes
    -----
    NaNs are treated as missing values: disregarded in fit, and maintained in
    transform.

    Examples
    --------
    >>> from sklearn.preprocessing import MaxAbsScaler
    >>> X = [[ 1., -1.,  2.],
    ...      [ 2.,  0.,  0.],
    ...      [ 0.,  1., -1.]]
    >>> transformer = MaxAbsScaler().fit(X)
    >>> transformer
    MaxAbsScaler()
    >>> transformer.transform(X)
    array([[ 0.5, -1. ,  1. ],
           [ 1. ,  0. ,  0. ],
           [ 0. ,  1. , -0.5]])
    """

    _parameter_constraints: dict = {"copy": ["boolean"]}

    def __init__(self, *, copy=True):
        self.copy = copy

    def _reset(self):
        """Reset internal data-dependent state of the scaler, if necessary.

        __init__ parameters are not touched.
        """
        # Checking one attribute is enough, because they are all set together
        # in partial_fit
        if hasattr(self, "scale_"):
            del self.scale_
            del self.n_samples_seen_
            del self.max_abs_

    def fit(self, X, y=None):
        """Compute the maximum absolute value to be used for later scaling.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to compute the per-feature maximum absolute value
            used for later scaling along the features axis.

        y : None
            Ignored.

        Returns
        -------
        self : object
            Fitted scaler.
        """
        # Reset internal state before fitting
        self._reset()
        return self.partial_fit(X, y)

    @_fit_context(prefer_skip_nested_validation=True)
    def partial_fit(self, X, y=None):
        """Online computation of max absolute value of X for later scaling.

        All of X is processed as a single batch. This is intended for cases
        when :meth:`fit` is not feasible due to very large number of
        `n_samples` or because X is read from a continuous stream.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to compute the per-feature maximum absolute value
            used for later scaling along the features axis.

        y : None
            Ignored.

        Returns
        -------
        self : object
            Fitted scaler.
        """
        first_pass = not hasattr(self, "n_samples_seen_")
        X = self._validate_data(
            X,
            reset=first_pass,
            accept_sparse=("csr", "csc"),
            dtype=FLOAT_DTYPES,
            force_all_finite="allow-nan",
        )

        if sparse.issparse(X):
            mins, maxs = min_max_axis(X, axis=0, ignore_nan=True)
            max_abs = np.maximum(np.abs(mins), np.abs(maxs))
        else:
            max_abs = np.nanmax(np.abs(X), axis=0)

        if first_pass:
            self.n_samples_seen_ = X.shape[0]
        else:
            max_abs = np.maximum(self.max_abs_, max_abs)
            self.n_samples_seen_ += X.shape[0]

        self.max_abs_ = max_abs
        self.scale_ = _handle_zeros_in_scale(max_abs, copy=True)
        return self

    def transform(self, X):
        """Scale the data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data that should be scaled.

        Returns
        -------
        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Transformed array.
        """
        check_is_fitted(self)
        X = self._validate_data(
            X,
            accept_sparse=("csr", "csc"),
            copy=self.copy,
            reset=False,
            dtype=FLOAT_DTYPES,
            force_all_finite="allow-nan",
        )

        if sparse.issparse(X):
            inplace_column_scale(X, 1.0 / self.scale_)
        else:
            X /= self.scale_
        return X

    def inverse_transform(self, X):
        """Scale back the data to the original representation.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data that should be transformed back.

        Returns
        -------
        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Transformed array.
        """
        check_is_fitted(self)
        X = check_array(
            X,
            accept_sparse=("csr", "csc"),
            copy=self.copy,
            dtype=FLOAT_DTYPES,
            force_all_finite="allow-nan",
        )

        if sparse.issparse(X):
            inplace_column_scale(X, self.scale_)
        else:
            X *= self.scale_
        return X

    def _more_tags(self):
        return {"allow_nan": True}
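

# Usage sketch for `MaxAbsScaler` on sparse input (a doctest-style
# comment): no centering happens, so sparsity is preserved:
#   >>> from scipy import sparse
#   >>> from sklearn.preprocessing import MaxAbsScaler
#   >>> X = sparse.csr_matrix([[0.0, -2.0], [4.0, 0.0]])
#   >>> Xt = MaxAbsScaler().fit_transform(X)
#   >>> Xt.toarray()
#   array([[ 0., -1.],
#          [ 1.,  0.]])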


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "axis": [Options(Integral, {0, 1})],
    },
    prefer_skip_nested_validation=False,
)
def maxabs_scale(X, *, axis=0, copy=True):
    """Scale each feature to the [-1, 1] range without breaking the sparsity.

    This estimator scales each feature individually such
    that the maximal absolute value of each feature in the
    training set will be 1.0.

    This scaler can also be applied to sparse CSR or CSC matrices.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data.

    axis : {0, 1}, default=0
        Axis used to scale along. If 0, independently scale each feature,
        otherwise (if 1) scale each sample.

    copy : bool, default=True
        Set to False to perform inplace scaling and avoid a copy (if the input
        is already a numpy array).

    Returns
    -------
    X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
        The transformed data.

    .. warning:: Risk of data leak

        Do not use :func:`~sklearn.preprocessing.maxabs_scale` unless you know
        what you are doing. A common mistake is to apply it to the entire data
        *before* splitting into training and test sets. This will bias the
        model evaluation because information would have leaked from the test
        set to the training set.

        In general, we recommend using
        :class:`~sklearn.preprocessing.MaxAbsScaler` within a
        :ref:`Pipeline <pipeline>` in order to prevent most risks of data
        leaking: `pipe = make_pipeline(MaxAbsScaler(), LogisticRegression())`.

    See Also
    --------
    MaxAbsScaler : Performs scaling to the [-1, 1] range using
        the Transformer API (e.g. as part of a preprocessing
        :class:`~sklearn.pipeline.Pipeline`).

    Notes
    -----
    NaNs are treated as missing values: disregarded to compute the statistics,
    and maintained during the data transformation.

    For a comparison of the different scalers, transformers, and normalizers,
    see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
    """
    # Unlike the scaler object, this function allows 1d input.
    # If copy is required, it will be done inside the scaler object.
    X = check_array(
        X,
        accept_sparse=("csr", "csc"),
        copy=False,
        ensure_2d=False,
        dtype=FLOAT_DTYPES,
        force_all_finite="allow-nan",
    )
    original_ndim = X.ndim

    if original_ndim == 1:
        X = X.reshape(X.shape[0], 1)

    s = MaxAbsScaler(copy=copy)
    if axis == 0:
        X = s.fit_transform(X)
    else:
        X = s.fit_transform(X.T).T

    if original_ndim == 1:
        X = X.ravel()

    return X
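
# A minimal usage sketch of the function above (illustrative only, not part
# of the shipped examples): each column is divided by its maximum absolute
# value, so the output lies in [-1, 1] and zero entries are preserved.
#
#   >>> import numpy as np
#   >>> from sklearn.preprocessing import maxabs_scale
#   >>> maxabs_scale(np.array([[2.0, -1.0], [1.0, 2.0]]))
#   array([[ 1. , -0.5],
#          [ 0.5,  1. ]])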


class RobustScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Scale features using statistics that are robust to outliers.

    This Scaler removes the median and scales the data according to
    the quantile range (defaults to IQR: Interquartile Range).
    The IQR is the range between the 1st quartile (25th quantile)
    and the 3rd quartile (75th quantile).

    Centering and scaling happen independently on each feature by
    computing the relevant statistics on the samples in the training
    set. Median and interquartile range are then stored to be used on
    later data using the :meth:`transform` method.

    Standardization of a dataset is a common preprocessing for many machine
    learning estimators. Typically this is done by removing the mean and
    scaling to unit variance. However, outliers can often influence the sample
    mean / variance in a negative way. In such cases, using the median and the
    interquartile range often gives better results. For an example
    visualization and comparison to other scalers, refer to :ref:`Compare
    RobustScaler with other scalers <plot_all_scaling_robust_scaler_section>`.

    .. versionadded:: 0.17

    Read more in the :ref:`User Guide <preprocessing_scaler>`.

    Parameters
    ----------
    with_centering : bool, default=True
        If `True`, center the data before scaling.
        This will cause :meth:`transform` to raise an exception when attempted
        on sparse matrices, because centering them entails building a dense
        matrix which in common use cases is likely to be too large to fit in
        memory.

    with_scaling : bool, default=True
        If `True`, scale the data to interquartile range.

    quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, \
        default=(25.0, 75.0)
        Quantile range used to calculate `scale_`. By default this is equal to
        the IQR, i.e., `q_min` is the first quantile and `q_max` is the third
        quantile.

        .. versionadded:: 0.18

    copy : bool, default=True
        If `False`, try to avoid a copy and do inplace scaling instead.
        This is not guaranteed to always work inplace; e.g. if the data is
        not a NumPy array or scipy.sparse CSR matrix, a copy may still be
        returned.

    unit_variance : bool, default=False
        If `True`, scale data so that normally distributed features have a
        variance of 1. In general, if the difference between the x-values of
        `q_max` and `q_min` for a standard normal distribution is greater
        than 1, the dataset will be scaled down. If less than 1, the dataset
        will be scaled up.

        .. versionadded:: 0.24

    Attributes
    ----------
    center_ : array of floats
        The median value for each feature in the training set.

    scale_ : array of floats
        The (scaled) interquartile range for each feature in the training set.

        .. versionadded:: 0.17
           *scale_* attribute.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    robust_scale : Equivalent function without the estimator API.
    sklearn.decomposition.PCA : Further removes the linear correlation across
        features with 'whiten=True'.

    Notes
    -----
    https://en.wikipedia.org/wiki/Median
    https://en.wikipedia.org/wiki/Interquartile_range

    Examples
    --------
    >>> from sklearn.preprocessing import RobustScaler
    >>> X = [[ 1., -2.,  2.],
    ...      [ -2.,  1.,  3.],
    ...      [ 4.,  1., -2.]]
    >>> transformer = RobustScaler().fit(X)
    >>> transformer
    RobustScaler()
    >>> transformer.transform(X)
    array([[ 0. , -2. ,  0. ],
           [-1. ,  0. ,  0.4],
           [ 1. ,  0. , -1.6]])
    """

    _parameter_constraints: dict = {
        "with_centering": ["boolean"],
        "with_scaling": ["boolean"],
        "quantile_range": [tuple],
        "copy": ["boolean"],
        "unit_variance": ["boolean"],
    }

    def __init__(
        self,
        *,
        with_centering=True,
        with_scaling=True,
        quantile_range=(25.0, 75.0),
        copy=True,
        unit_variance=False,
    ):
        self.with_centering = with_centering
        self.with_scaling = with_scaling
        self.quantile_range = quantile_range
        self.unit_variance = unit_variance
        self.copy = copy

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Compute the median and quantiles to be used for scaling.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to compute the median and quantiles
            used for later scaling along the features axis.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Fitted scaler.
        """
        # at fit, convert sparse matrices to csc for optimized computation of
        # the quantiles
        X = self._validate_data(
            X,
            accept_sparse="csc",
            dtype=FLOAT_DTYPES,
            force_all_finite="allow-nan",
        )

        q_min, q_max = self.quantile_range
        if not 0 <= q_min <= q_max <= 100:
            raise ValueError("Invalid quantile range: %s" % str(self.quantile_range))

        if self.with_centering:
            if sparse.issparse(X):
                raise ValueError(
                    "Cannot center sparse matrices: use `with_centering=False`"
                    " instead. See docstring for motivation and alternatives."
                )
            self.center_ = np.nanmedian(X, axis=0)
        else:
            self.center_ = None

        if self.with_scaling:
            quantiles = []
            for feature_idx in range(X.shape[1]):
                if sparse.issparse(X):
                    column_nnz_data = X.data[
                        X.indptr[feature_idx] : X.indptr[feature_idx + 1]
                    ]
                    column_data = np.zeros(shape=X.shape[0], dtype=X.dtype)
                    column_data[: len(column_nnz_data)] = column_nnz_data
                else:
                    column_data = X[:, feature_idx]

                quantiles.append(np.nanpercentile(column_data, self.quantile_range))

            quantiles = np.transpose(quantiles)

            self.scale_ = quantiles[1] - quantiles[0]
            self.scale_ = _handle_zeros_in_scale(self.scale_, copy=False)
            if self.unit_variance:
                adjust = stats.norm.ppf(q_max / 100.0) - stats.norm.ppf(q_min / 100.0)
                self.scale_ = self.scale_ / adjust
        else:
            self.scale_ = None

        return self

    def transform(self, X):
        """Center and scale the data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to scale along the specified axis.

        Returns
        -------
        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Transformed array.
        """
        check_is_fitted(self)
        X = self._validate_data(
            X,
            accept_sparse=("csr", "csc"),
            copy=self.copy,
            dtype=FLOAT_DTYPES,
            reset=False,
            force_all_finite="allow-nan",
        )

        if sparse.issparse(X):
            if self.with_scaling:
                inplace_column_scale(X, 1.0 / self.scale_)
        else:
            if self.with_centering:
                X -= self.center_
            if self.with_scaling:
                X /= self.scale_
        return X

    def inverse_transform(self, X):
        """Scale back the data to the original representation.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The rescaled data to be transformed back.

        Returns
        -------
        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Transformed array.
        """
        check_is_fitted(self)
        X = check_array(
            X,
            accept_sparse=("csr", "csc"),
            copy=self.copy,
            dtype=FLOAT_DTYPES,
            force_all_finite="allow-nan",
        )

        if sparse.issparse(X):
            if self.with_scaling:
                inplace_column_scale(X, self.scale_)
        else:
            if self.with_scaling:
                X *= self.scale_
            if self.with_centering:
                X += self.center_
        return X

    def _more_tags(self):
        return {"allow_nan": True}
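
# Worked check of the docstring example above (plain arithmetic, shown as an
# illustrative sketch): the fitted statistics are the per-column medians and
# interquartile ranges.
#
#   >>> scaler = RobustScaler().fit([[1., -2., 2.], [-2., 1., 3.], [4., 1., -2.]])
#   >>> scaler.center_
#   array([1., 1., 2.])
#   >>> scaler.scale_
#   array([3. , 1.5, 2.5])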


@validate_params(
    {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]},
    prefer_skip_nested_validation=False,
)
def robust_scale(
    X,
    *,
    axis=0,
    with_centering=True,
    with_scaling=True,
    quantile_range=(25.0, 75.0),
    copy=True,
    unit_variance=False,
):
    """Standardize a dataset along any axis.

    Center to the median and component wise scale
    according to the interquartile range.

    Read more in the :ref:`User Guide <preprocessing_scaler>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data to center and scale.

    axis : int, default=0
        Axis used to compute the medians and IQR along. If 0,
        independently scale each feature, otherwise (if 1) scale
        each sample.

    with_centering : bool, default=True
        If `True`, center the data before scaling.

    with_scaling : bool, default=True
        If `True`, scale the data to the interquartile range.

    quantile_range : tuple (q_min, q_max), 0.0 < q_min < q_max < 100.0, \
        default=(25.0, 75.0)
        Quantile range used to calculate `scale_`. By default this is equal to
        the IQR, i.e., `q_min` is the first quantile and `q_max` is the third
        quantile.

        .. versionadded:: 0.18

    copy : bool, default=True
        Set to `False` to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSR matrix and if axis is 1).

    unit_variance : bool, default=False
        If `True`, scale data so that normally distributed features have a
        variance of 1. In general, if the difference between the x-values of
        `q_max` and `q_min` for a standard normal distribution is greater
        than 1, the dataset will be scaled down. If less than 1, the dataset
        will be scaled up.

        .. versionadded:: 0.24

    Returns
    -------
    X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
        The transformed data.

    See Also
    --------
    RobustScaler : Performs centering and scaling using the Transformer API
        (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).

    Notes
    -----
    This implementation will refuse to center scipy.sparse matrices
    since it would make them non-sparse and would potentially crash the
    program with memory exhaustion problems.

    Instead the caller is expected to either set explicitly
    `with_centering=False` (in that case, only variance scaling will be
    performed on the features of the CSR matrix) or to call `X.toarray()`
    if they expect the materialized dense array to fit in memory.

    To avoid memory copy the caller should pass a CSR matrix.

    For a comparison of the different scalers, transformers, and normalizers,
    see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.

    .. warning:: Risk of data leak

        Do not use :func:`~sklearn.preprocessing.robust_scale` unless you know
        what you are doing. A common mistake is to apply it to the entire data
        *before* splitting into training and test sets. This will bias the
        model evaluation because information would have leaked from the test
        set to the training set.

        In general, we recommend using
        :class:`~sklearn.preprocessing.RobustScaler` within a
        :ref:`Pipeline <pipeline>` in order to prevent most risks of data
        leaking: `pipe = make_pipeline(RobustScaler(), LogisticRegression())`.
    """
    X = check_array(
        X,
        accept_sparse=("csr", "csc"),
        copy=False,
        ensure_2d=False,
        dtype=FLOAT_DTYPES,
        force_all_finite="allow-nan",
    )
    original_ndim = X.ndim

    if original_ndim == 1:
        X = X.reshape(X.shape[0], 1)

    s = RobustScaler(
        with_centering=with_centering,
        with_scaling=with_scaling,
        quantile_range=quantile_range,
        unit_variance=unit_variance,
        copy=copy,
    )
    if axis == 0:
        X = s.fit_transform(X)
    else:
        X = s.fit_transform(X.T).T

    if original_ndim == 1:
        X = X.ravel()

    return X
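
# Minimal usage sketch (illustrative only): the function fits a RobustScaler
# and transforms in one step, reproducing the RobustScaler docstring output.
#
#   >>> from sklearn.preprocessing import robust_scale
#   >>> robust_scale([[1., -2., 2.], [-2., 1., 3.], [4., 1., -2.]])
#   array([[ 0. , -2. ,  0. ],
#          [-1. ,  0. ,  0.4],
#          [ 1. ,  0. , -1.6]])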


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "norm": [StrOptions({"l1", "l2", "max"})],
        "axis": [Options(Integral, {0, 1})],
        "copy": ["boolean"],
        "return_norm": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def normalize(X, norm="l2", *, axis=1, copy=True, return_norm=False):
    """Scale input vectors individually to unit norm (vector length).

    Read more in the :ref:`User Guide <preprocessing_normalization>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data to normalize, element by element.
        scipy.sparse matrices should be in CSR format to avoid an
        unnecessary copy.

    norm : {'l1', 'l2', 'max'}, default='l2'
        The norm to use to normalize each non-zero sample (or each non-zero
        feature if axis is 0).

    axis : {0, 1}, default=1
        Define axis used to normalize the data along. If 1, independently
        normalize each sample, otherwise (if 0) normalize each feature.

    copy : bool, default=True
        Set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSR matrix and if axis is 1).

    return_norm : bool, default=False
        Whether to return the computed norms.

    Returns
    -------
    X : {ndarray, sparse matrix} of shape (n_samples, n_features)
        Normalized input X.

    norms : ndarray of shape (n_samples, ) if axis=1 else (n_features, )
        An array of norms along given axis for X.
        When X is sparse, a NotImplementedError will be raised
        for norm 'l1' or 'l2'.

    See Also
    --------
    Normalizer : Performs normalization using the Transformer API
        (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).

    Notes
    -----
    For a comparison of the different scalers, transformers, and normalizers,
    see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
    """
    if axis == 0:
        sparse_format = "csc"
    else:  # axis == 1:
        sparse_format = "csr"

    X = check_array(
        X,
        accept_sparse=sparse_format,
        copy=copy,
        estimator="the normalize function",
        dtype=FLOAT_DTYPES,
    )
    if axis == 0:
        X = X.T

    if sparse.issparse(X):
        if return_norm and norm in ("l1", "l2"):
            raise NotImplementedError(
                "return_norm=True is not implemented "
                "for sparse matrices with norm 'l1' "
                "or norm 'l2'"
            )
        if norm == "l1":
            inplace_csr_row_normalize_l1(X)
        elif norm == "l2":
            inplace_csr_row_normalize_l2(X)
        elif norm == "max":
            mins, maxes = min_max_axis(X, 1)
            norms = np.maximum(abs(mins), maxes)
            norms_elementwise = norms.repeat(np.diff(X.indptr))
            mask = norms_elementwise != 0
            X.data[mask] /= norms_elementwise[mask]
    else:
        if norm == "l1":
            norms = np.abs(X).sum(axis=1)
        elif norm == "l2":
            norms = row_norms(X)
        elif norm == "max":
            norms = np.max(abs(X), axis=1)
        norms = _handle_zeros_in_scale(norms, copy=False)
        X /= norms[:, np.newaxis]

    if axis == 0:
        X = X.T

    if return_norm:
        return X, norms
    else:
        return X
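
# Minimal usage sketch (illustrative only): each row is divided by its l2
# norm, so a 3-4-5 right-triangle row becomes a unit vector.
#
#   >>> from sklearn.preprocessing import normalize
#   >>> normalize([[3., 4.]])
#   array([[0.6, 0.8]])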


class Normalizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Normalize samples individually to unit norm.

    Each sample (i.e. each row of the data matrix) with at least one
    non-zero component is rescaled independently of other samples so
    that its norm (l1, l2 or inf) equals one.

    This transformer is able to work both with dense numpy arrays and
    scipy.sparse matrices (use CSR format if you want to avoid the burden of
    a copy / conversion).

    Scaling inputs to unit norms is a common operation for text
    classification or clustering. For instance, the dot
    product of two l2-normalized TF-IDF vectors is the cosine similarity
    of the vectors and is the base similarity metric for the Vector
    Space Model commonly used by the Information Retrieval community.

    For an example visualization, refer to :ref:`Compare Normalizer with other
    scalers <plot_all_scaling_normalizer_section>`.

    Read more in the :ref:`User Guide <preprocessing_normalization>`.

    Parameters
    ----------
    norm : {'l1', 'l2', 'max'}, default='l2'
        The norm to use to normalize each non-zero sample. If norm='max'
        is used, values will be rescaled by the maximum of the absolute
        values.

    copy : bool, default=True
        Set to False to perform inplace row normalization and avoid a
        copy (if the input is already a numpy array or a scipy.sparse
        CSR matrix).

    Attributes
    ----------
    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    normalize : Equivalent function without the estimator API.

    Notes
    -----
    This estimator is :term:`stateless` and does not need to be fitted.
    However, we recommend calling :meth:`fit_transform` instead of
    :meth:`transform`, as parameter validation is only performed in
    :meth:`fit`.

    Examples
    --------
    >>> from sklearn.preprocessing import Normalizer
    >>> X = [[4, 1, 2, 2],
    ...      [1, 3, 9, 3],
    ...      [5, 7, 5, 1]]
    >>> transformer = Normalizer().fit(X)  # fit does nothing.
    >>> transformer
    Normalizer()
    >>> transformer.transform(X)
    array([[0.8, 0.2, 0.4, 0.4],
           [0.1, 0.3, 0.9, 0.3],
           [0.5, 0.7, 0.5, 0.1]])
    """

    _parameter_constraints: dict = {
        "norm": [StrOptions({"l1", "l2", "max"})],
        "copy": ["boolean"],
    }

    def __init__(self, norm="l2", *, copy=True):
        self.norm = norm
        self.copy = copy

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Only validates estimator's parameters.

        This method serves two purposes: (i) validate the estimator's
        parameters and (ii) be consistent with the scikit-learn transformer
        API.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data to estimate the normalization parameters.

        y : Ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self : object
            Fitted transformer.
        """
        self._validate_data(X, accept_sparse="csr")
        return self

    def transform(self, X, copy=None):
        """Scale each non-zero row of X to unit norm.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data to normalize, row by row. scipy.sparse matrices should be
            in CSR format to avoid an unnecessary copy.

        copy : bool, default=None
            Copy the input X or not.

        Returns
        -------
        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Transformed array.
        """
        copy = copy if copy is not None else self.copy
        X = self._validate_data(X, accept_sparse="csr", reset=False)
        return normalize(X, norm=self.norm, axis=1, copy=copy)

    def _more_tags(self):
        return {"stateless": True}


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "threshold": [Interval(Real, None, None, closed="neither")],
        "copy": ["boolean"],
    },
    prefer_skip_nested_validation=True,
)
def binarize(X, *, threshold=0.0, copy=True):
    """Boolean thresholding of array-like or scipy.sparse matrix.

    Read more in the :ref:`User Guide <preprocessing_binarization>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The data to binarize, element by element.
        scipy.sparse matrices should be in CSR or CSC format to avoid an
        unnecessary copy.

    threshold : float, default=0.0
        Feature values below or equal to this are replaced by 0, above it by 1.
        Threshold may not be less than 0 for operations on sparse matrices.

    copy : bool, default=True
        Set to False to perform inplace binarization and avoid a copy
        (if the input is already a numpy array or a scipy.sparse CSR / CSC
        matrix).

    Returns
    -------
    X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
        The transformed data.

    See Also
    --------
    Binarizer : Performs binarization using the Transformer API
        (e.g. as part of a preprocessing :class:`~sklearn.pipeline.Pipeline`).
    """
    X = check_array(X, accept_sparse=["csr", "csc"], copy=copy)
    if sparse.issparse(X):
        if threshold < 0:
            raise ValueError("Cannot binarize a sparse matrix with threshold < 0")
        cond = X.data > threshold
        not_cond = np.logical_not(cond)
        X.data[cond] = 1
        X.data[not_cond] = 0
        X.eliminate_zeros()
    else:
        cond = X > threshold
        not_cond = np.logical_not(cond)
        X[cond] = 1
        X[not_cond] = 0
    return X
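
# Minimal usage sketch (illustrative only): values strictly above the
# threshold map to 1, everything else (including the threshold itself) to 0.
#
#   >>> from sklearn.preprocessing import binarize
#   >>> binarize([[1.5, -1., 0.5]], threshold=0.5)
#   array([[1., 0., 0.]])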


class Binarizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Binarize data (set feature values to 0 or 1) according to a threshold.

    Values greater than the threshold map to 1, while values less than
    or equal to the threshold map to 0. With the default threshold of 0,
    only positive values map to 1.

    Binarization is a common operation on text count data where the
    analyst can decide to only consider the presence or absence of a
    feature rather than a quantified number of occurrences for instance.

    It can also be used as a pre-processing step for estimators that
    consider boolean random variables (e.g. modelled using the Bernoulli
    distribution in a Bayesian setting).

    Read more in the :ref:`User Guide <preprocessing_binarization>`.

    Parameters
    ----------
    threshold : float, default=0.0
        Feature values below or equal to this are replaced by 0, above it by 1.
        Threshold may not be less than 0 for operations on sparse matrices.

    copy : bool, default=True
        Set to False to perform inplace binarization and avoid a copy (if
        the input is already a numpy array or a scipy.sparse CSR matrix).

    Attributes
    ----------
    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    binarize : Equivalent function without the estimator API.
    KBinsDiscretizer : Bin continuous data into intervals.
    OneHotEncoder : Encode categorical features as a one-hot numeric array.

    Notes
    -----
    If the input is a sparse matrix, only the non-zero values are subject
    to update by the :class:`Binarizer` class.

    This estimator is :term:`stateless` and does not need to be fitted.
    However, we recommend calling :meth:`fit_transform` instead of
    :meth:`transform`, as parameter validation is only performed in
    :meth:`fit`.

    Examples
    --------
    >>> from sklearn.preprocessing import Binarizer
    >>> X = [[ 1., -1.,  2.],
    ...      [ 2.,  0.,  0.],
    ...      [ 0.,  1., -1.]]
    >>> transformer = Binarizer().fit(X)  # fit does nothing.
    >>> transformer
    Binarizer()
    >>> transformer.transform(X)
    array([[1., 0., 1.],
           [1., 0., 0.],
           [0., 1., 0.]])
    """

    _parameter_constraints: dict = {
        "threshold": [Real],
        "copy": ["boolean"],
    }

    def __init__(self, *, threshold=0.0, copy=True):
        self.threshold = threshold
        self.copy = copy

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Only validates estimator's parameters.

        This method serves two purposes: (i) validate the estimator's
        parameters and (ii) be consistent with the scikit-learn transformer
        API.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data.

        y : None
            Ignored.

        Returns
        -------
        self : object
            Fitted transformer.
        """
        self._validate_data(X, accept_sparse="csr")
        return self

    def transform(self, X, copy=None):
        """Binarize each element of X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data to binarize, element by element.
            scipy.sparse matrices should be in CSR format to avoid an
            unnecessary copy.

        copy : bool, default=None
            Copy the input X or not.

        Returns
        -------
        X_tr : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Transformed array.
        """
        copy = copy if copy is not None else self.copy
        # TODO: This should be refactored because binarize also calls
        # check_array
        X = self._validate_data(X, accept_sparse=["csr", "csc"], copy=copy, reset=False)
        return binarize(X, threshold=self.threshold, copy=False)

    def _more_tags(self):
        return {"stateless": True}


class KernelCenterer(ClassNamePrefixFeaturesOutMixin, TransformerMixin, BaseEstimator):
    r"""Center an arbitrary kernel matrix :math:`K`.

    Let us define a kernel :math:`K` such that:

    .. math::
        K(X, Y) = \phi(X) . \phi(Y)^{T}

    :math:`\phi(X)` is a function mapping of rows of :math:`X` to a
    Hilbert space and :math:`K` is of shape `(n_samples, n_samples)`.

    This class makes it possible to compute :math:`\tilde{K}(X, Y)` such that:

    .. math::
        \tilde{K}(X, Y) = \tilde{\phi}(X) . \tilde{\phi}(Y)^{T}

    :math:`\tilde{\phi}(X)` is the centered mapped data in the Hilbert
    space.

    `KernelCenterer` centers the features without explicitly computing the
    mapping :math:`\phi(\cdot)`. Working with centered kernels is sometimes
    expected when dealing with algebra computations such as eigendecomposition
    for :class:`~sklearn.decomposition.KernelPCA` for instance.

    Read more in the :ref:`User Guide <kernel_centering>`.

    Attributes
    ----------
    K_fit_rows_ : ndarray of shape (n_samples,)
        Average of each column of kernel matrix.

    K_fit_all_ : float
        Average of kernel matrix.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.kernel_approximation.Nystroem : Approximate a kernel map
        using a subset of the training data.

    References
    ----------
    .. [1] `Schölkopf, Bernhard, Alexander Smola, and Klaus-Robert Müller.
       "Nonlinear component analysis as a kernel eigenvalue problem."
       Neural computation 10.5 (1998): 1299-1319.
       <https://www.mlpack.org/papers/kpca.pdf>`_

    Examples
    --------
    >>> from sklearn.preprocessing import KernelCenterer
    >>> from sklearn.metrics.pairwise import pairwise_kernels
    >>> X = [[ 1., -2.,  2.],
    ...      [ -2.,  1.,  3.],
    ...      [ 4.,  1., -2.]]
    >>> K = pairwise_kernels(X, metric='linear')
    >>> K
    array([[  9.,   2.,  -2.],
           [  2.,  14., -13.],
           [ -2., -13.,  21.]])
    >>> transformer = KernelCenterer().fit(K)
    >>> transformer
    KernelCenterer()
    >>> transformer.transform(K)
    array([[  5.,   0.,  -5.],
           [  0.,  14., -14.],
           [ -5., -14.,  19.]])
    """

    def __init__(self):
        # Needed for backported inspect.signature compatibility with PyPy
        pass

    def fit(self, K, y=None):
        """Fit KernelCenterer.

        Parameters
        ----------
        K : ndarray of shape (n_samples, n_samples)
            Kernel matrix.

        y : None
            Ignored.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        K = self._validate_data(K, dtype=FLOAT_DTYPES)

        if K.shape[0] != K.shape[1]:
            raise ValueError(
                "Kernel matrix must be a square matrix."
                " Input is a {}x{} matrix.".format(K.shape[0], K.shape[1])
            )

        n_samples = K.shape[0]
        self.K_fit_rows_ = np.sum(K, axis=0) / n_samples
        self.K_fit_all_ = self.K_fit_rows_.sum() / n_samples
        return self

    def transform(self, K, copy=True):
        """Center kernel matrix.

        Parameters
        ----------
        K : ndarray of shape (n_samples1, n_samples2)
            Kernel matrix.

        copy : bool, default=True
            Set to False to perform inplace computation.

        Returns
        -------
        K_new : ndarray of shape (n_samples1, n_samples2)
            The centered kernel matrix.
        """
        check_is_fitted(self)
        K = self._validate_data(K, copy=copy, dtype=FLOAT_DTYPES, reset=False)

        K_pred_cols = (np.sum(K, axis=1) / self.K_fit_rows_.shape[0])[:, np.newaxis]

        K -= self.K_fit_rows_
        K -= K_pred_cols
        K += self.K_fit_all_

        return K
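
    # On the training kernel, the in-place updates above amount to the usual
    # centering identity K_centered = (I - J) @ K @ (I - J), where J is the
    # constant matrix with all entries equal to 1 / n_samples. A dense NumPy
    # cross-check on the docstring example (illustrative only):
    #
    #   >>> import numpy as np
    #   >>> K = np.array([[9., 2., -2.], [2., 14., -13.], [-2., -13., 21.]])
    #   >>> J = np.full_like(K, 1.0 / K.shape[0])
    #   >>> H = np.eye(K.shape[0]) - J
    #   >>> expected = np.array([[5., 0., -5.], [0., 14., -14.], [-5., -14., 19.]])
    #   >>> bool(np.allclose(H @ K @ H, expected))
    #   True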

    @property
    def _n_features_out(self):
        """Number of transformed output features."""
        # Used by ClassNamePrefixFeaturesOutMixin. This model preserves the
        # number of input features but this is not a one-to-one mapping in the
        # usual sense. Hence the choice not to use OneToOneFeatureMixin to
        # implement get_feature_names_out for this class.
        return self.n_features_in_

    def _more_tags(self):
        return {"pairwise": True}


@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "value": [Interval(Real, None, None, closed="neither")],
    },
    prefer_skip_nested_validation=True,
)
def add_dummy_feature(X, value=1.0):
    """Augment dataset with an additional dummy feature.

    This is useful for fitting an intercept term with implementations which
    cannot otherwise fit it directly.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Data.

    value : float, default=1.0
        Value to use for the dummy feature.

    Returns
    -------
    X : {ndarray, sparse matrix} of shape (n_samples, n_features + 1)
        Same data with dummy feature added as first column.

    Examples
    --------
    >>> from sklearn.preprocessing import add_dummy_feature
    >>> add_dummy_feature([[0, 1], [1, 0]])
    array([[1., 0., 1.],
           [1., 1., 0.]])
    """
    X = check_array(X, accept_sparse=["csc", "csr", "coo"], dtype=FLOAT_DTYPES)
    n_samples, n_features = X.shape
    shape = (n_samples, n_features + 1)
    if sparse.issparse(X):
        if X.format == "coo":
            # Shift columns to the right.
            col = X.col + 1
            # Column indices of dummy feature are 0 everywhere.
            col = np.concatenate((np.zeros(n_samples), col))
            # Row indices of dummy feature are 0, ..., n_samples-1.
            row = np.concatenate((np.arange(n_samples), X.row))
            # Prepend the dummy feature n_samples times.
            data = np.concatenate((np.full(n_samples, value), X.data))
            return sparse.coo_matrix((data, (row, col)), shape)
        elif X.format == "csc":
            # Shift index pointers since we need to add n_samples elements.
            indptr = X.indptr + n_samples
            # indptr[0] must be 0.
            indptr = np.concatenate((np.array([0]), indptr))
            # Row indices of dummy feature are 0, ..., n_samples-1.
            indices = np.concatenate((np.arange(n_samples), X.indices))
            # Prepend the dummy feature n_samples times.
            data = np.concatenate((np.full(n_samples, value), X.data))
            return sparse.csc_matrix((data, indices, indptr), shape)
        else:
            klass = X.__class__
            return klass(add_dummy_feature(X.tocoo(), value))
    else:
        return np.hstack((np.full((n_samples, 1), value), X))
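
# Sparse inputs keep their type: a CSR matrix, for instance, is routed through
# the COO branch above and converted back. An illustrative sketch:
#
#   >>> from scipy import sparse
#   >>> X_sparse = sparse.csr_matrix([[0., 1.], [1., 0.]])
#   >>> add_dummy_feature(X_sparse).toarray()
#   array([[1., 0., 1.],
#          [1., 1., 0.]])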


class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Transform features using quantiles information.

    This method transforms the features to follow a uniform or a normal
    distribution. Therefore, for a given feature, this transformation tends
    to spread out the most frequent values. It also reduces the impact of
    (marginal) outliers: this is therefore a robust preprocessing scheme.

    The transformation is applied on each feature independently. First an
    estimate of the cumulative distribution function of a feature is
    used to map the original values to a uniform distribution. The obtained
    values are then mapped to the desired output distribution using the
    associated quantile function. Feature values of new/unseen data that fall
    below or above the fitted range will be mapped to the bounds of the output
    distribution. Note that this transform is non-linear. It may distort linear
    correlations between variables measured at the same scale but renders
    variables measured at different scales more directly comparable.

    For example visualizations, refer to :ref:`Compare QuantileTransformer with
    other scalers <plot_all_scaling_quantile_transformer_section>`.

    Read more in the :ref:`User Guide <preprocessing_transformer>`.

    .. versionadded:: 0.19

    Parameters
    ----------
    n_quantiles : int, default=1000 or n_samples
        Number of quantiles to be computed. It corresponds to the number
        of landmarks used to discretize the cumulative distribution function.
        If n_quantiles is larger than the number of samples, n_quantiles is set
        to the number of samples as a larger number of quantiles does not give
        a better approximation of the cumulative distribution function
        estimator.

    output_distribution : {'uniform', 'normal'}, default='uniform'
        Marginal distribution for the transformed data. The choices are
        'uniform' (default) or 'normal'.

    ignore_implicit_zeros : bool, default=False
        Only applies to sparse matrices. If True, the sparse entries of the
        matrix are discarded to compute the quantile statistics. If False,
        these entries are treated as zeros.

    subsample : int, default=10_000
        Maximum number of samples used to estimate the quantiles for
        computational efficiency. Note that the subsampling procedure may
        differ for value-identical sparse and dense matrices.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for subsampling and smoothing
        noise.
        Please see ``subsample`` for more details.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    copy : bool, default=True
        Set to False to perform inplace transformation and avoid a copy (if the
        input is already a numpy array).

    Attributes
    ----------
    n_quantiles_ : int
        The actual number of quantiles used to discretize the cumulative
        distribution function.

    quantiles_ : ndarray of shape (n_quantiles, n_features)
        The values corresponding to the quantiles of reference.

    references_ : ndarray of shape (n_quantiles, )
        Quantiles of references.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    quantile_transform : Equivalent function without the estimator API.
    PowerTransformer : Perform mapping to a normal distribution using a power
        transform.
    StandardScaler : Perform standardization that is faster, but less robust
        to outliers.
    RobustScaler : Perform robust standardization that removes the influence
        of outliers but does not put outliers and inliers on the same scale.

    Notes
    -----
    NaNs are treated as missing values: disregarded in fit, and maintained in
    transform.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import QuantileTransformer
    >>> rng = np.random.RandomState(0)
    >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)
    >>> qt = QuantileTransformer(n_quantiles=10, random_state=0)
    >>> qt.fit_transform(X)
    array([...])
    """

    _parameter_constraints: dict = {
        "n_quantiles": [Interval(Integral, 1, None, closed="left")],
        "output_distribution": [StrOptions({"uniform", "normal"})],
        "ignore_implicit_zeros": ["boolean"],
        "subsample": [Interval(Integral, 1, None, closed="left")],
        "random_state": ["random_state"],
        "copy": ["boolean"],
    }

    def __init__(
        self,
        *,
        n_quantiles=1000,
        output_distribution="uniform",
        ignore_implicit_zeros=False,
        subsample=10_000,
        random_state=None,
        copy=True,
    ):
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution
        self.ignore_implicit_zeros = ignore_implicit_zeros
        self.subsample = subsample
        self.random_state = random_state
        self.copy = copy

    def _dense_fit(self, X, random_state):
        """Compute percentiles for dense matrices.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            The data used to scale along the features axis.
        """
        if self.ignore_implicit_zeros:
            warnings.warn(
                "'ignore_implicit_zeros' takes effect only with"
                " sparse matrices. This parameter has no effect."
            )

        n_samples, n_features = X.shape
        references = self.references_ * 100

        self.quantiles_ = []
        for col in X.T:
            if self.subsample < n_samples:
                subsample_idx = random_state.choice(
                    n_samples, size=self.subsample, replace=False
                )
                col = col.take(subsample_idx, mode="clip")
            self.quantiles_.append(np.nanpercentile(col, references))
        self.quantiles_ = np.transpose(self.quantiles_)
        # Due to floating-point precision error in `np.nanpercentile`,
        # make sure that quantiles are monotonically increasing.
        # Upstream issue in numpy:
        # https://github.com/numpy/numpy/issues/14685
        self.quantiles_ = np.maximum.accumulate(self.quantiles_)

    def _sparse_fit(self, X, random_state):
        """Compute percentiles for sparse matrices.

        Parameters
        ----------
        X : sparse matrix of shape (n_samples, n_features)
            The data used to scale along the features axis. The sparse matrix
            needs to be nonnegative. If a sparse matrix is provided,
            it will be converted into a sparse ``csc_matrix``.
        """
        n_samples, n_features = X.shape
        references = self.references_ * 100

        self.quantiles_ = []
        for feature_idx in range(n_features):
            column_nnz_data = X.data[X.indptr[feature_idx] : X.indptr[feature_idx + 1]]
            if len(column_nnz_data) > self.subsample:
                column_subsample = self.subsample * len(column_nnz_data) // n_samples
                if self.ignore_implicit_zeros:
                    column_data = np.zeros(shape=column_subsample, dtype=X.dtype)
                else:
                    column_data = np.zeros(shape=self.subsample, dtype=X.dtype)
                column_data[:column_subsample] = random_state.choice(
                    column_nnz_data, size=column_subsample, replace=False
                )
            else:
                if self.ignore_implicit_zeros:
                    column_data = np.zeros(shape=len(column_nnz_data), dtype=X.dtype)
                else:
                    column_data = np.zeros(shape=n_samples, dtype=X.dtype)
                column_data[: len(column_nnz_data)] = column_nnz_data

            if not column_data.size:
                # if no nnz, an error will be raised for computing the
                # quantiles. Force the quantiles to be zeros.
                self.quantiles_.append([0] * len(references))
            else:
                self.quantiles_.append(np.nanpercentile(column_data, references))
        self.quantiles_ = np.transpose(self.quantiles_)
        # due to floating-point precision error in `np.nanpercentile`,
        # make sure the quantiles are monotonically increasing
        # Upstream issue in numpy:
        # https://github.com/numpy/numpy/issues/14685
        self.quantiles_ = np.maximum.accumulate(self.quantiles_)

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Compute the quantiles used for transforming.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to scale along the features axis. If a sparse
            matrix is provided, it will be converted into a sparse
            ``csc_matrix``. Additionally, the sparse matrix needs to be
            nonnegative if `ignore_implicit_zeros` is False.

        y : None
            Ignored.

        Returns
        -------
        self : object
            Fitted transformer.
        """
        if self.n_quantiles > self.subsample:
            raise ValueError(
                "The number of quantiles cannot be greater than"
                " the number of samples used. Got {} quantiles"
                " and {} samples.".format(self.n_quantiles, self.subsample)
            )

        X = self._check_inputs(X, in_fit=True, copy=False)
        n_samples = X.shape[0]

        if self.n_quantiles > n_samples:
            warnings.warn(
                "n_quantiles (%s) is greater than the total number "
                "of samples (%s). n_quantiles is set to "
                "n_samples." % (self.n_quantiles, n_samples)
            )
        self.n_quantiles_ = max(1, min(self.n_quantiles, n_samples))

        rng = check_random_state(self.random_state)

        # Create the quantiles of reference
        self.references_ = np.linspace(0, 1, self.n_quantiles_, endpoint=True)
        if sparse.issparse(X):
            self._sparse_fit(X, rng)
        else:
            self._dense_fit(X, rng)

        return self

    def _transform_col(self, X_col, quantiles, inverse):
        """Private function to transform a single feature."""
        output_distribution = self.output_distribution

        if not inverse:
            lower_bound_x = quantiles[0]
            upper_bound_x = quantiles[-1]
            lower_bound_y = 0
            upper_bound_y = 1
        else:
            lower_bound_x = 0
            upper_bound_x = 1
            lower_bound_y = quantiles[0]
            upper_bound_y = quantiles[-1]
            # for inverse transform, match a uniform distribution
            with np.errstate(invalid="ignore"):  # hide NaN comparison warnings
                if output_distribution == "normal":
                    X_col = stats.norm.cdf(X_col)
                # else output distribution is already a uniform distribution

        # find index for lower and higher bounds
        with np.errstate(invalid="ignore"):  # hide NaN comparison warnings
            if output_distribution == "normal":
                lower_bounds_idx = X_col - BOUNDS_THRESHOLD < lower_bound_x
                upper_bounds_idx = X_col + BOUNDS_THRESHOLD > upper_bound_x
            if output_distribution == "uniform":
                lower_bounds_idx = X_col == lower_bound_x
                upper_bounds_idx = X_col == upper_bound_x

        isfinite_mask = ~np.isnan(X_col)
        X_col_finite = X_col[isfinite_mask]
        if not inverse:
            # Interpolate in one direction and in the other and take the
            # mean. This is in case of repeated values in the features
            # and hence repeated quantiles
            #
            # If we don't do this, only one extreme of the duplicated is
            # used (the upper when we do ascending, and the
            # lower for descending). We take the mean of these two
            X_col[isfinite_mask] = 0.5 * (
                np.interp(X_col_finite, quantiles, self.references_)
                - np.interp(-X_col_finite, -quantiles[::-1], -self.references_[::-1])
            )
        else:
            X_col[isfinite_mask] = np.interp(X_col_finite, self.references_, quantiles)

        X_col[upper_bounds_idx] = upper_bound_y
        X_col[lower_bounds_idx] = lower_bound_y
        # for forward transform, match the output distribution
        if not inverse:
            with np.errstate(invalid="ignore"):  # hide NaN comparison warnings
                if output_distribution == "normal":
                    X_col = stats.norm.ppf(X_col)
                    # find the value to clip the data to avoid mapping to
                    # infinity. Clip such that the inverse transform will be
                    # consistent
                    clip_min = stats.norm.ppf(BOUNDS_THRESHOLD - np.spacing(1))
                    clip_max = stats.norm.ppf(1 - (BOUNDS_THRESHOLD - np.spacing(1)))
                    X_col = np.clip(X_col, clip_min, clip_max)
                # else output distribution is uniform and the ppf is the
                # identity function so we leave X_col unchanged

        return X_col

    def _check_inputs(self, X, in_fit, accept_sparse_negative=False, copy=False):
        """Check inputs before fit and transform."""
        X = self._validate_data(
            X,
            reset=in_fit,
            accept_sparse="csc",
            copy=copy,
            dtype=FLOAT_DTYPES,
            force_all_finite="allow-nan",
        )
        # Only accept non-negative sparse matrices when ignore_implicit_zeros
        # is False, whether we are called from fit or transform.
        with np.errstate(invalid="ignore"):  # hide NaN comparison warnings
            if (
                not accept_sparse_negative
                and not self.ignore_implicit_zeros
                and (sparse.issparse(X) and np.any(X.data < 0))
            ):
                raise ValueError(
                    "QuantileTransformer only accepts non-negative sparse matrices."
                )

        return X

    def _transform(self, X, inverse=False):
        """Forward and inverse transform.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            The data used to scale along the features axis.

        inverse : bool, default=False
            If False, apply forward transform. If True, apply
            inverse transform.

        Returns
        -------
        X : ndarray of shape (n_samples, n_features)
            Projected data.
        """
        if sparse.issparse(X):
            for feature_idx in range(X.shape[1]):
                column_slice = slice(X.indptr[feature_idx], X.indptr[feature_idx + 1])
                X.data[column_slice] = self._transform_col(
                    X.data[column_slice], self.quantiles_[:, feature_idx], inverse
                )
        else:
            for feature_idx in range(X.shape[1]):
                X[:, feature_idx] = self._transform_col(
                    X[:, feature_idx], self.quantiles_[:, feature_idx], inverse
                )

        return X

    def transform(self, X):
        """Feature-wise transformation of the data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to scale along the features axis. If a sparse
            matrix is provided, it will be converted into a sparse
            ``csc_matrix``. Additionally, the sparse matrix needs to be
            nonnegative if `ignore_implicit_zeros` is False.

        Returns
        -------
        Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
            The projected data.
        """
        check_is_fitted(self)
        X = self._check_inputs(X, in_fit=False, copy=self.copy)

        return self._transform(X, inverse=False)

    def inverse_transform(self, X):
        """Back-projection to the original space.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data used to scale along the features axis. If a sparse
            matrix is provided, it will be converted into a sparse
            ``csc_matrix``. Additionally, the sparse matrix needs to be
            nonnegative if `ignore_implicit_zeros` is False.

        Returns
        -------
        Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
            The projected data.
        """
        check_is_fitted(self)
        X = self._check_inputs(
            X, in_fit=False, accept_sparse_negative=True, copy=self.copy
        )

        return self._transform(X, inverse=True)

    def _more_tags(self):
        return {"allow_nan": True}
  2340. @validate_params(
  2341. {"X": ["array-like", "sparse matrix"], "axis": [Options(Integral, {0, 1})]},
  2342. prefer_skip_nested_validation=False,
  2343. )
  2344. def quantile_transform(
  2345. X,
  2346. *,
  2347. axis=0,
  2348. n_quantiles=1000,
  2349. output_distribution="uniform",
  2350. ignore_implicit_zeros=False,
  2351. subsample=int(1e5),
  2352. random_state=None,
  2353. copy=True,
  2354. ):
  2355. """Transform features using quantiles information.
  2356. This method transforms the features to follow a uniform or a normal
  2357. distribution. Therefore, for a given feature, this transformation tends
  2358. to spread out the most frequent values. It also reduces the impact of
  2359. (marginal) outliers: this is therefore a robust preprocessing scheme.
  2360. The transformation is applied on each feature independently. First an
  2361. estimate of the cumulative distribution function of a feature is
  2362. used to map the original values to a uniform distribution. The obtained
  2363. values are then mapped to the desired output distribution using the
  2364. associated quantile function. Features values of new/unseen data that fall
  2365. below or above the fitted range will be mapped to the bounds of the output
  2366. distribution. Note that this transform is non-linear. It may distort linear
  2367. correlations between variables measured at the same scale but renders
  2368. variables measured at different scales more directly comparable.
  2369. Read more in the :ref:`User Guide <preprocessing_transformer>`.
  2370. Parameters
  2371. ----------
  2372. X : {array-like, sparse matrix} of shape (n_samples, n_features)
  2373. The data to transform.
  2374. axis : int, default=0
  2375. Axis used to compute the means and standard deviations along. If 0,
  2376. transform each feature, otherwise (if 1) transform each sample.
  2377. n_quantiles : int, default=1000 or n_samples
  2378. Number of quantiles to be computed. It corresponds to the number
  2379. of landmarks used to discretize the cumulative distribution function.
  2380. If n_quantiles is larger than the number of samples, n_quantiles is set
  2381. to the number of samples as a larger number of quantiles does not give
  2382. a better approximation of the cumulative distribution function
  2383. estimator.
  2384. output_distribution : {'uniform', 'normal'}, default='uniform'
  2385. Marginal distribution for the transformed data. The choices are
  2386. 'uniform' (default) or 'normal'.
  2387. ignore_implicit_zeros : bool, default=False
  2388. Only applies to sparse matrices. If True, the sparse entries of the
  2389. matrix are discarded to compute the quantile statistics. If False,
  2390. these entries are treated as zeros.
  2391. subsample : int, default=1e5
  2392. Maximum number of samples used to estimate the quantiles for
  2393. computational efficiency. Note that the subsampling procedure may
  2394. differ for value-identical sparse and dense matrices.
  2395. random_state : int, RandomState instance or None, default=None
  2396. Determines random number generation for subsampling and smoothing
  2397. noise.
  2398. Please see ``subsample`` for more details.
  2399. Pass an int for reproducible results across multiple function calls.
  2400. See :term:`Glossary <random_state>`.
  2401. copy : bool, default=True
  2402. Set to False to perform inplace transformation and avoid a copy (if the
  2403. input is already a numpy array). If True, a copy of `X` is transformed,
  2404. leaving the original `X` unchanged.
  2405. .. versionchanged:: 0.23
  2406. The default value of `copy` changed from False to True in 0.23.
  2407. Returns
  2408. -------
  2409. Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
  2410. The transformed data.
  2411. See Also
  2412. --------
  2413. QuantileTransformer : Performs quantile-based scaling using the
  2414. Transformer API (e.g. as part of a preprocessing
  2415. :class:`~sklearn.pipeline.Pipeline`).
  2416. power_transform : Maps data to a normal distribution using a
  2417. power transformation.
  2418. scale : Performs standardization that is faster, but less robust
  2419. to outliers.
  2420. robust_scale : Performs robust standardization that removes the influence
  2421. of outliers but does not put outliers and inliers on the same scale.
  2422. Notes
  2423. -----
  2424. NaNs are treated as missing values: disregarded in fit, and maintained in
  2425. transform.

    .. warning:: Risk of data leak

        Do not use :func:`~sklearn.preprocessing.quantile_transform` unless
        you know what you are doing. A common mistake is to apply it
        to the entire data *before* splitting into training and
        test sets. This will bias the model evaluation because
        information would have leaked from the test set to the
        training set.

        In general, we recommend using
        :class:`~sklearn.preprocessing.QuantileTransformer` within a
        :ref:`Pipeline <pipeline>` in order to prevent most risks of data
        leaking, e.g.: `pipe = make_pipeline(QuantileTransformer(),
        LogisticRegression())` (see also the commented sketch after this
        function).

    For a comparison of the different scalers, transformers, and normalizers,
    see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import quantile_transform
    >>> rng = np.random.RandomState(0)
    >>> X = np.sort(rng.normal(loc=0.5, scale=0.25, size=(25, 1)), axis=0)
    >>> quantile_transform(X, n_quantiles=10, random_state=0, copy=True)
    array([...])
    """
    qt = QuantileTransformer(
        n_quantiles=n_quantiles,
        output_distribution=output_distribution,
        subsample=subsample,
        ignore_implicit_zeros=ignore_implicit_zeros,
        random_state=random_state,
        copy=copy,
    )
    if axis == 0:
        X = qt.fit_transform(X)
    else:  # axis == 1
        X = qt.fit_transform(X.T).T
    return X
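
# A leakage-avoidance sketch for ``quantile_transform``, as referenced in the
# docstring warning above (commented out so the module stays import-safe;
# variable names are illustrative, the APIs are public scikit-learn):
#
#     from sklearn.datasets import make_classification
#     from sklearn.linear_model import LogisticRegression
#     from sklearn.model_selection import train_test_split
#     from sklearn.pipeline import make_pipeline
#
#     X, y = make_classification(random_state=0)
#     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
#     pipe = make_pipeline(QuantileTransformer(n_quantiles=50), LogisticRegression())
#     pipe.fit(X_train, y_train)  # quantiles estimated on the training split only
#     pipe.score(X_test, y_test)  # the test split is transformed, never fitted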


class PowerTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
    """Apply a power transform featurewise to make data more Gaussian-like.

    Power transforms are a family of parametric, monotonic transformations
    that are applied to make data more Gaussian-like. This is useful for
    modeling issues related to heteroscedasticity (non-constant variance),
    or other situations where normality is desired.

    Currently, PowerTransformer supports the Box-Cox transform and the
    Yeo-Johnson transform. The optimal parameter for stabilizing variance and
    minimizing skewness is estimated through maximum likelihood.

    Box-Cox requires input data to be strictly positive, while Yeo-Johnson
    supports both positive and negative data.

    By default, zero-mean, unit-variance normalization is applied to the
    transformed data.

    For an example visualization, refer to :ref:`Compare PowerTransformer with
    other scalers <plot_all_scaling_power_transformer_section>`. To see the
    effect of Box-Cox and Yeo-Johnson transformations on different
    distributions, see:
    :ref:`sphx_glr_auto_examples_preprocessing_plot_map_data_to_normal.py`.

    Read more in the :ref:`User Guide <preprocessing_transformer>`.

    .. versionadded:: 0.20

    Parameters
    ----------
    method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson'
        The power transform method. Available methods are:

        - 'yeo-johnson' [1]_, works with positive and negative values
        - 'box-cox' [2]_, only works with strictly positive values

    standardize : bool, default=True
        Set to True to apply zero-mean, unit-variance normalization to the
        transformed output.

    copy : bool, default=True
        Set to False to perform inplace computation during transformation.

    Attributes
    ----------
    lambdas_ : ndarray of float of shape (n_features,)
        The parameters of the power transformation for the selected features.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    power_transform : Equivalent function without the estimator API.
    QuantileTransformer : Maps data to a standard normal distribution with
        the parameter `output_distribution='normal'`.

    Notes
    -----
    NaNs are treated as missing values: disregarded in ``fit``, and maintained
    in ``transform``.

    References
    ----------
    .. [1] :doi:`I.K. Yeo and R.A. Johnson, "A new family of power
           transformations to improve normality or symmetry." Biometrika,
           87(4), pp.954-959, (2000). <10.1093/biomet/87.4.954>`

    .. [2] :doi:`G.E.P. Box and D.R. Cox, "An Analysis of Transformations",
           Journal of the Royal Statistical Society B, 26, 211-252 (1964).
           <10.1111/j.2517-6161.1964.tb00553.x>`

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import PowerTransformer
    >>> pt = PowerTransformer()
    >>> data = [[1, 2], [3, 2], [4, 5]]
    >>> print(pt.fit(data))
    PowerTransformer()
    >>> print(pt.lambdas_)
    [ 1.386... -3.100...]
    >>> print(pt.transform(data))
    [[-1.316... -0.707...]
     [ 0.209... -0.707...]
     [ 1.106...  1.414...]]
    """

    _parameter_constraints: dict = {
        "method": [StrOptions({"yeo-johnson", "box-cox"})],
        "standardize": ["boolean"],
        "copy": ["boolean"],
    }

    def __init__(self, method="yeo-johnson", *, standardize=True, copy=True):
        self.method = method
        self.standardize = standardize
        self.copy = copy

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Estimate the optimal parameter lambda for each feature.

        The optimal lambda parameter for minimizing skewness is estimated on
        each feature independently using maximum likelihood.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data used to estimate the optimal transformation parameters.

        y : None
            Ignored.

        Returns
        -------
        self : object
            Fitted transformer.
        """
        self._fit(X, y=y, force_transform=False)
        return self

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, X, y=None):
        """Fit `PowerTransformer` to `X`, then transform `X`.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data used to estimate the optimal transformation parameters
            and to be transformed using a power transformation.

        y : Ignored
            Not used, present for API consistency by convention.

        Returns
        -------
        X_new : ndarray of shape (n_samples, n_features)
            Transformed data.
        """
        return self._fit(X, y, force_transform=True)

    def _fit(self, X, y=None, force_transform=False):
        X = self._check_input(X, in_fit=True, check_positive=True)

        if not self.copy and not force_transform:  # if call from fit()
            X = X.copy()  # force copy so that fit does not change X inplace

        n_samples = X.shape[0]
        mean = np.mean(X, axis=0, dtype=np.float64)
        var = np.var(X, axis=0, dtype=np.float64)

        optim_function = {
            "box-cox": self._box_cox_optimize,
            "yeo-johnson": self._yeo_johnson_optimize,
        }[self.method]

        transform_function = {
            "box-cox": boxcox,
            "yeo-johnson": self._yeo_johnson_transform,
        }[self.method]

        with np.errstate(invalid="ignore"):  # hide NaN warnings
            self.lambdas_ = np.empty(X.shape[1], dtype=X.dtype)
            for i, col in enumerate(X.T):
                # For yeo-johnson, leave constant features unchanged:
                # lambda=1 corresponds to the identity transformation.
                is_constant_feature = _is_constant_feature(var[i], mean[i], n_samples)
                if self.method == "yeo-johnson" and is_constant_feature:
                    self.lambdas_[i] = 1.0
                    continue

                self.lambdas_[i] = optim_function(col)

                if self.standardize or force_transform:
                    X[:, i] = transform_function(X[:, i], self.lambdas_[i])

        if self.standardize:
            self._scaler = StandardScaler(copy=False).set_output(transform="default")
            if force_transform:
                X = self._scaler.fit_transform(X)
            else:
                self._scaler.fit(X)

        return X

    def transform(self, X):
        """Apply the power transform to each feature using the fitted lambdas.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The data to be transformed using a power transformation.

        Returns
        -------
        X_trans : ndarray of shape (n_samples, n_features)
            The transformed data.
        """
        check_is_fitted(self)
        X = self._check_input(X, in_fit=False, check_positive=True, check_shape=True)

        transform_function = {
            "box-cox": boxcox,
            "yeo-johnson": self._yeo_johnson_transform,
        }[self.method]
        for i, lmbda in enumerate(self.lambdas_):
            with np.errstate(invalid="ignore"):  # hide NaN warnings
                X[:, i] = transform_function(X[:, i], lmbda)

        if self.standardize:
            X = self._scaler.transform(X)

        return X

    def inverse_transform(self, X):
        """Apply the inverse power transformation using the fitted lambdas.

        The inverse of the Box-Cox transformation is given by::

            if lambda_ == 0:
                X = exp(X_trans)
            else:
                X = (X_trans * lambda_ + 1) ** (1 / lambda_)

        The inverse of the Yeo-Johnson transformation is given by::

            if X >= 0 and lambda_ == 0:
                X = exp(X_trans) - 1
            elif X >= 0 and lambda_ != 0:
                X = (X_trans * lambda_ + 1) ** (1 / lambda_) - 1
            elif X < 0 and lambda_ != 2:
                X = 1 - (-(2 - lambda_) * X_trans + 1) ** (1 / (2 - lambda_))
            elif X < 0 and lambda_ == 2:
                X = 1 - exp(-X_trans)

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The transformed data.

        Returns
        -------
        X : ndarray of shape (n_samples, n_features)
            The original data.
        """
        check_is_fitted(self)
        X = self._check_input(X, in_fit=False, check_shape=True)

        if self.standardize:
            X = self._scaler.inverse_transform(X)

        inv_fun = {
            "box-cox": self._box_cox_inverse_transform,
            "yeo-johnson": self._yeo_johnson_inverse_transform,
        }[self.method]
        for i, lmbda in enumerate(self.lambdas_):
            with np.errstate(invalid="ignore"):  # hide NaN warnings
                X[:, i] = inv_fun(X[:, i], lmbda)

        return X
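
    # A round-trip sketch for the ``fit_transform`` / ``inverse_transform``
    # pair above (commented out to keep the module import-safe; assumes only
    # the public estimator API and NumPy):
    #
    #     import numpy as np
    #     rng = np.random.RandomState(0)
    #     X = rng.lognormal(size=(100, 2))         # strictly positive data
    #     pt = PowerTransformer(method="box-cox")
    #     X_round = pt.inverse_transform(pt.fit_transform(X))
    #     np.allclose(X, X_round)                  # expected: True, up to fp error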

    def _box_cox_inverse_transform(self, x, lmbda):
        """Return inverse-transformed input x following Box-Cox inverse
        transform with parameter lambda.
        """
        if lmbda == 0:
            x_inv = np.exp(x)
        else:
            x_inv = (x * lmbda + 1) ** (1 / lmbda)

        return x_inv

    def _yeo_johnson_inverse_transform(self, x, lmbda):
        """Return inverse-transformed input x following Yeo-Johnson inverse
        transform with parameter lambda.
        """
        x_inv = np.zeros_like(x)
        pos = x >= 0

        # when x >= 0
        if abs(lmbda) < np.spacing(1.0):
            x_inv[pos] = np.exp(x[pos]) - 1
        else:  # lmbda != 0
            x_inv[pos] = np.power(x[pos] * lmbda + 1, 1 / lmbda) - 1

        # when x < 0
        if abs(lmbda - 2) > np.spacing(1.0):
            x_inv[~pos] = 1 - np.power(-(2 - lmbda) * x[~pos] + 1, 1 / (2 - lmbda))
        else:  # lmbda == 2
            x_inv[~pos] = 1 - np.exp(-x[~pos])

        return x_inv

    def _yeo_johnson_transform(self, x, lmbda):
        """Return transformed input x following Yeo-Johnson transform with
        parameter lambda.
        """
        out = np.zeros_like(x)
        pos = x >= 0  # binary mask

        # when x >= 0
        if abs(lmbda) < np.spacing(1.0):
            out[pos] = np.log1p(x[pos])
        else:  # lmbda != 0
            out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda

        # when x < 0
        if abs(lmbda - 2) > np.spacing(1.0):
            out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
        else:  # lmbda == 2
            out[~pos] = -np.log1p(-x[~pos])

        return out
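
    # Sanity note: with lmbda == 1 the map above is the identity, since
    # ((x + 1) ** 1 - 1) / 1 == x for x >= 0 and
    # -(((-x + 1) ** 1) - 1) / 1 == x for x < 0; this is why ``_fit`` assigns
    # lambda = 1 to constant features and skips the optimization.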

    def _box_cox_optimize(self, x):
        """Find and return optimal lambda parameter of the Box-Cox transform by
        MLE, for observed data x.

        Here we use the scipy builtin, which uses the Brent optimizer.
        """
        mask = np.isnan(x)
        if np.all(mask):
            raise ValueError("Column must not be all nan.")

        # The computation of lambda is influenced by NaNs, so we need to get
        # rid of them. With lmbda=None, stats.boxcox returns the transformed
        # data and the MLE of lambda; only the latter is kept here.
        _, lmbda = stats.boxcox(x[~mask], lmbda=None)

        return lmbda

    def _yeo_johnson_optimize(self, x):
        """Find and return optimal lambda parameter of the Yeo-Johnson
        transform by MLE, for observed data x.

        Like for Box-Cox, MLE is done via the Brent optimizer.
        """
        x_tiny = np.finfo(np.float64).tiny

        def _neg_log_likelihood(lmbda):
            """Return the negative log likelihood of the observed data x as a
            function of lambda."""
            x_trans = self._yeo_johnson_transform(x, lmbda)
            n_samples = x.shape[0]
            x_trans_var = x_trans.var()

            # Reject transformed data that would raise a RuntimeWarning in np.log
            if x_trans_var < x_tiny:
                return np.inf

            log_var = np.log(x_trans_var)
            loglike = -n_samples / 2 * log_var
            loglike += (lmbda - 1) * (np.sign(x) * np.log1p(np.abs(x))).sum()

            return -loglike

        # the computation of lambda is influenced by NaNs so we need to
        # get rid of them
        x = x[~np.isnan(x)]
        # choosing bracket -2, 2 like for boxcox
        return optimize.brent(_neg_log_likelihood, brack=(-2, 2))
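
    # Derivation note for ``_neg_log_likelihood`` above (a sketch, following
    # the Yeo-Johnson (2000) likelihood cited in the class docstring): for a
    # Gaussian model of the transformed data, profiling out the mean and
    # variance leaves
    #     loglike(lmbda) = -n/2 * log(Var[psi_lmbda(x)])
    #                      + (lmbda - 1) * sum(sign(x) * log(1 + |x|)),
    # where the second term is the log-Jacobian of the transform; the code
    # returns its negation so Brent minimization maximizes the likelihood.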

    def _check_input(self, X, in_fit, check_positive=False, check_shape=False):
        """Validate the input before fit and transform.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        in_fit : bool
            Whether or not `_check_input` is called from `fit` or other
            methods, e.g. `predict`, `transform`, etc.

        check_positive : bool, default=False
            If True, check that all data is positive and non-zero (only if
            ``self.method == 'box-cox'``).

        check_shape : bool, default=False
            If True, check that n_features matches the length of self.lambdas_.
        """
        X = self._validate_data(
            X,
            ensure_2d=True,
            dtype=FLOAT_DTYPES,
            copy=self.copy,
            force_all_finite="allow-nan",
            reset=in_fit,
        )

        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", r"All-NaN (slice|axis) encountered")
            if check_positive and self.method == "box-cox" and np.nanmin(X) <= 0:
                raise ValueError(
                    "The Box-Cox transformation can only be "
                    "applied to strictly positive data"
                )

        if check_shape and not X.shape[1] == len(self.lambdas_):
            raise ValueError(
                "Input data has a different number of features "
                "than fitting data. Should have {n}, data has {m}".format(
                    n=len(self.lambdas_), m=X.shape[1]
                )
            )

        return X

    def _more_tags(self):
        return {"allow_nan": True}


@validate_params(
    {"X": ["array-like"]},
    prefer_skip_nested_validation=False,
)
def power_transform(X, method="yeo-johnson", *, standardize=True, copy=True):
    """Parametric, monotonic transformation to make data more Gaussian-like.

    Power transforms are a family of parametric, monotonic transformations
    that are applied to make data more Gaussian-like. This is useful for
    modeling issues related to heteroscedasticity (non-constant variance),
    or other situations where normality is desired.

    Currently, power_transform supports the Box-Cox transform and the
    Yeo-Johnson transform. The optimal parameter for stabilizing variance and
    minimizing skewness is estimated through maximum likelihood.

    Box-Cox requires input data to be strictly positive, while Yeo-Johnson
    supports both positive and negative data (a commented sketch contrasting
    the two methods follows this function).

    By default, zero-mean, unit-variance normalization is applied to the
    transformed data.

    Read more in the :ref:`User Guide <preprocessing_transformer>`.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        The data to be transformed using a power transformation.

    method : {'yeo-johnson', 'box-cox'}, default='yeo-johnson'
        The power transform method. Available methods are:

        - 'yeo-johnson' [1]_, works with positive and negative values
        - 'box-cox' [2]_, only works with strictly positive values

        .. versionchanged:: 0.23
            The default value of the `method` parameter changed from
            'box-cox' to 'yeo-johnson' in 0.23.

    standardize : bool, default=True
        Set to True to apply zero-mean, unit-variance normalization to the
        transformed output.

    copy : bool, default=True
        Set to False to perform inplace computation during transformation.

    Returns
    -------
    X_trans : ndarray of shape (n_samples, n_features)
        The transformed data.

    See Also
    --------
    PowerTransformer : Equivalent transformation with the
        Transformer API (e.g. as part of a preprocessing
        :class:`~sklearn.pipeline.Pipeline`).
    quantile_transform : Maps data to a standard normal distribution with
        the parameter `output_distribution='normal'`.

    Notes
    -----
    NaNs are treated as missing values: disregarded in ``fit``, and maintained
    in ``transform``.

    For a comparison of the different scalers, transformers, and normalizers,
    see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.

    References
    ----------
    .. [1] I.K. Yeo and R.A. Johnson, "A new family of power transformations to
           improve normality or symmetry." Biometrika, 87(4), pp.954-959,
           (2000).

    .. [2] G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal
           of the Royal Statistical Society B, 26, 211-252 (1964).

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.preprocessing import power_transform
    >>> data = [[1, 2], [3, 2], [4, 5]]
    >>> print(power_transform(data, method='box-cox'))
    [[-1.332... -0.707...]
     [ 0.256... -0.707...]
     [ 1.076...  1.414...]]

    .. warning:: Risk of data leak.

        Do not use :func:`~sklearn.preprocessing.power_transform` unless you
        know what you are doing. A common mistake is to apply it to the entire
        data *before* splitting into training and test sets. This will bias the
        model evaluation because information would have leaked from the test
        set to the training set.

        In general, we recommend using
        :class:`~sklearn.preprocessing.PowerTransformer` within a
        :ref:`Pipeline <pipeline>` in order to prevent most risks of data
        leaking, e.g.: `pipe = make_pipeline(PowerTransformer(),
        LogisticRegression())`.
    """
    pt = PowerTransformer(method=method, standardize=standardize, copy=copy)
    return pt.fit_transform(X)
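

# A minimal sketch contrasting the two ``power_transform`` methods, as noted
# in the docstring above (commented out to keep the module import-safe; the
# data below is illustrative):
#
#     import numpy as np
#     X = np.array([[-1.0], [0.5], [2.0]])       # contains non-positive values
#     power_transform(X, method="yeo-johnson")   # works with any real values
#     # power_transform(X, method="box-cox")     # would raise ValueError:
#     #                                          # data must be strictly positive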