"""
Ridge regression
"""

# Author: Mathieu Blondel <mathieu@mblondel.org>
#         Reuben Fletcher-Costin <reuben.fletchercostin@gmail.com>
#         Fabian Pedregosa <fabian@fseoane.net>
#         Michael Eickenberg <michael.eickenberg@nsup.org>
# License: BSD 3 clause

import numbers
import warnings
from abc import ABCMeta, abstractmethod
from functools import partial
from numbers import Integral, Real

import numpy as np
from scipy import linalg, optimize, sparse
from scipy.sparse import linalg as sp_linalg

from ..base import MultiOutputMixin, RegressorMixin, _fit_context, is_classifier
from ..exceptions import ConvergenceWarning
from ..metrics import check_scoring, get_scorer_names
from ..model_selection import GridSearchCV
from ..preprocessing import LabelBinarizer
from ..utils import (
    check_array,
    check_consistent_length,
    check_scalar,
    column_or_1d,
    compute_sample_weight,
)
from ..utils._param_validation import Interval, StrOptions
from ..utils.extmath import row_norms, safe_sparse_dot
from ..utils.fixes import _sparse_linalg_cg
from ..utils.sparsefuncs import mean_variance_axis
from ..utils.validation import _check_sample_weight, check_is_fitted
from ._base import LinearClassifierMixin, LinearModel, _preprocess_data, _rescale_data
from ._sag import sag_solver


def _get_rescaled_operator(X, X_offset, sample_weight_sqrt):
    """Create LinearOperator for matrix products with implicit centering.

    Matrix product `LinearOperator @ coef` returns `(X - X_offset) @ coef`.
    """

    def matvec(b):
        return X.dot(b) - sample_weight_sqrt * b.dot(X_offset)

    def rmatvec(b):
        return X.T.dot(b) - X_offset * b.dot(sample_weight_sqrt)

    X1 = sparse.linalg.LinearOperator(shape=X.shape, matvec=matvec, rmatvec=rmatvec)
    return X1
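
# A minimal sketch of what the operator above computes (illustrative names, not
# part of the library API): with unit sample weights it should agree with
# explicit centering of a small dense matrix, e.g.
#
#     X_demo = sparse.csr_matrix(np.arange(12.0).reshape(4, 3))
#     offset = np.asarray(X_demo.mean(axis=0)).ravel()
#     op = _get_rescaled_operator(X_demo, offset, np.ones(4))
#     np.allclose(op @ np.ones(3), (X_demo.toarray() - offset) @ np.ones(3))
#     # expected to evaluate to True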


def _solve_sparse_cg(
    X,
    y,
    alpha,
    max_iter=None,
    tol=1e-4,
    verbose=0,
    X_offset=None,
    X_scale=None,
    sample_weight_sqrt=None,
):
    if sample_weight_sqrt is None:
        sample_weight_sqrt = np.ones(X.shape[0], dtype=X.dtype)

    n_samples, n_features = X.shape

    if X_offset is None or X_scale is None:
        X1 = sp_linalg.aslinearoperator(X)
    else:
        X_offset_scale = X_offset / X_scale
        X1 = _get_rescaled_operator(X, X_offset_scale, sample_weight_sqrt)

    coefs = np.empty((y.shape[1], n_features), dtype=X.dtype)

    if n_features > n_samples:

        def create_mv(curr_alpha):
            def _mv(x):
                return X1.matvec(X1.rmatvec(x)) + curr_alpha * x

            return _mv

    else:

        def create_mv(curr_alpha):
            def _mv(x):
                return X1.rmatvec(X1.matvec(x)) + curr_alpha * x

            return _mv

    for i in range(y.shape[1]):
        y_column = y[:, i]

        mv = create_mv(alpha[i])
        if n_features > n_samples:
            # kernel ridge
            # w = X.T * inv(X X^t + alpha*Id) y
            C = sp_linalg.LinearOperator(
                (n_samples, n_samples), matvec=mv, dtype=X.dtype
            )
            coef, info = _sparse_linalg_cg(C, y_column, rtol=tol)
            coefs[i] = X1.rmatvec(coef)
        else:
            # linear ridge
            # w = inv(X^t X + alpha*Id) * X.T y
            y_column = X1.rmatvec(y_column)
            C = sp_linalg.LinearOperator(
                (n_features, n_features), matvec=mv, dtype=X.dtype
            )
            coefs[i], info = _sparse_linalg_cg(C, y_column, maxiter=max_iter, rtol=tol)

        if info < 0:
            raise ValueError("Failed with error code %d" % info)

        if max_iter is None and info > 0 and verbose:
            warnings.warn(
                "sparse_cg did not converge after %d iterations." % info,
                ConvergenceWarning,
            )

    return coefs


def _solve_lsqr(
    X,
    y,
    *,
    alpha,
    fit_intercept=True,
    max_iter=None,
    tol=1e-4,
    X_offset=None,
    X_scale=None,
    sample_weight_sqrt=None,
):
    """Solve Ridge regression via LSQR.

    We expect that y is always mean centered.
    If X is dense, we expect it to be mean centered such that we can solve
        ||y - Xw||_2^2 + alpha * ||w||_2^2

    If X is sparse, we expect X_offset to be given such that we can solve
        ||y - (X - X_offset)w||_2^2 + alpha * ||w||_2^2

    With sample weights S=diag(sample_weight), this becomes
        ||sqrt(S) (y - (X - X_offset) w)||_2^2 + alpha * ||w||_2^2
    and we expect y and X to already be rescaled, i.e. sqrt(S) @ y, sqrt(S) @ X. In
    this case, X_offset is the sample_weight weighted mean of X before scaling by
    sqrt(S). The objective then reads
        ||y - (X - sqrt(S) X_offset) w||_2^2 + alpha * ||w||_2^2
    """
    if sample_weight_sqrt is None:
        sample_weight_sqrt = np.ones(X.shape[0], dtype=X.dtype)

    if sparse.issparse(X) and fit_intercept:
        X_offset_scale = X_offset / X_scale
        X1 = _get_rescaled_operator(X, X_offset_scale, sample_weight_sqrt)
    else:
        # No need to touch anything
        X1 = X

    n_samples, n_features = X.shape
    coefs = np.empty((y.shape[1], n_features), dtype=X.dtype)
    n_iter = np.empty(y.shape[1], dtype=np.int32)

    # According to the lsqr documentation, alpha = damp^2.
    sqrt_alpha = np.sqrt(alpha)

    for i in range(y.shape[1]):
        y_column = y[:, i]
        info = sp_linalg.lsqr(
            X1, y_column, damp=sqrt_alpha[i], atol=tol, btol=tol, iter_lim=max_iter
        )
        coefs[i] = info[0]
        n_iter[i] = info[2]

    return coefs, n_iter


def _solve_cholesky(X, y, alpha):
    # w = inv(X^t X + alpha*Id) * X.T y
    n_features = X.shape[1]
    n_targets = y.shape[1]

    A = safe_sparse_dot(X.T, X, dense_output=True)
    Xy = safe_sparse_dot(X.T, y, dense_output=True)

    one_alpha = np.array_equal(alpha, len(alpha) * [alpha[0]])

    if one_alpha:
        A.flat[:: n_features + 1] += alpha[0]
        return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
    else:
        coefs = np.empty([n_targets, n_features], dtype=X.dtype)
        for coef, target, current_alpha in zip(coefs, Xy.T, alpha):
            A.flat[:: n_features + 1] += current_alpha
            coef[:] = linalg.solve(A, target, assume_a="pos", overwrite_a=False).ravel()
            A.flat[:: n_features + 1] -= current_alpha
        return coefs
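
# The closed form noted above, w = inv(X^t X + alpha*Id) * X.T y, can be checked
# against a plain normal-equations solve; a minimal dense sketch (illustrative
# names only, not library API):
#
#     X_demo = np.random.RandomState(0).randn(6, 3)
#     y_demo = np.random.RandomState(1).randn(6, 1)
#     w_ref = np.linalg.solve(X_demo.T @ X_demo + 0.5 * np.eye(3), X_demo.T @ y_demo)
#     np.allclose(_solve_cholesky(X_demo, y_demo, alpha=[0.5]).T, w_ref)
#     # expected to evaluate to True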


def _solve_cholesky_kernel(K, y, alpha, sample_weight=None, copy=False):
    # dual_coef = inv(X X^t + alpha*Id) y
    n_samples = K.shape[0]
    n_targets = y.shape[1]

    if copy:
        K = K.copy()

    alpha = np.atleast_1d(alpha)
    one_alpha = (alpha == alpha[0]).all()
    has_sw = isinstance(sample_weight, np.ndarray) or sample_weight not in [1.0, None]

    if has_sw:
        # Unlike other solvers, we need to support sample_weight directly
        # because K might be a pre-computed kernel.
        sw = np.sqrt(np.atleast_1d(sample_weight))
        y = y * sw[:, np.newaxis]
        K *= np.outer(sw, sw)

    if one_alpha:
        # Only one penalty, we can solve the multi-target problem at once.
        K.flat[:: n_samples + 1] += alpha[0]

        try:
            # Note: we must use overwrite_a=False in order to be able to
            # use the fall-back solution below in case a LinAlgError
            # is raised
            dual_coef = linalg.solve(K, y, assume_a="pos", overwrite_a=False)
        except np.linalg.LinAlgError:
            warnings.warn(
                "Singular matrix in solving dual problem. Using "
                "least-squares solution instead."
            )
            dual_coef = linalg.lstsq(K, y)[0]

        # K is expensive to compute and store in memory so change it back in
        # case it was user-given.
        K.flat[:: n_samples + 1] -= alpha[0]

        if has_sw:
            dual_coef *= sw[:, np.newaxis]

        return dual_coef
    else:
        # One penalty per target. We need to solve each target separately.
        dual_coefs = np.empty([n_targets, n_samples], K.dtype)

        for dual_coef, target, current_alpha in zip(dual_coefs, y.T, alpha):
            K.flat[:: n_samples + 1] += current_alpha

            dual_coef[:] = linalg.solve(
                K, target, assume_a="pos", overwrite_a=False
            ).ravel()

            K.flat[:: n_samples + 1] -= current_alpha

        if has_sw:
            dual_coefs *= sw[np.newaxis, :]

        return dual_coefs.T


def _solve_svd(X, y, alpha):
    U, s, Vt = linalg.svd(X, full_matrices=False)
    idx = s > 1e-15  # same default value as scipy.linalg.pinv
    s_nnz = s[idx][:, np.newaxis]
    UTy = np.dot(U.T, y)
    d = np.zeros((s.size, alpha.size), dtype=X.dtype)
    d[idx] = s_nnz / (s_nnz**2 + alpha)
    d_UT_y = d * UTy
    return np.dot(Vt.T, d_UT_y).T
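
# In terms of the thin SVD X = U diag(s) Vt, the solution computed above is
#     w = Vt.T @ diag(s / (s**2 + alpha)) @ U.T @ y,
# with singular values below 1e-15 treated as exactly zero (pseudo-inverse
# convention). A small numerical check (illustrative names only):
#
#     X_demo = np.random.RandomState(0).randn(5, 3)
#     y_demo = np.random.RandomState(1).randn(5, 1)
#     w_svd = _solve_svd(X_demo, y_demo, alpha=np.array([0.3]))
#     w_ref = np.linalg.solve(X_demo.T @ X_demo + 0.3 * np.eye(3), X_demo.T @ y_demo)
#     np.allclose(w_svd, w_ref.T)  # expected to evaluate to True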


def _solve_lbfgs(
    X,
    y,
    alpha,
    positive=True,
    max_iter=None,
    tol=1e-4,
    X_offset=None,
    X_scale=None,
    sample_weight_sqrt=None,
):
    """Solve ridge regression with LBFGS.

    The main purpose is fitting with the coefficients forced to be positive.
    For unconstrained ridge regression, there are faster dedicated solver methods.
    Note that with positive bounds on the coefficients, LBFGS seems faster
    than scipy.optimize.lsq_linear.
    """
    n_samples, n_features = X.shape

    options = {}
    if max_iter is not None:
        options["maxiter"] = max_iter
    config = {
        "method": "L-BFGS-B",
        "tol": tol,
        "jac": True,
        "options": options,
    }
    if positive:
        config["bounds"] = [(0, np.inf)] * n_features

    if X_offset is not None and X_scale is not None:
        X_offset_scale = X_offset / X_scale
    else:
        X_offset_scale = None

    if sample_weight_sqrt is None:
        sample_weight_sqrt = np.ones(X.shape[0], dtype=X.dtype)

    coefs = np.empty((y.shape[1], n_features), dtype=X.dtype)

    for i in range(y.shape[1]):
        x0 = np.zeros((n_features,))
        y_column = y[:, i]

        def func(w):
            residual = X.dot(w) - y_column
            if X_offset_scale is not None:
                residual -= sample_weight_sqrt * w.dot(X_offset_scale)
            f = 0.5 * residual.dot(residual) + 0.5 * alpha[i] * w.dot(w)
            grad = X.T @ residual + alpha[i] * w
            if X_offset_scale is not None:
                grad -= X_offset_scale * residual.dot(sample_weight_sqrt)

            return f, grad

        result = optimize.minimize(func, x0, **config)
        if not result["success"]:
            warnings.warn(
                (
                    "The lbfgs solver did not converge. Try increasing max_iter "
                    f"or tol. Currently: max_iter={max_iter} and tol={tol}"
                ),
                ConvergenceWarning,
            )
        coefs[i] = result["x"]

    return coefs
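
# For each target column, the `optimize.minimize` call above minimizes
#     0.5 * ||X w - y_column||_2^2 + 0.5 * alpha[i] * ||w||_2^2
# subject to w >= 0 when `positive=True`. Since `jac=True`, `func` returns both
# the objective value and its gradient X.T @ (X w - y) + alpha * w, so L-BFGS-B
# uses exact derivatives instead of finite differences.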


def _get_valid_accept_sparse(is_X_sparse, solver):
    if is_X_sparse and solver in ["auto", "sag", "saga"]:
        return "csr"
    else:
        return ["csr", "csc", "coo"]


def ridge_regression(
    X,
    y,
    alpha,
    *,
    sample_weight=None,
    solver="auto",
    max_iter=None,
    tol=1e-4,
    verbose=0,
    positive=False,
    random_state=None,
    return_n_iter=False,
    return_intercept=False,
    check_input=True,
):
    """Solve the ridge equation by the method of normal equations.

    Read more in the :ref:`User Guide <ridge_regression>`.

    Parameters
    ----------
    X : {ndarray, sparse matrix, LinearOperator} of shape \
        (n_samples, n_features)
        Training data.

    y : ndarray of shape (n_samples,) or (n_samples, n_targets)
        Target values.

    alpha : float or array-like of shape (n_targets,)
        Constant that multiplies the L2 term, controlling regularization
        strength. `alpha` must be a non-negative float i.e. in `[0, inf)`.

        When `alpha = 0`, the objective is equivalent to ordinary least
        squares, solved by the :class:`LinearRegression` object. For numerical
        reasons, using `alpha = 0` with the `Ridge` object is not advised.
        Instead, you should use the :class:`LinearRegression` object.

        If an array is passed, penalties are assumed to be specific to the
        targets. Hence they must correspond in number.

    sample_weight : float or array-like of shape (n_samples,), default=None
        Individual weights for each sample. If given a float, every sample
        will have the same weight. If sample_weight is not None and
        solver='auto', the solver will be set to 'cholesky'.

        .. versionadded:: 0.17

    solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \
        'sag', 'saga', 'lbfgs'}, default='auto'
        Solver to use in the computational routines:

        - 'auto' chooses the solver automatically based on the type of data.
        - 'svd' uses a Singular Value Decomposition of X to compute the Ridge
          coefficients. It is the most stable solver, in particular more stable
          for singular matrices than 'cholesky' at the cost of being slower.
        - 'cholesky' uses the standard scipy.linalg.solve function to
          obtain a closed-form solution via a Cholesky decomposition of
          dot(X.T, X).
        - 'sparse_cg' uses the conjugate gradient solver as found in
          scipy.sparse.linalg.cg. As an iterative algorithm, this solver is
          more appropriate than 'cholesky' for large-scale data
          (possibility to set `tol` and `max_iter`).
        - 'lsqr' uses the dedicated regularized least-squares routine
          scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative
          procedure.
        - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses
          its improved, unbiased version named SAGA. Both methods also use an
          iterative procedure, and are often faster than other solvers when
          both n_samples and n_features are large. Note that 'sag' and
          'saga' fast convergence is only guaranteed on features with
          approximately the same scale. You can preprocess the data with a
          scaler from sklearn.preprocessing.
        - 'lbfgs' uses L-BFGS-B algorithm implemented in
          `scipy.optimize.minimize`. It can be used only when `positive`
          is True.

        All solvers except 'svd' support both dense and sparse data. However, only
        'lsqr', 'sag', 'sparse_cg', and 'lbfgs' support sparse input when
        `fit_intercept` is True.

        .. versionadded:: 0.17
           Stochastic Average Gradient descent solver.
        .. versionadded:: 0.19
           SAGA solver.

    max_iter : int, default=None
        Maximum number of iterations for conjugate gradient solver.
        For the 'sparse_cg' and 'lsqr' solvers, the default value is determined
        by scipy.sparse.linalg. For the 'sag' and 'saga' solvers, the default
        value is 1000. For 'lbfgs' solver, the default value is 15000.

    tol : float, default=1e-4
        Precision of the solution. Note that `tol` has no effect for solvers 'svd' and
        'cholesky'.

        .. versionchanged:: 1.2
           Default value changed from 1e-3 to 1e-4 for consistency with other linear
           models.

    verbose : int, default=0
        Verbosity level. Setting verbose > 0 will display additional
        information depending on the solver used.

    positive : bool, default=False
        When set to ``True``, forces the coefficients to be positive.
        Only 'lbfgs' solver is supported in this case.

    random_state : int, RandomState instance, default=None
        Used when ``solver`` == 'sag' or 'saga' to shuffle the data.
        See :term:`Glossary <random_state>` for details.

    return_n_iter : bool, default=False
        If True, the method also returns `n_iter`, the actual number of
        iterations performed by the solver.

        .. versionadded:: 0.17

    return_intercept : bool, default=False
        If True and if X is sparse, the method also returns the intercept,
        and the solver is automatically changed to 'sag'. This is only a
        temporary fix for fitting the intercept with sparse data. For dense
        data, use sklearn.linear_model._preprocess_data before your regression.

        .. versionadded:: 0.17

    check_input : bool, default=True
        If False, the input arrays X and y will not be checked.

        .. versionadded:: 0.21

    Returns
    -------
    coef : ndarray of shape (n_features,) or (n_targets, n_features)
        Weight vector(s).

    n_iter : int, optional
        The actual number of iterations performed by the solver.
        Only returned if `return_n_iter` is True.

    intercept : float or ndarray of shape (n_targets,)
        The intercept of the model. Only returned if `return_intercept`
        is True and if X is a scipy sparse array.

    Notes
    -----
    This function won't compute the intercept.

    Regularization improves the conditioning of the problem and
    reduces the variance of the estimates. Larger values specify stronger
    regularization. Alpha corresponds to ``1 / (2C)`` in other linear
    models such as :class:`~sklearn.linear_model.LogisticRegression` or
    :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are
    assumed to be specific to the targets. Hence they must correspond in
    number.
    """
    return _ridge_regression(
        X,
        y,
        alpha,
        sample_weight=sample_weight,
        solver=solver,
        max_iter=max_iter,
        tol=tol,
        verbose=verbose,
        positive=positive,
        random_state=random_state,
        return_n_iter=return_n_iter,
        return_intercept=return_intercept,
        X_scale=None,
        X_offset=None,
        check_input=check_input,
    )
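
# A minimal usage sketch for the public helper above (array names and values are
# illustrative, not library output):
#
#     rng = np.random.RandomState(0)
#     X_demo, y_demo = rng.randn(20, 4), rng.randn(20)
#     coef = ridge_regression(X_demo, y_demo, alpha=1.0)
#     coef.shape  # (4,) -- one weight per feature; no intercept is computed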


def _ridge_regression(
    X,
    y,
    alpha,
    sample_weight=None,
    solver="auto",
    max_iter=None,
    tol=1e-4,
    verbose=0,
    positive=False,
    random_state=None,
    return_n_iter=False,
    return_intercept=False,
    X_scale=None,
    X_offset=None,
    check_input=True,
    fit_intercept=False,
):
    has_sw = sample_weight is not None

    if solver == "auto":
        if positive:
            solver = "lbfgs"
        elif return_intercept:
            # sag supports fitting intercept directly
            solver = "sag"
        elif not sparse.issparse(X):
            solver = "cholesky"
        else:
            solver = "sparse_cg"

    if solver not in ("sparse_cg", "cholesky", "svd", "lsqr", "sag", "saga", "lbfgs"):
        raise ValueError(
            "Known solvers are 'sparse_cg', 'cholesky', 'svd',"
            " 'lsqr', 'sag', 'saga' or 'lbfgs'. Got %s." % solver
        )

    if positive and solver != "lbfgs":
        raise ValueError(
            "When positive=True, only 'lbfgs' solver can be used. "
            f"Please change solver {solver} to 'lbfgs' "
            "or set positive=False."
        )

    if solver == "lbfgs" and not positive:
        raise ValueError(
            "'lbfgs' solver can be used only when positive=True. "
            "Please use another solver."
        )

    if return_intercept and solver != "sag":
        raise ValueError(
            "In Ridge, only 'sag' solver can directly fit the "
            "intercept. Please change solver to 'sag' or set "
            "return_intercept=False."
        )

    if check_input:
        _dtype = [np.float64, np.float32]
        _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), solver)
        X = check_array(X, accept_sparse=_accept_sparse, dtype=_dtype, order="C")
        y = check_array(y, dtype=X.dtype, ensure_2d=False, order=None)
        check_consistent_length(X, y)

    n_samples, n_features = X.shape

    if y.ndim > 2:
        raise ValueError("Target y has the wrong shape %s" % str(y.shape))

    ravel = False
    if y.ndim == 1:
        y = y.reshape(-1, 1)
        ravel = True

    n_samples_, n_targets = y.shape

    if n_samples != n_samples_:
        raise ValueError(
            "Number of samples in X and y does not correspond: %d != %d"
            % (n_samples, n_samples_)
        )

    if has_sw:
        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        if solver not in ["sag", "saga"]:
            # SAG supports sample_weight directly. For other solvers,
            # we implement sample_weight via a simple rescaling.
            X, y, sample_weight_sqrt = _rescale_data(X, y, sample_weight)

    # Some callers of this method might pass alpha as single
    # element array which already has been validated.
    if alpha is not None and not isinstance(alpha, np.ndarray):
        alpha = check_scalar(
            alpha,
            "alpha",
            target_type=numbers.Real,
            min_val=0.0,
            include_boundaries="left",
        )

    # There should be either 1 or n_targets penalties
    alpha = np.asarray(alpha, dtype=X.dtype).ravel()
    if alpha.size not in [1, n_targets]:
        raise ValueError(
            "Number of targets and number of penalties do not correspond: %d != %d"
            % (alpha.size, n_targets)
        )

    if alpha.size == 1 and n_targets > 1:
        alpha = np.repeat(alpha, n_targets)

    n_iter = None
    if solver == "sparse_cg":
        coef = _solve_sparse_cg(
            X,
            y,
            alpha,
            max_iter=max_iter,
            tol=tol,
            verbose=verbose,
            X_offset=X_offset,
            X_scale=X_scale,
            sample_weight_sqrt=sample_weight_sqrt if has_sw else None,
        )

    elif solver == "lsqr":
        coef, n_iter = _solve_lsqr(
            X,
            y,
            alpha=alpha,
            fit_intercept=fit_intercept,
            max_iter=max_iter,
            tol=tol,
            X_offset=X_offset,
            X_scale=X_scale,
            sample_weight_sqrt=sample_weight_sqrt if has_sw else None,
        )

    elif solver == "cholesky":
        if n_features > n_samples:
            K = safe_sparse_dot(X, X.T, dense_output=True)
            try:
                dual_coef = _solve_cholesky_kernel(K, y, alpha)
                coef = safe_sparse_dot(X.T, dual_coef, dense_output=True).T
            except linalg.LinAlgError:
                # use SVD solver if matrix is singular
                solver = "svd"
        else:
            try:
                coef = _solve_cholesky(X, y, alpha)
            except linalg.LinAlgError:
                # use SVD solver if matrix is singular
                solver = "svd"

    elif solver in ["sag", "saga"]:
        # precompute max_squared_sum for all targets
        max_squared_sum = row_norms(X, squared=True).max()

        coef = np.empty((y.shape[1], n_features), dtype=X.dtype)
        n_iter = np.empty(y.shape[1], dtype=np.int32)
        intercept = np.zeros((y.shape[1],), dtype=X.dtype)
        for i, (alpha_i, target) in enumerate(zip(alpha, y.T)):
            init = {
                "coef": np.zeros((n_features + int(return_intercept), 1), dtype=X.dtype)
            }
            coef_, n_iter_, _ = sag_solver(
                X,
                target.ravel(),
                sample_weight,
                "squared",
                alpha_i,
                0,
                max_iter,
                tol,
                verbose,
                random_state,
                False,
                max_squared_sum,
                init,
                is_saga=solver == "saga",
            )
            if return_intercept:
                coef[i] = coef_[:-1]
                intercept[i] = coef_[-1]
            else:
                coef[i] = coef_
            n_iter[i] = n_iter_

        if intercept.shape[0] == 1:
            intercept = intercept[0]

        coef = np.asarray(coef)

    elif solver == "lbfgs":
        coef = _solve_lbfgs(
            X,
            y,
            alpha,
            positive=positive,
            tol=tol,
            max_iter=max_iter,
            X_offset=X_offset,
            X_scale=X_scale,
            sample_weight_sqrt=sample_weight_sqrt if has_sw else None,
        )

    if solver == "svd":
        if sparse.issparse(X):
            raise TypeError("SVD solver does not support sparse inputs currently")
        coef = _solve_svd(X, y, alpha)

    if ravel:
        # When y was passed as a 1d-array, we flatten the coefficients.
        coef = coef.ravel()

    if return_n_iter and return_intercept:
        return coef, n_iter, intercept
    elif return_intercept:
        return coef, intercept
    elif return_n_iter:
        return coef, n_iter
    else:
        return coef


class _BaseRidge(LinearModel, metaclass=ABCMeta):
    _parameter_constraints: dict = {
        "alpha": [Interval(Real, 0, None, closed="left"), np.ndarray],
        "fit_intercept": ["boolean"],
        "copy_X": ["boolean"],
        "max_iter": [Interval(Integral, 1, None, closed="left"), None],
        "tol": [Interval(Real, 0, None, closed="left")],
        "solver": [
            StrOptions(
                {"auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga", "lbfgs"}
            )
        ],
        "positive": ["boolean"],
        "random_state": ["random_state"],
    }

    @abstractmethod
    def __init__(
        self,
        alpha=1.0,
        *,
        fit_intercept=True,
        copy_X=True,
        max_iter=None,
        tol=1e-4,
        solver="auto",
        positive=False,
        random_state=None,
    ):
        self.alpha = alpha
        self.fit_intercept = fit_intercept
        self.copy_X = copy_X
        self.max_iter = max_iter
        self.tol = tol
        self.solver = solver
        self.positive = positive
        self.random_state = random_state

    def fit(self, X, y, sample_weight=None):
        if self.solver == "lbfgs" and not self.positive:
            raise ValueError(
                "'lbfgs' solver can be used only when positive=True. "
                "Please use another solver."
            )

        if self.positive:
            if self.solver not in ["auto", "lbfgs"]:
                raise ValueError(
                    f"solver='{self.solver}' does not support positive fitting. Please"
                    " set the solver to 'auto' or 'lbfgs', or set `positive=False`"
                )
            else:
                solver = self.solver
        elif sparse.issparse(X) and self.fit_intercept:
            if self.solver not in ["auto", "lbfgs", "lsqr", "sag", "sparse_cg"]:
                raise ValueError(
                    "solver='{}' does not support fitting the intercept "
                    "on sparse data. Please set the solver to 'auto' or "
                    "'lsqr', 'sparse_cg', 'sag', 'lbfgs' "
                    "or set `fit_intercept=False`".format(self.solver)
                )
            if self.solver in ["lsqr", "lbfgs"]:
                solver = self.solver
            elif self.solver == "sag" and self.max_iter is None and self.tol > 1e-4:
                warnings.warn(
                    '"sag" solver requires many iterations to fit '
                    "an intercept with sparse inputs. Either set the "
                    'solver to "auto" or "sparse_cg", or set a low '
                    '"tol" and a high "max_iter" (especially if inputs are '
                    "not standardized)."
                )
                solver = "sag"
            else:
                solver = "sparse_cg"
        else:
            solver = self.solver

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        # when X is sparse we only remove offset from y
        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            self.fit_intercept,
            copy=self.copy_X,
            sample_weight=sample_weight,
        )

        if solver == "sag" and sparse.issparse(X) and self.fit_intercept:
            self.coef_, self.n_iter_, self.intercept_ = _ridge_regression(
                X,
                y,
                alpha=self.alpha,
                sample_weight=sample_weight,
                max_iter=self.max_iter,
                tol=self.tol,
                solver="sag",
                positive=self.positive,
                random_state=self.random_state,
                return_n_iter=True,
                return_intercept=True,
                check_input=False,
            )
            # add the offset which was subtracted by _preprocess_data
            self.intercept_ += y_offset
        else:
            if sparse.issparse(X) and self.fit_intercept:
                # required to fit intercept with sparse_cg and lbfgs solver
                params = {"X_offset": X_offset, "X_scale": X_scale}
            else:
                # for dense matrices or when intercept is set to 0
                params = {}

            self.coef_, self.n_iter_ = _ridge_regression(
                X,
                y,
                alpha=self.alpha,
                sample_weight=sample_weight,
                max_iter=self.max_iter,
                tol=self.tol,
                solver=solver,
                positive=self.positive,
                random_state=self.random_state,
                return_n_iter=True,
                return_intercept=False,
                check_input=False,
                fit_intercept=self.fit_intercept,
                **params,
            )
            self._set_intercept(X_offset, y_offset, X_scale)

        return self


class Ridge(MultiOutputMixin, RegressorMixin, _BaseRidge):
    """Linear least squares with l2 regularization.

    Minimizes the objective function::

        ||y - Xw||^2_2 + alpha * ||w||^2_2

    This model solves a regression model where the loss function is
    the linear least squares function and regularization is given by
    the l2-norm. Also known as Ridge Regression or Tikhonov regularization.
    This estimator has built-in support for multi-variate regression
    (i.e., when y is a 2d-array of shape (n_samples, n_targets)).

    Read more in the :ref:`User Guide <ridge_regression>`.

    Parameters
    ----------
    alpha : {float, ndarray of shape (n_targets,)}, default=1.0
        Constant that multiplies the L2 term, controlling regularization
        strength. `alpha` must be a non-negative float i.e. in `[0, inf)`.

        When `alpha = 0`, the objective is equivalent to ordinary least
        squares, solved by the :class:`LinearRegression` object. For numerical
        reasons, using `alpha = 0` with the `Ridge` object is not advised.
        Instead, you should use the :class:`LinearRegression` object.

        If an array is passed, penalties are assumed to be specific to the
        targets. Hence they must correspond in number.

    fit_intercept : bool, default=True
        Whether to fit the intercept for this model. If set
        to false, no intercept will be used in calculations
        (i.e. ``X`` and ``y`` are expected to be centered).

    copy_X : bool, default=True
        If True, X will be copied; else, it may be overwritten.

    max_iter : int, default=None
        Maximum number of iterations for conjugate gradient solver.
        For 'sparse_cg' and 'lsqr' solvers, the default value is determined
        by scipy.sparse.linalg. For 'sag' solver, the default value is 1000.
        For 'lbfgs' solver, the default value is 15000.

    tol : float, default=1e-4
        The precision of the solution (`coef_`) is determined by `tol` which
        specifies a different convergence criterion for each solver:

        - 'svd': `tol` has no impact.
        - 'cholesky': `tol` has no impact.
        - 'sparse_cg': norm of residuals smaller than `tol`.
        - 'lsqr': `tol` is set as atol and btol of scipy.sparse.linalg.lsqr,
          which control the norm of the residual vector in terms of the norms of
          matrix and coefficients.
        - 'sag' and 'saga': relative change of coef smaller than `tol`.
        - 'lbfgs': maximum of the absolute (projected) gradient=max|residuals|
          smaller than `tol`.

        .. versionchanged:: 1.2
           Default value changed from 1e-3 to 1e-4 for consistency with other linear
           models.

    solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \
        'sag', 'saga', 'lbfgs'}, default='auto'
        Solver to use in the computational routines:

        - 'auto' chooses the solver automatically based on the type of data.
        - 'svd' uses a Singular Value Decomposition of X to compute the Ridge
          coefficients. It is the most stable solver, in particular more stable
          for singular matrices than 'cholesky' at the cost of being slower.
        - 'cholesky' uses the standard scipy.linalg.solve function to
          obtain a closed-form solution.
        - 'sparse_cg' uses the conjugate gradient solver as found in
          scipy.sparse.linalg.cg. As an iterative algorithm, this solver is
          more appropriate than 'cholesky' for large-scale data
          (possibility to set `tol` and `max_iter`).
        - 'lsqr' uses the dedicated regularized least-squares routine
          scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative
          procedure.
        - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses
          its improved, unbiased version named SAGA. Both methods also use an
          iterative procedure, and are often faster than other solvers when
          both n_samples and n_features are large. Note that 'sag' and
          'saga' fast convergence is only guaranteed on features with
          approximately the same scale. You can preprocess the data with a
          scaler from sklearn.preprocessing.
        - 'lbfgs' uses L-BFGS-B algorithm implemented in
          `scipy.optimize.minimize`. It can be used only when `positive`
          is True.

        All solvers except 'svd' support both dense and sparse data. However, only
        'lsqr', 'sag', 'sparse_cg', and 'lbfgs' support sparse input when
        `fit_intercept` is True.

        .. versionadded:: 0.17
           Stochastic Average Gradient descent solver.
        .. versionadded:: 0.19
           SAGA solver.

    positive : bool, default=False
        When set to ``True``, forces the coefficients to be positive.
        Only 'lbfgs' solver is supported in this case.

    random_state : int, RandomState instance, default=None
        Used when ``solver`` == 'sag' or 'saga' to shuffle the data.
        See :term:`Glossary <random_state>` for details.

        .. versionadded:: 0.17
           `random_state` to support Stochastic Average Gradient.

    Attributes
    ----------
    coef_ : ndarray of shape (n_features,) or (n_targets, n_features)
        Weight vector(s).

    intercept_ : float or ndarray of shape (n_targets,)
        Independent term in decision function. Set to 0.0 if
        ``fit_intercept = False``.

    n_iter_ : None or ndarray of shape (n_targets,)
        Actual number of iterations for each target. Available only for
        sag and lsqr solvers. Other solvers will return None.

        .. versionadded:: 0.17

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    RidgeClassifier : Ridge classifier.
    RidgeCV : Ridge regression with built-in cross validation.
    :class:`~sklearn.kernel_ridge.KernelRidge` : Kernel ridge regression
        combines ridge regression with the kernel trick.

    Notes
    -----
    Regularization improves the conditioning of the problem and
    reduces the variance of the estimates. Larger values specify stronger
    regularization. Alpha corresponds to ``1 / (2C)`` in other linear
    models such as :class:`~sklearn.linear_model.LogisticRegression` or
    :class:`~sklearn.svm.LinearSVC`.

    Examples
    --------
    >>> from sklearn.linear_model import Ridge
    >>> import numpy as np
    >>> n_samples, n_features = 10, 5
    >>> rng = np.random.RandomState(0)
    >>> y = rng.randn(n_samples)
    >>> X = rng.randn(n_samples, n_features)
    >>> clf = Ridge(alpha=1.0)
    >>> clf.fit(X, y)
    Ridge()
    """

    def __init__(
        self,
        alpha=1.0,
        *,
        fit_intercept=True,
        copy_X=True,
        max_iter=None,
        tol=1e-4,
        solver="auto",
        positive=False,
        random_state=None,
    ):
        super().__init__(
            alpha=alpha,
            fit_intercept=fit_intercept,
            copy_X=copy_X,
            max_iter=max_iter,
            tol=tol,
            solver=solver,
            positive=positive,
            random_state=random_state,
        )

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """Fit Ridge regression model.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : ndarray of shape (n_samples,) or (n_samples, n_targets)
            Target values.

        sample_weight : float or ndarray of shape (n_samples,), default=None
            Individual weights for each sample. If given a float, every sample
            will have the same weight.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        _accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), self.solver)
        X, y = self._validate_data(
            X,
            y,
            accept_sparse=_accept_sparse,
            dtype=[np.float64, np.float32],
            multi_output=True,
            y_numeric=True,
        )
        return super().fit(X, y, sample_weight=sample_weight)


class _RidgeClassifierMixin(LinearClassifierMixin):
    def _prepare_data(self, X, y, sample_weight, solver):
        """Validate `X` and `y` and binarize `y`.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : ndarray of shape (n_samples,)
            Target values.

        sample_weight : float or ndarray of shape (n_samples,), default=None
            Individual weights for each sample. If given a float, every sample
            will have the same weight.

        solver : str
            The solver used in `Ridge` to know which sparse format to support.

        Returns
        -------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Validated training data.

        y : ndarray of shape (n_samples,)
            Validated target values.

        sample_weight : ndarray of shape (n_samples,)
            Validated sample weights.

        Y : ndarray of shape (n_samples, n_classes)
            The binarized version of `y`.
        """
        accept_sparse = _get_valid_accept_sparse(sparse.issparse(X), solver)
        X, y = self._validate_data(
            X,
            y,
            accept_sparse=accept_sparse,
            multi_output=True,
            y_numeric=False,
        )

        self._label_binarizer = LabelBinarizer(pos_label=1, neg_label=-1)
        Y = self._label_binarizer.fit_transform(y)
        if not self._label_binarizer.y_type_.startswith("multilabel"):
            y = column_or_1d(y, warn=True)

        sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
        if self.class_weight:
            sample_weight = sample_weight * compute_sample_weight(self.class_weight, y)
        return X, y, sample_weight, Y

    def predict(self, X):
        """Predict class labels for samples in `X`.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            The data matrix for which we want to predict the targets.

        Returns
        -------
        y_pred : ndarray of shape (n_samples,) or (n_samples, n_outputs)
            Vector or matrix containing the predictions. In binary and
            multiclass problems, this is a vector containing `n_samples`. In
            a multilabel problem, it returns a matrix of shape
            `(n_samples, n_outputs)`.
        """
        check_is_fitted(self, attributes=["_label_binarizer"])
        if self._label_binarizer.y_type_.startswith("multilabel"):
            # Threshold such that the negative label is -1 and positive label
            # is 1 to use the inverse transform of the label binarizer fitted
            # during fit.
            scores = 2 * (self.decision_function(X) > 0) - 1
            return self._label_binarizer.inverse_transform(scores)
        return super().predict(X)

    @property
    def classes_(self):
        """Classes labels."""
        return self._label_binarizer.classes_

    def _more_tags(self):
        return {"multilabel": True}


class RidgeClassifier(_RidgeClassifierMixin, _BaseRidge):
    """Classifier using Ridge regression.

    This classifier first converts the target values into ``{-1, 1}`` and
    then treats the problem as a regression task (multi-output regression in
    the multiclass case).

    Read more in the :ref:`User Guide <ridge_regression>`.

    Parameters
    ----------
    alpha : float, default=1.0
        Regularization strength; must be a positive float. Regularization
        improves the conditioning of the problem and reduces the variance of
        the estimates. Larger values specify stronger regularization.
        Alpha corresponds to ``1 / (2C)`` in other linear models such as
        :class:`~sklearn.linear_model.LogisticRegression` or
        :class:`~sklearn.svm.LinearSVC`.

    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model. If set to false, no
        intercept will be used in calculations (e.g. data is expected to be
        already centered).

    copy_X : bool, default=True
        If True, X will be copied; else, it may be overwritten.

    max_iter : int, default=None
        Maximum number of iterations for conjugate gradient solver.
        The default value is determined by scipy.sparse.linalg.

    tol : float, default=1e-4
        The precision of the solution (`coef_`) is determined by `tol` which
        specifies a different convergence criterion for each solver:

        - 'svd': `tol` has no impact.
        - 'cholesky': `tol` has no impact.
        - 'sparse_cg': norm of residuals smaller than `tol`.
        - 'lsqr': `tol` is set as atol and btol of scipy.sparse.linalg.lsqr,
          which control the norm of the residual vector in terms of the norms of
          matrix and coefficients.
        - 'sag' and 'saga': relative change of coef smaller than `tol`.
        - 'lbfgs': maximum of the absolute (projected) gradient=max|residuals|
          smaller than `tol`.

        .. versionchanged:: 1.2
           Default value changed from 1e-3 to 1e-4 for consistency with other linear
           models.

    class_weight : dict or 'balanced', default=None
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one.

        The "balanced" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``.

    solver : {'auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', \
        'sag', 'saga', 'lbfgs'}, default='auto'
        Solver to use in the computational routines:

        - 'auto' chooses the solver automatically based on the type of data.
        - 'svd' uses a Singular Value Decomposition of X to compute the Ridge
          coefficients. It is the most stable solver, in particular more stable
          for singular matrices than 'cholesky' at the cost of being slower.
        - 'cholesky' uses the standard scipy.linalg.solve function to
          obtain a closed-form solution.
        - 'sparse_cg' uses the conjugate gradient solver as found in
          scipy.sparse.linalg.cg. As an iterative algorithm, this solver is
          more appropriate than 'cholesky' for large-scale data
          (possibility to set `tol` and `max_iter`).
        - 'lsqr' uses the dedicated regularized least-squares routine
          scipy.sparse.linalg.lsqr. It is the fastest and uses an iterative
          procedure.
        - 'sag' uses a Stochastic Average Gradient descent, and 'saga' uses
          its unbiased and more flexible version named SAGA. Both methods
          use an iterative procedure, and are often faster than other solvers
          when both n_samples and n_features are large. Note that 'sag' and
          'saga' fast convergence is only guaranteed on features with
          approximately the same scale. You can preprocess the data with a
          scaler from sklearn.preprocessing.

          .. versionadded:: 0.17
             Stochastic Average Gradient descent solver.
          .. versionadded:: 0.19
             SAGA solver.

        - 'lbfgs' uses L-BFGS-B algorithm implemented in
          `scipy.optimize.minimize`. It can be used only when `positive`
          is True.

    positive : bool, default=False
        When set to ``True``, forces the coefficients to be positive.
        Only 'lbfgs' solver is supported in this case.

    random_state : int, RandomState instance, default=None
        Used when ``solver`` == 'sag' or 'saga' to shuffle the data.
        See :term:`Glossary <random_state>` for details.

    Attributes
    ----------
    coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)
        Coefficient of the features in the decision function.

        ``coef_`` is of shape (1, n_features) when the given problem is binary.

    intercept_ : float or ndarray of shape (n_targets,)
        Independent term in decision function. Set to 0.0 if
        ``fit_intercept = False``.

    n_iter_ : None or ndarray of shape (n_targets,)
        Actual number of iterations for each target. Available only for
        sag and lsqr solvers. Other solvers will return None.

    classes_ : ndarray of shape (n_classes,)
        The classes labels.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Ridge : Ridge regression.
    RidgeClassifierCV : Ridge classifier with built-in cross validation.

    Notes
    -----
    For multi-class classification, n_class classifiers are trained in
    a one-versus-all approach. Concretely, this is implemented by taking
    advantage of the multi-variate response support in Ridge.

    Examples
    --------
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.linear_model import RidgeClassifier
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> clf = RidgeClassifier().fit(X, y)
    >>> clf.score(X, y)
    0.9595...
    """

    _parameter_constraints: dict = {
        **_BaseRidge._parameter_constraints,
        "class_weight": [dict, StrOptions({"balanced"}), None],
    }

    def __init__(
        self,
        alpha=1.0,
        *,
        fit_intercept=True,
        copy_X=True,
        max_iter=None,
        tol=1e-4,
        class_weight=None,
        solver="auto",
        positive=False,
        random_state=None,
    ):
        super().__init__(
            alpha=alpha,
            fit_intercept=fit_intercept,
            copy_X=copy_X,
            max_iter=max_iter,
            tol=tol,
            solver=solver,
            positive=positive,
            random_state=random_state,
        )
        self.class_weight = class_weight

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """Fit Ridge classifier model.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Training data.

        y : ndarray of shape (n_samples,)
            Target values.

        sample_weight : float or ndarray of shape (n_samples,), default=None
            Individual weights for each sample. If given a float, every sample
            will have the same weight.

            .. versionadded:: 0.17
               *sample_weight* support to RidgeClassifier.

        Returns
        -------
        self : object
            Instance of the estimator.
        """
        X, y, sample_weight, Y = self._prepare_data(X, y, sample_weight, self.solver)

        super().fit(X, Y, sample_weight=sample_weight)
        return self


def _check_gcv_mode(X, gcv_mode):
    if gcv_mode in ["eigen", "svd"]:
        return gcv_mode
    # if X has more rows than columns, use decomposition of X^T.X,
    # otherwise X.X^T
    if X.shape[0] > X.shape[1]:
        return "svd"
    return "eigen"


def _find_smallest_angle(query, vectors):
    """Find the column of vectors that is most aligned with the query.

    Both query and the columns of vectors must have their l2 norm equal to 1.

    Parameters
    ----------
    query : ndarray of shape (n_samples,)
        Normalized query vector.

    vectors : ndarray of shape (n_samples, n_features)
        Vectors to which we compare query, as columns. Must be normalized.
    """
    abs_cosine = np.abs(query.dot(vectors))
    index = np.argmax(abs_cosine)
    return index
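

# Illustrative sketch (not part of scikit-learn): with unit-norm inputs the
# absolute dot product is |cos(angle)|, so the returned index is the column
# closest to the query up to sign. The helper below is hypothetical.
def _demo_find_smallest_angle():  # hypothetical helper, illustration only
    import numpy as np

    query = np.array([1.0, 0.0, 0.0])
    vectors = np.array(
        [
            [0.0, 1.0 / np.sqrt(2)],
            [1.0, 1.0 / np.sqrt(2)],
            [0.0, 0.0],
        ]
    )  # two unit-norm columns
    assert _find_smallest_angle(query, vectors) == 1  # second column is closer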


class _X_CenterStackOp(sparse.linalg.LinearOperator):
    """Behaves as centered and scaled X with an added intercept column.

    This operator behaves as
    np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]])
    """

    def __init__(self, X, X_mean, sqrt_sw):
        n_samples, n_features = X.shape
        super().__init__(X.dtype, (n_samples, n_features + 1))
        self.X = X
        self.X_mean = X_mean
        self.sqrt_sw = sqrt_sw

    def _matvec(self, v):
        v = v.ravel()
        return (
            safe_sparse_dot(self.X, v[:-1], dense_output=True)
            - self.sqrt_sw * self.X_mean.dot(v[:-1])
            + v[-1] * self.sqrt_sw
        )

    def _matmat(self, v):
        return (
            safe_sparse_dot(self.X, v[:-1], dense_output=True)
            - self.sqrt_sw[:, None] * self.X_mean.dot(v[:-1])
            + v[-1] * self.sqrt_sw[:, None]
        )

    def _transpose(self):
        return _XT_CenterStackOp(self.X, self.X_mean, self.sqrt_sw)
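

# Illustrative sketch (not part of scikit-learn): the operator above should act
# like the dense matrix np.hstack([X - sqrt_sw[:, None] * X_mean,
# sqrt_sw[:, None]]) without ever materializing the centered X. The hypothetical
# helper below builds both and compares a matrix-vector product on toy data.
def _demo_x_center_stack_op():  # hypothetical helper, illustration only
    import numpy as np
    from scipy import sparse as sp

    rng = np.random.RandomState(0)
    X_dense = rng.rand(6, 3)
    sqrt_sw = np.ones(6)
    X_mean = X_dense.mean(axis=0)
    op = _X_CenterStackOp(sp.csr_matrix(X_dense), X_mean, sqrt_sw)
    explicit = np.hstack([X_dense - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]])
    v = rng.rand(4)
    assert np.allclose(op.dot(v), explicit.dot(v))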


class _XT_CenterStackOp(sparse.linalg.LinearOperator):
    """Behaves as transposed centered and scaled X with an intercept column.

    This operator behaves as
    np.hstack([X - sqrt_sw[:, None] * X_mean, sqrt_sw[:, None]]).T
    """

    def __init__(self, X, X_mean, sqrt_sw):
        n_samples, n_features = X.shape
        super().__init__(X.dtype, (n_features + 1, n_samples))
        self.X = X
        self.X_mean = X_mean
        self.sqrt_sw = sqrt_sw

    def _matvec(self, v):
        v = v.ravel()
        n_features = self.shape[0]
        res = np.empty(n_features, dtype=self.X.dtype)
        res[:-1] = safe_sparse_dot(self.X.T, v, dense_output=True) - (
            self.X_mean * self.sqrt_sw.dot(v)
        )
        res[-1] = np.dot(v, self.sqrt_sw)
        return res

    def _matmat(self, v):
        n_features = self.shape[0]
        res = np.empty((n_features, v.shape[1]), dtype=self.X.dtype)
        res[:-1] = safe_sparse_dot(self.X.T, v, dense_output=True) - self.X_mean[
            :, None
        ] * self.sqrt_sw.dot(v)
        res[-1] = np.dot(self.sqrt_sw, v)
        return res


class _IdentityRegressor:
    """Fake regressor which will directly output the prediction."""

    def decision_function(self, y_predict):
        return y_predict

    def predict(self, y_predict):
        return y_predict


class _IdentityClassifier(LinearClassifierMixin):
    """Fake classifier which will directly output the prediction.

    We inherit from LinearClassifierMixin to get the proper shape for the
    output `y`.
    """

    def __init__(self, classes):
        self.classes_ = classes

    def decision_function(self, y_predict):
        return y_predict
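

# Illustrative sketch (not part of scikit-learn): these identity estimators let
# a standard scorer be applied to predictions that were already computed (the
# leave-one-out predictions inside _RidgeGCV), because scorers expect an
# estimator plus raw X. The helper below is hypothetical.
def _demo_identity_scorer():  # hypothetical helper, illustration only
    import numpy as np
    from sklearn.metrics import get_scorer

    y_true = np.array([1.0, 2.0, 3.0])
    y_pred = np.array([1.1, 1.9, 3.2])
    scorer = get_scorer("r2")
    # the "X" passed to the scorer is the precomputed prediction itself
    score = scorer(_IdentityRegressor(), y_pred, y_true)
    expected = 1 - ((y_true - y_pred) ** 2).sum() / ((y_true - y_true.mean()) ** 2).sum()
    assert np.isclose(score, expected)
    return score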


class _RidgeGCV(LinearModel):
    """Ridge regression with built-in Leave-one-out Cross-Validation.

    This class is not intended to be used directly. Use RidgeCV instead.

    Notes
    -----

    We want to solve (K + alpha*Id)c = y,
    where K = X X^T is the kernel matrix.

    Let G = (K + alpha*Id).

    Dual solution: c = G^-1y
    Primal solution: w = X^T c

    Compute eigendecomposition K = Q V Q^T.
    Then G^-1 = Q (V + alpha*Id)^-1 Q^T,
    where (V + alpha*Id) is diagonal.
    It is thus inexpensive to invert for many alphas.

    Let loov be the vector of prediction values for each example
    when the model was fitted with all examples but this example.

    loov = (KG^-1Y - diag(KG^-1)Y) / diag(I-KG^-1)

    Let looe be the vector of prediction errors for each example
    when the model was fitted with all examples but this example.

    looe = y - loov = c / diag(G^-1)

    The best score (negative mean squared error or user-provided scoring) is
    stored in the `best_score_` attribute, and the selected hyperparameter in
    `alpha_`.

    References
    ----------
    http://cbcl.mit.edu/publications/ps/MIT-CSAIL-TR-2007-025.pdf
    https://www.mit.edu/~9.520/spring07/Classes/rlsslides.pdf
    """

    def __init__(
        self,
        alphas=(0.1, 1.0, 10.0),
        *,
        fit_intercept=True,
        scoring=None,
        copy_X=True,
        gcv_mode=None,
        store_cv_values=False,
        is_clf=False,
        alpha_per_target=False,
    ):
        self.alphas = alphas
        self.fit_intercept = fit_intercept
        self.scoring = scoring
        self.copy_X = copy_X
        self.gcv_mode = gcv_mode
        self.store_cv_values = store_cv_values
        self.is_clf = is_clf
        self.alpha_per_target = alpha_per_target

    @staticmethod
    def _decomp_diag(v_prime, Q):
        # compute diagonal of the matrix: dot(Q, dot(diag(v_prime), Q^T))
        return (v_prime * Q**2).sum(axis=-1)

    @staticmethod
    def _diag_dot(D, B):
        # compute dot(diag(D), B)
        if len(B.shape) > 1:
            # handle case where B is > 1-d
            D = D[(slice(None),) + (np.newaxis,) * (len(B.shape) - 1)]
        return D * B

    def _compute_gram(self, X, sqrt_sw):
        """Computes the Gram matrix XX^T with possible centering.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            The preprocessed design matrix.

        sqrt_sw : ndarray of shape (n_samples,)
            square roots of sample weights

        Returns
        -------
        gram : ndarray of shape (n_samples, n_samples)
            The Gram matrix.
        X_mean : ndarray of shape (n_features,)
            The weighted mean of ``X`` for each feature.

        Notes
        -----
        When X is dense the centering has been done in preprocessing
        so the mean is 0 and we just compute XX^T.

        When X is sparse it has not been centered in preprocessing, but it has
        been scaled by sqrt(sample weights).

        When self.fit_intercept is False no centering is done.

        The centered X is never actually computed because centering would break
        the sparsity of X.
        """
        center = self.fit_intercept and sparse.issparse(X)
        if not center:
            # in this case centering has been done in preprocessing
            # or we are not fitting an intercept.
            X_mean = np.zeros(X.shape[1], dtype=X.dtype)
            return safe_sparse_dot(X, X.T, dense_output=True), X_mean
        # X is sparse
        n_samples = X.shape[0]
        sample_weight_matrix = sparse.dia_matrix(
            (sqrt_sw, 0), shape=(n_samples, n_samples)
        )
        X_weighted = sample_weight_matrix.dot(X)
        X_mean, _ = mean_variance_axis(X_weighted, axis=0)
        X_mean *= n_samples / sqrt_sw.dot(sqrt_sw)
        X_mX = sqrt_sw[:, None] * safe_sparse_dot(X_mean, X.T, dense_output=True)
        X_mX_m = np.outer(sqrt_sw, sqrt_sw) * np.dot(X_mean, X_mean)
        return (
            safe_sparse_dot(X, X.T, dense_output=True) + X_mX_m - X_mX - X_mX.T,
            X_mean,
        )

    def _compute_covariance(self, X, sqrt_sw):
        """Computes covariance matrix X^TX with possible centering.

        Parameters
        ----------
        X : sparse matrix of shape (n_samples, n_features)
            The preprocessed design matrix.

        sqrt_sw : ndarray of shape (n_samples,)
            square roots of sample weights

        Returns
        -------
        covariance : ndarray of shape (n_features, n_features)
            The covariance matrix.
        X_mean : ndarray of shape (n_features,)
            The weighted mean of ``X`` for each feature.

        Notes
        -----
        Since X is sparse it has not been centered in preprocessing, but it has
        been scaled by sqrt(sample weights).

        When self.fit_intercept is False no centering is done.

        The centered X is never actually computed because centering would break
        the sparsity of X.
        """
        if not self.fit_intercept:
            # in this case centering has been done in preprocessing
            # or we are not fitting an intercept.
            X_mean = np.zeros(X.shape[1], dtype=X.dtype)
            return safe_sparse_dot(X.T, X, dense_output=True), X_mean
        # this function only gets called for sparse X
        n_samples = X.shape[0]
        sample_weight_matrix = sparse.dia_matrix(
            (sqrt_sw, 0), shape=(n_samples, n_samples)
        )
        X_weighted = sample_weight_matrix.dot(X)
        X_mean, _ = mean_variance_axis(X_weighted, axis=0)
        X_mean = X_mean * n_samples / sqrt_sw.dot(sqrt_sw)
        weight_sum = sqrt_sw.dot(sqrt_sw)
        return (
            safe_sparse_dot(X.T, X, dense_output=True)
            - weight_sum * np.outer(X_mean, X_mean),
            X_mean,
        )

    def _sparse_multidot_diag(self, X, A, X_mean, sqrt_sw):
        """Compute the diagonal of (X - X_mean).dot(A).dot((X - X_mean).T)
        without explicitly centering X nor computing X.dot(A)
        when X is sparse.

        Parameters
        ----------
        X : sparse matrix of shape (n_samples, n_features)

        A : ndarray of shape (n_features, n_features)

        X_mean : ndarray of shape (n_features,)

        sqrt_sw : ndarray of shape (n_samples,)
            square roots of sample weights

        Returns
        -------
        diag : np.ndarray, shape (n_samples,)
            The computed diagonal.
        """
        intercept_col = scale = sqrt_sw
        batch_size = X.shape[1]
        diag = np.empty(X.shape[0], dtype=X.dtype)
        for start in range(0, X.shape[0], batch_size):
            batch = slice(start, min(X.shape[0], start + batch_size), 1)
            X_batch = np.empty(
                (X[batch].shape[0], X.shape[1] + self.fit_intercept), dtype=X.dtype
            )
            if self.fit_intercept:
                X_batch[:, :-1] = X[batch].A - X_mean * scale[batch][:, None]
                X_batch[:, -1] = intercept_col[batch]
            else:
                X_batch = X[batch].A
            diag[batch] = (X_batch.dot(A) * X_batch).sum(axis=1)
        return diag

    def _eigen_decompose_gram(self, X, y, sqrt_sw):
        """Eigendecomposition of X.X^T, used when n_samples <= n_features."""
        # if X is dense it has already been centered in preprocessing
        K, X_mean = self._compute_gram(X, sqrt_sw)
        if self.fit_intercept:
            # to emulate centering X with sample weights,
            # ie removing the weighted average, we add a column
            # containing the square roots of the sample weights.
            # by centering, it is orthogonal to the other columns
            K += np.outer(sqrt_sw, sqrt_sw)
        eigvals, Q = linalg.eigh(K)
        QT_y = np.dot(Q.T, y)
        return X_mean, eigvals, Q, QT_y

    def _solve_eigen_gram(self, alpha, y, sqrt_sw, X_mean, eigvals, Q, QT_y):
        """Compute dual coefficients and diagonal of G^-1.

        Used when we have a decomposition of X.X^T (n_samples <= n_features).
        """
        w = 1.0 / (eigvals + alpha)
        if self.fit_intercept:
            # the vector containing the square roots of the sample weights (1
            # when no sample weights) is the eigenvector of XX^T which
            # corresponds to the intercept; we cancel the regularization on
            # this dimension. the corresponding eigenvalue is
            # sum(sample_weight).
            normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw)
            intercept_dim = _find_smallest_angle(normalized_sw, Q)
            w[intercept_dim] = 0  # cancel regularization for the intercept
        c = np.dot(Q, self._diag_dot(w, QT_y))
        G_inverse_diag = self._decomp_diag(w, Q)
        # handle case where y is 2-d
        if len(y.shape) != 1:
            G_inverse_diag = G_inverse_diag[:, np.newaxis]
        return G_inverse_diag, c

    def _eigen_decompose_covariance(self, X, y, sqrt_sw):
        """Eigendecomposition of X^T.X, used when n_samples > n_features
        and X is sparse.
        """
        n_samples, n_features = X.shape
        cov = np.empty((n_features + 1, n_features + 1), dtype=X.dtype)
        cov[:-1, :-1], X_mean = self._compute_covariance(X, sqrt_sw)
        if not self.fit_intercept:
            cov = cov[:-1, :-1]
        # to emulate centering X with sample weights,
        # ie removing the weighted average, we add a column
        # containing the square roots of the sample weights.
        # by centering, it is orthogonal to the other columns
        # when all samples have the same weight we add a column of 1
        else:
            cov[-1] = 0
            cov[:, -1] = 0
            cov[-1, -1] = sqrt_sw.dot(sqrt_sw)
        nullspace_dim = max(0, n_features - n_samples)
        eigvals, V = linalg.eigh(cov)
        # remove eigenvalues and vectors in the null space of X^T.X
        eigvals = eigvals[nullspace_dim:]
        V = V[:, nullspace_dim:]
        return X_mean, eigvals, V, X

    def _solve_eigen_covariance_no_intercept(
        self, alpha, y, sqrt_sw, X_mean, eigvals, V, X
    ):
        """Compute dual coefficients and diagonal of G^-1.

        Used when we have a decomposition of X^T.X
        (n_samples > n_features and X is sparse), and not fitting an intercept.
        """
        w = 1 / (eigvals + alpha)
        A = (V * w).dot(V.T)
        AXy = A.dot(safe_sparse_dot(X.T, y, dense_output=True))
        y_hat = safe_sparse_dot(X, AXy, dense_output=True)
        hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw)
        if len(y.shape) != 1:
            # handle case where y is 2-d
            hat_diag = hat_diag[:, np.newaxis]
        return (1 - hat_diag) / alpha, (y - y_hat) / alpha

    def _solve_eigen_covariance_intercept(
        self, alpha, y, sqrt_sw, X_mean, eigvals, V, X
    ):
        """Compute dual coefficients and diagonal of G^-1.

        Used when we have a decomposition of X^T.X
        (n_samples > n_features and X is sparse),
        and we are fitting an intercept.
        """
        # the vector [0, 0, ..., 0, 1]
        # is the eigenvector of X^TX which
        # corresponds to the intercept; we cancel the regularization on
        # this dimension. the corresponding eigenvalue is
        # sum(sample_weight), e.g. n when uniform sample weights.
        intercept_sv = np.zeros(V.shape[0])
        intercept_sv[-1] = 1
        intercept_dim = _find_smallest_angle(intercept_sv, V)
        w = 1 / (eigvals + alpha)
        w[intercept_dim] = 1 / eigvals[intercept_dim]
        A = (V * w).dot(V.T)
        # add a column to X containing the square roots of sample weights
        X_op = _X_CenterStackOp(X, X_mean, sqrt_sw)
        AXy = A.dot(X_op.T.dot(y))
        y_hat = X_op.dot(AXy)
        hat_diag = self._sparse_multidot_diag(X, A, X_mean, sqrt_sw)
        # return (1 - hat_diag), (y - y_hat)
        if len(y.shape) != 1:
            # handle case where y is 2-d
            hat_diag = hat_diag[:, np.newaxis]
        return (1 - hat_diag) / alpha, (y - y_hat) / alpha

    def _solve_eigen_covariance(self, alpha, y, sqrt_sw, X_mean, eigvals, V, X):
        """Compute dual coefficients and diagonal of G^-1.

        Used when we have a decomposition of X^T.X
        (n_samples > n_features and X is sparse).
        """
        if self.fit_intercept:
            return self._solve_eigen_covariance_intercept(
                alpha, y, sqrt_sw, X_mean, eigvals, V, X
            )
        return self._solve_eigen_covariance_no_intercept(
            alpha, y, sqrt_sw, X_mean, eigvals, V, X
        )

    def _svd_decompose_design_matrix(self, X, y, sqrt_sw):
        # X already centered
        X_mean = np.zeros(X.shape[1], dtype=X.dtype)
        if self.fit_intercept:
            # to emulate fit_intercept=True situation, add a column
            # containing the square roots of the sample weights
            # by centering, the other columns are orthogonal to that one
            intercept_column = sqrt_sw[:, None]
            X = np.hstack((X, intercept_column))
        U, singvals, _ = linalg.svd(X, full_matrices=0)
        singvals_sq = singvals**2
        UT_y = np.dot(U.T, y)
        return X_mean, singvals_sq, U, UT_y

    def _solve_svd_design_matrix(self, alpha, y, sqrt_sw, X_mean, singvals_sq, U, UT_y):
        """Compute dual coefficients and diagonal of G^-1.

        Used when we have an SVD decomposition of X
        (n_samples > n_features and X is dense).
        """
        w = ((singvals_sq + alpha) ** -1) - (alpha**-1)
        if self.fit_intercept:
            # detect intercept column
            normalized_sw = sqrt_sw / np.linalg.norm(sqrt_sw)
            intercept_dim = _find_smallest_angle(normalized_sw, U)
            # cancel the regularization for the intercept
            w[intercept_dim] = -(alpha**-1)
        c = np.dot(U, self._diag_dot(w, UT_y)) + (alpha**-1) * y
        G_inverse_diag = self._decomp_diag(w, U) + (alpha**-1)
        if len(y.shape) != 1:
            # handle case where y is 2-d
            G_inverse_diag = G_inverse_diag[:, np.newaxis]
        return G_inverse_diag, c

    def fit(self, X, y, sample_weight=None):
        """Fit Ridge regression model with gcv.

        Parameters
        ----------
        X : {ndarray, sparse matrix} of shape (n_samples, n_features)
            Training data. Will be cast to float64 if necessary.

        y : ndarray of shape (n_samples,) or (n_samples, n_targets)
            Target values. Will be cast to float64 if necessary.

        sample_weight : float or ndarray of shape (n_samples,), default=None
            Individual weights for each sample. If given a float, every sample
            will have the same weight.

        Returns
        -------
        self : object
        """
        X, y = self._validate_data(
            X,
            y,
            accept_sparse=["csr", "csc", "coo"],
            dtype=[np.float64],
            multi_output=True,
            y_numeric=True,
        )

        # alpha_per_target cannot be used in classifier mode. All subclasses
        # of _RidgeGCV that are classifiers keep alpha_per_target at its
        # default value: False, so the condition below should never happen.
        assert not (self.is_clf and self.alpha_per_target)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        self.alphas = np.asarray(self.alphas)

        X, y, X_offset, y_offset, X_scale = _preprocess_data(
            X,
            y,
            self.fit_intercept,
            copy=self.copy_X,
            sample_weight=sample_weight,
        )

        gcv_mode = _check_gcv_mode(X, self.gcv_mode)

        if gcv_mode == "eigen":
            decompose = self._eigen_decompose_gram
            solve = self._solve_eigen_gram
        elif gcv_mode == "svd":
            if sparse.issparse(X):
                decompose = self._eigen_decompose_covariance
                solve = self._solve_eigen_covariance
            else:
                decompose = self._svd_decompose_design_matrix
                solve = self._solve_svd_design_matrix

        n_samples = X.shape[0]

        if sample_weight is not None:
            X, y, sqrt_sw = _rescale_data(X, y, sample_weight)
        else:
            sqrt_sw = np.ones(n_samples, dtype=X.dtype)

        X_mean, *decomposition = decompose(X, y, sqrt_sw)

        scorer = check_scoring(self, scoring=self.scoring, allow_none=True)
        error = scorer is None

        n_y = 1 if len(y.shape) == 1 else y.shape[1]
        n_alphas = 1 if np.ndim(self.alphas) == 0 else len(self.alphas)

        if self.store_cv_values:
            self.cv_values_ = np.empty((n_samples * n_y, n_alphas), dtype=X.dtype)

        best_coef, best_score, best_alpha = None, None, None

        for i, alpha in enumerate(np.atleast_1d(self.alphas)):
            G_inverse_diag, c = solve(float(alpha), y, sqrt_sw, X_mean, *decomposition)
            if error:
                squared_errors = (c / G_inverse_diag) ** 2
                if self.alpha_per_target:
                    alpha_score = -squared_errors.mean(axis=0)
                else:
                    alpha_score = -squared_errors.mean()
                if self.store_cv_values:
                    self.cv_values_[:, i] = squared_errors.ravel()
            else:
                predictions = y - (c / G_inverse_diag)
                if self.store_cv_values:
                    self.cv_values_[:, i] = predictions.ravel()

                if self.is_clf:
                    identity_estimator = _IdentityClassifier(classes=np.arange(n_y))
                    alpha_score = scorer(
                        identity_estimator, predictions, y.argmax(axis=1)
                    )
                else:
                    identity_estimator = _IdentityRegressor()
                    if self.alpha_per_target:
                        alpha_score = np.array(
                            [
                                scorer(identity_estimator, predictions[:, j], y[:, j])
                                for j in range(n_y)
                            ]
                        )
                    else:
                        alpha_score = scorer(
                            identity_estimator, predictions.ravel(), y.ravel()
                        )

            # Keep track of the best model
            if best_score is None:
                # initialize
                if self.alpha_per_target and n_y > 1:
                    best_coef = c
                    best_score = np.atleast_1d(alpha_score)
                    best_alpha = np.full(n_y, alpha)
                else:
                    best_coef = c
                    best_score = alpha_score
                    best_alpha = alpha
            else:
                # update
                if self.alpha_per_target and n_y > 1:
                    to_update = alpha_score > best_score
                    best_coef[:, to_update] = c[:, to_update]
                    best_score[to_update] = alpha_score[to_update]
                    best_alpha[to_update] = alpha
                elif alpha_score > best_score:
                    best_coef, best_score, best_alpha = c, alpha_score, alpha

        self.alpha_ = best_alpha
        self.best_score_ = best_score
        self.dual_coef_ = best_coef
        self.coef_ = safe_sparse_dot(self.dual_coef_.T, X)

        if sparse.issparse(X):
            X_offset = X_mean * X_scale
        else:
            X_offset += X_mean * X_scale
        self._set_intercept(X_offset, y_offset, X_scale)

        if self.store_cv_values:
            if len(y.shape) == 1:
                cv_values_shape = n_samples, n_alphas
            else:
                cv_values_shape = n_samples, n_y, n_alphas
            self.cv_values_ = self.cv_values_.reshape(cv_values_shape)

        return self
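

# Illustrative sketch (not part of scikit-learn): the Notes of _RidgeGCV state
# that the leave-one-out errors are looe = c / diag(G^-1), with
# G = X X^T + alpha * Id and c = G^-1 y. The hypothetical helper below checks
# that identity against explicit refits on a tiny dense problem without an
# intercept, which is the setting the formula describes directly.
def _demo_loo_identity(alpha=1.0):  # hypothetical helper, illustration only
    import numpy as np

    rng = np.random.RandomState(0)
    X = rng.rand(8, 3)
    y = rng.rand(8)
    G = X @ X.T + alpha * np.eye(8)
    c = np.linalg.solve(G, y)
    looe_formula = c / np.diag(np.linalg.inv(G))

    looe_refit = np.empty(8)
    for i in range(8):
        mask = np.arange(8) != i
        # ridge solution without sample i (primal form, no intercept)
        w = np.linalg.solve(
            X[mask].T @ X[mask] + alpha * np.eye(3), X[mask].T @ y[mask]
        )
        looe_refit[i] = y[i] - X[i] @ w
    assert np.allclose(looe_formula, looe_refit)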


class _BaseRidgeCV(LinearModel):
    _parameter_constraints: dict = {
        "alphas": ["array-like", Interval(Real, 0, None, closed="neither")],
        "fit_intercept": ["boolean"],
        "scoring": [StrOptions(set(get_scorer_names())), callable, None],
        "cv": ["cv_object"],
        "gcv_mode": [StrOptions({"auto", "svd", "eigen"}), None],
        "store_cv_values": ["boolean"],
        "alpha_per_target": ["boolean"],
    }

    def __init__(
        self,
        alphas=(0.1, 1.0, 10.0),
        *,
        fit_intercept=True,
        scoring=None,
        cv=None,
        gcv_mode=None,
        store_cv_values=False,
        alpha_per_target=False,
    ):
        self.alphas = alphas
        self.fit_intercept = fit_intercept
        self.scoring = scoring
        self.cv = cv
        self.gcv_mode = gcv_mode
        self.store_cv_values = store_cv_values
        self.alpha_per_target = alpha_per_target

    def fit(self, X, y, sample_weight=None):
        """Fit Ridge regression model with cv.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data. If using GCV, will be cast to float64
            if necessary.

        y : ndarray of shape (n_samples,) or (n_samples, n_targets)
            Target values. Will be cast to X's dtype if necessary.

        sample_weight : float or ndarray of shape (n_samples,), default=None
            Individual weights for each sample. If given a float, every sample
            will have the same weight.

        Returns
        -------
        self : object
            Fitted estimator.

        Notes
        -----
        When sample_weight is provided, the selected hyperparameter may depend
        on whether we use leave-one-out cross-validation (cv=None or cv='auto')
        or another form of cross-validation, because only leave-one-out
        cross-validation takes the sample weights into account when computing
        the validation score.
        """
        cv = self.cv

        check_scalar_alpha = partial(
            check_scalar,
            target_type=numbers.Real,
            min_val=0.0,
            include_boundaries="neither",
        )

        if isinstance(self.alphas, (np.ndarray, list, tuple)):
            n_alphas = 1 if np.ndim(self.alphas) == 0 else len(self.alphas)
            if n_alphas != 1:
                for index, alpha in enumerate(self.alphas):
                    alpha = check_scalar_alpha(alpha, f"alphas[{index}]")
            else:
                self.alphas[0] = check_scalar_alpha(self.alphas[0], "alphas")

        alphas = np.asarray(self.alphas)

        if cv is None:
            estimator = _RidgeGCV(
                alphas,
                fit_intercept=self.fit_intercept,
                scoring=self.scoring,
                gcv_mode=self.gcv_mode,
                store_cv_values=self.store_cv_values,
                is_clf=is_classifier(self),
                alpha_per_target=self.alpha_per_target,
            )
            estimator.fit(X, y, sample_weight=sample_weight)
            self.alpha_ = estimator.alpha_
            self.best_score_ = estimator.best_score_
            if self.store_cv_values:
                self.cv_values_ = estimator.cv_values_
        else:
            if self.store_cv_values:
                raise ValueError("cv!=None and store_cv_values=True are incompatible")
            if self.alpha_per_target:
                raise ValueError("cv!=None and alpha_per_target=True are incompatible")
            parameters = {"alpha": alphas}
            solver = "sparse_cg" if sparse.issparse(X) else "auto"
            model = RidgeClassifier if is_classifier(self) else Ridge
            gs = GridSearchCV(
                model(
                    fit_intercept=self.fit_intercept,
                    solver=solver,
                ),
                parameters,
                cv=cv,
                scoring=self.scoring,
            )
            gs.fit(X, y, sample_weight=sample_weight)
            estimator = gs.best_estimator_
            self.alpha_ = gs.best_estimator_.alpha
            self.best_score_ = gs.best_score_

        self.coef_ = estimator.coef_
        self.intercept_ = estimator.intercept_
        self.n_features_in_ = estimator.n_features_in_
        if hasattr(estimator, "feature_names_in_"):
            self.feature_names_in_ = estimator.feature_names_in_

        return self
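

# Illustrative sketch (not part of scikit-learn): _BaseRidgeCV.fit above takes
# one of two routes. With cv=None the efficient _RidgeGCV leave-one-out path is
# used (and store_cv_values / alpha_per_target are allowed); with any other cv
# a plain GridSearchCV over Ridge or RidgeClassifier is run. The hypothetical
# helper below runs both routes on the same data; the two selected alphas can
# differ because the cross-validation schemes differ.
def _demo_ridgecv_paths():  # hypothetical helper, illustration only
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import RidgeCV

    X, y = load_diabetes(return_X_y=True)
    loo = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0]).fit(X, y)  # cv=None -> GCV/LOO
    kfold = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0], cv=5).fit(X, y)  # GridSearchCV
    return loo.alpha_, kfold.alpha_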


class RidgeCV(MultiOutputMixin, RegressorMixin, _BaseRidgeCV):
    """Ridge regression with built-in cross-validation.

    See glossary entry for :term:`cross-validation estimator`.

    By default, it performs efficient Leave-One-Out Cross-Validation.

    Read more in the :ref:`User Guide <ridge_regression>`.

    Parameters
    ----------
    alphas : array-like of shape (n_alphas,), default=(0.1, 1.0, 10.0)
        Array of alpha values to try.
        Regularization strength; must be a positive float. Regularization
        improves the conditioning of the problem and reduces the variance of
        the estimates. Larger values specify stronger regularization.
        Alpha corresponds to ``1 / (2C)`` in other linear models such as
        :class:`~sklearn.linear_model.LogisticRegression` or
        :class:`~sklearn.svm.LinearSVC`.
        If using Leave-One-Out cross-validation, alphas must be positive.

    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (i.e. data is expected to be centered).

    scoring : str, callable, default=None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.
        If None, the negative mean squared error is used if cv is 'auto' or
        None (i.e. when using leave-one-out cross-validation), and r2 score
        otherwise.

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the efficient Leave-One-Out cross-validation
        - integer, to specify the number of folds.
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`~sklearn.model_selection.StratifiedKFold` is used, else,
        :class:`~sklearn.model_selection.KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    gcv_mode : {'auto', 'svd', 'eigen'}, default='auto'
        Flag indicating which strategy to use when performing
        Leave-One-Out Cross-Validation. Options are::

            'auto' : use 'svd' if n_samples > n_features, otherwise use 'eigen'
            'svd' : force use of singular value decomposition of X when X is
                dense, eigenvalue decomposition of X^T.X when X is sparse.
            'eigen' : force computation via eigendecomposition of X.X^T

        The 'auto' mode is the default and is intended to pick the cheaper
        option of the two depending on the shape of the training data.

    store_cv_values : bool, default=False
        Flag indicating if the cross-validation values corresponding to
        each alpha should be stored in the ``cv_values_`` attribute (see
        below). This flag is only compatible with ``cv=None`` (i.e. using
        Leave-One-Out Cross-Validation).

    alpha_per_target : bool, default=False
        Flag indicating whether to optimize the alpha value (picked from the
        `alphas` parameter list) for each target separately (for multi-output
        settings: multiple prediction targets). When set to `True`, after
        fitting, the `alpha_` attribute will contain a value for each target.
        When set to `False`, a single alpha is used for all targets.

        .. versionadded:: 0.24

    Attributes
    ----------
    cv_values_ : ndarray of shape (n_samples, n_alphas) or \
        shape (n_samples, n_targets, n_alphas), optional
        Cross-validation values for each alpha (only available if
        ``store_cv_values=True`` and ``cv=None``). After ``fit()`` has been
        called, this attribute will contain the mean squared errors if
        `scoring is None` otherwise it will contain standardized per point
        prediction values.

    coef_ : ndarray of shape (n_features,) or (n_targets, n_features)
        Weight vector(s).

    intercept_ : float or ndarray of shape (n_targets,)
        Independent term in decision function. Set to 0.0 if
        ``fit_intercept = False``.

    alpha_ : float or ndarray of shape (n_targets,)
        Estimated regularization parameter, or, if ``alpha_per_target=True``,
        the estimated regularization parameter for each target.

    best_score_ : float or ndarray of shape (n_targets,)
        Score of base estimator with best alpha, or, if
        ``alpha_per_target=True``, a score for each target.

        .. versionadded:: 0.23

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Ridge : Ridge regression.
    RidgeClassifier : Classifier based on ridge regression on {-1, 1} labels.
    RidgeClassifierCV : Ridge classifier with built-in cross validation.

    Examples
    --------
    >>> from sklearn.datasets import load_diabetes
    >>> from sklearn.linear_model import RidgeCV
    >>> X, y = load_diabetes(return_X_y=True)
    >>> clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y)
    >>> clf.score(X, y)
    0.5166...
    """

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """Fit Ridge regression model with cv.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data. If using GCV, will be cast to float64
            if necessary.

        y : ndarray of shape (n_samples,) or (n_samples, n_targets)
            Target values. Will be cast to X's dtype if necessary.

        sample_weight : float or ndarray of shape (n_samples,), default=None
            Individual weights for each sample. If given a float, every sample
            will have the same weight.

        Returns
        -------
        self : object
            Fitted estimator.

        Notes
        -----
        When sample_weight is provided, the selected hyperparameter may depend
        on whether we use leave-one-out cross-validation (cv=None or cv='auto')
        or another form of cross-validation, because only leave-one-out
        cross-validation takes the sample weights into account when computing
        the validation score.
        """
        super().fit(X, y, sample_weight=sample_weight)
        return self
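

# Illustrative sketch (not part of scikit-learn): with cv=None the attributes
# documented above become available, e.g. per-alpha leave-one-out errors via
# store_cv_values=True and one alpha per output via alpha_per_target=True.
# The helper name and the random data below are hypothetical.
def _demo_ridgecv_attributes():  # hypothetical helper, illustration only
    import numpy as np
    from sklearn.linear_model import RidgeCV

    rng = np.random.RandomState(0)
    X = rng.rand(50, 4)
    Y = rng.rand(50, 2)  # two targets
    reg = RidgeCV(
        alphas=[0.1, 1.0, 10.0], store_cv_values=True, alpha_per_target=True
    ).fit(X, Y)
    assert reg.cv_values_.shape == (50, 2, 3)  # (n_samples, n_targets, n_alphas)
    assert reg.alpha_.shape == (2,)  # one alpha per target
    return reg.alpha_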


class RidgeClassifierCV(_RidgeClassifierMixin, _BaseRidgeCV):
    """Ridge classifier with built-in cross-validation.

    See glossary entry for :term:`cross-validation estimator`.

    By default, it performs Leave-One-Out Cross-Validation. Currently,
    only the n_features > n_samples case is handled efficiently.

    Read more in the :ref:`User Guide <ridge_regression>`.

    Parameters
    ----------
    alphas : array-like of shape (n_alphas,), default=(0.1, 1.0, 10.0)
        Array of alpha values to try.
        Regularization strength; must be a positive float. Regularization
        improves the conditioning of the problem and reduces the variance of
        the estimates. Larger values specify stronger regularization.
        Alpha corresponds to ``1 / (2C)`` in other linear models such as
        :class:`~sklearn.linear_model.LogisticRegression` or
        :class:`~sklearn.svm.LinearSVC`.

    fit_intercept : bool, default=True
        Whether to calculate the intercept for this model. If set
        to false, no intercept will be used in calculations
        (i.e. data is expected to be centered).

    scoring : str, callable, default=None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    cv : int, cross-validation generator or an iterable, default=None
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:

        - None, to use the efficient Leave-One-Out cross-validation
        - integer, to specify the number of folds.
        - :term:`CV splitter`,
        - An iterable yielding (train, test) splits as arrays of indices.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validation strategies that can be used here.

    class_weight : dict or 'balanced', default=None
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one.

        The "balanced" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``.

    store_cv_values : bool, default=False
        Flag indicating if the cross-validation values corresponding to
        each alpha should be stored in the ``cv_values_`` attribute (see
        below). This flag is only compatible with ``cv=None`` (i.e. using
        Leave-One-Out Cross-Validation).

    Attributes
    ----------
    cv_values_ : ndarray of shape (n_samples, n_targets, n_alphas), optional
        Cross-validation values for each alpha (only if ``store_cv_values=True``
        and ``cv=None``). After ``fit()`` has been called, this attribute will
        contain the mean squared errors if `scoring is None` otherwise it
        will contain standardized per point prediction values.

    coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)
        Coefficient of the features in the decision function.

        ``coef_`` is of shape (1, n_features) when the given problem is binary.

    intercept_ : float or ndarray of shape (n_targets,)
        Independent term in decision function. Set to 0.0 if
        ``fit_intercept = False``.

    alpha_ : float
        Estimated regularization parameter.

    best_score_ : float
        Score of base estimator with best alpha.

        .. versionadded:: 0.23

    classes_ : ndarray of shape (n_classes,)
        The classes labels.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    Ridge : Ridge regression.
    RidgeClassifier : Ridge classifier.
    RidgeCV : Ridge regression with built-in cross validation.

    Notes
    -----
    For multi-class classification, n_class classifiers are trained in
    a one-versus-all approach. Concretely, this is implemented by taking
    advantage of the multi-variate response support in Ridge.

    Examples
    --------
    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.linear_model import RidgeClassifierCV
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y)
    >>> clf.score(X, y)
    0.9630...
    """

    _parameter_constraints: dict = {
        **_BaseRidgeCV._parameter_constraints,
        "class_weight": [dict, StrOptions({"balanced"}), None],
    }
    for param in ("gcv_mode", "alpha_per_target"):
        _parameter_constraints.pop(param)

    def __init__(
        self,
        alphas=(0.1, 1.0, 10.0),
        *,
        fit_intercept=True,
        scoring=None,
        cv=None,
        class_weight=None,
        store_cv_values=False,
    ):
        super().__init__(
            alphas=alphas,
            fit_intercept=fit_intercept,
            scoring=scoring,
            cv=cv,
            store_cv_values=store_cv_values,
        )
        self.class_weight = class_weight

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """Fit Ridge classifier with cv.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training vectors, where `n_samples` is the number of samples
            and `n_features` is the number of features. When using GCV,
            will be cast to float64 if necessary.

        y : ndarray of shape (n_samples,)
            Target values. Will be cast to X's dtype if necessary.

        sample_weight : float or ndarray of shape (n_samples,), default=None
            Individual weights for each sample. If given a float, every sample
            will have the same weight.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        # `RidgeClassifier` does not accept the "sag" or "saga" solvers and thus
        # supports csr, csc, and coo sparse matrices. By using solver="eigen" we
        # force acceptance of all sparse formats.
        X, y, sample_weight, Y = self._prepare_data(X, y, sample_weight, solver="eigen")

        # If cv is None, gcv mode will be used and we use the binarized Y
        # since y will not be binarized in the _RidgeGCV estimator.
        # If cv is not None, a GridSearchCV with some RidgeClassifier
        # estimators is used where y will be binarized. Thus, we pass y
        # instead of the binarized Y.
        target = Y if self.cv is None else y
        super().fit(X, target, sample_weight=sample_weight)
        return self

    def _more_tags(self):
        return {
            "multilabel": True,
            "_xfail_checks": {
                "check_sample_weights_invariance": (
                    "zero sample_weight is not equivalent to removing samples"
                ),
            },
        }
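

# Illustrative sketch (not part of scikit-learn): class_weight='balanced'
# re-weights samples inversely to class frequency, which matters on imbalanced
# data. The hypothetical helper below compares the two settings on a skewed
# synthetic dataset; recall on the minority class typically improves with
# "balanced", though the exact numbers depend on the data.
def _demo_class_weight_balanced():  # hypothetical helper, illustration only
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.linear_model import RidgeClassifierCV

    X, y = make_classification(n_samples=300, weights=[0.9, 0.1], random_state=0)
    plain = RidgeClassifierCV(alphas=[0.1, 1.0, 10.0]).fit(X, y)
    balanced = RidgeClassifierCV(
        alphas=[0.1, 1.0, 10.0], class_weight="balanced"
    ).fit(X, y)
    minority = y == 1
    return (
        np.mean(plain.predict(X[minority]) == 1),
        np.mean(balanced.predict(X[minority]) == 1),
    )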