_logistic.py

  1. """
  2. Logistic Regression
  3. """
  4. # Author: Gael Varoquaux <gael.varoquaux@normalesup.org>
  5. # Fabian Pedregosa <f@bianp.net>
  6. # Alexandre Gramfort <alexandre.gramfort@telecom-paristech.fr>
  7. # Manoj Kumar <manojkumarsivaraj334@gmail.com>
  8. # Lars Buitinck
  9. # Simon Wu <s8wu@uwaterloo.ca>
10. # Arthur Mensch <arthur.mensch@m4x.org>
  11. import numbers
  12. import warnings
  13. from numbers import Integral, Real
  14. import numpy as np
  15. from joblib import effective_n_jobs
  16. from scipy import optimize
  17. from sklearn.metrics import get_scorer_names
  18. from .._loss.loss import HalfBinomialLoss, HalfMultinomialLoss
  19. from ..base import _fit_context
  20. from ..metrics import get_scorer
  21. from ..model_selection import check_cv
  22. from ..preprocessing import LabelBinarizer, LabelEncoder
  23. from ..svm._base import _fit_liblinear
  24. from ..utils import (
  25. check_array,
  26. check_consistent_length,
  27. check_random_state,
  28. compute_class_weight,
  29. )
  30. from ..utils._param_validation import Interval, StrOptions
  31. from ..utils.extmath import row_norms, softmax
  32. from ..utils.multiclass import check_classification_targets
  33. from ..utils.optimize import _check_optimize_result, _newton_cg
  34. from ..utils.parallel import Parallel, delayed
  35. from ..utils.validation import _check_sample_weight, check_is_fitted
  36. from ._base import BaseEstimator, LinearClassifierMixin, SparseCoefMixin
  37. from ._glm.glm import NewtonCholeskySolver
  38. from ._linear_loss import LinearModelLoss
  39. from ._sag import sag_solver
  40. _LOGISTIC_SOLVER_CONVERGENCE_MSG = (
  41. "Please also refer to the documentation for alternative solver options:\n"
  42. " https://scikit-learn.org/stable/modules/linear_model.html"
  43. "#logistic-regression"
  44. )
  45. def _check_solver(solver, penalty, dual):
  46. # TODO(1.4): Remove "none" option
  47. if solver not in ["liblinear", "saga"] and penalty not in ("l2", "none", None):
  48. raise ValueError(
  49. "Solver %s supports only 'l2' or 'none' penalties, got %s penalty."
  50. % (solver, penalty)
  51. )
  52. if solver != "liblinear" and dual:
  53. raise ValueError(
  54. "Solver %s supports only dual=False, got dual=%s" % (solver, dual)
  55. )
  56. if penalty == "elasticnet" and solver != "saga":
  57. raise ValueError(
  58. "Only 'saga' solver supports elasticnet penalty, got solver={}.".format(
  59. solver
  60. )
  61. )
  62. if solver == "liblinear" and penalty == "none":
  63. raise ValueError("penalty='none' is not supported for the liblinear solver")
  64. return solver
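# For illustration (comment only): _check_solver("lbfgs", "l1", dual=False)
# raises a ValueError because only 'liblinear' and 'saga' handle the l1
# penalty, and _check_solver("lbfgs", "l2", dual=True) raises because only
# 'liblinear' implements the dual formulation.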
  65. def _check_multi_class(multi_class, solver, n_classes):
  66. """Computes the multi class type, either "multinomial" or "ovr".
  67. For `n_classes` > 2 and a solver that supports it, returns "multinomial".
68. For all other cases, in particular binary classification, returns "ovr".
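A short doctest-style sketch of the resolution logic (private helper):
>>> _check_multi_class("auto", "lbfgs", 3)
'multinomial'
>>> _check_multi_class("auto", "liblinear", 3)
'ovr'
>>> _check_multi_class("auto", "lbfgs", 2)
'ovr'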
  69. """
  70. if multi_class == "auto":
  71. if solver in ("liblinear", "newton-cholesky"):
  72. multi_class = "ovr"
  73. elif n_classes > 2:
  74. multi_class = "multinomial"
  75. else:
  76. multi_class = "ovr"
  77. if multi_class == "multinomial" and solver in ("liblinear", "newton-cholesky"):
  78. raise ValueError("Solver %s does not support a multinomial backend." % solver)
  79. return multi_class
  80. def _logistic_regression_path(
  81. X,
  82. y,
  83. pos_class=None,
  84. Cs=10,
  85. fit_intercept=True,
  86. max_iter=100,
  87. tol=1e-4,
  88. verbose=0,
  89. solver="lbfgs",
  90. coef=None,
  91. class_weight=None,
  92. dual=False,
  93. penalty="l2",
  94. intercept_scaling=1.0,
  95. multi_class="auto",
  96. random_state=None,
  97. check_input=True,
  98. max_squared_sum=None,
  99. sample_weight=None,
  100. l1_ratio=None,
  101. n_threads=1,
  102. ):
  103. """Compute a Logistic Regression model for a list of regularization
  104. parameters.
  105. This is an implementation that uses the result of the previous model
  106. to speed up computations along the set of solutions, making it faster
  107. than sequentially calling LogisticRegression for the different parameters.
  108. Note that there will be no speedup with liblinear solver, since it does
  109. not handle warm-starting.
  110. Read more in the :ref:`User Guide <logistic_regression>`.
  111. Parameters
  112. ----------
  113. X : {array-like, sparse matrix} of shape (n_samples, n_features)
  114. Input data.
  115. y : array-like of shape (n_samples,) or (n_samples, n_targets)
116. Target values.
  117. pos_class : int, default=None
  118. The class with respect to which we perform a one-vs-all fit.
  119. If None, then it is assumed that the given problem is binary.
  120. Cs : int or array-like of shape (n_cs,), default=10
  121. List of values for the regularization parameter or integer specifying
  122. the number of regularization parameters that should be used. In this
  123. case, the parameters will be chosen in a logarithmic scale between
  124. 1e-4 and 1e4.
  125. fit_intercept : bool, default=True
  126. Whether to fit an intercept for the model. In this case the shape of
  127. the returned array is (n_cs, n_features + 1).
  128. max_iter : int, default=100
  129. Maximum number of iterations for the solver.
  130. tol : float, default=1e-4
  131. Stopping criterion. For the newton-cg and lbfgs solvers, the iteration
  132. will stop when ``max{|g_i | i = 1, ..., n} <= tol``
  133. where ``g_i`` is the i-th component of the gradient.
  134. verbose : int, default=0
  135. For the liblinear and lbfgs solvers set verbose to any positive
  136. number for verbosity.
  137. solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \
  138. default='lbfgs'
  139. Numerical solver to use.
  140. coef : array-like of shape (n_features,), default=None
  141. Initialization value for coefficients of logistic regression.
  142. Useless for liblinear solver.
  143. class_weight : dict or 'balanced', default=None
  144. Weights associated with classes in the form ``{class_label: weight}``.
  145. If not given, all classes are supposed to have weight one.
  146. The "balanced" mode uses the values of y to automatically adjust
  147. weights inversely proportional to class frequencies in the input data
  148. as ``n_samples / (n_classes * np.bincount(y))``.
  149. Note that these weights will be multiplied with sample_weight (passed
  150. through the fit method) if sample_weight is specified.
  151. dual : bool, default=False
  152. Dual or primal formulation. Dual formulation is only implemented for
  153. l2 penalty with liblinear solver. Prefer dual=False when
  154. n_samples > n_features.
  155. penalty : {'l1', 'l2', 'elasticnet'}, default='l2'
  156. Used to specify the norm used in the penalization. The 'newton-cg',
  157. 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is
  158. only supported by the 'saga' solver.
  159. intercept_scaling : float, default=1.
  160. Useful only when the solver 'liblinear' is used
  161. and self.fit_intercept is set to True. In this case, x becomes
  162. [x, self.intercept_scaling],
  163. i.e. a "synthetic" feature with constant value equal to
  164. intercept_scaling is appended to the instance vector.
  165. The intercept becomes ``intercept_scaling * synthetic_feature_weight``.
  166. Note! the synthetic feature weight is subject to l1/l2 regularization
  167. as all other features.
  168. To lessen the effect of regularization on synthetic feature weight
  169. (and therefore on the intercept) intercept_scaling has to be increased.
  170. multi_class : {'ovr', 'multinomial', 'auto'}, default='auto'
  171. If the option chosen is 'ovr', then a binary problem is fit for each
  172. label. For 'multinomial' the loss minimised is the multinomial loss fit
  173. across the entire probability distribution, *even when the data is
  174. binary*. 'multinomial' is unavailable when solver='liblinear'.
175. 'auto' selects 'ovr' if the data is binary, or if solver='liblinear'
176. or solver='newton-cholesky', and otherwise selects 'multinomial'.
  177. .. versionadded:: 0.18
  178. Stochastic Average Gradient descent solver for 'multinomial' case.
  179. .. versionchanged:: 0.22
  180. Default changed from 'ovr' to 'auto' in 0.22.
  181. random_state : int, RandomState instance, default=None
  182. Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the
  183. data. See :term:`Glossary <random_state>` for details.
  184. check_input : bool, default=True
  185. If False, the input arrays X and y will not be checked.
  186. max_squared_sum : float, default=None
  187. Maximum squared sum of X over samples. Used only in SAG solver.
  188. If None, it will be computed, going through all the samples.
  189. The value should be precomputed to speed up cross validation.
190. sample_weight : array-like of shape (n_samples,), default=None
  191. Array of weights that are assigned to individual samples.
  192. If not provided, then each sample is given unit weight.
  193. l1_ratio : float, default=None
  194. The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only
  195. used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent
  196. to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent
197. to using ``penalty='l1'``. For ``0 < l1_ratio < 1``, the penalty is a
  198. combination of L1 and L2.
  199. n_threads : int, default=1
  200. Number of OpenMP threads to use.
  201. Returns
  202. -------
  203. coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1)
  204. List of coefficients for the Logistic Regression model. If
  205. fit_intercept is set to True then the second dimension will be
  206. n_features + 1, where the last item represents the intercept. For
207. ``multi_class='multinomial'``, the shape is (n_classes, n_cs,
  208. n_features) or (n_classes, n_cs, n_features + 1).
  209. Cs : ndarray
  210. Grid of Cs used for cross-validation.
  211. n_iter : array of shape (n_cs,)
212. Actual number of iterations for each Cs.
  213. Notes
  214. -----
  215. You might get slightly different results with the solver liblinear than
  216. with the others since this uses LIBLINEAR which penalizes the intercept.
  217. .. versionchanged:: 0.19
  218. The "copy" parameter was removed.
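Examples
--------
A minimal, illustrative call of this private helper on a binary problem.
With the default ``fit_intercept=True``, the last column of ``coefs`` holds
the intercept:
>>> from sklearn.datasets import make_classification
>>> from sklearn.linear_model._logistic import _logistic_regression_path
>>> X, y = make_classification(n_samples=50, n_features=5, random_state=0)
>>> coefs, Cs, n_iter = _logistic_regression_path(X, y, Cs=3, solver="lbfgs")
>>> coefs.shape, Cs.shape, n_iter.shape
((3, 6), (3,), (3,))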
  219. """
  220. if isinstance(Cs, numbers.Integral):
  221. Cs = np.logspace(-4, 4, Cs)
  222. solver = _check_solver(solver, penalty, dual)
  223. # Preprocessing.
  224. if check_input:
  225. X = check_array(
  226. X,
  227. accept_sparse="csr",
  228. dtype=np.float64,
  229. accept_large_sparse=solver not in ["liblinear", "sag", "saga"],
  230. )
  231. y = check_array(y, ensure_2d=False, dtype=None)
  232. check_consistent_length(X, y)
  233. n_samples, n_features = X.shape
  234. classes = np.unique(y)
  235. random_state = check_random_state(random_state)
  236. multi_class = _check_multi_class(multi_class, solver, len(classes))
  237. if pos_class is None and multi_class != "multinomial":
  238. if classes.size > 2:
  239. raise ValueError("To fit OvR, use the pos_class argument")
  240. # np.unique(y) gives labels in sorted order.
  241. pos_class = classes[1]
  242. # If sample weights exist, convert them to array (support for lists)
  243. # and check length
  244. # Otherwise set them to 1 for all examples
  245. sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype, copy=True)
  246. if solver == "newton-cholesky":
  247. # IMPORTANT NOTE: Rescaling of sample_weight:
  248. # Same as in _GeneralizedLinearRegressor.fit().
  249. # We want to minimize
  250. # obj = 1/(2*sum(sample_weight)) * sum(sample_weight * deviance)
  251. # + 1/2 * alpha * L2,
  252. # with
  253. # deviance = 2 * log_loss.
  254. # The objective is invariant to multiplying sample_weight by a constant. We
  255. # choose this constant such that sum(sample_weight) = 1. Thus, we end up with
  256. # obj = sum(sample_weight * loss) + 1/2 * alpha * L2.
  257. # Note that LinearModelLoss.loss() computes sum(sample_weight * loss).
  258. #
  259. # This rescaling has to be done before multiplying by class_weights.
  260. sw_sum = sample_weight.sum() # needed to rescale penalty, nasty matter!
  261. sample_weight = sample_weight / sw_sum
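# Illustrative algebra (comment only): with sw' = sw / sum(sw), the objective
# sum(sw' * loss) + 1/2 * (1 / (C * sum(sw))) * ||coef||_2^2
# equals 1/sum(sw) * [sum(sw * loss) + 1/2 * (1/C) * ||coef||_2^2],
# i.e. it has the same minimizer as the unscaled objective used by the other
# solvers; this is also why l2_reg_strength is divided by sw_sum below.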
  262. # If class_weights is a dict (provided by the user), the weights
  263. # are assigned to the original labels. If it is "balanced", then
  264. # the class_weights are assigned after masking the labels with a OvR.
  265. le = LabelEncoder()
  266. if isinstance(class_weight, dict) or multi_class == "multinomial":
  267. class_weight_ = compute_class_weight(class_weight, classes=classes, y=y)
  268. sample_weight *= class_weight_[le.fit_transform(y)]
  269. # For doing a ovr, we need to mask the labels first. For the
  270. # multinomial case this is not necessary.
  271. if multi_class == "ovr":
  272. w0 = np.zeros(n_features + int(fit_intercept), dtype=X.dtype)
  273. mask = y == pos_class
  274. y_bin = np.ones(y.shape, dtype=X.dtype)
  275. if solver in ["lbfgs", "newton-cg", "newton-cholesky"]:
  276. # HalfBinomialLoss, used for those solvers, represents y in [0, 1] instead
  277. # of in [-1, 1].
  278. mask_classes = np.array([0, 1])
  279. y_bin[~mask] = 0.0
  280. else:
  281. mask_classes = np.array([-1, 1])
  282. y_bin[~mask] = -1.0
  283. # for compute_class_weight
  284. if class_weight == "balanced":
  285. class_weight_ = compute_class_weight(
  286. class_weight, classes=mask_classes, y=y_bin
  287. )
  288. sample_weight *= class_weight_[le.fit_transform(y_bin)]
  289. else:
  290. if solver in ["sag", "saga", "lbfgs", "newton-cg"]:
  291. # SAG, lbfgs and newton-cg multinomial solvers need LabelEncoder,
  292. # not LabelBinarizer, i.e. y as a 1d-array of integers.
  293. # LabelEncoder also saves memory compared to LabelBinarizer, especially
  294. # when n_classes is large.
  295. le = LabelEncoder()
  296. Y_multi = le.fit_transform(y).astype(X.dtype, copy=False)
  297. else:
  298. # For liblinear solver, apply LabelBinarizer, i.e. y is one-hot encoded.
  299. lbin = LabelBinarizer()
  300. Y_multi = lbin.fit_transform(y)
  301. if Y_multi.shape[1] == 1:
  302. Y_multi = np.hstack([1 - Y_multi, Y_multi])
  303. w0 = np.zeros(
  304. (classes.size, n_features + int(fit_intercept)), order="F", dtype=X.dtype
  305. )
  306. if coef is not None:
  307. # it must work both giving the bias term and not
  308. if multi_class == "ovr":
  309. if coef.size not in (n_features, w0.size):
  310. raise ValueError(
  311. "Initialization coef is of shape %d, expected shape %d or %d"
  312. % (coef.size, n_features, w0.size)
  313. )
  314. w0[: coef.size] = coef
  315. else:
  316. # For binary problems coef.shape[0] should be 1, otherwise it
  317. # should be classes.size.
  318. n_classes = classes.size
  319. if n_classes == 2:
  320. n_classes = 1
  321. if coef.shape[0] != n_classes or coef.shape[1] not in (
  322. n_features,
  323. n_features + 1,
  324. ):
  325. raise ValueError(
  326. "Initialization coef is of shape (%d, %d), expected "
  327. "shape (%d, %d) or (%d, %d)"
  328. % (
  329. coef.shape[0],
  330. coef.shape[1],
  331. classes.size,
  332. n_features,
  333. classes.size,
  334. n_features + 1,
  335. )
  336. )
  337. if n_classes == 1:
  338. w0[0, : coef.shape[1]] = -coef
  339. w0[1, : coef.shape[1]] = coef
  340. else:
  341. w0[:, : coef.shape[1]] = coef
  342. if multi_class == "multinomial":
  343. if solver in ["lbfgs", "newton-cg"]:
  344. # scipy.optimize.minimize and newton-cg accept only ravelled parameters,
  345. # i.e. 1d-arrays. LinearModelLoss expects classes to be contiguous and
  346. # reconstructs the 2d-array via w0.reshape((n_classes, -1), order="F").
  347. # As w0 is F-contiguous, ravel(order="F") also avoids a copy.
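# Concrete illustration: np.arange(6).reshape(2, 3) raveled with order="F"
# gives [0, 3, 1, 4, 2, 5], and reshape((2, -1), order="F") restores the
# original 2x3 layout.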
  348. w0 = w0.ravel(order="F")
  349. loss = LinearModelLoss(
  350. base_loss=HalfMultinomialLoss(n_classes=classes.size),
  351. fit_intercept=fit_intercept,
  352. )
  353. target = Y_multi
354. if solver == "lbfgs":
  355. func = loss.loss_gradient
  356. elif solver == "newton-cg":
  357. func = loss.loss
  358. grad = loss.gradient
  359. hess = loss.gradient_hessian_product # hess = [gradient, hessp]
  360. warm_start_sag = {"coef": w0.T}
  361. else:
  362. target = y_bin
  363. if solver == "lbfgs":
  364. loss = LinearModelLoss(
  365. base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept
  366. )
  367. func = loss.loss_gradient
  368. elif solver == "newton-cg":
  369. loss = LinearModelLoss(
  370. base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept
  371. )
  372. func = loss.loss
  373. grad = loss.gradient
  374. hess = loss.gradient_hessian_product # hess = [gradient, hessp]
  375. elif solver == "newton-cholesky":
  376. loss = LinearModelLoss(
  377. base_loss=HalfBinomialLoss(), fit_intercept=fit_intercept
  378. )
  379. warm_start_sag = {"coef": np.expand_dims(w0, axis=1)}
  380. coefs = list()
  381. n_iter = np.zeros(len(Cs), dtype=np.int32)
  382. for i, C in enumerate(Cs):
  383. if solver == "lbfgs":
  384. l2_reg_strength = 1.0 / C
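# Map the public `verbose` level to L-BFGS-B's `iprint` argument:
# verbose=0 -> -1 (silent), 1 -> 50, 2 -> 1, 3 -> 100, >=4 -> 101.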
  385. iprint = [-1, 50, 1, 100, 101][
  386. np.searchsorted(np.array([0, 1, 2, 3]), verbose)
  387. ]
  388. opt_res = optimize.minimize(
  389. func,
  390. w0,
  391. method="L-BFGS-B",
  392. jac=True,
  393. args=(X, target, sample_weight, l2_reg_strength, n_threads),
  394. options={"iprint": iprint, "gtol": tol, "maxiter": max_iter},
  395. )
  396. n_iter_i = _check_optimize_result(
  397. solver,
  398. opt_res,
  399. max_iter,
  400. extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,
  401. )
  402. w0, loss = opt_res.x, opt_res.fun
  403. elif solver == "newton-cg":
  404. l2_reg_strength = 1.0 / C
  405. args = (X, target, sample_weight, l2_reg_strength, n_threads)
  406. w0, n_iter_i = _newton_cg(
  407. hess, func, grad, w0, args=args, maxiter=max_iter, tol=tol
  408. )
  409. elif solver == "newton-cholesky":
  410. # The division by sw_sum is a consequence of the rescaling of
  411. # sample_weight, see comment above.
  412. l2_reg_strength = 1.0 / C / sw_sum
  413. sol = NewtonCholeskySolver(
  414. coef=w0,
  415. linear_loss=loss,
  416. l2_reg_strength=l2_reg_strength,
  417. tol=tol,
  418. max_iter=max_iter,
  419. n_threads=n_threads,
  420. verbose=verbose,
  421. )
  422. w0 = sol.solve(X=X, y=target, sample_weight=sample_weight)
  423. n_iter_i = sol.iteration
  424. elif solver == "liblinear":
  425. (
  426. coef_,
  427. intercept_,
  428. n_iter_i,
  429. ) = _fit_liblinear(
  430. X,
  431. target,
  432. C,
  433. fit_intercept,
  434. intercept_scaling,
  435. None,
  436. penalty,
  437. dual,
  438. verbose,
  439. max_iter,
  440. tol,
  441. random_state,
  442. sample_weight=sample_weight,
  443. )
  444. if fit_intercept:
  445. w0 = np.concatenate([coef_.ravel(), intercept_])
  446. else:
  447. w0 = coef_.ravel()
  448. # n_iter_i is an array for each class. However, `target` is always encoded
  449. # in {-1, 1}, so we only take the first element of n_iter_i.
  450. n_iter_i = n_iter_i.item()
  451. elif solver in ["sag", "saga"]:
  452. if multi_class == "multinomial":
  453. target = target.astype(X.dtype, copy=False)
  454. loss = "multinomial"
  455. else:
  456. loss = "log"
  457. # alpha is for L2-norm, beta is for L1-norm
  458. if penalty == "l1":
  459. alpha = 0.0
  460. beta = 1.0 / C
  461. elif penalty == "l2":
  462. alpha = 1.0 / C
  463. beta = 0.0
  464. else: # Elastic-Net penalty
  465. alpha = (1.0 / C) * (1 - l1_ratio)
  466. beta = (1.0 / C) * l1_ratio
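# Worked example (illustration only): penalty='elasticnet', C=2.0 and
# l1_ratio=0.25 give alpha = 0.5 * 0.75 = 0.375 and beta = 0.5 * 0.25 = 0.125.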
  467. w0, n_iter_i, warm_start_sag = sag_solver(
  468. X,
  469. target,
  470. sample_weight,
  471. loss,
  472. alpha,
  473. beta,
  474. max_iter,
  475. tol,
  476. verbose,
  477. random_state,
  478. False,
  479. max_squared_sum,
  480. warm_start_sag,
  481. is_saga=(solver == "saga"),
  482. )
  483. else:
  484. raise ValueError(
485. "solver must be one of {'liblinear', 'lbfgs', 'newton-cg', "
486. "'newton-cholesky', 'sag', 'saga'}, got '%s' instead" % solver
  487. )
  488. if multi_class == "multinomial":
  489. n_classes = max(2, classes.size)
  490. if solver in ["lbfgs", "newton-cg"]:
  491. multi_w0 = np.reshape(w0, (n_classes, -1), order="F")
  492. else:
  493. multi_w0 = w0
  494. if n_classes == 2:
  495. multi_w0 = multi_w0[1][np.newaxis, :]
  496. coefs.append(multi_w0.copy())
  497. else:
  498. coefs.append(w0.copy())
  499. n_iter[i] = n_iter_i
  500. return np.array(coefs), np.array(Cs), n_iter
  501. # helper function for LogisticCV
  502. def _log_reg_scoring_path(
  503. X,
  504. y,
  505. train,
  506. test,
  507. pos_class=None,
  508. Cs=10,
  509. scoring=None,
  510. fit_intercept=False,
  511. max_iter=100,
  512. tol=1e-4,
  513. class_weight=None,
  514. verbose=0,
  515. solver="lbfgs",
  516. penalty="l2",
  517. dual=False,
  518. intercept_scaling=1.0,
  519. multi_class="auto",
  520. random_state=None,
  521. max_squared_sum=None,
  522. sample_weight=None,
  523. l1_ratio=None,
  524. ):
  525. """Computes scores across logistic_regression_path
  526. Parameters
  527. ----------
  528. X : {array-like, sparse matrix} of shape (n_samples, n_features)
  529. Training data.
  530. y : array-like of shape (n_samples,) or (n_samples, n_targets)
  531. Target labels.
  532. train : list of indices
  533. The indices of the train set.
  534. test : list of indices
  535. The indices of the test set.
  536. pos_class : int, default=None
  537. The class with respect to which we perform a one-vs-all fit.
  538. If None, then it is assumed that the given problem is binary.
  539. Cs : int or list of floats, default=10
  540. Each of the values in Cs describes the inverse of
541. regularization strength. If Cs is an int, then a grid of Cs
  542. values are chosen in a logarithmic scale between 1e-4 and 1e4.
  543. If not provided, then a fixed set of values for Cs are used.
544. scoring : str or callable, default=None
  545. A string (see model evaluation documentation) or
  546. a scorer callable object / function with signature
  547. ``scorer(estimator, X, y)``. For a list of scoring functions
  548. that can be used, look at :mod:`sklearn.metrics`. The
  549. default scoring option used is accuracy_score.
  550. fit_intercept : bool, default=False
  551. If False, then the bias term is set to zero. Else the last
  552. term of each coef_ gives us the intercept.
  553. max_iter : int, default=100
  554. Maximum number of iterations for the solver.
  555. tol : float, default=1e-4
  556. Tolerance for stopping criteria.
  557. class_weight : dict or 'balanced', default=None
  558. Weights associated with classes in the form ``{class_label: weight}``.
  559. If not given, all classes are supposed to have weight one.
  560. The "balanced" mode uses the values of y to automatically adjust
  561. weights inversely proportional to class frequencies in the input data
  562. as ``n_samples / (n_classes * np.bincount(y))``
  563. Note that these weights will be multiplied with sample_weight (passed
  564. through the fit method) if sample_weight is specified.
  565. verbose : int, default=0
  566. For the liblinear and lbfgs solvers set verbose to any positive
  567. number for verbosity.
  568. solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \
  569. default='lbfgs'
  570. Decides which solver to use.
  571. penalty : {'l1', 'l2', 'elasticnet'}, default='l2'
  572. Used to specify the norm used in the penalization. The 'newton-cg',
  573. 'sag' and 'lbfgs' solvers support only l2 penalties. 'elasticnet' is
  574. only supported by the 'saga' solver.
  575. dual : bool, default=False
  576. Dual or primal formulation. Dual formulation is only implemented for
  577. l2 penalty with liblinear solver. Prefer dual=False when
  578. n_samples > n_features.
  579. intercept_scaling : float, default=1.
  580. Useful only when the solver 'liblinear' is used
  581. and self.fit_intercept is set to True. In this case, x becomes
  582. [x, self.intercept_scaling],
583. i.e. a "synthetic" feature with constant value equal to
584. intercept_scaling is appended to the instance vector.
585. The intercept becomes ``intercept_scaling * synthetic_feature_weight``.
  586. Note! the synthetic feature weight is subject to l1/l2 regularization
  587. as all other features.
  588. To lessen the effect of regularization on synthetic feature weight
  589. (and therefore on the intercept) intercept_scaling has to be increased.
  590. multi_class : {'auto', 'ovr', 'multinomial'}, default='auto'
  591. If the option chosen is 'ovr', then a binary problem is fit for each
  592. label. For 'multinomial' the loss minimised is the multinomial loss fit
  593. across the entire probability distribution, *even when the data is
  594. binary*. 'multinomial' is unavailable when solver='liblinear'.
  595. random_state : int, RandomState instance, default=None
  596. Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the
  597. data. See :term:`Glossary <random_state>` for details.
  598. max_squared_sum : float, default=None
  599. Maximum squared sum of X over samples. Used only in SAG solver.
  600. If None, it will be computed, going through all the samples.
  601. The value should be precomputed to speed up cross validation.
602. sample_weight : array-like of shape (n_samples,), default=None
  603. Array of weights that are assigned to individual samples.
  604. If not provided, then each sample is given unit weight.
  605. l1_ratio : float, default=None
  606. The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only
  607. used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent
  608. to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent
609. to using ``penalty='l1'``. For ``0 < l1_ratio < 1``, the penalty is a
  610. combination of L1 and L2.
  611. Returns
  612. -------
  613. coefs : ndarray of shape (n_cs, n_features) or (n_cs, n_features + 1)
  614. List of coefficients for the Logistic Regression model. If
  615. fit_intercept is set to True then the second dimension will be
  616. n_features + 1, where the last item represents the intercept.
  617. Cs : ndarray
  618. Grid of Cs used for cross-validation.
  619. scores : ndarray of shape (n_cs,)
  620. Scores obtained for each Cs.
621. n_iter : ndarray of shape (n_cs,)
622. Actual number of iterations for each Cs.
  623. """
  624. X_train = X[train]
  625. X_test = X[test]
  626. y_train = y[train]
  627. y_test = y[test]
  628. if sample_weight is not None:
  629. sample_weight = _check_sample_weight(sample_weight, X)
  630. sample_weight = sample_weight[train]
  631. coefs, Cs, n_iter = _logistic_regression_path(
  632. X_train,
  633. y_train,
  634. Cs=Cs,
  635. l1_ratio=l1_ratio,
  636. fit_intercept=fit_intercept,
  637. solver=solver,
  638. max_iter=max_iter,
  639. class_weight=class_weight,
  640. pos_class=pos_class,
  641. multi_class=multi_class,
  642. tol=tol,
  643. verbose=verbose,
  644. dual=dual,
  645. penalty=penalty,
  646. intercept_scaling=intercept_scaling,
  647. random_state=random_state,
  648. check_input=False,
  649. max_squared_sum=max_squared_sum,
  650. sample_weight=sample_weight,
  651. )
  652. log_reg = LogisticRegression(solver=solver, multi_class=multi_class)
  653. # The score method of Logistic Regression has a classes_ attribute.
  654. if multi_class == "ovr":
  655. log_reg.classes_ = np.array([-1, 1])
  656. elif multi_class == "multinomial":
  657. log_reg.classes_ = np.unique(y_train)
  658. else:
  659. raise ValueError(
660. "multi_class should be either multinomial or ovr, got %s" % multi_class
  661. )
  662. if pos_class is not None:
  663. mask = y_test == pos_class
  664. y_test = np.ones(y_test.shape, dtype=np.float64)
  665. y_test[~mask] = -1.0
  666. scores = list()
  667. scoring = get_scorer(scoring)
  668. for w in coefs:
  669. if multi_class == "ovr":
  670. w = w[np.newaxis, :]
  671. if fit_intercept:
  672. log_reg.coef_ = w[:, :-1]
  673. log_reg.intercept_ = w[:, -1]
  674. else:
  675. log_reg.coef_ = w
  676. log_reg.intercept_ = 0.0
  677. if scoring is None:
  678. scores.append(log_reg.score(X_test, y_test))
  679. else:
  680. scores.append(scoring(log_reg, X_test, y_test))
  681. return coefs, Cs, np.array(scores), n_iter
  682. class LogisticRegression(LinearClassifierMixin, SparseCoefMixin, BaseEstimator):
  683. """
  684. Logistic Regression (aka logit, MaxEnt) classifier.
  685. In the multiclass case, the training algorithm uses the one-vs-rest (OvR)
  686. scheme if the 'multi_class' option is set to 'ovr', and uses the
  687. cross-entropy loss if the 'multi_class' option is set to 'multinomial'.
  688. (Currently the 'multinomial' option is supported only by the 'lbfgs',
  689. 'sag', 'saga' and 'newton-cg' solvers.)
  690. This class implements regularized logistic regression using the
  691. 'liblinear' library, 'newton-cg', 'sag', 'saga' and 'lbfgs' solvers. **Note
  692. that regularization is applied by default**. It can handle both dense
  693. and sparse input. Use C-ordered arrays or CSR matrices containing 64-bit
  694. floats for optimal performance; any other input format will be converted
  695. (and copied).
  696. The 'newton-cg', 'sag', and 'lbfgs' solvers support only L2 regularization
  697. with primal formulation, or no regularization. The 'liblinear' solver
  698. supports both L1 and L2 regularization, with a dual formulation only for
  699. the L2 penalty. The Elastic-Net regularization is only supported by the
  700. 'saga' solver.
  701. Read more in the :ref:`User Guide <logistic_regression>`.
  702. Parameters
  703. ----------
  704. penalty : {'l1', 'l2', 'elasticnet', None}, default='l2'
  705. Specify the norm of the penalty:
  706. - `None`: no penalty is added;
707. - `'l2'`: add an L2 penalty term and it is the default choice;
708. - `'l1'`: add an L1 penalty term;
  709. - `'elasticnet'`: both L1 and L2 penalty terms are added.
  710. .. warning::
  711. Some penalties may not work with some solvers. See the parameter
  712. `solver` below, to know the compatibility between the penalty and
  713. solver.
  714. .. versionadded:: 0.19
  715. l1 penalty with SAGA solver (allowing 'multinomial' + L1)
  716. .. deprecated:: 1.2
  717. The 'none' option was deprecated in version 1.2, and will be removed
  718. in 1.4. Use `None` instead.
  719. dual : bool, default=False
  720. Dual (constrained) or primal (regularized, see also
  721. :ref:`this equation <regularized-logistic-loss>`) formulation. Dual formulation
  722. is only implemented for l2 penalty with liblinear solver. Prefer dual=False when
  723. n_samples > n_features.
  724. tol : float, default=1e-4
  725. Tolerance for stopping criteria.
  726. C : float, default=1.0
  727. Inverse of regularization strength; must be a positive float.
  728. Like in support vector machines, smaller values specify stronger
  729. regularization.
  730. fit_intercept : bool, default=True
  731. Specifies if a constant (a.k.a. bias or intercept) should be
  732. added to the decision function.
  733. intercept_scaling : float, default=1
  734. Useful only when the solver 'liblinear' is used
  735. and self.fit_intercept is set to True. In this case, x becomes
  736. [x, self.intercept_scaling],
  737. i.e. a "synthetic" feature with constant value equal to
  738. intercept_scaling is appended to the instance vector.
  739. The intercept becomes ``intercept_scaling * synthetic_feature_weight``.
  740. Note! the synthetic feature weight is subject to l1/l2 regularization
  741. as all other features.
  742. To lessen the effect of regularization on synthetic feature weight
  743. (and therefore on the intercept) intercept_scaling has to be increased.
  744. class_weight : dict or 'balanced', default=None
  745. Weights associated with classes in the form ``{class_label: weight}``.
  746. If not given, all classes are supposed to have weight one.
  747. The "balanced" mode uses the values of y to automatically adjust
  748. weights inversely proportional to class frequencies in the input data
  749. as ``n_samples / (n_classes * np.bincount(y))``.
  750. Note that these weights will be multiplied with sample_weight (passed
  751. through the fit method) if sample_weight is specified.
  752. .. versionadded:: 0.17
  753. *class_weight='balanced'*
  754. random_state : int, RandomState instance, default=None
  755. Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the
  756. data. See :term:`Glossary <random_state>` for details.
  757. solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \
  758. default='lbfgs'
  759. Algorithm to use in the optimization problem. Default is 'lbfgs'.
  760. To choose a solver, you might want to consider the following aspects:
  761. - For small datasets, 'liblinear' is a good choice, whereas 'sag'
  762. and 'saga' are faster for large ones;
  763. - For multiclass problems, only 'newton-cg', 'sag', 'saga' and
  764. 'lbfgs' handle multinomial loss;
  765. - 'liblinear' is limited to one-versus-rest schemes.
  766. - 'newton-cholesky' is a good choice for `n_samples` >> `n_features`,
  767. especially with one-hot encoded categorical features with rare
  768. categories. Note that it is limited to binary classification and the
  769. one-versus-rest reduction for multiclass classification. Be aware that
  770. the memory usage of this solver has a quadratic dependency on
  771. `n_features` because it explicitly computes the Hessian matrix.
  772. .. warning::
  773. The choice of the algorithm depends on the penalty chosen.
  774. Supported penalties by solver:
  775. - 'lbfgs' - ['l2', None]
  776. - 'liblinear' - ['l1', 'l2']
  777. - 'newton-cg' - ['l2', None]
  778. - 'newton-cholesky' - ['l2', None]
  779. - 'sag' - ['l2', None]
  780. - 'saga' - ['elasticnet', 'l1', 'l2', None]
  781. .. note::
  782. 'sag' and 'saga' fast convergence is only guaranteed on features
  783. with approximately the same scale. You can preprocess the data with
  784. a scaler from :mod:`sklearn.preprocessing`.
  785. .. seealso::
  786. Refer to the User Guide for more information regarding
  787. :class:`LogisticRegression` and more specifically the
  788. :ref:`Table <Logistic_regression>`
  789. summarizing solver/penalty supports.
  790. .. versionadded:: 0.17
  791. Stochastic Average Gradient descent solver.
  792. .. versionadded:: 0.19
  793. SAGA solver.
  794. .. versionchanged:: 0.22
  795. The default solver changed from 'liblinear' to 'lbfgs' in 0.22.
  796. .. versionadded:: 1.2
  797. newton-cholesky solver.
  798. max_iter : int, default=100
  799. Maximum number of iterations taken for the solvers to converge.
  800. multi_class : {'auto', 'ovr', 'multinomial'}, default='auto'
  801. If the option chosen is 'ovr', then a binary problem is fit for each
  802. label. For 'multinomial' the loss minimised is the multinomial loss fit
  803. across the entire probability distribution, *even when the data is
  804. binary*. 'multinomial' is unavailable when solver='liblinear'.
805. 'auto' selects 'ovr' if the data is binary, or if solver='liblinear'
806. or solver='newton-cholesky', and otherwise selects 'multinomial'.
  807. .. versionadded:: 0.18
  808. Stochastic Average Gradient descent solver for 'multinomial' case.
  809. .. versionchanged:: 0.22
  810. Default changed from 'ovr' to 'auto' in 0.22.
  811. verbose : int, default=0
  812. For the liblinear and lbfgs solvers set verbose to any positive
  813. number for verbosity.
  814. warm_start : bool, default=False
  815. When set to True, reuse the solution of the previous call to fit as
  816. initialization, otherwise, just erase the previous solution.
  817. Useless for liblinear solver. See :term:`the Glossary <warm_start>`.
  818. .. versionadded:: 0.17
  819. *warm_start* to support *lbfgs*, *newton-cg*, *sag*, *saga* solvers.
  820. n_jobs : int, default=None
  821. Number of CPU cores used when parallelizing over classes if
822. multi_class='ovr'. This parameter is ignored when the ``solver`` is
  823. set to 'liblinear' regardless of whether 'multi_class' is specified or
  824. not. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
  825. context. ``-1`` means using all processors.
  826. See :term:`Glossary <n_jobs>` for more details.
  827. l1_ratio : float, default=None
  828. The Elastic-Net mixing parameter, with ``0 <= l1_ratio <= 1``. Only
  829. used if ``penalty='elasticnet'``. Setting ``l1_ratio=0`` is equivalent
  830. to using ``penalty='l2'``, while setting ``l1_ratio=1`` is equivalent
831. to using ``penalty='l1'``. For ``0 < l1_ratio < 1``, the penalty is a
  832. combination of L1 and L2.
  833. Attributes
  834. ----------
  835. classes_ : ndarray of shape (n_classes, )
  836. A list of class labels known to the classifier.
  837. coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)
  838. Coefficient of the features in the decision function.
  839. `coef_` is of shape (1, n_features) when the given problem is binary.
  840. In particular, when `multi_class='multinomial'`, `coef_` corresponds
  841. to outcome 1 (True) and `-coef_` corresponds to outcome 0 (False).
  842. intercept_ : ndarray of shape (1,) or (n_classes,)
  843. Intercept (a.k.a. bias) added to the decision function.
  844. If `fit_intercept` is set to False, the intercept is set to zero.
  845. `intercept_` is of shape (1,) when the given problem is binary.
  846. In particular, when `multi_class='multinomial'`, `intercept_`
  847. corresponds to outcome 1 (True) and `-intercept_` corresponds to
  848. outcome 0 (False).
  849. n_features_in_ : int
  850. Number of features seen during :term:`fit`.
  851. .. versionadded:: 0.24
  852. feature_names_in_ : ndarray of shape (`n_features_in_`,)
  853. Names of features seen during :term:`fit`. Defined only when `X`
  854. has feature names that are all strings.
  855. .. versionadded:: 1.0
  856. n_iter_ : ndarray of shape (n_classes,) or (1, )
  857. Actual number of iterations for all classes. If binary or multinomial,
  858. it returns only 1 element. For liblinear solver, only the maximum
859. number of iterations across all classes is given.
  860. .. versionchanged:: 0.20
  861. In SciPy <= 1.0.0 the number of lbfgs iterations may exceed
  862. ``max_iter``. ``n_iter_`` will now report at most ``max_iter``.
  863. See Also
  864. --------
  865. SGDClassifier : Incrementally trained logistic regression (when given
  866. the parameter ``loss="log_loss"``).
  867. LogisticRegressionCV : Logistic regression with built-in cross validation.
  868. Notes
  869. -----
  870. The underlying C implementation uses a random number generator to
  871. select features when fitting the model. It is thus not uncommon,
  872. to have slightly different results for the same input data. If
  873. that happens, try with a smaller tol parameter.
  874. Predict output may not match that of standalone liblinear in certain
  875. cases. See :ref:`differences from liblinear <liblinear_differences>`
  876. in the narrative documentation.
  877. References
  878. ----------
  879. L-BFGS-B -- Software for Large-scale Bound-constrained Optimization
  880. Ciyou Zhu, Richard Byrd, Jorge Nocedal and Jose Luis Morales.
  881. http://users.iems.northwestern.edu/~nocedal/lbfgsb.html
  882. LIBLINEAR -- A Library for Large Linear Classification
  883. https://www.csie.ntu.edu.tw/~cjlin/liblinear/
  884. SAG -- Mark Schmidt, Nicolas Le Roux, and Francis Bach
  885. Minimizing Finite Sums with the Stochastic Average Gradient
  886. https://hal.inria.fr/hal-00860051/document
  887. SAGA -- Defazio, A., Bach F. & Lacoste-Julien S. (2014).
  888. :arxiv:`"SAGA: A Fast Incremental Gradient Method With Support
  889. for Non-Strongly Convex Composite Objectives" <1407.0202>`
  890. Hsiang-Fu Yu, Fang-Lan Huang, Chih-Jen Lin (2011). Dual coordinate descent
  891. methods for logistic regression and maximum entropy models.
  892. Machine Learning 85(1-2):41-75.
  893. https://www.csie.ntu.edu.tw/~cjlin/papers/maxent_dual.pdf
  894. Examples
  895. --------
  896. >>> from sklearn.datasets import load_iris
  897. >>> from sklearn.linear_model import LogisticRegression
  898. >>> X, y = load_iris(return_X_y=True)
  899. >>> clf = LogisticRegression(random_state=0).fit(X, y)
  900. >>> clf.predict(X[:2, :])
  901. array([0, 0])
  902. >>> clf.predict_proba(X[:2, :])
  903. array([[9.8...e-01, 1.8...e-02, 1.4...e-08],
  904. [9.7...e-01, 2.8...e-02, ...e-08]])
  905. >>> clf.score(X, y)
  906. 0.97...
  907. """
  908. _parameter_constraints: dict = {
  909. # TODO(1.4): Remove "none" option
  910. "penalty": [
  911. StrOptions({"l1", "l2", "elasticnet", "none"}, deprecated={"none"}),
  912. None,
  913. ],
  914. "dual": ["boolean"],
  915. "tol": [Interval(Real, 0, None, closed="left")],
  916. "C": [Interval(Real, 0, None, closed="right")],
  917. "fit_intercept": ["boolean"],
  918. "intercept_scaling": [Interval(Real, 0, None, closed="neither")],
  919. "class_weight": [dict, StrOptions({"balanced"}), None],
  920. "random_state": ["random_state"],
  921. "solver": [
  922. StrOptions(
  923. {"lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"}
  924. )
  925. ],
  926. "max_iter": [Interval(Integral, 0, None, closed="left")],
  927. "multi_class": [StrOptions({"auto", "ovr", "multinomial"})],
  928. "verbose": ["verbose"],
  929. "warm_start": ["boolean"],
  930. "n_jobs": [None, Integral],
  931. "l1_ratio": [Interval(Real, 0, 1, closed="both"), None],
  932. }
  933. def __init__(
  934. self,
  935. penalty="l2",
  936. *,
  937. dual=False,
  938. tol=1e-4,
  939. C=1.0,
  940. fit_intercept=True,
  941. intercept_scaling=1,
  942. class_weight=None,
  943. random_state=None,
  944. solver="lbfgs",
  945. max_iter=100,
  946. multi_class="auto",
  947. verbose=0,
  948. warm_start=False,
  949. n_jobs=None,
  950. l1_ratio=None,
  951. ):
  952. self.penalty = penalty
  953. self.dual = dual
  954. self.tol = tol
  955. self.C = C
  956. self.fit_intercept = fit_intercept
  957. self.intercept_scaling = intercept_scaling
  958. self.class_weight = class_weight
  959. self.random_state = random_state
  960. self.solver = solver
  961. self.max_iter = max_iter
  962. self.multi_class = multi_class
  963. self.verbose = verbose
  964. self.warm_start = warm_start
  965. self.n_jobs = n_jobs
  966. self.l1_ratio = l1_ratio
  967. @_fit_context(prefer_skip_nested_validation=True)
  968. def fit(self, X, y, sample_weight=None):
  969. """
  970. Fit the model according to the given training data.
  971. Parameters
  972. ----------
  973. X : {array-like, sparse matrix} of shape (n_samples, n_features)
  974. Training vector, where `n_samples` is the number of samples and
  975. `n_features` is the number of features.
  976. y : array-like of shape (n_samples,)
  977. Target vector relative to X.
978. sample_weight : array-like of shape (n_samples,), default=None
  979. Array of weights that are assigned to individual samples.
  980. If not provided, then each sample is given unit weight.
  981. .. versionadded:: 0.17
  982. *sample_weight* support to LogisticRegression.
  983. Returns
  984. -------
  985. self
  986. Fitted estimator.
  987. Notes
  988. -----
  989. The SAGA solver supports both float64 and float32 bit arrays.
  990. """
  991. solver = _check_solver(self.solver, self.penalty, self.dual)
  992. if self.penalty != "elasticnet" and self.l1_ratio is not None:
  993. warnings.warn(
  994. "l1_ratio parameter is only used when penalty is "
  995. "'elasticnet'. Got "
  996. "(penalty={})".format(self.penalty)
  997. )
  998. if self.penalty == "elasticnet" and self.l1_ratio is None:
  999. raise ValueError("l1_ratio must be specified when penalty is elasticnet.")
  1000. # TODO(1.4): Remove "none" option
  1001. if self.penalty == "none":
  1002. warnings.warn(
  1003. (
1004. "`penalty='none'` has been deprecated in 1.2 and will be removed in"
  1005. " 1.4. To keep the past behaviour, set `penalty=None`."
  1006. ),
  1007. FutureWarning,
  1008. )
  1009. if self.penalty is None or self.penalty == "none":
  1010. if self.C != 1.0: # default values
  1011. warnings.warn(
  1012. "Setting penalty=None will ignore the C and l1_ratio parameters"
  1013. )
  1014. # Note that check for l1_ratio is done right above
  1015. C_ = np.inf
  1016. penalty = "l2"
  1017. else:
  1018. C_ = self.C
  1019. penalty = self.penalty
  1020. if solver == "lbfgs":
  1021. _dtype = np.float64
  1022. else:
  1023. _dtype = [np.float64, np.float32]
  1024. X, y = self._validate_data(
  1025. X,
  1026. y,
  1027. accept_sparse="csr",
  1028. dtype=_dtype,
  1029. order="C",
  1030. accept_large_sparse=solver not in ["liblinear", "sag", "saga"],
  1031. )
  1032. check_classification_targets(y)
  1033. self.classes_ = np.unique(y)
  1034. multi_class = _check_multi_class(self.multi_class, solver, len(self.classes_))
  1035. if solver == "liblinear":
  1036. if effective_n_jobs(self.n_jobs) != 1:
  1037. warnings.warn(
  1038. "'n_jobs' > 1 does not have any effect when"
  1039. " 'solver' is set to 'liblinear'. Got 'n_jobs'"
  1040. " = {}.".format(effective_n_jobs(self.n_jobs))
  1041. )
  1042. self.coef_, self.intercept_, self.n_iter_ = _fit_liblinear(
  1043. X,
  1044. y,
  1045. self.C,
  1046. self.fit_intercept,
  1047. self.intercept_scaling,
  1048. self.class_weight,
  1049. self.penalty,
  1050. self.dual,
  1051. self.verbose,
  1052. self.max_iter,
  1053. self.tol,
  1054. self.random_state,
  1055. sample_weight=sample_weight,
  1056. )
  1057. return self
  1058. if solver in ["sag", "saga"]:
  1059. max_squared_sum = row_norms(X, squared=True).max()
  1060. else:
  1061. max_squared_sum = None
  1062. n_classes = len(self.classes_)
  1063. classes_ = self.classes_
  1064. if n_classes < 2:
  1065. raise ValueError(
  1066. "This solver needs samples of at least 2 classes"
  1067. " in the data, but the data contains only one"
  1068. " class: %r"
  1069. % classes_[0]
  1070. )
  1071. if len(self.classes_) == 2:
  1072. n_classes = 1
  1073. classes_ = classes_[1:]
  1074. if self.warm_start:
  1075. warm_start_coef = getattr(self, "coef_", None)
  1076. else:
  1077. warm_start_coef = None
  1078. if warm_start_coef is not None and self.fit_intercept:
  1079. warm_start_coef = np.append(
  1080. warm_start_coef, self.intercept_[:, np.newaxis], axis=1
  1081. )
  1082. # Hack so that we iterate only once for the multinomial case.
  1083. if multi_class == "multinomial":
  1084. classes_ = [None]
  1085. warm_start_coef = [warm_start_coef]
  1086. if warm_start_coef is None:
  1087. warm_start_coef = [None] * n_classes
  1088. path_func = delayed(_logistic_regression_path)
  1089. # The SAG solver releases the GIL so it's more efficient to use
  1090. # threads for this solver.
  1091. if solver in ["sag", "saga"]:
  1092. prefer = "threads"
  1093. else:
  1094. prefer = "processes"
  1095. # TODO: Refactor this to avoid joblib parallelism entirely when doing binary
  1096. # and multinomial multiclass classification and use joblib only for the
  1097. # one-vs-rest multiclass case.
  1098. if (
  1099. solver in ["lbfgs", "newton-cg", "newton-cholesky"]
  1100. and len(classes_) == 1
  1101. and effective_n_jobs(self.n_jobs) == 1
  1102. ):
  1103. # In the future, we would like n_threads = _openmp_effective_n_threads()
  1104. # For the time being, we just do
  1105. n_threads = 1
  1106. else:
  1107. n_threads = 1
  1108. fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)(
  1109. path_func(
  1110. X,
  1111. y,
  1112. pos_class=class_,
  1113. Cs=[C_],
  1114. l1_ratio=self.l1_ratio,
  1115. fit_intercept=self.fit_intercept,
  1116. tol=self.tol,
  1117. verbose=self.verbose,
  1118. solver=solver,
  1119. multi_class=multi_class,
  1120. max_iter=self.max_iter,
  1121. class_weight=self.class_weight,
  1122. check_input=False,
  1123. random_state=self.random_state,
  1124. coef=warm_start_coef_,
  1125. penalty=penalty,
  1126. max_squared_sum=max_squared_sum,
  1127. sample_weight=sample_weight,
  1128. n_threads=n_threads,
  1129. )
  1130. for class_, warm_start_coef_ in zip(classes_, warm_start_coef)
  1131. )
  1132. fold_coefs_, _, n_iter_ = zip(*fold_coefs_)
  1133. self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0]
  1134. n_features = X.shape[1]
  1135. if multi_class == "multinomial":
  1136. self.coef_ = fold_coefs_[0][0]
  1137. else:
  1138. self.coef_ = np.asarray(fold_coefs_)
  1139. self.coef_ = self.coef_.reshape(
  1140. n_classes, n_features + int(self.fit_intercept)
  1141. )
  1142. if self.fit_intercept:
  1143. self.intercept_ = self.coef_[:, -1]
  1144. self.coef_ = self.coef_[:, :-1]
  1145. else:
  1146. self.intercept_ = np.zeros(n_classes)
  1147. return self
  1148. def predict_proba(self, X):
  1149. """
  1150. Probability estimates.
  1151. The returned estimates for all classes are ordered by the
  1152. label of classes.
  1153. For a multi_class problem, if multi_class is set to be "multinomial"
  1154. the softmax function is used to find the predicted probability of
  1155. each class.
1156. Else use a one-vs-rest approach, i.e. calculate the probability
1157. of each class assuming it to be positive using the logistic function,
1158. and normalize these values across all the classes.
  1159. Parameters
  1160. ----------
  1161. X : array-like of shape (n_samples, n_features)
  1162. Vector to be scored, where `n_samples` is the number of samples and
  1163. `n_features` is the number of features.
  1164. Returns
  1165. -------
  1166. T : array-like of shape (n_samples, n_classes)
  1167. Returns the probability of the sample for each class in the model,
  1168. where classes are ordered as they are in ``self.classes_``.
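
        Examples
        --------
        A minimal illustration (shape checks only; the exact probabilities
        depend on the data and solver):

        >>> import numpy as np
        >>> from sklearn.datasets import load_iris
        >>> from sklearn.linear_model import LogisticRegression
        >>> X, y = load_iris(return_X_y=True)
        >>> clf = LogisticRegression(max_iter=1000).fit(X, y)
        >>> proba = clf.predict_proba(X[:5])
        >>> proba.shape
        (5, 3)
        >>> bool(np.allclose(proba.sum(axis=1), 1.0))
        True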
  1169. """
  1170. check_is_fitted(self)
  1171. ovr = self.multi_class in ["ovr", "warn"] or (
  1172. self.multi_class == "auto"
  1173. and (
  1174. self.classes_.size <= 2
  1175. or self.solver in ("liblinear", "newton-cholesky")
  1176. )
  1177. )
  1178. if ovr:
  1179. return super()._predict_proba_lr(X)
  1180. else:
  1181. decision = self.decision_function(X)
  1182. if decision.ndim == 1:
  1183. # Workaround for multi_class="multinomial" and binary outcomes
  1184. # which requires softmax prediction with only a 1D decision.
  1185. decision_2d = np.c_[-decision, decision]
  1186. else:
  1187. decision_2d = decision
  1188. return softmax(decision_2d, copy=False)
    def predict_log_proba(self, X):
        """
        Predict logarithm of probability estimates.

        The returned estimates for all classes are ordered by the
        label of classes.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Vector to be scored, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        Returns
        -------
        T : array-like of shape (n_samples, n_classes)
            Returns the log-probability of the sample for each class in the
            model, where classes are ordered as they are in ``self.classes_``.
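
        Examples
        --------
        A small sanity check (illustrative only): the result is the
        element-wise logarithm of :meth:`predict_proba`.

        >>> import numpy as np
        >>> from sklearn.datasets import load_iris
        >>> from sklearn.linear_model import LogisticRegression
        >>> X, y = load_iris(return_X_y=True)
        >>> clf = LogisticRegression(max_iter=1000).fit(X, y)
        >>> log_p = clf.predict_log_proba(X[:5])
        >>> bool(np.allclose(log_p, np.log(clf.predict_proba(X[:5]))))
        True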
  1204. """
  1205. return np.log(self.predict_proba(X))
class LogisticRegressionCV(LogisticRegression, LinearClassifierMixin, BaseEstimator):
    """Logistic Regression CV (aka logit, MaxEnt) classifier.

    See glossary entry for :term:`cross-validation estimator`.

    This class implements logistic regression using the liblinear, newton-cg,
    sag or lbfgs optimizers. The newton-cg, sag and lbfgs solvers support only
    L2 regularization with primal formulation. The liblinear solver supports
    both L1 and L2 regularization, with a dual formulation only for the L2
    penalty. Elastic-Net penalty is only supported by the saga solver.

    For the grid of `Cs` values and `l1_ratios` values, the best hyperparameter
    is selected by the cross-validator
    :class:`~sklearn.model_selection.StratifiedKFold`, but it can be changed
    using the :term:`cv` parameter. The 'newton-cg', 'sag', 'saga' and 'lbfgs'
    solvers can warm-start the coefficients (see :term:`Glossary<warm_start>`).

    Read more in the :ref:`User Guide <logistic_regression>`.

    Parameters
    ----------
    Cs : int or list of floats, default=10
        Each of the values in Cs describes the inverse of regularization
        strength. If Cs is an int, then a grid of Cs values is chosen
        on a logarithmic scale between 1e-4 and 1e4.
        Like in support vector machines, smaller values specify stronger
        regularization.

    fit_intercept : bool, default=True
        Specifies if a constant (a.k.a. bias or intercept) should be
        added to the decision function.

    cv : int or cross-validation generator, default=None
        The default cross-validation generator used is Stratified K-Folds.
        If an integer is provided, then it is the number of folds used.
        See the :mod:`sklearn.model_selection` module for the
        list of possible cross-validation objects.

        .. versionchanged:: 0.22
            ``cv`` default value if None changed from 3-fold to 5-fold.

    dual : bool, default=False
        Dual (constrained) or primal (regularized, see also
        :ref:`this equation <regularized-logistic-loss>`) formulation. Dual
        formulation is only implemented for l2 penalty with liblinear solver.
        Prefer dual=False when n_samples > n_features.

    penalty : {'l1', 'l2', 'elasticnet'}, default='l2'
        Specify the norm of the penalty:

        - `'l2'`: add a L2 penalty term (used by default);
        - `'l1'`: add a L1 penalty term;
        - `'elasticnet'`: both L1 and L2 penalty terms are added.

        .. warning::
           Some penalties may not work with some solvers. See the parameter
           `solver` below, to know the compatibility between the penalty and
           solver.

    scoring : str or callable, default=None
        A string (see model evaluation documentation) or
        a scorer callable object / function with signature
        ``scorer(estimator, X, y)``. For a list of scoring functions
        that can be used, look at :mod:`sklearn.metrics`. The
        default scoring option used is 'accuracy'.

    solver : {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, \
            default='lbfgs'

        Algorithm to use in the optimization problem. Default is 'lbfgs'.
        To choose a solver, you might want to consider the following aspects:

        - For small datasets, 'liblinear' is a good choice, whereas 'sag'
          and 'saga' are faster for large ones;
        - For multiclass problems, only 'newton-cg', 'sag', 'saga' and
          'lbfgs' handle multinomial loss;
        - 'liblinear' might be slower in :class:`LogisticRegressionCV`
          because it does not handle warm-starting. 'liblinear' is
          limited to one-versus-rest schemes.
        - 'newton-cholesky' is a good choice for `n_samples` >> `n_features`,
          especially with one-hot encoded categorical features with rare
          categories. Note that it is limited to binary classification and the
          one-versus-rest reduction for multiclass classification. Be aware that
          the memory usage of this solver has a quadratic dependency on
          `n_features` because it explicitly computes the Hessian matrix.

        .. warning::
           The choice of the algorithm depends on the penalty chosen.
           Supported penalties by solver:

           - 'lbfgs' - ['l2']
           - 'liblinear' - ['l1', 'l2']
           - 'newton-cg' - ['l2']
           - 'newton-cholesky' - ['l2']
           - 'sag' - ['l2']
           - 'saga' - ['elasticnet', 'l1', 'l2']

        .. note::
           'sag' and 'saga' fast convergence is only guaranteed on features
           with approximately the same scale. You can preprocess the data with
           a scaler from :mod:`sklearn.preprocessing`.

        .. versionadded:: 0.17
           Stochastic Average Gradient descent solver.
        .. versionadded:: 0.19
           SAGA solver.
        .. versionadded:: 1.2
           newton-cholesky solver.

    tol : float, default=1e-4
        Tolerance for stopping criteria.

    max_iter : int, default=100
        Maximum number of iterations of the optimization algorithm.

    class_weight : dict or 'balanced', default=None
        Weights associated with classes in the form ``{class_label: weight}``.
        If not given, all classes are supposed to have weight one.

        The "balanced" mode uses the values of y to automatically adjust
        weights inversely proportional to class frequencies in the input data
        as ``n_samples / (n_classes * np.bincount(y))``.

        Note that these weights will be multiplied with sample_weight (passed
        through the fit method) if sample_weight is specified.

        .. versionadded:: 0.17
           class_weight == 'balanced'

    n_jobs : int, default=None
        Number of CPU cores used during the cross-validation loop.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : int, default=0
        For the 'liblinear', 'sag' and 'lbfgs' solvers set verbose to any
        positive number for verbosity.

    refit : bool, default=True
        If set to True, the scores are averaged across all folds, and the
        coefs and the C that correspond to the best score are taken, and a
        final refit is done using these parameters.
        Otherwise the coefs, intercepts and C that correspond to the
        best scores across folds are averaged.

    intercept_scaling : float, default=1
        Useful only when the solver 'liblinear' is used
        and self.fit_intercept is set to True. In this case, x becomes
        [x, self.intercept_scaling],
        i.e. a "synthetic" feature with constant value equal to
        intercept_scaling is appended to the instance vector.
        The intercept becomes ``intercept_scaling * synthetic_feature_weight``.

        Note! the synthetic feature weight is subject to l1/l2 regularization
        as all other features.
        To lessen the effect of regularization on synthetic feature weight
        (and therefore on the intercept) intercept_scaling has to be increased.

    multi_class : {'auto', 'ovr', 'multinomial'}, default='auto'
        If the option chosen is 'ovr', then a binary problem is fit for each
        label. For 'multinomial' the loss minimised is the multinomial loss fit
        across the entire probability distribution, *even when the data is
        binary*. 'multinomial' is unavailable when solver='liblinear'.
        'auto' selects 'ovr' if the data is binary, or if solver='liblinear',
        and otherwise selects 'multinomial'.

        .. versionadded:: 0.18
           Stochastic Average Gradient descent solver for 'multinomial' case.
        .. versionchanged:: 0.22
            Default changed from 'ovr' to 'auto' in 0.22.

    random_state : int, RandomState instance, default=None
        Used when `solver='sag'`, 'saga' or 'liblinear' to shuffle the data.
        Note that this only applies to the solver and not the cross-validation
        generator. See :term:`Glossary <random_state>` for details.

    l1_ratios : list of float, default=None
        The list of Elastic-Net mixing parameters, with ``0 <= l1_ratio <= 1``.
        Only used if ``penalty='elasticnet'``. A value of 0 is equivalent to
        using ``penalty='l2'``, while 1 is equivalent to using
        ``penalty='l1'``. For ``0 < l1_ratio < 1``, the penalty is a combination
        of L1 and L2.

    Attributes
    ----------
    classes_ : ndarray of shape (n_classes, )
        A list of class labels known to the classifier.

    coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)
        Coefficient of the features in the decision function.

        `coef_` is of shape (1, n_features) when the given problem
        is binary.

    intercept_ : ndarray of shape (1,) or (n_classes,)
        Intercept (a.k.a. bias) added to the decision function.

        If `fit_intercept` is set to False, the intercept is set to zero.
        `intercept_` is of shape (1,) when the problem is binary.

    Cs_ : ndarray of shape (n_cs)
        Array of C i.e. inverse of regularization parameter values used
        for cross-validation.

    l1_ratios_ : ndarray of shape (n_l1_ratios)
        Array of l1_ratios used for cross-validation. If no l1_ratio is used
        (i.e. penalty is not 'elasticnet'), this is set to ``[None]``.

    coefs_paths_ : ndarray of shape (n_folds, n_cs, n_features) or \
                   (n_folds, n_cs, n_features + 1)
        dict with classes as the keys, and the path of coefficients obtained
        during cross-validating across each fold and then across each Cs
        after doing an OvR for the corresponding class as values.
        If the 'multi_class' option is set to 'multinomial', then
        the coefs_paths are the coefficients corresponding to each class.
        Each dict value has shape ``(n_folds, n_cs, n_features)`` or
        ``(n_folds, n_cs, n_features + 1)`` depending on whether the
        intercept is fit or not. If ``penalty='elasticnet'``, the shape is
        ``(n_folds, n_cs, n_l1_ratios_, n_features)`` or
        ``(n_folds, n_cs, n_l1_ratios_, n_features + 1)``.

    scores_ : dict
        dict with classes as the keys, and the values as the
        grid of scores obtained during cross-validating each fold, after doing
        an OvR for the corresponding class. If the 'multi_class' option
        given is 'multinomial' then the same scores are repeated across
        all classes, since this is the multinomial class. Each dict value
        has shape ``(n_folds, n_cs)`` or ``(n_folds, n_cs, n_l1_ratios)`` if
        ``penalty='elasticnet'``.

    C_ : ndarray of shape (n_classes,) or (n_classes - 1,)
        Array of C that maps to the best scores across every class. If refit is
        set to False, then for each class, the best C is the average of the
        C's that correspond to the best scores for each fold.
        `C_` is of shape (n_classes,) when the problem is binary.

    l1_ratio_ : ndarray of shape (n_classes,) or (n_classes - 1,)
        Array of l1_ratio that maps to the best scores across every class. If
        refit is set to False, then for each class, the best l1_ratio is the
        average of the l1_ratio's that correspond to the best scores for each
        fold. `l1_ratio_` is of shape (n_classes,) when the problem is binary.

    n_iter_ : ndarray of shape (n_classes, n_folds, n_cs) or (1, n_folds, n_cs)
        Actual number of iterations for all classes, folds and Cs.
        In the binary or multinomial cases, the first dimension is equal to 1.
        If ``penalty='elasticnet'``, the shape is ``(n_classes, n_folds,
        n_cs, n_l1_ratios)`` or ``(1, n_folds, n_cs, n_l1_ratios)``.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    LogisticRegression : Logistic regression without tuning the
        hyperparameter `C`.

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.linear_model import LogisticRegressionCV
    >>> X, y = load_iris(return_X_y=True)
    >>> clf = LogisticRegressionCV(cv=5, random_state=0).fit(X, y)
    >>> clf.predict(X[:2, :])
    array([0, 0])
    >>> clf.predict_proba(X[:2, :]).shape
    (2, 3)
    >>> clf.score(X, y)
    0.98...
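
    When `Cs` is an integer, the candidate values are spread on a log scale
    (here assumed to follow the documented 1e-4 to 1e4 range, i.e.
    ``np.logspace(-4, 4, Cs)``), and each per-class score grid has one row per
    fold and one column per candidate `C`:

    >>> import numpy as np
    >>> bool(np.allclose(clf.Cs_, np.logspace(-4, 4, 10)))
    True
    >>> clf.scores_[0].shape
    (5, 10)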
  1430. """
    _parameter_constraints: dict = {**LogisticRegression._parameter_constraints}
    for param in ["C", "warm_start", "l1_ratio"]:
        _parameter_constraints.pop(param)
    _parameter_constraints.update(
        {
            "Cs": [Interval(Integral, 1, None, closed="left"), "array-like"],
            "cv": ["cv_object"],
            "scoring": [StrOptions(set(get_scorer_names())), callable, None],
            "l1_ratios": ["array-like", None],
            "refit": ["boolean"],
            "penalty": [StrOptions({"l1", "l2", "elasticnet"})],
        }
    )

    def __init__(
        self,
        *,
        Cs=10,
        fit_intercept=True,
        cv=None,
        dual=False,
        penalty="l2",
        scoring=None,
        solver="lbfgs",
        tol=1e-4,
        max_iter=100,
        class_weight=None,
        n_jobs=None,
        verbose=0,
        refit=True,
        intercept_scaling=1.0,
        multi_class="auto",
        random_state=None,
        l1_ratios=None,
    ):
        self.Cs = Cs
        self.fit_intercept = fit_intercept
        self.cv = cv
        self.dual = dual
        self.penalty = penalty
        self.scoring = scoring
        self.tol = tol
        self.max_iter = max_iter
        self.class_weight = class_weight
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.solver = solver
        self.refit = refit
        self.intercept_scaling = intercept_scaling
        self.multi_class = multi_class
        self.random_state = random_state
        self.l1_ratios = l1_ratios

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            Target vector relative to X.

        sample_weight : array-like of shape (n_samples,), default=None
            Array of weights that are assigned to individual samples.
            If not provided, then each sample is given unit weight.

        Returns
        -------
        self : object
            Fitted LogisticRegressionCV estimator.
        """
        solver = _check_solver(self.solver, self.penalty, self.dual)

        if self.penalty == "elasticnet":
            if (
                self.l1_ratios is None
                or len(self.l1_ratios) == 0
                or any(
                    (
                        not isinstance(l1_ratio, numbers.Number)
                        or l1_ratio < 0
                        or l1_ratio > 1
                    )
                    for l1_ratio in self.l1_ratios
                )
            ):
                raise ValueError(
                    "l1_ratios must be a list of numbers between "
                    "0 and 1; got (l1_ratios=%r)"
                    % self.l1_ratios
                )
            l1_ratios_ = self.l1_ratios
        else:
            if self.l1_ratios is not None:
                warnings.warn(
                    "l1_ratios parameter is only used when penalty "
                    "is 'elasticnet'. Got (penalty={})".format(self.penalty)
                )
            l1_ratios_ = [None]

        X, y = self._validate_data(
            X,
            y,
            accept_sparse="csr",
            dtype=np.float64,
            order="C",
            accept_large_sparse=solver not in ["liblinear", "sag", "saga"],
        )
        check_classification_targets(y)

        class_weight = self.class_weight

        # Encode for string labels
        label_encoder = LabelEncoder().fit(y)
        y = label_encoder.transform(y)
        if isinstance(class_weight, dict):
            class_weight = {
                label_encoder.transform([cls])[0]: v for cls, v in class_weight.items()
            }

        # The original class labels
        classes = self.classes_ = label_encoder.classes_
        encoded_labels = label_encoder.transform(label_encoder.classes_)

        multi_class = _check_multi_class(self.multi_class, solver, len(classes))

        if solver in ["sag", "saga"]:
            max_squared_sum = row_norms(X, squared=True).max()
        else:
            max_squared_sum = None

        # init cross-validation generator
        cv = check_cv(self.cv, y, classifier=True)
        folds = list(cv.split(X, y))

        # Use the label encoded classes
        n_classes = len(encoded_labels)

        if n_classes < 2:
            raise ValueError(
                "This solver needs samples of at least 2 classes"
                " in the data, but the data contains only one"
                " class: %r"
                % classes[0]
            )

        if n_classes == 2:
            # OvR in case of binary problems is as good as fitting
            # the higher label
            n_classes = 1
            encoded_labels = encoded_labels[1:]
            classes = classes[1:]

        # We need this hack to iterate only once over labels, in the case of
        # multi_class = multinomial, without changing the value of the labels.
        if multi_class == "multinomial":
            iter_encoded_labels = iter_classes = [None]
        else:
            iter_encoded_labels = encoded_labels
            iter_classes = classes

        # compute the class weights for the entire dataset y
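        # For illustration (hypothetical toy labels, not from this dataset):
        # with y = [0, 0, 0, 1], "balanced" yields
        # n_samples / (n_classes * np.bincount(y)) = 4 / (2 * [3, 1])
        # = [0.667, 2.0], i.e. {0: 0.667, 1: 2.0} after the dict() below.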
        if class_weight == "balanced":
            class_weight = compute_class_weight(
                class_weight, classes=np.arange(len(self.classes_)), y=y
            )
            class_weight = dict(enumerate(class_weight))

        path_func = delayed(_log_reg_scoring_path)

        # The SAG solver releases the GIL so it's more efficient to use
        # threads for this solver.
        if self.solver in ["sag", "saga"]:
            prefer = "threads"
        else:
            prefer = "processes"

        fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)(
            path_func(
                X,
                y,
                train,
                test,
                pos_class=label,
                Cs=self.Cs,
                fit_intercept=self.fit_intercept,
                penalty=self.penalty,
                dual=self.dual,
                solver=solver,
                tol=self.tol,
                max_iter=self.max_iter,
                verbose=self.verbose,
                class_weight=class_weight,
                scoring=self.scoring,
                multi_class=multi_class,
                intercept_scaling=self.intercept_scaling,
                random_state=self.random_state,
                max_squared_sum=max_squared_sum,
                sample_weight=sample_weight,
                l1_ratio=l1_ratio,
            )
            for label in iter_encoded_labels
            for train, test in folds
            for l1_ratio in l1_ratios_
        )

        # _log_reg_scoring_path will output different shapes depending on the
        # multi_class param, so we need to reshape the outputs accordingly.
        # Cs is of shape (n_classes . n_folds . n_l1_ratios, n_Cs) and all the
        # rows are equal, so we just take the first one.
        # After reshaping,
        # - scores is of shape (n_classes, n_folds, n_Cs . n_l1_ratios)
        # - coefs_paths is of shape
        #    (n_classes, n_folds, n_Cs . n_l1_ratios, n_features)
        # - n_iter is of shape
        #    (n_classes, n_folds, n_Cs . n_l1_ratios) or
        #    (1, n_folds, n_Cs . n_l1_ratios)
        coefs_paths, Cs, scores, n_iter_ = zip(*fold_coefs_)
        self.Cs_ = Cs[0]

        if multi_class == "multinomial":
            coefs_paths = np.reshape(
                coefs_paths,
                (len(folds), len(l1_ratios_) * len(self.Cs_), n_classes, -1),
            )
            # equiv to coefs_paths = np.moveaxis(coefs_paths, (0, 1, 2, 3),
            #                                    (1, 2, 0, 3))
            coefs_paths = np.swapaxes(coefs_paths, 0, 1)
            coefs_paths = np.swapaxes(coefs_paths, 0, 2)
            self.n_iter_ = np.reshape(
                n_iter_, (1, len(folds), len(self.Cs_) * len(l1_ratios_))
            )
            # repeat same scores across all classes
            scores = np.tile(scores, (n_classes, 1, 1))
        else:
            coefs_paths = np.reshape(
                coefs_paths,
                (n_classes, len(folds), len(self.Cs_) * len(l1_ratios_), -1),
            )
            self.n_iter_ = np.reshape(
                n_iter_, (n_classes, len(folds), len(self.Cs_) * len(l1_ratios_))
            )
            scores = np.reshape(scores, (n_classes, len(folds), -1))

        self.scores_ = dict(zip(classes, scores))
        self.coefs_paths_ = dict(zip(classes, coefs_paths))

        self.C_ = list()
        self.l1_ratio_ = list()
        self.coef_ = np.empty((n_classes, X.shape[1]))
        self.intercept_ = np.zeros(n_classes)
        for index, (cls, encoded_label) in enumerate(
            zip(iter_classes, iter_encoded_labels)
        ):
            if multi_class == "ovr":
                scores = self.scores_[cls]
                coefs_paths = self.coefs_paths_[cls]
            else:
                # For multinomial, all scores are the same across classes
                scores = scores[0]
                # coefs_paths will keep its original shape because
                # logistic_regression_path expects it this way

            if self.refit:
                # best_index is between 0 and (n_Cs . n_l1_ratios - 1)
                # for example, with n_cs=2 and n_l1_ratios=3
                # the layout of scores is
                # [c1, c2, c1, c2, c1, c2]
                #   l1_1 ,  l1_2 ,  l1_3
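                # Illustration (assuming that layout): best_index = 3 selects
                # C_ = c2 since 3 % 2 == 1, and l1_ratio_ = l1_2 since
                # 3 // 2 == 1, matching the modulo / floor-division lookups
                # below.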
                best_index = scores.sum(axis=0).argmax()

                best_index_C = best_index % len(self.Cs_)
                C_ = self.Cs_[best_index_C]
                self.C_.append(C_)

                best_index_l1 = best_index // len(self.Cs_)
                l1_ratio_ = l1_ratios_[best_index_l1]
                self.l1_ratio_.append(l1_ratio_)

                if multi_class == "multinomial":
                    coef_init = np.mean(coefs_paths[:, :, best_index, :], axis=1)
                else:
                    coef_init = np.mean(coefs_paths[:, best_index, :], axis=0)

                # Note that y is label encoded and hence pos_class must be
                # the encoded label / None (for 'multinomial')
                w, _, _ = _logistic_regression_path(
                    X,
                    y,
                    pos_class=encoded_label,
                    Cs=[C_],
                    solver=solver,
                    fit_intercept=self.fit_intercept,
                    coef=coef_init,
                    max_iter=self.max_iter,
                    tol=self.tol,
                    penalty=self.penalty,
                    class_weight=class_weight,
                    multi_class=multi_class,
                    verbose=max(0, self.verbose - 1),
                    random_state=self.random_state,
                    check_input=False,
                    max_squared_sum=max_squared_sum,
                    sample_weight=sample_weight,
                    l1_ratio=l1_ratio_,
                )
                w = w[0]

            else:
                # Take the best scores across every fold and the average of
                # all coefficients corresponding to the best scores.
                best_indices = np.argmax(scores, axis=1)
                if multi_class == "ovr":
                    w = np.mean(
                        [coefs_paths[i, best_indices[i], :] for i in range(len(folds))],
                        axis=0,
                    )
                else:
                    w = np.mean(
                        [
                            coefs_paths[:, i, best_indices[i], :]
                            for i in range(len(folds))
                        ],
                        axis=0,
                    )

                best_indices_C = best_indices % len(self.Cs_)
                self.C_.append(np.mean(self.Cs_[best_indices_C]))

                if self.penalty == "elasticnet":
                    best_indices_l1 = best_indices // len(self.Cs_)
                    self.l1_ratio_.append(np.mean(l1_ratios_[best_indices_l1]))
                else:
                    self.l1_ratio_.append(None)

            if multi_class == "multinomial":
                self.C_ = np.tile(self.C_, n_classes)
                self.l1_ratio_ = np.tile(self.l1_ratio_, n_classes)
                self.coef_ = w[:, : X.shape[1]]
                if self.fit_intercept:
                    self.intercept_ = w[:, -1]
            else:
                self.coef_[index] = w[: X.shape[1]]
                if self.fit_intercept:
                    self.intercept_[index] = w[-1]

        self.C_ = np.asarray(self.C_)
        self.l1_ratio_ = np.asarray(self.l1_ratio_)
        self.l1_ratios_ = np.asarray(l1_ratios_)
        # if elasticnet was used, add the l1_ratios dimension to some
        # attributes
        if self.l1_ratios is not None:
            # with n_cs=2 and n_l1_ratios=3
            # the layout of scores is
            # [c1, c2, c1, c2, c1, c2]
            #   l1_1 ,  l1_2 ,  l1_3
            # To get a 2d array with the following layout
            #      l1_1, l1_2, l1_3
            # c1 [[ .  ,  .  ,  .  ],
            # c2  [ .  ,  .  ,  .  ]]
            # We need to first reshape and then transpose.
            # The same goes for the other arrays
            for cls, coefs_path in self.coefs_paths_.items():
                self.coefs_paths_[cls] = coefs_path.reshape(
                    (len(folds), self.l1_ratios_.size, self.Cs_.size, -1)
                )
                self.coefs_paths_[cls] = np.transpose(
                    self.coefs_paths_[cls], (0, 2, 1, 3)
                )
            for cls, score in self.scores_.items():
                self.scores_[cls] = score.reshape(
                    (len(folds), self.l1_ratios_.size, self.Cs_.size)
                )
                self.scores_[cls] = np.transpose(self.scores_[cls], (0, 2, 1))

            self.n_iter_ = self.n_iter_.reshape(
                (-1, len(folds), self.l1_ratios_.size, self.Cs_.size)
            )
            self.n_iter_ = np.transpose(self.n_iter_, (0, 1, 3, 2))

        return self

    def score(self, X, y, sample_weight=None):
        """Score using the `scoring` option on the given test data and labels.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,)
            True labels for X.

        sample_weight : array-like of shape (n_samples,), default=None
            Sample weights.

        Returns
        -------
        score : float
            Score of self.predict(X) w.r.t. y.
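
        Examples
        --------
        A minimal sketch: `score` honours the `scoring` option chosen at
        construction (the default is accuracy). Here a negated log-loss,
        which is never positive, is used for illustration:

        >>> from sklearn.datasets import load_iris
        >>> from sklearn.linear_model import LogisticRegressionCV
        >>> X, y = load_iris(return_X_y=True)
        >>> clf = LogisticRegressionCV(
        ...     cv=5, scoring="neg_log_loss", random_state=0
        ... ).fit(X, y)
        >>> bool(clf.score(X, y) <= 0.0)
        True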
  1792. """
  1793. scoring = self.scoring or "accuracy"
  1794. scoring = get_scorer(scoring)
  1795. return scoring(self, X, y, sample_weight=sample_weight)
    def _more_tags(self):
        return {
            "_xfail_checks": {
                "check_sample_weights_invariance": (
                    "zero sample_weight is not equivalent to removing samples"
                ),
            }
        }