_linear_loss.py

  1. """
  2. Loss functions for linear models with raw_prediction = X @ coef
  3. """
  4. import numpy as np
  5. from scipy import sparse
  6. from ..utils.extmath import squared_norm
  7. class LinearModelLoss:
  8. """General class for loss functions with raw_prediction = X @ coef + intercept.
  9. Note that raw_prediction is also known as linear predictor.
  10. The loss is the sum of per sample losses and includes a term for L2
  11. regularization::
  12. loss = sum_i s_i loss(y_i, X_i @ coef + intercept)
  13. + 1/2 * l2_reg_strength * ||coef||_2^2
  14. with sample weights s_i=1 if sample_weight=None.
  15. Gradient and hessian, for simplicity without intercept, are::
  16. gradient = X.T @ loss.gradient + l2_reg_strength * coef
  17. hessian = X.T @ diag(loss.hessian) @ X + l2_reg_strength * identity
  18. Conventions:
  19. if fit_intercept:
  20. n_dof = n_features + 1
  21. else:
  22. n_dof = n_features
  23. if base_loss.is_multiclass:
  24. coef.shape = (n_classes, n_dof) or ravelled (n_classes * n_dof,)
  25. else:
  26. coef.shape = (n_dof,)
  27. The intercept term is at the end of the coef array:
  28. if base_loss.is_multiclass:
  29. if coef.shape (n_classes, n_dof):
  30. intercept = coef[:, -1]
  31. if coef.shape (n_classes * n_dof,)
  32. intercept = coef[n_features::n_dof] = coef[(n_dof-1)::n_dof]
  33. intercept.shape = (n_classes,)
  34. else:
  35. intercept = coef[-1]
  36. Note: If coef has shape (n_classes * n_dof,), the 2d-array can be reconstructed as
  37. coef.reshape((n_classes, -1), order="F")
  38. The option order="F" makes coef[:, i] contiguous. This, in turn, makes the
  39. coefficients without intercept, coef[:, :-1], contiguous and speeds up
  40. matrix-vector computations.
  41. Note: If the average loss per sample is wanted instead of the sum of the loss per
  42. sample, one can simply use a rescaled sample_weight such that
  43. sum(sample_weight) = 1.
  44. Parameters
  45. ----------
  46. base_loss : instance of class BaseLoss from sklearn._loss.
  47. fit_intercept : bool
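
    For illustration, a small sketch (plain NumPy) of the order="F" convention
    described above, with n_classes=2 and n_dof=3:

        >>> import numpy as np
        >>> coef = np.arange(6.0)  # raveled coef of shape (n_classes * n_dof,)
        >>> coef.reshape((2, -1), order="F")
        array([[0., 2., 4.],
               [1., 3., 5.]])

    Each column of the reconstructed 2d-array gathers the n_classes entries that
    belong to one degree of freedom; the intercepts, if any, form the last column.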
  48. """
  49. def __init__(self, base_loss, fit_intercept):
  50. self.base_loss = base_loss
  51. self.fit_intercept = fit_intercept
  52. def init_zero_coef(self, X, dtype=None):
  53. """Allocate coef of correct shape with zeros.
  54. Parameters:
  55. -----------
  56. X : {array-like, sparse matrix} of shape (n_samples, n_features)
  57. Training data.
  58. dtype : data-type, default=None
  59. Overrides the data type of coef. With dtype=None, coef will have the same
  60. dtype as X.
  61. Returns
  62. -------
  63. coef : ndarray of shape (n_dof,) or (n_classes, n_dof)
  64. Coefficients of a linear model.
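
        For illustration only, a minimal sketch of the multiclass case (it assumes
        an (n_samples, n_features) array X and uses HalfMultinomialLoss, an
        internal helper from sklearn._loss that may change)::

            from sklearn._loss.loss import HalfMultinomialLoss

            lml = LinearModelLoss(
                base_loss=HalfMultinomialLoss(n_classes=3), fit_intercept=True
            )
            coef = lml.init_zero_coef(X)  # shape (3, n_features + 1), F-ordered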
  65. """
  66. n_features = X.shape[1]
  67. n_classes = self.base_loss.n_classes
  68. if self.fit_intercept:
  69. n_dof = n_features + 1
  70. else:
  71. n_dof = n_features
  72. if self.base_loss.is_multiclass:
  73. coef = np.zeros_like(X, shape=(n_classes, n_dof), dtype=dtype, order="F")
  74. else:
  75. coef = np.zeros_like(X, shape=n_dof, dtype=dtype)
  76. return coef
  77. def weight_intercept(self, coef):
  78. """Helper function to get coefficients and intercept.
  79. Parameters
  80. ----------
  81. coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
  82. Coefficients of a linear model.
  83. If shape (n_classes * n_dof,), the classes of one feature are contiguous,
  84. i.e. one reconstructs the 2d-array via
  85. coef.reshape((n_classes, -1), order="F").
  86. Returns
  87. -------
  88. weights : ndarray of shape (n_features,) or (n_classes, n_features)
  89. Coefficients without intercept term.
  90. intercept : float or ndarray of shape (n_classes,)
  91. Intercept terms.
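
        For illustration, a minimal sketch of the single-output case (it uses
        HalfBinomialLoss, an internal helper from sklearn._loss)::

            from sklearn._loss.loss import HalfBinomialLoss

            lml = LinearModelLoss(base_loss=HalfBinomialLoss(), fit_intercept=True)
            weights, intercept = lml.weight_intercept(np.array([1.0, 2.0, 0.5]))
            # weights = array([1., 2.]), intercept = 0.5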
  92. """
  93. if not self.base_loss.is_multiclass:
  94. if self.fit_intercept:
  95. intercept = coef[-1]
  96. weights = coef[:-1]
  97. else:
  98. intercept = 0.0
  99. weights = coef
  100. else:
  101. # reshape to (n_classes, n_dof)
  102. if coef.ndim == 1:
  103. weights = coef.reshape((self.base_loss.n_classes, -1), order="F")
  104. else:
  105. weights = coef
  106. if self.fit_intercept:
  107. intercept = weights[:, -1]
  108. weights = weights[:, :-1]
  109. else:
  110. intercept = 0.0
  111. return weights, intercept
  112. def weight_intercept_raw(self, coef, X):
  113. """Helper function to get coefficients, intercept and raw_prediction.
  114. Parameters
  115. ----------
  116. coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
  117. Coefficients of a linear model.
  118. If shape (n_classes * n_dof,), the classes of one feature are contiguous,
  119. i.e. one reconstructs the 2d-array via
  120. coef.reshape((n_classes, -1), order="F").
  121. X : {array-like, sparse matrix} of shape (n_samples, n_features)
  122. Training data.
  123. Returns
  124. -------
  125. weights : ndarray of shape (n_features,) or (n_classes, n_features)
  126. Coefficients without intercept term.
  127. intercept : float or ndarray of shape (n_classes,)
  128. Intercept terms.
  129. raw_prediction : ndarray of shape (n_samples,) or \
  130. (n_samples, n_classes)
  131. """
  132. weights, intercept = self.weight_intercept(coef)
  133. if not self.base_loss.is_multiclass:
  134. raw_prediction = X @ weights + intercept
  135. else:
  136. # weights has shape (n_classes, n_dof)
  137. raw_prediction = X @ weights.T + intercept # ndarray, likely C-contiguous
  138. return weights, intercept, raw_prediction
  139. def l2_penalty(self, weights, l2_reg_strength):
  140. """Compute L2 penalty term l2_reg_strength/2 *||w||_2^2."""
  141. norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  142. return 0.5 * l2_reg_strength * norm2_w
  143. def loss(
  144. self,
  145. coef,
  146. X,
  147. y,
  148. sample_weight=None,
  149. l2_reg_strength=0.0,
  150. n_threads=1,
  151. raw_prediction=None,
  152. ):
  153. """Compute the loss as sum over point-wise losses.
  154. Parameters
  155. ----------
  156. coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
  157. Coefficients of a linear model.
  158. If shape (n_classes * n_dof,), the classes of one feature are contiguous,
  159. i.e. one reconstructs the 2d-array via
  160. coef.reshape((n_classes, -1), order="F").
  161. X : {array-like, sparse matrix} of shape (n_samples, n_features)
  162. Training data.
  163. y : contiguous array of shape (n_samples,)
  164. Observed, true target values.
  165. sample_weight : None or contiguous array of shape (n_samples,), default=None
  166. Sample weights.
  167. l2_reg_strength : float, default=0.0
  168. L2 regularization strength
  169. n_threads : int, default=1
  170. Number of OpenMP threads to use.
  171. raw_prediction : C-contiguous array of shape (n_samples,) or array of \
  172. shape (n_samples, n_classes)
  173. Raw prediction values (in link space). If provided, these are used. If
  174. None, then raw_prediction = X @ coef + intercept is calculated.
  175. Returns
  176. -------
  177. loss : float
  178. Sum of losses per sample plus penalty.
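
        For illustration, a minimal sketch (it assumes float64, C-contiguous X, y
        and coef of matching shapes, and uses HalfBinomialLoss, an internal helper
        from sklearn._loss)::

            from sklearn._loss.loss import HalfBinomialLoss

            lml = LinearModelLoss(base_loss=HalfBinomialLoss(), fit_intercept=True)
            # X.shape = (n_samples, n_features), y in {0, 1},
            # coef.shape = (n_features + 1,) with the intercept as last entry
            total_loss = lml.loss(coef, X, y, l2_reg_strength=1.0)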
  179. """
  180. if raw_prediction is None:
  181. weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X)
  182. else:
  183. weights, intercept = self.weight_intercept(coef)
  184. loss = self.base_loss.loss(
  185. y_true=y,
  186. raw_prediction=raw_prediction,
  187. sample_weight=sample_weight,
  188. n_threads=n_threads,
  189. )
  190. loss = loss.sum()
  191. return loss + self.l2_penalty(weights, l2_reg_strength)
  192. def loss_gradient(
  193. self,
  194. coef,
  195. X,
  196. y,
  197. sample_weight=None,
  198. l2_reg_strength=0.0,
  199. n_threads=1,
  200. raw_prediction=None,
  201. ):
  202. """Computes the sum of loss and gradient w.r.t. coef.
  203. Parameters
  204. ----------
  205. coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
  206. Coefficients of a linear model.
  207. If shape (n_classes * n_dof,), the classes of one feature are contiguous,
  208. i.e. one reconstructs the 2d-array via
  209. coef.reshape((n_classes, -1), order="F").
  210. X : {array-like, sparse matrix} of shape (n_samples, n_features)
  211. Training data.
  212. y : contiguous array of shape (n_samples,)
  213. Observed, true target values.
  214. sample_weight : None or contiguous array of shape (n_samples,), default=None
  215. Sample weights.
  216. l2_reg_strength : float, default=0.0
  217. L2 regularization strength
  218. n_threads : int, default=1
  219. Number of OpenMP threads to use.
  220. raw_prediction : C-contiguous array of shape (n_samples,) or array of \
  221. shape (n_samples, n_classes)
  222. Raw prediction values (in link space). If provided, these are used. If
  223. None, then raw_prediction = X @ coef + intercept is calculated.
  224. Returns
  225. -------
  226. loss : float
  227. Sum of losses per sample plus penalty.
  228. gradient : ndarray of shape coef.shape
  229. The gradient of the loss.
  230. """
  231. n_features, n_classes = X.shape[1], self.base_loss.n_classes
  232. n_dof = n_features + int(self.fit_intercept)
  233. if raw_prediction is None:
  234. weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X)
  235. else:
  236. weights, intercept = self.weight_intercept(coef)
  237. loss, grad_pointwise = self.base_loss.loss_gradient(
  238. y_true=y,
  239. raw_prediction=raw_prediction,
  240. sample_weight=sample_weight,
  241. n_threads=n_threads,
  242. )
  243. loss = loss.sum()
  244. loss += self.l2_penalty(weights, l2_reg_strength)
  245. if not self.base_loss.is_multiclass:
  246. grad = np.empty_like(coef, dtype=weights.dtype)
  247. grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  248. if self.fit_intercept:
  249. grad[-1] = grad_pointwise.sum()
  250. else:
  251. grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
  252. # grad_pointwise.shape = (n_samples, n_classes)
  253. grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
  254. if self.fit_intercept:
  255. grad[:, -1] = grad_pointwise.sum(axis=0)
  256. if coef.ndim == 1:
  257. grad = grad.ravel(order="F")
  258. return loss, grad
    def gradient(
        self,
        coef,
        X,
        y,
        sample_weight=None,
        l2_reg_strength=0.0,
        n_threads=1,
        raw_prediction=None,
    ):
        """Computes the gradient w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        gradient : ndarray of shape coef.shape
            The gradient of the loss.
        """
        n_features, n_classes = X.shape[1], self.base_loss.n_classes
        n_dof = n_features + int(self.fit_intercept)

        if raw_prediction is None:
            weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X)
        else:
            weights, intercept = self.weight_intercept(coef)

        grad_pointwise = self.base_loss.gradient(
            y_true=y,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            n_threads=n_threads,
        )

        if not self.base_loss.is_multiclass:
            grad = np.empty_like(coef, dtype=weights.dtype)
            grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
            if self.fit_intercept:
                grad[-1] = grad_pointwise.sum()
            return grad
        else:
            grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
            # grad_pointwise.shape = (n_samples, n_classes)
            grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
            if self.fit_intercept:
                grad[:, -1] = grad_pointwise.sum(axis=0)
            if coef.ndim == 1:
                return grad.ravel(order="F")
            else:
                return grad
    def gradient_hessian(
        self,
        coef,
        X,
        y,
        sample_weight=None,
        l2_reg_strength=0.0,
        n_threads=1,
        gradient_out=None,
        hessian_out=None,
        raw_prediction=None,
    ):
        """Computes gradient and hessian w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.
        gradient_out : None or ndarray of shape coef.shape
            A location into which the gradient is stored. If None, a new array
            might be created.
        hessian_out : None or ndarray
            A location into which the hessian is stored. If None, a new array
            might be created.
        raw_prediction : C-contiguous array of shape (n_samples,) or array of \
            shape (n_samples, n_classes)
            Raw prediction values (in link space). If provided, these are used. If
            None, then raw_prediction = X @ coef + intercept is calculated.

        Returns
        -------
        gradient : ndarray of shape coef.shape
            The gradient of the loss.
        hessian : ndarray
            Hessian matrix.
        hessian_warning : bool
            True if pointwise hessian has more than 25% of its elements non-positive.
        """
        n_samples, n_features = X.shape
        n_dof = n_features + int(self.fit_intercept)

        if raw_prediction is None:
            weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X)
        else:
            weights, intercept = self.weight_intercept(coef)

        grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian(
            y_true=y,
            raw_prediction=raw_prediction,
            sample_weight=sample_weight,
            n_threads=n_threads,
        )

        # For non-canonical link functions and far away from the optimum, the pointwise
        # hessian can be negative. We take care that 75% of the hessian entries are
        # positive.
        hessian_warning = np.mean(hess_pointwise <= 0) > 0.25
        hess_pointwise = np.abs(hess_pointwise)

        if not self.base_loss.is_multiclass:
            # gradient
            if gradient_out is None:
                grad = np.empty_like(coef, dtype=weights.dtype)
            else:
                grad = gradient_out
            grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
            if self.fit_intercept:
                grad[-1] = grad_pointwise.sum()

            # hessian
            if hessian_out is None:
                hess = np.empty(shape=(n_dof, n_dof), dtype=weights.dtype)
            else:
                hess = hessian_out

            if hessian_warning:
                # Exit early without computing the hessian.
                return grad, hess, hessian_warning

            # TODO: This "sandwich product", X' diag(W) X, is the main computational
            # bottleneck for solvers. A dedicated Cython routine might improve it
            # exploiting the symmetry (as opposed to, e.g., BLAS gemm).
            if sparse.issparse(X):
                hess[:n_features, :n_features] = (
                    X.T
                    @ sparse.dia_matrix(
                        (hess_pointwise, 0), shape=(n_samples, n_samples)
                    )
                    @ X
                ).toarray()
            else:
                # np.einsum may use less memory but the following, using BLAS matrix
                # multiplication (gemm), is by far faster.
                WX = hess_pointwise[:, None] * X
                hess[:n_features, :n_features] = np.dot(X.T, WX)

            if l2_reg_strength > 0:
                # The L2 penalty enters the Hessian on the diagonal only. To add those
                # terms, we use a flattened view on the array.
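                # Illustrative sketch with hypothetical sizes: for n_features=2 and
                # n_dof=3 (fit_intercept=True), the feature-block diagonal entries
                # hess[0, 0] and hess[1, 1] sit at flat positions 0 and 4, which is
                # exactly what the strided slice [:6:4] below selects; the intercept
                # row/column is left unpenalized.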
                hess.reshape(-1)[
                    : (n_features * n_dof) : (n_dof + 1)
                ] += l2_reg_strength

            if self.fit_intercept:
                # With intercept included as added column to X, the hessian becomes
                # hess = (X, 1)' @ diag(h) @ (X, 1)
                #      = (X' @ diag(h) @ X,    X' @ h)
                #        (      h @ X,         sum(h))
                # The left upper part has already been filled, it remains to compute
                # the last row and the last column.
                Xh = X.T @ hess_pointwise
                hess[:-1, -1] = Xh
                hess[-1, :-1] = Xh
                hess[-1, -1] = hess_pointwise.sum()
        else:
            # Here we may safely assume HalfMultinomialLoss aka categorical
            # cross-entropy.
            raise NotImplementedError

        return grad, hess, hessian_warning
    def gradient_hessian_product(
        self, coef, X, y, sample_weight=None, l2_reg_strength=0.0, n_threads=1
    ):
        """Computes gradient and hessp (hessian product function) w.r.t. coef.

        Parameters
        ----------
        coef : ndarray of shape (n_dof,), (n_classes, n_dof) or (n_classes * n_dof,)
            Coefficients of a linear model.
            If shape (n_classes * n_dof,), the classes of one feature are contiguous,
            i.e. one reconstructs the 2d-array via
            coef.reshape((n_classes, -1), order="F").
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data.
        y : contiguous array of shape (n_samples,)
            Observed, true target values.
        sample_weight : None or contiguous array of shape (n_samples,), default=None
            Sample weights.
        l2_reg_strength : float, default=0.0
            L2 regularization strength
        n_threads : int, default=1
            Number of OpenMP threads to use.

        Returns
        -------
        gradient : ndarray of shape coef.shape
            The gradient of the loss.
        hessp : callable
            Function that takes in a vector input of shape of gradient and
            returns matrix-vector product with hessian.
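
        For illustration, a minimal sketch of how hessp is typically consumed (for
        instance, scipy.optimize.minimize with method="Newton-CG" accepts such a
        callable; linear_loss, coef, X, y and v are assumed to be defined)::

            grad, hessp = linear_loss.gradient_hessian_product(
                coef, X, y, l2_reg_strength=1.0
            )
            hess_times_v = hessp(v)  # v has the same shape as coef and grad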
  473. """
  474. (n_samples, n_features), n_classes = X.shape, self.base_loss.n_classes
  475. n_dof = n_features + int(self.fit_intercept)
  476. weights, intercept, raw_prediction = self.weight_intercept_raw(coef, X)
  477. if not self.base_loss.is_multiclass:
  478. grad_pointwise, hess_pointwise = self.base_loss.gradient_hessian(
  479. y_true=y,
  480. raw_prediction=raw_prediction,
  481. sample_weight=sample_weight,
  482. n_threads=n_threads,
  483. )
  484. grad = np.empty_like(coef, dtype=weights.dtype)
  485. grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  486. if self.fit_intercept:
  487. grad[-1] = grad_pointwise.sum()
  488. # Precompute as much as possible: hX, hX_sum and hessian_sum
  489. hessian_sum = hess_pointwise.sum()
  490. if sparse.issparse(X):
  491. hX = (
  492. sparse.dia_matrix((hess_pointwise, 0), shape=(n_samples, n_samples))
  493. @ X
  494. )
  495. else:
  496. hX = hess_pointwise[:, np.newaxis] * X
  497. if self.fit_intercept:
  498. # Calculate the double derivative with respect to intercept.
  499. # Note: In case hX is sparse, hX.sum is a matrix object.
  500. hX_sum = np.squeeze(np.asarray(hX.sum(axis=0)))
  501. # prevent squeezing to zero-dim array if n_features == 1
  502. hX_sum = np.atleast_1d(hX_sum)
            # With intercept included and l2_reg_strength = 0, hessp returns
            # res = (X, 1)' @ diag(h) @ (X, 1) @ s
            #     = (X, 1)' @ (hX @ s[:n_features] + h * s[-1])
            # res[:n_features] = X' @ hX @ s[:n_features] + hX_sum * s[-1]
            # res[-1] = hX_sum @ s[:n_features] + sum(h) * s[-1]
            def hessp(s):
                ret = np.empty_like(s)
                if sparse.issparse(X):
                    ret[:n_features] = X.T @ (hX @ s[:n_features])
                else:
                    ret[:n_features] = np.linalg.multi_dot([X.T, hX, s[:n_features]])
                ret[:n_features] += l2_reg_strength * s[:n_features]

                if self.fit_intercept:
                    ret[:n_features] += s[-1] * hX_sum
                    ret[-1] = hX_sum @ s[:n_features] + hessian_sum * s[-1]
                return ret

        else:
            # Here we may safely assume HalfMultinomialLoss aka categorical
            # cross-entropy.
            # HalfMultinomialLoss computes only the diagonal part of the hessian, i.e.
            # diagonal in the classes. Here, we want the matrix-vector product of the
            # full hessian. Therefore, we call gradient_proba.
            grad_pointwise, proba = self.base_loss.gradient_proba(
                y_true=y,
                raw_prediction=raw_prediction,
                sample_weight=sample_weight,
                n_threads=n_threads,
            )
            grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
            grad[:, :n_features] = grad_pointwise.T @ X + l2_reg_strength * weights
            if self.fit_intercept:
                grad[:, -1] = grad_pointwise.sum(axis=0)

            # Full hessian-vector product, i.e. not only the diagonal part of the
            # hessian. Derivation with some index battle for input vector s:
            #   - sample index i
            #   - feature indices j, m
            #   - class indices k, l
            #   - 1_{k=l} is one if k=l else 0
            #   - p_i_k is the (predicted) probability that sample i belongs to class k
            #     for all i: sum_k p_i_k = 1
            #   - s_l_m is input vector for class l and feature m
            #   - X' = X transposed
            #
            # Note: Hessian with dropping most indices is just:
            # X' @ p_k (1(k=l) - p_l) @ X
            #
            # result_{k j} = sum_{i, l, m} Hessian_{i, k j, m l} * s_l_m
            #    = sum_{i, l, m} (X')_{ji} * p_i_k * (1_{k=l} - p_i_l)
            #                    * X_{im} s_l_m
            #    = sum_{i, m} (X')_{ji} * p_i_k
            #                 * (X_{im} * s_k_m - sum_l p_i_l * X_{im} * s_l_m)
            #
            # See also https://github.com/scikit-learn/scikit-learn/pull/3646#discussion_r17461411  # noqa
            def hessp(s):
                s = s.reshape((n_classes, -1), order="F")  # shape = (n_classes, n_dof)
                if self.fit_intercept:
                    s_intercept = s[:, -1]
                    s = s[:, :-1]  # shape = (n_classes, n_features)
                else:
                    s_intercept = 0
                tmp = X @ s.T + s_intercept  # X_{im} * s_k_m
                tmp += (-proba * tmp).sum(axis=1)[:, np.newaxis]  # - sum_l ..
                tmp *= proba  # * p_i_k
                if sample_weight is not None:
                    tmp *= sample_weight[:, np.newaxis]

                # hess_prod = empty_like(grad), but we ravel grad below and this
                # function is run after that.
                hess_prod = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")
                hess_prod[:, :n_features] = tmp.T @ X + l2_reg_strength * s
                if self.fit_intercept:
                    hess_prod[:, -1] = tmp.sum(axis=0)
                if coef.ndim == 1:
                    return hess_prod.ravel(order="F")
                else:
                    return hess_prod

        if coef.ndim == 1:
            return grad.ravel(order="F"), hessp

        return grad, hessp