  1. """
  2. Tests for LinearModelLoss
  3. Note that correctness of losses (which compose LinearModelLoss) is already well
  4. covered in the _loss module.
  5. """
  6. import numpy as np
  7. import pytest
  8. from numpy.testing import assert_allclose
  9. from scipy import linalg, optimize, sparse
  10. from sklearn._loss.loss import (
  11. HalfBinomialLoss,
  12. HalfMultinomialLoss,
  13. HalfPoissonLoss,
  14. )
  15. from sklearn.datasets import make_low_rank_matrix
  16. from sklearn.linear_model._linear_loss import LinearModelLoss
  17. from sklearn.utils.extmath import squared_norm
  18. # We do not need to test all losses, just what LinearModelLoss does on top of the
  19. # base losses.
  20. LOSSES = [HalfBinomialLoss, HalfMultinomialLoss, HalfPoissonLoss]
  21. def random_X_y_coef(
  22. linear_model_loss, n_samples, n_features, coef_bound=(-2, 2), seed=42
  23. ):
  24. """Random generate y, X and coef in valid range."""
    rng = np.random.RandomState(seed)
    n_dof = n_features + linear_model_loss.fit_intercept
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        random_state=rng,
    )
    coef = linear_model_loss.init_zero_coef(X)

    if linear_model_loss.base_loss.is_multiclass:
        n_classes = linear_model_loss.base_loss.n_classes
        coef.flat[:] = rng.uniform(
            low=coef_bound[0],
            high=coef_bound[1],
            size=n_classes * n_dof,
        )
        if linear_model_loss.fit_intercept:
            raw_prediction = X @ coef[:, :-1].T + coef[:, -1]
        else:
            raw_prediction = X @ coef.T
        proba = linear_model_loss.base_loss.link.inverse(raw_prediction)

        # y = rng.choice(np.arange(n_classes), p=proba) does not work.
        # See https://stackoverflow.com/a/34190035/16761084
        def choice_vectorized(items, p):
            s = p.cumsum(axis=1)
            r = rng.rand(p.shape[0])[:, None]
            k = (s < r).sum(axis=1)
            return items[k]
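
        # A hand-worked example of the cumsum trick above (values chosen only for
        # this comment): with p = [[0.2, 0.8], [0.9, 0.1]] and r = [[0.5], [0.95]],
        # s = p.cumsum(axis=1) = [[0.2, 1.0], [0.9, 1.0]], hence
        # (s < r).sum(axis=1) = [1, 1], i.e. class 1 is drawn for both rows.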
        y = choice_vectorized(np.arange(n_classes), p=proba).astype(np.float64)
    else:
        coef.flat[:] = rng.uniform(
            low=coef_bound[0],
            high=coef_bound[1],
            size=n_dof,
        )
        if linear_model_loss.fit_intercept:
            raw_prediction = X @ coef[:-1] + coef[-1]
        else:
            raw_prediction = X @ coef
        y = linear_model_loss.base_loss.link.inverse(
            raw_prediction + rng.uniform(low=-1, high=1, size=n_samples)
        )

    return X, y, coef
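

# Note on the helper above: for multiclass base losses it returns coef with shape
# (n_classes, n_features + fit_intercept) in F-contiguous layout (the convention
# asserted in test_init_zero_coef below); otherwise coef is a 1d array of shape
# (n_features + fit_intercept,).

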
@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("n_features", [0, 1, 10])
@pytest.mark.parametrize("dtype", [None, np.float32, np.float64, np.int64])
def test_init_zero_coef(base_loss, fit_intercept, n_features, dtype):
    """Test that init_zero_coef initializes coef correctly."""
    loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept)
    rng = np.random.RandomState(42)
    X = rng.normal(size=(5, n_features))
    coef = loss.init_zero_coef(X, dtype=dtype)
    if loss.base_loss.is_multiclass:
        n_classes = loss.base_loss.n_classes
        assert coef.shape == (n_classes, n_features + fit_intercept)
        assert coef.flags["F_CONTIGUOUS"]
    else:
        assert coef.shape == (n_features + fit_intercept,)
    if dtype is None:
        assert coef.dtype == X.dtype
    else:
        assert coef.dtype == dtype
    assert np.count_nonzero(coef) == 0


@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("sample_weight", [None, "range"])
@pytest.mark.parametrize("l2_reg_strength", [0, 1])
def test_loss_grad_hess_are_the_same(
    base_loss, fit_intercept, sample_weight, l2_reg_strength
):
    """Test that loss and gradient are the same across different functions."""
    loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept)
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=10, n_features=5, seed=42
    )
    if sample_weight == "range":
        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])
    l1 = loss.loss(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    g1 = loss.gradient(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    l2, g2 = loss.loss_gradient(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    g3, h3 = loss.gradient_hessian_product(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    if not base_loss.is_multiclass:
        g4, h4, _ = loss.gradient_hessian(
            coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
        )
    else:
        with pytest.raises(NotImplementedError):
            loss.gradient_hessian(
                coef,
                X,
                y,
                sample_weight=sample_weight,
                l2_reg_strength=l2_reg_strength,
            )
    assert_allclose(l1, l2)
    assert_allclose(g1, g2)
    assert_allclose(g1, g3)
    if not base_loss.is_multiclass:
        assert_allclose(g1, g4)
        assert_allclose(h4 @ g4, h3(g3))

    # same for sparse X
    X = sparse.csr_matrix(X)
    l1_sp = loss.loss(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    g1_sp = loss.gradient(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    l2_sp, g2_sp = loss.loss_gradient(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    g3_sp, h3_sp = loss.gradient_hessian_product(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    if not base_loss.is_multiclass:
        g4_sp, h4_sp, _ = loss.gradient_hessian(
            coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
        )
    assert_allclose(l1, l1_sp)
    assert_allclose(l1, l2_sp)
    assert_allclose(g1, g1_sp)
    assert_allclose(g1, g2_sp)
    assert_allclose(g1, g3_sp)
    assert_allclose(h3(g1), h3_sp(g1_sp))
    if not base_loss.is_multiclass:
        assert_allclose(g1, g4_sp)
        assert_allclose(h4 @ g4, h4_sp @ g1_sp)


@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("sample_weight", [None, "range"])
@pytest.mark.parametrize("l2_reg_strength", [0, 1])
@pytest.mark.parametrize("X_sparse", [False, True])
def test_loss_gradients_hessp_intercept(
    base_loss, sample_weight, l2_reg_strength, X_sparse
):
    """Test that loss and gradient handle intercept correctly."""
    loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=False)
    loss_inter = LinearModelLoss(base_loss=base_loss(), fit_intercept=True)
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
    )
    X[:, -1] = 1  # make last column all ones to mimic the intercept term
    X_inter = X[
        :, :-1
    ]  # exclude intercept column as it is added automatically by loss_inter
    if X_sparse:
        X = sparse.csr_matrix(X)
    if sample_weight == "range":
        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])
    l, g = loss.loss_gradient(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    _, hessp = loss.gradient_hessian_product(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    l_inter, g_inter = loss_inter.loss_gradient(
        coef, X_inter, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    _, hessp_inter = loss_inter.gradient_hessian_product(
        coef, X_inter, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    # Note that the intercept gets no L2 penalty.
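    # In `loss` (fit_intercept=False) the artificial column of ones is an ordinary,
    # penalized feature, so its coefficient contributes
    # 0.5 * l2_reg_strength * squared_norm(coef.T[-1]) to the objective; we add that
    # term back to l_inter before comparing.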
    assert l == pytest.approx(
        l_inter + 0.5 * l2_reg_strength * squared_norm(coef.T[-1])
    )
    g_inter_corrected = g_inter
    g_inter_corrected.T[-1] += l2_reg_strength * coef.T[-1]
    assert_allclose(g, g_inter_corrected)
    s = np.random.RandomState(42).randn(*coef.shape)
    h = hessp(s)
    h_inter = hessp_inter(s)
    h_inter_corrected = h_inter
    h_inter_corrected.T[-1] += l2_reg_strength * s.T[-1]
    assert_allclose(h, h_inter_corrected)


@pytest.mark.parametrize("base_loss", LOSSES)
@pytest.mark.parametrize("fit_intercept", [False, True])
@pytest.mark.parametrize("sample_weight", [None, "range"])
@pytest.mark.parametrize("l2_reg_strength", [0, 1])
def test_gradients_hessians_numerically(
    base_loss, fit_intercept, sample_weight, l2_reg_strength
):
    """Test gradients and hessians with numerical derivatives.

    Gradient should equal the numerical derivatives of the loss function.
    Hessians should equal the numerical derivatives of gradients.
    """
    loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept)
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
    )
    coef = coef.ravel(order="F")  # this is important only for multinomial loss
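    # (Only the multinomial loss yields a 2d coef here; raveling it in F-order gives
    # the 1d layout that LinearModelLoss also accepts and that optimize.approx_fprime
    # below requires.)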
    if sample_weight == "range":
        sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])

    # 1. Check gradients numerically
    eps = 1e-6
    g, hessp = loss.gradient_hessian_product(
        coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
    )
    # Use a trick to get central finite difference of accuracy 4 (five-point stencil)
    # https://en.wikipedia.org/wiki/Numerical_differentiation
    # https://en.wikipedia.org/wiki/Finite_difference_coefficient
    # approx_g1 = (f(x + eps) - f(x - eps)) / (2*eps)
    approx_g1 = optimize.approx_fprime(
        coef,
        lambda coef: loss.loss(
            coef - eps,
            X,
            y,
            sample_weight=sample_weight,
            l2_reg_strength=l2_reg_strength,
        ),
        2 * eps,
    )
    # approx_g2 = (f(x + 2*eps) - f(x - 2*eps)) / (4*eps)
    approx_g2 = optimize.approx_fprime(
        coef,
        lambda coef: loss.loss(
            coef - 2 * eps,
            X,
            y,
            sample_weight=sample_weight,
            l2_reg_strength=l2_reg_strength,
        ),
        4 * eps,
    )
    # Five-point stencil approximation
    # See: https://en.wikipedia.org/wiki/Five-point_stencil#1D_first_derivative
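    # Combining the two central differences cancels the O(eps**2) error term:
    #   (4 * approx_g1 - approx_g2) / 3
    #   = (-f(x + 2*eps) + 8*f(x + eps) - 8*f(x - eps) + f(x - 2*eps)) / (12*eps)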
    approx_g = (4 * approx_g1 - approx_g2) / 3
    assert_allclose(g, approx_g, rtol=1e-2, atol=1e-8)

    # 2. Check hessp numerically along the second direction of the gradient
    vector = np.zeros_like(g)
    vector[1] = 1
    hess_col = hessp(vector)
    # Computation of the Hessian is particularly fragile to numerical errors when doing
    # simple finite differences. Here we compute the gradient along a path in the
    # direction of the vector and then use a least-squares regression to estimate the
    # slope.
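    # The first-order expansion gradient(coef + t * vector) ~ gradient(coef)
    # + t * (H @ vector) means that the slope of the centered gradients w.r.t. d_x,
    # estimated by least squares below, approximates H @ vector, i.e. the Hessian
    # column selected by `vector`.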
    eps = 1e-3
    d_x = np.linspace(-eps, eps, 30)
    d_grad = np.array(
        [
            loss.gradient(
                coef + t * vector,
                X,
                y,
                sample_weight=sample_weight,
                l2_reg_strength=l2_reg_strength,
            )
            for t in d_x
        ]
    )
    d_grad -= d_grad.mean(axis=0)
    approx_hess_col = linalg.lstsq(d_x[:, np.newaxis], d_grad)[0].ravel()
    assert_allclose(approx_hess_col, hess_col, rtol=1e-3)


@pytest.mark.parametrize("fit_intercept", [False, True])
def test_multinomial_coef_shape(fit_intercept):
    """Test that multinomial LinearModelLoss respects shape of coef."""
    loss = LinearModelLoss(
        base_loss=HalfMultinomialLoss(), fit_intercept=fit_intercept
    )
    n_samples, n_features = 10, 5
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
    )
    s = np.random.RandomState(42).randn(*coef.shape)

    l, g = loss.loss_gradient(coef, X, y)
    g1 = loss.gradient(coef, X, y)
    g2, hessp = loss.gradient_hessian_product(coef, X, y)
    h = hessp(s)
    assert g.shape == coef.shape
    assert h.shape == coef.shape
    assert_allclose(g, g1)
    assert_allclose(g, g2)

    coef_r = coef.ravel(order="F")
    s_r = s.ravel(order="F")
    l_r, g_r = loss.loss_gradient(coef_r, X, y)
    g1_r = loss.gradient(coef_r, X, y)
    g2_r, hessp_r = loss.gradient_hessian_product(coef_r, X, y)
    h_r = hessp_r(s_r)
    assert g_r.shape == coef_r.shape
    assert h_r.shape == coef_r.shape
    assert_allclose(g_r, g1_r)
    assert_allclose(g_r, g2_r)

    assert_allclose(g, g_r.reshape(loss.base_loss.n_classes, -1, order="F"))
    assert_allclose(h, h_r.reshape(loss.base_loss.n_classes, -1, order="F"))