- """
- Tests for LinearModelLoss
- Note that correctness of losses (which compose LinearModelLoss) is already well
- covered in the _loss module.
- """
- import numpy as np
- import pytest
- from numpy.testing import assert_allclose
- from scipy import linalg, optimize, sparse
- from sklearn._loss.loss import (
- HalfBinomialLoss,
- HalfMultinomialLoss,
- HalfPoissonLoss,
- )
- from sklearn.datasets import make_low_rank_matrix
- from sklearn.linear_model._linear_loss import LinearModelLoss
- from sklearn.utils.extmath import squared_norm
- # We do not need to test all losses, just what LinearModelLoss does on top of the
- # base losses.
- LOSSES = [HalfBinomialLoss, HalfMultinomialLoss, HalfPoissonLoss]


def random_X_y_coef(
    linear_model_loss, n_samples, n_features, coef_bound=(-2, 2), seed=42
):
    """Randomly generate y, X and coef in a valid range."""
    rng = np.random.RandomState(seed)
    n_dof = n_features + linear_model_loss.fit_intercept
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        random_state=rng,
    )
    coef = linear_model_loss.init_zero_coef(X)

    if linear_model_loss.base_loss.is_multiclass:
        n_classes = linear_model_loss.base_loss.n_classes
        coef.flat[:] = rng.uniform(
            low=coef_bound[0],
            high=coef_bound[1],
            size=n_classes * n_dof,
        )
        if linear_model_loss.fit_intercept:
            raw_prediction = X @ coef[:, :-1].T + coef[:, -1]
        else:
            raw_prediction = X @ coef.T
        proba = linear_model_loss.base_loss.link.inverse(raw_prediction)

        # y = rng.choice(np.arange(n_classes), p=proba) does not work, as
        # choice expects a single 1-d probability vector, not one per sample.
        # See https://stackoverflow.com/a/34190035/16761084
        def choice_vectorized(items, p):
            # Vectorized inverse-CDF sampling: one uniform draw per row of p.
            s = p.cumsum(axis=1)
            r = rng.rand(p.shape[0])[:, None]
            k = (s < r).sum(axis=1)
            return items[k]

        y = choice_vectorized(np.arange(n_classes), p=proba).astype(np.float64)
    else:
        coef.flat[:] = rng.uniform(
            low=coef_bound[0],
            high=coef_bound[1],
            size=n_dof,
        )
        if linear_model_loss.fit_intercept:
            raw_prediction = X @ coef[:-1] + coef[-1]
        else:
            raw_prediction = X @ coef
        y = linear_model_loss.base_loss.link.inverse(
            raw_prediction + rng.uniform(low=-1, high=1, size=n_samples)
        )

    return X, y, coef
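

# Added illustrative check (a sketch, not part of the upstream test suite; the
# test name is new): the helper above should return consistently shaped X, y
# and coef, and the vectorized inverse-CDF sampling should only produce class
# labels in [0, n_classes).
def test_random_X_y_coef_shapes():
    loss = LinearModelLoss(base_loss=HalfMultinomialLoss(), fit_intercept=True)
    n_samples, n_features = 7, 3
    X, y, coef = random_X_y_coef(
        linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=0
    )
    n_classes = loss.base_loss.n_classes
    assert X.shape == (n_samples, n_features)
    assert y.shape == (n_samples,)
    # fit_intercept adds one degree of freedom per class.
    assert coef.shape == (n_classes, n_features + 1)
    assert np.all((y >= 0) & (y < n_classes))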
- @pytest.mark.parametrize("base_loss", LOSSES)
- @pytest.mark.parametrize("fit_intercept", [False, True])
- @pytest.mark.parametrize("n_features", [0, 1, 10])
- @pytest.mark.parametrize("dtype", [None, np.float32, np.float64, np.int64])
- def test_init_zero_coef(base_loss, fit_intercept, n_features, dtype):
- """Test that init_zero_coef initializes coef correctly."""
- loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept)
- rng = np.random.RandomState(42)
- X = rng.normal(size=(5, n_features))
- coef = loss.init_zero_coef(X, dtype=dtype)
- if loss.base_loss.is_multiclass:
- n_classes = loss.base_loss.n_classes
- assert coef.shape == (n_classes, n_features + fit_intercept)
- assert coef.flags["F_CONTIGUOUS"]
- else:
- assert coef.shape == (n_features + fit_intercept,)
- if dtype is None:
- assert coef.dtype == X.dtype
- else:
- assert coef.dtype == dtype
- assert np.count_nonzero(coef) == 0
- @pytest.mark.parametrize("base_loss", LOSSES)
- @pytest.mark.parametrize("fit_intercept", [False, True])
- @pytest.mark.parametrize("sample_weight", [None, "range"])
- @pytest.mark.parametrize("l2_reg_strength", [0, 1])
- def test_loss_grad_hess_are_the_same(
- base_loss, fit_intercept, sample_weight, l2_reg_strength
- ):
- """Test that loss and gradient are the same across different functions."""
- loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept)
- X, y, coef = random_X_y_coef(
- linear_model_loss=loss, n_samples=10, n_features=5, seed=42
- )
- if sample_weight == "range":
- sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])
- l1 = loss.loss(
- coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
- )
- g1 = loss.gradient(
- coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
- )
- l2, g2 = loss.loss_gradient(
- coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
- )
- g3, h3 = loss.gradient_hessian_product(
- coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
- )
- if not base_loss.is_multiclass:
- g4, h4, _ = loss.gradient_hessian(
- coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
- )
- else:
- with pytest.raises(NotImplementedError):
- loss.gradient_hessian(
- coef,
- X,
- y,
- sample_weight=sample_weight,
- l2_reg_strength=l2_reg_strength,
- )
- assert_allclose(l1, l2)
- assert_allclose(g1, g2)
- assert_allclose(g1, g3)
- if not base_loss.is_multiclass:
- assert_allclose(g1, g4)
- assert_allclose(h4 @ g4, h3(g3))
- # same for sparse X
- X = sparse.csr_matrix(X)
- l1_sp = loss.loss(
- coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
- )
- g1_sp = loss.gradient(
- coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
- )
- l2_sp, g2_sp = loss.loss_gradient(
- coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
- )
- g3_sp, h3_sp = loss.gradient_hessian_product(
- coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
- )
- if not base_loss.is_multiclass:
- g4_sp, h4_sp, _ = loss.gradient_hessian(
- coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
- )
- assert_allclose(l1, l1_sp)
- assert_allclose(l1, l2_sp)
- assert_allclose(g1, g1_sp)
- assert_allclose(g1, g2_sp)
- assert_allclose(g1, g3_sp)
- assert_allclose(h3(g1), h3_sp(g1_sp))
- if not base_loss.is_multiclass:
- assert_allclose(g1, g4_sp)
- assert_allclose(h4 @ g4, h4_sp @ g1_sp)
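

# Added illustrative sketch (not part of the upstream test suite; the test
# name is new): the (loss, gradient) pair returned by loss_gradient plugs
# directly into scipy.optimize.minimize with jac=True, and at the minimum of
# the strictly convex L2-penalized loss the gradient should vanish.
def test_loss_gradient_with_lbfgs_sketch():
    loss = LinearModelLoss(base_loss=HalfBinomialLoss(), fit_intercept=True)
    X, y, _ = random_X_y_coef(
        linear_model_loss=loss, n_samples=20, n_features=3, seed=0
    )

    def fun(w):
        # loss_gradient returns (loss, gradient), matching jac=True.
        return loss.loss_gradient(w, X, y, l2_reg_strength=1.0)

    opt = optimize.minimize(
        fun,
        loss.init_zero_coef(X),
        jac=True,
        method="L-BFGS-B",
        options={"gtol": 1e-8, "ftol": 1e-14},
    )
    assert opt.success
    g = loss.gradient(opt.x, X, y, l2_reg_strength=1.0)
    assert_allclose(g, np.zeros_like(g), atol=1e-6)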
- @pytest.mark.parametrize("base_loss", LOSSES)
- @pytest.mark.parametrize("sample_weight", [None, "range"])
- @pytest.mark.parametrize("l2_reg_strength", [0, 1])
- @pytest.mark.parametrize("X_sparse", [False, True])
- def test_loss_gradients_hessp_intercept(
- base_loss, sample_weight, l2_reg_strength, X_sparse
- ):
- """Test that loss and gradient handle intercept correctly."""
- loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=False)
- loss_inter = LinearModelLoss(base_loss=base_loss(), fit_intercept=True)
- n_samples, n_features = 10, 5
- X, y, coef = random_X_y_coef(
- linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
- )
- X[:, -1] = 1 # make last column of 1 to mimic intercept term
- X_inter = X[
- :, :-1
- ] # exclude intercept column as it is added automatically by loss_inter
- if X_sparse:
- X = sparse.csr_matrix(X)
- if sample_weight == "range":
- sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])
- l, g = loss.loss_gradient(
- coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
- )
- _, hessp = loss.gradient_hessian_product(
- coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
- )
- l_inter, g_inter = loss_inter.loss_gradient(
- coef, X_inter, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
- )
- _, hessp_inter = loss_inter.gradient_hessian_product(
- coef, X_inter, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
- )
- # Note, that intercept gets no L2 penalty.
- assert l == pytest.approx(
- l_inter + 0.5 * l2_reg_strength * squared_norm(coef.T[-1])
- )
- g_inter_corrected = g_inter
- g_inter_corrected.T[-1] += l2_reg_strength * coef.T[-1]
- assert_allclose(g, g_inter_corrected)
- s = np.random.RandomState(42).randn(*coef.shape)
- h = hessp(s)
- h_inter = hessp_inter(s)
- h_inter_corrected = h_inter
- h_inter_corrected.T[-1] += l2_reg_strength * s.T[-1]
- assert_allclose(h, h_inter_corrected)
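

# Added illustrative check (not part of the upstream test suite; the test name
# is new): squared_norm, used for the intercept-penalty correction above, is
# just the squared Euclidean norm.
def test_squared_norm_sketch():
    v = np.arange(5.0)
    assert_allclose(squared_norm(v), v @ v)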
- @pytest.mark.parametrize("base_loss", LOSSES)
- @pytest.mark.parametrize("fit_intercept", [False, True])
- @pytest.mark.parametrize("sample_weight", [None, "range"])
- @pytest.mark.parametrize("l2_reg_strength", [0, 1])
- def test_gradients_hessians_numerically(
- base_loss, fit_intercept, sample_weight, l2_reg_strength
- ):
- """Test gradients and hessians with numerical derivatives.
- Gradient should equal the numerical derivatives of the loss function.
- Hessians should equal the numerical derivatives of gradients.
- """
- loss = LinearModelLoss(base_loss=base_loss(), fit_intercept=fit_intercept)
- n_samples, n_features = 10, 5
- X, y, coef = random_X_y_coef(
- linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
- )
- coef = coef.ravel(order="F") # this is important only for multinomial loss
- if sample_weight == "range":
- sample_weight = np.linspace(1, y.shape[0], num=y.shape[0])
- # 1. Check gradients numerically
- eps = 1e-6
- g, hessp = loss.gradient_hessian_product(
- coef, X, y, sample_weight=sample_weight, l2_reg_strength=l2_reg_strength
- )
- # Use a trick to get central finite difference of accuracy 4 (five-point stencil)
- # https://en.wikipedia.org/wiki/Numerical_differentiation
- # https://en.wikipedia.org/wiki/Finite_difference_coefficient
- # approx_g1 = (f(x + eps) - f(x - eps)) / (2*eps)
- approx_g1 = optimize.approx_fprime(
- coef,
- lambda coef: loss.loss(
- coef - eps,
- X,
- y,
- sample_weight=sample_weight,
- l2_reg_strength=l2_reg_strength,
- ),
- 2 * eps,
- )
- # approx_g2 = (f(x + 2*eps) - f(x - 2*eps)) / (4*eps)
- approx_g2 = optimize.approx_fprime(
- coef,
- lambda coef: loss.loss(
- coef - 2 * eps,
- X,
- y,
- sample_weight=sample_weight,
- l2_reg_strength=l2_reg_strength,
- ),
- 4 * eps,
- )
- # Five-point stencil approximation
- # See: https://en.wikipedia.org/wiki/Five-point_stencil#1D_first_derivative
- approx_g = (4 * approx_g1 - approx_g2) / 3
- assert_allclose(g, approx_g, rtol=1e-2, atol=1e-8)
- # 2. Check hessp numerically along the second direction of the gradient
- vector = np.zeros_like(g)
- vector[1] = 1
- hess_col = hessp(vector)
- # Computation of the Hessian is particularly fragile to numerical errors when doing
- # simple finite differences. Here we compute the grad along a path in the direction
- # of the vector and then use a least-square regression to estimate the slope
- eps = 1e-3
- d_x = np.linspace(-eps, eps, 30)
- d_grad = np.array(
- [
- loss.gradient(
- coef + t * vector,
- X,
- y,
- sample_weight=sample_weight,
- l2_reg_strength=l2_reg_strength,
- )
- for t in d_x
- ]
- )
- d_grad -= d_grad.mean(axis=0)
- approx_hess_col = linalg.lstsq(d_x[:, np.newaxis], d_grad)[0].ravel()
- assert_allclose(approx_hess_col, hess_col, rtol=1e-3)
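

# Added illustrative sketch (not part of the upstream test suite; the test
# name is new) of the five-point stencil trick used above: for f(x) = x**3 the
# plain central difference gives exactly 3*x**2 + eps**2, so the combination
# (4 * g1 - g2) / 3 cancels the eps**2 term and recovers f'(x) = 3*x**2 up to
# rounding.
def test_five_point_stencil_sketch():
    x, eps = 1.5, 1e-3
    g1 = ((x + eps) ** 3 - (x - eps) ** 3) / (2 * eps)  # = 3*x**2 + eps**2
    g2 = ((x + 2 * eps) ** 3 - (x - 2 * eps) ** 3) / (4 * eps)  # = 3*x**2 + 4*eps**2
    assert_allclose((4 * g1 - g2) / 3, 3 * x**2, rtol=1e-11)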
- @pytest.mark.parametrize("fit_intercept", [False, True])
- def test_multinomial_coef_shape(fit_intercept):
- """Test that multinomial LinearModelLoss respects shape of coef."""
- loss = LinearModelLoss(base_loss=HalfMultinomialLoss(), fit_intercept=fit_intercept)
- n_samples, n_features = 10, 5
- X, y, coef = random_X_y_coef(
- linear_model_loss=loss, n_samples=n_samples, n_features=n_features, seed=42
- )
- s = np.random.RandomState(42).randn(*coef.shape)
- l, g = loss.loss_gradient(coef, X, y)
- g1 = loss.gradient(coef, X, y)
- g2, hessp = loss.gradient_hessian_product(coef, X, y)
- h = hessp(s)
- assert g.shape == coef.shape
- assert h.shape == coef.shape
- assert_allclose(g, g1)
- assert_allclose(g, g2)
- coef_r = coef.ravel(order="F")
- s_r = s.ravel(order="F")
- l_r, g_r = loss.loss_gradient(coef_r, X, y)
- g1_r = loss.gradient(coef_r, X, y)
- g2_r, hessp_r = loss.gradient_hessian_product(coef_r, X, y)
- h_r = hessp_r(s_r)
- assert g_r.shape == coef_r.shape
- assert h_r.shape == coef_r.shape
- assert_allclose(g_r, g1_r)
- assert_allclose(g_r, g2_r)
- assert_allclose(g, g_r.reshape(loss.base_loss.n_classes, -1, order="F"))
- assert_allclose(h, h_r.reshape(loss.base_loss.n_classes, -1, order="F"))