# License: BSD 3 clause
import inspect

import numpy as np
import pytest

from sklearn.base import is_classifier
from sklearn.datasets import make_low_rank_matrix
from sklearn.linear_model import (
    ARDRegression,
    BayesianRidge,
    ElasticNet,
    ElasticNetCV,
    Lars,
    LarsCV,
    Lasso,
    LassoCV,
    LassoLarsCV,
    LassoLarsIC,
    LinearRegression,
    LogisticRegression,
    LogisticRegressionCV,
    MultiTaskElasticNet,
    MultiTaskElasticNetCV,
    MultiTaskLasso,
    MultiTaskLassoCV,
    OrthogonalMatchingPursuit,
    OrthogonalMatchingPursuitCV,
    PoissonRegressor,
    Ridge,
    RidgeCV,
    SGDRegressor,
    TweedieRegressor,
)


# Note: GammaRegressor() and TweedieRegressor(power != 1) have a non-canonical link.
@pytest.mark.parametrize(
    "model",
    [
        ARDRegression(),
        BayesianRidge(),
        ElasticNet(),
        ElasticNetCV(),
        Lars(),
        LarsCV(),
        Lasso(),
        LassoCV(),
        LassoLarsCV(),
        LassoLarsIC(),
        LinearRegression(),
        # TODO: Fix SAGA, which fails badly with sample_weights.
        # This is a known limitation, see:
        # https://github.com/scikit-learn/scikit-learn/issues/21305
        pytest.param(
            LogisticRegression(
                penalty="elasticnet", solver="saga", l1_ratio=0.5, tol=1e-15
            ),
            marks=pytest.mark.xfail(reason="Missing importance sampling scheme"),
        ),
        LogisticRegressionCV(),
        MultiTaskElasticNet(),
        MultiTaskElasticNetCV(),
        MultiTaskLasso(),
        MultiTaskLassoCV(),
        OrthogonalMatchingPursuit(),
        OrthogonalMatchingPursuitCV(),
        PoissonRegressor(),
        Ridge(),
        RidgeCV(),
        pytest.param(
            SGDRegressor(tol=1e-15),
            marks=pytest.mark.xfail(reason="Insufficient precision."),
        ),
        SGDRegressor(penalty="elasticnet", max_iter=10_000),
        TweedieRegressor(power=0),  # same as Ridge
    ],
    ids=lambda x: x.__class__.__name__,
)
@pytest.mark.parametrize("with_sample_weight", [False, True])
def test_balance_property(model, with_sample_weight, global_random_seed):
    # Test that sum(y_predicted) == sum(y_observed) on the training set.
    # This must hold for all linear models with the deviance of an exponential
    # dispersion family as loss and the corresponding canonical link, if
    # fit_intercept=True.
    # Examples:
    #   - squared error and identity link (most linear models)
    #   - Poisson deviance with log link
    #   - log loss with logit link
    # This is known as the balance property or unconditional calibration/unbiasedness.
    # For reference, see Corollary 3.18, 3.20 and Chapter 5.1.5 of
    # M.V. Wuthrich and M. Merz, "Statistical Foundations of Actuarial Learning and
    # its Applications" (June 3, 2022). http://doi.org/10.2139/ssrn.3822407
    # A standalone sketch at the end of this file illustrates the property for the
    # identity and log links.
    if (
        with_sample_weight
        and "sample_weight" not in inspect.signature(model.fit).parameters
    ):
        pytest.skip("Estimator does not support sample_weight.")

    rel = 2e-4  # test precision
    if isinstance(model, SGDRegressor):
        rel = 1e-1
    elif hasattr(model, "solver") and model.solver == "saga":
        rel = 1e-2

    rng = np.random.RandomState(global_random_seed)
    n_train, n_features, n_targets = 100, 10, None
    if isinstance(
        model,
        (MultiTaskElasticNet, MultiTaskElasticNetCV, MultiTaskLasso, MultiTaskLassoCV),
    ):
        n_targets = 3
    X = make_low_rank_matrix(n_samples=n_train, n_features=n_features, random_state=rng)
    if n_targets:
        coef = (
            rng.uniform(low=-2, high=2, size=(n_features, n_targets))
            / np.max(X, axis=0)[:, None]
        )
    else:
        coef = rng.uniform(low=-2, high=2, size=n_features) / np.max(X, axis=0)

    expectation = np.exp(X @ coef + 0.5)
    y = rng.poisson(lam=expectation) + 1  # strictly positive, i.e. y > 0
    if is_classifier(model):
        y = (y > expectation + 1).astype(np.float64)

    if with_sample_weight:
        sw = rng.uniform(low=1, high=10, size=y.shape[0])
    else:
        sw = None

    model.set_params(fit_intercept=True)  # to be sure
    if with_sample_weight:
        model.fit(X, y, sample_weight=sw)
    else:
        model.fit(X, y)
    # Assert balance property.
    if is_classifier(model):
        assert np.average(model.predict_proba(X)[:, 1], weights=sw) == pytest.approx(
            np.average(y, weights=sw), rel=rel
        )
    else:
        assert np.average(model.predict(X), weights=sw, axis=0) == pytest.approx(
            np.average(y, weights=sw, axis=0), rel=rel
        )
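

# A minimal standalone sketch (ours, not part of the scikit-learn test above; the
# helper name is illustrative) of the balance property for two canonical links:
# squared error with identity link and Poisson deviance with log link. With an
# (unpenalized) intercept, the fitted model reproduces the total of the training
# targets, i.e. sum(y_predicted) == sum(y_observed).
def _demo_balance_property():
    rng = np.random.RandomState(0)
    X = rng.normal(size=(200, 3))

    # Squared error + identity link: the normal equation for the intercept
    # forces the residuals to sum to zero.
    y = X @ np.array([1.0, -2.0, 0.5]) + 3.0 + rng.normal(size=200)
    reg = LinearRegression(fit_intercept=True).fit(X, y)
    np.testing.assert_allclose(reg.predict(X).sum(), y.sum())

    # Poisson deviance + log link: the score equation for the intercept yields
    # the same balance at the optimum; the intercept itself is not penalized,
    # so this holds even with the default alpha > 0.
    y_pois = rng.poisson(lam=np.exp(X @ np.array([0.3, -0.2, 0.1])))
    glm = PoissonRegressor(tol=1e-8, max_iter=1000, fit_intercept=True).fit(X, y_pois)
    np.testing.assert_allclose(glm.predict(X).sum(), y_pois.sum(), rtol=1e-5)


if __name__ == "__main__":
    _demo_balance_property()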