# Authors: Manoj Kumar <mks542@nyu.edu>
# License: BSD 3 clause

import numpy as np
from scipy import optimize, sparse

from sklearn.datasets import make_regression
from sklearn.linear_model import HuberRegressor, LinearRegression, Ridge, SGDRegressor
from sklearn.linear_model._huber import _huber_loss_and_gradient
from sklearn.utils._testing import (
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
)


def make_regression_with_outliers(n_samples=50, n_features=20):
    rng = np.random.RandomState(0)
    # Generate data with outliers by replacing 10% of the samples with noise.
    X, y = make_regression(
        n_samples=n_samples, n_features=n_features, random_state=0, noise=0.05
    )
    # Replace 10% of the samples with noise.
    num_noise = int(0.1 * n_samples)
    random_samples = rng.randint(0, n_samples, num_noise)
    X[random_samples, :] = 2.0 * rng.normal(0, 1, (num_noise, X.shape[1]))
    return X, y
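

# For reference, a minimal sketch of the plain Huber loss and its gradient on a
# residual vector r with threshold epsilon. This is only the core idea:
# scikit-learn's _huber_loss_and_gradient additionally optimizes a scale
# parameter jointly with the coefficients and adds an L2 penalty, so these
# helpers are illustrative, not the library's internals.
def _reference_huber_loss(r, epsilon):
    # Quadratic for |r| <= epsilon, linear beyond it.
    abs_r = np.abs(r)
    quadratic = np.minimum(abs_r, epsilon)
    linear = abs_r - quadratic
    return np.sum(0.5 * quadratic**2 + epsilon * linear)


def _reference_huber_gradient(r, epsilon):
    # Derivative of the loss above w.r.t. r: r clipped to [-epsilon, epsilon].
    return np.clip(r, -epsilon, epsilon)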


def test_huber_equals_lr_for_high_epsilon():
    # Test that HuberRegressor matches LinearRegression for large epsilon
    X, y = make_regression_with_outliers()
    lr = LinearRegression()
    lr.fit(X, y)
    huber = HuberRegressor(epsilon=1e3, alpha=0.0)
    huber.fit(X, y)
    assert_almost_equal(huber.coef_, lr.coef_, 3)
    assert_almost_equal(huber.intercept_, lr.intercept_, 2)
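

# Illustration of the identity behind the test above (a sketch, not part of the
# original suite): once epsilon exceeds every |residual|, the linear branch of
# the Huber loss is never taken and the loss is exactly the squared-error loss,
# which is why HuberRegressor collapses to ordinary least squares.
def test_reference_huber_loss_is_squared_loss_for_large_epsilon():
    rng = np.random.RandomState(0)
    r = rng.randn(100)
    assert_almost_equal(_reference_huber_loss(r, epsilon=1e3), 0.5 * np.sum(r**2))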


def test_huber_max_iter():
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(max_iter=1)
    huber.fit(X, y)
    assert huber.n_iter_ == huber.max_iter


def test_huber_gradient():
    # Test that the gradient calculated by _huber_loss_and_gradient is correct
    rng = np.random.RandomState(1)
    X, y = make_regression_with_outliers()
    sample_weight = rng.randint(1, 3, (y.shape[0]))

    def loss_func(x, *args):
        return _huber_loss_and_gradient(x, *args)[0]

    def grad_func(x, *args):
        return _huber_loss_and_gradient(x, *args)[1]

    # Check using optimize.check_grad that the analytic gradient matches the
    # numerical one.
    for _ in range(5):
        # Check both with and without the intercept term.
        for n_features in [X.shape[1] + 1, X.shape[1] + 2]:
            w = rng.randn(n_features)
            w[-1] = np.abs(w[-1])
            grad_same = optimize.check_grad(
                loss_func, grad_func, w, X, y, 0.01, 0.1, sample_weight
            )
            assert_almost_equal(grad_same, 1e-6, 4)
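

# The same finite-difference check applied to the reference pair defined near
# the top of this file (an illustrative sketch; the test above exercises
# scikit-learn's joint coefficient/scale gradient instead).
def test_reference_huber_gradient():
    rng = np.random.RandomState(2)
    r = rng.randn(20)
    grad_err = optimize.check_grad(
        _reference_huber_loss, _reference_huber_gradient, r, 1.35
    )
    assert grad_err < 1e-5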


def test_huber_sample_weights():
    # Test the sample_weight implementation in HuberRegressor.
    X, y = make_regression_with_outliers()
    huber = HuberRegressor()
    huber.fit(X, y)
    huber_coef = huber.coef_
    huber_intercept = huber.intercept_

    # Rescale coefs before comparing with assert_array_almost_equal to make
    # sure that the number of decimal places used is somewhat insensitive to
    # the amplitude of the coefficients and therefore to the scale of the
    # data and the regularization parameter.
    scale = max(np.mean(np.abs(huber.coef_)), np.mean(np.abs(huber.intercept_)))

    huber.fit(X, y, sample_weight=np.ones(y.shape[0]))
    assert_array_almost_equal(huber.coef_ / scale, huber_coef / scale)
    assert_array_almost_equal(huber.intercept_ / scale, huber_intercept / scale)

    X, y = make_regression_with_outliers(n_samples=5, n_features=20)
    X_new = np.vstack((X, np.vstack((X[1], X[1], X[3]))))
    y_new = np.concatenate((y, [y[1]], [y[1]], [y[3]]))
    huber.fit(X_new, y_new)
    huber_coef = huber.coef_
    huber_intercept = huber.intercept_
    sample_weight = np.ones(X.shape[0])
    sample_weight[1] = 3
    sample_weight[3] = 2
    huber.fit(X, y, sample_weight=sample_weight)

    assert_array_almost_equal(huber.coef_ / scale, huber_coef / scale)
    assert_array_almost_equal(huber.intercept_ / scale, huber_intercept / scale)

    # Test the sparse implementation with sample weights.
    X_csr = sparse.csr_matrix(X)
    huber_sparse = HuberRegressor()
    huber_sparse.fit(X_csr, y, sample_weight=sample_weight)
    assert_array_almost_equal(huber_sparse.coef_ / scale, huber_coef / scale)


def test_huber_sparse():
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(alpha=0.1)
    huber.fit(X, y)

    X_csr = sparse.csr_matrix(X)
    huber_sparse = HuberRegressor(alpha=0.1)
    huber_sparse.fit(X_csr, y)
    assert_array_almost_equal(huber_sparse.coef_, huber.coef_)
    assert_array_equal(huber.outliers_, huber_sparse.outliers_)


def test_huber_scaling_invariant():
    # Test that outlier filtering is invariant to the scale of X and y.
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=False, alpha=0.0)
    huber.fit(X, y)
    n_outliers_mask_1 = huber.outliers_
    assert not np.all(n_outliers_mask_1)

    huber.fit(X, 2.0 * y)
    n_outliers_mask_2 = huber.outliers_
    assert_array_equal(n_outliers_mask_2, n_outliers_mask_1)

    huber.fit(2.0 * X, 2.0 * y)
    n_outliers_mask_3 = huber.outliers_
    assert_array_equal(n_outliers_mask_3, n_outliers_mask_1)


def test_huber_and_sgd_same_results():
    # Test that HuberRegressor and SGDRegressor with huber loss converge to
    # the same coefficients for the same parameters.
    X, y = make_regression_with_outliers(n_samples=10, n_features=2)

    # Fit once to find out the scale parameter. Scale down X and y by scale
    # so that the scale parameter is optimized to 1.0.
    huber = HuberRegressor(fit_intercept=False, alpha=0.0, epsilon=1.35)
    huber.fit(X, y)
    X_scale = X / huber.scale_
    y_scale = y / huber.scale_
    huber.fit(X_scale, y_scale)
    assert_almost_equal(huber.scale_, 1.0, 3)

    sgdreg = SGDRegressor(
        alpha=0.0,
        loss="huber",
        shuffle=True,
        random_state=0,
        max_iter=10000,
        fit_intercept=False,
        epsilon=1.35,
        tol=None,
    )
    sgdreg.fit(X_scale, y_scale)
    assert_array_almost_equal(huber.coef_, sgdreg.coef_, 1)


def test_huber_warm_start():
    X, y = make_regression_with_outliers()
    huber_warm = HuberRegressor(alpha=1.0, max_iter=10000, warm_start=True, tol=1e-1)
    huber_warm.fit(X, y)
    huber_warm_coef = huber_warm.coef_.copy()
    huber_warm.fit(X, y)

    # SciPy performs the tol check after doing the coef updates, so the
    # coefficients from the two fits are almost the same but not equal.
    assert_array_almost_equal(huber_warm.coef_, huber_warm_coef, 1)

    # Since the second fit starts from the converged solution, it should
    # terminate immediately.
    assert huber_warm.n_iter_ == 0


def test_huber_better_r2_score():
    # Test that Huber gives a better R^2 score than Ridge on the non-outliers.
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(alpha=0.01)
    huber.fit(X, y)
    linear_loss = np.dot(X, huber.coef_) + huber.intercept_ - y
    mask = np.abs(linear_loss) < huber.epsilon * huber.scale_
    huber_score = huber.score(X[mask], y[mask])
    huber_outlier_score = huber.score(X[~mask], y[~mask])

    # The Ridge regressor should be influenced by the outliers and hence
    # give a worse score on the non-outliers as compared to the Huber
    # regressor.
    ridge = Ridge(alpha=0.01)
    ridge.fit(X, y)
    ridge_score = ridge.score(X[mask], y[mask])
    ridge_outlier_score = ridge.score(X[~mask], y[~mask])
    assert huber_score > ridge_score

    # The Huber model should also fit poorly on the outliers.
    assert ridge_outlier_score > huber_outlier_score
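

# The mask used above mirrors the fitted outliers_ attribute, up to ties at the
# threshold. A sketch of that relationship, assuming outliers_ is defined as
# |residual| > epsilon * scale_ (as documented for HuberRegressor):
def test_outlier_mask_matches_outliers_attribute():
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(alpha=0.01).fit(X, y)
    residual = np.abs(np.dot(X, huber.coef_) + huber.intercept_ - y)
    assert_array_equal(residual > huber.epsilon * huber.scale_, huber.outliers_)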


def test_huber_bool():
    # Test that it does not crash with bool data.
    X, y = make_regression(n_samples=200, n_features=2, noise=4.0, random_state=0)
    X_bool = X > 0
    HuberRegressor().fit(X_bool, y)