# Authors: Manoj Kumar mks542@nyu.edu
# License: BSD 3 clause

from numbers import Integral, Real

import numpy as np
from scipy import optimize

from ..base import BaseEstimator, RegressorMixin, _fit_context
from ..utils import axis0_safe_slice
from ..utils._param_validation import Interval
from ..utils.extmath import safe_sparse_dot
from ..utils.optimize import _check_optimize_result
from ..utils.validation import _check_sample_weight
from ._base import LinearModel


def _huber_loss_and_gradient(w, X, y, epsilon, alpha, sample_weight=None):
    """Returns the Huber loss and the gradient.

    Parameters
    ----------
    w : ndarray, shape (n_features + 1,) or (n_features + 2,)
        Feature vector.
        w[:n_features] gives the coefficients,
        w[-1] gives the scale factor and if the intercept is fit w[-2]
        gives the intercept factor.

    X : ndarray of shape (n_samples, n_features)
        Input data.

    y : ndarray of shape (n_samples,)
        Target vector.

    epsilon : float
        Robustness of the Huber estimator.

    alpha : float
        Regularization parameter.

    sample_weight : ndarray of shape (n_samples,), default=None
        Weight assigned to each sample.

    Returns
    -------
    loss : float
        Huber loss.

    gradient : ndarray, shape (len(w))
        Returns the derivative of the Huber loss with respect to each
        coefficient, intercept and the scale as a vector.
    """
    _, n_features = X.shape
    fit_intercept = n_features + 2 == w.shape[0]
    if fit_intercept:
        intercept = w[-2]
    sigma = w[-1]
    w = w[:n_features]
    n_samples = np.sum(sample_weight)

    # Calculate the values where |(y - X'w - c) / sigma| > epsilon.
    # The values above this threshold are outliers.
    linear_loss = y - safe_sparse_dot(X, w)
    if fit_intercept:
        linear_loss -= intercept
    abs_linear_loss = np.abs(linear_loss)
    outliers_mask = abs_linear_loss > epsilon * sigma

    # Calculate the linear loss due to the outliers.
    # This is equal to (2 * M * |(y - X'w - c) / sigma| - M**2) * sigma
    outliers = abs_linear_loss[outliers_mask]
    num_outliers = np.count_nonzero(outliers_mask)
    n_non_outliers = X.shape[0] - num_outliers

    # n_sw_outliers includes the weight given to the outliers while
    # num_outliers is just the number of outliers.
    outliers_sw = sample_weight[outliers_mask]
    n_sw_outliers = np.sum(outliers_sw)
    outlier_loss = (
        2.0 * epsilon * np.sum(outliers_sw * outliers)
        - sigma * n_sw_outliers * epsilon**2
    )

    # Calculate the quadratic loss due to the non-outliers.
    # This is equal to ((y - X'w - c) / sigma)**2 * sigma
    non_outliers = linear_loss[~outliers_mask]
    weighted_non_outliers = sample_weight[~outliers_mask] * non_outliers
    weighted_loss = np.dot(weighted_non_outliers.T, non_outliers)
    squared_loss = weighted_loss / sigma

    if fit_intercept:
        grad = np.zeros(n_features + 2)
    else:
        grad = np.zeros(n_features + 1)

    # Gradient due to the squared loss.
    X_non_outliers = -axis0_safe_slice(X, ~outliers_mask, n_non_outliers)
    grad[:n_features] = (
        2.0 / sigma * safe_sparse_dot(weighted_non_outliers, X_non_outliers)
    )

    # Gradient due to the linear loss.
    signed_outliers = np.ones_like(outliers)
    signed_outliers_mask = linear_loss[outliers_mask] < 0
    signed_outliers[signed_outliers_mask] = -1.0
    X_outliers = axis0_safe_slice(X, outliers_mask, num_outliers)
    sw_outliers = sample_weight[outliers_mask] * signed_outliers
    grad[:n_features] -= 2.0 * epsilon * (safe_sparse_dot(sw_outliers, X_outliers))

    # Gradient due to the penalty.
    grad[:n_features] += alpha * 2.0 * w

    # Gradient due to sigma.
    grad[-1] = n_samples
    grad[-1] -= n_sw_outliers * epsilon**2
    grad[-1] -= squared_loss / sigma

    # Gradient due to the intercept.
    if fit_intercept:
        grad[-2] = -2.0 * np.sum(weighted_non_outliers) / sigma
        grad[-2] -= 2.0 * epsilon * np.sum(sw_outliers)

    loss = n_samples * sigma + squared_loss + outlier_loss
    loss += alpha * np.dot(w, w)
    return loss, grad
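

# Illustrative sketch (kept as a comment so that importing this module stays
# side-effect free): the analytic gradient returned above can be
# cross-checked against finite differences with ``scipy.optimize.check_grad``,
# e.g. in an interactive session. The import path assumes this file lives at
# ``sklearn/linear_model/_huber.py``; unit sample weights are used.
#
#     import numpy as np
#     from scipy.optimize import check_grad
#     from sklearn.linear_model._huber import _huber_loss_and_gradient
#
#     rng = np.random.RandomState(0)
#     X = rng.randn(50, 3)
#     y = X @ np.array([1.0, -2.0, 0.5]) + rng.randn(50)
#     sw = np.ones(50)
#     w0 = np.r_[rng.randn(3), 0.0, 1.0]  # coefficients, intercept, sigma
#     err = check_grad(
#         lambda w: _huber_loss_and_gradient(w, X, y, 1.35, 1e-4, sw)[0],
#         lambda w: _huber_loss_and_gradient(w, X, y, 1.35, 1e-4, sw)[1],
#         w0,
#     )
#     print(err)  # should be small relative to the gradient's magnitude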


class HuberRegressor(LinearModel, RegressorMixin, BaseEstimator):
    """L2-regularized linear regression model that is robust to outliers.

    The Huber Regressor optimizes the squared loss for the samples where
    ``|(y - Xw - c) / sigma| < epsilon`` and the absolute loss for the samples
    where ``|(y - Xw - c) / sigma| > epsilon``, where the model coefficients
    ``w``, the intercept ``c`` and the scale ``sigma`` are parameters
    to be optimized. The parameter sigma makes sure that if y is scaled up
    or down by a certain factor, one does not need to rescale epsilon to
    achieve the same robustness. Note that this does not take into account
    the fact that the different features of X may be of different scales.

    The Huber loss function has the advantage of not being heavily influenced
    by the outliers while not completely ignoring their effect.

    Read more in the :ref:`User Guide <huber_regression>`.

    .. versionadded:: 0.18

    Parameters
    ----------
    epsilon : float, default=1.35
        The parameter epsilon controls the number of samples that should be
        classified as outliers. The smaller the epsilon, the more robust it is
        to outliers. Epsilon must be in the range `[1, inf)`.

    max_iter : int, default=100
        Maximum number of iterations that
        ``scipy.optimize.minimize(method="L-BFGS-B")`` should run for.

    alpha : float, default=0.0001
        Strength of the squared L2 regularization. Note that the penalty is
        equal to ``alpha * ||w||^2``.
        Must be in the range `[0, inf)`.

    warm_start : bool, default=False
        This is useful if the stored attributes of a previously used model
        have to be reused. If set to False, then the coefficients will
        be rewritten for every call to fit.
        See :term:`the Glossary <warm_start>`.

    fit_intercept : bool, default=True
        Whether or not to fit the intercept. This can be set to False
        if the data is already centered around the origin.

    tol : float, default=1e-05
        The iteration will stop when
        ``max{|proj g_i | i = 1, ..., n}`` <= ``tol``
        where pg_i is the i-th component of the projected gradient.

    Attributes
    ----------
    coef_ : array, shape (n_features,)
        Coefficients obtained by optimizing the L2-regularized Huber loss.

    intercept_ : float
        Bias.

    scale_ : float
        The value by which ``|y - Xw - c|`` is scaled down.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    n_iter_ : int
        Number of iterations that
        ``scipy.optimize.minimize(method="L-BFGS-B")`` has run for.

        .. versionchanged:: 0.20

            In SciPy <= 1.0.0 the number of lbfgs iterations may exceed
            ``max_iter``. ``n_iter_`` will now report at most ``max_iter``.

    outliers_ : array, shape (n_samples,)
        A boolean mask which is set to True where the samples are identified
        as outliers.

    See Also
    --------
    RANSACRegressor : RANSAC (RANdom SAmple Consensus) algorithm.
    TheilSenRegressor : Theil-Sen Estimator robust multivariate regression model.
    SGDRegressor : Fitted by minimizing a regularized empirical loss with SGD.

    References
    ----------
    .. [1] Peter J. Huber, Elvezio M. Ronchetti, Robust Statistics,
           Concomitant scale estimates, pg 172
    .. [2] Art B. Owen (2006), A robust hybrid of lasso and ridge regression.
           https://statweb.stanford.edu/~owen/reports/hhu.pdf

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import HuberRegressor, LinearRegression
    >>> from sklearn.datasets import make_regression
    >>> rng = np.random.RandomState(0)
    >>> X, y, coef = make_regression(
    ...     n_samples=200, n_features=2, noise=4.0, coef=True, random_state=0)
    >>> X[:4] = rng.uniform(10, 20, (4, 2))
    >>> y[:4] = rng.uniform(10, 20, 4)
    >>> huber = HuberRegressor().fit(X, y)
    >>> huber.score(X, y)
    -7.284...
    >>> huber.predict(X[:1,])
    array([806.7200...])
    >>> linear = LinearRegression().fit(X, y)
    >>> print("True coefficients:", coef)
    True coefficients: [20.4923... 34.1698...]
    >>> print("Huber coefficients:", huber.coef_)
    Huber coefficients: [17.7906... 31.0106...]
    >>> print("Linear Regression coefficients:", linear.coef_)
    Linear Regression coefficients: [-1.9221... 7.0226...]
    """

    _parameter_constraints: dict = {
        "epsilon": [Interval(Real, 1.0, None, closed="left")],
        "max_iter": [Interval(Integral, 0, None, closed="left")],
        "alpha": [Interval(Real, 0, None, closed="left")],
        "warm_start": ["boolean"],
        "fit_intercept": ["boolean"],
        "tol": [Interval(Real, 0.0, None, closed="left")],
    }

    def __init__(
        self,
        *,
        epsilon=1.35,
        max_iter=100,
        alpha=0.0001,
        warm_start=False,
        fit_intercept=True,
        tol=1e-05,
    ):
        self.epsilon = epsilon
        self.max_iter = max_iter
        self.alpha = alpha
        self.warm_start = warm_start
        self.fit_intercept = fit_intercept
        self.tol = tol

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X.

        sample_weight : array-like, shape (n_samples,)
            Weight given to each sample.

        Returns
        -------
        self : object
            Fitted `HuberRegressor` estimator.
        """
        X, y = self._validate_data(
            X,
            y,
            copy=False,
            accept_sparse=["csr"],
            y_numeric=True,
            dtype=[np.float64, np.float32],
        )
        sample_weight = _check_sample_weight(sample_weight, X)

        if self.warm_start and hasattr(self, "coef_"):
            parameters = np.concatenate((self.coef_, [self.intercept_, self.scale_]))
        else:
            if self.fit_intercept:
                parameters = np.zeros(X.shape[1] + 2)
            else:
                parameters = np.zeros(X.shape[1] + 1)
            # Make sure to initialize the scale parameter to a strictly
            # positive value:
            parameters[-1] = 1

        # Sigma or the scale factor should be non-negative.
        # Setting it to be zero might cause undefined bounds hence we set it
        # to a value close to zero.
        bounds = np.tile([-np.inf, np.inf], (parameters.shape[0], 1))
        bounds[-1][0] = np.finfo(np.float64).eps * 10

        opt_res = optimize.minimize(
            _huber_loss_and_gradient,
            parameters,
            method="L-BFGS-B",
            jac=True,
            args=(X, y, self.epsilon, self.alpha, sample_weight),
            options={"maxiter": self.max_iter, "gtol": self.tol, "iprint": -1},
            bounds=bounds,
        )

        parameters = opt_res.x

        if opt_res.status == 2:
            raise ValueError(
                "HuberRegressor convergence failed: L-BFGS-B solver terminated with %s"
                % opt_res.message
            )
        self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
        self.scale_ = parameters[-1]
        if self.fit_intercept:
            self.intercept_ = parameters[-2]
        else:
            self.intercept_ = 0.0
        self.coef_ = parameters[: X.shape[1]]

        residual = np.abs(y - safe_sparse_dot(X, self.coef_) - self.intercept_)
        self.outliers_ = residual > self.scale_ * self.epsilon
        return self
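

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only; run as
    # ``python -m sklearn.linear_model._huber`` so that the relative imports
    # above resolve): fit on data with a few corrupted targets and inspect
    # which samples end up flagged in ``outliers_``.
    from sklearn.datasets import make_regression

    X_demo, y_demo = make_regression(
        n_samples=100, n_features=2, noise=4.0, random_state=0
    )
    y_demo[:5] += 100.0  # corrupt a few targets to create outliers
    huber = HuberRegressor(epsilon=1.35).fit(X_demo, y_demo)
    print("Flagged as outliers (first 5):", huber.outliers_[:5])
    print("Coefficients:", huber.coef_)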