  1. """
  2. Various bayesian regression
  3. """

# Authors: V. Michel, F. Pedregosa, A. Gramfort
# License: BSD 3 clause

import warnings
from math import log
from numbers import Integral, Real

import numpy as np
from scipy import linalg
from scipy.linalg import pinvh

from ..base import RegressorMixin, _fit_context
from ..utils._param_validation import Hidden, Interval, StrOptions
from ..utils.extmath import fast_logdet
from ..utils.validation import _check_sample_weight
from ._base import LinearModel, _preprocess_data, _rescale_data


# TODO(1.5) Remove
def _deprecate_n_iter(n_iter, max_iter):
    """Deprecate `n_iter` in favour of `max_iter`.

    Checks whether `n_iter` has been set instead of `max_iter` and, if so,
    issues a deprecation warning.

    Parameters
    ----------
    n_iter : int
        Value of the `n_iter` attribute passed by the estimator.

    max_iter : int, default=None
        Value of the `max_iter` attribute passed by the estimator.
        If `None`, it corresponds to `max_iter=300`.

    Returns
    -------
    max_iter : int
        Value of `max_iter` which shall further be used by the estimator.

    Notes
    -----
    This function should be completely removed in 1.5.
    """
    if n_iter != "deprecated":
        if max_iter is not None:
            raise ValueError(
                "Both `n_iter` and `max_iter` attributes were set. Attribute"
                " `n_iter` was deprecated in version 1.3 and will be removed in"
                " 1.5. To avoid this error, only set the `max_iter` attribute."
            )
        warnings.warn(
            (
                "'n_iter' was renamed to 'max_iter' in version 1.3 and "
                "will be removed in 1.5"
            ),
            FutureWarning,
        )
        max_iter = n_iter
    elif max_iter is None:
        max_iter = 300
    return max_iter
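

# Illustrative sketch (not part of the library): how `_deprecate_n_iter`
# resolves the two attributes, given the defaults above.
#
#     >>> _deprecate_n_iter("deprecated", None)   # neither set -> default
#     300
#     >>> _deprecate_n_iter("deprecated", 500)    # only max_iter set
#     500
#     >>> _deprecate_n_iter(200, None)            # n_iter set -> FutureWarning
#     200
#
# Passing both, e.g. `_deprecate_n_iter(200, 500)`, raises a ValueError.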


###############################################################################
# BayesianRidge regression


class BayesianRidge(RegressorMixin, LinearModel):
  57. """Bayesian ridge regression.
  58. Fit a Bayesian ridge model. See the Notes section for details on this
  59. implementation and the optimization of the regularization parameters
  60. lambda (precision of the weights) and alpha (precision of the noise).
  61. Read more in the :ref:`User Guide <bayesian_regression>`.
  62. Parameters
  63. ----------
  64. max_iter : int, default=None
  65. Maximum number of iterations over the complete dataset before
  66. stopping independently of any early stopping criterion. If `None`, it
  67. corresponds to `max_iter=300`.
  68. .. versionchanged:: 1.3
  69. tol : float, default=1e-3
  70. Stop the algorithm if w has converged.
  71. alpha_1 : float, default=1e-6
  72. Hyper-parameter : shape parameter for the Gamma distribution prior
  73. over the alpha parameter.
  74. alpha_2 : float, default=1e-6
  75. Hyper-parameter : inverse scale parameter (rate parameter) for the
  76. Gamma distribution prior over the alpha parameter.
  77. lambda_1 : float, default=1e-6
  78. Hyper-parameter : shape parameter for the Gamma distribution prior
  79. over the lambda parameter.
  80. lambda_2 : float, default=1e-6
  81. Hyper-parameter : inverse scale parameter (rate parameter) for the
  82. Gamma distribution prior over the lambda parameter.
  83. alpha_init : float, default=None
  84. Initial value for alpha (precision of the noise).
  85. If not set, alpha_init is 1/Var(y).
  86. .. versionadded:: 0.22
  87. lambda_init : float, default=None
  88. Initial value for lambda (precision of the weights).
  89. If not set, lambda_init is 1.
  90. .. versionadded:: 0.22
  91. compute_score : bool, default=False
  92. If True, compute the log marginal likelihood at each iteration of the
  93. optimization.
  94. fit_intercept : bool, default=True
  95. Whether to calculate the intercept for this model.
  96. The intercept is not treated as a probabilistic parameter
  97. and thus has no associated variance. If set
  98. to False, no intercept will be used in calculations
  99. (i.e. data is expected to be centered).
  100. copy_X : bool, default=True
  101. If True, X will be copied; else, it may be overwritten.
  102. verbose : bool, default=False
  103. Verbose mode when fitting the model.
  104. n_iter : int
  105. Maximum number of iterations. Should be greater than or equal to 1.
  106. .. deprecated:: 1.3
  107. `n_iter` is deprecated in 1.3 and will be removed in 1.5. Use
  108. `max_iter` instead.
  109. Attributes
  110. ----------
  111. coef_ : array-like of shape (n_features,)
  112. Coefficients of the regression model (mean of distribution)
  113. intercept_ : float
  114. Independent term in decision function. Set to 0.0 if
  115. `fit_intercept = False`.
  116. alpha_ : float
  117. Estimated precision of the noise.
  118. lambda_ : float
  119. Estimated precision of the weights.
  120. sigma_ : array-like of shape (n_features, n_features)
  121. Estimated variance-covariance matrix of the weights
  122. scores_ : array-like of shape (n_iter_+1,)
  123. If computed_score is True, value of the log marginal likelihood (to be
  124. maximized) at each iteration of the optimization. The array starts
  125. with the value of the log marginal likelihood obtained for the initial
  126. values of alpha and lambda and ends with the value obtained for the
  127. estimated alpha and lambda.
  128. n_iter_ : int
  129. The actual number of iterations to reach the stopping criterion.
  130. X_offset_ : ndarray of shape (n_features,)
  131. If `fit_intercept=True`, offset subtracted for centering data to a
  132. zero mean. Set to np.zeros(n_features) otherwise.
  133. X_scale_ : ndarray of shape (n_features,)
  134. Set to np.ones(n_features).
  135. n_features_in_ : int
  136. Number of features seen during :term:`fit`.
  137. .. versionadded:: 0.24
  138. feature_names_in_ : ndarray of shape (`n_features_in_`,)
  139. Names of features seen during :term:`fit`. Defined only when `X`
  140. has feature names that are all strings.
  141. .. versionadded:: 1.0
  142. See Also
  143. --------
  144. ARDRegression : Bayesian ARD regression.
  145. Notes
  146. -----
  147. There exist several strategies to perform Bayesian ridge regression. This
  148. implementation is based on the algorithm described in Appendix A of
  149. (Tipping, 2001) where updates of the regularization parameters are done as
  150. suggested in (MacKay, 1992). Note that according to A New
  151. View of Automatic Relevance Determination (Wipf and Nagarajan, 2008) these
  152. update rules do not guarantee that the marginal likelihood is increasing
  153. between two consecutive iterations of the optimization.
  154. References
  155. ----------
  156. D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems,
  157. Vol. 4, No. 3, 1992.
  158. M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine,
  159. Journal of Machine Learning Research, Vol. 1, 2001.
  160. Examples
  161. --------
  162. >>> from sklearn import linear_model
  163. >>> clf = linear_model.BayesianRidge()
  164. >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
  165. BayesianRidge()
  166. >>> clf.predict([[1, 1]])
  167. array([1.])
  168. """

    _parameter_constraints: dict = {
        "max_iter": [Interval(Integral, 1, None, closed="left"), None],
        "tol": [Interval(Real, 0, None, closed="neither")],
        "alpha_1": [Interval(Real, 0, None, closed="left")],
        "alpha_2": [Interval(Real, 0, None, closed="left")],
        "lambda_1": [Interval(Real, 0, None, closed="left")],
        "lambda_2": [Interval(Real, 0, None, closed="left")],
        "alpha_init": [None, Interval(Real, 0, None, closed="left")],
        "lambda_init": [None, Interval(Real, 0, None, closed="left")],
        "compute_score": ["boolean"],
        "fit_intercept": ["boolean"],
        "copy_X": ["boolean"],
        "verbose": ["verbose"],
        "n_iter": [
            Interval(Integral, 1, None, closed="left"),
            Hidden(StrOptions({"deprecated"})),
        ],
    }

    def __init__(
        self,
        *,
        max_iter=None,  # TODO(1.5): Set to 300
        tol=1.0e-3,
        alpha_1=1.0e-6,
        alpha_2=1.0e-6,
        lambda_1=1.0e-6,
        lambda_2=1.0e-6,
        alpha_init=None,
        lambda_init=None,
        compute_score=False,
        fit_intercept=True,
        copy_X=True,
        verbose=False,
        n_iter="deprecated",  # TODO(1.5): Remove
    ):
        self.max_iter = max_iter
        self.tol = tol
        self.alpha_1 = alpha_1
        self.alpha_2 = alpha_2
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.alpha_init = alpha_init
        self.lambda_init = lambda_init
        self.compute_score = compute_score
        self.fit_intercept = fit_intercept
        self.copy_X = copy_X
        self.verbose = verbose
        self.n_iter = n_iter

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """Fit the model.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Training data.

        y : ndarray of shape (n_samples,)
            Target values. Will be cast to X's dtype if necessary.

        sample_weight : ndarray of shape (n_samples,), default=None
            Individual weights for each sample.

            .. versionadded:: 0.20
               parameter *sample_weight* support to BayesianRidge.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        max_iter = _deprecate_n_iter(self.n_iter, self.max_iter)

        X, y = self._validate_data(X, y, dtype=[np.float64, np.float32], y_numeric=True)

        if sample_weight is not None:
            sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)

        X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
            X,
            y,
            self.fit_intercept,
            copy=self.copy_X,
            sample_weight=sample_weight,
        )

        if sample_weight is not None:
            # Sample weight can be implemented via a simple rescaling.
            X, y, _ = _rescale_data(X, y, sample_weight)

        self.X_offset_ = X_offset_
        self.X_scale_ = X_scale_
        n_samples, n_features = X.shape

        # Initialization of the values of the parameters
        eps = np.finfo(np.float64).eps
        # Add `eps` in the denominator to omit division by zero if `np.var(y)`
        # is zero
        alpha_ = self.alpha_init
        lambda_ = self.lambda_init
        if alpha_ is None:
            alpha_ = 1.0 / (np.var(y) + eps)
        if lambda_ is None:
            lambda_ = 1.0

        verbose = self.verbose
        lambda_1 = self.lambda_1
        lambda_2 = self.lambda_2
        alpha_1 = self.alpha_1
        alpha_2 = self.alpha_2

        self.scores_ = list()
        coef_old_ = None

        XT_y = np.dot(X.T, y)
        U, S, Vh = linalg.svd(X, full_matrices=False)
        eigen_vals_ = S**2

        # Convergence loop of the bayesian ridge regression
        for iter_ in range(max_iter):
            # update posterior mean coef_ based on alpha_ and lambda_ and
            # compute corresponding rmse
            coef_, rmse_ = self._update_coef_(
                X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
            )
            if self.compute_score:
                # compute the log marginal likelihood
                s = self._log_marginal_likelihood(
                    n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_
                )
                self.scores_.append(s)

            # Update alpha and lambda according to (MacKay, 1992)
            gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
            lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef_**2) + 2 * lambda_2)
            alpha_ = (n_samples - gamma_ + 2 * alpha_1) / (rmse_ + 2 * alpha_2)

            # Check for convergence
            if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
                if verbose:
                    print("Convergence after ", str(iter_), " iterations")
                break
            coef_old_ = np.copy(coef_)

        self.n_iter_ = iter_ + 1

        # return regularization parameters and corresponding posterior mean,
        # log marginal likelihood and posterior covariance
        self.alpha_ = alpha_
        self.lambda_ = lambda_
        self.coef_, rmse_ = self._update_coef_(
            X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
        )
        if self.compute_score:
            # compute the log marginal likelihood
            s = self._log_marginal_likelihood(
                n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_
            )
            self.scores_.append(s)
            self.scores_ = np.array(self.scores_)

        # posterior covariance is given by 1/alpha_ * scaled_sigma_
        scaled_sigma_ = np.dot(
            Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis]
        )
        self.sigma_ = (1.0 / alpha_) * scaled_sigma_

        self._set_intercept(X_offset_, y_offset_, X_scale_)
        return self
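
    # Illustrative usage sketch (not part of the library). With
    # `compute_score=True`, `fit` records the log marginal likelihood after
    # each update of `alpha_` and `lambda_`:
    #
    #     >>> import numpy as np
    #     >>> from sklearn.linear_model import BayesianRidge
    #     >>> rng = np.random.RandomState(0)
    #     >>> X = rng.randn(50, 3)
    #     >>> y = X @ np.array([1.0, 2.0, 0.0]) + 0.1 * rng.randn(50)
    #     >>> reg = BayesianRidge(compute_score=True).fit(X, y)
    #     >>> reg.alpha_ > 0 and reg.lambda_ > 0        # noise / weight precisions
    #     True
    #     >>> reg.scores_.shape == (reg.n_iter_ + 1,)   # one score per iteration, plus the final one
    #     True
    #
    # The exact values of `alpha_`, `lambda_` and `scores_` depend on the data.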

    def predict(self, X, return_std=False):
        """Predict using the linear model.

        In addition to the mean of the predictive distribution, also its
        standard deviation can be returned.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples.

        return_std : bool, default=False
            Whether to return the standard deviation of posterior prediction.

        Returns
        -------
        y_mean : array-like of shape (n_samples,)
            Mean of predictive distribution of query points.

        y_std : array-like of shape (n_samples,)
            Standard deviation of predictive distribution of query points.
        """
        y_mean = self._decision_function(X)
        if not return_std:
            return y_mean
        else:
            sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
            y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_))
            return y_mean, y_std
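
    # Illustrative sketch (not part of the library): `return_std=True` returns,
    # in addition to the posterior mean, the predictive standard deviation,
    # combining the noise variance 1 / alpha_ with the propagated weight
    # uncertainty x.T @ sigma_ @ x computed above. Continuing the hypothetical
    # `reg` fitted in the sketch after `fit`:
    #
    #     >>> y_mean, y_std = reg.predict(X, return_std=True)
    #     >>> y_mean.shape, y_std.shape
    #     ((50,), (50,))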

    def _update_coef_(
        self, X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
    ):
        """Update posterior mean and compute corresponding rmse.

        Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where
        scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features)
                         + np.dot(X.T, X))^-1
        """
        if n_samples > n_features:
            coef_ = np.linalg.multi_dot(
                [Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis], XT_y]
            )
        else:
            coef_ = np.linalg.multi_dot(
                [X.T, U / (eigen_vals_ + lambda_ / alpha_)[None, :], U.T, y]
            )
        rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)
        return coef_, rmse_
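
    # Note (added for clarity, not part of the library): both branches above
    # compute the same posterior mean
    #
    #     coef_ = (lambda_ / alpha_ * I + X.T @ X)^-1 @ X.T @ y
    #
    # via the SVD X = U @ diag(S) @ Vh, choosing whichever form only divides by
    # a diagonal of size min(n_samples, n_features). A hedged consistency check
    # against the direct normal-equations solve:
    #
    #     >>> direct = np.linalg.solve(
    #     ...     X.T @ X + (lambda_ / alpha_) * np.eye(X.shape[1]), X.T @ y
    #     ... )
    #     >>> np.allclose(direct, coef_)   # coef_ as returned by _update_coef_
    #     True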

    def _log_marginal_likelihood(
        self, n_samples, n_features, eigen_vals, alpha_, lambda_, coef, rmse
    ):
        """Log marginal likelihood."""
        alpha_1 = self.alpha_1
        alpha_2 = self.alpha_2
        lambda_1 = self.lambda_1
        lambda_2 = self.lambda_2

        # compute the log of the determinant of the posterior covariance.
        # posterior covariance is given by
        # sigma = (lambda_ * np.eye(n_features) + alpha_ * np.dot(X.T, X))^-1
        if n_samples > n_features:
            logdet_sigma = -np.sum(np.log(lambda_ + alpha_ * eigen_vals))
        else:
            logdet_sigma = np.full(n_features, lambda_, dtype=np.array(lambda_).dtype)
            logdet_sigma[:n_samples] += alpha_ * eigen_vals
            logdet_sigma = -np.sum(np.log(logdet_sigma))

        score = lambda_1 * log(lambda_) - lambda_2 * lambda_
        score += alpha_1 * log(alpha_) - alpha_2 * alpha_
        score += 0.5 * (
            n_features * log(lambda_)
            + n_samples * log(alpha_)
            - alpha_ * rmse
            - lambda_ * np.sum(coef**2)
            + logdet_sigma
            - n_samples * log(2 * np.pi)
        )

        return score
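
    # Note (added for clarity, not part of the library): written out, the score
    # returned above is the log marginal likelihood of the Bayesian ridge model
    # plus the Gamma hyper-prior terms on alpha_ and lambda_:
    #
    #     score = lambda_1 * log(lambda_) - lambda_2 * lambda_
    #           + alpha_1 * log(alpha_)   - alpha_2 * alpha_
    #           + 0.5 * (n_features * log(lambda_) + n_samples * log(alpha_)
    #                    - alpha_ * ||y - X @ coef||^2 - lambda_ * ||coef||^2
    #                    + log|sigma| - n_samples * log(2 * pi))
    #
    # where log|sigma| is the `logdet_sigma` of the posterior covariance
    # computed above from the eigenvalues of X.T @ X.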


###############################################################################
# ARD (Automatic Relevance Determination) regression


class ARDRegression(RegressorMixin, LinearModel):
  390. """Bayesian ARD regression.
  391. Fit the weights of a regression model, using an ARD prior. The weights of
  392. the regression model are assumed to be in Gaussian distributions.
  393. Also estimate the parameters lambda (precisions of the distributions of the
  394. weights) and alpha (precision of the distribution of the noise).
  395. The estimation is done by an iterative procedures (Evidence Maximization)
  396. Read more in the :ref:`User Guide <bayesian_regression>`.
  397. Parameters
  398. ----------
  399. max_iter : int, default=None
  400. Maximum number of iterations. If `None`, it corresponds to `max_iter=300`.
  401. .. versionchanged:: 1.3
  402. tol : float, default=1e-3
  403. Stop the algorithm if w has converged.
  404. alpha_1 : float, default=1e-6
  405. Hyper-parameter : shape parameter for the Gamma distribution prior
  406. over the alpha parameter.
  407. alpha_2 : float, default=1e-6
  408. Hyper-parameter : inverse scale parameter (rate parameter) for the
  409. Gamma distribution prior over the alpha parameter.
  410. lambda_1 : float, default=1e-6
  411. Hyper-parameter : shape parameter for the Gamma distribution prior
  412. over the lambda parameter.
  413. lambda_2 : float, default=1e-6
  414. Hyper-parameter : inverse scale parameter (rate parameter) for the
  415. Gamma distribution prior over the lambda parameter.
  416. compute_score : bool, default=False
  417. If True, compute the objective function at each step of the model.
  418. threshold_lambda : float, default=10 000
  419. Threshold for removing (pruning) weights with high precision from
  420. the computation.
  421. fit_intercept : bool, default=True
  422. Whether to calculate the intercept for this model. If set
  423. to false, no intercept will be used in calculations
  424. (i.e. data is expected to be centered).
  425. copy_X : bool, default=True
  426. If True, X will be copied; else, it may be overwritten.
  427. verbose : bool, default=False
  428. Verbose mode when fitting the model.
  429. n_iter : int
  430. Maximum number of iterations.
  431. .. deprecated:: 1.3
  432. `n_iter` is deprecated in 1.3 and will be removed in 1.5. Use
  433. `max_iter` instead.
  434. Attributes
  435. ----------
  436. coef_ : array-like of shape (n_features,)
  437. Coefficients of the regression model (mean of distribution)
  438. alpha_ : float
  439. estimated precision of the noise.
  440. lambda_ : array-like of shape (n_features,)
  441. estimated precisions of the weights.
  442. sigma_ : array-like of shape (n_features, n_features)
  443. estimated variance-covariance matrix of the weights
  444. scores_ : float
  445. if computed, value of the objective function (to be maximized)
  446. n_iter_ : int
  447. The actual number of iterations to reach the stopping criterion.
  448. .. versionadded:: 1.3
  449. intercept_ : float
  450. Independent term in decision function. Set to 0.0 if
  451. ``fit_intercept = False``.
  452. X_offset_ : float
  453. If `fit_intercept=True`, offset subtracted for centering data to a
  454. zero mean. Set to np.zeros(n_features) otherwise.
  455. X_scale_ : float
  456. Set to np.ones(n_features).
  457. n_features_in_ : int
  458. Number of features seen during :term:`fit`.
  459. .. versionadded:: 0.24
  460. feature_names_in_ : ndarray of shape (`n_features_in_`,)
  461. Names of features seen during :term:`fit`. Defined only when `X`
  462. has feature names that are all strings.
  463. .. versionadded:: 1.0
  464. See Also
  465. --------
  466. BayesianRidge : Bayesian ridge regression.
  467. Notes
  468. -----
  469. For an example, see :ref:`examples/linear_model/plot_ard.py
  470. <sphx_glr_auto_examples_linear_model_plot_ard.py>`.
  471. References
  472. ----------
  473. D. J. C. MacKay, Bayesian nonlinear modeling for the prediction
  474. competition, ASHRAE Transactions, 1994.
  475. R. Salakhutdinov, Lecture notes on Statistical Machine Learning,
  476. http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15
  477. Their beta is our ``self.alpha_``
  478. Their alpha is our ``self.lambda_``
  479. ARD is a little different than the slide: only dimensions/features for
  480. which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are
  481. discarded.
  482. Examples
  483. --------
  484. >>> from sklearn import linear_model
  485. >>> clf = linear_model.ARDRegression()
  486. >>> clf.fit([[0,0], [1, 1], [2, 2]], [0, 1, 2])
  487. ARDRegression()
  488. >>> clf.predict([[1, 1]])
  489. array([1.])
  490. """

    _parameter_constraints: dict = {
        "max_iter": [Interval(Integral, 1, None, closed="left"), None],
        "tol": [Interval(Real, 0, None, closed="left")],
        "alpha_1": [Interval(Real, 0, None, closed="left")],
        "alpha_2": [Interval(Real, 0, None, closed="left")],
        "lambda_1": [Interval(Real, 0, None, closed="left")],
        "lambda_2": [Interval(Real, 0, None, closed="left")],
        "compute_score": ["boolean"],
        "threshold_lambda": [Interval(Real, 0, None, closed="left")],
        "fit_intercept": ["boolean"],
        "copy_X": ["boolean"],
        "verbose": ["verbose"],
        "n_iter": [
            Interval(Integral, 1, None, closed="left"),
            Hidden(StrOptions({"deprecated"})),
        ],
    }

    def __init__(
        self,
        *,
        max_iter=None,  # TODO(1.5): Set to 300
        tol=1.0e-3,
        alpha_1=1.0e-6,
        alpha_2=1.0e-6,
        lambda_1=1.0e-6,
        lambda_2=1.0e-6,
        compute_score=False,
        threshold_lambda=1.0e4,
        fit_intercept=True,
        copy_X=True,
        verbose=False,
        n_iter="deprecated",  # TODO(1.5): Remove
    ):
        self.max_iter = max_iter
        self.tol = tol
        self.fit_intercept = fit_intercept
        self.alpha_1 = alpha_1
        self.alpha_2 = alpha_2
        self.lambda_1 = lambda_1
        self.lambda_2 = lambda_2
        self.compute_score = compute_score
        self.threshold_lambda = threshold_lambda
        self.copy_X = copy_X
        self.verbose = verbose
        self.n_iter = n_iter

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit the model according to the given training data and parameters.

        Iterative procedure to maximize the evidence.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training vector, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        y : array-like of shape (n_samples,)
            Target values. Will be cast to X's dtype if necessary.

        Returns
        -------
        self : object
            Fitted estimator.
        """
        max_iter = _deprecate_n_iter(self.n_iter, self.max_iter)

        X, y = self._validate_data(
            X, y, dtype=[np.float64, np.float32], y_numeric=True, ensure_min_samples=2
        )

        n_samples, n_features = X.shape
        coef_ = np.zeros(n_features, dtype=X.dtype)

        X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
            X, y, self.fit_intercept, copy=self.copy_X
        )

        self.X_offset_ = X_offset_
        self.X_scale_ = X_scale_

        # Launch the convergence loop
        keep_lambda = np.ones(n_features, dtype=bool)

        lambda_1 = self.lambda_1
        lambda_2 = self.lambda_2
        alpha_1 = self.alpha_1
        alpha_2 = self.alpha_2
        verbose = self.verbose

        # Initialization of the values of the parameters
        eps = np.finfo(np.float64).eps
        # Add `eps` in the denominator to omit division by zero if `np.var(y)`
        # is zero
        alpha_ = 1.0 / (np.var(y) + eps)
        lambda_ = np.ones(n_features, dtype=X.dtype)

        self.scores_ = list()
        coef_old_ = None

        def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_):
            coef_[keep_lambda] = alpha_ * np.linalg.multi_dot(
                [sigma_, X[:, keep_lambda].T, y]
            )
            return coef_

        update_sigma = (
            self._update_sigma
            if n_samples >= n_features
            else self._update_sigma_woodbury
        )
        # Iterative procedure of ARDRegression
        for iter_ in range(max_iter):
            sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)
            coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)

            # Update alpha and lambda
            rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)
            gamma_ = 1.0 - lambda_[keep_lambda] * np.diag(sigma_)
            lambda_[keep_lambda] = (gamma_ + 2.0 * lambda_1) / (
                (coef_[keep_lambda]) ** 2 + 2.0 * lambda_2
            )
            alpha_ = (n_samples - gamma_.sum() + 2.0 * alpha_1) / (
                rmse_ + 2.0 * alpha_2
            )

            # Prune the weights with a precision over a threshold
            keep_lambda = lambda_ < self.threshold_lambda
            coef_[~keep_lambda] = 0

            # Compute the objective function
            if self.compute_score:
                s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()
                s += alpha_1 * log(alpha_) - alpha_2 * alpha_
                s += 0.5 * (
                    fast_logdet(sigma_)
                    + n_samples * log(alpha_)
                    + np.sum(np.log(lambda_))
                )
                s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_**2).sum())
                self.scores_.append(s)

            # Check for convergence
            if iter_ > 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
                if verbose:
                    print("Converged after %s iterations" % iter_)
                break
            coef_old_ = np.copy(coef_)

            if not keep_lambda.any():
                break

        self.n_iter_ = iter_ + 1

        if keep_lambda.any():
            # update sigma and mu using updated params from the last iteration
            sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)
            coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)
        else:
            sigma_ = np.array([]).reshape(0, 0)

        self.coef_ = coef_
        self.alpha_ = alpha_
        self.sigma_ = sigma_
        self.lambda_ = lambda_
        self._set_intercept(X_offset_, y_offset_, X_scale_)
        return self
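
    # Illustrative usage sketch (not part of the library): on data with
    # irrelevant features, the per-feature precisions lambda_ grow large for
    # those features and the corresponding coefficients are pruned to zero
    # once lambda_ exceeds threshold_lambda:
    #
    #     >>> import numpy as np
    #     >>> from sklearn.linear_model import ARDRegression
    #     >>> rng = np.random.RandomState(0)
    #     >>> X = rng.randn(100, 5)
    #     >>> y = 3.0 * X[:, 0] + 0.05 * rng.randn(100)   # only feature 0 matters
    #     >>> ard = ARDRegression().fit(X, y)
    #     >>> kept = ard.lambda_ < ard.threshold_lambda    # mask of kept features
    #     >>> ard.coef_                                    # near zero for pruned features
    #
    # The exact numbers depend on the data and the hyper-parameters.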

    def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda):
        # See slides as referenced in the docstring note
        # this function is used when n_samples < n_features and will invert
        # a matrix of shape (n_samples, n_samples) making use of the
        # woodbury formula:
        # https://en.wikipedia.org/wiki/Woodbury_matrix_identity
        n_samples = X.shape[0]
        X_keep = X[:, keep_lambda]
        inv_lambda = 1 / lambda_[keep_lambda].reshape(1, -1)
        sigma_ = pinvh(
            np.eye(n_samples, dtype=X.dtype) / alpha_
            + np.dot(X_keep * inv_lambda, X_keep.T)
        )
        sigma_ = np.dot(sigma_, X_keep * inv_lambda)
        sigma_ = -np.dot(inv_lambda.reshape(-1, 1) * X_keep.T, sigma_)
        sigma_[np.diag_indices(sigma_.shape[1])] += 1.0 / lambda_[keep_lambda]
        return sigma_
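
    # Note (added for clarity, not part of the library): this returns the same
    # posterior covariance over the kept features as `_update_sigma`, i.e.
    #
    #     sigma_ = (diag(lambda_[keep_lambda]) + alpha_ * X_keep.T @ X_keep)^-1
    #
    # but only an (n_samples, n_samples) matrix is inverted, thanks to the
    # Woodbury identity. A hedged consistency check on a tiny random problem,
    # calling both private helpers directly:
    #
    #     >>> import numpy as np
    #     >>> from sklearn.linear_model import ARDRegression
    #     >>> rng = np.random.RandomState(0)
    #     >>> X = rng.randn(5, 8)                       # n_samples < n_features
    #     >>> lam = rng.uniform(0.5, 2.0, size=8)
    #     >>> keep = np.ones(8, dtype=bool)
    #     >>> est = ARDRegression()
    #     >>> s_direct = est._update_sigma(X, 1.3, lam, keep)
    #     >>> s_woodbury = est._update_sigma_woodbury(X, 1.3, lam, keep)
    #     >>> np.allclose(s_direct, s_woodbury)
    #     True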

    def _update_sigma(self, X, alpha_, lambda_, keep_lambda):
        # See slides as referenced in the docstring note
        # this function is used when n_samples >= n_features and will
        # invert a matrix of shape (n_features, n_features)
        X_keep = X[:, keep_lambda]
        gram = np.dot(X_keep.T, X_keep)
        eye = np.eye(gram.shape[0], dtype=X.dtype)
        sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram
        sigma_ = pinvh(sigma_inv)
        return sigma_

    def predict(self, X, return_std=False):
        """Predict using the linear model.

        In addition to the mean of the predictive distribution, also its
        standard deviation can be returned.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples.

        return_std : bool, default=False
            Whether to return the standard deviation of posterior prediction.

        Returns
        -------
        y_mean : array-like of shape (n_samples,)
            Mean of predictive distribution of query points.

        y_std : array-like of shape (n_samples,)
            Standard deviation of predictive distribution of query points.
        """
        y_mean = self._decision_function(X)
        if return_std is False:
            return y_mean
        else:
            X = X[:, self.lambda_ < self.threshold_lambda]
            sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
            y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_))
            return y_mean, y_std