- """
- Various Bayesian regression models.
- """
- # Authors: V. Michel, F. Pedregosa, A. Gramfort
- # License: BSD 3 clause
- import warnings
- from math import log
- from numbers import Integral, Real
- import numpy as np
- from scipy import linalg
- from scipy.linalg import pinvh
- from ..base import RegressorMixin, _fit_context
- from ..utils._param_validation import Hidden, Interval, StrOptions
- from ..utils.extmath import fast_logdet
- from ..utils.validation import _check_sample_weight
- from ._base import LinearModel, _preprocess_data, _rescale_data
- # TODO(1.5) Remove
- def _deprecate_n_iter(n_iter, max_iter):
- """Deprecates n_iter in favour of max_iter. Checks if the n_iter has been
- used instead of max_iter and generates a deprecation warning if True.
- Parameters
- ----------
- n_iter : int
- Value of n_iter attribute passed by the estimator.
- max_iter : int, default=None
- Value of max_iter attribute passed by the estimator.
- If `None`, it corresponds to `max_iter=300`.
- Returns
- -------
- max_iter : int
- Value of max_iter which shall further be used by the estimator.
- Notes
- -----
- This function should be completely removed in 1.5.
- """
- if n_iter != "deprecated":
- if max_iter is not None:
- raise ValueError(
- "Both `n_iter` and `max_iter` attributes were set. Attribute"
- " `n_iter` was deprecated in version 1.3 and will be removed in"
- " 1.5. To avoid this error, only set the `max_iter` attribute."
- )
- warnings.warn(
- (
- "'n_iter' was renamed to 'max_iter' in version 1.3 and "
- "will be removed in 1.5"
- ),
- FutureWarning,
- )
- max_iter = n_iter
- elif max_iter is None:
- max_iter = 300
- return max_iter
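- # A minimal sketch of the cases handled above (values are illustrative):
- #   _deprecate_n_iter("deprecated", None)  # -> 300 (the documented default)
- #   _deprecate_n_iter("deprecated", 500)   # -> 500 (max_iter is respected)
- #   _deprecate_n_iter(250, None)           # -> 250, with a FutureWarning
- #   _deprecate_n_iter(250, 500)            # -> raises ValueError (both set)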
- ###############################################################################
- # BayesianRidge regression
- class BayesianRidge(RegressorMixin, LinearModel):
- """Bayesian ridge regression.
- Fit a Bayesian ridge model. See the Notes section for details on this
- implementation and the optimization of the regularization parameters
- lambda (precision of the weights) and alpha (precision of the noise).
- Read more in the :ref:`User Guide <bayesian_regression>`.
- Parameters
- ----------
- max_iter : int, default=None
- Maximum number of iterations over the complete dataset before
- stopping independently of any early stopping criterion. If `None`, it
- corresponds to `max_iter=300`.
- .. versionchanged:: 1.3
- tol : float, default=1e-3
- Stop the algorithm if `w` has converged, i.e. if the sum of absolute
- differences between the coefficients of two consecutive iterations is
- below `tol`.
- alpha_1 : float, default=1e-6
- Hyper-parameter : shape parameter for the Gamma distribution prior
- over the alpha parameter.
- alpha_2 : float, default=1e-6
- Hyper-parameter : inverse scale parameter (rate parameter) for the
- Gamma distribution prior over the alpha parameter.
- lambda_1 : float, default=1e-6
- Hyper-parameter : shape parameter for the Gamma distribution prior
- over the lambda parameter.
- lambda_2 : float, default=1e-6
- Hyper-parameter : inverse scale parameter (rate parameter) for the
- Gamma distribution prior over the lambda parameter.
- alpha_init : float, default=None
- Initial value for alpha (precision of the noise).
- If not set, alpha_init is 1/Var(y).
- .. versionadded:: 0.22
- lambda_init : float, default=None
- Initial value for lambda (precision of the weights).
- If not set, lambda_init is 1.
- .. versionadded:: 0.22
- compute_score : bool, default=False
- If True, compute the log marginal likelihood at each iteration of the
- optimization.
- fit_intercept : bool, default=True
- Whether to calculate the intercept for this model.
- The intercept is not treated as a probabilistic parameter
- and thus has no associated variance. If set
- to False, no intercept will be used in calculations
- (i.e. data is expected to be centered).
- copy_X : bool, default=True
- If True, X will be copied; else, it may be overwritten.
- verbose : bool, default=False
- Verbose mode when fitting the model.
- n_iter : int
- Maximum number of iterations. Should be greater than or equal to 1.
- .. deprecated:: 1.3
- `n_iter` is deprecated in 1.3 and will be removed in 1.5. Use
- `max_iter` instead.
- Attributes
- ----------
- coef_ : array-like of shape (n_features,)
- Coefficients of the regression model (mean of distribution).
- intercept_ : float
- Independent term in decision function. Set to 0.0 if
- `fit_intercept = False`.
- alpha_ : float
- Estimated precision of the noise.
- lambda_ : float
- Estimated precision of the weights.
- sigma_ : array-like of shape (n_features, n_features)
- Estimated variance-covariance matrix of the weights.
- scores_ : array-like of shape (n_iter_+1,)
- If `compute_score` is True, value of the log marginal likelihood (to be
- maximized) at each iteration of the optimization. The array starts
- with the value of the log marginal likelihood obtained for the initial
- values of alpha and lambda and ends with the value obtained for the
- estimated alpha and lambda.
- n_iter_ : int
- The actual number of iterations to reach the stopping criterion.
- X_offset_ : ndarray of shape (n_features,)
- If `fit_intercept=True`, offset subtracted for centering data to a
- zero mean. Set to np.zeros(n_features) otherwise.
- X_scale_ : ndarray of shape (n_features,)
- Set to np.ones(n_features).
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- See Also
- --------
- ARDRegression : Bayesian ARD regression.
- Notes
- -----
- There exist several strategies to perform Bayesian ridge regression. This
- implementation is based on the algorithm described in Appendix A of
- (Tipping, 2001) where updates of the regularization parameters are done as
- suggested in (MacKay, 1992). Note that according to A New
- View of Automatic Relevance Determination (Wipf and Nagarajan, 2008) these
- update rules do not guarantee that the marginal likelihood is increasing
- between two consecutive iterations of the optimization.
- References
- ----------
- D. J. C. MacKay, Bayesian Interpolation, Computation and Neural Systems,
- Vol. 4, No. 3, 1992.
- M. E. Tipping, Sparse Bayesian Learning and the Relevance Vector Machine,
- Journal of Machine Learning Research, Vol. 1, 2001.
- Examples
- --------
- >>> from sklearn import linear_model
- >>> clf = linear_model.BayesianRidge()
- >>> clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
- BayesianRidge()
- >>> clf.predict([[1, 1]])
- array([1.])
- """
- _parameter_constraints: dict = {
- "max_iter": [Interval(Integral, 1, None, closed="left"), None],
- "tol": [Interval(Real, 0, None, closed="neither")],
- "alpha_1": [Interval(Real, 0, None, closed="left")],
- "alpha_2": [Interval(Real, 0, None, closed="left")],
- "lambda_1": [Interval(Real, 0, None, closed="left")],
- "lambda_2": [Interval(Real, 0, None, closed="left")],
- "alpha_init": [None, Interval(Real, 0, None, closed="left")],
- "lambda_init": [None, Interval(Real, 0, None, closed="left")],
- "compute_score": ["boolean"],
- "fit_intercept": ["boolean"],
- "copy_X": ["boolean"],
- "verbose": ["verbose"],
- "n_iter": [
- Interval(Integral, 1, None, closed="left"),
- Hidden(StrOptions({"deprecated"})),
- ],
- }
- def __init__(
- self,
- *,
- max_iter=None, # TODO(1.5): Set to 300
- tol=1.0e-3,
- alpha_1=1.0e-6,
- alpha_2=1.0e-6,
- lambda_1=1.0e-6,
- lambda_2=1.0e-6,
- alpha_init=None,
- lambda_init=None,
- compute_score=False,
- fit_intercept=True,
- copy_X=True,
- verbose=False,
- n_iter="deprecated", # TODO(1.5): Remove
- ):
- self.max_iter = max_iter
- self.tol = tol
- self.alpha_1 = alpha_1
- self.alpha_2 = alpha_2
- self.lambda_1 = lambda_1
- self.lambda_2 = lambda_2
- self.alpha_init = alpha_init
- self.lambda_init = lambda_init
- self.compute_score = compute_score
- self.fit_intercept = fit_intercept
- self.copy_X = copy_X
- self.verbose = verbose
- self.n_iter = n_iter
- @_fit_context(prefer_skip_nested_validation=True)
- def fit(self, X, y, sample_weight=None):
- """Fit the model.
- Parameters
- ----------
- X : ndarray of shape (n_samples, n_features)
- Training data.
- y : ndarray of shape (n_samples,)
- Target values. Will be cast to X's dtype if necessary.
- sample_weight : ndarray of shape (n_samples,), default=None
- Individual weights for each sample.
- .. versionadded:: 0.20
- parameter *sample_weight* support to BayesianRidge.
- Returns
- -------
- self : object
- Returns the instance itself.
- """
- max_iter = _deprecate_n_iter(self.n_iter, self.max_iter)
- X, y = self._validate_data(X, y, dtype=[np.float64, np.float32], y_numeric=True)
- if sample_weight is not None:
- sample_weight = _check_sample_weight(sample_weight, X, dtype=X.dtype)
- X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
- X,
- y,
- self.fit_intercept,
- copy=self.copy_X,
- sample_weight=sample_weight,
- )
- if sample_weight is not None:
- # Sample weight can be implemented via a simple rescaling.
- X, y, _ = _rescale_data(X, y, sample_weight)
- self.X_offset_ = X_offset_
- self.X_scale_ = X_scale_
- n_samples, n_features = X.shape
- # Initialization of the values of the parameters
- eps = np.finfo(np.float64).eps
- # Add `eps` to the denominator to avoid division by zero if `np.var(y)`
- # is zero
- alpha_ = self.alpha_init
- lambda_ = self.lambda_init
- if alpha_ is None:
- alpha_ = 1.0 / (np.var(y) + eps)
- if lambda_ is None:
- lambda_ = 1.0
- verbose = self.verbose
- lambda_1 = self.lambda_1
- lambda_2 = self.lambda_2
- alpha_1 = self.alpha_1
- alpha_2 = self.alpha_2
- self.scores_ = list()
- coef_old_ = None
- XT_y = np.dot(X.T, y)
- U, S, Vh = linalg.svd(X, full_matrices=False)
- eigen_vals_ = S**2
- # Convergence loop of the Bayesian ridge regression
- for iter_ in range(max_iter):
- # update posterior mean coef_ based on alpha_ and lambda_ and
- # compute corresponding rmse
- coef_, rmse_ = self._update_coef_(
- X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
- )
- if self.compute_score:
- # compute the log marginal likelihood
- s = self._log_marginal_likelihood(
- n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_
- )
- self.scores_.append(s)
- # Update alpha and lambda according to (MacKay, 1992)
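- # gamma_ below is the effective number of well-determined parameters
- # (each term alpha_ * eig / (lambda_ + alpha_ * eig) lies in [0, 1]);
- # the lambda_ and alpha_ updates are the MacKay fixed-point rules, with
- # the Gamma hyperpriors entering through the 2 * lambda_1 / 2 * lambda_2
- # and 2 * alpha_1 / 2 * alpha_2 terms.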
- gamma_ = np.sum((alpha_ * eigen_vals_) / (lambda_ + alpha_ * eigen_vals_))
- lambda_ = (gamma_ + 2 * lambda_1) / (np.sum(coef_**2) + 2 * lambda_2)
- alpha_ = (n_samples - gamma_ + 2 * alpha_1) / (rmse_ + 2 * alpha_2)
- # Check for convergence
- if iter_ != 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
- if verbose:
- print("Convergence after ", str(iter_), " iterations")
- break
- coef_old_ = np.copy(coef_)
- self.n_iter_ = iter_ + 1
- # return regularization parameters and corresponding posterior mean,
- # log marginal likelihood and posterior covariance
- self.alpha_ = alpha_
- self.lambda_ = lambda_
- self.coef_, rmse_ = self._update_coef_(
- X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
- )
- if self.compute_score:
- # compute the log marginal likelihood
- s = self._log_marginal_likelihood(
- n_samples, n_features, eigen_vals_, alpha_, lambda_, coef_, rmse_
- )
- self.scores_.append(s)
- self.scores_ = np.array(self.scores_)
- # posterior covariance is given by 1/alpha_ * scaled_sigma_
- scaled_sigma_ = np.dot(
- Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis]
- )
- self.sigma_ = (1.0 / alpha_) * scaled_sigma_
- self._set_intercept(X_offset_, y_offset_, X_scale_)
- return self
- def predict(self, X, return_std=False):
- """Predict using the linear model.
- In addition to the mean of the predictive distribution, also its
- standard deviation can be returned.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Samples.
- return_std : bool, default=False
- Whether to return the standard deviation of posterior prediction.
- Returns
- -------
- y_mean : array-like of shape (n_samples,)
- Mean of predictive distribution of query points.
- y_std : array-like of shape (n_samples,)
- Standard deviation of predictive distribution of query points.
- """
- y_mean = self._decision_function(X)
- if not return_std:
- return y_mean
- else:
- sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
- y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_))
- return y_mean, y_std
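- # A sketch of the return_std branch above (`reg` stands for any fitted
- # BayesianRidge instance; the names are illustrative):
- #   y_mean, y_std = reg.predict(X_test, return_std=True)
- # y_std combines the weight uncertainty, diag(X @ sigma_ @ X.T), with the
- # noise variance 1 / alpha_.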
- def _update_coef_(
- self, X, y, n_samples, n_features, XT_y, U, Vh, eigen_vals_, alpha_, lambda_
- ):
- """Update posterior mean and compute corresponding rmse.
- Posterior mean is given by coef_ = scaled_sigma_ * X.T * y where
- scaled_sigma_ = (lambda_/alpha_ * np.eye(n_features)
- + np.dot(X.T, X))^-1
- """
- if n_samples > n_features:
- coef_ = np.linalg.multi_dot(
- [Vh.T, Vh / (eigen_vals_ + lambda_ / alpha_)[:, np.newaxis], XT_y]
- )
- else:
- coef_ = np.linalg.multi_dot(
- [X.T, U / (eigen_vals_ + lambda_ / alpha_)[None, :], U.T, y]
- )
- rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)
- return coef_, rmse_
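- # Both branches compute the same posterior mean through the SVD
- # X = U @ diag(S) @ Vh with eigen_vals_ = S**2: the first solves the
- # (n_features, n_features) primal system, the second the equivalent
- # (n_samples, n_samples) dual system, whichever is smaller.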
- def _log_marginal_likelihood(
- self, n_samples, n_features, eigen_vals, alpha_, lambda_, coef, rmse
- ):
- """Log marginal likelihood."""
- alpha_1 = self.alpha_1
- alpha_2 = self.alpha_2
- lambda_1 = self.lambda_1
- lambda_2 = self.lambda_2
- # compute the log of the determinant of the posterior covariance.
- # posterior covariance is given by
- # sigma = (lambda_ * np.eye(n_features) + alpha_ * np.dot(X.T, X))^-1
- if n_samples > n_features:
- logdet_sigma = -np.sum(np.log(lambda_ + alpha_ * eigen_vals))
- else:
- logdet_sigma = np.full(n_features, lambda_, dtype=np.array(lambda_).dtype)
- logdet_sigma[:n_samples] += alpha_ * eigen_vals
- logdet_sigma = -np.sum(np.log(logdet_sigma))
- score = lambda_1 * log(lambda_) - lambda_2 * lambda_
- score += alpha_1 * log(alpha_) - alpha_2 * alpha_
- score += 0.5 * (
- n_features * log(lambda_)
- + n_samples * log(alpha_)
- - alpha_ * rmse
- - lambda_ * np.sum(coef**2)
- + logdet_sigma
- - n_samples * log(2 * np.pi)
- )
- return score
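- # The score above is the log marginal likelihood log p(y | alpha_, lambda_)
- # plus the log-density of the Gamma hyperpriors on alpha_ and lambda_, up
- # to additive constants (Tipping, 2001, Appendix A).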
- ###############################################################################
- # ARD (Automatic Relevance Determination) regression
- class ARDRegression(RegressorMixin, LinearModel):
- """Bayesian ARD regression.
- Fit the weights of a regression model, using an ARD prior. The weights of
- the regression model are assumed to follow Gaussian distributions.
- Also estimate the parameters lambda (precisions of the distributions of the
- weights) and alpha (precision of the distribution of the noise).
- The estimation is done by an iterative procedure (evidence maximization).
- Read more in the :ref:`User Guide <bayesian_regression>`.
- Parameters
- ----------
- max_iter : int, default=None
- Maximum number of iterations. If `None`, it corresponds to `max_iter=300`.
- .. versionchanged:: 1.3
- tol : float, default=1e-3
- Stop the algorithm if `w` has converged, i.e. if the sum of absolute
- differences between the coefficients of two consecutive iterations is
- below `tol`.
- alpha_1 : float, default=1e-6
- Hyper-parameter : shape parameter for the Gamma distribution prior
- over the alpha parameter.
- alpha_2 : float, default=1e-6
- Hyper-parameter : inverse scale parameter (rate parameter) for the
- Gamma distribution prior over the alpha parameter.
- lambda_1 : float, default=1e-6
- Hyper-parameter : shape parameter for the Gamma distribution prior
- over the lambda parameter.
- lambda_2 : float, default=1e-6
- Hyper-parameter : inverse scale parameter (rate parameter) for the
- Gamma distribution prior over the lambda parameter.
- compute_score : bool, default=False
- If True, compute the objective function at each step of the model.
- threshold_lambda : float, default=10000.0
- Threshold for removing (pruning) weights with high precision from
- the computation.
- fit_intercept : bool, default=True
- Whether to calculate the intercept for this model. If set
- to false, no intercept will be used in calculations
- (i.e. data is expected to be centered).
- copy_X : bool, default=True
- If True, X will be copied; else, it may be overwritten.
- verbose : bool, default=False
- Verbose mode when fitting the model.
- n_iter : int
- Maximum number of iterations.
- .. deprecated:: 1.3
- `n_iter` is deprecated in 1.3 and will be removed in 1.5. Use
- `max_iter` instead.
- Attributes
- ----------
- coef_ : array-like of shape (n_features,)
- Coefficients of the regression model (mean of distribution).
- alpha_ : float
- Estimated precision of the noise.
- lambda_ : array-like of shape (n_features,)
- Estimated precisions of the weights.
- sigma_ : array-like of shape (n_features, n_features)
- Estimated variance-covariance matrix of the weights.
- scores_ : list of float
- If `compute_score` is True, value of the objective function (to be
- maximized) at each iteration of the optimization.
- n_iter_ : int
- The actual number of iterations to reach the stopping criterion.
- .. versionadded:: 1.3
- intercept_ : float
- Independent term in decision function. Set to 0.0 if
- ``fit_intercept = False``.
- X_offset_ : ndarray of shape (n_features,)
- If `fit_intercept=True`, offset subtracted for centering data to a
- zero mean. Set to np.zeros(n_features) otherwise.
- X_scale_ : ndarray of shape (n_features,)
- Set to np.ones(n_features).
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- See Also
- --------
- BayesianRidge : Bayesian ridge regression.
- Notes
- -----
- For an example, see :ref:`examples/linear_model/plot_ard.py
- <sphx_glr_auto_examples_linear_model_plot_ard.py>`.
- References
- ----------
- D. J. C. MacKay, Bayesian nonlinear modeling for the prediction
- competition, ASHRAE Transactions, 1994.
- R. Salakhutdinov, Lecture notes on Statistical Machine Learning,
- http://www.utstat.toronto.edu/~rsalakhu/sta4273/notes/Lecture2.pdf#page=15
- Their beta is our ``self.alpha_``
- Their alpha is our ``self.lambda_``
- ARD is a little different from the slides: only dimensions/features for
- which ``self.lambda_ < self.threshold_lambda`` are kept and the rest are
- discarded.
- Examples
- --------
- >>> from sklearn import linear_model
- >>> clf = linear_model.ARDRegression()
- >>> clf.fit([[0, 0], [1, 1], [2, 2]], [0, 1, 2])
- ARDRegression()
- >>> clf.predict([[1, 1]])
- array([1.])
- """
- _parameter_constraints: dict = {
- "max_iter": [Interval(Integral, 1, None, closed="left"), None],
- "tol": [Interval(Real, 0, None, closed="left")],
- "alpha_1": [Interval(Real, 0, None, closed="left")],
- "alpha_2": [Interval(Real, 0, None, closed="left")],
- "lambda_1": [Interval(Real, 0, None, closed="left")],
- "lambda_2": [Interval(Real, 0, None, closed="left")],
- "compute_score": ["boolean"],
- "threshold_lambda": [Interval(Real, 0, None, closed="left")],
- "fit_intercept": ["boolean"],
- "copy_X": ["boolean"],
- "verbose": ["verbose"],
- "n_iter": [
- Interval(Integral, 1, None, closed="left"),
- Hidden(StrOptions({"deprecated"})),
- ],
- }
- def __init__(
- self,
- *,
- max_iter=None, # TODO(1.5): Set to 300
- tol=1.0e-3,
- alpha_1=1.0e-6,
- alpha_2=1.0e-6,
- lambda_1=1.0e-6,
- lambda_2=1.0e-6,
- compute_score=False,
- threshold_lambda=1.0e4,
- fit_intercept=True,
- copy_X=True,
- verbose=False,
- n_iter="deprecated", # TODO(1.5): Remove
- ):
- self.max_iter = max_iter
- self.tol = tol
- self.fit_intercept = fit_intercept
- self.alpha_1 = alpha_1
- self.alpha_2 = alpha_2
- self.lambda_1 = lambda_1
- self.lambda_2 = lambda_2
- self.compute_score = compute_score
- self.threshold_lambda = threshold_lambda
- self.copy_X = copy_X
- self.verbose = verbose
- self.n_iter = n_iter
- @_fit_context(prefer_skip_nested_validation=True)
- def fit(self, X, y):
- """Fit the model according to the given training data and parameters.
- Iterative procedure to maximize the evidence.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features)
- Training vector, where `n_samples` is the number of samples and
- `n_features` is the number of features.
- y : array-like of shape (n_samples,)
- Target values. Will be cast to X's dtype if necessary.
- Returns
- -------
- self : object
- Fitted estimator.
- """
- max_iter = _deprecate_n_iter(self.n_iter, self.max_iter)
- X, y = self._validate_data(
- X, y, dtype=[np.float64, np.float32], y_numeric=True, ensure_min_samples=2
- )
- n_samples, n_features = X.shape
- coef_ = np.zeros(n_features, dtype=X.dtype)
- X, y, X_offset_, y_offset_, X_scale_ = _preprocess_data(
- X, y, self.fit_intercept, copy=self.copy_X
- )
- self.X_offset_ = X_offset_
- self.X_scale_ = X_scale_
- # Launch the convergence loop
- keep_lambda = np.ones(n_features, dtype=bool)
- lambda_1 = self.lambda_1
- lambda_2 = self.lambda_2
- alpha_1 = self.alpha_1
- alpha_2 = self.alpha_2
- verbose = self.verbose
- # Initialization of the values of the parameters
- eps = np.finfo(np.float64).eps
- # Add `eps` to the denominator to avoid division by zero if `np.var(y)`
- # is zero
- alpha_ = 1.0 / (np.var(y) + eps)
- lambda_ = np.ones(n_features, dtype=X.dtype)
- self.scores_ = list()
- coef_old_ = None
- def update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_):
- coef_[keep_lambda] = alpha_ * np.linalg.multi_dot(
- [sigma_, X[:, keep_lambda].T, y]
- )
- return coef_
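- # Posterior mean restricted to the kept features:
- # mu = alpha_ * sigma_ @ X_keep.T @ y; pruned coefficients stay at zero.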
- update_sigma = (
- self._update_sigma
- if n_samples >= n_features
- else self._update_sigma_woodbury
- )
- # Iterative procedure of ARDRegression
- for iter_ in range(max_iter):
- sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)
- coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)
- # Update alpha and lambda
- rmse_ = np.sum((y - np.dot(X, coef_)) ** 2)
- gamma_ = 1.0 - lambda_[keep_lambda] * np.diag(sigma_)
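- # gamma_[i] is close to 1 when the data determine weight i and close to 0
- # when the prior dominates; the per-feature lambda_ update below is the
- # ARD analogue of the MacKay rule used in BayesianRidge.fit.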
- lambda_[keep_lambda] = (gamma_ + 2.0 * lambda_1) / (
- (coef_[keep_lambda]) ** 2 + 2.0 * lambda_2
- )
- alpha_ = (n_samples - gamma_.sum() + 2.0 * alpha_1) / (
- rmse_ + 2.0 * alpha_2
- )
- # Prune the weights with a precision over a threshold
- keep_lambda = lambda_ < self.threshold_lambda
- coef_[~keep_lambda] = 0
- # Compute the objective function
- if self.compute_score:
- s = (lambda_1 * np.log(lambda_) - lambda_2 * lambda_).sum()
- s += alpha_1 * log(alpha_) - alpha_2 * alpha_
- s += 0.5 * (
- fast_logdet(sigma_)
- + n_samples * log(alpha_)
- + np.sum(np.log(lambda_))
- )
- s -= 0.5 * (alpha_ * rmse_ + (lambda_ * coef_**2).sum())
- self.scores_.append(s)
- # Check for convergence
- if iter_ > 0 and np.sum(np.abs(coef_old_ - coef_)) < self.tol:
- if verbose:
- print("Converged after %s iterations" % iter_)
- break
- coef_old_ = np.copy(coef_)
- if not keep_lambda.any():
- break
- self.n_iter_ = iter_ + 1
- if keep_lambda.any():
- # update sigma and mu using updated params from the last iteration
- sigma_ = update_sigma(X, alpha_, lambda_, keep_lambda)
- coef_ = update_coeff(X, y, coef_, alpha_, keep_lambda, sigma_)
- else:
- sigma_ = np.array([]).reshape(0, 0)
- self.coef_ = coef_
- self.alpha_ = alpha_
- self.sigma_ = sigma_
- self.lambda_ = lambda_
- self._set_intercept(X_offset_, y_offset_, X_scale_)
- return self
- def _update_sigma_woodbury(self, X, alpha_, lambda_, keep_lambda):
- # See slides as referenced in the docstring note
- # this function is used when n_samples < n_features and will invert
- # a matrix of shape (n_samples, n_samples) making use of the
- # woodbury formula:
- # https://en.wikipedia.org/wiki/Woodbury_matrix_identity
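- # Concretely, with A = diag(lambda_[keep_lambda]):
- #   sigma = A^-1 - A^-1 X_keep.T (I/alpha_ + X_keep A^-1 X_keep.T)^-1 X_keep A^-1
- # which the sigma_ assignments below assemble step by step without ever
- # forming the (n_features, n_features) matrix.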
- n_samples = X.shape[0]
- X_keep = X[:, keep_lambda]
- inv_lambda = 1 / lambda_[keep_lambda].reshape(1, -1)
- sigma_ = pinvh(
- np.eye(n_samples, dtype=X.dtype) / alpha_
- + np.dot(X_keep * inv_lambda, X_keep.T)
- )
- sigma_ = np.dot(sigma_, X_keep * inv_lambda)
- sigma_ = -np.dot(inv_lambda.reshape(-1, 1) * X_keep.T, sigma_)
- sigma_[np.diag_indices(sigma_.shape[1])] += 1.0 / lambda_[keep_lambda]
- return sigma_
- def _update_sigma(self, X, alpha_, lambda_, keep_lambda):
- # See slides as referenced in the docstring note
- # this function is used when n_samples >= n_features and will
- # invert a matrix of shape (n_features, n_features)
- X_keep = X[:, keep_lambda]
- gram = np.dot(X_keep.T, X_keep)
- eye = np.eye(gram.shape[0], dtype=X.dtype)
- sigma_inv = lambda_[keep_lambda] * eye + alpha_ * gram
- sigma_ = pinvh(sigma_inv)
- return sigma_
- def predict(self, X, return_std=False):
- """Predict using the linear model.
- In addition to the mean of the predictive distribution, also its
- standard deviation can be returned.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Samples.
- return_std : bool, default=False
- Whether to return the standard deviation of posterior prediction.
- Returns
- -------
- y_mean : array-like of shape (n_samples,)
- Mean of predictive distribution of query points.
- y_std : array-like of shape (n_samples,)
- Standard deviation of predictive distribution of query points.
- """
- y_mean = self._decision_function(X)
- if not return_std:
- return y_mean
- else:
- X = X[:, self.lambda_ < self.threshold_lambda]
- sigmas_squared_data = (np.dot(X, self.sigma_) * X).sum(axis=1)
- y_std = np.sqrt(sigmas_squared_data + (1.0 / self.alpha_))
- return y_mean, y_std
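- # A short end-to-end sketch (hypothetical data; not part of the module):
- #   import numpy as np
- #   from sklearn.linear_model import ARDRegression
- #   rng = np.random.RandomState(0)
- #   X = rng.randn(50, 10)
- #   y = X[:, 0] - 2.0 * X[:, 1] + 0.1 * rng.randn(50)
- #   reg = ARDRegression().fit(X, y)
- #   y_mean, y_std = reg.predict(X[:5], return_std=True)
- # Irrelevant features typically end up with lambda_ above threshold_lambda
- # and their coefficients are pruned to exactly zero.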