- """Gaussian processes regression."""
- # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
- # Modified by: Pete Green <p.l.green@liverpool.ac.uk>
- # License: BSD 3 clause
- import warnings
- from numbers import Integral, Real
- from operator import itemgetter
- import numpy as np
- import scipy.optimize
- from scipy.linalg import cho_solve, cholesky, solve_triangular
- from ..base import BaseEstimator, MultiOutputMixin, RegressorMixin, _fit_context, clone
- from ..preprocessing._data import _handle_zeros_in_scale
- from ..utils import check_random_state
- from ..utils._param_validation import Interval, StrOptions
- from ..utils.optimize import _check_optimize_result
- from .kernels import RBF, Kernel
- from .kernels import ConstantKernel as C
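
# The Cholesky factor of the kernel matrix is computed and consumed as a
# lower-triangular matrix throughout this module; this constant is passed as
# the `lower` argument of `cholesky`, `cho_solve` and `solve_triangular`.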
GPR_CHOLESKY_LOWER = True


class GaussianProcessRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
    """Gaussian process regression (GPR).

    The implementation is based on Algorithm 2.1 of [RW2006]_.

    In addition to standard scikit-learn estimator API,
    :class:`GaussianProcessRegressor`:

       * allows prediction without prior fitting (based on the GP prior)
       * provides an additional method `sample_y(X)`, which evaluates samples
         drawn from the GPR (prior or posterior) at given inputs
       * exposes a method `log_marginal_likelihood(theta)`, which can be used
         externally for other ways of selecting hyperparameters, e.g., via
         Markov chain Monte Carlo.

    To learn the difference between a point-estimate approach vs. a more
    Bayesian modelling approach, refer to the example entitled
    :ref:`sphx_glr_auto_examples_gaussian_process_plot_compare_gpr_krr.py`.

    Read more in the :ref:`User Guide <gaussian_process>`.

    .. versionadded:: 0.18

    Parameters
    ----------
    kernel : kernel instance, default=None
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel ``ConstantKernel(1.0, constant_value_bounds="fixed")
        * RBF(1.0, length_scale_bounds="fixed")`` is used as default. Note that
        the kernel hyperparameters are optimized during fitting unless the
        bounds are marked as "fixed".

    alpha : float or ndarray of shape (n_samples,), default=1e-10
        Value added to the diagonal of the kernel matrix during fitting.
        This can prevent a potential numerical issue during fitting, by
        ensuring that the calculated values form a positive definite matrix.
        It can also be interpreted as the variance of additional Gaussian
        measurement noise on the training observations. Note that this is
        different from using a `WhiteKernel`. If an array is passed, it must
        have the same number of entries as the data used for fitting and is
        used as a datapoint-dependent noise level. Allowing the noise level to
        be specified directly as a parameter is mainly for convenience and
        for consistency with :class:`~sklearn.linear_model.Ridge`.
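
        For illustration, a per-observation noise level could be passed
        directly as an array (a sketch; `kernel` and `y_err` are placeholders
        for a kernel instance and an array of known per-observation noise
        standard deviations)::

            gpr = GaussianProcessRegressor(kernel=kernel, alpha=y_err**2)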

    optimizer : "fmin_l_bfgs_b", callable or None, default="fmin_l_bfgs_b"
        Can either be one of the internally supported optimizers for optimizing
        the kernel's parameters, specified by a string, or an externally
        defined optimizer passed as a callable. If a callable is passed, it
        must have the signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func': the objective function to be minimized, which
                #   takes the hyperparameters theta as a parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can be
                #   used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        Per default, the L-BFGS-B algorithm from `scipy.optimize.minimize`
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are: `{'fmin_l_bfgs_b'}`.
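
        For illustration, a callable optimizer could be a thin wrapper around
        `scipy.optimize.minimize` (a sketch of one possible implementation,
        close to what the built-in `"fmin_l_bfgs_b"` option does; the name
        `custom_optimizer` is arbitrary)::

            import scipy.optimize

            def custom_optimizer(obj_func, initial_theta, bounds):
                # minimize the negative log-marginal likelihood, using the
                # analytical gradient returned by obj_func (jac=True)
                res = scipy.optimize.minimize(
                    obj_func, initial_theta, method="L-BFGS-B", jac=True, bounds=bounds
                )
                return res.x, res.fun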

    n_restarts_optimizer : int, default=0
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that `n_restarts_optimizer == 0` implies that one
        run is performed.

    normalize_y : bool, default=False
        Whether or not to normalize the target values `y` by removing the mean
        and scaling to unit-variance. This is recommended for cases where
        zero-mean, unit-variance priors are used. Note that, in this
        implementation, the normalisation is reversed before the GP predictions
        are reported.

        .. versionchanged:: 0.23

    copy_X_train : bool, default=True
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    n_targets : int, default=None
        The number of dimensions of the target values. Used to decide the number
        of outputs when sampling from the prior distributions (i.e. calling
        :meth:`sample_y` before :meth:`fit`). This parameter is ignored once
        :meth:`fit` has been called.

        .. versionadded:: 1.3

    random_state : int, RandomState instance or None, default=None
        Determines random number generation used to draw the initial
        hyperparameter values when the optimizer is restarted
        (`n_restarts_optimizer > 0`).
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    X_train_ : array-like of shape (n_samples, n_features) or list of object
        Feature vectors or other representations of training data (also
        required for prediction).

    y_train_ : array-like of shape (n_samples,) or (n_samples, n_targets)
        Target values in training data (also required for prediction).

    kernel_ : kernel instance
        The kernel used for prediction. The structure of the kernel is the
        same as the one passed as parameter but with optimized hyperparameters.

    L_ : array-like of shape (n_samples, n_samples)
        Lower-triangular Cholesky decomposition of the kernel in ``X_train_``.

    alpha_ : array-like of shape (n_samples,)
        Dual coefficients of training data points in kernel space.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of ``self.kernel_.theta``.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    GaussianProcessClassifier : Gaussian process classification (GPC)
        based on Laplace approximation.

    References
    ----------
    .. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams,
       "Gaussian Processes for Machine Learning",
       MIT Press 2006 <https://www.gaussianprocess.org/gpml/chapters/RW.pdf>`_

    Examples
    --------
    >>> from sklearn.datasets import make_friedman2
    >>> from sklearn.gaussian_process import GaussianProcessRegressor
    >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
    >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)
    >>> kernel = DotProduct() + WhiteKernel()
    >>> gpr = GaussianProcessRegressor(kernel=kernel,
    ...         random_state=0).fit(X, y)
    >>> gpr.score(X, y)
    0.3680...
    >>> gpr.predict(X[:2,:], return_std=True)
    (array([653.0..., 592.1...]), array([316.6..., 316.6...]))
    """

    _parameter_constraints: dict = {
        "kernel": [None, Kernel],
        "alpha": [Interval(Real, 0, None, closed="left"), np.ndarray],
        "optimizer": [StrOptions({"fmin_l_bfgs_b"}), callable, None],
        "n_restarts_optimizer": [Interval(Integral, 0, None, closed="left")],
        "normalize_y": ["boolean"],
        "copy_X_train": ["boolean"],
        "n_targets": [Interval(Integral, 1, None, closed="left"), None],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        kernel=None,
        *,
        alpha=1e-10,
        optimizer="fmin_l_bfgs_b",
        n_restarts_optimizer=0,
        normalize_y=False,
        copy_X_train=True,
        n_targets=None,
        random_state=None,
    ):
        self.kernel = kernel
        self.alpha = alpha
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.normalize_y = normalize_y
        self.copy_X_train = copy_X_train
        self.n_targets = n_targets
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit Gaussian process regression model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Feature vectors or other representations of training data.

        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values.

        Returns
        -------
        self : object
            GaussianProcessRegressor class instance.
        """
        if self.kernel is None:  # Use an RBF kernel as default
            self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF(
                1.0, length_scale_bounds="fixed"
            )
        else:
            self.kernel_ = clone(self.kernel)

        self._rng = check_random_state(self.random_state)

        if self.kernel_.requires_vector_input:
            dtype, ensure_2d = "numeric", True
        else:
            dtype, ensure_2d = None, False
        X, y = self._validate_data(
            X,
            y,
            multi_output=True,
            y_numeric=True,
            ensure_2d=ensure_2d,
            dtype=dtype,
        )

        n_targets_seen = y.shape[1] if y.ndim > 1 else 1
        if self.n_targets is not None and n_targets_seen != self.n_targets:
            raise ValueError(
                "The number of targets seen in `y` is different from the parameter "
                f"`n_targets`. Got {n_targets_seen} != {self.n_targets}."
            )

        # Normalize target value
        if self.normalize_y:
            self._y_train_mean = np.mean(y, axis=0)
            self._y_train_std = _handle_zeros_in_scale(np.std(y, axis=0), copy=False)

            # Remove mean and make unit variance
            y = (y - self._y_train_mean) / self._y_train_std

        else:
            shape_y_stats = (y.shape[1],) if y.ndim == 2 else 1
            self._y_train_mean = np.zeros(shape=shape_y_stats)
            self._y_train_std = np.ones(shape=shape_y_stats)

        if np.iterable(self.alpha) and self.alpha.shape[0] != y.shape[0]:
            if self.alpha.shape[0] == 1:
                self.alpha = self.alpha[0]
            else:
                raise ValueError(
                    "alpha must be a scalar or an array with same number of "
                    f"entries as y. ({self.alpha.shape[0]} != {y.shape[0]})"
                )

        self.X_train_ = np.copy(X) if self.copy_X_train else X
        self.y_train_ = np.copy(y) if self.copy_X_train else y

        if self.optimizer is not None and self.kernel_.n_dims > 0:
            # Choose hyperparameters based on maximizing the log-marginal
            # likelihood (potentially starting from several initial values)
            def obj_func(theta, eval_gradient=True):
                if eval_gradient:
                    lml, grad = self.log_marginal_likelihood(
                        theta, eval_gradient=True, clone_kernel=False
                    )
                    return -lml, -grad
                else:
                    return -self.log_marginal_likelihood(theta, clone_kernel=False)

            # First optimize starting from theta specified in kernel
            optima = [
                (
                    self._constrained_optimization(
                        obj_func, self.kernel_.theta, self.kernel_.bounds
                    )
                )
            ]

            # Additional runs are performed from log-uniform chosen initial
            # theta
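            # (the kernel bounds are expressed in log-transformed space, so a
            # uniform draw within them corresponds to a log-uniform draw of
            # the hyperparameters themselves)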
            if self.n_restarts_optimizer > 0:
                if not np.isfinite(self.kernel_.bounds).all():
                    raise ValueError(
                        "Multiple optimizer restarts (n_restarts_optimizer>0) "
                        "requires that all bounds are finite."
                    )
                bounds = self.kernel_.bounds
                for iteration in range(self.n_restarts_optimizer):
                    theta_initial = self._rng.uniform(bounds[:, 0], bounds[:, 1])
                    optima.append(
                        self._constrained_optimization(obj_func, theta_initial, bounds)
                    )
            # Select result from run with minimal (negative) log-marginal
            # likelihood
            lml_values = list(map(itemgetter(1), optima))
            self.kernel_.theta = optima[np.argmin(lml_values)][0]
            self.kernel_._check_bounds_params()

            self.log_marginal_likelihood_value_ = -np.min(lml_values)
        else:
            self.log_marginal_likelihood_value_ = self.log_marginal_likelihood(
                self.kernel_.theta, clone_kernel=False
            )

        # Precompute quantities required for predictions which are independent
        # of actual query points
        # Alg. 2.1, page 19, line 2 -> L = cholesky(K + sigma^2 I)
        K = self.kernel_(self.X_train_)
        K[np.diag_indices_from(K)] += self.alpha
        try:
            self.L_ = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False)
        except np.linalg.LinAlgError as exc:
            exc.args = (
                (
                    f"The kernel, {self.kernel_}, is not returning a positive "
                    "definite matrix. Try gradually increasing the 'alpha' "
                    "parameter of your GaussianProcessRegressor estimator."
                ),
            ) + exc.args
            raise
        # Alg 2.1, page 19, line 3 -> alpha = L^T \ (L \ y)
        self.alpha_ = cho_solve(
            (self.L_, GPR_CHOLESKY_LOWER),
            self.y_train_,
            check_finite=False,
        )
        return self

    def predict(self, X, return_std=False, return_cov=False):
        """Predict using the Gaussian process regression model.

        We can also predict based on an unfitted model by using the GP prior.
        In addition to the mean of the predictive distribution, optionally also
        returns its standard deviation (`return_std=True`) or covariance
        (`return_cov=True`). Note that at most one of the two can be requested.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Query points where the GP is evaluated.

        return_std : bool, default=False
            If True, the standard-deviation of the predictive distribution at
            the query points is returned along with the mean.

        return_cov : bool, default=False
            If True, the covariance of the joint predictive distribution at
            the query points is returned along with the mean.

        Returns
        -------
        y_mean : ndarray of shape (n_samples,) or (n_samples, n_targets)
            Mean of predictive distribution at query points.

        y_std : ndarray of shape (n_samples,) or (n_samples, n_targets), optional
            Standard deviation of predictive distribution at query points.
            Only returned when `return_std` is True.

        y_cov : ndarray of shape (n_samples, n_samples) or \
                (n_samples, n_samples, n_targets), optional
            Covariance of joint predictive distribution at query points.
            Only returned when `return_cov` is True.
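
        A typical call on a fitted estimator requests the mean together with
        the standard deviation (a sketch; `gpr` and `X_test` stand for any
        fitted :class:`GaussianProcessRegressor` and query points)::

            y_mean, y_std = gpr.predict(X_test, return_std=True)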
- """
- if return_std and return_cov:
- raise RuntimeError(
- "At most one of return_std or return_cov can be requested."
- )
- if self.kernel is None or self.kernel.requires_vector_input:
- dtype, ensure_2d = "numeric", True
- else:
- dtype, ensure_2d = None, False
- X = self._validate_data(X, ensure_2d=ensure_2d, dtype=dtype, reset=False)

        if not hasattr(self, "X_train_"):  # Unfitted; predict based on GP prior
            if self.kernel is None:
                kernel = C(1.0, constant_value_bounds="fixed") * RBF(
                    1.0, length_scale_bounds="fixed"
                )
            else:
                kernel = self.kernel

            n_targets = self.n_targets if self.n_targets is not None else 1
            y_mean = np.zeros(shape=(X.shape[0], n_targets)).squeeze()

            if return_cov:
                y_cov = kernel(X)
                if n_targets > 1:
                    y_cov = np.repeat(
                        np.expand_dims(y_cov, -1), repeats=n_targets, axis=-1
                    )
                return y_mean, y_cov
            elif return_std:
                y_var = kernel.diag(X)
                if n_targets > 1:
                    y_var = np.repeat(
                        np.expand_dims(y_var, -1), repeats=n_targets, axis=-1
                    )
                return y_mean, np.sqrt(y_var)
            else:
                return y_mean
        else:  # Predict based on GP posterior
            # Alg 2.1, page 19, line 4 -> f*_bar = K(X_test, X_train) . alpha
            K_trans = self.kernel_(X, self.X_train_)
            y_mean = K_trans @ self.alpha_

            # undo normalisation
            y_mean = self._y_train_std * y_mean + self._y_train_mean

            # if y_mean has shape (n_samples, 1), reshape to (n_samples,)
            if y_mean.ndim > 1 and y_mean.shape[1] == 1:
                y_mean = np.squeeze(y_mean, axis=1)

            # Alg 2.1, page 19, line 5 -> v = L \ K(X_test, X_train)^T
            V = solve_triangular(
                self.L_, K_trans.T, lower=GPR_CHOLESKY_LOWER, check_finite=False
            )

            if return_cov:
                # Alg 2.1, page 19, line 6 -> K(X_test, X_test) - v^T. v
                y_cov = self.kernel_(X) - V.T @ V

                # undo normalisation
                y_cov = np.outer(y_cov, self._y_train_std**2).reshape(
                    *y_cov.shape, -1
                )
                # if y_cov has shape (n_samples, n_samples, 1), reshape to
                # (n_samples, n_samples)
                if y_cov.shape[2] == 1:
                    y_cov = np.squeeze(y_cov, axis=2)

                return y_mean, y_cov
            elif return_std:
                # Compute variance of predictive distribution
                # Use einsum to avoid explicitly forming the large matrix
                # V^T @ V just to extract its diagonal afterward.
                y_var = self.kernel_.diag(X).copy()
                y_var -= np.einsum("ij,ji->i", V.T, V)

                # Check if any of the variances is negative because of
                # numerical issues. If yes: set the variance to 0.
                y_var_negative = y_var < 0
                if np.any(y_var_negative):
                    warnings.warn(
                        "Predicted variances smaller than 0. "
                        "Setting those variances to 0."
                    )
                    y_var[y_var_negative] = 0.0

                # undo normalisation
                y_var = np.outer(y_var, self._y_train_std**2).reshape(
                    *y_var.shape, -1
                )

                # if y_var has shape (n_samples, 1), reshape to (n_samples,)
                if y_var.shape[1] == 1:
                    y_var = np.squeeze(y_var, axis=1)

                return y_mean, np.sqrt(y_var)
            else:
                return y_mean

    def sample_y(self, X, n_samples=1, random_state=0):
        """Draw samples from Gaussian process and evaluate at X.

        Parameters
        ----------
        X : array-like of shape (n_samples_X, n_features) or list of object
            Query points where the GP is evaluated.

        n_samples : int, default=1
            Number of samples drawn from the Gaussian process per query point.

        random_state : int, RandomState instance or None, default=0
            Determines random number generation to randomly draw samples.
            Pass an int for reproducible results across multiple function
            calls.
            See :term:`Glossary <random_state>`.

        Returns
        -------
        y_samples : ndarray of shape (n_samples_X, n_samples), or \
            (n_samples_X, n_targets, n_samples)
            Values of n_samples samples drawn from Gaussian process and
            evaluated at query points.
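
        Because prediction falls back to the GP prior on an unfitted model,
        this method can also be used to draw prior samples (a sketch; `X_plot`
        stands for any set of query points)::

            prior_samples = GaussianProcessRegressor().sample_y(X_plot, n_samples=5)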
- """
- rng = check_random_state(random_state)
- y_mean, y_cov = self.predict(X, return_cov=True)
- if y_mean.ndim == 1:
- y_samples = rng.multivariate_normal(y_mean, y_cov, n_samples).T
- else:
- y_samples = [
- rng.multivariate_normal(
- y_mean[:, target], y_cov[..., target], n_samples
- ).T[:, np.newaxis]
- for target in range(y_mean.shape[1])
- ]
- y_samples = np.hstack(y_samples)
- return y_samples

    def log_marginal_likelihood(
        self, theta=None, eval_gradient=False, clone_kernel=True
    ):
        """Return log-marginal likelihood of theta for training data.

        Parameters
        ----------
        theta : array-like of shape (n_kernel_params,), default=None
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. If None, the precomputed log_marginal_likelihood
            of ``self.kernel_.theta`` is returned.

        eval_gradient : bool, default=False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. If True, theta must not be None.

        clone_kernel : bool, default=True
            If True, the kernel attribute is copied. If False, the kernel
            attribute is modified, but may result in a performance improvement.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when eval_gradient is True.
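
        For example, the likelihood surface can be inspected externally at any
        hyperparameter vector (a sketch; `gpr` stands for a fitted estimator)::

            lml, grad = gpr.log_marginal_likelihood(
                gpr.kernel_.theta, eval_gradient=True
            )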
- """
- if theta is None:
- if eval_gradient:
- raise ValueError("Gradient can only be evaluated for theta!=None")
- return self.log_marginal_likelihood_value_
- if clone_kernel:
- kernel = self.kernel_.clone_with_theta(theta)
- else:
- kernel = self.kernel_
- kernel.theta = theta
- if eval_gradient:
- K, K_gradient = kernel(self.X_train_, eval_gradient=True)
- else:
- K = kernel(self.X_train_)
- # Alg. 2.1, page 19, line 2 -> L = cholesky(K + sigma^2 I)
- K[np.diag_indices_from(K)] += self.alpha
- try:
- L = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False)
- except np.linalg.LinAlgError:
- return (-np.inf, np.zeros_like(theta)) if eval_gradient else -np.inf
- # Support multi-dimensional output of self.y_train_
- y_train = self.y_train_
- if y_train.ndim == 1:
- y_train = y_train[:, np.newaxis]
- # Alg 2.1, page 19, line 3 -> alpha = L^T \ (L \ y)
- alpha = cho_solve((L, GPR_CHOLESKY_LOWER), y_train, check_finite=False)
- # Alg 2.1, page 19, line 7
- # -0.5 . y^T . alpha - sum(log(diag(L))) - n_samples / 2 log(2*pi)
        # y is originally thought to be a (1, n_samples) row vector. However,
        # in multioutput regression, y is of shape (n_samples, n_outputs) and
        # we need to compute y^T . alpha for each output, independently using
        # einsum. Thus, it is equivalent to:
        # for output_idx in range(n_outputs):
        #     log_likelihood_dims[output_idx] = (
        #         y_train[:, [output_idx]] @ alpha[:, [output_idx]]
        #     )
        log_likelihood_dims = -0.5 * np.einsum("ik,ik->k", y_train, alpha)
        log_likelihood_dims -= np.log(np.diag(L)).sum()
        log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi)
        # the log likelihood is summed up across the outputs
        log_likelihood = log_likelihood_dims.sum(axis=-1)

        if eval_gradient:
            # Eq. 5.9, p. 114, and footnote 5 in p. 114
            # 0.5 * trace((alpha . alpha^T - K^-1) . K_gradient)
            # alpha is supposed to be a vector of (n_samples,) elements. With
            # multioutputs, alpha is a matrix of size (n_samples, n_outputs).
            # Therefore, we want to construct a matrix of
            # (n_samples, n_samples, n_outputs) equivalent to
            # for output_idx in range(n_outputs):
            #     output_alpha = alpha[:, [output_idx]]
            #     inner_term[..., output_idx] = output_alpha @ output_alpha.T
            inner_term = np.einsum("ik,jk->ijk", alpha, alpha)
            # compute K^-1 of shape (n_samples, n_samples)
            K_inv = cho_solve(
                (L, GPR_CHOLESKY_LOWER), np.eye(K.shape[0]), check_finite=False
            )
            # create a new axis to use broadcasting between inner_term and
            # K_inv
            inner_term -= K_inv[..., np.newaxis]
            # Since we are interested in the trace of
            # inner_term @ K_gradient, we don't explicitly compute the
            # matrix-by-matrix operation and instead use an einsum. It is
            # therefore equivalent to:
            # for param_idx in range(n_kernel_params):
            #     for output_idx in range(n_outputs):
            #         log_likelihood_gradient_dims[param_idx, output_idx] = np.trace(
            #             inner_term[..., output_idx] @ K_gradient[..., param_idx]
            #         )
            log_likelihood_gradient_dims = 0.5 * np.einsum(
                "ijl,jik->kl", inner_term, K_gradient
            )
            # the log likelihood gradient is summed up across the outputs
            log_likelihood_gradient = log_likelihood_gradient_dims.sum(axis=-1)

        if eval_gradient:
            return log_likelihood, log_likelihood_gradient
        else:
            return log_likelihood

    def _constrained_optimization(self, obj_func, initial_theta, bounds):
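        # Minimize `obj_func` within `bounds`, either with the built-in
        # L-BFGS-B backend or with a user-supplied callable optimizer, and
        # return the best hyperparameters together with the corresponding
        # objective value.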
        if self.optimizer == "fmin_l_bfgs_b":
            opt_res = scipy.optimize.minimize(
                obj_func,
                initial_theta,
                method="L-BFGS-B",
                jac=True,
                bounds=bounds,
            )
            _check_optimize_result("lbfgs", opt_res)
            theta_opt, func_min = opt_res.x, opt_res.fun
        elif callable(self.optimizer):
            theta_opt, func_min = self.optimizer(obj_func, initial_theta, bounds=bounds)
        else:
            raise ValueError(f"Unknown optimizer {self.optimizer}.")

        return theta_opt, func_min

    def _more_tags(self):
        return {"requires_fit": False}
|