  1. """Gaussian processes regression."""
  2. # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
  3. # Modified by: Pete Green <p.l.green@liverpool.ac.uk>
  4. # License: BSD 3 clause
  5. import warnings
  6. from numbers import Integral, Real
  7. from operator import itemgetter
  8. import numpy as np
  9. import scipy.optimize
  10. from scipy.linalg import cho_solve, cholesky, solve_triangular
  11. from ..base import BaseEstimator, MultiOutputMixin, RegressorMixin, _fit_context, clone
  12. from ..preprocessing._data import _handle_zeros_in_scale
  13. from ..utils import check_random_state
  14. from ..utils._param_validation import Interval, StrOptions
  15. from ..utils.optimize import _check_optimize_result
  16. from .kernels import RBF, Kernel
  17. from .kernels import ConstantKernel as C
  18. GPR_CHOLESKY_LOWER = True


class GaussianProcessRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
    """Gaussian process regression (GPR).

    The implementation is based on Algorithm 2.1 of [RW2006]_.

    In addition to standard scikit-learn estimator API,
    :class:`GaussianProcessRegressor`:

       * allows prediction without prior fitting (based on the GP prior)
       * provides an additional method `sample_y(X)`, which evaluates samples
         drawn from the GPR (prior or posterior) at given inputs
       * exposes a method `log_marginal_likelihood(theta)`, which can be used
         externally for other ways of selecting hyperparameters, e.g., via
         Markov chain Monte Carlo.

    To learn the difference between a point-estimate approach vs. a more
    Bayesian modelling approach, refer to the example entitled
    :ref:`sphx_glr_auto_examples_gaussian_process_plot_compare_gpr_krr.py`.

    Read more in the :ref:`User Guide <gaussian_process>`.

    .. versionadded:: 0.18

    Parameters
    ----------
    kernel : kernel instance, default=None
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel ``ConstantKernel(1.0, constant_value_bounds="fixed")
        * RBF(1.0, length_scale_bounds="fixed")`` is used as default. Note that
        the kernel hyperparameters are optimized during fitting unless the
        bounds are marked as "fixed".

    alpha : float or ndarray of shape (n_samples,), default=1e-10
        Value added to the diagonal of the kernel matrix during fitting.
        This can prevent a potential numerical issue during fitting, by
        ensuring that the calculated values form a positive definite matrix.
        It can also be interpreted as the variance of additional Gaussian
        measurement noise on the training observations. Note that this is
        different from using a `WhiteKernel`. If an array is passed, it must
        have the same number of entries as the data used for fitting and is
        used as datapoint-dependent noise level. Allowing to specify the
        noise level directly as a parameter is mainly for convenience and
        for consistency with :class:`~sklearn.linear_model.Ridge`.

    optimizer : "fmin_l_bfgs_b", callable or None, default="fmin_l_bfgs_b"
        Can either be one of the internally supported optimizers for optimizing
        the kernel's parameters, specified by a string, or an externally
        defined optimizer passed as a callable. If a callable is passed, it
        must have the signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func': the objective function to be minimized, which
                #   takes the hyperparameters theta as a parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can be
                #   used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        Per default, the L-BFGS-B algorithm from `scipy.optimize.minimize`
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are: `{'fmin_l_bfgs_b'}`.
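
        As an illustration only (a sketch, not part of the scikit-learn API;
        ``custom_optimizer`` and ``kernel`` are placeholder names), a callable
        could wrap another bounded minimizer from `scipy.optimize.minimize`::

            from scipy.optimize import minimize

            def custom_optimizer(obj_func, initial_theta, bounds):
                # obj_func returns (value, gradient) when eval_gradient=True,
                # which is what jac=True expects below.
                res = minimize(
                    obj_func, initial_theta, method="TNC", jac=True, bounds=bounds
                )
                return res.x, res.fun

            gpr = GaussianProcessRegressor(kernel=kernel, optimizer=custom_optimizer)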

    n_restarts_optimizer : int, default=0
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that `n_restarts_optimizer == 0` implies that one
        run is performed.

    normalize_y : bool, default=False
        Whether or not to normalize the target values `y` by removing the mean
        and scaling to unit-variance. This is recommended for cases where
        zero-mean, unit-variance priors are used. Note that, in this
        implementation, the normalisation is reversed before the GP predictions
        are reported.

        .. versionchanged:: 0.23

    copy_X_train : bool, default=True
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    n_targets : int, default=None
        The number of dimensions of the target values. Used to decide the number
        of outputs when sampling from the prior distributions (i.e. calling
        :meth:`sample_y` before :meth:`fit`). This parameter is ignored once
        :meth:`fit` has been called.

        .. versionadded:: 1.3

    random_state : int, RandomState instance or None, default=None
        Determines random number generation used to draw the initial
        hyperparameter values for the optimizer restarts.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    X_train_ : array-like of shape (n_samples, n_features) or list of object
        Feature vectors or other representations of training data (also
        required for prediction).

    y_train_ : array-like of shape (n_samples,) or (n_samples, n_targets)
        Target values in training data (also required for prediction).

    kernel_ : kernel instance
        The kernel used for prediction. The structure of the kernel is the
        same as the one passed as parameter but with optimized hyperparameters.

    L_ : array-like of shape (n_samples, n_samples)
        Lower-triangular Cholesky decomposition of the kernel in ``X_train_``.

    alpha_ : array-like of shape (n_samples,)
        Dual coefficients of training data points in kernel space.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of ``self.kernel_.theta``.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    GaussianProcessClassifier : Gaussian process classification (GPC)
        based on Laplace approximation.

    References
    ----------
    .. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams,
       "Gaussian Processes for Machine Learning",
       MIT Press 2006 <https://www.gaussianprocess.org/gpml/chapters/RW.pdf>`_

    Examples
    --------
    >>> from sklearn.datasets import make_friedman2
    >>> from sklearn.gaussian_process import GaussianProcessRegressor
    >>> from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
    >>> X, y = make_friedman2(n_samples=500, noise=0, random_state=0)
    >>> kernel = DotProduct() + WhiteKernel()
    >>> gpr = GaussianProcessRegressor(kernel=kernel,
    ...         random_state=0).fit(X, y)
    >>> gpr.score(X, y)
    0.3680...
    >>> gpr.predict(X[:2,:], return_std=True)
    (array([653.0..., 592.1...]), array([316.6..., 316.6...]))
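
    For illustration, a couple of further calls on the fitted ``gpr`` above
    (a usage sketch added here, not part of the original example):

    >>> gpr.sample_y(X[:2], n_samples=3, random_state=0).shape
    (2, 3)
    >>> lml = gpr.log_marginal_likelihood(gpr.kernel_.theta)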
  149. """

    _parameter_constraints: dict = {
        "kernel": [None, Kernel],
        "alpha": [Interval(Real, 0, None, closed="left"), np.ndarray],
        "optimizer": [StrOptions({"fmin_l_bfgs_b"}), callable, None],
        "n_restarts_optimizer": [Interval(Integral, 0, None, closed="left")],
        "normalize_y": ["boolean"],
        "copy_X_train": ["boolean"],
        "n_targets": [Interval(Integral, 1, None, closed="left"), None],
        "random_state": ["random_state"],
    }

    def __init__(
        self,
        kernel=None,
        *,
        alpha=1e-10,
        optimizer="fmin_l_bfgs_b",
        n_restarts_optimizer=0,
        normalize_y=False,
        copy_X_train=True,
        n_targets=None,
        random_state=None,
    ):
        self.kernel = kernel
        self.alpha = alpha
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.normalize_y = normalize_y
        self.copy_X_train = copy_X_train
        self.n_targets = n_targets
        self.random_state = random_state

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit Gaussian process regression model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Feature vectors or other representations of training data.

        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values.

        Returns
        -------
        self : object
            GaussianProcessRegressor class instance.
        """
        if self.kernel is None:  # Use an RBF kernel as default
            self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF(
                1.0, length_scale_bounds="fixed"
            )
        else:
            self.kernel_ = clone(self.kernel)

        self._rng = check_random_state(self.random_state)

        if self.kernel_.requires_vector_input:
            dtype, ensure_2d = "numeric", True
        else:
            dtype, ensure_2d = None, False

        X, y = self._validate_data(
            X,
            y,
            multi_output=True,
            y_numeric=True,
            ensure_2d=ensure_2d,
            dtype=dtype,
        )

        n_targets_seen = y.shape[1] if y.ndim > 1 else 1
        if self.n_targets is not None and n_targets_seen != self.n_targets:
            raise ValueError(
                "The number of targets seen in `y` is different from the parameter "
                f"`n_targets`. Got {n_targets_seen} != {self.n_targets}."
            )

        # Normalize target value
        if self.normalize_y:
            self._y_train_mean = np.mean(y, axis=0)
            self._y_train_std = _handle_zeros_in_scale(np.std(y, axis=0), copy=False)

            # Remove mean and make unit variance
            y = (y - self._y_train_mean) / self._y_train_std
        else:
            shape_y_stats = (y.shape[1],) if y.ndim == 2 else 1
            self._y_train_mean = np.zeros(shape=shape_y_stats)
            self._y_train_std = np.ones(shape=shape_y_stats)

        if np.iterable(self.alpha) and self.alpha.shape[0] != y.shape[0]:
            if self.alpha.shape[0] == 1:
                self.alpha = self.alpha[0]
            else:
                raise ValueError(
                    "alpha must be a scalar or an array with same number of "
                    f"entries as y. ({self.alpha.shape[0]} != {y.shape[0]})"
                )

        self.X_train_ = np.copy(X) if self.copy_X_train else X
        self.y_train_ = np.copy(y) if self.copy_X_train else y

        if self.optimizer is not None and self.kernel_.n_dims > 0:
            # Choose hyperparameters based on maximizing the log-marginal
            # likelihood (potentially starting from several initial values)
            def obj_func(theta, eval_gradient=True):
                if eval_gradient:
                    lml, grad = self.log_marginal_likelihood(
                        theta, eval_gradient=True, clone_kernel=False
                    )
                    return -lml, -grad
                else:
                    return -self.log_marginal_likelihood(theta, clone_kernel=False)

            # First optimize starting from theta specified in kernel
            optima = [
                (
                    self._constrained_optimization(
                        obj_func, self.kernel_.theta, self.kernel_.bounds
                    )
                )
            ]

            # Additional runs are performed from log-uniform chosen initial
            # theta
            if self.n_restarts_optimizer > 0:
                if not np.isfinite(self.kernel_.bounds).all():
                    raise ValueError(
                        "Multiple optimizer restarts (n_restarts_optimizer>0) "
                        "requires that all bounds are finite."
                    )
                bounds = self.kernel_.bounds
                for iteration in range(self.n_restarts_optimizer):
                    theta_initial = self._rng.uniform(bounds[:, 0], bounds[:, 1])
                    optima.append(
                        self._constrained_optimization(obj_func, theta_initial, bounds)
                    )
            # Select result from run with minimal (negative) log-marginal
            # likelihood
            lml_values = list(map(itemgetter(1), optima))
            self.kernel_.theta = optima[np.argmin(lml_values)][0]
            self.kernel_._check_bounds_params()

            self.log_marginal_likelihood_value_ = -np.min(lml_values)
        else:
            self.log_marginal_likelihood_value_ = self.log_marginal_likelihood(
                self.kernel_.theta, clone_kernel=False
            )

        # Precompute quantities required for predictions which are independent
        # of actual query points
        # Alg. 2.1, page 19, line 2 -> L = cholesky(K + sigma^2 I)
        K = self.kernel_(self.X_train_)
        K[np.diag_indices_from(K)] += self.alpha
        try:
            self.L_ = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False)
        except np.linalg.LinAlgError as exc:
            exc.args = (
                (
                    f"The kernel, {self.kernel_}, is not returning a positive "
                    "definite matrix. Try gradually increasing the 'alpha' "
                    "parameter of your GaussianProcessRegressor estimator."
                ),
            ) + exc.args
            raise
        # Alg 2.1, page 19, line 3 -> alpha = L^T \ (L \ y)
        self.alpha_ = cho_solve(
            (self.L_, GPR_CHOLESKY_LOWER),
            self.y_train_,
            check_finite=False,
        )
        return self

    def predict(self, X, return_std=False, return_cov=False):
        """Predict using the Gaussian process regression model.

        We can also predict based on an unfitted model by using the GP prior.
        In addition to the mean of the predictive distribution, optionally also
        returns its standard deviation (`return_std=True`) or covariance
        (`return_cov=True`). Note that at most one of the two can be requested.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Query points where the GP is evaluated.

        return_std : bool, default=False
            If True, the standard-deviation of the predictive distribution at
            the query points is returned along with the mean.

        return_cov : bool, default=False
            If True, the covariance of the joint predictive distribution at
            the query points is returned along with the mean.

        Returns
        -------
        y_mean : ndarray of shape (n_samples,) or (n_samples, n_targets)
            Mean of predictive distribution at query points.

        y_std : ndarray of shape (n_samples,) or (n_samples, n_targets), optional
            Standard deviation of predictive distribution at query points.
            Only returned when `return_std` is True.

        y_cov : ndarray of shape (n_samples, n_samples) or \
                (n_samples, n_samples, n_targets), optional
            Covariance of joint predictive distribution at query points.
            Only returned when `return_cov` is True.
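
        As a usage sketch (assuming a fitted estimator ``gpr`` and a 2-D query
        array ``X_query``; these names are illustrative only), the full
        predictive covariance can be requested instead of the standard
        deviation::

            y_mean, y_cov = gpr.predict(X_query, return_cov=True)
            # for single-target data, y_cov has shape (n_samples, n_samples)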
  332. """
        if return_std and return_cov:
            raise RuntimeError(
                "At most one of return_std or return_cov can be requested."
            )

        if self.kernel is None or self.kernel.requires_vector_input:
            dtype, ensure_2d = "numeric", True
        else:
            dtype, ensure_2d = None, False

        X = self._validate_data(X, ensure_2d=ensure_2d, dtype=dtype, reset=False)

        if not hasattr(self, "X_train_"):  # Unfitted; predict based on GP prior
            if self.kernel is None:
                kernel = C(1.0, constant_value_bounds="fixed") * RBF(
                    1.0, length_scale_bounds="fixed"
                )
            else:
                kernel = self.kernel

            n_targets = self.n_targets if self.n_targets is not None else 1
            y_mean = np.zeros(shape=(X.shape[0], n_targets)).squeeze()

            if return_cov:
                y_cov = kernel(X)
                if n_targets > 1:
                    y_cov = np.repeat(
                        np.expand_dims(y_cov, -1), repeats=n_targets, axis=-1
                    )
                return y_mean, y_cov
            elif return_std:
                y_var = kernel.diag(X)
                if n_targets > 1:
                    y_var = np.repeat(
                        np.expand_dims(y_var, -1), repeats=n_targets, axis=-1
                    )
                return y_mean, np.sqrt(y_var)
            else:
                return y_mean
        else:  # Predict based on GP posterior
            # Alg 2.1, page 19, line 4 -> f*_bar = K(X_test, X_train) . alpha
            K_trans = self.kernel_(X, self.X_train_)
            y_mean = K_trans @ self.alpha_

            # undo normalisation
            y_mean = self._y_train_std * y_mean + self._y_train_mean

            # if y_mean has shape (n_samples, 1), reshape to (n_samples,)
            if y_mean.ndim > 1 and y_mean.shape[1] == 1:
                y_mean = np.squeeze(y_mean, axis=1)

            # Alg 2.1, page 19, line 5 -> v = L \ K(X_test, X_train)^T
            V = solve_triangular(
                self.L_, K_trans.T, lower=GPR_CHOLESKY_LOWER, check_finite=False
            )

            if return_cov:
                # Alg 2.1, page 19, line 6 -> K(X_test, X_test) - v^T. v
                y_cov = self.kernel_(X) - V.T @ V

                # undo normalisation
                y_cov = np.outer(y_cov, self._y_train_std**2).reshape(
                    *y_cov.shape, -1
                )

                # if y_cov has shape (n_samples, n_samples, 1), reshape to
                # (n_samples, n_samples)
                if y_cov.shape[2] == 1:
                    y_cov = np.squeeze(y_cov, axis=2)

                return y_mean, y_cov
            elif return_std:
                # Compute variance of predictive distribution
                # Use einsum to avoid explicitly forming the large matrix
                # V^T @ V just to extract its diagonal afterward.
                y_var = self.kernel_.diag(X).copy()
                y_var -= np.einsum("ij,ji->i", V.T, V)

                # Check if any of the variances is negative because of
                # numerical issues. If yes: set the variance to 0.
                y_var_negative = y_var < 0
                if np.any(y_var_negative):
                    warnings.warn(
                        "Predicted variances smaller than 0. "
                        "Setting those variances to 0."
                    )
                    y_var[y_var_negative] = 0.0

                # undo normalisation
                y_var = np.outer(y_var, self._y_train_std**2).reshape(
                    *y_var.shape, -1
                )

                # if y_var has shape (n_samples, 1), reshape to (n_samples,)
                if y_var.shape[1] == 1:
                    y_var = np.squeeze(y_var, axis=1)

                return y_mean, np.sqrt(y_var)
            else:
                return y_mean

    def sample_y(self, X, n_samples=1, random_state=0):
        """Draw samples from Gaussian process and evaluate at X.

        Parameters
        ----------
        X : array-like of shape (n_samples_X, n_features) or list of object
            Query points where the GP is evaluated.

        n_samples : int, default=1
            Number of samples drawn from the Gaussian process per query point.

        random_state : int, RandomState instance or None, default=0
            Determines random number generation to randomly draw samples.
            Pass an int for reproducible results across multiple function
            calls.
            See :term:`Glossary <random_state>`.

        Returns
        -------
        y_samples : ndarray of shape (n_samples_X, n_samples), or \
            (n_samples_X, n_targets, n_samples)
            Values of n_samples samples drawn from Gaussian process and
            evaluated at query points.
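
        As an illustrative sketch (``X_query`` is a placeholder for 2-D query
        points; not part of the tested examples), samples can also be drawn
        from the prior of an unfitted model::

            gpr = GaussianProcessRegressor(n_targets=2)
            prior_samples = gpr.sample_y(X_query, n_samples=5)
            # prior_samples has shape (n_samples_X, 2, 5) since n_targets=2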
  436. """
        rng = check_random_state(random_state)

        y_mean, y_cov = self.predict(X, return_cov=True)
        if y_mean.ndim == 1:
            y_samples = rng.multivariate_normal(y_mean, y_cov, n_samples).T
        else:
            y_samples = [
                rng.multivariate_normal(
                    y_mean[:, target], y_cov[..., target], n_samples
                ).T[:, np.newaxis]
                for target in range(y_mean.shape[1])
            ]
            y_samples = np.hstack(y_samples)
        return y_samples

    def log_marginal_likelihood(
        self, theta=None, eval_gradient=False, clone_kernel=True
    ):
        """Return log-marginal likelihood of theta for training data.

        Parameters
        ----------
        theta : array-like of shape (n_kernel_params,), default=None
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. If None, the precomputed log_marginal_likelihood
            of ``self.kernel_.theta`` is returned.

        eval_gradient : bool, default=False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. If True, theta must not be None.

        clone_kernel : bool, default=True
            If True, the kernel attribute is copied. If False, the kernel
            attribute is modified, but may result in a performance improvement.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when eval_gradient is True.
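
        As a usage sketch (assuming a fitted ``gpr``; not part of the tested
        examples), the log-marginal likelihood can be re-evaluated at perturbed
        hyperparameters, e.g. inside an external sampler::

            theta = gpr.kernel_.theta  # log-transformed hyperparameters
            lml = gpr.log_marginal_likelihood(theta + 0.1)
            lml, grad = gpr.log_marginal_likelihood(theta, eval_gradient=True)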
  475. """
        if theta is None:
            if eval_gradient:
                raise ValueError("Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        if clone_kernel:
            kernel = self.kernel_.clone_with_theta(theta)
        else:
            kernel = self.kernel_
            kernel.theta = theta

        if eval_gradient:
            K, K_gradient = kernel(self.X_train_, eval_gradient=True)
        else:
            K = kernel(self.X_train_)

        # Alg. 2.1, page 19, line 2 -> L = cholesky(K + sigma^2 I)
        K[np.diag_indices_from(K)] += self.alpha
        try:
            L = cholesky(K, lower=GPR_CHOLESKY_LOWER, check_finite=False)
        except np.linalg.LinAlgError:
            return (-np.inf, np.zeros_like(theta)) if eval_gradient else -np.inf

        # Support multi-dimensional output of self.y_train_
        y_train = self.y_train_
        if y_train.ndim == 1:
            y_train = y_train[:, np.newaxis]

        # Alg 2.1, page 19, line 3 -> alpha = L^T \ (L \ y)
        alpha = cho_solve((L, GPR_CHOLESKY_LOWER), y_train, check_finite=False)

        # Alg 2.1, page 19, line 7
        # -0.5 . y^T . alpha - sum(log(diag(L))) - n_samples / 2 log(2*pi)
        # y is originally thought to be a (1, n_samples) row vector. However,
        # in multioutputs, y is of shape (n_samples, n_outputs) and we need to
        # compute y^T . alpha for each output, independently using einsum.
        # Thus, it is equivalent to:
        # for output_idx in range(n_outputs):
        #     log_likelihood_dims[output_idx] = (
        #         y_train[:, [output_idx]] @ alpha[:, [output_idx]]
        #     )
        log_likelihood_dims = -0.5 * np.einsum("ik,ik->k", y_train, alpha)
        log_likelihood_dims -= np.log(np.diag(L)).sum()
        log_likelihood_dims -= K.shape[0] / 2 * np.log(2 * np.pi)
        # the log likelihood is summed over the outputs
        log_likelihood = log_likelihood_dims.sum(axis=-1)

        if eval_gradient:
            # Eq. 5.9, p. 114, and footnote 5 in p. 114
            # 0.5 * trace((alpha . alpha^T - K^-1) . K_gradient)
            # alpha is supposed to be a vector of (n_samples,) elements. With
            # multioutputs, alpha is a matrix of size (n_samples, n_outputs).
            # Therefore, we want to construct a matrix of
            # (n_samples, n_samples, n_outputs) equivalent to
            # for output_idx in range(n_outputs):
            #     output_alpha = alpha[:, [output_idx]]
            #     inner_term[..., output_idx] = output_alpha @ output_alpha.T
            inner_term = np.einsum("ik,jk->ijk", alpha, alpha)
            # compute K^-1 of shape (n_samples, n_samples)
            K_inv = cho_solve(
                (L, GPR_CHOLESKY_LOWER), np.eye(K.shape[0]), check_finite=False
            )
            # create a new axis to use broadcasting between inner_term and
            # K_inv
            inner_term -= K_inv[..., np.newaxis]
            # Since we are interested in the trace of
            # inner_term @ K_gradient, we don't explicitly compute the
            # matrix-by-matrix operation and instead use an einsum. Therefore
            # it is equivalent to:
            # for param_idx in range(n_kernel_params):
            #     for output_idx in range(n_outputs):
            #         log_likelihood_gradient_dims[param_idx, output_idx] = (
            #             inner_term[..., output_idx] @
            #             K_gradient[..., param_idx]
            #         )
            log_likelihood_gradient_dims = 0.5 * np.einsum(
                "ijl,jik->kl", inner_term, K_gradient
            )
            # the log likelihood gradient is summed over the outputs
            log_likelihood_gradient = log_likelihood_gradient_dims.sum(axis=-1)

        if eval_gradient:
            return log_likelihood, log_likelihood_gradient
        else:
            return log_likelihood

    def _constrained_optimization(self, obj_func, initial_theta, bounds):
        if self.optimizer == "fmin_l_bfgs_b":
            opt_res = scipy.optimize.minimize(
                obj_func,
                initial_theta,
                method="L-BFGS-B",
                jac=True,
                bounds=bounds,
            )
            _check_optimize_result("lbfgs", opt_res)
            theta_opt, func_min = opt_res.x, opt_res.fun
        elif callable(self.optimizer):
            theta_opt, func_min = self.optimizer(obj_func, initial_theta, bounds=bounds)
        else:
            raise ValueError(f"Unknown optimizer {self.optimizer}.")

        return theta_opt, func_min

    def _more_tags(self):
        return {"requires_fit": False}