  1. """Gaussian processes classification."""
  2. # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
  3. #
  4. # License: BSD 3 clause
  5. from numbers import Integral
  6. from operator import itemgetter
  7. import numpy as np
  8. import scipy.optimize
  9. from scipy.linalg import cho_solve, cholesky, solve
  10. from scipy.special import erf, expit
  11. from ..base import BaseEstimator, ClassifierMixin, _fit_context, clone
  12. from ..multiclass import OneVsOneClassifier, OneVsRestClassifier
  13. from ..preprocessing import LabelEncoder
  14. from ..utils import check_random_state
  15. from ..utils._param_validation import Interval, StrOptions
  16. from ..utils.optimize import _check_optimize_result
  17. from ..utils.validation import check_is_fitted
  18. from .kernels import RBF, CompoundKernel, Kernel
  19. from .kernels import ConstantKernel as C
  20. # Values required for approximating the logistic sigmoid by
  21. # error functions. coefs are obtained via:
  22. # x = np.array([0, 0.6, 2, 3.5, 4.5, np.inf])
  23. # b = logistic(x)
  24. # A = (erf(np.dot(x, self.lambdas)) + 1) / 2
  25. # coefs = lstsq(A, b)[0]
  26. LAMBDAS = np.array([0.41, 0.4, 0.37, 0.44, 0.39])[:, np.newaxis]
  27. COEFS = np.array(
  28. [-1854.8214151, 3516.89893646, 221.29346712, 128.12323805, -2010.49422654]
  29. )[:, np.newaxis]
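
# For reference, a hedged sketch of how such coefficients could be refit
# (not executed here; `logistic` above corresponds to scipy.special.expit,
# and the outer-product form of A is an assumption about the intended
# broadcasting):
#
#     from scipy.linalg import lstsq
#     x = np.array([0, 0.6, 2, 3.5, 4.5, np.inf])
#     b = expit(x)
#     A = (erf(x[:, np.newaxis] * LAMBDAS.ravel()) + 1) / 2
#     coefs = lstsq(A, b)[0]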


class _BinaryGaussianProcessClassifierLaplace(BaseEstimator):
    """Binary Gaussian process classification based on Laplace approximation.

    The implementation is based on Algorithm 3.1, 3.2, and 5.1 from [RW2006]_.

    Internally, the Laplace approximation is used for approximating the
    non-Gaussian posterior by a Gaussian.

    Currently, the implementation is restricted to using the logistic link
    function.

    .. versionadded:: 0.18

    Parameters
    ----------
    kernel : kernel instance, default=None
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
        the kernel's hyperparameters are optimized during fitting.

    optimizer : 'fmin_l_bfgs_b' or callable, default='fmin_l_bfgs_b'
        Can either be one of the internally supported optimizers for optimizing
        the kernel's parameters, specified by a string, or an externally
        defined optimizer passed as a callable. If a callable is passed, it
        must have the signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func' is the objective function to be minimized, which
                #   takes the hyperparameters theta as parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can be
                #   used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are::

            'fmin_l_bfgs_b'

    n_restarts_optimizer : int, default=0
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that n_restarts_optimizer=0 implies that one
        run is performed.

    max_iter_predict : int, default=100
        The maximum number of iterations in Newton's method for approximating
        the posterior during predict. Smaller values will reduce computation
        time at the cost of worse results.

    warm_start : bool, default=False
        If warm-starts are enabled, the solution of the last Newton iteration
        on the Laplace approximation of the posterior mode is used as
        initialization for the next call of _posterior_mode(). This can speed
        up convergence when _posterior_mode is called several times on similar
        problems as in hyperparameter optimization. See :term:`the Glossary
        <warm_start>`.

    copy_X_train : bool, default=True
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation used to draw the starting points
        of the hyperparameter-optimization restarts.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Attributes
    ----------
    X_train_ : array-like of shape (n_samples, n_features) or list of object
        Feature vectors or other representations of training data (also
        required for prediction).

    y_train_ : array-like of shape (n_samples,)
        Target values in training data (also required for prediction).

    classes_ : array-like of shape (n_classes,)
        Unique class labels.

    kernel_ : kernel instance
        The kernel used for prediction. The structure of the kernel is the
        same as the one passed as parameter but with optimized hyperparameters.

    L_ : array-like of shape (n_samples, n_samples)
        Lower-triangular Cholesky decomposition of the kernel in X_train_.

    pi_ : array-like of shape (n_samples,)
        The probabilities of the positive class for the training points
        X_train_.

    W_sr_ : array-like of shape (n_samples,)
        Square root of W, the Hessian of log-likelihood of the latent function
        values for the observed labels. Since W is diagonal, only the diagonal
        of sqrt(W) is stored.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of ``self.kernel_.theta``.

    References
    ----------
    .. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams,
       "Gaussian Processes for Machine Learning",
       MIT Press 2006 <https://www.gaussianprocess.org/gpml/chapters/RW.pdf>`_
    """

    def __init__(
        self,
        kernel=None,
        *,
        optimizer="fmin_l_bfgs_b",
        n_restarts_optimizer=0,
        max_iter_predict=100,
        warm_start=False,
        copy_X_train=True,
        random_state=None,
    ):
        self.kernel = kernel
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.copy_X_train = copy_X_train
        self.random_state = random_state

    def fit(self, X, y):
        """Fit Gaussian process classification model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Feature vectors or other representations of training data.

        y : array-like of shape (n_samples,)
            Target values, must be binary.

        Returns
        -------
        self : returns an instance of self.
        """
        if self.kernel is None:  # Use an RBF kernel as default
            self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF(
                1.0, length_scale_bounds="fixed"
            )
        else:
            self.kernel_ = clone(self.kernel)

        self.rng = check_random_state(self.random_state)

        self.X_train_ = np.copy(X) if self.copy_X_train else X

        # Encode class labels and check that it is a binary classification
        # problem
        label_encoder = LabelEncoder()
        self.y_train_ = label_encoder.fit_transform(y)
        self.classes_ = label_encoder.classes_
        if self.classes_.size > 2:
            raise ValueError(
                "%s supports only binary classification. y contains classes %s"
                % (self.__class__.__name__, self.classes_)
            )
        elif self.classes_.size == 1:
            raise ValueError(
                "{0:s} requires 2 classes; got {1:d} class".format(
                    self.__class__.__name__, self.classes_.size
                )
            )

        if self.optimizer is not None and self.kernel_.n_dims > 0:
            # Choose hyperparameters based on maximizing the log-marginal
            # likelihood (potentially starting from several initial values)
            def obj_func(theta, eval_gradient=True):
                if eval_gradient:
                    lml, grad = self.log_marginal_likelihood(
                        theta, eval_gradient=True, clone_kernel=False
                    )
                    return -lml, -grad
                else:
                    return -self.log_marginal_likelihood(theta, clone_kernel=False)

            # First optimize starting from theta specified in kernel
            optima = [
                self._constrained_optimization(
                    obj_func, self.kernel_.theta, self.kernel_.bounds
                )
            ]

            # Additional runs are performed from log-uniform chosen initial
            # theta
            if self.n_restarts_optimizer > 0:
                if not np.isfinite(self.kernel_.bounds).all():
                    raise ValueError(
                        "Multiple optimizer restarts (n_restarts_optimizer>0) "
                        "requires that all bounds are finite."
                    )
                bounds = self.kernel_.bounds
                for iteration in range(self.n_restarts_optimizer):
                    theta_initial = np.exp(self.rng.uniform(bounds[:, 0], bounds[:, 1]))
                    optima.append(
                        self._constrained_optimization(obj_func, theta_initial, bounds)
                    )
            # Select result from run with minimal (negative) log-marginal
            # likelihood
            lml_values = list(map(itemgetter(1), optima))
            self.kernel_.theta = optima[np.argmin(lml_values)][0]
            self.kernel_._check_bounds_params()
            self.log_marginal_likelihood_value_ = -np.min(lml_values)
        else:
            self.log_marginal_likelihood_value_ = self.log_marginal_likelihood(
                self.kernel_.theta
            )

        # Precompute quantities required for predictions which are independent
        # of actual query points
        K = self.kernel_(self.X_train_)

        _, (self.pi_, self.W_sr_, self.L_, _, _) = self._posterior_mode(
            K, return_temporaries=True
        )

        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Query points where the GP is evaluated for classification.

        Returns
        -------
        C : ndarray of shape (n_samples,)
            Predicted target values for X, values are from ``classes_``.
        """
        check_is_fitted(self)

        # As discussed in Section 3.4.2 of GPML, for making hard binary
        # decisions, it is enough to compute the MAP of the posterior and
        # pass it through the link function
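        # (Since the logistic link function is monotonic, thresholding the
        # latent mean f_star at 0 is equivalent to thresholding the predicted
        # probability at 0.5, so the class probabilities never need to be
        # computed for hard predictions.)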
        K_star = self.kernel_(self.X_train_, X)  # K_star = k(x_star)
        f_star = K_star.T.dot(self.y_train_ - self.pi_)  # Algorithm 3.2, Line 4

        return np.where(f_star > 0, self.classes_[1], self.classes_[0])

    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Query points where the GP is evaluated for classification.

        Returns
        -------
        C : array-like of shape (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute ``classes_``.
        """
        check_is_fitted(self)

        # Based on Algorithm 3.2 of GPML
        K_star = self.kernel_(self.X_train_, X)  # K_star = k(x_star)
        f_star = K_star.T.dot(self.y_train_ - self.pi_)  # Line 4
        v = solve(self.L_, self.W_sr_[:, np.newaxis] * K_star)  # Line 5
        # Line 6 (compute np.diag(v.T.dot(v)) via einsum)
        var_f_star = self.kernel_.diag(X) - np.einsum("ij,ij->j", v, v)

        # Line 7:
        # Approximate \int sigmoid(z) * N(z | f_star, var_f_star) dz
        # Approximation is due to Williams & Barber, "Bayesian Classification
        # with Gaussian Processes", Appendix A: Approximate the logistic
        # sigmoid by a linear combination of 5 error functions.
        # For information on how this integral can be computed see
        # blitiri.blogspot.de/2012/11/gaussian-integral-of-error-function.html
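        # Concretely, with lambda_i = LAMBDAS and c_i = COEFS, the approximation is
        #     sigmoid(z) ~ sum_i c_i * (erf(lambda_i * z) + 1) / 2
        # and each term has the closed-form Gaussian integral
        #     int (erf(lambda_i * z) + 1) / 2 * N(z | f_star, var_f_star) dz
        #         = (erf(lambda_i * f_star / sqrt(1 + 2 * lambda_i**2 * var_f_star)) + 1) / 2,
        # which is what the expressions below evaluate, written in terms of
        # alpha = 1 / (2 * var_f_star) and gamma = lambda_i * f_star.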
        alpha = 1 / (2 * var_f_star)
        gamma = LAMBDAS * f_star
        integrals = (
            np.sqrt(np.pi / alpha)
            * erf(gamma * np.sqrt(alpha / (alpha + LAMBDAS**2)))
            / (2 * np.sqrt(var_f_star * 2 * np.pi))
        )
        pi_star = (COEFS * integrals).sum(axis=0) + 0.5 * COEFS.sum()

        return np.vstack((1 - pi_star, pi_star)).T

    def log_marginal_likelihood(
        self, theta=None, eval_gradient=False, clone_kernel=True
    ):
        """Returns log-marginal likelihood of theta for training data.

        Parameters
        ----------
        theta : array-like of shape (n_kernel_params,), default=None
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. If None, the precomputed log_marginal_likelihood
            of ``self.kernel_.theta`` is returned.

        eval_gradient : bool, default=False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. If True, theta must not be None.

        clone_kernel : bool, default=True
            If True, the kernel attribute is copied. If False, the kernel
            attribute is modified, but may result in a performance improvement.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when `eval_gradient` is True.
        """
        if theta is None:
            if eval_gradient:
                raise ValueError("Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        if clone_kernel:
            kernel = self.kernel_.clone_with_theta(theta)
        else:
            kernel = self.kernel_
            kernel.theta = theta

        if eval_gradient:
            K, K_gradient = kernel(self.X_train_, eval_gradient=True)
        else:
            K = kernel(self.X_train_)

        # Compute log-marginal-likelihood Z and also store some temporaries
        # which can be reused for computing Z's gradient
        Z, (pi, W_sr, L, b, a) = self._posterior_mode(K, return_temporaries=True)

        if not eval_gradient:
            return Z

        # Compute gradient based on Algorithm 5.1 of GPML
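        # (The gradient has an explicit part, s_1 below, coming from the
        # dependence of K on theta with the posterior mode held fixed, and an
        # implicit part, s_2.T.dot(s_3), coming from the dependence of the
        # mode itself on theta; see Chapter 5 of GPML for the derivation.)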
        d_Z = np.empty(theta.shape[0])
        # XXX: Get rid of the np.diag() in the next line
        R = W_sr[:, np.newaxis] * cho_solve((L, True), np.diag(W_sr))  # Line 7
        C = solve(L, W_sr[:, np.newaxis] * K)  # Line 8
        # Line 9: (use einsum to compute np.diag(C.T.dot(C)))
        s_2 = (
            -0.5
            * (np.diag(K) - np.einsum("ij, ij -> j", C, C))
            * (pi * (1 - pi) * (1 - 2 * pi))
        )  # third derivative

        for j in range(d_Z.shape[0]):
            C = K_gradient[:, :, j]  # Line 11
            # Line 12: (R.T.ravel().dot(C.ravel()) = np.trace(R.dot(C)))
            s_1 = 0.5 * a.T.dot(C).dot(a) - 0.5 * R.T.ravel().dot(C.ravel())

            b = C.dot(self.y_train_ - pi)  # Line 13
            s_3 = b - K.dot(R.dot(b))  # Line 14

            d_Z[j] = s_1 + s_2.T.dot(s_3)  # Line 15

        return Z, d_Z

    def _posterior_mode(self, K, return_temporaries=False):
        """Mode-finding for binary Laplace GPC and fixed kernel.

        This approximates the posterior of the latent function values for given
        inputs and target observations with a Gaussian approximation and uses
        Newton's iteration to find the mode of this approximation.
        """
        # Based on Algorithm 3.1 of GPML
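        # (Each Newton step computes
        #     f_new = (K^-1 + W)^-1 * (W f + (y - pi)),
        # but, following GPML, without ever inverting K: it uses
        # B = I + W^(1/2) K W^(1/2) and its Cholesky factor L, which is what
        # the "Line ..." comments in the loop below refer to.)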
        # If warm_start is enabled, we reuse the last solution for the
        # posterior mode as initialization; otherwise, we initialize with 0
        if (
            self.warm_start
            and hasattr(self, "f_cached")
            and self.f_cached.shape == self.y_train_.shape
        ):
            f = self.f_cached
        else:
            f = np.zeros_like(self.y_train_, dtype=np.float64)

        # Use Newton's iteration method to find mode of Laplace approximation
        log_marginal_likelihood = -np.inf
        for _ in range(self.max_iter_predict):
            # Line 4
            pi = expit(f)
            W = pi * (1 - pi)
            # Line 5
            W_sr = np.sqrt(W)
            W_sr_K = W_sr[:, np.newaxis] * K
            B = np.eye(W.shape[0]) + W_sr_K * W_sr
            L = cholesky(B, lower=True)
            # Line 6
            b = W * f + (self.y_train_ - pi)
            # Line 7
            a = b - W_sr * cho_solve((L, True), W_sr_K.dot(b))
            # Line 8
            f = K.dot(a)

            # Line 10: Compute log marginal likelihood in loop and use as
            # convergence criterion
            lml = (
                -0.5 * a.T.dot(f)
                - np.log1p(np.exp(-(self.y_train_ * 2 - 1) * f)).sum()
                - np.log(np.diag(L)).sum()
            )
            # Check if we have converged (log marginal likelihood does
            # not decrease)
            # XXX: more complex convergence criterion
            if lml - log_marginal_likelihood < 1e-10:
                break
            log_marginal_likelihood = lml

        self.f_cached = f  # Remember solution for later warm-starts
        if return_temporaries:
            return log_marginal_likelihood, (pi, W_sr, L, b, a)
        else:
            return log_marginal_likelihood

    def _constrained_optimization(self, obj_func, initial_theta, bounds):
        if self.optimizer == "fmin_l_bfgs_b":
            opt_res = scipy.optimize.minimize(
                obj_func, initial_theta, method="L-BFGS-B", jac=True, bounds=bounds
            )
            _check_optimize_result("lbfgs", opt_res)
            theta_opt, func_min = opt_res.x, opt_res.fun
        elif callable(self.optimizer):
            theta_opt, func_min = self.optimizer(obj_func, initial_theta, bounds=bounds)
        else:
            raise ValueError("Unknown optimizer %s." % self.optimizer)

        return theta_opt, func_min


class GaussianProcessClassifier(ClassifierMixin, BaseEstimator):
    """Gaussian process classification (GPC) based on Laplace approximation.

    The implementation is based on Algorithm 3.1, 3.2, and 5.1 from [RW2006]_.

    Internally, the Laplace approximation is used for approximating the
    non-Gaussian posterior by a Gaussian.

    Currently, the implementation is restricted to using the logistic link
    function. For multi-class classification, several binary one-versus-rest
    classifiers are fitted. Note that this class thus does not implement
    a true multi-class Laplace approximation.

    Read more in the :ref:`User Guide <gaussian_process>`.

    .. versionadded:: 0.18

    Parameters
    ----------
    kernel : kernel instance, default=None
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
        the kernel's hyperparameters are optimized during fitting. Also kernel
        cannot be a `CompoundKernel`.

    optimizer : 'fmin_l_bfgs_b', callable or None, default='fmin_l_bfgs_b'
        Can either be one of the internally supported optimizers for optimizing
        the kernel's parameters, specified by a string, or an externally
        defined optimizer passed as a callable. If a callable is passed, it
        must have the signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func' is the objective function to be minimized, which
                #   takes the hyperparameters theta as parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can be
                #   used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        Per default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are::

            'fmin_l_bfgs_b'

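        For illustration, a minimal custom optimizer wrapping
        :func:`scipy.optimize.minimize` could be sketched as follows (the
        name ``custom_minimize`` is purely illustrative, not part of the
        API)::

            import scipy.optimize

            def custom_minimize(obj_func, initial_theta, bounds):
                # obj_func returns (value, gradient) by default, hence jac=True
                res = scipy.optimize.minimize(
                    obj_func, initial_theta, method="L-BFGS-B",
                    jac=True, bounds=bounds,
                )
                return res.x, res.fun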

    n_restarts_optimizer : int, default=0
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that n_restarts_optimizer=0 implies that one
        run is performed.

    max_iter_predict : int, default=100
        The maximum number of iterations in Newton's method for approximating
        the posterior during predict. Smaller values will reduce computation
        time at the cost of worse results.

    warm_start : bool, default=False
        If warm-starts are enabled, the solution of the last Newton iteration
        on the Laplace approximation of the posterior mode is used as
        initialization for the next call of _posterior_mode(). This can speed
        up convergence when _posterior_mode is called several times on similar
        problems as in hyperparameter optimization. See :term:`the Glossary
        <warm_start>`.

    copy_X_train : bool, default=True
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation used to draw the starting points
        of the hyperparameter-optimization restarts.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    multi_class : {'one_vs_rest', 'one_vs_one'}, default='one_vs_rest'
        Specifies how multi-class classification problems are handled.
        Supported are 'one_vs_rest' and 'one_vs_one'. In 'one_vs_rest',
        one binary Gaussian process classifier is fitted for each class, which
        is trained to separate this class from the rest. In 'one_vs_one', one
        binary Gaussian process classifier is fitted for each pair of classes,
        which is trained to separate these two classes. The predictions of
        these binary predictors are combined into multi-class predictions.
        Note that 'one_vs_one' does not support predicting probability
        estimates.

    n_jobs : int, default=None
        The number of jobs to use for the computation: the specified
        multiclass problems are computed in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    Attributes
    ----------
    base_estimator_ : ``Estimator`` instance
        The estimator instance that defines the likelihood function
        using the observed data.

    kernel_ : kernel instance
        The kernel used for prediction. In case of binary classification,
        the structure of the kernel is the same as the one passed as parameter
        but with optimized hyperparameters. In case of multi-class
        classification, a CompoundKernel is returned which consists of the
        different kernels used in the one-versus-rest classifiers.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of ``self.kernel_.theta``.

    classes_ : array-like of shape (n_classes,)
        Unique class labels.

    n_classes_ : int
        The number of classes in the training data.

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    GaussianProcessRegressor : Gaussian process regression (GPR).

    References
    ----------
    .. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams,
       "Gaussian Processes for Machine Learning",
       MIT Press 2006 <https://www.gaussianprocess.org/gpml/chapters/RW.pdf>`_

    Examples
    --------
    >>> from sklearn.datasets import load_iris
    >>> from sklearn.gaussian_process import GaussianProcessClassifier
    >>> from sklearn.gaussian_process.kernels import RBF
    >>> X, y = load_iris(return_X_y=True)
    >>> kernel = 1.0 * RBF(1.0)
    >>> gpc = GaussianProcessClassifier(kernel=kernel,
    ...         random_state=0).fit(X, y)
    >>> gpc.score(X, y)
    0.9866...
    >>> gpc.predict_proba(X[:2,:])
    array([[0.83548752, 0.03228706, 0.13222543],
           [0.79064206, 0.06525643, 0.14410151]])
    """

    _parameter_constraints: dict = {
        "kernel": [Kernel, None],
        "optimizer": [StrOptions({"fmin_l_bfgs_b"}), callable, None],
        "n_restarts_optimizer": [Interval(Integral, 0, None, closed="left")],
        "max_iter_predict": [Interval(Integral, 1, None, closed="left")],
        "warm_start": ["boolean"],
        "copy_X_train": ["boolean"],
        "random_state": ["random_state"],
        "multi_class": [StrOptions({"one_vs_rest", "one_vs_one"})],
        "n_jobs": [Integral, None],
    }

    def __init__(
        self,
        kernel=None,
        *,
        optimizer="fmin_l_bfgs_b",
        n_restarts_optimizer=0,
        max_iter_predict=100,
        warm_start=False,
        copy_X_train=True,
        random_state=None,
        multi_class="one_vs_rest",
        n_jobs=None,
    ):
        self.kernel = kernel
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.copy_X_train = copy_X_train
        self.random_state = random_state
        self.multi_class = multi_class
        self.n_jobs = n_jobs

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y):
        """Fit Gaussian process classification model.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Feature vectors or other representations of training data.

        y : array-like of shape (n_samples,)
            Target values.

        Returns
        -------
        self : object
            Returns an instance of self.
        """
        if isinstance(self.kernel, CompoundKernel):
            raise ValueError("kernel cannot be a CompoundKernel")

        if self.kernel is None or self.kernel.requires_vector_input:
            X, y = self._validate_data(
                X, y, multi_output=False, ensure_2d=True, dtype="numeric"
            )
        else:
            X, y = self._validate_data(
                X, y, multi_output=False, ensure_2d=False, dtype=None
            )

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            kernel=self.kernel,
            optimizer=self.optimizer,
            n_restarts_optimizer=self.n_restarts_optimizer,
            max_iter_predict=self.max_iter_predict,
            warm_start=self.warm_start,
            copy_X_train=self.copy_X_train,
            random_state=self.random_state,
        )

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError(
                "GaussianProcessClassifier requires 2 or more "
                "distinct classes; got %d class (only class %s "
                "is present)" % (self.n_classes_, self.classes_[0])
            )
        if self.n_classes_ > 2:
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = OneVsRestClassifier(
                    self.base_estimator_, n_jobs=self.n_jobs
                )
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = OneVsOneClassifier(
                    self.base_estimator_, n_jobs=self.n_jobs
                )
            else:
                raise ValueError("Unknown multi-class mode %s" % self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            self.log_marginal_likelihood_value_ = np.mean(
                [
                    estimator.log_marginal_likelihood()
                    for estimator in self.base_estimator_.estimators_
                ]
            )
        else:
            self.log_marginal_likelihood_value_ = (
                self.base_estimator_.log_marginal_likelihood()
            )

        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Query points where the GP is evaluated for classification.

        Returns
        -------
        C : ndarray of shape (n_samples,)
            Predicted target values for X, values are from ``classes_``.
        """
        check_is_fitted(self)

        if self.kernel is None or self.kernel.requires_vector_input:
            X = self._validate_data(X, ensure_2d=True, dtype="numeric", reset=False)
        else:
            X = self._validate_data(X, ensure_2d=False, dtype=None, reset=False)

        return self.base_estimator_.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features) or list of object
            Query points where the GP is evaluated for classification.

        Returns
        -------
        C : array-like of shape (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute :term:`classes_`.
        """
        check_is_fitted(self)

        if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
            raise ValueError(
                "one_vs_one multi-class mode does not support "
                "predicting probability estimates. Use "
                "one_vs_rest mode instead."
            )

        if self.kernel is None or self.kernel.requires_vector_input:
            X = self._validate_data(X, ensure_2d=True, dtype="numeric", reset=False)
        else:
            X = self._validate_data(X, ensure_2d=False, dtype=None, reset=False)

        return self.base_estimator_.predict_proba(X)

    @property
    def kernel_(self):
        """Return the kernel of the base estimator."""
        if self.n_classes_ == 2:
            return self.base_estimator_.kernel_
        else:
            return CompoundKernel(
                [estimator.kernel_ for estimator in self.base_estimator_.estimators_]
            )

    def log_marginal_likelihood(
        self, theta=None, eval_gradient=False, clone_kernel=True
    ):
        """Return log-marginal likelihood of theta for training data.

        In the case of multi-class classification, the mean log-marginal
        likelihood of the one-versus-rest classifiers is returned.

        Parameters
        ----------
        theta : array-like of shape (n_kernel_params,), default=None
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. In the case of multi-class classification, theta may
            be the hyperparameters of the compound kernel or of an individual
            kernel. In the latter case, all individual kernels get assigned the
            same theta values. If None, the precomputed log_marginal_likelihood
            of ``self.kernel_.theta`` is returned.

        eval_gradient : bool, default=False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. Note that gradient computation is not supported
            for non-binary classification. If True, theta must not be None.

        clone_kernel : bool, default=True
            If True, the kernel attribute is copied. If False, the kernel
            attribute is modified, but may result in a performance improvement.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when `eval_gradient` is True.
        """
        check_is_fitted(self)

        if theta is None:
            if eval_gradient:
                raise ValueError("Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        theta = np.asarray(theta)
        if self.n_classes_ == 2:
            return self.base_estimator_.log_marginal_likelihood(
                theta, eval_gradient, clone_kernel=clone_kernel
            )
        else:
            if eval_gradient:
                raise NotImplementedError(
                    "Gradient of log-marginal-likelihood not implemented for "
                    "multi-class GPC."
                )
            estimators = self.base_estimator_.estimators_
            n_dims = estimators[0].kernel_.n_dims
            if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
                return np.mean(
                    [
                        estimator.log_marginal_likelihood(
                            theta, clone_kernel=clone_kernel
                        )
                        for i, estimator in enumerate(estimators)
                    ]
                )
            elif theta.shape[0] == n_dims * self.classes_.shape[0]:
                # theta for compound kernel
                return np.mean(
                    [
                        estimator.log_marginal_likelihood(
                            theta[n_dims * i : n_dims * (i + 1)],
                            clone_kernel=clone_kernel,
                        )
                        for i, estimator in enumerate(estimators)
                    ]
                )
            else:
                raise ValueError(
                    "Shape of theta must be either %d or %d. "
                    "Obtained theta with shape %d."
                    % (n_dims, n_dims * self.classes_.shape[0], theta.shape[0])
                )