- """Gaussian processes classification."""
- # Authors: Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
- #
- # License: BSD 3 clause
- from numbers import Integral
- from operator import itemgetter
- import numpy as np
- import scipy.optimize
- from scipy.linalg import cho_solve, cholesky, solve
- from scipy.special import erf, expit
- from ..base import BaseEstimator, ClassifierMixin, _fit_context, clone
- from ..multiclass import OneVsOneClassifier, OneVsRestClassifier
- from ..preprocessing import LabelEncoder
- from ..utils import check_random_state
- from ..utils._param_validation import Interval, StrOptions
- from ..utils.optimize import _check_optimize_result
- from ..utils.validation import check_is_fitted
- from .kernels import RBF, CompoundKernel, Kernel
- from .kernels import ConstantKernel as C
- # Values required for approximating the logistic sigmoid by
- # error functions. COEFS below were obtained via least squares:
- # x = np.array([0, 0.6, 2, 3.5, 4.5, np.inf])
- # b = expit(x)  # logistic sigmoid
- # A = (erf(np.outer(x, LAMBDAS.ravel())) + 1) / 2
- # coefs = lstsq(A, b)[0]  # scipy.linalg.lstsq
- LAMBDAS = np.array([0.41, 0.4, 0.37, 0.44, 0.39])[:, np.newaxis]
- COEFS = np.array(
- [-1854.8214151, 3516.89893646, 221.29346712, 128.12323805, -2010.49422654]
- )[:, np.newaxis]
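- # A quick sanity check of the fit (an illustrative sketch, not executed
- # at import time): the weighted erf mixture should closely track expit
- # on a grid, since COEFS were fitted for exactly this purpose.
- # z = np.linspace(-6, 6, 1001)
- # approx = (COEFS * (erf(LAMBDAS * z) + 1) / 2).sum(axis=0)
- # max_abs_err = np.max(np.abs(approx - expit(z)))  # expected to be small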
- class _BinaryGaussianProcessClassifierLaplace(BaseEstimator):
- """Binary Gaussian process classification based on Laplace approximation.
- The implementation is based on Algorithms 3.1, 3.2, and 5.1 of [RW2006]_.
- Internally, the Laplace approximation is used for approximating the
- non-Gaussian posterior by a Gaussian.
- Currently, the implementation is restricted to using the logistic link
- function.
- .. versionadded:: 0.18
- Parameters
- ----------
- kernel : kernel instance, default=None
- The kernel specifying the covariance function of the GP. If None is
- passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
- the kernel's hyperparameters are optimized during fitting.
- optimizer : 'fmin_l_bfgs_b', callable or None, default='fmin_l_bfgs_b'
- Can either be one of the internally supported optimizers for optimizing
- the kernel's parameters, specified by a string, or an externally
- defined optimizer passed as a callable. If a callable is passed, it
- must have the signature::
- def optimizer(obj_func, initial_theta, bounds):
- # * 'obj_func' is the objective function to be maximized, which
- # takes the hyperparameters theta as parameter and an
- # optional flag eval_gradient, which determines if the
- # gradient is returned additionally to the function value
- # * 'initial_theta': the initial value for theta, which can be
- # used by local optimizers
- # * 'bounds': the bounds on the values of theta
- ....
- # Returned are the best found hyperparameters theta and
- # the corresponding value of the target function.
- return theta_opt, func_min
- By default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize
- is used. If None is passed, the kernel's parameters are kept fixed.
- Available internal optimizers are::
- 'fmin_l_bfgs_b'
- n_restarts_optimizer : int, default=0
- The number of restarts of the optimizer for finding the kernel's
- parameters which maximize the log-marginal likelihood. The first run
- of the optimizer is performed from the kernel's initial parameters,
- the remaining ones (if any) from thetas sampled log-uniformly at random
- from the space of allowed theta-values. If greater than 0, all bounds
- must be finite. Note that n_restarts_optimizer=0 implies that one
- run is performed.
- max_iter_predict : int, default=100
- The maximum number of iterations in Newton's method for approximating
- the posterior during predict. Smaller values will reduce computation
- time at the cost of worse results.
- warm_start : bool, default=False
- If warm-starts are enabled, the solution of the last Newton iteration
- on the Laplace approximation of the posterior mode is used as
- initialization for the next call of _posterior_mode(). This can speed
- up convergence when _posterior_mode is called several times on similar
- problems as in hyperparameter optimization. See :term:`the Glossary
- <warm_start>`.
- copy_X_train : bool, default=True
- If True, a persistent copy of the training data is stored in the
- object. Otherwise, just a reference to the training data is stored,
- which might cause predictions to change if the data is modified
- externally.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation used to sample the initial kernel
- hyperparameters when restarting the optimizer.
- Pass an int for reproducible results across multiple function calls.
- See :term:`Glossary <random_state>`.
- Attributes
- ----------
- X_train_ : array-like of shape (n_samples, n_features) or list of object
- Feature vectors or other representations of training data (also
- required for prediction).
- y_train_ : array-like of shape (n_samples,)
- Target values in training data (also required for prediction).
- classes_ : array-like of shape (n_classes,)
- Unique class labels.
- kernel_ : kernel instance
- The kernel used for prediction. The structure of the kernel is the
- same as the one passed as parameter but with optimized hyperparameters.
- L_ : array-like of shape (n_samples, n_samples)
- Lower-triangular Cholesky decomposition of ``B = I + W_sr K W_sr``
- evaluated on X_train_ (see Algorithm 3.1 of [RW2006]_).
- pi_ : array-like of shape (n_samples,)
- The probabilities of the positive class for the training points
- X_train_.
- W_sr_ : array-like of shape (n_samples,)
- Square root of W, the negative Hessian of the log-likelihood of the
- latent function values for the observed labels. Since W is diagonal,
- only the diagonal of sqrt(W) is stored.
- log_marginal_likelihood_value_ : float
- The log-marginal-likelihood of ``self.kernel_.theta``.
- References
- ----------
- .. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams,
- "Gaussian Processes for Machine Learning",
- MIT Press 2006 <https://www.gaussianprocess.org/gpml/chapters/RW.pdf>`_
- """
- def __init__(
- self,
- kernel=None,
- *,
- optimizer="fmin_l_bfgs_b",
- n_restarts_optimizer=0,
- max_iter_predict=100,
- warm_start=False,
- copy_X_train=True,
- random_state=None,
- ):
- self.kernel = kernel
- self.optimizer = optimizer
- self.n_restarts_optimizer = n_restarts_optimizer
- self.max_iter_predict = max_iter_predict
- self.warm_start = warm_start
- self.copy_X_train = copy_X_train
- self.random_state = random_state
- def fit(self, X, y):
- """Fit Gaussian process classification model.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features) or list of object
- Feature vectors or other representations of training data.
- y : array-like of shape (n_samples,)
- Target values, must be binary.
- Returns
- -------
- self : object
- Returns an instance of self.
- """
- if self.kernel is None: # Use an RBF kernel as default
- self.kernel_ = C(1.0, constant_value_bounds="fixed") * RBF(
- 1.0, length_scale_bounds="fixed"
- )
- else:
- self.kernel_ = clone(self.kernel)
- self.rng = check_random_state(self.random_state)
- self.X_train_ = np.copy(X) if self.copy_X_train else X
- # Encode class labels and check that it is a binary classification
- # problem
- label_encoder = LabelEncoder()
- self.y_train_ = label_encoder.fit_transform(y)
- self.classes_ = label_encoder.classes_
- if self.classes_.size > 2:
- raise ValueError(
- "%s supports only binary classification. y contains classes %s"
- % (self.__class__.__name__, self.classes_)
- )
- elif self.classes_.size == 1:
- raise ValueError(
- "{0:s} requires 2 classes; got {1:d} class".format(
- self.__class__.__name__, self.classes_.size
- )
- )
- if self.optimizer is not None and self.kernel_.n_dims > 0:
- # Choose hyperparameters based on maximizing the log-marginal
- # likelihood (potentially starting from several initial values)
- def obj_func(theta, eval_gradient=True):
- if eval_gradient:
- lml, grad = self.log_marginal_likelihood(
- theta, eval_gradient=True, clone_kernel=False
- )
- return -lml, -grad
- else:
- return -self.log_marginal_likelihood(theta, clone_kernel=False)
- # First optimize starting from theta specified in kernel
- optima = [
- self._constrained_optimization(
- obj_func, self.kernel_.theta, self.kernel_.bounds
- )
- ]
- # Additional runs are performed from log-uniformly sampled initial
- # theta values
- if self.n_restarts_optimizer > 0:
- if not np.isfinite(self.kernel_.bounds).all():
- raise ValueError(
- "Multiple optimizer restarts (n_restarts_optimizer>0) "
- "requires that all bounds are finite."
- )
- bounds = self.kernel_.bounds
- for _ in range(self.n_restarts_optimizer):
- theta_initial = np.exp(self.rng.uniform(bounds[:, 0], bounds[:, 1]))
- optima.append(
- self._constrained_optimization(obj_func, theta_initial, bounds)
- )
- # Select result from run with minimal (negative) log-marginal
- # likelihood
- lml_values = list(map(itemgetter(1), optima))
- self.kernel_.theta = optima[np.argmin(lml_values)][0]
- self.kernel_._check_bounds_params()
- self.log_marginal_likelihood_value_ = -np.min(lml_values)
- else:
- self.log_marginal_likelihood_value_ = self.log_marginal_likelihood(
- self.kernel_.theta
- )
- # Precompute quantities required for predictions which are independent
- # of actual query points
- K = self.kernel_(self.X_train_)
- _, (self.pi_, self.W_sr_, self.L_, _, _) = self._posterior_mode(
- K, return_temporaries=True
- )
- return self
- def predict(self, X):
- """Perform classification on an array of test vectors X.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features) or list of object
- Query points where the GP is evaluated for classification.
- Returns
- -------
- C : ndarray of shape (n_samples,)
- Predicted target values for X, values are from ``classes_``
- """
- check_is_fitted(self)
- # As discussed in Section 3.4.2 of GPML, for making hard binary
- # decisions, it is enough to compute the MAP of the posterior and
- # pass it through the link function
- K_star = self.kernel_(self.X_train_, X) # K_star = k(x_star)
- f_star = K_star.T.dot(self.y_train_ - self.pi_) # Algorithm 3.2, Line 4
- return np.where(f_star > 0, self.classes_[1], self.classes_[0])
- def predict_proba(self, X):
- """Return probability estimates for the test vector X.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features) or list of object
- Query points where the GP is evaluated for classification.
- Returns
- -------
- C : array-like of shape (n_samples, n_classes)
- Returns the probability of the samples for each class in
- the model. The columns correspond to the classes in sorted
- order, as they appear in the attribute ``classes_``.
- """
- check_is_fitted(self)
- # Based on Algorithm 3.2 of GPML
- K_star = self.kernel_(self.X_train_, X) # K_star = k(x_star)
- f_star = K_star.T.dot(self.y_train_ - self.pi_) # Line 4
- v = solve(self.L_, self.W_sr_[:, np.newaxis] * K_star) # Line 5
- # Line 6 (compute np.diag(v.T.dot(v)) via einsum)
- var_f_star = self.kernel_.diag(X) - np.einsum("ij,ij->j", v, v)
- # Line 7:
- # Approximate \int sigmoid(z) * N(z | f_star, var_f_star) dz
- # Approximation is due to Williams & Barber, "Bayesian Classification
- # with Gaussian Processes", Appendix A: Approximate the logistic
- # sigmoid by a linear combination of 5 error functions.
- # For information on how this integral can be computed see
- # blitiri.blogspot.de/2012/11/gaussian-integral-of-error-function.html
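- # Each term then has the closed form (a standard Gaussian-erf identity,
- # stated here for reference):
- # \int (erf(lam * z) + 1) / 2 * N(z | f_star, var_f_star) dz
- # = (erf(lam * f_star / sqrt(1 + 2 * lam**2 * var_f_star)) + 1) / 2
- # which alpha/gamma/integrals below evaluate for each LAMBDAS entry.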
- alpha = 1 / (2 * var_f_star)
- gamma = LAMBDAS * f_star
- integrals = (
- np.sqrt(np.pi / alpha)
- * erf(gamma * np.sqrt(alpha / (alpha + LAMBDAS**2)))
- / (2 * np.sqrt(var_f_star * 2 * np.pi))
- )
- pi_star = (COEFS * integrals).sum(axis=0) + 0.5 * COEFS.sum()
- return np.vstack((1 - pi_star, pi_star)).T
- def log_marginal_likelihood(
- self, theta=None, eval_gradient=False, clone_kernel=True
- ):
- """Returns log-marginal likelihood of theta for training data.
- Parameters
- ----------
- theta : array-like of shape (n_kernel_params,), default=None
- Kernel hyperparameters for which the log-marginal likelihood is
- evaluated. If None, the precomputed log_marginal_likelihood
- of ``self.kernel_.theta`` is returned.
- eval_gradient : bool, default=False
- If True, the gradient of the log-marginal likelihood with respect
- to the kernel hyperparameters at position theta is returned
- additionally. If True, theta must not be None.
- clone_kernel : bool, default=True
- If True, the kernel attribute is copied. If False, the kernel
- attribute is modified in place, which may improve performance.
- Returns
- -------
- log_likelihood : float
- Log-marginal likelihood of theta for training data.
- log_likelihood_gradient : ndarray of shape (n_kernel_params,), \
- optional
- Gradient of the log-marginal likelihood with respect to the kernel
- hyperparameters at position theta.
- Only returned when `eval_gradient` is True.
- """
- if theta is None:
- if eval_gradient:
- raise ValueError("Gradient can only be evaluated for theta!=None")
- return self.log_marginal_likelihood_value_
- if clone_kernel:
- kernel = self.kernel_.clone_with_theta(theta)
- else:
- kernel = self.kernel_
- kernel.theta = theta
- if eval_gradient:
- K, K_gradient = kernel(self.X_train_, eval_gradient=True)
- else:
- K = kernel(self.X_train_)
- # Compute log-marginal-likelihood Z and also store some temporaries
- # which can be reused for computing Z's gradient
- Z, (pi, W_sr, L, b, a) = self._posterior_mode(K, return_temporaries=True)
- if not eval_gradient:
- return Z
- # Compute gradient based on Algorithm 5.1 of GPML
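- # The gradient splits into an "explicit" part for a fixed mode f_hat
- # (s_1 below) and an "implicit" part through the dependence of f_hat
- # on theta (s_2 * s_3 below); see Section 5.5.1 of GPML.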
- d_Z = np.empty(theta.shape[0])
- # XXX: Get rid of the np.diag() in the next line
- R = W_sr[:, np.newaxis] * cho_solve((L, True), np.diag(W_sr)) # Line 7
- C = solve(L, W_sr[:, np.newaxis] * K) # Line 8
- # Line 9: (use einsum to compute np.diag(C.T.dot(C)))
- s_2 = (
- -0.5
- * (np.diag(K) - np.einsum("ij, ij -> j", C, C))
- * (pi * (1 - pi) * (1 - 2 * pi))
- ) # third derivative of log p(y|f)
- for j in range(d_Z.shape[0]):
- C = K_gradient[:, :, j] # Line 11
- # Line 12: (R.T.ravel().dot(C.ravel()) = np.trace(R.dot(C)))
- s_1 = 0.5 * a.T.dot(C).dot(a) - 0.5 * R.T.ravel().dot(C.ravel())
- b = C.dot(self.y_train_ - pi) # Line 13
- s_3 = b - K.dot(R.dot(b)) # Line 14
- d_Z[j] = s_1 + s_2.T.dot(s_3) # Line 15
- return Z, d_Z
- def _posterior_mode(self, K, return_temporaries=False):
- """Mode-finding for binary Laplace GPC and fixed kernel.
- This approximates the posterior of the latent function values for given
- inputs and target observations with a Gaussian approximation and uses
- Newton's iteration to find the mode of this approximation.
- """
- # Based on Algorithm 3.1 of GPML
- # If warm_start is enabled, we reuse the last solution for the
- # posterior mode as initialization; otherwise, we initialize with 0
- if (
- self.warm_start
- and hasattr(self, "f_cached")
- and self.f_cached.shape == self.y_train_.shape
- ):
- f = self.f_cached
- else:
- f = np.zeros_like(self.y_train_, dtype=np.float64)
- # Use Newton's method to find the mode of the Laplace approximation
- log_marginal_likelihood = -np.inf
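- # Each pass below is one Newton step of Algorithm 3.1: with
- # W = -hess log p(y|f) = pi * (1 - pi) for the logistic likelihood
- # and B = I + W^(1/2) K W^(1/2) = L L^T, the update is f <- K a with
- # a = b - W^(1/2) B^(-1) W^(1/2) K b and b = W f + (y - pi). The
- # quantity -0.5 a^T f + log p(y|f) - sum(log(diag(L))) tracked below
- # is the approximate log marginal likelihood (Eq. 3.32 of GPML).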
- for _ in range(self.max_iter_predict):
- # Line 4
- pi = expit(f)
- W = pi * (1 - pi)
- # Line 5
- W_sr = np.sqrt(W)
- W_sr_K = W_sr[:, np.newaxis] * K
- B = np.eye(W.shape[0]) + W_sr_K * W_sr
- L = cholesky(B, lower=True)
- # Line 6
- b = W * f + (self.y_train_ - pi)
- # Line 7
- a = b - W_sr * cho_solve((L, True), W_sr_K.dot(b))
- # Line 8
- f = K.dot(a)
- # Line 10: Compute log marginal likelihood in loop and use as
- # convergence criterion
- lml = (
- -0.5 * a.T.dot(f)
- - np.log1p(np.exp(-(self.y_train_ * 2 - 1) * f)).sum()
- - np.log(np.diag(L)).sum()
- )
- # Check if we have converged (log marginal likelihood no longer
- # increases by more than the tolerance)
- # XXX: more complex convergence criterion
- if lml - log_marginal_likelihood < 1e-10:
- break
- log_marginal_likelihood = lml
- self.f_cached = f # Remember solution for later warm-starts
- if return_temporaries:
- return log_marginal_likelihood, (pi, W_sr, L, b, a)
- else:
- return log_marginal_likelihood
- def _constrained_optimization(self, obj_func, initial_theta, bounds):
- if self.optimizer == "fmin_l_bfgs_b":
- opt_res = scipy.optimize.minimize(
- obj_func, initial_theta, method="L-BFGS-B", jac=True, bounds=bounds
- )
- _check_optimize_result("lbfgs", opt_res)
- theta_opt, func_min = opt_res.x, opt_res.fun
- elif callable(self.optimizer):
- theta_opt, func_min = self.optimizer(obj_func, initial_theta, bounds=bounds)
- else:
- raise ValueError("Unknown optimizer %s." % self.optimizer)
- return theta_opt, func_min
- class GaussianProcessClassifier(ClassifierMixin, BaseEstimator):
- """Gaussian process classification (GPC) based on Laplace approximation.
- The implementation is based on Algorithms 3.1, 3.2, and 5.1 of [RW2006]_.
- Internally, the Laplace approximation is used for approximating the
- non-Gaussian posterior by a Gaussian.
- Currently, the implementation is restricted to using the logistic link
- function. For multi-class classification, several binary one-versus-rest
- classifiers are fitted. Note that this class thus does not implement
- a true multi-class Laplace approximation.
- Read more in the :ref:`User Guide <gaussian_process>`.
- .. versionadded:: 0.18
- Parameters
- ----------
- kernel : kernel instance, default=None
- The kernel specifying the covariance function of the GP. If None is
- passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
- the kernel's hyperparameters are optimized during fitting. Also, the
- kernel cannot be a `CompoundKernel`.
- optimizer : 'fmin_l_bfgs_b', callable or None, default='fmin_l_bfgs_b'
- Can either be one of the internally supported optimizers for optimizing
- the kernel's parameters, specified by a string, or an externally
- defined optimizer passed as a callable. If a callable is passed, it
- must have the signature::
- def optimizer(obj_func, initial_theta, bounds):
- # * 'obj_func' is the objective function to be maximized, which
- # takes the hyperparameters theta as parameter and an
- # optional flag eval_gradient, which determines if the
- # gradient is returned additionally to the function value
- # * 'initial_theta': the initial value for theta, which can be
- # used by local optimizers
- # * 'bounds': the bounds on the values of theta
- ....
- # Returned are the best found hyperparameters theta and
- # the corresponding value of the target function.
- return theta_opt, func_min
- By default, the 'L-BFGS-B' algorithm from scipy.optimize.minimize
- is used. If None is passed, the kernel's parameters are kept fixed.
- Available internal optimizers are::
- 'fmin_l_bfgs_b'
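- As an illustrative sketch (``custom_optimizer`` is a hypothetical
- name, not part of the API), a callable wrapping
- scipy.optimize.minimize with another gradient-based method could
- look like::
- import scipy.optimize
- def custom_optimizer(obj_func, initial_theta, bounds):
- # obj_func returns (value, gradient) when eval_gradient=True,
- # matching minimize's jac=True convention; both are negated
- # internally so that minimization maximizes the log-marginal
- # likelihood.
- opt_res = scipy.optimize.minimize(
- obj_func, initial_theta, method="TNC", jac=True, bounds=bounds
- )
- return opt_res.x, opt_res.fun
- gpc = GaussianProcessClassifier(optimizer=custom_optimizer)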
- n_restarts_optimizer : int, default=0
- The number of restarts of the optimizer for finding the kernel's
- parameters which maximize the log-marginal likelihood. The first run
- of the optimizer is performed from the kernel's initial parameters,
- the remaining ones (if any) from thetas sampled log-uniformly at random
- from the space of allowed theta-values. If greater than 0, all bounds
- must be finite. Note that n_restarts_optimizer=0 implies that one
- run is performed.
- max_iter_predict : int, default=100
- The maximum number of iterations in Newton's method for approximating
- the posterior during predict. Smaller values will reduce computation
- time at the cost of worse results.
- warm_start : bool, default=False
- If warm-starts are enabled, the solution of the last Newton iteration
- on the Laplace approximation of the posterior mode is used as
- initialization for the next call of _posterior_mode(). This can speed
- up convergence when _posterior_mode is called several times on similar
- problems as in hyperparameter optimization. See :term:`the Glossary
- <warm_start>`.
- copy_X_train : bool, default=True
- If True, a persistent copy of the training data is stored in the
- object. Otherwise, just a reference to the training data is stored,
- which might cause predictions to change if the data is modified
- externally.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation used to sample the initial kernel
- hyperparameters when restarting the optimizer.
- Pass an int for reproducible results across multiple function calls.
- See :term:`Glossary <random_state>`.
- multi_class : {'one_vs_rest', 'one_vs_one'}, default='one_vs_rest'
- Specifies how multi-class classification problems are handled.
- Supported are 'one_vs_rest' and 'one_vs_one'. In 'one_vs_rest',
- one binary Gaussian process classifier is fitted for each class, which
- is trained to separate this class from the rest. In 'one_vs_one', one
- binary Gaussian process classifier is fitted for each pair of classes,
- which is trained to separate these two classes. The predictions of
- these binary predictors are combined into multi-class predictions.
- Note that 'one_vs_one' does not support predicting probability
- estimates.
- n_jobs : int, default=None
- The number of jobs to use for the computation: the binary problems
- induced by the multi-class strategy are computed in parallel.
- ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
- ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
- for more details.
- Attributes
- ----------
- base_estimator_ : ``Estimator`` instance
- The estimator instance that defines the likelihood function
- using the observed data.
- kernel_ : kernel instance
- The kernel used for prediction. In case of binary classification,
- the structure of the kernel is the same as the one passed as parameter
- but with optimized hyperparameters. In case of multi-class
- classification, a CompoundKernel is returned which consists of the
- different kernels used in the one-versus-rest classifiers.
- log_marginal_likelihood_value_ : float
- The log-marginal-likelihood of ``self.kernel_.theta``.
- classes_ : array-like of shape (n_classes,)
- Unique class labels.
- n_classes_ : int
- The number of classes in the training data.
- n_features_in_ : int
- Number of features seen during :term:`fit`.
- .. versionadded:: 0.24
- feature_names_in_ : ndarray of shape (`n_features_in_`,)
- Names of features seen during :term:`fit`. Defined only when `X`
- has feature names that are all strings.
- .. versionadded:: 1.0
- See Also
- --------
- GaussianProcessRegressor : Gaussian process regression (GPR).
- References
- ----------
- .. [RW2006] `Carl E. Rasmussen and Christopher K.I. Williams,
- "Gaussian Processes for Machine Learning",
- MIT Press 2006 <https://www.gaussianprocess.org/gpml/chapters/RW.pdf>`_
- Examples
- --------
- >>> from sklearn.datasets import load_iris
- >>> from sklearn.gaussian_process import GaussianProcessClassifier
- >>> from sklearn.gaussian_process.kernels import RBF
- >>> X, y = load_iris(return_X_y=True)
- >>> kernel = 1.0 * RBF(1.0)
- >>> gpc = GaussianProcessClassifier(kernel=kernel,
- ... random_state=0).fit(X, y)
- >>> gpc.score(X, y)
- 0.9866...
- >>> gpc.predict_proba(X[:2,:])
- array([[0.83548752, 0.03228706, 0.13222543],
- [0.79064206, 0.06525643, 0.14410151]])
- """
- _parameter_constraints: dict = {
- "kernel": [Kernel, None],
- "optimizer": [StrOptions({"fmin_l_bfgs_b"}), callable, None],
- "n_restarts_optimizer": [Interval(Integral, 0, None, closed="left")],
- "max_iter_predict": [Interval(Integral, 1, None, closed="left")],
- "warm_start": ["boolean"],
- "copy_X_train": ["boolean"],
- "random_state": ["random_state"],
- "multi_class": [StrOptions({"one_vs_rest", "one_vs_one"})],
- "n_jobs": [Integral, None],
- }
- def __init__(
- self,
- kernel=None,
- *,
- optimizer="fmin_l_bfgs_b",
- n_restarts_optimizer=0,
- max_iter_predict=100,
- warm_start=False,
- copy_X_train=True,
- random_state=None,
- multi_class="one_vs_rest",
- n_jobs=None,
- ):
- self.kernel = kernel
- self.optimizer = optimizer
- self.n_restarts_optimizer = n_restarts_optimizer
- self.max_iter_predict = max_iter_predict
- self.warm_start = warm_start
- self.copy_X_train = copy_X_train
- self.random_state = random_state
- self.multi_class = multi_class
- self.n_jobs = n_jobs
- @_fit_context(prefer_skip_nested_validation=True)
- def fit(self, X, y):
- """Fit Gaussian process classification model.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features) or list of object
- Feature vectors or other representations of training data.
- y : array-like of shape (n_samples,)
- Target values.
- Returns
- -------
- self : object
- Returns an instance of self.
- """
- if isinstance(self.kernel, CompoundKernel):
- raise ValueError("kernel cannot be a CompoundKernel")
- if self.kernel is None or self.kernel.requires_vector_input:
- X, y = self._validate_data(
- X, y, multi_output=False, ensure_2d=True, dtype="numeric"
- )
- else:
- X, y = self._validate_data(
- X, y, multi_output=False, ensure_2d=False, dtype=None
- )
- self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
- kernel=self.kernel,
- optimizer=self.optimizer,
- n_restarts_optimizer=self.n_restarts_optimizer,
- max_iter_predict=self.max_iter_predict,
- warm_start=self.warm_start,
- copy_X_train=self.copy_X_train,
- random_state=self.random_state,
- )
- self.classes_ = np.unique(y)
- self.n_classes_ = self.classes_.size
- if self.n_classes_ == 1:
- raise ValueError(
- "GaussianProcessClassifier requires 2 or more "
- "distinct classes; got %d class (only class %s "
- "is present)" % (self.n_classes_, self.classes_[0])
- )
- if self.n_classes_ > 2:
- if self.multi_class == "one_vs_rest":
- self.base_estimator_ = OneVsRestClassifier(
- self.base_estimator_, n_jobs=self.n_jobs
- )
- elif self.multi_class == "one_vs_one":
- self.base_estimator_ = OneVsOneClassifier(
- self.base_estimator_, n_jobs=self.n_jobs
- )
- else:
- raise ValueError("Unknown multi-class mode %s" % self.multi_class)
- self.base_estimator_.fit(X, y)
- if self.n_classes_ > 2:
- self.log_marginal_likelihood_value_ = np.mean(
- [
- estimator.log_marginal_likelihood()
- for estimator in self.base_estimator_.estimators_
- ]
- )
- else:
- self.log_marginal_likelihood_value_ = (
- self.base_estimator_.log_marginal_likelihood()
- )
- return self
- def predict(self, X):
- """Perform classification on an array of test vectors X.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features) or list of object
- Query points where the GP is evaluated for classification.
- Returns
- -------
- C : ndarray of shape (n_samples,)
- Predicted target values for X, values are from ``classes_``.
- """
- check_is_fitted(self)
- if self.kernel is None or self.kernel.requires_vector_input:
- X = self._validate_data(X, ensure_2d=True, dtype="numeric", reset=False)
- else:
- X = self._validate_data(X, ensure_2d=False, dtype=None, reset=False)
- return self.base_estimator_.predict(X)
- def predict_proba(self, X):
- """Return probability estimates for the test vector X.
- Parameters
- ----------
- X : array-like of shape (n_samples, n_features) or list of object
- Query points where the GP is evaluated for classification.
- Returns
- -------
- C : array-like of shape (n_samples, n_classes)
- Returns the probability of the samples for each class in
- the model. The columns correspond to the classes in sorted
- order, as they appear in the attribute :term:`classes_`.
- """
- check_is_fitted(self)
- if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
- raise ValueError(
- "one_vs_one multi-class mode does not support "
- "predicting probability estimates. Use "
- "one_vs_rest mode instead."
- )
- if self.kernel is None or self.kernel.requires_vector_input:
- X = self._validate_data(X, ensure_2d=True, dtype="numeric", reset=False)
- else:
- X = self._validate_data(X, ensure_2d=False, dtype=None, reset=False)
- return self.base_estimator_.predict_proba(X)
- @property
- def kernel_(self):
- """Return the kernel of the base estimator."""
- if self.n_classes_ == 2:
- return self.base_estimator_.kernel_
- else:
- return CompoundKernel(
- [estimator.kernel_ for estimator in self.base_estimator_.estimators_]
- )
- def log_marginal_likelihood(
- self, theta=None, eval_gradient=False, clone_kernel=True
- ):
- """Return log-marginal likelihood of theta for training data.
- In the case of multi-class classification, the mean log-marginal
- likelihood of the one-versus-rest classifiers is returned.
- Parameters
- ----------
- theta : array-like of shape (n_kernel_params,), default=None
- Kernel hyperparameters for which the log-marginal likelihood is
- evaluated. In the case of multi-class classification, theta may
- be the hyperparameters of the compound kernel or of an individual
- kernel. In the latter case, all individual kernels get assigned the
- same theta values. If None, the precomputed log_marginal_likelihood
- of ``self.kernel_.theta`` is returned.
- eval_gradient : bool, default=False
- If True, the gradient of the log-marginal likelihood with respect
- to the kernel hyperparameters at position theta is returned
- additionally. Note that gradient computation is not supported
- for non-binary classification. If True, theta must not be None.
- clone_kernel : bool, default=True
- If True, the kernel attribute is copied. If False, the kernel
- attribute is modified in place, which may improve performance.
- Returns
- -------
- log_likelihood : float
- Log-marginal likelihood of theta for training data.
- log_likelihood_gradient : ndarray of shape (n_kernel_params,), optional
- Gradient of the log-marginal likelihood with respect to the kernel
- hyperparameters at position theta.
- Only returned when `eval_gradient` is True.
- """
- check_is_fitted(self)
- if theta is None:
- if eval_gradient:
- raise ValueError("Gradient can only be evaluated for theta!=None")
- return self.log_marginal_likelihood_value_
- theta = np.asarray(theta)
- if self.n_classes_ == 2:
- return self.base_estimator_.log_marginal_likelihood(
- theta, eval_gradient, clone_kernel=clone_kernel
- )
- else:
- if eval_gradient:
- raise NotImplementedError(
- "Gradient of log-marginal-likelihood not implemented for "
- "multi-class GPC."
- )
- estimators = self.base_estimator_.estimators_
- n_dims = estimators[0].kernel_.n_dims
- if theta.shape[0] == n_dims: # use same theta for all sub-kernels
- return np.mean(
- [
- estimator.log_marginal_likelihood(
- theta, clone_kernel=clone_kernel
- )
- for estimator in estimators
- ]
- )
- elif theta.shape[0] == n_dims * self.classes_.shape[0]:
- # theta for compound kernel
- return np.mean(
- [
- estimator.log_marginal_likelihood(
- theta[n_dims * i : n_dims * (i + 1)],
- clone_kernel=clone_kernel,
- )
- for i, estimator in enumerate(estimators)
- ]
- )
- else:
- raise ValueError(
- "Shape of theta must be either %d or %d. "
- "Obtained theta with shape %d."
- % (n_dims, n_dims * self.classes_.shape[0], theta.shape[0])
- )
|