  1. """Module :mod:`sklearn.kernel_ridge` implements kernel ridge regression."""
  2. # Authors: Mathieu Blondel <mathieu@mblondel.org>
  3. # Jan Hendrik Metzen <jhm@informatik.uni-bremen.de>
  4. # License: BSD 3 clause
  5. from numbers import Integral, Real
  6. import numpy as np
  7. from .base import BaseEstimator, MultiOutputMixin, RegressorMixin, _fit_context
  8. from .linear_model._ridge import _solve_cholesky_kernel
  9. from .metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS, pairwise_kernels
  10. from .utils._param_validation import Interval, StrOptions
  11. from .utils.validation import _check_sample_weight, check_is_fitted

class KernelRidge(MultiOutputMixin, RegressorMixin, BaseEstimator):
    """Kernel ridge regression.

    Kernel ridge regression (KRR) combines ridge regression (linear least
    squares with l2-norm regularization) with the kernel trick. It thus
    learns a linear function in the space induced by the respective kernel
    and the data. For non-linear kernels, this corresponds to a non-linear
    function in the original space.

    The form of the model learned by KRR is identical to support vector
    regression (SVR). However, different loss functions are used: KRR uses
    squared error loss while SVR uses epsilon-insensitive loss, both combined
    with l2 regularization. In contrast to SVR, fitting a KRR model can be
    done in closed form and is typically faster for medium-sized datasets.
    On the other hand, the learned model is non-sparse and thus slower at
    prediction time than SVR, which learns a sparse model for epsilon > 0.
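
    Concretely (a summary of the solve performed in :meth:`fit` below),
    fitting reduces, up to sample weighting, to solving the dual linear
    system ``(K + alpha * I) dual_coef_ = y``, where ``K`` is the kernel
    matrix of the training data.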

    This estimator has built-in support for multi-variate regression
    (i.e., when y is a 2d-array of shape [n_samples, n_targets]).

    Read more in the :ref:`User Guide <kernel_ridge>`.

    Parameters
    ----------
    alpha : float or array-like of shape (n_targets,), default=1.0
        Regularization strength; must be a positive float. Regularization
        improves the conditioning of the problem and reduces the variance of
        the estimates. Larger values specify stronger regularization.
        Alpha corresponds to ``1 / (2C)`` in other linear models such as
        :class:`~sklearn.linear_model.LogisticRegression` or
        :class:`~sklearn.svm.LinearSVC`. If an array is passed, penalties are
        assumed to be specific to the targets. Hence they must correspond in
        number. See :ref:`ridge_regression` for the formula.

    kernel : str or callable, default="linear"
        Kernel mapping used internally. This parameter is directly passed to
        :func:`~sklearn.metrics.pairwise.pairwise_kernels`.
        If `kernel` is a string, it must be one of the metrics
        in `pairwise.PAIRWISE_KERNEL_FUNCTIONS` or "precomputed".
        If `kernel` is "precomputed", X is assumed to be a kernel matrix.
        Alternatively, if `kernel` is a callable function, it is called on
        each pair of instances (rows) and the resulting value recorded. The
        callable should take two rows from X as input and return the
        corresponding kernel value as a single number. This means that
        callables from :mod:`sklearn.metrics.pairwise` are not allowed, as
        they operate on matrices, not single samples. Use the string
        identifying the kernel instead.

    gamma : float, default=None
        Gamma parameter for the RBF, laplacian, polynomial, exponential chi2
        and sigmoid kernels. Interpretation of the default value is left to
        the kernel; see the documentation for :mod:`sklearn.metrics.pairwise`.
        Ignored by other kernels.

    degree : int, default=3
        Degree of the polynomial kernel. Ignored by other kernels.

    coef0 : float, default=1
        Zero coefficient for polynomial and sigmoid kernels.
        Ignored by other kernels.

    kernel_params : dict, default=None
        Additional parameters (keyword arguments) for the kernel function
        passed as a callable object.

    Attributes
    ----------
    dual_coef_ : ndarray of shape (n_samples,) or (n_samples, n_targets)
        Representation of weight vector(s) in kernel space.

    X_fit_ : {ndarray, sparse matrix} of shape (n_samples, n_features)
        Training data, which is also required for prediction. If
        kernel == "precomputed" this is instead the precomputed
        training matrix, of shape (n_samples, n_samples).

    n_features_in_ : int
        Number of features seen during :term:`fit`.

        .. versionadded:: 0.24

    feature_names_in_ : ndarray of shape (`n_features_in_`,)
        Names of features seen during :term:`fit`. Defined only when `X`
        has feature names that are all strings.

        .. versionadded:: 1.0

    See Also
    --------
    sklearn.gaussian_process.GaussianProcessRegressor : Gaussian process
        regressor providing automatic kernel hyperparameter tuning and
        uncertainty in predictions.
    sklearn.linear_model.Ridge : Linear ridge regression.
    sklearn.linear_model.RidgeCV : Ridge regression with built-in
        cross-validation.
    sklearn.svm.SVR : Support vector regression accepting a large variety
        of kernels.

    References
    ----------
    * Kevin P. Murphy,
      "Machine Learning: A Probabilistic Perspective", The MIT Press,
      chapter 14.4.3, pp. 492-493.

    Examples
    --------
    >>> from sklearn.kernel_ridge import KernelRidge
    >>> import numpy as np
    >>> n_samples, n_features = 10, 5
    >>> rng = np.random.RandomState(0)
    >>> y = rng.randn(n_samples)
    >>> X = rng.randn(n_samples, n_features)
    >>> krr = KernelRidge(alpha=1.0)
    >>> krr.fit(X, y)
    KernelRidge(alpha=1.0)
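
    Predictions then come from :meth:`predict`. A callable kernel with the
    two-row signature described above can be supplied as well; the lambda
    here is only an illustration of the expected signature, not a
    recommended kernel:

    >>> krr.predict(X).shape
    (10,)
    >>> krr_rbf = KernelRidge(
    ...     kernel=lambda a, b: np.exp(-0.5 * np.sum((a - b) ** 2))
    ... )
    >>> krr_rbf.fit(X, y).predict(X).shape
    (10,)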
  108. """

    _parameter_constraints: dict = {
        "alpha": [Interval(Real, 0, None, closed="left"), "array-like"],
        "kernel": [
            StrOptions(set(PAIRWISE_KERNEL_FUNCTIONS.keys()) | {"precomputed"}),
            callable,
        ],
        "gamma": [Interval(Real, 0, None, closed="left"), None],
        "degree": [Interval(Integral, 0, None, closed="left")],
        "coef0": [Interval(Real, None, None, closed="neither")],
        "kernel_params": [dict, None],
    }

    def __init__(
        self,
        alpha=1,
        *,
        kernel="linear",
        gamma=None,
        degree=3,
        coef0=1,
        kernel_params=None,
    ):
        self.alpha = alpha
        self.kernel = kernel
        self.gamma = gamma
        self.degree = degree
        self.coef0 = coef0
        self.kernel_params = kernel_params

    def _get_kernel(self, X, Y=None):
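        # For a callable kernel, forward only the user-supplied kernel_params;
        # for string kernels, hand gamma/degree/coef0 to pairwise_kernels and
        # let filter_params drop those the chosen kernel does not accept.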
        if callable(self.kernel):
            params = self.kernel_params or {}
        else:
            params = {"gamma": self.gamma, "degree": self.degree, "coef0": self.coef0}
        return pairwise_kernels(X, Y, metric=self.kernel, filter_params=True, **params)

    def _more_tags(self):
        return {"pairwise": self.kernel == "precomputed"}

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y, sample_weight=None):
        """Fit Kernel Ridge regression model.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Training data. If kernel == "precomputed" this is instead
            a precomputed kernel matrix, of shape (n_samples, n_samples).

        y : array-like of shape (n_samples,) or (n_samples, n_targets)
            Target values.

        sample_weight : float or array-like of shape (n_samples,), default=None
            Individual weights for each sample, ignored if None is passed.

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        # Convert data
        X, y = self._validate_data(
            X, y, accept_sparse=("csr", "csc"), multi_output=True, y_numeric=True
        )
        if sample_weight is not None and not isinstance(sample_weight, float):
            sample_weight = _check_sample_weight(sample_weight, X)

        K = self._get_kernel(X)
        alpha = np.atleast_1d(self.alpha)

        ravel = False
        if len(y.shape) == 1:
            y = y.reshape(-1, 1)
            ravel = True

        copy = self.kernel == "precomputed"
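        # Closed-form fit: solve the dual system (K + alpha * I) dual_coef = y
        # (a rescaled variant of it when sample weights are given). The solver
        # may modify K in place, hence the copy for a user-provided
        # precomputed kernel matrix.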
        self.dual_coef_ = _solve_cholesky_kernel(K, y, alpha, sample_weight, copy)
        if ravel:
            self.dual_coef_ = self.dual_coef_.ravel()

        self.X_fit_ = X

        return self

    def predict(self, X):
        """Predict using the kernel ridge model.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            Samples. If kernel == "precomputed" this is instead a
            precomputed kernel matrix of shape (n_samples, n_samples_fitted),
            where n_samples_fitted is the number of samples used in the
            fitting for this estimator.

        Returns
        -------
        C : ndarray of shape (n_samples,) or (n_samples, n_targets)
            Returns predicted values.
        """
        check_is_fitted(self)
        X = self._validate_data(X, accept_sparse=("csr", "csc"), reset=False)
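        # Predict as the kernel between new and training samples (or the
        # user-provided precomputed matrix) times the fitted dual weights.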
        K = self._get_kernel(X, self.X_fit_)
        return np.dot(K, self.dual_coef_)