import warnings

import numpy as np
import pytest
import scipy.sparse as sp

import sklearn
from sklearn.datasets import load_iris, make_blobs, make_circles
from sklearn.decomposition import PCA, KernelPCA
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import Perceptron
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
)
from sklearn.utils.validation import _check_psd_eigenvalues


def test_kernel_pca():
    """Nominal test for all solvers and all known kernels + a custom one.

    It tests
    - that fit_transform is equivalent to fit + transform
    - that the shapes of transforms and inverse transforms are correct
    """
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    def histogram(x, y, **kwargs):
        # Histogram kernel implemented as a callable.
        assert kwargs == {}  # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()
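
    # KernelPCA evaluates a callable kernel on every pair of samples (rows),
    # so x and y above are 1-D vectors of shape (n_features,)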
    for eigen_solver in ("auto", "dense", "arpack", "randomized"):
        for kernel in ("linear", "rbf", "poly", histogram):
            # histogram kernel produces singular matrix inside linalg.solve
            # XXX use a least-squares approximation?
            inv = not callable(kernel)

            # transform fit data
            kpca = KernelPCA(
                4, kernel=kernel, eigen_solver=eigen_solver, fit_inverse_transform=inv
            )
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
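            # the sign of each principal component is arbitrary, hence the
            # comparison of absolute values below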
            assert_array_almost_equal(
                np.abs(X_fit_transformed), np.abs(X_fit_transformed2)
            )

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel
            assert X_fit_transformed.size != 0

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert X_pred_transformed.shape[1] == X_fit_transformed.shape[1]

            # inverse transform
            if inv:
                X_pred2 = kpca.inverse_transform(X_pred_transformed)
                assert X_pred2.shape == X_pred.shape


def test_kernel_pca_invalid_parameters():
    """Check that kPCA raises an error if the parameters are invalid.

    Tests that fitting an inverse transform with a precomputed kernel raises
    a ValueError.
    """
    estimator = KernelPCA(
        n_components=10, fit_inverse_transform=True, kernel="precomputed"
    )
    err_ms = "Cannot fit_inverse_transform with a precomputed kernel"
    with pytest.raises(ValueError, match=err_ms):
        estimator.fit(np.random.randn(10, 10))


def test_kernel_pca_consistent_transform():
    """Check robustness to mutations in the original training array.

    Test that after fitting a kPCA model, it stays independent of any
    mutation of the values of the original data object, by relying on an
    internal copy.
    """
    # X_fit_ needs to retain the old, unmodified copy of X
    state = np.random.RandomState(0)
    X = state.rand(10, 10)
    kpca = KernelPCA(random_state=state).fit(X)
    transformed1 = kpca.transform(X)

    X_copy = X.copy()
    X[:, 0] = 666
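    # transform() recomputes the kernel against the stored training data
    # (X_fit_); if fit() had kept only a reference to X, the mutation above
    # would change the result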
    transformed2 = kpca.transform(X_copy)
    assert_array_almost_equal(transformed1, transformed2)


def test_kernel_pca_deterministic_output():
    """Test that Kernel PCA produces deterministic output.

    Tests that the same inputs and random state produce the same output.
    """
    rng = np.random.RandomState(0)
    X = rng.rand(10, 10)
    eigen_solver = ("arpack", "dense")

    for solver in eigen_solver:
        transformed_X = np.zeros((20, 2))
        for i in range(20):
            kpca = KernelPCA(n_components=2, eigen_solver=solver, random_state=rng)
            transformed_X[i, :] = kpca.fit_transform(X)[0]
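
        # KernelPCA enforces a sign convention on the eigenvectors at fit
        # time, so all 20 runs should produce the exact same embedding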
        assert_allclose(transformed_X, np.tile(transformed_X[0, :], 20).reshape(20, 2))


def test_kernel_pca_sparse():
    """Test that kPCA works on a sparse data input.

    Same test as ``test_kernel_pca`` except for inverse_transform, since it
    is not implemented for sparse matrices.
    """
    rng = np.random.RandomState(0)
    X_fit = sp.csr_matrix(rng.random_sample((5, 4)))
    X_pred = sp.csr_matrix(rng.random_sample((2, 4)))

    for eigen_solver in ("auto", "arpack", "randomized"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(
                4,
                kernel=kernel,
                eigen_solver=eigen_solver,
                fit_inverse_transform=False,
                random_state=0,
            )
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(
                np.abs(X_fit_transformed), np.abs(X_fit_transformed2)
            )

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert X_pred_transformed.shape[1] == X_fit_transformed.shape[1]

            # inverse transform: not available for sparse matrices
            # XXX: should we raise another exception type here? For instance:
            # NotImplementedError.
            with pytest.raises(NotFittedError):
                kpca.inverse_transform(X_pred_transformed)


@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
@pytest.mark.parametrize("n_features", [4, 10])
def test_kernel_pca_linear_kernel(solver, n_features):
    """Test that kPCA with linear kernel is equivalent to PCA for all solvers.

    KernelPCA with linear kernel should produce the same output as PCA.
    """
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, n_features))
    X_pred = rng.random_sample((2, n_features))

    # for a linear kernel, kernel PCA should find the same projection as PCA
    # modulo the sign (direction)
    # fit only the first four components: fifth is near zero eigenvalue, so
    # can be trimmed due to roundoff error
    n_comps = 3 if solver == "arpack" else 4
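    # (one component fewer for arpack, presumably because that solver tends
    # to be the least stable around near-zero eigenvalues)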
    assert_array_almost_equal(
        np.abs(KernelPCA(n_comps, eigen_solver=solver).fit(X_fit).transform(X_pred)),
        np.abs(
            PCA(n_comps, svd_solver=solver if solver != "dense" else "full")
            .fit(X_fit)
            .transform(X_pred)
        ),
    )


def test_kernel_pca_n_components():
    """Test that `n_components` is correctly taken into account for projections.

    For all solvers this tests that the output has the correct shape depending
    on the selected number of components.
    """
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("dense", "arpack", "randomized"):
        for c in [1, 2, 4]:
            kpca = KernelPCA(n_components=c, eigen_solver=eigen_solver)
            shape = kpca.fit(X_fit).transform(X_pred).shape
            assert shape == (2, c)


def test_remove_zero_eig():
    """Check that the ``remove_zero_eig`` parameter works correctly.

    Tests that the null-space (zero) eigenvalues are removed when
    remove_zero_eig=True, whereas they are not by default.
    """
    X = np.array([[1 - 1e-30, 1], [1, 1], [1, 1 - 1e-20]])
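    # at float64 precision the three points above are identical (1e-20 and
    # 1e-30 vanish next to 1), so the centered kernel is the zero matrix and
    # every eigenvalue is numerically zero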

    # n_components=None (default) => remove_zero_eig is True
    kpca = KernelPCA()
    Xt = kpca.fit_transform(X)
    assert Xt.shape == (3, 0)

    kpca = KernelPCA(n_components=2)
    Xt = kpca.fit_transform(X)
    assert Xt.shape == (3, 2)

    kpca = KernelPCA(n_components=2, remove_zero_eig=True)
    Xt = kpca.fit_transform(X)
    assert Xt.shape == (3, 0)


def test_leave_zero_eig():
    """Non-regression test for issue #12141 (PR #12143).

    This test checks that fit().transform() returns the same result as
    fit_transform() in case of a non-removed zero eigenvalue.
    """
    X_fit = np.array([[1, 1], [0, 0]])

    # Assert that even with all np warnings on, there is no div by zero warning
    with warnings.catch_warnings():
        # There might be warnings about the kernel being badly conditioned,
        # but there should not be warnings about division by zero.
        # (NumPy division by zero warnings can have many message variants, but
        # at least we know that they are RuntimeWarnings, so let's check only
        # this)
        warnings.simplefilter("error", RuntimeWarning)
        with np.errstate(all="warn"):
            k = KernelPCA(n_components=2, remove_zero_eig=False, eigen_solver="dense")
            # Fit, then transform
            A = k.fit(X_fit).transform(X_fit)
            # Do both at once
            B = k.fit_transform(X_fit)
            # Compare
            assert_array_almost_equal(np.abs(A), np.abs(B))


def test_kernel_pca_precomputed():
    """Test that kPCA works with a precomputed kernel, for all solvers."""
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))
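
    # for the default linear kernel the Gram matrix is X @ X.T, so feeding
    # np.dot(X_fit, X_fit.T) at fit time and np.dot(X_pred, X_fit.T) at
    # transform time as precomputed kernels must reproduce the same results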
    for eigen_solver in ("dense", "arpack", "randomized"):
        X_kpca = (
            KernelPCA(4, eigen_solver=eigen_solver, random_state=0)
            .fit(X_fit)
            .transform(X_pred)
        )

        X_kpca2 = (
            KernelPCA(
                4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0
            )
            .fit(np.dot(X_fit, X_fit.T))
            .transform(np.dot(X_pred, X_fit.T))
        )

        X_kpca_train = KernelPCA(
            4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0
        ).fit_transform(np.dot(X_fit, X_fit.T))

        X_kpca_train2 = (
            KernelPCA(
                4, eigen_solver=eigen_solver, kernel="precomputed", random_state=0
            )
            .fit(np.dot(X_fit, X_fit.T))
            .transform(np.dot(X_fit, X_fit.T))
        )

        assert_array_almost_equal(np.abs(X_kpca), np.abs(X_kpca2))
        assert_array_almost_equal(np.abs(X_kpca_train), np.abs(X_kpca_train2))


@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
def test_kernel_pca_precomputed_non_symmetric(solver):
    """Check that the kernel centerer works.

    Tests that a non-symmetric precomputed kernel is actually accepted
    because the kernel centerer does its job correctly.
    """
    # a non-symmetric Gram matrix
    K = [[1, 2], [3, 40]]
    kpca = KernelPCA(
        kernel="precomputed", eigen_solver=solver, n_components=1, random_state=0
    )
    kpca.fit(K)  # no error

    # same test with centered kernel
    Kc = [[9, -9], [-9, 9]]
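    # Kc is exactly K after double-centering:
    # Kc = K - row means - column means + grand mean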
    kpca_c = KernelPCA(
        kernel="precomputed", eigen_solver=solver, n_components=1, random_state=0
    )
    kpca_c.fit(Kc)

    # comparison between the non-centered and centered versions
    assert_array_equal(kpca.eigenvectors_, kpca_c.eigenvectors_)
    assert_array_equal(kpca.eigenvalues_, kpca_c.eigenvalues_)


def test_gridsearch_pipeline():
    """Check that kPCA works as expected in a grid search pipeline.

    Test if we can do a grid-search to find parameters to separate
    circles with a perceptron model.
    """
    X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)
    kpca = KernelPCA(kernel="rbf", n_components=2)
    pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))])
    param_grid = dict(kernel_pca__gamma=2.0 ** np.arange(-2, 2))
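    # i.e. gamma is searched over {0.25, 0.5, 1.0, 2.0}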
    grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
    grid_search.fit(X, y)
    assert grid_search.best_score_ == 1


def test_gridsearch_pipeline_precomputed():
    """Check that kPCA works as expected in a grid search pipeline (2).

    Test if we can do a grid-search to find parameters to separate
    circles with a perceptron model. This test uses a precomputed kernel.
    """
    X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)
    kpca = KernelPCA(kernel="precomputed", n_components=2)
    pipeline = Pipeline([("kernel_pca", kpca), ("Perceptron", Perceptron(max_iter=5))])
    param_grid = dict(Perceptron__max_iter=np.arange(1, 5))
    grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
    X_kernel = rbf_kernel(X, gamma=2.0)
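    # the full square kernel matrix can be passed directly: the pipeline
    # inherits the pairwise behavior of its first step, so GridSearchCV
    # slices the matrix along both axes when building train/test splits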
    grid_search.fit(X_kernel, y)
    assert grid_search.best_score_ == 1


def test_nested_circles():
    """Check that kPCA projects in a space where nested circles are separable.

    Tests that 2D nested circles become separable with a perceptron when
    projected onto the first 2 kPCA components using an RBF kernel, while
    the raw samples are not directly separable in the original space.
    """
    X, y = make_circles(n_samples=400, factor=0.3, noise=0.05, random_state=0)

    # 2D nested circles are not linearly separable
    train_score = Perceptron(max_iter=5).fit(X, y).score(X, y)
    assert train_score < 0.8

    # Project the circles data into the first 2 components of a RBF Kernel
    # PCA model.
    # Note that the gamma value is data dependent. If this test breaks
    # and the gamma value has to be updated, the Kernel PCA example will
    # have to be updated too.
    kpca = KernelPCA(
        kernel="rbf", n_components=2, fit_inverse_transform=True, gamma=2.0
    )
    X_kpca = kpca.fit_transform(X)

    # The data is perfectly linearly separable in that space
    train_score = Perceptron(max_iter=5).fit(X_kpca, y).score(X_kpca, y)
    assert train_score == 1.0


def test_kernel_conditioning():
    """Check that ``_check_psd_eigenvalues`` is correctly called in kPCA.

    Non-regression test for issue #12140 (PR #12145).
    """
    # create a pathological X leading to small non-zero eigenvalue
    X = [[5, 1], [5 + 1e-8, 1e-8], [5 + 1e-8, 0]]
    kpca = KernelPCA(kernel="linear", n_components=2, fit_inverse_transform=True)
    kpca.fit(X)

    # check that the small non-zero eigenvalue was correctly set to zero
    assert kpca.eigenvalues_.min() == 0
    assert np.all(kpca.eigenvalues_ == _check_psd_eigenvalues(kpca.eigenvalues_))


@pytest.mark.parametrize("solver", ["auto", "dense", "arpack", "randomized"])
def test_precomputed_kernel_not_psd(solver):
    """Check how KernelPCA works with non-PSD kernels depending on n_components.

    Tests for all methods what happens with a non-PSD Gram matrix (this
    can happen in an isomap scenario, or with custom kernel functions, or
    maybe with ill-posed datasets).

    When ``n_components`` is large enough to capture a negative eigenvalue, an
    error should be raised. Otherwise, KernelPCA should run without error
    since the negative eigenvalues are not selected.
    """
    # a non-PSD kernel with large eigenvalues, already centered
    # it was captured from an isomap call and multiplied by 100 for compactness
    K = [
        [4.48, -1.0, 8.07, 2.33, 2.33, 2.33, -5.76, -12.78],
        [-1.0, -6.48, 4.5, -1.24, -1.24, -1.24, -0.81, 7.49],
        [8.07, 4.5, 15.48, 2.09, 2.09, 2.09, -11.1, -23.23],
        [2.33, -1.24, 2.09, 4.0, -3.65, -3.65, 1.02, -0.9],
        [2.33, -1.24, 2.09, -3.65, 4.0, -3.65, 1.02, -0.9],
        [2.33, -1.24, 2.09, -3.65, -3.65, 4.0, 1.02, -0.9],
        [-5.76, -0.81, -11.1, 1.02, 1.02, 1.02, 4.86, 9.75],
        [-12.78, 7.49, -23.23, -0.9, -0.9, -0.9, 9.75, 21.46],
    ]
    # this Gram matrix has 5 positive eigenvalues and 3 negative ones
    # [ 52.72, 7.65, 7.65, 5.02, 0. , -0. , -6.13, -15.11]

    # 1. ask for enough components to get a significant negative one
    kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=7)
    # make sure that the appropriate error is raised
    with pytest.raises(ValueError, match="There are significant negative eigenvalues"):
        kpca.fit(K)

    # 2. ask for a small enough n_components to get only positive ones
    kpca = KernelPCA(kernel="precomputed", eigen_solver=solver, n_components=2)
    if solver == "randomized":
        # the randomized method is still inconsistent with the others on this,
        # since it selects the eigenvalues with the 2 largest moduli (absolute
        # values), not the 2 largest values.
        #
        # At least we can ensure that we raise an error instead of returning
        # the wrong eigenvalues
        with pytest.raises(
            ValueError, match="There are significant negative eigenvalues"
        ):
            kpca.fit(K)
    else:
        # general case: make sure that it works
        kpca.fit(K)


@pytest.mark.parametrize("n_components", [4, 10, 20])
def test_kernel_pca_solvers_equivalence(n_components):
    """Check that 'dense', 'arpack' and 'randomized' solvers give similar results."""
    # Generate random data
    n_train, n_test = 1_000, 100
    X, _ = make_circles(
        n_samples=(n_train + n_test), factor=0.3, noise=0.05, random_state=0
    )
    X_fit, X_pred = X[:n_train, :], X[n_train:, :]

    # reference (full)
    ref_pred = (
        KernelPCA(n_components, eigen_solver="dense", random_state=0)
        .fit(X_fit)
        .transform(X_pred)
    )

    # arpack
    a_pred = (
        KernelPCA(n_components, eigen_solver="arpack", random_state=0)
        .fit(X_fit)
        .transform(X_pred)
    )
    # check that the result is still correct despite the approximation
    assert_array_almost_equal(np.abs(a_pred), np.abs(ref_pred))

    # randomized
    r_pred = (
        KernelPCA(n_components, eigen_solver="randomized", random_state=0)
        .fit(X_fit)
        .transform(X_pred)
    )
    # check that the result is still correct despite the approximation
    assert_array_almost_equal(np.abs(r_pred), np.abs(ref_pred))


def test_kernel_pca_inverse_transform_reconstruction():
    """Test that the reconstruction is a good approximation.

    Note that in general it is not possible to get an arbitrarily good
    reconstruction because of kernel centering, which does not
    preserve all the information of the original data.
    """
    X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0)
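
    # alpha is the regularization strength of the ridge regression that
    # learns the inverse transform; a small value keeps the reconstruction
    # close to the original samples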
    kpca = KernelPCA(
        n_components=20, kernel="rbf", fit_inverse_transform=True, alpha=1e-3
    )
    X_trans = kpca.fit_transform(X)
    X_reconst = kpca.inverse_transform(X_trans)
    assert np.linalg.norm(X - X_reconst) / np.linalg.norm(X) < 1e-1


def test_kernel_pca_raise_not_fitted_error():
    X = np.random.randn(15).reshape(5, 3)
    kpca = KernelPCA()
    kpca.fit(X)
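
    # fit_inverse_transform defaults to False, so no inverse map was learned:
    # inverse_transform must raise NotFittedError even on a fitted model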
    with pytest.raises(NotFittedError):
        kpca.inverse_transform(X)


def test_32_64_decomposition_shape():
    """Test that the decomposition is similar for 32 and 64 bits data.

    Non-regression test for
    https://github.com/scikit-learn/scikit-learn/issues/18146
    """
    X, y = make_blobs(
        n_samples=30, centers=[[0, 0, 0], [1, 1, 1]], random_state=0, cluster_std=0.1
    )
    X = StandardScaler().fit_transform(X)
    X -= X.min()

    # Compare the shapes (corresponds to the number of non-zero eigenvalues)
    kpca = KernelPCA()
    assert kpca.fit_transform(X).shape == kpca.fit_transform(X.astype(np.float32)).shape


def test_kernel_pca_feature_names_out():
    """Check feature names out for KernelPCA."""
    X, *_ = make_blobs(n_samples=100, n_features=4, random_state=0)
    kpca = KernelPCA(n_components=2).fit(X)

    names = kpca.get_feature_names_out()
    assert_array_equal([f"kernelpca{i}" for i in range(2)], names)


def test_kernel_pca_inverse_correct_gamma():
    """Check that gamma is set correctly when not provided.

    Non-regression test for #26280
    """
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))

    kwargs = {
        "n_components": 2,
        "random_state": rng,
        "fit_inverse_transform": True,
        "kernel": "rbf",
    }

    expected_gamma = 1 / X.shape[1]
    kpca1 = KernelPCA(gamma=None, **kwargs).fit(X)
    kpca2 = KernelPCA(gamma=expected_gamma, **kwargs).fit(X)

    assert kpca1.gamma_ == expected_gamma
    assert kpca2.gamma_ == expected_gamma

    X1_recon = kpca1.inverse_transform(kpca1.transform(X))
    X2_recon = kpca2.inverse_transform(kpca1.transform(X))

    assert_allclose(X1_recon, X2_recon)


def test_kernel_pca_pandas_output():
    """Check that KernelPCA works with pandas output when the solver is arpack.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/27579
    """
    pytest.importorskip("pandas")
    X, _ = load_iris(as_frame=True, return_X_y=True)
    with sklearn.config_context(transform_output="pandas"):
        KernelPCA(n_components=2, eigen_solver="arpack").fit_transform(X)