  1. """Tests for Incremental PCA."""
  2. import warnings
  3. import numpy as np
  4. import pytest
  5. from numpy.testing import assert_array_equal
  6. from scipy import sparse
  7. from sklearn import datasets
  8. from sklearn.decomposition import PCA, IncrementalPCA
  9. from sklearn.utils._testing import (
  10. assert_allclose_dense_sparse,
  11. assert_almost_equal,
  12. assert_array_almost_equal,
  13. )
  14. iris = datasets.load_iris()
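

# Illustrative sketch (not a test): the out-of-core usage pattern exercised
# throughout this module -- update the model with partial_fit on successive
# mini-batches, then transform the full dataset. The array_split-based batching
# and the helper name are assumptions for illustration, not how the tests below
# form their batches.
def _example_incremental_usage(n_batches=3):
    X = iris.data
    ipca = IncrementalPCA(n_components=2)
    for batch in np.array_split(X, n_batches):
        ipca.partial_fit(batch)  # update the model one mini-batch at a time
    return ipca.transform(X)  # project the full dataset with the fitted model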


def test_incremental_pca():
    # Incremental PCA on dense arrays.
    X = iris.data
    batch_size = X.shape[0] // 3
    ipca = IncrementalPCA(n_components=2, batch_size=batch_size)
    pca = PCA(n_components=2)
    pca.fit_transform(X)

    X_transformed = ipca.fit_transform(X)

    assert X_transformed.shape == (X.shape[0], 2)
    np.testing.assert_allclose(
        ipca.explained_variance_ratio_.sum(),
        pca.explained_variance_ratio_.sum(),
        rtol=1e-3,
    )

    for n_components in [1, 2, X.shape[1]]:
        ipca = IncrementalPCA(n_components, batch_size=batch_size)
        ipca.fit(X)
        cov = ipca.get_covariance()
        precision = ipca.get_precision()
        np.testing.assert_allclose(
            np.dot(cov, precision), np.eye(X.shape[1]), atol=1e-13
        )


@pytest.mark.parametrize(
    "matrix_class", [sparse.csc_matrix, sparse.csr_matrix, sparse.lil_matrix]
)
def test_incremental_pca_sparse(matrix_class):
    # Incremental PCA on sparse arrays.
    X = iris.data
    pca = PCA(n_components=2)
    pca.fit_transform(X)
    X_sparse = matrix_class(X)
    batch_size = X_sparse.shape[0] // 3
    ipca = IncrementalPCA(n_components=2, batch_size=batch_size)

    X_transformed = ipca.fit_transform(X_sparse)

    assert X_transformed.shape == (X_sparse.shape[0], 2)
    np.testing.assert_allclose(
        ipca.explained_variance_ratio_.sum(),
        pca.explained_variance_ratio_.sum(),
        rtol=1e-3,
    )

    for n_components in [1, 2, X.shape[1]]:
        ipca = IncrementalPCA(n_components, batch_size=batch_size)
        ipca.fit(X_sparse)
        cov = ipca.get_covariance()
        precision = ipca.get_precision()
        np.testing.assert_allclose(
            np.dot(cov, precision), np.eye(X_sparse.shape[1]), atol=1e-13
        )

    with pytest.raises(
        TypeError,
        match=(
            "IncrementalPCA.partial_fit does not support "
            "sparse input. Either convert data to dense "
            "or use IncrementalPCA.fit to do so in batches."
        ),
    ):
        ipca.partial_fit(X_sparse)


def test_incremental_pca_check_projection():
    # Test that the projection of data is correct.
    rng = np.random.RandomState(1999)
    n, p = 100, 3
    X = rng.randn(n, p) * 0.1
    X[:10] += np.array([3, 4, 5])
    Xt = 0.1 * rng.randn(1, p) + np.array([3, 4, 5])

    # Get the reconstruction of the generated data X
    # Note that Xt has the same "components" as X, just separated
    # This is what we want to ensure is recreated correctly
    Yt = IncrementalPCA(n_components=2).fit(X).transform(Xt)

    # Normalize
    Yt /= np.sqrt((Yt**2).sum())

    # Make sure that the first element of Yt is ~1, this means
    # the reconstruction worked as expected
    assert_almost_equal(np.abs(Yt[0][0]), 1.0, 1)


def test_incremental_pca_inverse():
    # Test that the projection of data can be inverted.
    rng = np.random.RandomState(1999)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= 0.00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    ipca = IncrementalPCA(n_components=2, batch_size=10).fit(X)
    Y = ipca.transform(X)
    Y_inverse = ipca.inverse_transform(Y)
    assert_almost_equal(X, Y_inverse, decimal=3)
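

# Illustrative sketch (not a test): for an unwhitened model, inverse_transform
# amounts to the affine reconstruction Y @ components_ + mean_, which is the
# identity the test above leans on. A minimal check of that identity; the data
# shape and helper name here are just for illustration.
def _example_inverse_reconstruction():
    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    ipca = IncrementalPCA(n_components=2, batch_size=10).fit(X)
    Y = ipca.transform(X)
    # Reconstruction from the principal subspace plus the stored mean.
    X_hat = np.dot(Y, ipca.components_) + ipca.mean_
    np.testing.assert_allclose(X_hat, ipca.inverse_transform(Y))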


def test_incremental_pca_validation():
    # Test that n_components is <= n_features.
    X = np.array([[0, 1, 0], [1, 0, 0]])
    n_samples, n_features = X.shape
    n_components = 4
    with pytest.raises(
        ValueError,
        match=(
            "n_components={} invalid"
            " for n_features={}, need more rows than"
            " columns for IncrementalPCA"
            " processing".format(n_components, n_features)
        ),
    ):
        IncrementalPCA(n_components, batch_size=10).fit(X)

    # Tests that n_components is also <= n_samples.
    n_components = 3
    with pytest.raises(
        ValueError,
        match=(
            "n_components={} must be"
            " less or equal to the batch number of"
            " samples {}".format(n_components, n_samples)
        ),
    ):
        IncrementalPCA(n_components=n_components).partial_fit(X)


def test_n_samples_equal_n_components():
    # Ensures no warning is raised when n_samples == n_components
    # Non-regression test for gh-19050
    ipca = IncrementalPCA(n_components=5)
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        ipca.partial_fit(np.random.randn(5, 7))
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        ipca.fit(np.random.randn(5, 7))


def test_n_components_none():
    # Ensures that n_components == None is handled correctly
    rng = np.random.RandomState(1999)
    for n_samples, n_features in [(50, 10), (10, 50)]:
        X = rng.rand(n_samples, n_features)
        ipca = IncrementalPCA(n_components=None)

        # First partial_fit call, ipca.n_components_ is inferred from
        # min(X.shape)
        ipca.partial_fit(X)
        assert ipca.n_components_ == min(X.shape)

        # Second partial_fit call, ipca.n_components_ is inferred from
        # ipca.components_ computed from the first partial_fit call
        ipca.partial_fit(X)
        assert ipca.n_components_ == ipca.components_.shape[0]


def test_incremental_pca_set_params():
    # Test that changing n_components via set_params between partial_fit
    # calls raises an error.
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 20
    X = rng.randn(n_samples, n_features)
    X2 = rng.randn(n_samples, n_features)
    X3 = rng.randn(n_samples, n_features)
    ipca = IncrementalPCA(n_components=20)
    ipca.fit(X)
    # Decreasing number of components
    ipca.set_params(n_components=10)
    with pytest.raises(ValueError):
        ipca.partial_fit(X2)
    # Increasing number of components
    ipca.set_params(n_components=15)
    with pytest.raises(ValueError):
        ipca.partial_fit(X3)
    # Returning to original setting
    ipca.set_params(n_components=20)
    ipca.partial_fit(X)


def test_incremental_pca_num_features_change():
    # Test that changing the number of features between partial_fit calls
    # raises an error.
    rng = np.random.RandomState(1999)
    n_samples = 100
    X = rng.randn(n_samples, 20)
    X2 = rng.randn(n_samples, 50)
    ipca = IncrementalPCA(n_components=None)
    ipca.fit(X)
    with pytest.raises(ValueError):
        ipca.partial_fit(X2)


def test_incremental_pca_batch_signs():
    # Test that components_ sign is stable over batch sizes.
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 3
    X = rng.randn(n_samples, n_features)
    all_components = []
    batch_sizes = np.arange(10, 20)
    for batch_size in batch_sizes:
        ipca = IncrementalPCA(n_components=None, batch_size=batch_size).fit(X)
        all_components.append(ipca.components_)

    for i, j in zip(all_components[:-1], all_components[1:]):
        assert_almost_equal(np.sign(i), np.sign(j), decimal=6)


def test_incremental_pca_batch_values():
    # Test that components_ values are stable over batch sizes.
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 3
    X = rng.randn(n_samples, n_features)
    all_components = []
    batch_sizes = np.arange(20, 40, 3)
    for batch_size in batch_sizes:
        ipca = IncrementalPCA(n_components=None, batch_size=batch_size).fit(X)
        all_components.append(ipca.components_)

    for i, j in zip(all_components[:-1], all_components[1:]):
        assert_almost_equal(i, j, decimal=1)


def test_incremental_pca_batch_rank():
    # Test the case where the sample size in each batch is always larger than
    # or equal to n_components.
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 20
    X = rng.randn(n_samples, n_features)
    all_components = []
    batch_sizes = np.arange(20, 90, 3)
    for batch_size in batch_sizes:
        ipca = IncrementalPCA(n_components=20, batch_size=batch_size).fit(X)
        all_components.append(ipca.components_)

    for components_i, components_j in zip(all_components[:-1], all_components[1:]):
        assert_allclose_dense_sparse(components_i, components_j)


def test_incremental_pca_partial_fit():
    # Test that fit and partial_fit get equivalent results.
    rng = np.random.RandomState(1999)
    n, p = 50, 3
    X = rng.randn(n, p)  # spherical data
    X[:, 1] *= 0.00001  # make middle component relatively small
    X += [5, 4, 3]  # make a large mean

    # same check that we can find the original data from the transformed
    # signal (since the data is almost of rank n_components)
    batch_size = 10
    ipca = IncrementalPCA(n_components=2, batch_size=batch_size).fit(X)
    pipca = IncrementalPCA(n_components=2, batch_size=batch_size)
    # Add one to make sure endpoint is included
    batch_itr = np.arange(0, n + 1, batch_size)
    for i, j in zip(batch_itr[:-1], batch_itr[1:]):
        pipca.partial_fit(X[i:j, :])
    assert_almost_equal(ipca.components_, pipca.components_, decimal=3)
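

# Illustrative sketch (not a test): the manual index arithmetic above can also
# be written with sklearn.utils.gen_batches, scikit-learn's batching helper,
# which yields slice objects covering 0..n_samples. A minimal equivalent using
# the same data shape and batch size as the test above; the helper name is an
# assumption for illustration.
def _example_partial_fit_with_gen_batches():
    from sklearn.utils import gen_batches

    rng = np.random.RandomState(1999)
    X = rng.randn(50, 3)
    ipca = IncrementalPCA(n_components=2)
    for batch in gen_batches(X.shape[0], 10):
        ipca.partial_fit(X[batch])  # each `batch` is a slice of row indices
    return ipca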


def test_incremental_pca_against_pca_iris():
    # Test that IncrementalPCA and PCA are approximate (to a sign flip).
    X = iris.data

    Y_pca = PCA(n_components=2).fit_transform(X)
    Y_ipca = IncrementalPCA(n_components=2, batch_size=25).fit_transform(X)

    assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1)


def test_incremental_pca_against_pca_random_data():
    # Test that IncrementalPCA and PCA are approximate (to a sign flip).
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 3
    X = rng.randn(n_samples, n_features) + 5 * rng.rand(1, n_features)

    Y_pca = PCA(n_components=3).fit_transform(X)
    Y_ipca = IncrementalPCA(n_components=3, batch_size=25).fit_transform(X)

    assert_almost_equal(np.abs(Y_pca), np.abs(Y_ipca), 1)


def test_explained_variances():
    # Test that PCA and IncrementalPCA calculations match
    X = datasets.make_low_rank_matrix(
        1000, 100, tail_strength=0.0, effective_rank=10, random_state=1999
    )
    prec = 3
    n_samples, n_features = X.shape
    for nc in [None, 99]:
        pca = PCA(n_components=nc).fit(X)
        ipca = IncrementalPCA(n_components=nc, batch_size=100).fit(X)
        assert_almost_equal(
            pca.explained_variance_, ipca.explained_variance_, decimal=prec
        )
        assert_almost_equal(
            pca.explained_variance_ratio_, ipca.explained_variance_ratio_, decimal=prec
        )
        assert_almost_equal(pca.noise_variance_, ipca.noise_variance_, decimal=prec)


def test_singular_values():
    # Check that the IncrementalPCA output has the correct singular values
    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 100

    X = datasets.make_low_rank_matrix(
        n_samples, n_features, tail_strength=0.0, effective_rank=10, random_state=rng
    )

    pca = PCA(n_components=10, svd_solver="full", random_state=rng).fit(X)
    ipca = IncrementalPCA(n_components=10, batch_size=100).fit(X)
    assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2)

    # Compare to the Frobenius norm
    X_pca = pca.transform(X)
    X_ipca = ipca.transform(X)
    assert_array_almost_equal(
        np.sum(pca.singular_values_**2.0), np.linalg.norm(X_pca, "fro") ** 2.0, 12
    )
    assert_array_almost_equal(
        np.sum(ipca.singular_values_**2.0), np.linalg.norm(X_ipca, "fro") ** 2.0, 2
    )

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(
        pca.singular_values_, np.sqrt(np.sum(X_pca**2.0, axis=0)), 12
    )
    assert_array_almost_equal(
        ipca.singular_values_, np.sqrt(np.sum(X_ipca**2.0, axis=0)), 2
    )

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = datasets.make_low_rank_matrix(
        n_samples, n_features, tail_strength=0.0, effective_rank=3, random_state=rng
    )

    pca = PCA(n_components=3, svd_solver="full", random_state=rng)
    ipca = IncrementalPCA(n_components=3, batch_size=100)
    X_pca = pca.fit_transform(X)
    X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    X_hat = np.dot(X_pca, pca.components_)
    pca.fit(X_hat)
    ipca.fit(X_hat)
    assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14)


def test_whitening():
    # Test that PCA and IncrementalPCA transforms match to sign flip.
    X = datasets.make_low_rank_matrix(
        1000, 10, tail_strength=0.0, effective_rank=2, random_state=1999
    )
    prec = 3
    n_samples, n_features = X.shape
    for nc in [None, 9]:
        pca = PCA(whiten=True, n_components=nc).fit(X)
        ipca = IncrementalPCA(whiten=True, n_components=nc, batch_size=250).fit(X)

        Xt_pca = pca.transform(X)
        Xt_ipca = ipca.transform(X)
        assert_almost_equal(np.abs(Xt_pca), np.abs(Xt_ipca), decimal=prec)

        Xinv_ipca = ipca.inverse_transform(Xt_ipca)
        Xinv_pca = pca.inverse_transform(Xt_pca)
        assert_almost_equal(X, Xinv_ipca, decimal=prec)
        assert_almost_equal(X, Xinv_pca, decimal=prec)
        assert_almost_equal(Xinv_pca, Xinv_ipca, decimal=prec)
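

# Illustrative sketch (not a test): with whiten=True the projected scores are
# rescaled so each retained component has roughly unit variance, which is what
# makes the whitened PCA/IncrementalPCA outputs above comparable. A minimal
# check on well-conditioned synthetic data, assuming the ddof=1 convention
# matching explained_variance_; the data and helper name are illustrative only.
def _example_whitened_unit_variance():
    rng = np.random.RandomState(0)
    # Three dominant directions plus two near-negligible ones, so truncating
    # to three components loses almost nothing.
    X = rng.randn(200, 5) * np.array([5.0, 4.0, 3.0, 1e-3, 1e-3])
    ipca = IncrementalPCA(n_components=3, whiten=True, batch_size=50).fit(X)
    Xt = ipca.transform(X)
    np.testing.assert_allclose(np.var(Xt, axis=0, ddof=1), 1.0, rtol=1e-2)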


def test_incremental_pca_partial_fit_float_division():
    # Test to ensure float division is used in all versions of Python
    # (non-regression test for issue #9489)
    rng = np.random.RandomState(0)
    A = rng.randn(5, 3) + 2
    B = rng.randn(7, 3) + 5

    pca = IncrementalPCA(n_components=2)
    pca.partial_fit(A)
    # Set n_samples_seen_ to be a floating point number instead of an int
    pca.n_samples_seen_ = float(pca.n_samples_seen_)
    pca.partial_fit(B)
    singular_vals_float_samples_seen = pca.singular_values_

    pca2 = IncrementalPCA(n_components=2)
    pca2.partial_fit(A)
    pca2.partial_fit(B)
    singular_vals_int_samples_seen = pca2.singular_values_

    np.testing.assert_allclose(
        singular_vals_float_samples_seen, singular_vals_int_samples_seen
    )


def test_incremental_pca_fit_overflow_error():
    # Test for overflow error on Windows OS
    # (non-regression test for issue #17693)
    rng = np.random.RandomState(0)
    A = rng.rand(500000, 2)

    ipca = IncrementalPCA(n_components=2, batch_size=10000)
    ipca.fit(A)

    pca = PCA(n_components=2)
    pca.fit(A)

    np.testing.assert_allclose(ipca.singular_values_, pca.singular_values_)


def test_incremental_pca_feature_names_out():
    """Check feature names out for IncrementalPCA."""
    ipca = IncrementalPCA(n_components=2).fit(iris.data)

    names = ipca.get_feature_names_out()
    assert_array_equal([f"incrementalpca{i}" for i in range(2)], names)