test_random_projection.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470
  1. import functools
  2. import warnings
  3. from typing import Any, List
  4. import numpy as np
  5. import pytest
  6. import scipy.sparse as sp
  7. from sklearn.exceptions import DataDimensionalityWarning
  8. from sklearn.metrics import euclidean_distances
  9. from sklearn.random_projection import (
  10. GaussianRandomProjection,
  11. SparseRandomProjection,
  12. _gaussian_random_matrix,
  13. _sparse_random_matrix,
  14. johnson_lindenstrauss_min_dim,
  15. )
  16. from sklearn.utils._testing import (
  17. assert_allclose,
  18. assert_allclose_dense_sparse,
  19. assert_almost_equal,
  20. assert_array_almost_equal,
  21. assert_array_equal,
  22. )
  23. all_sparse_random_matrix: List[Any] = [_sparse_random_matrix]
  24. all_dense_random_matrix: List[Any] = [_gaussian_random_matrix]
  25. all_random_matrix = all_sparse_random_matrix + all_dense_random_matrix
  26. all_SparseRandomProjection: List[Any] = [SparseRandomProjection]
  27. all_DenseRandomProjection: List[Any] = [GaussianRandomProjection]
  28. all_RandomProjection = all_SparseRandomProjection + all_DenseRandomProjection
  29. # Make some random data with uniformly located non zero entries with
  30. # Gaussian distributed values
  31. def make_sparse_random_data(n_samples, n_features, n_nonzeros, random_state=0):
  32. rng = np.random.RandomState(random_state)
  33. data_coo = sp.coo_matrix(
  34. (
  35. rng.randn(n_nonzeros),
  36. (
  37. rng.randint(n_samples, size=n_nonzeros),
  38. rng.randint(n_features, size=n_nonzeros),
  39. ),
  40. ),
  41. shape=(n_samples, n_features),
  42. )
  43. return data_coo.toarray(), data_coo.tocsr()
  44. def densify(matrix):
  45. if not sp.issparse(matrix):
  46. return matrix
  47. else:
  48. return matrix.toarray()
  49. n_samples, n_features = (10, 1000)
  50. n_nonzeros = int(n_samples * n_features / 100.0)
  51. data, data_csr = make_sparse_random_data(n_samples, n_features, n_nonzeros)
  52. ###############################################################################
  53. # test on JL lemma
  54. ###############################################################################
  55. @pytest.mark.parametrize(
  56. "n_samples, eps",
  57. [
  58. ([100, 110], [0.9, 1.1]),
  59. ([90, 100], [0.1, 0.0]),
  60. ([50, -40], [0.1, 0.2]),
  61. ],
  62. )
  63. def test_invalid_jl_domain(n_samples, eps):
  64. with pytest.raises(ValueError):
  65. johnson_lindenstrauss_min_dim(n_samples, eps=eps)
  66. def test_input_size_jl_min_dim():
  67. with pytest.raises(ValueError):
  68. johnson_lindenstrauss_min_dim(3 * [100], eps=2 * [0.9])
  69. johnson_lindenstrauss_min_dim(
  70. np.random.randint(1, 10, size=(10, 10)), eps=np.full((10, 10), 0.5)
  71. )
  72. ###############################################################################
  73. # tests random matrix generation
  74. ###############################################################################
  75. def check_input_size_random_matrix(random_matrix):
  76. inputs = [(0, 0), (-1, 1), (1, -1), (1, 0), (-1, 0)]
  77. for n_components, n_features in inputs:
  78. with pytest.raises(ValueError):
  79. random_matrix(n_components, n_features)
  80. def check_size_generated(random_matrix):
  81. inputs = [(1, 5), (5, 1), (5, 5), (1, 1)]
  82. for n_components, n_features in inputs:
  83. assert random_matrix(n_components, n_features).shape == (
  84. n_components,
  85. n_features,
  86. )
  87. def check_zero_mean_and_unit_norm(random_matrix):
  88. # All random matrix should produce a transformation matrix
  89. # with zero mean and unit norm for each columns
  90. A = densify(random_matrix(10000, 1, random_state=0))
  91. assert_array_almost_equal(0, np.mean(A), 3)
  92. assert_array_almost_equal(1.0, np.linalg.norm(A), 1)
  93. def check_input_with_sparse_random_matrix(random_matrix):
  94. n_components, n_features = 5, 10
  95. for density in [-1.0, 0.0, 1.1]:
  96. with pytest.raises(ValueError):
  97. random_matrix(n_components, n_features, density=density)
  98. @pytest.mark.parametrize("random_matrix", all_random_matrix)
  99. def test_basic_property_of_random_matrix(random_matrix):
  100. # Check basic properties of random matrix generation
  101. check_input_size_random_matrix(random_matrix)
  102. check_size_generated(random_matrix)
  103. check_zero_mean_and_unit_norm(random_matrix)
  104. @pytest.mark.parametrize("random_matrix", all_sparse_random_matrix)
  105. def test_basic_property_of_sparse_random_matrix(random_matrix):
  106. check_input_with_sparse_random_matrix(random_matrix)
  107. random_matrix_dense = functools.partial(random_matrix, density=1.0)
  108. check_zero_mean_and_unit_norm(random_matrix_dense)
  109. def test_gaussian_random_matrix():
  110. # Check some statical properties of Gaussian random matrix
  111. # Check that the random matrix follow the proper distribution.
  112. # Let's say that each element of a_{ij} of A is taken from
  113. # a_ij ~ N(0.0, 1 / n_components).
  114. #
  115. n_components = 100
  116. n_features = 1000
  117. A = _gaussian_random_matrix(n_components, n_features, random_state=0)
  118. assert_array_almost_equal(0.0, np.mean(A), 2)
  119. assert_array_almost_equal(np.var(A, ddof=1), 1 / n_components, 1)
  120. def test_sparse_random_matrix():
  121. # Check some statical properties of sparse random matrix
  122. n_components = 100
  123. n_features = 500
  124. for density in [0.3, 1.0]:
  125. s = 1 / density
  126. A = _sparse_random_matrix(
  127. n_components, n_features, density=density, random_state=0
  128. )
  129. A = densify(A)
  130. # Check possible values
  131. values = np.unique(A)
  132. assert np.sqrt(s) / np.sqrt(n_components) in values
  133. assert -np.sqrt(s) / np.sqrt(n_components) in values
  134. if density == 1.0:
  135. assert np.size(values) == 2
  136. else:
  137. assert 0.0 in values
  138. assert np.size(values) == 3
  139. # Check that the random matrix follow the proper distribution.
  140. # Let's say that each element of a_{ij} of A is taken from
  141. #
  142. # - -sqrt(s) / sqrt(n_components) with probability 1 / 2s
  143. # - 0 with probability 1 - 1 / s
  144. # - +sqrt(s) / sqrt(n_components) with probability 1 / 2s
  145. #
  146. assert_almost_equal(np.mean(A == 0.0), 1 - 1 / s, decimal=2)
  147. assert_almost_equal(
  148. np.mean(A == np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2
  149. )
  150. assert_almost_equal(
  151. np.mean(A == -np.sqrt(s) / np.sqrt(n_components)), 1 / (2 * s), decimal=2
  152. )
  153. assert_almost_equal(np.var(A == 0.0, ddof=1), (1 - 1 / s) * 1 / s, decimal=2)
  154. assert_almost_equal(
  155. np.var(A == np.sqrt(s) / np.sqrt(n_components), ddof=1),
  156. (1 - 1 / (2 * s)) * 1 / (2 * s),
  157. decimal=2,
  158. )
  159. assert_almost_equal(
  160. np.var(A == -np.sqrt(s) / np.sqrt(n_components), ddof=1),
  161. (1 - 1 / (2 * s)) * 1 / (2 * s),
  162. decimal=2,
  163. )
  164. ###############################################################################
  165. # tests on random projection transformer
  166. ###############################################################################
  167. def test_random_projection_transformer_invalid_input():
  168. n_components = "auto"
  169. fit_data = [[0, 1, 2]]
  170. for RandomProjection in all_RandomProjection:
  171. with pytest.raises(ValueError):
  172. RandomProjection(n_components=n_components).fit(fit_data)
  173. def test_try_to_transform_before_fit():
  174. for RandomProjection in all_RandomProjection:
  175. with pytest.raises(ValueError):
  176. RandomProjection(n_components="auto").transform(data)
  177. def test_too_many_samples_to_find_a_safe_embedding():
  178. data, _ = make_sparse_random_data(1000, 100, 1000)
  179. for RandomProjection in all_RandomProjection:
  180. rp = RandomProjection(n_components="auto", eps=0.1)
  181. expected_msg = (
  182. "eps=0.100000 and n_samples=1000 lead to a target dimension"
  183. " of 5920 which is larger than the original space with"
  184. " n_features=100"
  185. )
  186. with pytest.raises(ValueError, match=expected_msg):
  187. rp.fit(data)
  188. def test_random_projection_embedding_quality():
  189. data, _ = make_sparse_random_data(8, 5000, 15000)
  190. eps = 0.2
  191. original_distances = euclidean_distances(data, squared=True)
  192. original_distances = original_distances.ravel()
  193. non_identical = original_distances != 0.0
  194. # remove 0 distances to avoid division by 0
  195. original_distances = original_distances[non_identical]
  196. for RandomProjection in all_RandomProjection:
  197. rp = RandomProjection(n_components="auto", eps=eps, random_state=0)
  198. projected = rp.fit_transform(data)
  199. projected_distances = euclidean_distances(projected, squared=True)
  200. projected_distances = projected_distances.ravel()
  201. # remove 0 distances to avoid division by 0
  202. projected_distances = projected_distances[non_identical]
  203. distances_ratio = projected_distances / original_distances
  204. # check that the automatically tuned values for the density respect the
  205. # contract for eps: pairwise distances are preserved according to the
  206. # Johnson-Lindenstrauss lemma
  207. assert distances_ratio.max() < 1 + eps
  208. assert 1 - eps < distances_ratio.min()
  209. def test_SparseRandomProj_output_representation():
  210. for SparseRandomProj in all_SparseRandomProjection:
  211. # when using sparse input, the projected data can be forced to be a
  212. # dense numpy array
  213. rp = SparseRandomProj(n_components=10, dense_output=True, random_state=0)
  214. rp.fit(data)
  215. assert isinstance(rp.transform(data), np.ndarray)
  216. sparse_data = sp.csr_matrix(data)
  217. assert isinstance(rp.transform(sparse_data), np.ndarray)
  218. # the output can be left to a sparse matrix instead
  219. rp = SparseRandomProj(n_components=10, dense_output=False, random_state=0)
  220. rp = rp.fit(data)
  221. # output for dense input will stay dense:
  222. assert isinstance(rp.transform(data), np.ndarray)
  223. # output for sparse output will be sparse:
  224. assert sp.issparse(rp.transform(sparse_data))
  225. def test_correct_RandomProjection_dimensions_embedding():
  226. for RandomProjection in all_RandomProjection:
  227. rp = RandomProjection(n_components="auto", random_state=0, eps=0.5).fit(data)
  228. # the number of components is adjusted from the shape of the training
  229. # set
  230. assert rp.n_components == "auto"
  231. assert rp.n_components_ == 110
  232. if RandomProjection in all_SparseRandomProjection:
  233. assert rp.density == "auto"
  234. assert_almost_equal(rp.density_, 0.03, 2)
  235. assert rp.components_.shape == (110, n_features)
  236. projected_1 = rp.transform(data)
  237. assert projected_1.shape == (n_samples, 110)
  238. # once the RP is 'fitted' the projection is always the same
  239. projected_2 = rp.transform(data)
  240. assert_array_equal(projected_1, projected_2)
  241. # fit transform with same random seed will lead to the same results
  242. rp2 = RandomProjection(random_state=0, eps=0.5)
  243. projected_3 = rp2.fit_transform(data)
  244. assert_array_equal(projected_1, projected_3)
  245. # Try to transform with an input X of size different from fitted.
  246. with pytest.raises(ValueError):
  247. rp.transform(data[:, 1:5])
  248. # it is also possible to fix the number of components and the density
  249. # level
  250. if RandomProjection in all_SparseRandomProjection:
  251. rp = RandomProjection(n_components=100, density=0.001, random_state=0)
  252. projected = rp.fit_transform(data)
  253. assert projected.shape == (n_samples, 100)
  254. assert rp.components_.shape == (100, n_features)
  255. assert rp.components_.nnz < 115 # close to 1% density
  256. assert 85 < rp.components_.nnz # close to 1% density
  257. def test_warning_n_components_greater_than_n_features():
  258. n_features = 20
  259. data, _ = make_sparse_random_data(5, n_features, int(n_features / 4))
  260. for RandomProjection in all_RandomProjection:
  261. with pytest.warns(DataDimensionalityWarning):
  262. RandomProjection(n_components=n_features + 1).fit(data)
  263. def test_works_with_sparse_data():
  264. n_features = 20
  265. data, _ = make_sparse_random_data(5, n_features, int(n_features / 4))
  266. for RandomProjection in all_RandomProjection:
  267. rp_dense = RandomProjection(n_components=3, random_state=1).fit(data)
  268. rp_sparse = RandomProjection(n_components=3, random_state=1).fit(
  269. sp.csr_matrix(data)
  270. )
  271. assert_array_almost_equal(
  272. densify(rp_dense.components_), densify(rp_sparse.components_)
  273. )
  274. def test_johnson_lindenstrauss_min_dim():
  275. """Test Johnson-Lindenstrauss for small eps.
  276. Regression test for #17111: before #19374, 32-bit systems would fail.
  277. """
  278. assert johnson_lindenstrauss_min_dim(100, eps=1e-5) == 368416070986
  279. @pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
  280. def test_random_projection_feature_names_out(random_projection_cls):
  281. random_projection = random_projection_cls(n_components=2)
  282. random_projection.fit(data)
  283. names_out = random_projection.get_feature_names_out()
  284. class_name_lower = random_projection_cls.__name__.lower()
  285. expected_names_out = np.array(
  286. [f"{class_name_lower}{i}" for i in range(random_projection.n_components_)],
  287. dtype=object,
  288. )
  289. assert_array_equal(names_out, expected_names_out)
  290. @pytest.mark.parametrize("n_samples", (2, 9, 10, 11, 1000))
  291. @pytest.mark.parametrize("n_features", (2, 9, 10, 11, 1000))
  292. @pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
  293. @pytest.mark.parametrize("compute_inverse_components", [True, False])
  294. def test_inverse_transform(
  295. n_samples,
  296. n_features,
  297. random_projection_cls,
  298. compute_inverse_components,
  299. global_random_seed,
  300. ):
  301. n_components = 10
  302. random_projection = random_projection_cls(
  303. n_components=n_components,
  304. compute_inverse_components=compute_inverse_components,
  305. random_state=global_random_seed,
  306. )
  307. X_dense, X_csr = make_sparse_random_data(
  308. n_samples,
  309. n_features,
  310. n_samples * n_features // 100 + 1,
  311. random_state=global_random_seed,
  312. )
  313. for X in [X_dense, X_csr]:
  314. with warnings.catch_warnings():
  315. warnings.filterwarnings(
  316. "ignore",
  317. message=(
  318. "The number of components is higher than the number of features"
  319. ),
  320. category=DataDimensionalityWarning,
  321. )
  322. projected = random_projection.fit_transform(X)
  323. if compute_inverse_components:
  324. assert hasattr(random_projection, "inverse_components_")
  325. inv_components = random_projection.inverse_components_
  326. assert inv_components.shape == (n_features, n_components)
  327. projected_back = random_projection.inverse_transform(projected)
  328. assert projected_back.shape == X.shape
  329. projected_again = random_projection.transform(projected_back)
  330. if hasattr(projected, "toarray"):
  331. projected = projected.toarray()
  332. assert_allclose(projected, projected_again, rtol=1e-7, atol=1e-10)
  333. @pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
  334. @pytest.mark.parametrize(
  335. "input_dtype, expected_dtype",
  336. (
  337. (np.float32, np.float32),
  338. (np.float64, np.float64),
  339. (np.int32, np.float64),
  340. (np.int64, np.float64),
  341. ),
  342. )
  343. def test_random_projection_dtype_match(
  344. random_projection_cls, input_dtype, expected_dtype
  345. ):
  346. # Verify output matrix dtype
  347. rng = np.random.RandomState(42)
  348. X = rng.rand(25, 3000)
  349. rp = random_projection_cls(random_state=0)
  350. transformed = rp.fit_transform(X.astype(input_dtype))
  351. assert rp.components_.dtype == expected_dtype
  352. assert transformed.dtype == expected_dtype
  353. @pytest.mark.parametrize("random_projection_cls", all_RandomProjection)
  354. def test_random_projection_numerical_consistency(random_projection_cls):
  355. # Verify numerical consistency among np.float32 and np.float64
  356. atol = 1e-5
  357. rng = np.random.RandomState(42)
  358. X = rng.rand(25, 3000)
  359. rp_32 = random_projection_cls(random_state=0)
  360. rp_64 = random_projection_cls(random_state=0)
  361. projection_32 = rp_32.fit_transform(X.astype(np.float32))
  362. projection_64 = rp_64.fit_transform(X.astype(np.float64))
  363. assert_allclose(projection_64, projection_32, atol=atol)
  364. assert_allclose_dense_sparse(rp_32.components_, rp_64.components_)