# Authors: Olivier Grisel <olivier.grisel@ensta.org>
#          Mathieu Blondel <mathieu@mblondel.org>
#          Denis Engemann <denis-alexander.engemann@inria.fr>
#
# License: BSD 3 clause

import numpy as np
import pytest
from scipy import linalg, sparse
from scipy.linalg import eigh
from scipy.sparse.linalg import eigsh
from scipy.special import expit

from sklearn.datasets import make_low_rank_matrix, make_sparse_spd_matrix
from sklearn.utils import gen_batches
from sklearn.utils._arpack import _init_arpack_v0
from sklearn.utils._testing import (
    assert_allclose,
    assert_allclose_dense_sparse,
    assert_almost_equal,
    assert_array_almost_equal,
    assert_array_equal,
    skip_if_32bit,
)
from sklearn.utils.extmath import (
    _deterministic_vector_sign_flip,
    _incremental_mean_and_var,
    _randomized_eigsh,
    _safe_accumulator_op,
    cartesian,
    density,
    log_logistic,
    randomized_svd,
    row_norms,
    safe_sparse_dot,
    softmax,
    stable_cumsum,
    svd_flip,
    weighted_mode,
)
from sklearn.utils.fixes import _mode


def test_density():
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(10, 5))
    X[1, 2] = 0
    X[5, 3] = 0
    X_csr = sparse.csr_matrix(X)
    X_csc = sparse.csc_matrix(X)
    X_coo = sparse.coo_matrix(X)
    X_lil = sparse.lil_matrix(X)

    for X_ in (X_csr, X_csc, X_coo, X_lil):
        assert density(X_) == density(X)


# TODO(1.4): Remove test
def test_density_deprecated_kwargs():
    """Check that a FutureWarning is raised when the user passes keyword arguments."""
    test_array = np.array([[1, 2, 3], [4, 5, 6]])
    with pytest.warns(
        FutureWarning,
        match=(
            "Additional keyword arguments are deprecated in version 1.2 and will be"
            " removed in version 1.4."
        ),
    ):
        density(test_array, a=1)


def test_uniform_weights():
    # with uniform weights, results should be identical to stats.mode
    rng = np.random.RandomState(0)
    x = rng.randint(10, size=(10, 5))
    weights = np.ones(x.shape)

    for axis in (None, 0, 1):
        mode, score = _mode(x, axis)
        mode2, score2 = weighted_mode(x, weights, axis=axis)

        assert_array_equal(mode, mode2)
        assert_array_equal(score, score2)


def test_random_weights():
    # set this up so that each row should have a weighted mode of 6,
    # with a score that is easily reproduced
    mode_result = 6

    rng = np.random.RandomState(0)
    x = rng.randint(mode_result, size=(100, 10))
    w = rng.random_sample(x.shape)

    x[:, :5] = mode_result
    w[:, :5] += 1

    mode, score = weighted_mode(x, w, axis=1)

    assert_array_equal(mode, mode_result)
    assert_array_almost_equal(score.ravel(), w[:, :5].sum(1))


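# Illustrative sketch (not an extra test): `weighted_mode` sums the weights attached
# to each distinct value rather than counting occurrences, so a heavily weighted
# minority value can win. Assuming a toy 1D input, one would expect roughly:
#
#     >>> weighted_mode([1, 1, 2], [1.0, 1.0, 5.0])
#     (array([2.]), array([5.]))
#
# With unit weights this reduces to `scipy.stats.mode`, which is exactly what
# `test_uniform_weights` above verifies.

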
def check_randomized_svd_low_rank(dtype):
    # Check that extmath.randomized_svd is consistent with linalg.svd
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10
    decimal = 5 if dtype == np.float32 else 7
    dtype = np.dtype(dtype)

    # generate a matrix X of approximate effective rank `rank` and no noise
    # component (very structured signal):
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        effective_rank=rank,
        tail_strength=0.0,
        random_state=0,
    ).astype(dtype, copy=False)
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    U, s, Vt = linalg.svd(X, full_matrices=False)

    # Convert the singular values to the specific dtype
    U = U.astype(dtype, copy=False)
    s = s.astype(dtype, copy=False)
    Vt = Vt.astype(dtype, copy=False)

    for normalizer in ["auto", "LU", "QR"]:  # 'none' would not be stable
        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = randomized_svd(
            X, k, power_iteration_normalizer=normalizer, random_state=0
        )

        # If the input dtype is float, then the output dtype is float of the
        # same bit size (f32 is not upcast to f64)
        # But if the input dtype is int, the output dtype is float64
        if dtype.kind == "f":
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype == np.float64
            assert sa.dtype == np.float64
            assert Va.dtype == np.float64

        assert Ua.shape == (n_samples, k)
        assert sa.shape == (k,)
        assert Va.shape == (k, n_features)

        # ensure that the singular values of both methods are equal up to the
        # real rank of the matrix
        assert_almost_equal(s[:k], sa, decimal=decimal)

        # check the singular vectors too (while not checking the sign)
        assert_almost_equal(
            np.dot(U[:, :k], Vt[:k, :]), np.dot(Ua, Va), decimal=decimal
        )

        # check the sparse matrix representation
        X = sparse.csr_matrix(X)

        # compute the singular values of X using the fast approximate method
        Ua, sa, Va = randomized_svd(
            X, k, power_iteration_normalizer=normalizer, random_state=0
        )
        if dtype.kind == "f":
            assert Ua.dtype == dtype
            assert sa.dtype == dtype
            assert Va.dtype == dtype
        else:
            assert Ua.dtype.kind == "f"
            assert sa.dtype.kind == "f"
            assert Va.dtype.kind == "f"

        assert_almost_equal(s[:rank], sa[:rank], decimal=decimal)


@pytest.mark.parametrize("dtype", (np.int32, np.int64, np.float32, np.float64))
def test_randomized_svd_low_rank_all_dtypes(dtype):
    check_randomized_svd_low_rank(dtype)


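# For context: `randomized_svd` follows the Halko, Martinsson and Tropp (2011)
# scheme. A minimal sketch of the idea (names here are illustrative, not the
# private implementation):
#
#     Omega = rng.normal(size=(n_features, k + n_oversamples))  # random test matrix
#     Q, _ = np.linalg.qr(X @ Omega)     # orthonormal basis approximating range(X)
#     B = Q.T @ X                        # small (k + p) x n_features matrix
#     U_b, s, Vt = np.linalg.svd(B, full_matrices=False)
#     U = Q @ U_b                        # lift the left singular vectors back
#
# The power iteration normalizers exercised in these tests ("LU", "QR", "auto",
# "none") only control how the optional power iterations that refine Q are kept
# well conditioned; "none" skips that stabilization entirely.

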
@pytest.mark.parametrize("dtype", (np.int32, np.int64, np.float32, np.float64))
def test_randomized_eigsh(dtype):
    """Test that `_randomized_eigsh` returns the appropriate components"""
    rng = np.random.RandomState(42)
    X = np.diag(np.array([1.0, -2.0, 0.0, 3.0], dtype=dtype))
    # random rotation that preserves the eigenvalues of X
    rand_rot = np.linalg.qr(rng.normal(size=X.shape))[0]
    X = rand_rot @ X @ rand_rot.T

    # with 'module' selection method, the negative eigenvalue shows up
    eigvals, eigvecs = _randomized_eigsh(X, n_components=2, selection="module")
    # eigenvalues
    assert eigvals.shape == (2,)
    assert_array_almost_equal(eigvals, [3.0, -2.0])  # negative eigenvalue here
    # eigenvectors
    assert eigvecs.shape == (4, 2)

    # with 'value' selection method, the negative eigenvalue does not show up
    with pytest.raises(NotImplementedError):
        _randomized_eigsh(X, n_components=2, selection="value")


@pytest.mark.parametrize("k", (10, 50, 100, 199, 200))
def test_randomized_eigsh_compared_to_others(k):
    """Check that `_randomized_eigsh` is similar to other `eigsh`

    Tests that for a random PSD matrix, `_randomized_eigsh` provides results
    comparable to LAPACK (scipy.linalg.eigh) and ARPACK
    (scipy.sparse.linalg.eigsh).

    Note: some versions of ARPACK do not support k=n_features.
    """
    # make a random PSD matrix
    n_features = 200
    X = make_sparse_spd_matrix(n_features, random_state=0)

    # compare two versions of randomized
    # rough and fast
    eigvals, eigvecs = _randomized_eigsh(
        X, n_components=k, selection="module", n_iter=25, random_state=0
    )
    # more accurate but slow (TODO find realistic settings here)
    eigvals_qr, eigvecs_qr = _randomized_eigsh(
        X,
        n_components=k,
        n_iter=25,
        n_oversamples=20,
        random_state=0,
        power_iteration_normalizer="QR",
        selection="module",
    )

    # with LAPACK
    eigvals_lapack, eigvecs_lapack = eigh(
        X, subset_by_index=(n_features - k, n_features - 1)
    )
    indices = eigvals_lapack.argsort()[::-1]
    eigvals_lapack = eigvals_lapack[indices]
    eigvecs_lapack = eigvecs_lapack[:, indices]

    # -- eigenvalues comparison
    assert eigvals_lapack.shape == (k,)
    # comparison precision
    assert_array_almost_equal(eigvals, eigvals_lapack, decimal=6)
    assert_array_almost_equal(eigvals_qr, eigvals_lapack, decimal=6)

    # -- eigenvectors comparison
    assert eigvecs_lapack.shape == (n_features, k)
    # flip eigenvectors' sign to enforce deterministic output
    dummy_vecs = np.zeros_like(eigvecs).T
    eigvecs, _ = svd_flip(eigvecs, dummy_vecs)
    eigvecs_qr, _ = svd_flip(eigvecs_qr, dummy_vecs)
    eigvecs_lapack, _ = svd_flip(eigvecs_lapack, dummy_vecs)
    assert_array_almost_equal(eigvecs, eigvecs_lapack, decimal=4)
    assert_array_almost_equal(eigvecs_qr, eigvecs_lapack, decimal=6)

    # comparison ARPACK ~ LAPACK (some ARPACK implementations do not support k=n)
    if k < n_features:
        v0 = _init_arpack_v0(n_features, random_state=0)
        # "LA" largest algebraic <=> selection="value" in randomized_eigsh
        eigvals_arpack, eigvecs_arpack = eigsh(
            X, k, which="LA", tol=0, maxiter=None, v0=v0
        )
        indices = eigvals_arpack.argsort()[::-1]
        # eigenvalues
        eigvals_arpack = eigvals_arpack[indices]
        assert_array_almost_equal(eigvals_lapack, eigvals_arpack, decimal=10)
        # eigenvectors
        eigvecs_arpack = eigvecs_arpack[:, indices]
        eigvecs_arpack, _ = svd_flip(eigvecs_arpack, dummy_vecs)
        assert_array_almost_equal(eigvecs_arpack, eigvecs_lapack, decimal=8)


@pytest.mark.parametrize(
    "n,rank",
    [
        (10, 7),
        (100, 10),
        (100, 80),
        (500, 10),
        (500, 250),
        (500, 400),
    ],
)
def test_randomized_eigsh_reconst_low_rank(n, rank):
    """Check that randomized_eigsh is able to reconstruct a low rank psd matrix

    Tests that the decomposition provided by `_randomized_eigsh` leads to
    orthonormal eigenvectors, and that a low rank PSD matrix can be effectively
    reconstructed with good accuracy using it.
    """
    assert rank < n

    # create a low rank PSD
    rng = np.random.RandomState(69)
    X = rng.randn(n, rank)
    A = X @ X.T

    # approximate A with the "right" number of components
    S, V = _randomized_eigsh(A, n_components=rank, random_state=rng)
    # orthonormality checks
    assert_array_almost_equal(np.linalg.norm(V, axis=0), np.ones(S.shape))
    assert_array_almost_equal(V.T @ V, np.diag(np.ones(S.shape)))
    # reconstruction
    A_reconstruct = V @ np.diag(S) @ V.T
    # test that the approximation is good
    assert_array_almost_equal(A_reconstruct, A, decimal=6)


@pytest.mark.parametrize("dtype", (np.float32, np.float64))
def test_row_norms(dtype):
    X = np.random.RandomState(42).randn(100, 100)
    if dtype is np.float32:
        precision = 4
    else:
        precision = 5

    X = X.astype(dtype, copy=False)
    sq_norm = (X**2).sum(axis=1)

    assert_array_almost_equal(sq_norm, row_norms(X, squared=True), precision)
    assert_array_almost_equal(np.sqrt(sq_norm), row_norms(X), precision)

    for csr_index_dtype in [np.int32, np.int64]:
        Xcsr = sparse.csr_matrix(X, dtype=dtype)
        # csr_matrix will use int32 indices by default,
        # up-casting those to int64 when necessary
        if csr_index_dtype is np.int64:
            Xcsr.indptr = Xcsr.indptr.astype(csr_index_dtype, copy=False)
            Xcsr.indices = Xcsr.indices.astype(csr_index_dtype, copy=False)
        assert Xcsr.indices.dtype == csr_index_dtype
        assert Xcsr.indptr.dtype == csr_index_dtype
        assert_array_almost_equal(sq_norm, row_norms(Xcsr, squared=True), precision)
        assert_array_almost_equal(np.sqrt(sq_norm), row_norms(Xcsr), precision)


def test_randomized_svd_low_rank_with_noise():
    # Check that extmath.randomized_svd can handle noisy matrices
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # generate a matrix X with structured approximate rank `rank` and an
    # important noisy component
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        effective_rank=rank,
        tail_strength=0.1,
        random_state=0,
    )
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    for normalizer in ["auto", "none", "LU", "QR"]:
        # compute the singular values of X using the fast approximate
        # method without the iterated power method
        _, sa, _ = randomized_svd(
            X, k, n_iter=0, power_iteration_normalizer=normalizer, random_state=0
        )

        # the approximation does not tolerate the noise:
        assert np.abs(s[:k] - sa).max() > 0.01

        # compute the singular values of X using the fast approximate
        # method with iterated power method
        _, sap, _ = randomized_svd(
            X, k, power_iteration_normalizer=normalizer, random_state=0
        )

        # the iterated power method helps to get rid of the noise:
        assert_almost_equal(s[:k], sap, decimal=3)


def test_randomized_svd_infinite_rank():
    # Check that extmath.randomized_svd can handle matrices without a sharp
    # rank cutoff
    n_samples = 100
    n_features = 500
    rank = 5
    k = 10

    # this time, no low-rank component: just regularly but slowly decreasing
    # singular values, so the rank of the data matrix is effectively infinite
    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        effective_rank=rank,
        tail_strength=1.0,
        random_state=0,
    )
    assert X.shape == (n_samples, n_features)

    # compute the singular values of X using the slow exact method
    _, s, _ = linalg.svd(X, full_matrices=False)

    for normalizer in ["auto", "none", "LU", "QR"]:
        # compute the singular values of X using the fast approximate method
        # without the iterated power method
        _, sa, _ = randomized_svd(
            X, k, n_iter=0, power_iteration_normalizer=normalizer, random_state=0
        )

        # the approximation does not tolerate the noise:
        assert np.abs(s[:k] - sa).max() > 0.1

        # compute the singular values of X using the fast approximate method
        # with iterated power method
        _, sap, _ = randomized_svd(
            X, k, n_iter=5, power_iteration_normalizer=normalizer, random_state=0
        )

        # the iterated power method is still managing to get most of the
        # structure at the requested rank
        assert_almost_equal(s[:k], sap, decimal=3)


def test_randomized_svd_transpose_consistency():
    # Check that transposing the design matrix has limited impact
    n_samples = 100
    n_features = 500
    rank = 4
    k = 10

    X = make_low_rank_matrix(
        n_samples=n_samples,
        n_features=n_features,
        effective_rank=rank,
        tail_strength=0.5,
        random_state=0,
    )
    assert X.shape == (n_samples, n_features)

    U1, s1, V1 = randomized_svd(X, k, n_iter=3, transpose=False, random_state=0)
    U2, s2, V2 = randomized_svd(X, k, n_iter=3, transpose=True, random_state=0)
    U3, s3, V3 = randomized_svd(X, k, n_iter=3, transpose="auto", random_state=0)
    U4, s4, V4 = linalg.svd(X, full_matrices=False)

    assert_almost_equal(s1, s4[:k], decimal=3)
    assert_almost_equal(s2, s4[:k], decimal=3)
    assert_almost_equal(s3, s4[:k], decimal=3)

    assert_almost_equal(np.dot(U1, V1), np.dot(U4[:, :k], V4[:k, :]), decimal=2)
    assert_almost_equal(np.dot(U2, V2), np.dot(U4[:, :k], V4[:k, :]), decimal=2)

    # in this case 'auto' is equivalent to transpose
    assert_almost_equal(s2, s3)


def test_randomized_svd_power_iteration_normalizer():
    # randomized_svd with power_iteration_normalizer='none' diverges for
    # large number of power iterations on this dataset
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(100, 500, effective_rank=50, random_state=rng)
    X += 3 * rng.randint(0, 2, size=X.shape)
    n_components = 50

    # Check that it diverges with many (non-normalized) power iterations
    U, s, Vt = randomized_svd(
        X, n_components, n_iter=2, power_iteration_normalizer="none", random_state=0
    )
    A = X - U.dot(np.diag(s).dot(Vt))
    error_2 = linalg.norm(A, ord="fro")
    U, s, Vt = randomized_svd(
        X, n_components, n_iter=20, power_iteration_normalizer="none", random_state=0
    )
    A = X - U.dot(np.diag(s).dot(Vt))
    error_20 = linalg.norm(A, ord="fro")
    assert np.abs(error_2 - error_20) > 100

    for normalizer in ["LU", "QR", "auto"]:
        U, s, Vt = randomized_svd(
            X,
            n_components,
            n_iter=2,
            power_iteration_normalizer=normalizer,
            random_state=0,
        )
        A = X - U.dot(np.diag(s).dot(Vt))
        error_2 = linalg.norm(A, ord="fro")

        for i in [5, 10, 50]:
            U, s, Vt = randomized_svd(
                X,
                n_components,
                n_iter=i,
                power_iteration_normalizer=normalizer,
                random_state=0,
            )
            A = X - U.dot(np.diag(s).dot(Vt))
            error = linalg.norm(A, ord="fro")
            assert 15 > np.abs(error_2 - error)


def test_randomized_svd_sparse_warnings():
    # randomized_svd issues an efficiency warning for lil and dok matrices
    rng = np.random.RandomState(42)
    X = make_low_rank_matrix(50, 20, effective_rank=10, random_state=rng)
    n_components = 5
    for cls in (sparse.lil_matrix, sparse.dok_matrix):
        X = cls(X)
        warn_msg = (
            "Calculating SVD of a {} is expensive. "
            "csr_matrix is more efficient.".format(cls.__name__)
        )
        with pytest.warns(sparse.SparseEfficiencyWarning, match=warn_msg):
            randomized_svd(
                X, n_components, n_iter=1, power_iteration_normalizer="none"
            )


def test_svd_flip():
    # Check that svd_flip works in both situations, and reconstructs input.
    rs = np.random.RandomState(1999)
    n_samples = 20
    n_features = 10
    X = rs.randn(n_samples, n_features)

    # Check matrix reconstruction
    U, S, Vt = linalg.svd(X, full_matrices=False)
    U1, V1 = svd_flip(U, Vt, u_based_decision=False)
    assert_almost_equal(np.dot(U1 * S, V1), X, decimal=6)

    # Check transposed matrix reconstruction
    XT = X.T
    U, S, Vt = linalg.svd(XT, full_matrices=False)
    U2, V2 = svd_flip(U, Vt, u_based_decision=True)
    assert_almost_equal(np.dot(U2 * S, V2), XT, decimal=6)

    # Check that different flip methods are equivalent under reconstruction
    U_flip1, V_flip1 = svd_flip(U, Vt, u_based_decision=True)
    assert_almost_equal(np.dot(U_flip1 * S, V_flip1), XT, decimal=6)
    U_flip2, V_flip2 = svd_flip(U, Vt, u_based_decision=False)
    assert_almost_equal(np.dot(U_flip2 * S, V_flip2), XT, decimal=6)


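# Note on the convention exercised above: `svd_flip` only multiplies matched columns
# of U and rows of V by +/-1, so the product U @ diag(S) @ V is unchanged; that is
# why every flipped pair still reconstructs X (or X.T) exactly. The sign itself is
# chosen so that, for each component, the entry of largest absolute value ends up
# positive, looked up in U when `u_based_decision=True` and in V otherwise.

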
def test_randomized_svd_sign_flip():
    a = np.array([[2.0, 0.0], [0.0, 1.0]])
    u1, s1, v1 = randomized_svd(a, 2, flip_sign=True, random_state=41)
    for seed in range(10):
        u2, s2, v2 = randomized_svd(a, 2, flip_sign=True, random_state=seed)
        assert_almost_equal(u1, u2)
        assert_almost_equal(v1, v2)
        assert_almost_equal(np.dot(u2 * s2, v2), a)
        assert_almost_equal(np.dot(u2.T, u2), np.eye(2))
        assert_almost_equal(np.dot(v2.T, v2), np.eye(2))


def test_randomized_svd_sign_flip_with_transpose():
    # Check if the randomized_svd sign flipping is always done based on u
    # irrespective of transpose.
    # See https://github.com/scikit-learn/scikit-learn/issues/5608
    # for more details.
    def max_loading_is_positive(u, v):
        """
        Return a bool tuple indicating whether the values maximising np.abs
        are positive across all rows for u and across all columns for v.
        """
        u_based = (np.abs(u).max(axis=0) == u.max(axis=0)).all()
        v_based = (np.abs(v).max(axis=1) == v.max(axis=1)).all()
        return u_based, v_based

    mat = np.arange(10 * 8).reshape(10, -1)

    # Without transpose
    u_flipped, _, v_flipped = randomized_svd(mat, 3, flip_sign=True, random_state=0)
    u_based, v_based = max_loading_is_positive(u_flipped, v_flipped)
    assert u_based
    assert not v_based

    # With transpose
    u_flipped_with_transpose, _, v_flipped_with_transpose = randomized_svd(
        mat, 3, flip_sign=True, transpose=True, random_state=0
    )
    u_based, v_based = max_loading_is_positive(
        u_flipped_with_transpose, v_flipped_with_transpose
    )
    assert u_based
    assert not v_based


@pytest.mark.parametrize("n", [50, 100, 300])
@pytest.mark.parametrize("m", [50, 100, 300])
@pytest.mark.parametrize("k", [10, 20, 50])
@pytest.mark.parametrize("seed", range(5))
def test_randomized_svd_lapack_driver(n, m, k, seed):
    # Check that different SVD drivers provide consistent results

    # Matrix being compressed
    rng = np.random.RandomState(seed)
    X = rng.rand(n, m)

    # Compress X down to k components with both LAPACK drivers
    u1, s1, vt1 = randomized_svd(X, k, svd_lapack_driver="gesdd", random_state=0)
    u2, s2, vt2 = randomized_svd(X, k, svd_lapack_driver="gesvd", random_state=0)

    # Check shape and contents
    assert u1.shape == u2.shape
    assert_allclose(u1, u2, atol=0, rtol=1e-3)

    assert s1.shape == s2.shape
    assert_allclose(s1, s2, atol=0, rtol=1e-3)

    assert vt1.shape == vt2.shape
    assert_allclose(vt1, vt2, atol=0, rtol=1e-3)


def test_cartesian():
    # Check if cartesian product delivers the right results
    axes = (np.array([1, 2, 3]), np.array([4, 5]), np.array([6, 7]))

    true_out = np.array(
        [
            [1, 4, 6],
            [1, 4, 7],
            [1, 5, 6],
            [1, 5, 7],
            [2, 4, 6],
            [2, 4, 7],
            [2, 5, 6],
            [2, 5, 7],
            [3, 4, 6],
            [3, 4, 7],
            [3, 5, 6],
            [3, 5, 7],
        ]
    )

    out = cartesian(axes)
    assert_array_equal(true_out, out)

    # check single axis
    x = np.arange(3)
    assert_array_equal(x[:, np.newaxis], cartesian((x,)))


@pytest.mark.parametrize(
    "arrays, output_dtype",
    [
        (
            [np.array([1, 2, 3], dtype=np.int32), np.array([4, 5], dtype=np.int64)],
            np.dtype(np.int64),
        ),
        (
            [np.array([1, 2, 3], dtype=np.int32), np.array([4, 5], dtype=np.float64)],
            np.dtype(np.float64),
        ),
        (
            [np.array([1, 2, 3], dtype=np.int32), np.array(["x", "y"], dtype=object)],
            np.dtype(object),
        ),
    ],
)
def test_cartesian_mix_types(arrays, output_dtype):
    """Check that the cartesian product works with mixed types."""
    output = cartesian(arrays)

    assert output.dtype == output_dtype


def test_logistic_sigmoid():
    # Check correctness and robustness of logistic sigmoid implementation
    def naive_log_logistic(x):
        return np.log(expit(x))

    x = np.linspace(-2, 2, 50)
    assert_array_almost_equal(log_logistic(x), naive_log_logistic(x))

    extreme_x = np.array([-100.0, 100.0])
    assert_array_almost_equal(log_logistic(extreme_x), [-100, 0])


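# Why a dedicated `log_logistic` exists (illustrative, not part of the test): for
# very negative inputs `expit(x)` underflows to 0.0, so the naive `np.log(expit(x))`
# returns -inf, whereas the identity log(1 / (1 + exp(-x))) = x - log(1 + exp(x))
# stays finite. With float64 inputs one would expect roughly:
#
#     >>> np.log(expit(np.array([-1000.0])))   # underflows: array([-inf])
#     >>> log_logistic(np.array([-1000.0]))    # approximately array([-1000.])
#
# The extreme_x check above asserts the asymptotes log_logistic(-100) ~ -100 and
# log_logistic(100) ~ 0 directly.

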
@pytest.fixture()
def rng():
    return np.random.RandomState(42)


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_incremental_weighted_mean_and_variance_simple(rng, dtype):
    mult = 10
    X = rng.rand(1000, 20).astype(dtype) * mult
    sample_weight = rng.rand(X.shape[0]) * mult
    mean, var, _ = _incremental_mean_and_var(X, 0, 0, 0, sample_weight=sample_weight)

    expected_mean = np.average(X, weights=sample_weight, axis=0)
    expected_var = (
        np.average(X**2, weights=sample_weight, axis=0) - expected_mean**2
    )
    assert_almost_equal(mean, expected_mean)
    assert_almost_equal(var, expected_var)


@pytest.mark.parametrize("mean", [0, 1e7, -1e7])
@pytest.mark.parametrize("var", [1, 1e-8, 1e5])
@pytest.mark.parametrize(
    "weight_loc, weight_scale", [(0, 1), (0, 1e-8), (1, 1e-8), (10, 1), (1e7, 1)]
)
def test_incremental_weighted_mean_and_variance(
    mean, var, weight_loc, weight_scale, rng
):
    # Testing of correctness and numerical stability
    def _assert(X, sample_weight, expected_mean, expected_var):
        n = X.shape[0]
        for chunk_size in [1, n // 10 + 1, n // 4 + 1, n // 2 + 1, n]:
            last_mean, last_weight_sum, last_var = 0, 0, 0
            for batch in gen_batches(n, chunk_size):
                last_mean, last_var, last_weight_sum = _incremental_mean_and_var(
                    X[batch],
                    last_mean,
                    last_var,
                    last_weight_sum,
                    sample_weight=sample_weight[batch],
                )
            assert_allclose(last_mean, expected_mean)
            assert_allclose(last_var, expected_var, atol=1e-6)

    size = (100, 20)
    weight = rng.normal(loc=weight_loc, scale=weight_scale, size=size[0])

    # Compare to weighted average: np.average
    X = rng.normal(loc=mean, scale=var, size=size)
    expected_mean = _safe_accumulator_op(np.average, X, weights=weight, axis=0)
    expected_var = _safe_accumulator_op(
        np.average, (X - expected_mean) ** 2, weights=weight, axis=0
    )
    _assert(X, weight, expected_mean, expected_var)

    # Compare to unweighted mean: np.mean
    X = rng.normal(loc=mean, scale=var, size=size)
    ones_weight = np.ones(size[0])
    expected_mean = _safe_accumulator_op(np.mean, X, axis=0)
    expected_var = _safe_accumulator_op(np.var, X, axis=0)
    _assert(X, ones_weight, expected_mean, expected_var)


@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_incremental_weighted_mean_and_variance_ignore_nan(dtype):
    old_means = np.array([535.0, 535.0, 535.0, 535.0])
    old_variances = np.array([4225.0, 4225.0, 4225.0, 4225.0])
    old_weight_sum = np.array([2, 2, 2, 2], dtype=np.int32)
    sample_weights_X = np.ones(3)
    sample_weights_X_nan = np.ones(4)

    X = np.array(
        [[170, 170, 170, 170], [430, 430, 430, 430], [300, 300, 300, 300]]
    ).astype(dtype)

    X_nan = np.array(
        [
            [170, np.nan, 170, 170],
            [np.nan, 170, 430, 430],
            [430, 430, np.nan, 300],
            [300, 300, 300, np.nan],
        ]
    ).astype(dtype)

    X_means, X_variances, X_count = _incremental_mean_and_var(
        X, old_means, old_variances, old_weight_sum, sample_weight=sample_weights_X
    )
    X_nan_means, X_nan_variances, X_nan_count = _incremental_mean_and_var(
        X_nan,
        old_means,
        old_variances,
        old_weight_sum,
        sample_weight=sample_weights_X_nan,
    )

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_variances, X_variances)
    assert_allclose(X_nan_count, X_count)


def test_incremental_variance_update_formulas():
    # Test Youngs and Cramer incremental variance formulas.
    # Doggie data from https://www.mathsisfun.com/data/standard-deviation.html
    A = np.array(
        [
            [600, 470, 170, 430, 300],
            [600, 470, 170, 430, 300],
            [600, 470, 170, 430, 300],
            [600, 470, 170, 430, 300],
        ]
    ).T
    idx = 2
    X1 = A[:idx, :]
    X2 = A[idx:, :]

    old_means = X1.mean(axis=0)
    old_variances = X1.var(axis=0)
    old_sample_count = np.full(X1.shape[1], X1.shape[0], dtype=np.int32)
    final_means, final_variances, final_count = _incremental_mean_and_var(
        X2, old_means, old_variances, old_sample_count
    )
    assert_almost_equal(final_means, A.mean(axis=0), 6)
    assert_almost_equal(final_variances, A.var(axis=0), 6)
    assert_almost_equal(final_count, A.shape[0])


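# For reference, the chunk update verified above can be written with the standard
# pooled ("parallel") mean/variance formulas; this is a sketch of the math, not the
# exact private implementation. With n_a, mu_a, M2_a the count, mean and sum of
# squared deviations already seen, and n_b, mu_b, M2_b the same quantities for the
# new batch:
#
#     n   = n_a + n_b
#     mu  = (n_a * mu_a + n_b * mu_b) / n
#     M2  = M2_a + M2_b + (mu_b - mu_a) ** 2 * n_a * n_b / n
#     var = M2 / n
#
# which is why feeding A in two halves reproduces A.mean(axis=0) and A.var(axis=0).

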
def test_incremental_mean_and_variance_ignore_nan():
    old_means = np.array([535.0, 535.0, 535.0, 535.0])
    old_variances = np.array([4225.0, 4225.0, 4225.0, 4225.0])
    old_sample_count = np.array([2, 2, 2, 2], dtype=np.int32)
    X = np.array([[170, 170, 170, 170], [430, 430, 430, 430], [300, 300, 300, 300]])

    X_nan = np.array(
        [
            [170, np.nan, 170, 170],
            [np.nan, 170, 430, 430],
            [430, 430, np.nan, 300],
            [300, 300, 300, np.nan],
        ]
    )

    X_means, X_variances, X_count = _incremental_mean_and_var(
        X, old_means, old_variances, old_sample_count
    )
    X_nan_means, X_nan_variances, X_nan_count = _incremental_mean_and_var(
        X_nan, old_means, old_variances, old_sample_count
    )

    assert_allclose(X_nan_means, X_means)
    assert_allclose(X_nan_variances, X_variances)
    assert_allclose(X_nan_count, X_count)


@skip_if_32bit
def test_incremental_variance_numerical_stability():
    # Test Youngs and Cramer incremental variance formulas.

    def np_var(A):
        return A.var(axis=0)

    # Naive one pass variance computation - not numerically stable
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    def one_pass_var(X):
        n = X.shape[0]
        exp_x2 = (X**2).sum(axis=0) / n
        expx_2 = (X.sum(axis=0) / n) ** 2
        return exp_x2 - expx_2

    # Two-pass algorithm, stable.
    # We use it as a benchmark. It is not an online algorithm
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Two-pass_algorithm
    def two_pass_var(X):
        mean = X.mean(axis=0)
        Y = X.copy()
        return np.mean((Y - mean) ** 2, axis=0)

    # Naive online implementation
    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm
    # This works only for chunks of size 1
    def naive_mean_variance_update(x, last_mean, last_variance, last_sample_count):
        updated_sample_count = last_sample_count + 1
        samples_ratio = last_sample_count / float(updated_sample_count)
        updated_mean = x / updated_sample_count + last_mean * samples_ratio
        updated_variance = (
            last_variance * samples_ratio
            + (x - last_mean) * (x - updated_mean) / updated_sample_count
        )
        return updated_mean, updated_variance, updated_sample_count

    # We want to show a case when one_pass_var has error > 1e-3 while
    # _batch_mean_variance_update has less.
    tol = 200
    n_features = 2
    n_samples = 10000
    x1 = np.array(1e8, dtype=np.float64)
    x2 = np.log(1e-5, dtype=np.float64)
    A0 = np.full((n_samples // 2, n_features), x1, dtype=np.float64)
    A1 = np.full((n_samples // 2, n_features), x2, dtype=np.float64)
    A = np.vstack((A0, A1))

    # Naive one pass var: >tol (=1063)
    assert np.abs(np_var(A) - one_pass_var(A)).max() > tol

    # Starting point for online algorithms: after A0
    # Naive implementation: >tol (436)
    mean, var, n = A0[0, :], np.zeros(n_features), n_samples // 2
    for i in range(A1.shape[0]):
        mean, var, n = naive_mean_variance_update(A1[i, :], mean, var, n)
    assert n == A.shape[0]
    # the mean is also slightly unstable
    assert np.abs(A.mean(axis=0) - mean).max() > 1e-6
    assert np.abs(np_var(A) - var).max() > tol

    # Robust implementation: <tol (177)
    mean, var = A0[0, :], np.zeros(n_features)
    n = np.full(n_features, n_samples // 2, dtype=np.int32)
    for i in range(A1.shape[0]):
        mean, var, n = _incremental_mean_and_var(
            A1[i, :].reshape((1, A1.shape[1])), mean, var, n
        )
    assert_array_equal(n, A.shape[0])
    assert_array_almost_equal(A.mean(axis=0), mean)
    assert tol > np.abs(np_var(A) - var).max()


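# The failure mode probed above: with data centred near 1e8, the intermediate sums
# in `one_pass_var` (the raw second moments) are of order 1e16 and larger, so plain
# float64 rounding in those accumulations already produces an absolute error of a
# few hundred on the variance (see the inline figures: roughly 1063 for the
# one-pass formula, 436 for the naive online update, 177 for
# `_incremental_mean_and_var`). The Youngs and Cramer update stays below the
# tolerance because it accumulates deviations from the running mean rather than
# raw second moments.

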
def test_incremental_variance_ddof():
    # Test that degrees of freedom parameter for calculations are correct.
    rng = np.random.RandomState(1999)
    X = rng.randn(50, 10)
    n_samples, n_features = X.shape
    for batch_size in [11, 20, 37]:
        steps = np.arange(0, X.shape[0], batch_size)
        if steps[-1] != X.shape[0]:
            steps = np.hstack([steps, n_samples])

        for i, j in zip(steps[:-1], steps[1:]):
            batch = X[i:j, :]
            if i == 0:
                incremental_means = batch.mean(axis=0)
                incremental_variances = batch.var(axis=0)
                # Assign this twice so that the test logic is consistent
                incremental_count = batch.shape[0]
                sample_count = np.full(batch.shape[1], batch.shape[0], dtype=np.int32)
            else:
                result = _incremental_mean_and_var(
                    batch, incremental_means, incremental_variances, sample_count
                )
                (incremental_means, incremental_variances, incremental_count) = result
                sample_count += batch.shape[0]

            calculated_means = np.mean(X[:j], axis=0)
            calculated_variances = np.var(X[:j], axis=0)
            assert_almost_equal(incremental_means, calculated_means, 6)
            assert_almost_equal(incremental_variances, calculated_variances, 6)
            assert_array_equal(incremental_count, sample_count)


def test_vector_sign_flip():
    # Testing that sign flip is working & largest value has positive sign
    data = np.random.RandomState(36).randn(5, 5)
    max_abs_rows = np.argmax(np.abs(data), axis=1)
    data_flipped = _deterministic_vector_sign_flip(data)
    max_rows = np.argmax(data_flipped, axis=1)
    assert_array_equal(max_abs_rows, max_rows)
    signs = np.sign(data[range(data.shape[0]), max_abs_rows])
    assert_array_equal(data, data_flipped * signs[:, np.newaxis])


def test_softmax():
    rng = np.random.RandomState(0)
    X = rng.randn(3, 5)
    exp_X = np.exp(X)
    sum_exp_X = np.sum(exp_X, axis=1).reshape((-1, 1))
    assert_array_almost_equal(softmax(X), exp_X / sum_exp_X)


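# Reference-check note: the expected value is computed with the textbook formula
# exp(X) / sum(exp(X), axis=1). Softmax is invariant to subtracting a per-row
# constant, so an implementation may (and typically does) subtract the row maximum
# before exponentiating to avoid overflow on large inputs; for the small random
# values used here both routes agree within the tested tolerance.

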
def test_stable_cumsum():
    assert_array_equal(stable_cumsum([1, 2, 3]), np.cumsum([1, 2, 3]))
    r = np.random.RandomState(0).rand(100000)
    with pytest.warns(RuntimeWarning):
        stable_cumsum(r, rtol=0, atol=0)

    # test axis parameter
    A = np.random.RandomState(36).randint(1000, size=(5, 5, 5))
    assert_array_equal(stable_cumsum(A, axis=0), np.cumsum(A, axis=0))
    assert_array_equal(stable_cumsum(A, axis=1), np.cumsum(A, axis=1))
    assert_array_equal(stable_cumsum(A, axis=2), np.cumsum(A, axis=2))


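# What the warning above checks, roughly: `stable_cumsum` is expected to compare the
# tail of the float64 cumulative sum against an independently computed total and to
# emit a RuntimeWarning when the two differ beyond the given rtol/atol. Setting
# rtol=0, atol=0 therefore makes any floating point accumulation error on the
# 100000 random values trip the warning, while the small exact-integer inputs in
# the axis checks match np.cumsum exactly.

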
@pytest.mark.parametrize(
    "A_array_constr", [np.array, sparse.csr_matrix], ids=["dense", "sparse"]
)
@pytest.mark.parametrize(
    "B_array_constr", [np.array, sparse.csr_matrix], ids=["dense", "sparse"]
)
def test_safe_sparse_dot_2d(A_array_constr, B_array_constr):
    rng = np.random.RandomState(0)

    A = rng.random_sample((30, 10))
    B = rng.random_sample((10, 20))
    expected = np.dot(A, B)

    A = A_array_constr(A)
    B = B_array_constr(B)
    actual = safe_sparse_dot(A, B, dense_output=True)

    assert_allclose(actual, expected)


def test_safe_sparse_dot_nd():
    rng = np.random.RandomState(0)

    # dense ND / sparse
    A = rng.random_sample((2, 3, 4, 5, 6))
    B = rng.random_sample((6, 7))
    expected = np.dot(A, B)
    B = sparse.csr_matrix(B)
    actual = safe_sparse_dot(A, B)
    assert_allclose(actual, expected)

    # sparse / dense ND
    A = rng.random_sample((2, 3))
    B = rng.random_sample((4, 5, 3, 6))
    expected = np.dot(A, B)
    A = sparse.csr_matrix(A)
    actual = safe_sparse_dot(A, B)
    assert_allclose(actual, expected)


@pytest.mark.parametrize(
    "A_array_constr", [np.array, sparse.csr_matrix], ids=["dense", "sparse"]
)
def test_safe_sparse_dot_2d_1d(A_array_constr):
    rng = np.random.RandomState(0)

    B = rng.random_sample((10))

    # 2D @ 1D
    A = rng.random_sample((30, 10))
    expected = np.dot(A, B)
    A = A_array_constr(A)
    actual = safe_sparse_dot(A, B)
    assert_allclose(actual, expected)

    # 1D @ 2D
    A = rng.random_sample((10, 30))
    expected = np.dot(B, A)
    A = A_array_constr(A)
    actual = safe_sparse_dot(B, A)
    assert_allclose(actual, expected)


@pytest.mark.parametrize("dense_output", [True, False])
def test_safe_sparse_dot_dense_output(dense_output):
    rng = np.random.RandomState(0)

    A = sparse.random(30, 10, density=0.1, random_state=rng)
    B = sparse.random(10, 20, density=0.1, random_state=rng)

    expected = A.dot(B)
    actual = safe_sparse_dot(A, B, dense_output=dense_output)

    assert sparse.issparse(actual) == (not dense_output)

    if dense_output:
        expected = expected.toarray()
    assert_allclose_dense_sparse(actual, expected)