test_sparsefuncs.py 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916
  1. import numpy as np
  2. import pytest
  3. import scipy.sparse as sp
  4. from numpy.random import RandomState
  5. from numpy.testing import assert_array_almost_equal, assert_array_equal
  6. from scipy import linalg
  7. from sklearn.datasets import make_classification
  8. from sklearn.utils._testing import assert_allclose
  9. from sklearn.utils.sparsefuncs import (
  10. count_nonzero,
  11. csc_median_axis_0,
  12. incr_mean_variance_axis,
  13. inplace_column_scale,
  14. inplace_row_scale,
  15. inplace_swap_column,
  16. inplace_swap_row,
  17. mean_variance_axis,
  18. min_max_axis,
  19. )
  20. from sklearn.utils.sparsefuncs_fast import (
  21. assign_rows_csr,
  22. csr_row_norms,
  23. inplace_csr_row_normalize_l1,
  24. inplace_csr_row_normalize_l2,
  25. )
  26. def test_mean_variance_axis0():
  27. X, _ = make_classification(5, 4, random_state=0)
  28. # Sparsify the array a little bit
  29. X[0, 0] = 0
  30. X[2, 1] = 0
  31. X[4, 3] = 0
  32. X_lil = sp.lil_matrix(X)
  33. X_lil[1, 0] = 0
  34. X[1, 0] = 0
  35. with pytest.raises(TypeError):
  36. mean_variance_axis(X_lil, axis=0)
  37. X_csr = sp.csr_matrix(X_lil)
  38. X_csc = sp.csc_matrix(X_lil)
  39. expected_dtypes = [
  40. (np.float32, np.float32),
  41. (np.float64, np.float64),
  42. (np.int32, np.float64),
  43. (np.int64, np.float64),
  44. ]
  45. for input_dtype, output_dtype in expected_dtypes:
  46. X_test = X.astype(input_dtype)
  47. for X_sparse in (X_csr, X_csc):
  48. X_sparse = X_sparse.astype(input_dtype)
  49. X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
  50. assert X_means.dtype == output_dtype
  51. assert X_vars.dtype == output_dtype
  52. assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
  53. assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
  54. @pytest.mark.parametrize("dtype", [np.float32, np.float64])
  55. @pytest.mark.parametrize("sparse_constructor", [sp.csr_matrix, sp.csc_matrix])
  56. def test_mean_variance_axis0_precision(dtype, sparse_constructor):
  57. # Check that there's no big loss of precision when the real variance is
  58. # exactly 0. (#19766)
  59. rng = np.random.RandomState(0)
  60. X = np.full(fill_value=100.0, shape=(1000, 1), dtype=dtype)
  61. # Add some missing records which should be ignored:
  62. missing_indices = rng.choice(np.arange(X.shape[0]), 10, replace=False)
  63. X[missing_indices, 0] = np.nan
  64. X = sparse_constructor(X)
  65. # Random positive weights:
  66. sample_weight = rng.rand(X.shape[0]).astype(dtype)
  67. _, var = mean_variance_axis(X, weights=sample_weight, axis=0)
  68. assert var < np.finfo(dtype).eps
  69. def test_mean_variance_axis1():
  70. X, _ = make_classification(5, 4, random_state=0)
  71. # Sparsify the array a little bit
  72. X[0, 0] = 0
  73. X[2, 1] = 0
  74. X[4, 3] = 0
  75. X_lil = sp.lil_matrix(X)
  76. X_lil[1, 0] = 0
  77. X[1, 0] = 0
  78. with pytest.raises(TypeError):
  79. mean_variance_axis(X_lil, axis=1)
  80. X_csr = sp.csr_matrix(X_lil)
  81. X_csc = sp.csc_matrix(X_lil)
  82. expected_dtypes = [
  83. (np.float32, np.float32),
  84. (np.float64, np.float64),
  85. (np.int32, np.float64),
  86. (np.int64, np.float64),
  87. ]
  88. for input_dtype, output_dtype in expected_dtypes:
  89. X_test = X.astype(input_dtype)
  90. for X_sparse in (X_csr, X_csc):
  91. X_sparse = X_sparse.astype(input_dtype)
  92. X_means, X_vars = mean_variance_axis(X_sparse, axis=0)
  93. assert X_means.dtype == output_dtype
  94. assert X_vars.dtype == output_dtype
  95. assert_array_almost_equal(X_means, np.mean(X_test, axis=0))
  96. assert_array_almost_equal(X_vars, np.var(X_test, axis=0))
  97. @pytest.mark.parametrize(
  98. ["Xw", "X", "weights"],
  99. [
  100. ([[0, 0, 1], [0, 2, 3]], [[0, 0, 1], [0, 2, 3]], [1, 1, 1]),
  101. ([[0, 0, 1], [0, 1, 1]], [[0, 0, 0, 1], [0, 1, 1, 1]], [1, 2, 1]),
  102. ([[0, 0, 1], [0, 1, 1]], [[0, 0, 1], [0, 1, 1]], None),
  103. (
  104. [[0, np.nan, 2], [0, np.nan, np.nan]],
  105. [[0, np.nan, 2], [0, np.nan, np.nan]],
  106. [1.0, 1.0, 1.0],
  107. ),
  108. (
  109. [[0, 0], [1, np.nan], [2, 0], [0, 3], [np.nan, np.nan], [np.nan, 2]],
  110. [
  111. [0, 0, 0],
  112. [1, 1, np.nan],
  113. [2, 2, 0],
  114. [0, 0, 3],
  115. [np.nan, np.nan, np.nan],
  116. [np.nan, np.nan, 2],
  117. ],
  118. [2.0, 1.0],
  119. ),
  120. (
  121. [[1, 0, 1], [0, 3, 1]],
  122. [[1, 0, 0, 0, 1], [0, 3, 3, 3, 1]],
  123. np.array([1, 3, 1]),
  124. ),
  125. ],
  126. )
  127. @pytest.mark.parametrize("sparse_constructor", [sp.csc_matrix, sp.csr_matrix])
  128. @pytest.mark.parametrize("dtype", [np.float32, np.float64])
  129. def test_incr_mean_variance_axis_weighted_axis1(
  130. Xw, X, weights, sparse_constructor, dtype
  131. ):
  132. axis = 1
  133. Xw_sparse = sparse_constructor(Xw).astype(dtype)
  134. X_sparse = sparse_constructor(X).astype(dtype)
  135. last_mean = np.zeros(np.shape(Xw)[0], dtype=dtype)
  136. last_var = np.zeros_like(last_mean, dtype=dtype)
  137. last_n = np.zeros_like(last_mean, dtype=np.int64)
  138. means0, vars0, n_incr0 = incr_mean_variance_axis(
  139. X=X_sparse,
  140. axis=axis,
  141. last_mean=last_mean,
  142. last_var=last_var,
  143. last_n=last_n,
  144. weights=None,
  145. )
  146. means_w0, vars_w0, n_incr_w0 = incr_mean_variance_axis(
  147. X=Xw_sparse,
  148. axis=axis,
  149. last_mean=last_mean,
  150. last_var=last_var,
  151. last_n=last_n,
  152. weights=weights,
  153. )
  154. assert means_w0.dtype == dtype
  155. assert vars_w0.dtype == dtype
  156. assert n_incr_w0.dtype == dtype
  157. means_simple, vars_simple = mean_variance_axis(X=X_sparse, axis=axis)
  158. assert_array_almost_equal(means0, means_w0)
  159. assert_array_almost_equal(means0, means_simple)
  160. assert_array_almost_equal(vars0, vars_w0)
  161. assert_array_almost_equal(vars0, vars_simple)
  162. assert_array_almost_equal(n_incr0, n_incr_w0)
  163. # check second round for incremental
  164. means1, vars1, n_incr1 = incr_mean_variance_axis(
  165. X=X_sparse,
  166. axis=axis,
  167. last_mean=means0,
  168. last_var=vars0,
  169. last_n=n_incr0,
  170. weights=None,
  171. )
  172. means_w1, vars_w1, n_incr_w1 = incr_mean_variance_axis(
  173. X=Xw_sparse,
  174. axis=axis,
  175. last_mean=means_w0,
  176. last_var=vars_w0,
  177. last_n=n_incr_w0,
  178. weights=weights,
  179. )
  180. assert_array_almost_equal(means1, means_w1)
  181. assert_array_almost_equal(vars1, vars_w1)
  182. assert_array_almost_equal(n_incr1, n_incr_w1)
  183. assert means_w1.dtype == dtype
  184. assert vars_w1.dtype == dtype
  185. assert n_incr_w1.dtype == dtype
  186. @pytest.mark.parametrize(
  187. ["Xw", "X", "weights"],
  188. [
  189. ([[0, 0, 1], [0, 2, 3]], [[0, 0, 1], [0, 2, 3]], [1, 1]),
  190. ([[0, 0, 1], [0, 1, 1]], [[0, 0, 1], [0, 1, 1], [0, 1, 1]], [1, 2]),
  191. ([[0, 0, 1], [0, 1, 1]], [[0, 0, 1], [0, 1, 1]], None),
  192. (
  193. [[0, np.nan, 2], [0, np.nan, np.nan]],
  194. [[0, np.nan, 2], [0, np.nan, np.nan]],
  195. [1.0, 1.0],
  196. ),
  197. (
  198. [[0, 0, 1, np.nan, 2, 0], [0, 3, np.nan, np.nan, np.nan, 2]],
  199. [
  200. [0, 0, 1, np.nan, 2, 0],
  201. [0, 0, 1, np.nan, 2, 0],
  202. [0, 3, np.nan, np.nan, np.nan, 2],
  203. ],
  204. [2.0, 1.0],
  205. ),
  206. (
  207. [[1, 0, 1], [0, 0, 1]],
  208. [[1, 0, 1], [0, 0, 1], [0, 0, 1], [0, 0, 1]],
  209. np.array([1, 3]),
  210. ),
  211. ],
  212. )
  213. @pytest.mark.parametrize("sparse_constructor", [sp.csc_matrix, sp.csr_matrix])
  214. @pytest.mark.parametrize("dtype", [np.float32, np.float64])
  215. def test_incr_mean_variance_axis_weighted_axis0(
  216. Xw, X, weights, sparse_constructor, dtype
  217. ):
  218. axis = 0
  219. Xw_sparse = sparse_constructor(Xw).astype(dtype)
  220. X_sparse = sparse_constructor(X).astype(dtype)
  221. last_mean = np.zeros(np.size(Xw, 1), dtype=dtype)
  222. last_var = np.zeros_like(last_mean)
  223. last_n = np.zeros_like(last_mean, dtype=np.int64)
  224. means0, vars0, n_incr0 = incr_mean_variance_axis(
  225. X=X_sparse,
  226. axis=axis,
  227. last_mean=last_mean,
  228. last_var=last_var,
  229. last_n=last_n,
  230. weights=None,
  231. )
  232. means_w0, vars_w0, n_incr_w0 = incr_mean_variance_axis(
  233. X=Xw_sparse,
  234. axis=axis,
  235. last_mean=last_mean,
  236. last_var=last_var,
  237. last_n=last_n,
  238. weights=weights,
  239. )
  240. assert means_w0.dtype == dtype
  241. assert vars_w0.dtype == dtype
  242. assert n_incr_w0.dtype == dtype
  243. means_simple, vars_simple = mean_variance_axis(X=X_sparse, axis=axis)
  244. assert_array_almost_equal(means0, means_w0)
  245. assert_array_almost_equal(means0, means_simple)
  246. assert_array_almost_equal(vars0, vars_w0)
  247. assert_array_almost_equal(vars0, vars_simple)
  248. assert_array_almost_equal(n_incr0, n_incr_w0)
  249. # check second round for incremental
  250. means1, vars1, n_incr1 = incr_mean_variance_axis(
  251. X=X_sparse,
  252. axis=axis,
  253. last_mean=means0,
  254. last_var=vars0,
  255. last_n=n_incr0,
  256. weights=None,
  257. )
  258. means_w1, vars_w1, n_incr_w1 = incr_mean_variance_axis(
  259. X=Xw_sparse,
  260. axis=axis,
  261. last_mean=means_w0,
  262. last_var=vars_w0,
  263. last_n=n_incr_w0,
  264. weights=weights,
  265. )
  266. assert_array_almost_equal(means1, means_w1)
  267. assert_array_almost_equal(vars1, vars_w1)
  268. assert_array_almost_equal(n_incr1, n_incr_w1)
  269. assert means_w1.dtype == dtype
  270. assert vars_w1.dtype == dtype
  271. assert n_incr_w1.dtype == dtype
  272. def test_incr_mean_variance_axis():
  273. for axis in [0, 1]:
  274. rng = np.random.RandomState(0)
  275. n_features = 50
  276. n_samples = 10
  277. if axis == 0:
  278. data_chunks = [rng.randint(0, 2, size=n_features) for i in range(n_samples)]
  279. else:
  280. data_chunks = [rng.randint(0, 2, size=n_samples) for i in range(n_features)]
  281. # default params for incr_mean_variance
  282. last_mean = np.zeros(n_features) if axis == 0 else np.zeros(n_samples)
  283. last_var = np.zeros_like(last_mean)
  284. last_n = np.zeros_like(last_mean, dtype=np.int64)
  285. # Test errors
  286. X = np.array(data_chunks[0])
  287. X = np.atleast_2d(X)
  288. X = X.T if axis == 1 else X
  289. X_lil = sp.lil_matrix(X)
  290. X_csr = sp.csr_matrix(X_lil)
  291. with pytest.raises(TypeError):
  292. incr_mean_variance_axis(
  293. X=axis, axis=last_mean, last_mean=last_var, last_var=last_n
  294. )
  295. with pytest.raises(TypeError):
  296. incr_mean_variance_axis(
  297. X_lil, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
  298. )
  299. # Test _incr_mean_and_var with a 1 row input
  300. X_means, X_vars = mean_variance_axis(X_csr, axis)
  301. X_means_incr, X_vars_incr, n_incr = incr_mean_variance_axis(
  302. X_csr, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
  303. )
  304. assert_array_almost_equal(X_means, X_means_incr)
  305. assert_array_almost_equal(X_vars, X_vars_incr)
  306. # X.shape[axis] picks # samples
  307. assert_array_equal(X.shape[axis], n_incr)
  308. X_csc = sp.csc_matrix(X_lil)
  309. X_means, X_vars = mean_variance_axis(X_csc, axis)
  310. assert_array_almost_equal(X_means, X_means_incr)
  311. assert_array_almost_equal(X_vars, X_vars_incr)
  312. assert_array_equal(X.shape[axis], n_incr)
  313. # Test _incremental_mean_and_var with whole data
  314. X = np.vstack(data_chunks)
  315. X = X.T if axis == 1 else X
  316. X_lil = sp.lil_matrix(X)
  317. X_csr = sp.csr_matrix(X_lil)
  318. X_csc = sp.csc_matrix(X_lil)
  319. expected_dtypes = [
  320. (np.float32, np.float32),
  321. (np.float64, np.float64),
  322. (np.int32, np.float64),
  323. (np.int64, np.float64),
  324. ]
  325. for input_dtype, output_dtype in expected_dtypes:
  326. for X_sparse in (X_csr, X_csc):
  327. X_sparse = X_sparse.astype(input_dtype)
  328. last_mean = last_mean.astype(output_dtype)
  329. last_var = last_var.astype(output_dtype)
  330. X_means, X_vars = mean_variance_axis(X_sparse, axis)
  331. X_means_incr, X_vars_incr, n_incr = incr_mean_variance_axis(
  332. X_sparse,
  333. axis=axis,
  334. last_mean=last_mean,
  335. last_var=last_var,
  336. last_n=last_n,
  337. )
  338. assert X_means_incr.dtype == output_dtype
  339. assert X_vars_incr.dtype == output_dtype
  340. assert_array_almost_equal(X_means, X_means_incr)
  341. assert_array_almost_equal(X_vars, X_vars_incr)
  342. assert_array_equal(X.shape[axis], n_incr)
  343. @pytest.mark.parametrize("sparse_constructor", [sp.csc_matrix, sp.csr_matrix])
  344. def test_incr_mean_variance_axis_dim_mismatch(sparse_constructor):
  345. """Check that we raise proper error when axis=1 and the dimension mismatch.
  346. Non-regression test for:
  347. https://github.com/scikit-learn/scikit-learn/pull/18655
  348. """
  349. n_samples, n_features = 60, 4
  350. rng = np.random.RandomState(42)
  351. X = sparse_constructor(rng.rand(n_samples, n_features))
  352. last_mean = np.zeros(n_features)
  353. last_var = np.zeros_like(last_mean)
  354. last_n = np.zeros(last_mean.shape, dtype=np.int64)
  355. kwargs = dict(last_mean=last_mean, last_var=last_var, last_n=last_n)
  356. mean0, var0, _ = incr_mean_variance_axis(X, axis=0, **kwargs)
  357. assert_allclose(np.mean(X.toarray(), axis=0), mean0)
  358. assert_allclose(np.var(X.toarray(), axis=0), var0)
  359. # test ValueError if axis=1 and last_mean.size == n_features
  360. with pytest.raises(ValueError):
  361. incr_mean_variance_axis(X, axis=1, **kwargs)
  362. # test inconsistent shapes of last_mean, last_var, last_n
  363. kwargs = dict(last_mean=last_mean[:-1], last_var=last_var, last_n=last_n)
  364. with pytest.raises(ValueError):
  365. incr_mean_variance_axis(X, axis=0, **kwargs)
  366. @pytest.mark.parametrize(
  367. "X1, X2",
  368. [
  369. (
  370. sp.random(5, 2, density=0.8, format="csr", random_state=0),
  371. sp.random(13, 2, density=0.8, format="csr", random_state=0),
  372. ),
  373. (
  374. sp.random(5, 2, density=0.8, format="csr", random_state=0),
  375. sp.hstack(
  376. [
  377. sp.csr_matrix(np.full((13, 1), fill_value=np.nan)),
  378. sp.random(13, 1, density=0.8, random_state=42),
  379. ],
  380. format="csr",
  381. ),
  382. ),
  383. ],
  384. )
  385. def test_incr_mean_variance_axis_equivalence_mean_variance(X1, X2):
  386. # non-regression test for:
  387. # https://github.com/scikit-learn/scikit-learn/issues/16448
  388. # check that computing the incremental mean and variance is equivalent to
  389. # computing the mean and variance on the stacked dataset.
  390. axis = 0
  391. last_mean, last_var = np.zeros(X1.shape[1]), np.zeros(X1.shape[1])
  392. last_n = np.zeros(X1.shape[1], dtype=np.int64)
  393. updated_mean, updated_var, updated_n = incr_mean_variance_axis(
  394. X1, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
  395. )
  396. updated_mean, updated_var, updated_n = incr_mean_variance_axis(
  397. X2, axis=axis, last_mean=updated_mean, last_var=updated_var, last_n=updated_n
  398. )
  399. X = sp.vstack([X1, X2])
  400. assert_allclose(updated_mean, np.nanmean(X.toarray(), axis=axis))
  401. assert_allclose(updated_var, np.nanvar(X.toarray(), axis=axis))
  402. assert_allclose(updated_n, np.count_nonzero(~np.isnan(X.toarray()), axis=0))
  403. def test_incr_mean_variance_no_new_n():
  404. # check the behaviour when we update the variance with an empty matrix
  405. axis = 0
  406. X1 = sp.random(5, 1, density=0.8, random_state=0).tocsr()
  407. X2 = sp.random(0, 1, density=0.8, random_state=0).tocsr()
  408. last_mean, last_var = np.zeros(X1.shape[1]), np.zeros(X1.shape[1])
  409. last_n = np.zeros(X1.shape[1], dtype=np.int64)
  410. last_mean, last_var, last_n = incr_mean_variance_axis(
  411. X1, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
  412. )
  413. # update statistic with a column which should ignored
  414. updated_mean, updated_var, updated_n = incr_mean_variance_axis(
  415. X2, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
  416. )
  417. assert_allclose(updated_mean, last_mean)
  418. assert_allclose(updated_var, last_var)
  419. assert_allclose(updated_n, last_n)
  420. def test_incr_mean_variance_n_float():
  421. # check the behaviour when last_n is just a number
  422. axis = 0
  423. X = sp.random(5, 2, density=0.8, random_state=0).tocsr()
  424. last_mean, last_var = np.zeros(X.shape[1]), np.zeros(X.shape[1])
  425. last_n = 0
  426. _, _, new_n = incr_mean_variance_axis(
  427. X, axis=axis, last_mean=last_mean, last_var=last_var, last_n=last_n
  428. )
  429. assert_allclose(new_n, np.full(X.shape[1], X.shape[0]))
  430. @pytest.mark.parametrize("axis", [0, 1])
  431. @pytest.mark.parametrize("sparse_constructor", [sp.csc_matrix, sp.csr_matrix])
  432. def test_incr_mean_variance_axis_ignore_nan(axis, sparse_constructor):
  433. old_means = np.array([535.0, 535.0, 535.0, 535.0])
  434. old_variances = np.array([4225.0, 4225.0, 4225.0, 4225.0])
  435. old_sample_count = np.array([2, 2, 2, 2], dtype=np.int64)
  436. X = sparse_constructor(
  437. np.array([[170, 170, 170, 170], [430, 430, 430, 430], [300, 300, 300, 300]])
  438. )
  439. X_nan = sparse_constructor(
  440. np.array(
  441. [
  442. [170, np.nan, 170, 170],
  443. [np.nan, 170, 430, 430],
  444. [430, 430, np.nan, 300],
  445. [300, 300, 300, np.nan],
  446. ]
  447. )
  448. )
  449. # we avoid creating specific data for axis 0 and 1: translating the data is
  450. # enough.
  451. if axis:
  452. X = X.T
  453. X_nan = X_nan.T
  454. # take a copy of the old statistics since they are modified in place.
  455. X_means, X_vars, X_sample_count = incr_mean_variance_axis(
  456. X,
  457. axis=axis,
  458. last_mean=old_means.copy(),
  459. last_var=old_variances.copy(),
  460. last_n=old_sample_count.copy(),
  461. )
  462. X_nan_means, X_nan_vars, X_nan_sample_count = incr_mean_variance_axis(
  463. X_nan,
  464. axis=axis,
  465. last_mean=old_means.copy(),
  466. last_var=old_variances.copy(),
  467. last_n=old_sample_count.copy(),
  468. )
  469. assert_allclose(X_nan_means, X_means)
  470. assert_allclose(X_nan_vars, X_vars)
  471. assert_allclose(X_nan_sample_count, X_sample_count)
  472. def test_mean_variance_illegal_axis():
  473. X, _ = make_classification(5, 4, random_state=0)
  474. # Sparsify the array a little bit
  475. X[0, 0] = 0
  476. X[2, 1] = 0
  477. X[4, 3] = 0
  478. X_csr = sp.csr_matrix(X)
  479. with pytest.raises(ValueError):
  480. mean_variance_axis(X_csr, axis=-3)
  481. with pytest.raises(ValueError):
  482. mean_variance_axis(X_csr, axis=2)
  483. with pytest.raises(ValueError):
  484. mean_variance_axis(X_csr, axis=-1)
  485. with pytest.raises(ValueError):
  486. incr_mean_variance_axis(
  487. X_csr, axis=-3, last_mean=None, last_var=None, last_n=None
  488. )
  489. with pytest.raises(ValueError):
  490. incr_mean_variance_axis(
  491. X_csr, axis=2, last_mean=None, last_var=None, last_n=None
  492. )
  493. with pytest.raises(ValueError):
  494. incr_mean_variance_axis(
  495. X_csr, axis=-1, last_mean=None, last_var=None, last_n=None
  496. )
  497. def test_densify_rows():
  498. for dtype in (np.float32, np.float64):
  499. X = sp.csr_matrix(
  500. [[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=dtype
  501. )
  502. X_rows = np.array([0, 2, 3], dtype=np.intp)
  503. out = np.ones((6, X.shape[1]), dtype=dtype)
  504. out_rows = np.array([1, 3, 4], dtype=np.intp)
  505. expect = np.ones_like(out)
  506. expect[out_rows] = X[X_rows, :].toarray()
  507. assign_rows_csr(X, X_rows, out_rows, out)
  508. assert_array_equal(out, expect)
  509. def test_inplace_column_scale():
  510. rng = np.random.RandomState(0)
  511. X = sp.rand(100, 200, 0.05)
  512. Xr = X.tocsr()
  513. Xc = X.tocsc()
  514. XA = X.toarray()
  515. scale = rng.rand(200)
  516. XA *= scale
  517. inplace_column_scale(Xc, scale)
  518. inplace_column_scale(Xr, scale)
  519. assert_array_almost_equal(Xr.toarray(), Xc.toarray())
  520. assert_array_almost_equal(XA, Xc.toarray())
  521. assert_array_almost_equal(XA, Xr.toarray())
  522. with pytest.raises(TypeError):
  523. inplace_column_scale(X.tolil(), scale)
  524. X = X.astype(np.float32)
  525. scale = scale.astype(np.float32)
  526. Xr = X.tocsr()
  527. Xc = X.tocsc()
  528. XA = X.toarray()
  529. XA *= scale
  530. inplace_column_scale(Xc, scale)
  531. inplace_column_scale(Xr, scale)
  532. assert_array_almost_equal(Xr.toarray(), Xc.toarray())
  533. assert_array_almost_equal(XA, Xc.toarray())
  534. assert_array_almost_equal(XA, Xr.toarray())
  535. with pytest.raises(TypeError):
  536. inplace_column_scale(X.tolil(), scale)
  537. def test_inplace_row_scale():
  538. rng = np.random.RandomState(0)
  539. X = sp.rand(100, 200, 0.05)
  540. Xr = X.tocsr()
  541. Xc = X.tocsc()
  542. XA = X.toarray()
  543. scale = rng.rand(100)
  544. XA *= scale.reshape(-1, 1)
  545. inplace_row_scale(Xc, scale)
  546. inplace_row_scale(Xr, scale)
  547. assert_array_almost_equal(Xr.toarray(), Xc.toarray())
  548. assert_array_almost_equal(XA, Xc.toarray())
  549. assert_array_almost_equal(XA, Xr.toarray())
  550. with pytest.raises(TypeError):
  551. inplace_column_scale(X.tolil(), scale)
  552. X = X.astype(np.float32)
  553. scale = scale.astype(np.float32)
  554. Xr = X.tocsr()
  555. Xc = X.tocsc()
  556. XA = X.toarray()
  557. XA *= scale.reshape(-1, 1)
  558. inplace_row_scale(Xc, scale)
  559. inplace_row_scale(Xr, scale)
  560. assert_array_almost_equal(Xr.toarray(), Xc.toarray())
  561. assert_array_almost_equal(XA, Xc.toarray())
  562. assert_array_almost_equal(XA, Xr.toarray())
  563. with pytest.raises(TypeError):
  564. inplace_column_scale(X.tolil(), scale)
  565. def test_inplace_swap_row():
  566. X = np.array(
  567. [[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64
  568. )
  569. X_csr = sp.csr_matrix(X)
  570. X_csc = sp.csc_matrix(X)
  571. swap = linalg.get_blas_funcs(("swap",), (X,))
  572. swap = swap[0]
  573. X[0], X[-1] = swap(X[0], X[-1])
  574. inplace_swap_row(X_csr, 0, -1)
  575. inplace_swap_row(X_csc, 0, -1)
  576. assert_array_equal(X_csr.toarray(), X_csc.toarray())
  577. assert_array_equal(X, X_csc.toarray())
  578. assert_array_equal(X, X_csr.toarray())
  579. X[2], X[3] = swap(X[2], X[3])
  580. inplace_swap_row(X_csr, 2, 3)
  581. inplace_swap_row(X_csc, 2, 3)
  582. assert_array_equal(X_csr.toarray(), X_csc.toarray())
  583. assert_array_equal(X, X_csc.toarray())
  584. assert_array_equal(X, X_csr.toarray())
  585. with pytest.raises(TypeError):
  586. inplace_swap_row(X_csr.tolil())
  587. X = np.array(
  588. [[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float32
  589. )
  590. X_csr = sp.csr_matrix(X)
  591. X_csc = sp.csc_matrix(X)
  592. swap = linalg.get_blas_funcs(("swap",), (X,))
  593. swap = swap[0]
  594. X[0], X[-1] = swap(X[0], X[-1])
  595. inplace_swap_row(X_csr, 0, -1)
  596. inplace_swap_row(X_csc, 0, -1)
  597. assert_array_equal(X_csr.toarray(), X_csc.toarray())
  598. assert_array_equal(X, X_csc.toarray())
  599. assert_array_equal(X, X_csr.toarray())
  600. X[2], X[3] = swap(X[2], X[3])
  601. inplace_swap_row(X_csr, 2, 3)
  602. inplace_swap_row(X_csc, 2, 3)
  603. assert_array_equal(X_csr.toarray(), X_csc.toarray())
  604. assert_array_equal(X, X_csc.toarray())
  605. assert_array_equal(X, X_csr.toarray())
  606. with pytest.raises(TypeError):
  607. inplace_swap_row(X_csr.tolil())
  608. def test_inplace_swap_column():
  609. X = np.array(
  610. [[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64
  611. )
  612. X_csr = sp.csr_matrix(X)
  613. X_csc = sp.csc_matrix(X)
  614. swap = linalg.get_blas_funcs(("swap",), (X,))
  615. swap = swap[0]
  616. X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1])
  617. inplace_swap_column(X_csr, 0, -1)
  618. inplace_swap_column(X_csc, 0, -1)
  619. assert_array_equal(X_csr.toarray(), X_csc.toarray())
  620. assert_array_equal(X, X_csc.toarray())
  621. assert_array_equal(X, X_csr.toarray())
  622. X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1])
  623. inplace_swap_column(X_csr, 0, 1)
  624. inplace_swap_column(X_csc, 0, 1)
  625. assert_array_equal(X_csr.toarray(), X_csc.toarray())
  626. assert_array_equal(X, X_csc.toarray())
  627. assert_array_equal(X, X_csr.toarray())
  628. with pytest.raises(TypeError):
  629. inplace_swap_column(X_csr.tolil())
  630. X = np.array(
  631. [[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float32
  632. )
  633. X_csr = sp.csr_matrix(X)
  634. X_csc = sp.csc_matrix(X)
  635. swap = linalg.get_blas_funcs(("swap",), (X,))
  636. swap = swap[0]
  637. X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1])
  638. inplace_swap_column(X_csr, 0, -1)
  639. inplace_swap_column(X_csc, 0, -1)
  640. assert_array_equal(X_csr.toarray(), X_csc.toarray())
  641. assert_array_equal(X, X_csc.toarray())
  642. assert_array_equal(X, X_csr.toarray())
  643. X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1])
  644. inplace_swap_column(X_csr, 0, 1)
  645. inplace_swap_column(X_csc, 0, 1)
  646. assert_array_equal(X_csr.toarray(), X_csc.toarray())
  647. assert_array_equal(X, X_csc.toarray())
  648. assert_array_equal(X, X_csr.toarray())
  649. with pytest.raises(TypeError):
  650. inplace_swap_column(X_csr.tolil())
  651. @pytest.mark.parametrize("dtype", [np.float32, np.float64])
  652. @pytest.mark.parametrize("axis", [0, 1, None])
  653. @pytest.mark.parametrize("sparse_format", [sp.csr_matrix, sp.csc_matrix])
  654. @pytest.mark.parametrize(
  655. "missing_values, min_func, max_func, ignore_nan",
  656. [(0, np.min, np.max, False), (np.nan, np.nanmin, np.nanmax, True)],
  657. )
  658. @pytest.mark.parametrize("large_indices", [True, False])
  659. def test_min_max(
  660. dtype,
  661. axis,
  662. sparse_format,
  663. missing_values,
  664. min_func,
  665. max_func,
  666. ignore_nan,
  667. large_indices,
  668. ):
  669. X = np.array(
  670. [
  671. [0, 3, 0],
  672. [2, -1, missing_values],
  673. [0, 0, 0],
  674. [9, missing_values, 7],
  675. [4, 0, 5],
  676. ],
  677. dtype=dtype,
  678. )
  679. X_sparse = sparse_format(X)
  680. if large_indices:
  681. X_sparse.indices = X_sparse.indices.astype("int64")
  682. X_sparse.indptr = X_sparse.indptr.astype("int64")
  683. mins_sparse, maxs_sparse = min_max_axis(X_sparse, axis=axis, ignore_nan=ignore_nan)
  684. assert_array_equal(mins_sparse, min_func(X, axis=axis))
  685. assert_array_equal(maxs_sparse, max_func(X, axis=axis))
  686. def test_min_max_axis_errors():
  687. X = np.array(
  688. [[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64
  689. )
  690. X_csr = sp.csr_matrix(X)
  691. X_csc = sp.csc_matrix(X)
  692. with pytest.raises(TypeError):
  693. min_max_axis(X_csr.tolil(), axis=0)
  694. with pytest.raises(ValueError):
  695. min_max_axis(X_csr, axis=2)
  696. with pytest.raises(ValueError):
  697. min_max_axis(X_csc, axis=-3)
  698. def test_count_nonzero():
  699. X = np.array(
  700. [[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64
  701. )
  702. X_csr = sp.csr_matrix(X)
  703. X_csc = sp.csc_matrix(X)
  704. X_nonzero = X != 0
  705. sample_weight = [0.5, 0.2, 0.3, 0.1, 0.1]
  706. X_nonzero_weighted = X_nonzero * np.array(sample_weight)[:, None]
  707. for axis in [0, 1, -1, -2, None]:
  708. assert_array_almost_equal(
  709. count_nonzero(X_csr, axis=axis), X_nonzero.sum(axis=axis)
  710. )
  711. assert_array_almost_equal(
  712. count_nonzero(X_csr, axis=axis, sample_weight=sample_weight),
  713. X_nonzero_weighted.sum(axis=axis),
  714. )
  715. with pytest.raises(TypeError):
  716. count_nonzero(X_csc)
  717. with pytest.raises(ValueError):
  718. count_nonzero(X_csr, axis=2)
  719. assert count_nonzero(X_csr, axis=0).dtype == count_nonzero(X_csr, axis=1).dtype
  720. assert (
  721. count_nonzero(X_csr, axis=0, sample_weight=sample_weight).dtype
  722. == count_nonzero(X_csr, axis=1, sample_weight=sample_weight).dtype
  723. )
  724. # Check dtypes with large sparse matrices too
  725. # XXX: test fails on 32bit (Windows/Linux)
  726. try:
  727. X_csr.indices = X_csr.indices.astype(np.int64)
  728. X_csr.indptr = X_csr.indptr.astype(np.int64)
  729. assert count_nonzero(X_csr, axis=0).dtype == count_nonzero(X_csr, axis=1).dtype
  730. assert (
  731. count_nonzero(X_csr, axis=0, sample_weight=sample_weight).dtype
  732. == count_nonzero(X_csr, axis=1, sample_weight=sample_weight).dtype
  733. )
  734. except TypeError as e:
  735. assert "according to the rule 'safe'" in e.args[0] and np.intp().nbytes < 8, e
  736. def test_csc_row_median():
  737. # Test csc_row_median actually calculates the median.
  738. # Test that it gives the same output when X is dense.
  739. rng = np.random.RandomState(0)
  740. X = rng.rand(100, 50)
  741. dense_median = np.median(X, axis=0)
  742. csc = sp.csc_matrix(X)
  743. sparse_median = csc_median_axis_0(csc)
  744. assert_array_equal(sparse_median, dense_median)
  745. # Test that it gives the same output when X is sparse
  746. X = rng.rand(51, 100)
  747. X[X < 0.7] = 0.0
  748. ind = rng.randint(0, 50, 10)
  749. X[ind] = -X[ind]
  750. csc = sp.csc_matrix(X)
  751. dense_median = np.median(X, axis=0)
  752. sparse_median = csc_median_axis_0(csc)
  753. assert_array_equal(sparse_median, dense_median)
  754. # Test for toy data.
  755. X = [[0, -2], [-1, -1], [1, 0], [2, 1]]
  756. csc = sp.csc_matrix(X)
  757. assert_array_equal(csc_median_axis_0(csc), np.array([0.5, -0.5]))
  758. X = [[0, -2], [-1, -5], [1, -3]]
  759. csc = sp.csc_matrix(X)
  760. assert_array_equal(csc_median_axis_0(csc), np.array([0.0, -3]))
  761. # Test that it raises an Error for non-csc matrices.
  762. with pytest.raises(TypeError):
  763. csc_median_axis_0(sp.csr_matrix(X))
  764. def test_inplace_normalize():
  765. ones = np.ones((10, 1))
  766. rs = RandomState(10)
  767. for inplace_csr_row_normalize in (
  768. inplace_csr_row_normalize_l1,
  769. inplace_csr_row_normalize_l2,
  770. ):
  771. for dtype in (np.float64, np.float32):
  772. X = rs.randn(10, 5).astype(dtype)
  773. X_csr = sp.csr_matrix(X)
  774. for index_dtype in [np.int32, np.int64]:
  775. # csr_matrix will use int32 indices by default,
  776. # up-casting those to int64 when necessary
  777. if index_dtype is np.int64:
  778. X_csr.indptr = X_csr.indptr.astype(index_dtype)
  779. X_csr.indices = X_csr.indices.astype(index_dtype)
  780. assert X_csr.indices.dtype == index_dtype
  781. assert X_csr.indptr.dtype == index_dtype
  782. inplace_csr_row_normalize(X_csr)
  783. assert X_csr.dtype == dtype
  784. if inplace_csr_row_normalize is inplace_csr_row_normalize_l2:
  785. X_csr.data **= 2
  786. assert_array_almost_equal(np.abs(X_csr).sum(axis=1), ones)
  787. @pytest.mark.parametrize("dtype", [np.float32, np.float64])
  788. def test_csr_row_norms(dtype):
  789. # checks that csr_row_norms returns the same output as
  790. # scipy.sparse.linalg.norm, and that the dype is the same as X.dtype.
  791. X = sp.random(100, 10, format="csr", dtype=dtype, random_state=42)
  792. scipy_norms = sp.linalg.norm(X, axis=1) ** 2
  793. norms = csr_row_norms(X)
  794. assert norms.dtype == dtype
  795. rtol = 1e-6 if dtype == np.float32 else 1e-7
  796. assert_allclose(norms, scipy_norms, rtol=rtol)