sparsefuncs.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630
  1. # Authors: Manoj Kumar
  2. # Thomas Unterthiner
  3. # Giorgio Patrini
  4. #
  5. # License: BSD 3 clause
  6. import numpy as np
  7. import scipy.sparse as sp
  8. from ..utils.validation import _check_sample_weight
  9. from .sparsefuncs_fast import (
  10. csc_mean_variance_axis0 as _csc_mean_var_axis0,
  11. )
  12. from .sparsefuncs_fast import (
  13. csr_mean_variance_axis0 as _csr_mean_var_axis0,
  14. )
  15. from .sparsefuncs_fast import (
  16. incr_mean_variance_axis0 as _incr_mean_var_axis0,
  17. )
  18. def _raise_typeerror(X):
  19. """Raises a TypeError if X is not a CSR or CSC matrix"""
  20. input_type = X.format if sp.issparse(X) else type(X)
  21. err = "Expected a CSR or CSC sparse matrix, got %s." % input_type
  22. raise TypeError(err)
  23. def _raise_error_wrong_axis(axis):
  24. if axis not in (0, 1):
  25. raise ValueError(
  26. "Unknown axis value: %d. Use 0 for rows, or 1 for columns" % axis
  27. )
  28. def inplace_csr_column_scale(X, scale):
  29. """Inplace column scaling of a CSR matrix.
  30. Scale each feature of the data matrix by multiplying with specific scale
  31. provided by the caller assuming a (n_samples, n_features) shape.
  32. Parameters
  33. ----------
  34. X : sparse matrix of shape (n_samples, n_features)
  35. Matrix to normalize using the variance of the features.
  36. It should be of CSR format.
  37. scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}
  38. Array of precomputed feature-wise values to use for scaling.
  39. """
  40. assert scale.shape[0] == X.shape[1]
  41. X.data *= scale.take(X.indices, mode="clip")
  42. def inplace_csr_row_scale(X, scale):
  43. """Inplace row scaling of a CSR matrix.
  44. Scale each sample of the data matrix by multiplying with specific scale
  45. provided by the caller assuming a (n_samples, n_features) shape.
  46. Parameters
  47. ----------
  48. X : sparse matrix of shape (n_samples, n_features)
  49. Matrix to be scaled. It should be of CSR format.
  50. scale : ndarray of float of shape (n_samples,)
  51. Array of precomputed sample-wise values to use for scaling.
  52. """
  53. assert scale.shape[0] == X.shape[0]
  54. X.data *= np.repeat(scale, np.diff(X.indptr))
  55. def mean_variance_axis(X, axis, weights=None, return_sum_weights=False):
  56. """Compute mean and variance along an axis on a CSR or CSC matrix.
  57. Parameters
  58. ----------
  59. X : sparse matrix of shape (n_samples, n_features)
  60. Input data. It can be of CSR or CSC format.
  61. axis : {0, 1}
  62. Axis along which the axis should be computed.
  63. weights : ndarray of shape (n_samples,) or (n_features,), default=None
  64. If axis is set to 0 shape is (n_samples,) or
  65. if axis is set to 1 shape is (n_features,).
  66. If it is set to None, then samples are equally weighted.
  67. .. versionadded:: 0.24
  68. return_sum_weights : bool, default=False
  69. If True, returns the sum of weights seen for each feature
  70. if `axis=0` or each sample if `axis=1`.
  71. .. versionadded:: 0.24
  72. Returns
  73. -------
  74. means : ndarray of shape (n_features,), dtype=floating
  75. Feature-wise means.
  76. variances : ndarray of shape (n_features,), dtype=floating
  77. Feature-wise variances.
  78. sum_weights : ndarray of shape (n_features,), dtype=floating
  79. Returned if `return_sum_weights` is `True`.
  80. """
  81. _raise_error_wrong_axis(axis)
  82. if sp.issparse(X) and X.format == "csr":
  83. if axis == 0:
  84. return _csr_mean_var_axis0(
  85. X, weights=weights, return_sum_weights=return_sum_weights
  86. )
  87. else:
  88. return _csc_mean_var_axis0(
  89. X.T, weights=weights, return_sum_weights=return_sum_weights
  90. )
  91. elif sp.issparse(X) and X.format == "csc":
  92. if axis == 0:
  93. return _csc_mean_var_axis0(
  94. X, weights=weights, return_sum_weights=return_sum_weights
  95. )
  96. else:
  97. return _csr_mean_var_axis0(
  98. X.T, weights=weights, return_sum_weights=return_sum_weights
  99. )
  100. else:
  101. _raise_typeerror(X)
  102. def incr_mean_variance_axis(X, *, axis, last_mean, last_var, last_n, weights=None):
  103. """Compute incremental mean and variance along an axis on a CSR or CSC matrix.
  104. last_mean, last_var are the statistics computed at the last step by this
  105. function. Both must be initialized to 0-arrays of the proper size, i.e.
  106. the number of features in X. last_n is the number of samples encountered
  107. until now.
  108. Parameters
  109. ----------
  110. X : CSR or CSC sparse matrix of shape (n_samples, n_features)
  111. Input data.
  112. axis : {0, 1}
  113. Axis along which the axis should be computed.
  114. last_mean : ndarray of shape (n_features,) or (n_samples,), dtype=floating
  115. Array of means to update with the new data X.
  116. Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.
  117. last_var : ndarray of shape (n_features,) or (n_samples,), dtype=floating
  118. Array of variances to update with the new data X.
  119. Should be of shape (n_features,) if axis=0 or (n_samples,) if axis=1.
  120. last_n : float or ndarray of shape (n_features,) or (n_samples,), \
  121. dtype=floating
  122. Sum of the weights seen so far, excluding the current weights
  123. If not float, it should be of shape (n_features,) if
  124. axis=0 or (n_samples,) if axis=1. If float it corresponds to
  125. having same weights for all samples (or features).
  126. weights : ndarray of shape (n_samples,) or (n_features,), default=None
  127. If axis is set to 0 shape is (n_samples,) or
  128. if axis is set to 1 shape is (n_features,).
  129. If it is set to None, then samples are equally weighted.
  130. .. versionadded:: 0.24
  131. Returns
  132. -------
  133. means : ndarray of shape (n_features,) or (n_samples,), dtype=floating
  134. Updated feature-wise means if axis = 0 or
  135. sample-wise means if axis = 1.
  136. variances : ndarray of shape (n_features,) or (n_samples,), dtype=floating
  137. Updated feature-wise variances if axis = 0 or
  138. sample-wise variances if axis = 1.
  139. n : ndarray of shape (n_features,) or (n_samples,), dtype=integral
  140. Updated number of seen samples per feature if axis=0
  141. or number of seen features per sample if axis=1.
  142. If weights is not None, n is a sum of the weights of the seen
  143. samples or features instead of the actual number of seen
  144. samples or features.
  145. Notes
  146. -----
  147. NaNs are ignored in the algorithm.
  148. """
  149. _raise_error_wrong_axis(axis)
  150. if not (sp.issparse(X) and X.format in ("csc", "csr")):
  151. _raise_typeerror(X)
  152. if np.size(last_n) == 1:
  153. last_n = np.full(last_mean.shape, last_n, dtype=last_mean.dtype)
  154. if not (np.size(last_mean) == np.size(last_var) == np.size(last_n)):
  155. raise ValueError("last_mean, last_var, last_n do not have the same shapes.")
  156. if axis == 1:
  157. if np.size(last_mean) != X.shape[0]:
  158. raise ValueError(
  159. "If axis=1, then last_mean, last_n, last_var should be of "
  160. f"size n_samples {X.shape[0]} (Got {np.size(last_mean)})."
  161. )
  162. else: # axis == 0
  163. if np.size(last_mean) != X.shape[1]:
  164. raise ValueError(
  165. "If axis=0, then last_mean, last_n, last_var should be of "
  166. f"size n_features {X.shape[1]} (Got {np.size(last_mean)})."
  167. )
  168. X = X.T if axis == 1 else X
  169. if weights is not None:
  170. weights = _check_sample_weight(weights, X, dtype=X.dtype)
  171. return _incr_mean_var_axis0(
  172. X, last_mean=last_mean, last_var=last_var, last_n=last_n, weights=weights
  173. )
  174. def inplace_column_scale(X, scale):
  175. """Inplace column scaling of a CSC/CSR matrix.
  176. Scale each feature of the data matrix by multiplying with specific scale
  177. provided by the caller assuming a (n_samples, n_features) shape.
  178. Parameters
  179. ----------
  180. X : sparse matrix of shape (n_samples, n_features)
  181. Matrix to normalize using the variance of the features. It should be
  182. of CSC or CSR format.
  183. scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}
  184. Array of precomputed feature-wise values to use for scaling.
  185. """
  186. if sp.issparse(X) and X.format == "csc":
  187. inplace_csr_row_scale(X.T, scale)
  188. elif sp.issparse(X) and X.format == "csr":
  189. inplace_csr_column_scale(X, scale)
  190. else:
  191. _raise_typeerror(X)
  192. def inplace_row_scale(X, scale):
  193. """Inplace row scaling of a CSR or CSC matrix.
  194. Scale each row of the data matrix by multiplying with specific scale
  195. provided by the caller assuming a (n_samples, n_features) shape.
  196. Parameters
  197. ----------
  198. X : sparse matrix of shape (n_samples, n_features)
  199. Matrix to be scaled. It should be of CSR or CSC format.
  200. scale : ndarray of shape (n_features,), dtype={np.float32, np.float64}
  201. Array of precomputed sample-wise values to use for scaling.
  202. """
  203. if sp.issparse(X) and X.format == "csc":
  204. inplace_csr_column_scale(X.T, scale)
  205. elif sp.issparse(X) and X.format == "csr":
  206. inplace_csr_row_scale(X, scale)
  207. else:
  208. _raise_typeerror(X)
  209. def inplace_swap_row_csc(X, m, n):
  210. """Swap two rows of a CSC matrix in-place.
  211. Parameters
  212. ----------
  213. X : sparse matrix of shape (n_samples, n_features)
  214. Matrix whose two rows are to be swapped. It should be of
  215. CSC format.
  216. m : int
  217. Index of the row of X to be swapped.
  218. n : int
  219. Index of the row of X to be swapped.
  220. """
  221. for t in [m, n]:
  222. if isinstance(t, np.ndarray):
  223. raise TypeError("m and n should be valid integers")
  224. if m < 0:
  225. m += X.shape[0]
  226. if n < 0:
  227. n += X.shape[0]
  228. m_mask = X.indices == m
  229. X.indices[X.indices == n] = m
  230. X.indices[m_mask] = n
  231. def inplace_swap_row_csr(X, m, n):
  232. """Swap two rows of a CSR matrix in-place.
  233. Parameters
  234. ----------
  235. X : sparse matrix of shape (n_samples, n_features)
  236. Matrix whose two rows are to be swapped. It should be of
  237. CSR format.
  238. m : int
  239. Index of the row of X to be swapped.
  240. n : int
  241. Index of the row of X to be swapped.
  242. """
  243. for t in [m, n]:
  244. if isinstance(t, np.ndarray):
  245. raise TypeError("m and n should be valid integers")
  246. if m < 0:
  247. m += X.shape[0]
  248. if n < 0:
  249. n += X.shape[0]
  250. # The following swapping makes life easier since m is assumed to be the
  251. # smaller integer below.
  252. if m > n:
  253. m, n = n, m
  254. indptr = X.indptr
  255. m_start = indptr[m]
  256. m_stop = indptr[m + 1]
  257. n_start = indptr[n]
  258. n_stop = indptr[n + 1]
  259. nz_m = m_stop - m_start
  260. nz_n = n_stop - n_start
  261. if nz_m != nz_n:
  262. # Modify indptr first
  263. X.indptr[m + 2 : n] += nz_n - nz_m
  264. X.indptr[m + 1] = m_start + nz_n
  265. X.indptr[n] = n_stop - nz_m
  266. X.indices = np.concatenate(
  267. [
  268. X.indices[:m_start],
  269. X.indices[n_start:n_stop],
  270. X.indices[m_stop:n_start],
  271. X.indices[m_start:m_stop],
  272. X.indices[n_stop:],
  273. ]
  274. )
  275. X.data = np.concatenate(
  276. [
  277. X.data[:m_start],
  278. X.data[n_start:n_stop],
  279. X.data[m_stop:n_start],
  280. X.data[m_start:m_stop],
  281. X.data[n_stop:],
  282. ]
  283. )
  284. def inplace_swap_row(X, m, n):
  285. """
  286. Swap two rows of a CSC/CSR matrix in-place.
  287. Parameters
  288. ----------
  289. X : sparse matrix of shape (n_samples, n_features)
  290. Matrix whose two rows are to be swapped. It should be of CSR or
  291. CSC format.
  292. m : int
  293. Index of the row of X to be swapped.
  294. n : int
  295. Index of the row of X to be swapped.
  296. """
  297. if sp.issparse(X) and X.format == "csc":
  298. inplace_swap_row_csc(X, m, n)
  299. elif sp.issparse(X) and X.format == "csr":
  300. inplace_swap_row_csr(X, m, n)
  301. else:
  302. _raise_typeerror(X)
  303. def inplace_swap_column(X, m, n):
  304. """
  305. Swap two columns of a CSC/CSR matrix in-place.
  306. Parameters
  307. ----------
  308. X : sparse matrix of shape (n_samples, n_features)
  309. Matrix whose two columns are to be swapped. It should be of
  310. CSR or CSC format.
  311. m : int
  312. Index of the column of X to be swapped.
  313. n : int
  314. Index of the column of X to be swapped.
  315. """
  316. if m < 0:
  317. m += X.shape[1]
  318. if n < 0:
  319. n += X.shape[1]
  320. if sp.issparse(X) and X.format == "csc":
  321. inplace_swap_row_csr(X, m, n)
  322. elif sp.issparse(X) and X.format == "csr":
  323. inplace_swap_row_csc(X, m, n)
  324. else:
  325. _raise_typeerror(X)
  326. def _minor_reduce(X, ufunc):
  327. major_index = np.flatnonzero(np.diff(X.indptr))
  328. # reduceat tries casts X.indptr to intp, which errors
  329. # if it is int64 on a 32 bit system.
  330. # Reinitializing prevents this where possible, see #13737
  331. X = type(X)((X.data, X.indices, X.indptr), shape=X.shape)
  332. value = ufunc.reduceat(X.data, X.indptr[major_index])
  333. return major_index, value
  334. def _min_or_max_axis(X, axis, min_or_max):
  335. N = X.shape[axis]
  336. if N == 0:
  337. raise ValueError("zero-size array to reduction operation")
  338. M = X.shape[1 - axis]
  339. mat = X.tocsc() if axis == 0 else X.tocsr()
  340. mat.sum_duplicates()
  341. major_index, value = _minor_reduce(mat, min_or_max)
  342. not_full = np.diff(mat.indptr)[major_index] < N
  343. value[not_full] = min_or_max(value[not_full], 0)
  344. mask = value != 0
  345. major_index = np.compress(mask, major_index)
  346. value = np.compress(mask, value)
  347. if axis == 0:
  348. res = sp.coo_matrix(
  349. (value, (np.zeros(len(value)), major_index)), dtype=X.dtype, shape=(1, M)
  350. )
  351. else:
  352. res = sp.coo_matrix(
  353. (value, (major_index, np.zeros(len(value)))), dtype=X.dtype, shape=(M, 1)
  354. )
  355. return res.A.ravel()
  356. def _sparse_min_or_max(X, axis, min_or_max):
  357. if axis is None:
  358. if 0 in X.shape:
  359. raise ValueError("zero-size array to reduction operation")
  360. zero = X.dtype.type(0)
  361. if X.nnz == 0:
  362. return zero
  363. m = min_or_max.reduce(X.data.ravel())
  364. if X.nnz != np.prod(X.shape):
  365. m = min_or_max(zero, m)
  366. return m
  367. if axis < 0:
  368. axis += 2
  369. if (axis == 0) or (axis == 1):
  370. return _min_or_max_axis(X, axis, min_or_max)
  371. else:
  372. raise ValueError("invalid axis, use 0 for rows, or 1 for columns")
  373. def _sparse_min_max(X, axis):
  374. return (
  375. _sparse_min_or_max(X, axis, np.minimum),
  376. _sparse_min_or_max(X, axis, np.maximum),
  377. )
  378. def _sparse_nan_min_max(X, axis):
  379. return (_sparse_min_or_max(X, axis, np.fmin), _sparse_min_or_max(X, axis, np.fmax))
  380. def min_max_axis(X, axis, ignore_nan=False):
  381. """Compute minimum and maximum along an axis on a CSR or CSC matrix.
  382. Optionally ignore NaN values.
  383. Parameters
  384. ----------
  385. X : sparse matrix of shape (n_samples, n_features)
  386. Input data. It should be of CSR or CSC format.
  387. axis : {0, 1}
  388. Axis along which the axis should be computed.
  389. ignore_nan : bool, default=False
  390. Ignore or passing through NaN values.
  391. .. versionadded:: 0.20
  392. Returns
  393. -------
  394. mins : ndarray of shape (n_features,), dtype={np.float32, np.float64}
  395. Feature-wise minima.
  396. maxs : ndarray of shape (n_features,), dtype={np.float32, np.float64}
  397. Feature-wise maxima.
  398. """
  399. if sp.issparse(X) and X.format in ("csr", "csc"):
  400. if ignore_nan:
  401. return _sparse_nan_min_max(X, axis=axis)
  402. else:
  403. return _sparse_min_max(X, axis=axis)
  404. else:
  405. _raise_typeerror(X)
  406. def count_nonzero(X, axis=None, sample_weight=None):
  407. """A variant of X.getnnz() with extension to weighting on axis 0.
  408. Useful in efficiently calculating multilabel metrics.
  409. Parameters
  410. ----------
  411. X : sparse matrix of shape (n_samples, n_labels)
  412. Input data. It should be of CSR format.
  413. axis : {0, 1}, default=None
  414. The axis on which the data is aggregated.
  415. sample_weight : array-like of shape (n_samples,), default=None
  416. Weight for each row of X.
  417. Returns
  418. -------
  419. nnz : int, float, ndarray of shape (n_samples,) or ndarray of shape (n_features,)
  420. Number of non-zero values in the array along a given axis. Otherwise,
  421. the total number of non-zero values in the array is returned.
  422. """
  423. if axis == -1:
  424. axis = 1
  425. elif axis == -2:
  426. axis = 0
  427. elif X.format != "csr":
  428. raise TypeError("Expected CSR sparse format, got {0}".format(X.format))
  429. # We rely here on the fact that np.diff(Y.indptr) for a CSR
  430. # will return the number of nonzero entries in each row.
  431. # A bincount over Y.indices will return the number of nonzeros
  432. # in each column. See ``csr_matrix.getnnz`` in scipy >= 0.14.
  433. if axis is None:
  434. if sample_weight is None:
  435. return X.nnz
  436. else:
  437. return np.dot(np.diff(X.indptr), sample_weight)
  438. elif axis == 1:
  439. out = np.diff(X.indptr)
  440. if sample_weight is None:
  441. # astype here is for consistency with axis=0 dtype
  442. return out.astype("intp")
  443. return out * sample_weight
  444. elif axis == 0:
  445. if sample_weight is None:
  446. return np.bincount(X.indices, minlength=X.shape[1])
  447. else:
  448. weights = np.repeat(sample_weight, np.diff(X.indptr))
  449. return np.bincount(X.indices, minlength=X.shape[1], weights=weights)
  450. else:
  451. raise ValueError("Unsupported axis: {0}".format(axis))
  452. def _get_median(data, n_zeros):
  453. """Compute the median of data with n_zeros additional zeros.
  454. This function is used to support sparse matrices; it modifies data
  455. in-place.
  456. """
  457. n_elems = len(data) + n_zeros
  458. if not n_elems:
  459. return np.nan
  460. n_negative = np.count_nonzero(data < 0)
  461. middle, is_odd = divmod(n_elems, 2)
  462. data.sort()
  463. if is_odd:
  464. return _get_elem_at_rank(middle, data, n_negative, n_zeros)
  465. return (
  466. _get_elem_at_rank(middle - 1, data, n_negative, n_zeros)
  467. + _get_elem_at_rank(middle, data, n_negative, n_zeros)
  468. ) / 2.0
  469. def _get_elem_at_rank(rank, data, n_negative, n_zeros):
  470. """Find the value in data augmented with n_zeros for the given rank"""
  471. if rank < n_negative:
  472. return data[rank]
  473. if rank - n_negative < n_zeros:
  474. return 0
  475. return data[rank - n_zeros]
  476. def csc_median_axis_0(X):
  477. """Find the median across axis 0 of a CSC matrix.
  478. It is equivalent to doing np.median(X, axis=0).
  479. Parameters
  480. ----------
  481. X : sparse matrix of shape (n_samples, n_features)
  482. Input data. It should be of CSC format.
  483. Returns
  484. -------
  485. median : ndarray of shape (n_features,)
  486. Median.
  487. """
  488. if not (sp.issparse(X) and X.format == "csc"):
  489. raise TypeError("Expected matrix of CSC format, got %s" % X.format)
  490. indptr = X.indptr
  491. n_samples, n_features = X.shape
  492. median = np.zeros(n_features)
  493. for f_ind, (start, end) in enumerate(zip(indptr[:-1], indptr[1:])):
  494. # Prevent modifying X in place
  495. data = np.copy(X.data[start:end])
  496. nz = n_samples - data.size
  497. median[f_ind] = _get_median(data, nz)
  498. return median