random.py 3.6 KB

# Author: Hamzeh Alsalhi <ha258@cornell.edu>
#
# License: BSD 3 clause
  4. import array
  5. import numpy as np
  6. import scipy.sparse as sp
  7. from . import check_random_state
  8. from ._random import sample_without_replacement
  9. __all__ = ["sample_without_replacement"]
  10. def _random_choice_csc(n_samples, classes, class_probability=None, random_state=None):
  11. """Generate a sparse random matrix given column class distributions
  12. Parameters
  13. ----------
  14. n_samples : int,
  15. Number of samples to draw in each column.
  16. classes : list of size n_outputs of arrays of size (n_classes,)
  17. List of classes for each column.
  18. class_probability : list of size n_outputs of arrays of \
  19. shape (n_classes,), default=None
  20. Class distribution of each column. If None, uniform distribution is
  21. assumed.
  22. random_state : int, RandomState instance or None, default=None
  23. Controls the randomness of the sampled classes.
  24. See :term:`Glossary <random_state>`.
  25. Returns
  26. -------
  27. random_matrix : sparse csc matrix of size (n_samples, n_outputs)
  28. """
  29. data = array.array("i")
  30. indices = array.array("i")
  31. indptr = array.array("i", [0])
  32. for j in range(len(classes)):
  33. classes[j] = np.asarray(classes[j])
  34. if classes[j].dtype.kind != "i":
  35. raise ValueError("class dtype %s is not supported" % classes[j].dtype)
  36. classes[j] = classes[j].astype(np.int64, copy=False)
  37. # use uniform distribution if no class_probability is given
  38. if class_probability is None:
  39. class_prob_j = np.empty(shape=classes[j].shape[0])
  40. class_prob_j.fill(1 / classes[j].shape[0])
  41. else:
  42. class_prob_j = np.asarray(class_probability[j])
  43. if not np.isclose(np.sum(class_prob_j), 1.0):
  44. raise ValueError(
  45. "Probability array at index {0} does not sum to one".format(j)
  46. )
  47. if class_prob_j.shape[0] != classes[j].shape[0]:
  48. raise ValueError(
  49. "classes[{0}] (length {1}) and "
  50. "class_probability[{0}] (length {2}) have "
  51. "different length.".format(
  52. j, classes[j].shape[0], class_prob_j.shape[0]
  53. )
  54. )
  55. # If 0 is not present in the classes insert it with a probability 0.0
  56. if 0 not in classes[j]:
  57. classes[j] = np.insert(classes[j], 0, 0)
  58. class_prob_j = np.insert(class_prob_j, 0, 0.0)
  59. # If there are nonzero classes choose randomly using class_probability
  60. rng = check_random_state(random_state)
  61. if classes[j].shape[0] > 1:
  62. index_class_0 = np.flatnonzero(classes[j] == 0).item()
  63. p_nonzero = 1 - class_prob_j[index_class_0]
  64. nnz = int(n_samples * p_nonzero)
  65. ind_sample = sample_without_replacement(
  66. n_population=n_samples, n_samples=nnz, random_state=random_state
  67. )
  68. indices.extend(ind_sample)
  69. # Normalize probabilities for the nonzero elements
  70. classes_j_nonzero = classes[j] != 0
  71. class_probability_nz = class_prob_j[classes_j_nonzero]
  72. class_probability_nz_norm = class_probability_nz / np.sum(
  73. class_probability_nz
  74. )
  75. classes_ind = np.searchsorted(
  76. class_probability_nz_norm.cumsum(), rng.uniform(size=nnz)
  77. )
  78. data.extend(classes[j][classes_j_nonzero][classes_ind])
  79. indptr.append(len(indices))
  80. return sp.csc_matrix((data, indices, indptr), (n_samples, len(classes)), dtype=int)