| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899 |
- # Author: Hamzeh Alsalhi <ha258@cornell.edu>
- #
- # License: BSD 3 clause
- import array
- import numpy as np
- import scipy.sparse as sp
- from . import check_random_state
- from ._random import sample_without_replacement
- __all__ = ["sample_without_replacement"]
- def _random_choice_csc(n_samples, classes, class_probability=None, random_state=None):
- """Generate a sparse random matrix given column class distributions
- Parameters
- ----------
- n_samples : int,
- Number of samples to draw in each column.
- classes : list of size n_outputs of arrays of size (n_classes,)
- List of classes for each column.
- class_probability : list of size n_outputs of arrays of \
- shape (n_classes,), default=None
- Class distribution of each column. If None, uniform distribution is
- assumed.
- random_state : int, RandomState instance or None, default=None
- Controls the randomness of the sampled classes.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- random_matrix : sparse csc matrix of size (n_samples, n_outputs)
- """
- data = array.array("i")
- indices = array.array("i")
- indptr = array.array("i", [0])
- for j in range(len(classes)):
- classes[j] = np.asarray(classes[j])
- if classes[j].dtype.kind != "i":
- raise ValueError("class dtype %s is not supported" % classes[j].dtype)
- classes[j] = classes[j].astype(np.int64, copy=False)
- # use uniform distribution if no class_probability is given
- if class_probability is None:
- class_prob_j = np.empty(shape=classes[j].shape[0])
- class_prob_j.fill(1 / classes[j].shape[0])
- else:
- class_prob_j = np.asarray(class_probability[j])
- if not np.isclose(np.sum(class_prob_j), 1.0):
- raise ValueError(
- "Probability array at index {0} does not sum to one".format(j)
- )
- if class_prob_j.shape[0] != classes[j].shape[0]:
- raise ValueError(
- "classes[{0}] (length {1}) and "
- "class_probability[{0}] (length {2}) have "
- "different length.".format(
- j, classes[j].shape[0], class_prob_j.shape[0]
- )
- )
- # If 0 is not present in the classes insert it with a probability 0.0
- if 0 not in classes[j]:
- classes[j] = np.insert(classes[j], 0, 0)
- class_prob_j = np.insert(class_prob_j, 0, 0.0)
- # If there are nonzero classes choose randomly using class_probability
- rng = check_random_state(random_state)
- if classes[j].shape[0] > 1:
- index_class_0 = np.flatnonzero(classes[j] == 0).item()
- p_nonzero = 1 - class_prob_j[index_class_0]
- nnz = int(n_samples * p_nonzero)
- ind_sample = sample_without_replacement(
- n_population=n_samples, n_samples=nnz, random_state=random_state
- )
- indices.extend(ind_sample)
- # Normalize probabilities for the nonzero elements
- classes_j_nonzero = classes[j] != 0
- class_probability_nz = class_prob_j[classes_j_nonzero]
- class_probability_nz_norm = class_probability_nz / np.sum(
- class_probability_nz
- )
- classes_ind = np.searchsorted(
- class_probability_nz_norm.cumsum(), rng.uniform(size=nnz)
- )
- data.extend(classes[j][classes_j_nonzero][classes_ind])
- indptr.append(len(indices))
- return sp.csc_matrix((data, indices, indptr), (n_samples, len(classes)), dtype=int)
|