| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192 |
- import numpy as np
- import pytest
- import scipy.sparse as sp
- from numpy.testing import assert_array_almost_equal
- from scipy.special import comb
- from sklearn.utils._random import _our_rand_r_py
- from sklearn.utils.random import _random_choice_csc, sample_without_replacement
- ###############################################################################
- # test custom sampling without replacement algorithm
- ###############################################################################
def test_invalid_sample_without_replacement_algorithm():
    """Check that an unrecognised third argument is rejected.

    ``sample_without_replacement(5, 4, "unknown")`` must raise ``ValueError``.
    NOTE(review): the third positional argument is presumably the sampling
    method name (the test name suggests so) -- confirm against the signature.
    """
    bogus_choice = "unknown"
    with pytest.raises(ValueError):
        sample_without_replacement(5, 4, bogus_choice)
def test_sample_without_replacement_algorithms():
    """Run the shared sampler checks once per ``method`` value.

    For each of "auto", "tracking_selection", "reservoir_sampling" and "pool"
    a thin wrapper around ``sample_without_replacement`` with that ``method``
    fixed is handed to the three ``check_*`` helpers, in the same order for
    every method.
    """

    def make_sampler(method_name):
        # The factory argument pins the method name for this wrapper, so the
        # wrapper never depends on the later value of a loop variable.
        def sampler(n_population, n_samples, random_state=None):
            return sample_without_replacement(
                n_population,
                n_samples,
                method=method_name,
                random_state=random_state,
            )

        return sampler

    for method_name in ("auto", "tracking_selection", "reservoir_sampling", "pool"):
        sampler = make_sampler(method_name)
        check_edge_case_of_sample_int(sampler)
        check_sample_int(sampler)
        check_sample_int_distribution(sampler)
def check_edge_case_of_sample_int(sample_without_replacement):
    """Check how a sampler handles boundary and invalid requests.

    Parameters
    ----------
    sample_without_replacement : callable
        Called as ``sample_without_replacement(n_population, n_samples)``.
        Its result must expose a ``shape`` attribute.

    Raises
    ------
    AssertionError
        If a valid request does not return a result of shape
        ``(n_samples,)``. Invalid requests are expected to raise
        ``ValueError``, which is enforced through ``pytest.raises``.
    """
    # n_population < n_samples
    for n_population, n_samples in ((0, 1), (1, 2)):
        with pytest.raises(ValueError):
            sample_without_replacement(n_population, n_samples)

    # n_population == n_samples, then n_population >= n_samples: the result
    # always has exactly n_samples entries.
    for n_population, n_samples in ((0, 0), (1, 1), (5, 0), (5, 1)):
        drawn = sample_without_replacement(n_population, n_samples)
        assert drawn.shape == (n_samples,)

    # n_population < 0 or n_samples < 0
    for n_population, n_samples in ((-1, 5), (5, -1)):
        with pytest.raises(ValueError):
            sample_without_replacement(n_population, n_samples)
def check_sample_int(sample_without_replacement):
    """Check that a sampler returns valid samples for every sample size.

    With ``n_population = 100``, every ``n_samples`` in ``0..n_population``
    (inclusive) is requested, and the result must contain exactly
    ``n_samples`` distinct values, all inside ``[0, n_population)``.

    Parameters
    ----------
    sample_without_replacement : callable
        Called as ``sample_without_replacement(n_population, n_samples)``.

    Raises
    ------
    AssertionError
        If a sample has the wrong length, contains duplicates, or contains a
        value outside ``[0, n_population)``.
    """
    # This test is heavily inspired from test_random.py of python-core.
    n_population = 100

    for n_samples in range(n_population + 1):
        sample = sample_without_replacement(n_population, n_samples)
        assert len(sample) == n_samples

        distinct = np.unique(sample)
        assert np.size(distinct) == n_samples
        # Check both bounds: an upper bound alone would accept a sampler that
        # returns negative values.
        assert np.all(distinct >= 0)
        assert np.all(distinct < n_population)

    # test edge case n_population == n_samples == 0
    assert np.size(sample_without_replacement(0, 0)) == 0
def check_sample_int_distribution(sample_without_replacement):
    """Check that a sampler can produce every possible subset.

    With a population of 10, and for every ``n_samples`` in ``0..9``, the
    sampler is called repeatedly (at most 10000 times per size) until the
    number of distinct subsets seen equals ``comb(10, n_samples)``.

    Parameters
    ----------
    sample_without_replacement : callable
        Called as ``sample_without_replacement(n_population, n_samples)``.
        The result must be an iterable of hashable items.

    Raises
    ------
    AssertionError
        If some subset size has not produced all of its combinations once the
        trial budget is used up.
    """
    # This test is heavily inspired from test_random.py of python-core.
    population_size = 10
    # A large trial budget prevents false negatives without slowing the
    # normal case, since sampling stops as soon as every subset has been seen.
    max_trials = 10000

    for n_samples in range(population_size):
        # Subsets (combinations) are counted rather than permutations so the
        # check also works for samplers that do not shuffle their output.
        n_expected = comb(population_size, n_samples, exact=True)

        seen = set()
        trials_left = max_trials
        while trials_left > 0 and len(seen) != n_expected:
            seen.add(
                frozenset(sample_without_replacement(population_size, n_samples))
            )
            trials_left -= 1

        if len(seen) != n_expected:
            raise AssertionError(
                "number of combinations != number of expected (%s != %s)"
                % (len(seen), n_expected)
            )
def test_random_choice_csc(n_samples=10000, random_state=24):
    """Compare empirical class frequencies of ``_random_choice_csc`` output.

    Four scenarios are drawn: explicit probabilities, implicit probabilities
    (none passed), probabilities of exactly 0.0 / 1.0, and single-class
    targets. Each result must be a sparse matrix whose per-column label
    frequencies match the expected probabilities to one decimal place.

    Parameters
    ----------
    n_samples : int, default=10000
        Number of samples requested from ``_random_choice_csc``; also the
        divisor that turns label counts into frequencies.
    random_state : int, default=24
        Forwarded unchanged to ``_random_choice_csc``.
    """

    def assert_column_frequencies(sampled, expected, pad_to_expected=False):
        # Verify sparsity, then compare each column's label frequencies with
        # the matching entry of `expected`.
        assert sp.issparse(sampled)
        for col, expected_p in enumerate(expected):
            labels = sampled.getcol(col).toarray().ravel()
            if pad_to_expected:
                # Labels that are never drawn still need a zero-count slot.
                counts = np.bincount(labels, minlength=len(expected_p))
            else:
                counts = np.bincount(labels)
            assert_array_almost_equal(expected_p, counts / n_samples, decimal=1)

    # Explicit class probabilities
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilities = [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])]
    drawn = _random_choice_csc(n_samples, classes, class_probabilities, random_state)
    assert_column_frequencies(drawn, class_probabilities)

    # Implicit class probabilities
    classes = [[0, 1], [1, 2]]  # test for array-like support
    class_probabilities = [np.array([0.5, 0.5]), np.array([0, 1 / 2, 1 / 2])]
    drawn = _random_choice_csc(
        n_samples=n_samples, classes=classes, random_state=random_state
    )
    assert_column_frequencies(drawn, class_probabilities)

    # Edge case probabilities 1.0 and 0.0
    classes = [np.array([0, 1]), np.array([0, 1, 2])]
    class_probabilities = [np.array([0.0, 1.0]), np.array([0.0, 1.0, 0.0])]
    drawn = _random_choice_csc(n_samples, classes, class_probabilities, random_state)
    assert_column_frequencies(drawn, class_probabilities, pad_to_expected=True)

    # One class target data
    classes = [[1], [0]]  # test for array-like support
    class_probabilities = [np.array([0.0, 1.0]), np.array([1.0])]
    drawn = _random_choice_csc(
        n_samples=n_samples, classes=classes, random_state=random_state
    )
    assert_column_frequencies(drawn, class_probabilities)
def test_random_choice_csc_errors():
    """Check that ``_random_choice_csc`` raises ``ValueError`` on bad input.

    Each case is a ``(classes, class_probabilities)`` pair; every pair is
    passed as ``_random_choice_csc(4, classes, class_probabilities, 1)`` and
    must raise ``ValueError``.
    """
    invalid_inputs = (
        # the length of an array in classes and class_probabilities is
        # mismatched
        (
            [np.array([0, 1]), np.array([0, 1, 2, 3])],
            [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])],
        ),
        # the class dtype is not supported (strings)
        (
            [np.array(["a", "1"]), np.array(["z", "1", "2"])],
            [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])],
        ),
        # the class dtype is not supported (floats)
        (
            [np.array([4.2, 0.1]), np.array([0.1, 0.2, 9.4])],
            [np.array([0.5, 0.5]), np.array([0.6, 0.1, 0.3])],
        ),
        # Given probabilities don't sum to 1
        (
            [np.array([0, 1]), np.array([0, 1, 2])],
            [np.array([0.5, 0.6]), np.array([0.6, 0.1, 0.3])],
        ),
    )

    for classes, class_probabilities in invalid_inputs:
        with pytest.raises(ValueError):
            _random_choice_csc(4, classes, class_probabilities, 1)
def test_our_rand_r():
    """Pin the output of ``_our_rand_r_py`` for two fixed inputs."""
    known_outputs = (
        (1273642419, 131541053),
        (0, 270369),
    )
    for argument, expected in known_outputs:
        assert expected == _our_rand_r_py(argument)
|