test_murmurhash.py 2.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. # Author: Olivier Grisel <olivier.grisel@ensta.org>
  2. #
  3. # License: BSD 3 clause
  4. import numpy as np
  5. from numpy.testing import assert_array_almost_equal, assert_array_equal
  6. from sklearn.utils.murmurhash import murmurhash3_32
  7. def test_mmhash3_int():
  8. assert murmurhash3_32(3) == 847579505
  9. assert murmurhash3_32(3, seed=0) == 847579505
  10. assert murmurhash3_32(3, seed=42) == -1823081949
  11. assert murmurhash3_32(3, positive=False) == 847579505
  12. assert murmurhash3_32(3, seed=0, positive=False) == 847579505
  13. assert murmurhash3_32(3, seed=42, positive=False) == -1823081949
  14. assert murmurhash3_32(3, positive=True) == 847579505
  15. assert murmurhash3_32(3, seed=0, positive=True) == 847579505
  16. assert murmurhash3_32(3, seed=42, positive=True) == 2471885347
  17. def test_mmhash3_int_array():
  18. rng = np.random.RandomState(42)
  19. keys = rng.randint(-5342534, 345345, size=3 * 2 * 1).astype(np.int32)
  20. keys = keys.reshape((3, 2, 1))
  21. for seed in [0, 42]:
  22. expected = np.array([murmurhash3_32(int(k), seed) for k in keys.flat])
  23. expected = expected.reshape(keys.shape)
  24. assert_array_equal(murmurhash3_32(keys, seed), expected)
  25. for seed in [0, 42]:
  26. expected = np.array([murmurhash3_32(k, seed, positive=True) for k in keys.flat])
  27. expected = expected.reshape(keys.shape)
  28. assert_array_equal(murmurhash3_32(keys, seed, positive=True), expected)
  29. def test_mmhash3_bytes():
  30. assert murmurhash3_32(b"foo", 0) == -156908512
  31. assert murmurhash3_32(b"foo", 42) == -1322301282
  32. assert murmurhash3_32(b"foo", 0, positive=True) == 4138058784
  33. assert murmurhash3_32(b"foo", 42, positive=True) == 2972666014
  34. def test_mmhash3_unicode():
  35. assert murmurhash3_32("foo", 0) == -156908512
  36. assert murmurhash3_32("foo", 42) == -1322301282
  37. assert murmurhash3_32("foo", 0, positive=True) == 4138058784
  38. assert murmurhash3_32("foo", 42, positive=True) == 2972666014
  39. def test_no_collision_on_byte_range():
  40. previous_hashes = set()
  41. for i in range(100):
  42. h = murmurhash3_32(" " * i, 0)
  43. assert h not in previous_hashes, "Found collision on growing empty string"
  44. def test_uniform_distribution():
  45. n_bins, n_samples = 10, 100000
  46. bins = np.zeros(n_bins, dtype=np.float64)
  47. for i in range(n_samples):
  48. bins[murmurhash3_32(i, positive=True) % n_bins] += 1
  49. means = bins / n_samples
  50. expected = np.full(n_bins, 1.0 / n_bins)
  51. assert_array_almost_equal(means / expected, np.ones(n_bins), 2)