import re

import numpy as np
import pytest
from scipy.sparse import csr_matrix

from sklearn.datasets import make_classification
from sklearn.kernel_approximation import (
    AdditiveChi2Sampler,
    Nystroem,
    PolynomialCountSketch,
    RBFSampler,
    SkewedChi2Sampler,
)
from sklearn.metrics.pairwise import (
    chi2_kernel,
    kernel_metrics,
    polynomial_kernel,
    rbf_kernel,
)
from sklearn.utils._testing import (
    assert_allclose,
    assert_array_almost_equal,
    assert_array_equal,
)

# generate data: rows are normalized to sum to one, so they resemble
# histograms (the chi-squared kernels assume nonnegative input)
rng = np.random.RandomState(0)
X = rng.random_sample(size=(300, 50))
Y = rng.random_sample(size=(300, 50))
X /= X.sum(axis=1)[:, np.newaxis]
Y /= Y.sum(axis=1)[:, np.newaxis]
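
# The approximation tests below all follow the same pattern: compute the
# exact Gram matrix with a pairwise kernel, map X and Y through the
# randomized feature map, and compare the inner products of the mapped
# features with the exact kernel values. A minimal sketch of that pattern
# (the gamma value and the 0.1 tolerance are illustrative choices, not
# guarantees):
#
#   exact = rbf_kernel(X, Y, gamma=1.0)
#   sampler = RBFSampler(gamma=1.0, n_components=1000, random_state=0)
#   approx = sampler.fit_transform(X) @ sampler.transform(Y).T
#   assert np.abs(exact - approx).max() < 0.1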


@pytest.mark.parametrize("gamma", [0.1, 1, 2.5])
@pytest.mark.parametrize("degree, n_components", [(1, 500), (2, 500), (3, 5000)])
@pytest.mark.parametrize("coef0", [0, 2.5])
def test_polynomial_count_sketch(gamma, degree, coef0, n_components):
    # test that PolynomialCountSketch approximates polynomial
    # kernel on random data

    # compute exact kernel
    kernel = polynomial_kernel(X, Y, gamma=gamma, degree=degree, coef0=coef0)

    # approximate kernel mapping
    ps_transform = PolynomialCountSketch(
        n_components=n_components,
        gamma=gamma,
        coef0=coef0,
        degree=degree,
        random_state=42,
    )
    X_trans = ps_transform.fit_transform(X)
    Y_trans = ps_transform.transform(Y)
    kernel_approx = np.dot(X_trans, Y_trans.T)

    error = kernel - kernel_approx
    assert np.abs(np.mean(error)) <= 0.05  # close to unbiased
    np.abs(error, out=error)
    assert np.max(error) <= 0.1  # nothing too far off
    assert np.mean(error) <= 0.05  # mean is fairly close
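
# PolynomialCountSketch implements TensorSketch, which for the polynomial
# kernel k(x, y) = (gamma * <x, y> + coef0) ** degree yields an unbiased
# estimate of the kernel value; the assertions above check the empirical bias
# and the worst-case deviation. The approximation error shrinks roughly as
# 1 / sqrt(n_components), which is why the degree-3 case is given far more
# components than the degree-1 and degree-2 cases.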


@pytest.mark.parametrize("gamma", [0.1, 1.0])
@pytest.mark.parametrize("degree", [1, 2, 3])
@pytest.mark.parametrize("coef0", [0, 2.5])
def test_polynomial_count_sketch_dense_sparse(gamma, degree, coef0):
    """Check that PolynomialCountSketch results are the same for dense and sparse
    input.
    """
    ps_dense = PolynomialCountSketch(
        n_components=500, gamma=gamma, degree=degree, coef0=coef0, random_state=42
    )
    Xt_dense = ps_dense.fit_transform(X)
    Yt_dense = ps_dense.transform(Y)

    ps_sparse = PolynomialCountSketch(
        n_components=500, gamma=gamma, degree=degree, coef0=coef0, random_state=42
    )
    Xt_sparse = ps_sparse.fit_transform(csr_matrix(X))
    Yt_sparse = ps_sparse.transform(csr_matrix(Y))

    assert_allclose(Xt_dense, Xt_sparse)
    assert_allclose(Yt_dense, Yt_sparse)


def _linear_kernel(X, Y):
    return np.dot(X, Y.T)
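
# AdditiveChi2Sampler approximates the additive chi-squared kernel
# k(x, y) = sum_i 2 * x_i * y_i / (x_i + y_i) with a deterministic sampled
# feature map, following Vedaldi & Zisserman, "Efficient Additive Kernels via
# Explicit Feature Maps". The test below recomputes the exact kernel from
# that formula by broadcasting over all sample pairs.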


def test_additive_chi2_sampler():
    # test that AdditiveChi2Sampler approximates kernel on random data

    # compute exact kernel
    # abbreviations for easier formula
    X_ = X[:, np.newaxis, :]
    Y_ = Y[np.newaxis, :, :]

    large_kernel = 2 * X_ * Y_ / (X_ + Y_)

    # reduce to n_samples_x x n_samples_y by summing over features
    kernel = large_kernel.sum(axis=2)

    # approximate kernel mapping
    transform = AdditiveChi2Sampler(sample_steps=3)
    X_trans = transform.fit_transform(X)
    Y_trans = transform.transform(Y)
    kernel_approx = np.dot(X_trans, Y_trans.T)

    assert_array_almost_equal(kernel, kernel_approx, 1)

    X_sp_trans = transform.fit_transform(csr_matrix(X))
    Y_sp_trans = transform.transform(csr_matrix(Y))

    assert_array_equal(X_trans, X_sp_trans.toarray())
    assert_array_equal(Y_trans, Y_sp_trans.toarray())

    # test error is raised on negative input
    Y_neg = Y.copy()
    Y_neg[0, 0] = -1
    msg = "Negative values in data passed to"
    with pytest.raises(ValueError, match=msg):
        transform.fit(Y_neg)


@pytest.mark.parametrize("method", ["fit", "fit_transform", "transform"])
@pytest.mark.parametrize("sample_steps", range(1, 4))
def test_additive_chi2_sampler_sample_steps(method, sample_steps):
    """Check that the input sample_steps doesn't raise an error and that
    sample_interval doesn't change after fit.
    """
    transformer = AdditiveChi2Sampler(sample_steps=sample_steps)
    getattr(transformer, method)(X)

    sample_interval = 0.5
    transformer = AdditiveChi2Sampler(
        sample_steps=sample_steps,
        sample_interval=sample_interval,
    )
    getattr(transformer, method)(X)
    assert transformer.sample_interval == sample_interval


# TODO(1.5): remove
def test_additive_chi2_sampler_future_warnings():
    """Check that we raise a FutureWarning when accessing `sample_interval_`."""
    transformer = AdditiveChi2Sampler()
    transformer.fit(X)
    msg = re.escape(
        "The ``sample_interval_`` attribute was deprecated in version 1.3 and "
        "will be removed 1.5."
    )
    with pytest.warns(FutureWarning, match=msg):
        assert transformer.sample_interval_ is not None


@pytest.mark.parametrize("method", ["fit", "fit_transform", "transform"])
def test_additive_chi2_sampler_wrong_sample_steps(method):
    """Check that we raise a ValueError on invalid sample_steps."""
    transformer = AdditiveChi2Sampler(sample_steps=4)
    msg = re.escape(
        "If sample_steps is not in [1, 2, 3], you need to provide sample_interval"
    )
    with pytest.raises(ValueError, match=msg):
        getattr(transformer, method)(X)
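
# SkewedChi2Sampler approximates the skewed chi-squared kernel
# k(x, y) = prod_i 2 * sqrt(x_i + c) * sqrt(y_i + c) / (x_i + y_i + 2 * c),
# where c is the skewedness parameter; the kernel is defined for entries in
# the open interval (-c, +inf). The test below evaluates that product in
# log-space for numerical stability before comparing it with the randomized
# feature map.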


def test_skewed_chi2_sampler():
    # test that SkewedChi2Sampler approximates the skewed chi-squared
    # kernel on random data

    # compute exact kernel
    c = 0.03

    # set one entry to a negative value that is still greater than -c, to
    # check that the kernel approximation remains valid on the group
    # (-c; +\infty) endowed with the skewed multiplication.
    Y[0, 0] = -c / 2.0

    # abbreviations for easier formula
    X_c = (X + c)[:, np.newaxis, :]
    Y_c = (Y + c)[np.newaxis, :, :]

    # we do it in log-space in the hope that it's more stable
    # this array is n_samples_x x n_samples_y x n_features
    log_kernel = (
        (np.log(X_c) / 2.0) + (np.log(Y_c) / 2.0) + np.log(2.0) - np.log(X_c + Y_c)
    )
    # reduce to n_samples_x x n_samples_y by summing over features in log-space
    kernel = np.exp(log_kernel.sum(axis=2))

    # approximate kernel mapping
    transform = SkewedChi2Sampler(skewedness=c, n_components=1000, random_state=42)
    X_trans = transform.fit_transform(X)
    Y_trans = transform.transform(Y)
    kernel_approx = np.dot(X_trans, Y_trans.T)

    assert_array_almost_equal(kernel, kernel_approx, 1)
    assert np.isfinite(kernel).all(), "NaNs found in the Gram matrix"
    assert np.isfinite(kernel_approx).all(), "NaNs found in the approximate Gram matrix"

    # test that an error is raised when the input contains values smaller than -c
    Y_neg = Y.copy()
    Y_neg[0, 0] = -c * 2.0
    msg = "X may not contain entries smaller than -skewedness"
    with pytest.raises(ValueError, match=msg):
        transform.transform(Y_neg)


def test_additive_chi2_sampler_exceptions():
    """Ensure that the correct error message is raised on negative input."""
    transformer = AdditiveChi2Sampler()
    X_neg = X.copy()
    X_neg[0, 0] = -1
    with pytest.raises(ValueError, match="X in AdditiveChi2Sampler.fit"):
        transformer.fit(X_neg)

    # fit on valid data first, so that only the transform call can raise
    transformer.fit(X)
    with pytest.raises(ValueError, match="X in AdditiveChi2Sampler.transform"):
        transformer.transform(X_neg)
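
# RBFSampler implements random Fourier features (Rahimi & Recht, "Random
# Features for Large-Scale Kernel Machines"): project the data onto random
# directions drawn from the Fourier transform of the RBF kernel, then apply a
# cosine. Inner products of the resulting features approximate
# rbf_kernel(X, Y, gamma) in expectation.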


def test_rbf_sampler():
    # test that RBFSampler approximates kernel on random data

    # compute exact kernel
    gamma = 10.0
    kernel = rbf_kernel(X, Y, gamma=gamma)

    # approximate kernel mapping
    rbf_transform = RBFSampler(gamma=gamma, n_components=1000, random_state=42)
    X_trans = rbf_transform.fit_transform(X)
    Y_trans = rbf_transform.transform(Y)
    kernel_approx = np.dot(X_trans, Y_trans.T)

    error = kernel - kernel_approx
    assert np.abs(np.mean(error)) <= 0.01  # close to unbiased
    np.abs(error, out=error)
    assert np.max(error) <= 0.1  # nothing too far off
    assert np.mean(error) <= 0.05  # mean is fairly close
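
# The tolerances above are loose on purpose: the Monte Carlo error of random
# Fourier features decays roughly as 1 / sqrt(n_components), so with
# n_components=1000 entrywise deviations on the order of a few hundredths are
# expected, and the fixed random_state keeps the draw deterministic across
# runs.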


def test_rbf_sampler_fitted_attributes_dtype(global_dtype):
    """Check that the fitted attributes are stored with the same dtype as X."""
    rbf = RBFSampler()

    X = np.array([[1, 2], [3, 4], [5, 6]], dtype=global_dtype)
    rbf.fit(X)

    assert rbf.random_offset_.dtype == global_dtype
    assert rbf.random_weights_.dtype == global_dtype


def test_rbf_sampler_dtype_equivalence():
    """Check the equivalence of the results with 32 and 64 bits input."""
    rbf32 = RBFSampler(random_state=42)
    X32 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32)
    rbf32.fit(X32)

    rbf64 = RBFSampler(random_state=42)
    X64 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
    rbf64.fit(X64)

    assert_allclose(rbf32.random_offset_, rbf64.random_offset_)
    assert_allclose(rbf32.random_weights_, rbf64.random_weights_)


def test_rbf_sampler_gamma_scale():
    """Check the inner value computed when `gamma='scale'`."""
    X, y = [[0.0], [1.0]], [0, 1]
    rbf = RBFSampler(gamma="scale")
    rbf.fit(X, y)
    # gamma='scale' resolves to 1 / (n_features * X.var());
    # here X.var() == 0.25 and n_features == 1, hence 1 / 0.25 == 4.
    assert rbf._gamma == pytest.approx(4)


def test_skewed_chi2_sampler_fitted_attributes_dtype(global_dtype):
    """Check that the fitted attributes are stored with the same dtype as X."""
    skewed_chi2_sampler = SkewedChi2Sampler()

    X = np.array([[1, 2], [3, 4], [5, 6]], dtype=global_dtype)
    skewed_chi2_sampler.fit(X)

    assert skewed_chi2_sampler.random_offset_.dtype == global_dtype
    assert skewed_chi2_sampler.random_weights_.dtype == global_dtype


def test_skewed_chi2_sampler_dtype_equivalence():
    """Check the equivalence of the results with 32 and 64 bits input."""
    skewed_chi2_sampler_32 = SkewedChi2Sampler(random_state=42)
    X_32 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float32)
    skewed_chi2_sampler_32.fit(X_32)

    skewed_chi2_sampler_64 = SkewedChi2Sampler(random_state=42)
    X_64 = np.array([[1, 2], [3, 4], [5, 6]], dtype=np.float64)
    skewed_chi2_sampler_64.fit(X_64)

    assert_allclose(
        skewed_chi2_sampler_32.random_offset_, skewed_chi2_sampler_64.random_offset_
    )
    assert_allclose(
        skewed_chi2_sampler_32.random_weights_, skewed_chi2_sampler_64.random_weights_
    )


def test_input_validation():
    # Regression test: kernel approx. transformers should work on lists
    # No assertions; the old versions would simply crash
    X = [[1, 2], [3, 4], [5, 6]]
    AdditiveChi2Sampler().fit(X).transform(X)
    SkewedChi2Sampler().fit(X).transform(X)
    RBFSampler().fit(X).transform(X)

    X = csr_matrix(X)
    RBFSampler().fit(X).transform(X)
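
# The Nystroem method approximates a kernel matrix K by sampling
# n_components training points, computing the kernel matrix K_mm between
# them, and mapping each sample x to k(x, basis) @ pinv(sqrt(K_mm)). With
# n_components equal to n_samples, X_transformed @ X_transformed.T recovers K
# exactly (up to numerical precision), which is what the first assertion in
# the test below relies on.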


def test_nystroem_approximation():
    # some basic tests
    rnd = np.random.RandomState(0)
    X = rnd.uniform(size=(10, 4))

    # With n_components = n_samples this is exact
    X_transformed = Nystroem(n_components=X.shape[0]).fit_transform(X)
    K = rbf_kernel(X)
    assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)

    trans = Nystroem(n_components=2, random_state=rnd)
    X_transformed = trans.fit(X).transform(X)
    assert X_transformed.shape == (X.shape[0], 2)

    # test callable kernel
    trans = Nystroem(n_components=2, kernel=_linear_kernel, random_state=rnd)
    X_transformed = trans.fit(X).transform(X)
    assert X_transformed.shape == (X.shape[0], 2)

    # test that available kernels fit and transform
    kernels_available = kernel_metrics()
    for kern in kernels_available:
        trans = Nystroem(n_components=2, kernel=kern, random_state=rnd)
        X_transformed = trans.fit(X).transform(X)
        assert X_transformed.shape == (X.shape[0], 2)


def test_nystroem_default_parameters():
    rnd = np.random.RandomState(42)
    X = rnd.uniform(size=(10, 4))

    # rbf kernel should behave as gamma=None by default
    # aka gamma = 1 / n_features
    nystroem = Nystroem(n_components=10)
    X_transformed = nystroem.fit_transform(X)
    K = rbf_kernel(X, gamma=None)
    K2 = np.dot(X_transformed, X_transformed.T)
    assert_array_almost_equal(K, K2)

    # chi2 kernel should behave as gamma=1 by default
    nystroem = Nystroem(kernel="chi2", n_components=10)
    X_transformed = nystroem.fit_transform(X)
    K = chi2_kernel(X, gamma=1)
    K2 = np.dot(X_transformed, X_transformed.T)
    assert_array_almost_equal(K, K2)


def test_nystroem_singular_kernel():
    # test that Nystroem works with a singular kernel matrix
    rng = np.random.RandomState(0)
    X = rng.rand(10, 20)
    X = np.vstack([X] * 2)  # duplicate samples, so the kernel matrix is singular

    gamma = 100
    N = Nystroem(gamma=gamma, n_components=X.shape[0]).fit(X)
    X_transformed = N.transform(X)

    K = rbf_kernel(X, gamma=gamma)

    assert_array_almost_equal(K, np.dot(X_transformed, X_transformed.T))
    assert np.all(np.isfinite(X_transformed))


def test_nystroem_poly_kernel_params():
    # Non-regression: Nystroem should pass other parameters besides gamma.
    rnd = np.random.RandomState(37)
    X = rnd.uniform(size=(10, 4))

    K = polynomial_kernel(X, degree=3.1, coef0=0.1)
    nystroem = Nystroem(
        kernel="polynomial", n_components=X.shape[0], degree=3.1, coef0=0.1
    )
    X_transformed = nystroem.fit_transform(X)
    assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)


def test_nystroem_callable():
    # Test Nystroem on a callable.
    rnd = np.random.RandomState(42)
    n_samples = 10
    X = rnd.uniform(size=(n_samples, 4))

    def logging_histogram_kernel(x, y, log):
        """Histogram kernel that writes to a log."""
        log.append(1)
        return np.minimum(x, y).sum()

    kernel_log = []
    X = list(X)  # test input validation
    Nystroem(
        kernel=logging_histogram_kernel,
        n_components=(n_samples - 1),
        kernel_params={"log": kernel_log},
    ).fit(X)
    assert len(kernel_log) == n_samples * (n_samples - 1) / 2

    # if degree, gamma or coef0 is passed, we raise a ValueError
    msg = "Don't pass gamma, coef0 or degree to Nystroem"
    params = ({"gamma": 1}, {"coef0": 1}, {"degree": 2})
    for param in params:
        ny = Nystroem(kernel=_linear_kernel, n_components=(n_samples - 1), **param)
        with pytest.raises(ValueError, match=msg):
            ny.fit(X)
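
# Note on the expected call count above: fit evaluates the kernel only
# between the n_components == n_samples - 1 basis points, and for a callable
# kernel the symmetric Gram matrix is filled from one triangle (diagonal
# included), i.e. m * (m + 1) / 2 evaluations with m = n_samples - 1, which
# equals n_samples * (n_samples - 1) / 2.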


def test_nystroem_precomputed_kernel():
    # Non-regression test for PR #14706: Nystroem should work on a
    # precomputed kernel.
    rnd = np.random.RandomState(12)
    X = rnd.uniform(size=(10, 4))

    K = polynomial_kernel(X, degree=2, coef0=0.1)
    nystroem = Nystroem(kernel="precomputed", n_components=X.shape[0])
    X_transformed = nystroem.fit_transform(K)
    assert_array_almost_equal(np.dot(X_transformed, X_transformed.T), K)

    # if degree, gamma or coef0 is passed, we raise a ValueError
    msg = "Don't pass gamma, coef0 or degree to Nystroem"
    params = ({"gamma": 1}, {"coef0": 1}, {"degree": 2})
    for param in params:
        ny = Nystroem(kernel="precomputed", n_components=X.shape[0], **param)
        with pytest.raises(ValueError, match=msg):
            ny.fit(K)


def test_nystroem_component_indices():
    """Check that `component_indices_` corresponds to the subset of
    training points used to construct the feature map.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20474
    """
    X, _ = make_classification(n_samples=100, n_features=20)
    feature_map_nystroem = Nystroem(
        n_components=10,
        random_state=0,
    )
    feature_map_nystroem.fit(X)
    assert feature_map_nystroem.component_indices_.shape == (10,)


@pytest.mark.parametrize(
    "Estimator", [PolynomialCountSketch, RBFSampler, SkewedChi2Sampler, Nystroem]
)
def test_get_feature_names_out(Estimator):
    """Check get_feature_names_out."""
    est = Estimator().fit(X)
    X_trans = est.transform(X)

    names_out = est.get_feature_names_out()
    class_name = Estimator.__name__.lower()
    expected_names = [f"{class_name}{i}" for i in range(X_trans.shape[1])]
    assert_array_equal(names_out, expected_names)


def test_additivechi2sampler_get_feature_names_out():
    """Check get_feature_names_out for AdditiveChi2Sampler."""
    rng = np.random.RandomState(0)
    X = rng.random_sample(size=(300, 3))

    chi2_sampler = AdditiveChi2Sampler(sample_steps=3).fit(X)
    input_names = ["f0", "f1", "f2"]
    suffixes = [
        "f0_sqrt",
        "f1_sqrt",
        "f2_sqrt",
        "f0_cos1",
        "f1_cos1",
        "f2_cos1",
        "f0_sin1",
        "f1_sin1",
        "f2_sin1",
        "f0_cos2",
        "f1_cos2",
        "f2_cos2",
        "f0_sin2",
        "f1_sin2",
        "f2_sin2",
    ]

    names_out = chi2_sampler.get_feature_names_out(input_features=input_names)
    expected_names = [f"additivechi2sampler_{suffix}" for suffix in suffixes]
    assert_array_equal(names_out, expected_names)
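
# The expected names above follow from the AdditiveChi2Sampler feature map:
# each input feature yields one "sqrt" feature plus a (cos, sin) pair for
# each of the sample_steps - 1 remaining steps, i.e. 2 * sample_steps - 1
# output features per input. With sample_steps=3 and 3 input features that
# gives 3 * 5 == 15 names, matching the suffix list.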