| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990 |
- """
- Tests for chi2, currently the only feature selection function designed
- specifically to work with sparse matrices.
- """
- import warnings
- import numpy as np
- import pytest
- import scipy.stats
- from scipy.sparse import coo_matrix, csr_matrix
- from sklearn.feature_selection import SelectKBest, chi2
- from sklearn.feature_selection._univariate_selection import _chisquare
- from sklearn.utils._testing import assert_array_almost_equal, assert_array_equal
- # Feature 0 is highly informative for class 1;
- # feature 1 is the same everywhere;
- # feature 2 is a bit informative for class 2.
- X = [[2, 1, 2], [9, 1, 1], [6, 1, 2], [0, 1, 2]]
- y = [0, 1, 2, 2]
- def mkchi2(k):
- """Make k-best chi2 selector"""
- return SelectKBest(chi2, k=k)
- def test_chi2():
- # Test Chi2 feature extraction
- chi2 = mkchi2(k=1).fit(X, y)
- chi2 = mkchi2(k=1).fit(X, y)
- assert_array_equal(chi2.get_support(indices=True), [0])
- assert_array_equal(chi2.transform(X), np.array(X)[:, [0]])
- chi2 = mkchi2(k=2).fit(X, y)
- assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2])
- Xsp = csr_matrix(X, dtype=np.float64)
- chi2 = mkchi2(k=2).fit(Xsp, y)
- assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2])
- Xtrans = chi2.transform(Xsp)
- assert_array_equal(Xtrans.shape, [Xsp.shape[0], 2])
- # == doesn't work on scipy.sparse matrices
- Xtrans = Xtrans.toarray()
- Xtrans2 = mkchi2(k=2).fit_transform(Xsp, y).toarray()
- assert_array_almost_equal(Xtrans, Xtrans2)
- def test_chi2_coo():
- # Check that chi2 works with a COO matrix
- # (as returned by CountVectorizer, DictVectorizer)
- Xcoo = coo_matrix(X)
- mkchi2(k=2).fit_transform(Xcoo, y)
- # if we got here without an exception, we're safe
- def test_chi2_negative():
- # Check for proper error on negative numbers in the input X.
- X, y = [[0, 1], [-1e-20, 1]], [0, 1]
- for X in (X, np.array(X), csr_matrix(X)):
- with pytest.raises(ValueError):
- chi2(X, y)
- def test_chi2_unused_feature():
- # Unused feature should evaluate to NaN
- # and should issue no runtime warning
- with warnings.catch_warnings(record=True) as warned:
- warnings.simplefilter("always")
- chi, p = chi2([[1, 0], [0, 0]], [1, 0])
- for w in warned:
- if "divide by zero" in repr(w):
- raise AssertionError("Found unexpected warning %s" % w)
- assert_array_equal(chi, [1, np.nan])
- assert_array_equal(p[1], np.nan)
- def test_chisquare():
- # Test replacement for scipy.stats.chisquare against the original.
- obs = np.array([[2.0, 2.0], [1.0, 1.0]])
- exp = np.array([[1.5, 1.5], [1.5, 1.5]])
- # call SciPy first because our version overwrites obs
- chi_scp, p_scp = scipy.stats.chisquare(obs, exp)
- chi_our, p_our = _chisquare(obs, exp)
- assert_array_almost_equal(chi_scp, chi_our)
- assert_array_almost_equal(p_scp, p_our)
|