| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490 |
- # Author: Nikolay Mayorov <n59_ru@hotmail.com>
- # License: 3-clause BSD
- from numbers import Integral
- import numpy as np
- from scipy.sparse import issparse
- from scipy.special import digamma
- from ..metrics.cluster import mutual_info_score
- from ..neighbors import KDTree, NearestNeighbors
- from ..preprocessing import scale
- from ..utils import check_random_state
- from ..utils._param_validation import Interval, StrOptions, validate_params
- from ..utils.multiclass import check_classification_targets
- from ..utils.validation import check_array, check_X_y
- def _compute_mi_cc(x, y, n_neighbors):
- """Compute mutual information between two continuous variables.
- Parameters
- ----------
- x, y : ndarray, shape (n_samples,)
- Samples of two continuous random variables, must have an identical
- shape.
- n_neighbors : int
- Number of nearest neighbors to search for each point, see [1]_.
- Returns
- -------
- mi : float
- Estimated mutual information. If it turned out to be negative it is
- replace by 0.
- Notes
- -----
- True mutual information can't be negative. If its estimate by a numerical
- method is negative, it means (providing the method is adequate) that the
- mutual information is close to 0 and replacing it by 0 is a reasonable
- strategy.
- References
- ----------
- .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
- information". Phys. Rev. E 69, 2004.
- """
- n_samples = x.size
- x = x.reshape((-1, 1))
- y = y.reshape((-1, 1))
- xy = np.hstack((x, y))
- # Here we rely on NearestNeighbors to select the fastest algorithm.
- nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors)
- nn.fit(xy)
- radius = nn.kneighbors()[0]
- radius = np.nextafter(radius[:, -1], 0)
- # KDTree is explicitly fit to allow for the querying of number of
- # neighbors within a specified radius
- kd = KDTree(x, metric="chebyshev")
- nx = kd.query_radius(x, radius, count_only=True, return_distance=False)
- nx = np.array(nx) - 1.0
- kd = KDTree(y, metric="chebyshev")
- ny = kd.query_radius(y, radius, count_only=True, return_distance=False)
- ny = np.array(ny) - 1.0
- mi = (
- digamma(n_samples)
- + digamma(n_neighbors)
- - np.mean(digamma(nx + 1))
- - np.mean(digamma(ny + 1))
- )
- return max(0, mi)
- def _compute_mi_cd(c, d, n_neighbors):
- """Compute mutual information between continuous and discrete variables.
- Parameters
- ----------
- c : ndarray, shape (n_samples,)
- Samples of a continuous random variable.
- d : ndarray, shape (n_samples,)
- Samples of a discrete random variable.
- n_neighbors : int
- Number of nearest neighbors to search for each point, see [1]_.
- Returns
- -------
- mi : float
- Estimated mutual information. If it turned out to be negative it is
- replace by 0.
- Notes
- -----
- True mutual information can't be negative. If its estimate by a numerical
- method is negative, it means (providing the method is adequate) that the
- mutual information is close to 0 and replacing it by 0 is a reasonable
- strategy.
- References
- ----------
- .. [1] B. C. Ross "Mutual Information between Discrete and Continuous
- Data Sets". PLoS ONE 9(2), 2014.
- """
- n_samples = c.shape[0]
- c = c.reshape((-1, 1))
- radius = np.empty(n_samples)
- label_counts = np.empty(n_samples)
- k_all = np.empty(n_samples)
- nn = NearestNeighbors()
- for label in np.unique(d):
- mask = d == label
- count = np.sum(mask)
- if count > 1:
- k = min(n_neighbors, count - 1)
- nn.set_params(n_neighbors=k)
- nn.fit(c[mask])
- r = nn.kneighbors()[0]
- radius[mask] = np.nextafter(r[:, -1], 0)
- k_all[mask] = k
- label_counts[mask] = count
- # Ignore points with unique labels.
- mask = label_counts > 1
- n_samples = np.sum(mask)
- label_counts = label_counts[mask]
- k_all = k_all[mask]
- c = c[mask]
- radius = radius[mask]
- kd = KDTree(c)
- m_all = kd.query_radius(c, radius, count_only=True, return_distance=False)
- m_all = np.array(m_all)
- mi = (
- digamma(n_samples)
- + np.mean(digamma(k_all))
- - np.mean(digamma(label_counts))
- - np.mean(digamma(m_all))
- )
- return max(0, mi)
- def _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3):
- """Compute mutual information between two variables.
- This is a simple wrapper which selects a proper function to call based on
- whether `x` and `y` are discrete or not.
- """
- if x_discrete and y_discrete:
- return mutual_info_score(x, y)
- elif x_discrete and not y_discrete:
- return _compute_mi_cd(y, x, n_neighbors)
- elif not x_discrete and y_discrete:
- return _compute_mi_cd(x, y, n_neighbors)
- else:
- return _compute_mi_cc(x, y, n_neighbors)
- def _iterate_columns(X, columns=None):
- """Iterate over columns of a matrix.
- Parameters
- ----------
- X : ndarray or csc_matrix, shape (n_samples, n_features)
- Matrix over which to iterate.
- columns : iterable or None, default=None
- Indices of columns to iterate over. If None, iterate over all columns.
- Yields
- ------
- x : ndarray, shape (n_samples,)
- Columns of `X` in dense format.
- """
- if columns is None:
- columns = range(X.shape[1])
- if issparse(X):
- for i in columns:
- x = np.zeros(X.shape[0])
- start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1]
- x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr]
- yield x
- else:
- for i in columns:
- yield X[:, i]
- def _estimate_mi(
- X,
- y,
- discrete_features="auto",
- discrete_target=False,
- n_neighbors=3,
- copy=True,
- random_state=None,
- ):
- """Estimate mutual information between the features and the target.
- Parameters
- ----------
- X : array-like or sparse matrix, shape (n_samples, n_features)
- Feature matrix.
- y : array-like of shape (n_samples,)
- Target vector.
- discrete_features : {'auto', bool, array-like}, default='auto'
- If bool, then determines whether to consider all features discrete
- or continuous. If array, then it should be either a boolean mask
- with shape (n_features,) or array with indices of discrete features.
- If 'auto', it is assigned to False for dense `X` and to True for
- sparse `X`.
- discrete_target : bool, default=False
- Whether to consider `y` as a discrete variable.
- n_neighbors : int, default=3
- Number of neighbors to use for MI estimation for continuous variables,
- see [1]_ and [2]_. Higher values reduce variance of the estimation, but
- could introduce a bias.
- copy : bool, default=True
- Whether to make a copy of the given data. If set to False, the initial
- data will be overwritten.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for adding small noise to
- continuous variables in order to remove repeated values.
- Pass an int for reproducible results across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- mi : ndarray, shape (n_features,)
- Estimated mutual information between each feature and the target.
- A negative value will be replaced by 0.
- References
- ----------
- .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
- information". Phys. Rev. E 69, 2004.
- .. [2] B. C. Ross "Mutual Information between Discrete and Continuous
- Data Sets". PLoS ONE 9(2), 2014.
- """
- X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target)
- n_samples, n_features = X.shape
- if isinstance(discrete_features, (str, bool)):
- if isinstance(discrete_features, str):
- if discrete_features == "auto":
- discrete_features = issparse(X)
- else:
- raise ValueError("Invalid string value for discrete_features.")
- discrete_mask = np.empty(n_features, dtype=bool)
- discrete_mask.fill(discrete_features)
- else:
- discrete_features = check_array(discrete_features, ensure_2d=False)
- if discrete_features.dtype != "bool":
- discrete_mask = np.zeros(n_features, dtype=bool)
- discrete_mask[discrete_features] = True
- else:
- discrete_mask = discrete_features
- continuous_mask = ~discrete_mask
- if np.any(continuous_mask) and issparse(X):
- raise ValueError("Sparse matrix `X` can't have continuous features.")
- rng = check_random_state(random_state)
- if np.any(continuous_mask):
- X = X.astype(np.float64, copy=copy)
- X[:, continuous_mask] = scale(
- X[:, continuous_mask], with_mean=False, copy=False
- )
- # Add small noise to continuous features as advised in Kraskov et. al.
- means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0))
- X[:, continuous_mask] += (
- 1e-10
- * means
- * rng.standard_normal(size=(n_samples, np.sum(continuous_mask)))
- )
- if not discrete_target:
- y = scale(y, with_mean=False)
- y += (
- 1e-10
- * np.maximum(1, np.mean(np.abs(y)))
- * rng.standard_normal(size=n_samples)
- )
- mi = [
- _compute_mi(x, y, discrete_feature, discrete_target, n_neighbors)
- for x, discrete_feature in zip(_iterate_columns(X), discrete_mask)
- ]
- return np.array(mi)
- @validate_params(
- {
- "X": ["array-like", "sparse matrix"],
- "y": ["array-like"],
- "discrete_features": [StrOptions({"auto"}), "boolean", "array-like"],
- "n_neighbors": [Interval(Integral, 1, None, closed="left")],
- "copy": ["boolean"],
- "random_state": ["random_state"],
- },
- prefer_skip_nested_validation=True,
- )
- def mutual_info_regression(
- X, y, *, discrete_features="auto", n_neighbors=3, copy=True, random_state=None
- ):
- """Estimate mutual information for a continuous target variable.
- Mutual information (MI) [1]_ between two random variables is a non-negative
- value, which measures the dependency between the variables. It is equal
- to zero if and only if two random variables are independent, and higher
- values mean higher dependency.
- The function relies on nonparametric methods based on entropy estimation
- from k-nearest neighbors distances as described in [2]_ and [3]_. Both
- methods are based on the idea originally proposed in [4]_.
- It can be used for univariate features selection, read more in the
- :ref:`User Guide <univariate_feature_selection>`.
- Parameters
- ----------
- X : array-like or sparse matrix, shape (n_samples, n_features)
- Feature matrix.
- y : array-like of shape (n_samples,)
- Target vector.
- discrete_features : {'auto', bool, array-like}, default='auto'
- If bool, then determines whether to consider all features discrete
- or continuous. If array, then it should be either a boolean mask
- with shape (n_features,) or array with indices of discrete features.
- If 'auto', it is assigned to False for dense `X` and to True for
- sparse `X`.
- n_neighbors : int, default=3
- Number of neighbors to use for MI estimation for continuous variables,
- see [2]_ and [3]_. Higher values reduce variance of the estimation, but
- could introduce a bias.
- copy : bool, default=True
- Whether to make a copy of the given data. If set to False, the initial
- data will be overwritten.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for adding small noise to
- continuous variables in order to remove repeated values.
- Pass an int for reproducible results across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- mi : ndarray, shape (n_features,)
- Estimated mutual information between each feature and the target.
- Notes
- -----
- 1. The term "discrete features" is used instead of naming them
- "categorical", because it describes the essence more accurately.
- For example, pixel intensities of an image are discrete features
- (but hardly categorical) and you will get better results if mark them
- as such. Also note, that treating a continuous variable as discrete and
- vice versa will usually give incorrect results, so be attentive about
- that.
- 2. True mutual information can't be negative. If its estimate turns out
- to be negative, it is replaced by zero.
- References
- ----------
- .. [1] `Mutual Information
- <https://en.wikipedia.org/wiki/Mutual_information>`_
- on Wikipedia.
- .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
- information". Phys. Rev. E 69, 2004.
- .. [3] B. C. Ross "Mutual Information between Discrete and Continuous
- Data Sets". PLoS ONE 9(2), 2014.
- .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy
- of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16
- """
- return _estimate_mi(X, y, discrete_features, False, n_neighbors, copy, random_state)
- @validate_params(
- {
- "X": ["array-like", "sparse matrix"],
- "y": ["array-like"],
- "discrete_features": [StrOptions({"auto"}), "boolean", "array-like"],
- "n_neighbors": [Interval(Integral, 1, None, closed="left")],
- "copy": ["boolean"],
- "random_state": ["random_state"],
- },
- prefer_skip_nested_validation=True,
- )
- def mutual_info_classif(
- X, y, *, discrete_features="auto", n_neighbors=3, copy=True, random_state=None
- ):
- """Estimate mutual information for a discrete target variable.
- Mutual information (MI) [1]_ between two random variables is a non-negative
- value, which measures the dependency between the variables. It is equal
- to zero if and only if two random variables are independent, and higher
- values mean higher dependency.
- The function relies on nonparametric methods based on entropy estimation
- from k-nearest neighbors distances as described in [2]_ and [3]_. Both
- methods are based on the idea originally proposed in [4]_.
- It can be used for univariate features selection, read more in the
- :ref:`User Guide <univariate_feature_selection>`.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Feature matrix.
- y : array-like of shape (n_samples,)
- Target vector.
- discrete_features : 'auto', bool or array-like, default='auto'
- If bool, then determines whether to consider all features discrete
- or continuous. If array, then it should be either a boolean mask
- with shape (n_features,) or array with indices of discrete features.
- If 'auto', it is assigned to False for dense `X` and to True for
- sparse `X`.
- n_neighbors : int, default=3
- Number of neighbors to use for MI estimation for continuous variables,
- see [2]_ and [3]_. Higher values reduce variance of the estimation, but
- could introduce a bias.
- copy : bool, default=True
- Whether to make a copy of the given data. If set to False, the initial
- data will be overwritten.
- random_state : int, RandomState instance or None, default=None
- Determines random number generation for adding small noise to
- continuous variables in order to remove repeated values.
- Pass an int for reproducible results across multiple function calls.
- See :term:`Glossary <random_state>`.
- Returns
- -------
- mi : ndarray, shape (n_features,)
- Estimated mutual information between each feature and the target.
- Notes
- -----
- 1. The term "discrete features" is used instead of naming them
- "categorical", because it describes the essence more accurately.
- For example, pixel intensities of an image are discrete features
- (but hardly categorical) and you will get better results if mark them
- as such. Also note, that treating a continuous variable as discrete and
- vice versa will usually give incorrect results, so be attentive about
- that.
- 2. True mutual information can't be negative. If its estimate turns out
- to be negative, it is replaced by zero.
- References
- ----------
- .. [1] `Mutual Information
- <https://en.wikipedia.org/wiki/Mutual_information>`_
- on Wikipedia.
- .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
- information". Phys. Rev. E 69, 2004.
- .. [3] B. C. Ross "Mutual Information between Discrete and Continuous
- Data Sets". PLoS ONE 9(2), 2014.
- .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy
- of a Random Vector:, Probl. Peredachi Inf., 23:2 (1987), 9-16
- """
- check_classification_targets(y)
- return _estimate_mi(X, y, discrete_features, True, n_neighbors, copy, random_state)
|