# _mutual_info.py

# Author: Nikolay Mayorov <n59_ru@hotmail.com>
# License: 3-clause BSD

from numbers import Integral

import numpy as np
from scipy.sparse import issparse
from scipy.special import digamma

from ..metrics.cluster import mutual_info_score
from ..neighbors import KDTree, NearestNeighbors
from ..preprocessing import scale
from ..utils import check_random_state
from ..utils._param_validation import Interval, StrOptions, validate_params
from ..utils.multiclass import check_classification_targets
from ..utils.validation import check_array, check_X_y

def _compute_mi_cc(x, y, n_neighbors):
    """Compute mutual information between two continuous variables.

    Parameters
    ----------
    x, y : ndarray, shape (n_samples,)
        Samples of two continuous random variables; they must have an
        identical shape.

    n_neighbors : int
        Number of nearest neighbors to search for each point, see [1]_.

    Returns
    -------
    mi : float
        Estimated mutual information. If it turned out to be negative, it is
        replaced by 0.

    Notes
    -----
    True mutual information can't be negative. If its estimate by a numerical
    method is negative, it means (provided the method is adequate) that the
    mutual information is close to 0, and replacing it by 0 is a reasonable
    strategy.

    References
    ----------
    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.
    """
    n_samples = x.size

    x = x.reshape((-1, 1))
    y = y.reshape((-1, 1))
    xy = np.hstack((x, y))

    # Here we rely on NearestNeighbors to select the fastest algorithm.
    nn = NearestNeighbors(metric="chebyshev", n_neighbors=n_neighbors)

    nn.fit(xy)
    radius = nn.kneighbors()[0]
    # Take the largest float strictly smaller than the k-th neighbor distance,
    # so that the radius queries below count points at a strictly smaller
    # distance.
    radius = np.nextafter(radius[:, -1], 0)

    # KDTree is explicitly fit to allow for the querying of number of
    # neighbors within a specified radius.
    kd = KDTree(x, metric="chebyshev")
    nx = kd.query_radius(x, radius, count_only=True, return_distance=False)
    nx = np.array(nx) - 1.0

    kd = KDTree(y, metric="chebyshev")
    ny = kd.query_radius(y, radius, count_only=True, return_distance=False)
    ny = np.array(ny) - 1.0

    mi = (
        digamma(n_samples)
        + digamma(n_neighbors)
        - np.mean(digamma(nx + 1))
        - np.mean(digamma(ny + 1))
    )

    return max(0, mi)

def _compute_mi_cd(c, d, n_neighbors):
    """Compute mutual information between continuous and discrete variables.

    Parameters
    ----------
    c : ndarray, shape (n_samples,)
        Samples of a continuous random variable.

    d : ndarray, shape (n_samples,)
        Samples of a discrete random variable.

    n_neighbors : int
        Number of nearest neighbors to search for each point, see [1]_.

    Returns
    -------
    mi : float
        Estimated mutual information. If it turned out to be negative, it is
        replaced by 0.

    Notes
    -----
    True mutual information can't be negative. If its estimate by a numerical
    method is negative, it means (provided the method is adequate) that the
    mutual information is close to 0, and replacing it by 0 is a reasonable
    strategy.

    References
    ----------
    .. [1] B. C. Ross "Mutual Information between Discrete and Continuous
           Data Sets". PLoS ONE 9(2), 2014.
    """
    n_samples = c.shape[0]
    c = c.reshape((-1, 1))

    radius = np.empty(n_samples)
    label_counts = np.empty(n_samples)
    k_all = np.empty(n_samples)
    nn = NearestNeighbors()
    for label in np.unique(d):
        mask = d == label
        count = np.sum(mask)
        if count > 1:
            k = min(n_neighbors, count - 1)
            nn.set_params(n_neighbors=k)
            nn.fit(c[mask])
            r = nn.kneighbors()[0]
            radius[mask] = np.nextafter(r[:, -1], 0)
            k_all[mask] = k
        label_counts[mask] = count

    # Ignore points with unique labels.
    mask = label_counts > 1
    n_samples = np.sum(mask)
    label_counts = label_counts[mask]
    k_all = k_all[mask]
    c = c[mask]
    radius = radius[mask]

    kd = KDTree(c)
    m_all = kd.query_radius(c, radius, count_only=True, return_distance=False)
    m_all = np.array(m_all)

    mi = (
        digamma(n_samples)
        + np.mean(digamma(k_all))
        - np.mean(digamma(label_counts))
        - np.mean(digamma(m_all))
    )

    return max(0, mi)

def _compute_mi(x, y, x_discrete, y_discrete, n_neighbors=3):
    """Compute mutual information between two variables.

    This is a simple wrapper which selects a proper function to call based on
    whether `x` and `y` are discrete or not.
    """
    if x_discrete and y_discrete:
        return mutual_info_score(x, y)
    elif x_discrete and not y_discrete:
        return _compute_mi_cd(y, x, n_neighbors)
    elif not x_discrete and y_discrete:
        return _compute_mi_cd(x, y, n_neighbors)
    else:
        return _compute_mi_cc(x, y, n_neighbors)

def _iterate_columns(X, columns=None):
    """Iterate over columns of a matrix.

    Parameters
    ----------
    X : ndarray or csc_matrix, shape (n_samples, n_features)
        Matrix over which to iterate.

    columns : iterable or None, default=None
        Indices of columns to iterate over. If None, iterate over all columns.

    Yields
    ------
    x : ndarray, shape (n_samples,)
        Columns of `X` in dense format.
    """
    if columns is None:
        columns = range(X.shape[1])

    if issparse(X):
        for i in columns:
            x = np.zeros(X.shape[0])
            # For a CSC matrix, the nonzero values of column i are stored in
            # data[indptr[i]:indptr[i + 1]], with their row positions in the
            # matching slice of indices.
            start_ptr, end_ptr = X.indptr[i], X.indptr[i + 1]
            x[X.indices[start_ptr:end_ptr]] = X.data[start_ptr:end_ptr]
            yield x
    else:
        for i in columns:
            yield X[:, i]

def _estimate_mi(
    X,
    y,
    discrete_features="auto",
    discrete_target=False,
    n_neighbors=3,
    copy=True,
    random_state=None,
):
    """Estimate mutual information between the features and the target.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Feature matrix.

    y : array-like of shape (n_samples,)
        Target vector.

    discrete_features : {'auto', bool, array-like}, default='auto'
        If bool, then determines whether to consider all features discrete
        or continuous. If array, then it should be either a boolean mask
        with shape (n_features,) or an array with indices of discrete
        features. If 'auto', it is assigned to False for dense `X` and to
        True for sparse `X`.

    discrete_target : bool, default=False
        Whether to consider `y` as a discrete variable.

    n_neighbors : int, default=3
        Number of neighbors to use for MI estimation for continuous variables,
        see [1]_ and [2]_. Higher values reduce variance of the estimation, but
        could introduce a bias.

    copy : bool, default=True
        Whether to make a copy of the given data. If set to False, the initial
        data will be overwritten.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for adding small noise to
        continuous variables in order to remove repeated values.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    mi : ndarray, shape (n_features,)
        Estimated mutual information between each feature and the target.
        A negative value will be replaced by 0.

    References
    ----------
    .. [1] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.
    .. [2] B. C. Ross "Mutual Information between Discrete and Continuous
           Data Sets". PLoS ONE 9(2), 2014.
    """
    X, y = check_X_y(X, y, accept_sparse="csc", y_numeric=not discrete_target)
    n_samples, n_features = X.shape

    if isinstance(discrete_features, (str, bool)):
        if isinstance(discrete_features, str):
            if discrete_features == "auto":
                discrete_features = issparse(X)
            else:
                raise ValueError("Invalid string value for discrete_features.")
        discrete_mask = np.empty(n_features, dtype=bool)
        discrete_mask.fill(discrete_features)
    else:
        discrete_features = check_array(discrete_features, ensure_2d=False)
        if discrete_features.dtype != "bool":
            discrete_mask = np.zeros(n_features, dtype=bool)
            discrete_mask[discrete_features] = True
        else:
            discrete_mask = discrete_features

    continuous_mask = ~discrete_mask
    if np.any(continuous_mask) and issparse(X):
        raise ValueError("Sparse matrix `X` can't have continuous features.")

    rng = check_random_state(random_state)
    if np.any(continuous_mask):
        X = X.astype(np.float64, copy=copy)
        X[:, continuous_mask] = scale(
            X[:, continuous_mask], with_mean=False, copy=False
        )

        # Add small noise to continuous features as advised in Kraskov et al.
        means = np.maximum(1, np.mean(np.abs(X[:, continuous_mask]), axis=0))
        X[:, continuous_mask] += (
            1e-10
            * means
            * rng.standard_normal(size=(n_samples, np.sum(continuous_mask)))
        )

    if not discrete_target:
        y = scale(y, with_mean=False)
        y += (
            1e-10
            * np.maximum(1, np.mean(np.abs(y)))
            * rng.standard_normal(size=n_samples)
        )

    mi = [
        _compute_mi(x, y, discrete_feature, discrete_target, n_neighbors)
        for x, discrete_feature in zip(_iterate_columns(X), discrete_mask)
    ]

    return np.array(mi)

@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "y": ["array-like"],
        "discrete_features": [StrOptions({"auto"}), "boolean", "array-like"],
        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
        "copy": ["boolean"],
        "random_state": ["random_state"],
    },
    prefer_skip_nested_validation=True,
)
def mutual_info_regression(
    X, y, *, discrete_features="auto", n_neighbors=3, copy=True, random_state=None
):
    """Estimate mutual information for a continuous target variable.

    Mutual information (MI) [1]_ between two random variables is a non-negative
    value, which measures the dependency between the variables. It is equal
    to zero if and only if two random variables are independent, and higher
    values mean higher dependency.

    The function relies on nonparametric methods based on entropy estimation
    from k-nearest neighbors distances as described in [2]_ and [3]_. Both
    methods are based on the idea originally proposed in [4]_.

    It can be used for univariate feature selection, read more in the
    :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : array-like or sparse matrix, shape (n_samples, n_features)
        Feature matrix.

    y : array-like of shape (n_samples,)
        Target vector.

    discrete_features : {'auto', bool, array-like}, default='auto'
        If bool, then determines whether to consider all features discrete
        or continuous. If array, then it should be either a boolean mask
        with shape (n_features,) or an array with indices of discrete
        features. If 'auto', it is assigned to False for dense `X` and to
        True for sparse `X`.

    n_neighbors : int, default=3
        Number of neighbors to use for MI estimation for continuous variables,
        see [2]_ and [3]_. Higher values reduce variance of the estimation, but
        could introduce a bias.

    copy : bool, default=True
        Whether to make a copy of the given data. If set to False, the initial
        data will be overwritten.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for adding small noise to
        continuous variables in order to remove repeated values.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    mi : ndarray, shape (n_features,)
        Estimated mutual information between each feature and the target.

    Notes
    -----
    1. The term "discrete features" is used instead of naming them
       "categorical", because it describes the essence more accurately.
       For example, pixel intensities of an image are discrete features
       (but hardly categorical) and you will get better results if you mark
       them as such. Also note that treating a continuous variable as
       discrete and vice versa will usually give incorrect results, so be
       attentive about that.
    2. True mutual information can't be negative. If its estimate turns out
       to be negative, it is replaced by zero.

    References
    ----------
    .. [1] `Mutual Information
           <https://en.wikipedia.org/wiki/Mutual_information>`_
           on Wikipedia.
    .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.
    .. [3] B. C. Ross "Mutual Information between Discrete and Continuous
           Data Sets". PLoS ONE 9(2), 2014.
    .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy
           of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16
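
    Examples
    --------
    A minimal sketch on synthetic data (illustrative only, assuming the usual
    public import from ``sklearn.feature_selection``): the target is a noisy
    function of the first feature, so that feature should receive the largest
    score while the independent features score near zero.

    >>> import numpy as np
    >>> from sklearn.feature_selection import mutual_info_regression
    >>> rng = np.random.RandomState(42)
    >>> X = rng.normal(size=(1000, 3))
    >>> y = X[:, 0] + 0.1 * rng.normal(size=1000)
    >>> mi = mutual_info_regression(X, y, random_state=42)
    >>> mi.shape
    (3,)
    >>> int(np.argmax(mi))
    0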
  327. """
  328. return _estimate_mi(X, y, discrete_features, False, n_neighbors, copy, random_state)

@validate_params(
    {
        "X": ["array-like", "sparse matrix"],
        "y": ["array-like"],
        "discrete_features": [StrOptions({"auto"}), "boolean", "array-like"],
        "n_neighbors": [Interval(Integral, 1, None, closed="left")],
        "copy": ["boolean"],
        "random_state": ["random_state"],
    },
    prefer_skip_nested_validation=True,
)
def mutual_info_classif(
    X, y, *, discrete_features="auto", n_neighbors=3, copy=True, random_state=None
):
    """Estimate mutual information for a discrete target variable.

    Mutual information (MI) [1]_ between two random variables is a non-negative
    value, which measures the dependency between the variables. It is equal
    to zero if and only if two random variables are independent, and higher
    values mean higher dependency.

    The function relies on nonparametric methods based on entropy estimation
    from k-nearest neighbors distances as described in [2]_ and [3]_. Both
    methods are based on the idea originally proposed in [4]_.

    It can be used for univariate feature selection, read more in the
    :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        Feature matrix.

    y : array-like of shape (n_samples,)
        Target vector.

    discrete_features : 'auto', bool or array-like, default='auto'
        If bool, then determines whether to consider all features discrete
        or continuous. If array, then it should be either a boolean mask
        with shape (n_features,) or an array with indices of discrete
        features. If 'auto', it is assigned to False for dense `X` and to
        True for sparse `X`.

    n_neighbors : int, default=3
        Number of neighbors to use for MI estimation for continuous variables,
        see [2]_ and [3]_. Higher values reduce variance of the estimation, but
        could introduce a bias.

    copy : bool, default=True
        Whether to make a copy of the given data. If set to False, the initial
        data will be overwritten.

    random_state : int, RandomState instance or None, default=None
        Determines random number generation for adding small noise to
        continuous variables in order to remove repeated values.
        Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    mi : ndarray, shape (n_features,)
        Estimated mutual information between each feature and the target.

    Notes
    -----
    1. The term "discrete features" is used instead of naming them
       "categorical", because it describes the essence more accurately.
       For example, pixel intensities of an image are discrete features
       (but hardly categorical) and you will get better results if you mark
       them as such. Also note that treating a continuous variable as
       discrete and vice versa will usually give incorrect results, so be
       attentive about that.
    2. True mutual information can't be negative. If its estimate turns out
       to be negative, it is replaced by zero.

    References
    ----------
    .. [1] `Mutual Information
           <https://en.wikipedia.org/wiki/Mutual_information>`_
           on Wikipedia.
    .. [2] A. Kraskov, H. Stogbauer and P. Grassberger, "Estimating mutual
           information". Phys. Rev. E 69, 2004.
    .. [3] B. C. Ross "Mutual Information between Discrete and Continuous
           Data Sets". PLoS ONE 9(2), 2014.
    .. [4] L. F. Kozachenko, N. N. Leonenko, "Sample Estimate of the Entropy
           of a Random Vector", Probl. Peredachi Inf., 23:2 (1987), 9-16
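
    Examples
    --------
    A minimal sketch on synthetic data (illustrative only, assuming the usual
    public import from ``sklearn.feature_selection``): only the first feature
    is shifted by the class label, so it should receive the largest score
    while the independent features score near zero.

    >>> import numpy as np
    >>> from sklearn.feature_selection import mutual_info_classif
    >>> rng = np.random.RandomState(0)
    >>> y = rng.randint(0, 2, size=1000)
    >>> X = rng.normal(size=(1000, 3))
    >>> X[:, 0] += 2 * y  # only feature 0 depends on the class label
    >>> mi = mutual_info_classif(X, y, random_state=0)
    >>> mi.shape
    (3,)
    >>> int(np.argmax(mi))
    0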
  403. """
  404. check_classification_targets(y)
  405. return _estimate_mi(X, y, discrete_features, True, n_neighbors, copy, random_state)