_base.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. """Generic feature selection mixin"""
  2. # Authors: G. Varoquaux, A. Gramfort, L. Buitinck, J. Nothman
  3. # License: BSD 3 clause
  4. import warnings
  5. from abc import ABCMeta, abstractmethod
  6. from operator import attrgetter
  7. import numpy as np
  8. from scipy.sparse import csc_matrix, issparse
  9. from ..base import TransformerMixin
  10. from ..utils import (
  11. _safe_indexing,
  12. check_array,
  13. safe_sqr,
  14. )
  15. from ..utils._set_output import _get_output_config
  16. from ..utils._tags import _safe_tags
  17. from ..utils.validation import _check_feature_names_in, check_is_fitted
  18. class SelectorMixin(TransformerMixin, metaclass=ABCMeta):
  19. """
  20. Transformer mixin that performs feature selection given a support mask
  21. This mixin provides a feature selector implementation with `transform` and
  22. `inverse_transform` functionality given an implementation of
  23. `_get_support_mask`.
  24. """
  25. def get_support(self, indices=False):
  26. """
  27. Get a mask, or integer index, of the features selected.
  28. Parameters
  29. ----------
  30. indices : bool, default=False
  31. If True, the return value will be an array of integers, rather
  32. than a boolean mask.
  33. Returns
  34. -------
  35. support : array
  36. An index that selects the retained features from a feature vector.
  37. If `indices` is False, this is a boolean array of shape
  38. [# input features], in which an element is True iff its
  39. corresponding feature is selected for retention. If `indices` is
  40. True, this is an integer array of shape [# output features] whose
  41. values are indices into the input feature vector.
  42. """
  43. mask = self._get_support_mask()
  44. return mask if not indices else np.where(mask)[0]
  45. @abstractmethod
  46. def _get_support_mask(self):
  47. """
  48. Get the boolean mask indicating which features are selected
  49. Returns
  50. -------
  51. support : boolean array of shape [# input features]
  52. An element is True iff its corresponding feature is selected for
  53. retention.
  54. """
  55. def transform(self, X):
  56. """Reduce X to the selected features.
  57. Parameters
  58. ----------
  59. X : array of shape [n_samples, n_features]
  60. The input samples.
  61. Returns
  62. -------
  63. X_r : array of shape [n_samples, n_selected_features]
  64. The input samples with only the selected features.
  65. """
  66. # Preserve X when X is a dataframe and the output is configured to
  67. # be pandas.
  68. output_config_dense = _get_output_config("transform", estimator=self)["dense"]
  69. preserve_X = hasattr(X, "iloc") and output_config_dense == "pandas"
  70. # note: we use _safe_tags instead of _get_tags because this is a
  71. # public Mixin.
  72. X = self._validate_data(
  73. X,
  74. dtype=None,
  75. accept_sparse="csr",
  76. force_all_finite=not _safe_tags(self, key="allow_nan"),
  77. cast_to_ndarray=not preserve_X,
  78. reset=False,
  79. )
  80. return self._transform(X)
  81. def _transform(self, X):
  82. """Reduce X to the selected features."""
  83. mask = self.get_support()
  84. if not mask.any():
  85. warnings.warn(
  86. (
  87. "No features were selected: either the data is"
  88. " too noisy or the selection test too strict."
  89. ),
  90. UserWarning,
  91. )
  92. if hasattr(X, "iloc"):
  93. return X.iloc[:, :0]
  94. return np.empty(0, dtype=X.dtype).reshape((X.shape[0], 0))
  95. return _safe_indexing(X, mask, axis=1)
  96. def inverse_transform(self, X):
  97. """Reverse the transformation operation.
  98. Parameters
  99. ----------
  100. X : array of shape [n_samples, n_selected_features]
  101. The input samples.
  102. Returns
  103. -------
  104. X_r : array of shape [n_samples, n_original_features]
  105. `X` with columns of zeros inserted where features would have
  106. been removed by :meth:`transform`.
  107. """
  108. if issparse(X):
  109. X = X.tocsc()
  110. # insert additional entries in indptr:
  111. # e.g. if transform changed indptr from [0 2 6 7] to [0 2 3]
  112. # col_nonzeros here will be [2 0 1] so indptr becomes [0 2 2 3]
  113. it = self.inverse_transform(np.diff(X.indptr).reshape(1, -1))
  114. col_nonzeros = it.ravel()
  115. indptr = np.concatenate([[0], np.cumsum(col_nonzeros)])
  116. Xt = csc_matrix(
  117. (X.data, X.indices, indptr),
  118. shape=(X.shape[0], len(indptr) - 1),
  119. dtype=X.dtype,
  120. )
  121. return Xt
  122. support = self.get_support()
  123. X = check_array(X, dtype=None)
  124. if support.sum() != X.shape[1]:
  125. raise ValueError("X has a different shape than during fitting.")
  126. if X.ndim == 1:
  127. X = X[None, :]
  128. Xt = np.zeros((X.shape[0], support.size), dtype=X.dtype)
  129. Xt[:, support] = X
  130. return Xt
  131. def get_feature_names_out(self, input_features=None):
  132. """Mask feature names according to selected features.
  133. Parameters
  134. ----------
  135. input_features : array-like of str or None, default=None
  136. Input features.
  137. - If `input_features` is `None`, then `feature_names_in_` is
  138. used as feature names in. If `feature_names_in_` is not defined,
  139. then the following input feature names are generated:
  140. `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
  141. - If `input_features` is an array-like, then `input_features` must
  142. match `feature_names_in_` if `feature_names_in_` is defined.
  143. Returns
  144. -------
  145. feature_names_out : ndarray of str objects
  146. Transformed feature names.
  147. """
  148. check_is_fitted(self)
  149. input_features = _check_feature_names_in(self, input_features)
  150. return input_features[self.get_support()]
  151. def _get_feature_importances(estimator, getter, transform_func=None, norm_order=1):
  152. """
  153. Retrieve and aggregate (ndim > 1) the feature importances
  154. from an estimator. Also optionally applies transformation.
  155. Parameters
  156. ----------
  157. estimator : estimator
  158. A scikit-learn estimator from which we want to get the feature
  159. importances.
  160. getter : "auto", str or callable
  161. An attribute or a callable to get the feature importance. If `"auto"`,
  162. `estimator` is expected to expose `coef_` or `feature_importances`.
  163. transform_func : {"norm", "square"}, default=None
  164. The transform to apply to the feature importances. By default (`None`)
  165. no transformation is applied.
  166. norm_order : int, default=1
  167. The norm order to apply when `transform_func="norm"`. Only applied
  168. when `importances.ndim > 1`.
  169. Returns
  170. -------
  171. importances : ndarray of shape (n_features,)
  172. The features importances, optionally transformed.
  173. """
  174. if isinstance(getter, str):
  175. if getter == "auto":
  176. if hasattr(estimator, "coef_"):
  177. getter = attrgetter("coef_")
  178. elif hasattr(estimator, "feature_importances_"):
  179. getter = attrgetter("feature_importances_")
  180. else:
  181. raise ValueError(
  182. "when `importance_getter=='auto'`, the underlying "
  183. f"estimator {estimator.__class__.__name__} should have "
  184. "`coef_` or `feature_importances_` attribute. Either "
  185. "pass a fitted estimator to feature selector or call fit "
  186. "before calling transform."
  187. )
  188. else:
  189. getter = attrgetter(getter)
  190. elif not callable(getter):
  191. raise ValueError("`importance_getter` has to be a string or `callable`")
  192. importances = getter(estimator)
  193. if transform_func is None:
  194. return importances
  195. elif transform_func == "norm":
  196. if importances.ndim == 1:
  197. importances = np.abs(importances)
  198. else:
  199. importances = np.linalg.norm(importances, axis=0, ord=norm_order)
  200. elif transform_func == "square":
  201. if importances.ndim == 1:
  202. importances = safe_sqr(importances)
  203. else:
  204. importances = safe_sqr(importances).sum(axis=0)
  205. else:
  206. raise ValueError(
  207. "Valid values for `transform_func` are "
  208. + "None, 'norm' and 'square'. Those two "
  209. + "transformation are only supported now"
  210. )
  211. return importances