| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247 |
- """Generic feature selection mixin"""
- # Authors: G. Varoquaux, A. Gramfort, L. Buitinck, J. Nothman
- # License: BSD 3 clause
- import warnings
- from abc import ABCMeta, abstractmethod
- from operator import attrgetter
- import numpy as np
- from scipy.sparse import csc_matrix, issparse
- from ..base import TransformerMixin
- from ..utils import (
- _safe_indexing,
- check_array,
- safe_sqr,
- )
- from ..utils._set_output import _get_output_config
- from ..utils._tags import _safe_tags
- from ..utils.validation import _check_feature_names_in, check_is_fitted
class SelectorMixin(TransformerMixin, metaclass=ABCMeta):
    """
    Transformer mixin that performs feature selection given a support mask.

    This mixin provides a feature selector implementation with `transform` and
    `inverse_transform` functionality given an implementation of
    `_get_support_mask`.
    """

    def get_support(self, indices=False):
        """
        Get a mask, or integer index, of the features selected.

        Parameters
        ----------
        indices : bool, default=False
            If True, the return value will be an array of integers, rather
            than a boolean mask.

        Returns
        -------
        support : array
            An index that selects the retained features from a feature vector.
            If `indices` is False, this is a boolean array of shape
            [# input features], in which an element is True iff its
            corresponding feature is selected for retention. If `indices` is
            True, this is an integer array of shape [# output features] whose
            values are indices into the input feature vector.
        """
        mask = self._get_support_mask()
        return mask if not indices else np.where(mask)[0]

    @abstractmethod
    def _get_support_mask(self):
        """
        Get the boolean mask indicating which features are selected.

        Subclasses must implement this method; every public method of the
        mixin derives the selection from it (through `get_support`).

        Returns
        -------
        support : boolean array of shape [# input features]
            An element is True iff its corresponding feature is selected for
            retention.
        """

    def transform(self, X):
        """Reduce X to the selected features.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The input samples.

        Returns
        -------
        X_r : array of shape [n_samples, n_selected_features]
            The input samples with only the selected features.
        """
        # Preserve X when X is a dataframe and the output is configured to
        # be pandas.
        output_config_dense = _get_output_config("transform", estimator=self)["dense"]
        preserve_X = hasattr(X, "iloc") and output_config_dense == "pandas"

        # note: we use _safe_tags instead of _get_tags because this is a
        # public Mixin.
        X = self._validate_data(
            X,
            dtype=None,
            accept_sparse="csr",
            force_all_finite=not _safe_tags(self, key="allow_nan"),
            cast_to_ndarray=not preserve_X,
            reset=False,
        )
        return self._transform(X)

    def _transform(self, X):
        """Reduce X to the selected features.

        This is the selection step of `transform`, applied to `X` as given
        (no validation is performed here). When the support mask selects no
        feature, a `UserWarning` is emitted and a result with zero columns
        is returned.
        """
        mask = self.get_support()
        if not mask.any():
            warnings.warn(
                (
                    "No features were selected: either the data is"
                    " too noisy or the selection test too strict."
                ),
                UserWarning,
            )
            if hasattr(X, "iloc"):
                # Dataframe-like input: keep the container, drop every column.
                return X.iloc[:, :0]
            return np.empty(0, dtype=X.dtype).reshape((X.shape[0], 0))
        return _safe_indexing(X, mask, axis=1)

    def inverse_transform(self, X):
        """Reverse the transformation operation.

        Parameters
        ----------
        X : array of shape [n_samples, n_selected_features]
            The input samples.

        Returns
        -------
        X_r : array of shape [n_samples, n_original_features]
            `X` with columns of zeros inserted where features would have
            been removed by :meth:`transform`.

        Raises
        ------
        ValueError
            If the number of columns of `X` differs from the number of
            selected features.
        """
        if issparse(X):
            X = X.tocsc()
            # insert additional entries in indptr:
            # e.g. if transform changed indptr from [0 2 6 7] to [0 2 3]
            # col_nonzeros here will be [2 0 1] so indptr becomes [0 2 2 3]
            # The per-column non-zero counts are expanded by the dense code
            # path below, which also performs the column-count check.
            it = self.inverse_transform(np.diff(X.indptr).reshape(1, -1))
            col_nonzeros = it.ravel()
            indptr = np.concatenate([[0], np.cumsum(col_nonzeros)])
            Xt = csc_matrix(
                (X.data, X.indices, indptr),
                shape=(X.shape[0], len(indptr) - 1),
                dtype=X.dtype,
            )
            return Xt

        support = self.get_support()
        # `check_array` enforces 2-D input by default (`ensure_2d=True`) and
        # raises on 1-D arrays, so no 1-D promotion is needed afterwards.
        X = check_array(X, dtype=None)
        if support.sum() != X.shape[1]:
            raise ValueError("X has a different shape than during fitting.")

        # Scatter the selected columns back to their original positions; the
        # removed features stay at zero.
        Xt = np.zeros((X.shape[0], support.size), dtype=X.dtype)
        Xt[:, support] = X
        return Xt

    def get_feature_names_out(self, input_features=None):
        """Mask feature names according to selected features.

        Parameters
        ----------
        input_features : array-like of str or None, default=None
            Input features.

            - If `input_features` is `None`, then `feature_names_in_` is
              used as feature names in. If `feature_names_in_` is not defined,
              then the following input feature names are generated:
              `["x0", "x1", ..., "x(n_features_in_ - 1)"]`.
            - If `input_features` is an array-like, then `input_features` must
              match `feature_names_in_` if `feature_names_in_` is defined.

        Returns
        -------
        feature_names_out : ndarray of str objects
            Transformed feature names.
        """
        check_is_fitted(self)
        input_features = _check_feature_names_in(self, input_features)
        return input_features[self.get_support()]
def _get_feature_importances(estimator, getter, transform_func=None, norm_order=1):
    """
    Retrieve and aggregate (ndim > 1) the feature importances
    from an estimator. Also optionally applies transformation.

    Parameters
    ----------
    estimator : estimator
        A scikit-learn estimator from which we want to get the feature
        importances.

    getter : "auto", str or callable
        An attribute or a callable to get the feature importance. If `"auto"`,
        `estimator` is expected to expose `coef_` or `feature_importances_`
        (`coef_` takes precedence when both are present).

    transform_func : {"norm", "square"}, default=None
        The transform to apply to the feature importances. By default (`None`)
        no transformation is applied.

    norm_order : int, default=1
        The norm order to apply when `transform_func="norm"`. Only applied
        when `importances.ndim > 1`.

    Returns
    -------
    importances : ndarray of shape (n_features,)
        The features importances, optionally transformed.

    Raises
    ------
    ValueError
        If `getter` is `"auto"` and the estimator exposes neither attribute,
        if `getter` is neither a string nor a callable, or if
        `transform_func` is not one of the supported values.
    """
    if isinstance(getter, str):
        # Resolve the attribute name first, then build a single attrgetter.
        if getter != "auto":
            attribute = getter
        elif hasattr(estimator, "coef_"):
            attribute = "coef_"
        elif hasattr(estimator, "feature_importances_"):
            attribute = "feature_importances_"
        else:
            raise ValueError(
                "when `importance_getter=='auto'`, the underlying "
                f"estimator {estimator.__class__.__name__} should have "
                "`coef_` or `feature_importances_` attribute. Either "
                "pass a fitted estimator to feature selector or call fit "
                "before calling transform."
            )
        getter = attrgetter(attribute)
    elif not callable(getter):
        raise ValueError("`importance_getter` has to be a string or `callable`")

    importances = getter(estimator)

    # No transformation requested: hand back the raw values untouched.
    if transform_func is None:
        return importances

    if transform_func == "norm":
        if importances.ndim == 1:
            return np.abs(importances)
        # Aggregate over the first axis so one value per feature remains.
        return np.linalg.norm(importances, axis=0, ord=norm_order)

    if transform_func == "square":
        one_dimensional = importances.ndim == 1
        squared = safe_sqr(importances)
        return squared if one_dimensional else squared.sum(axis=0)

    raise ValueError(
        "Valid values for `transform_func` are "
        + "None, 'norm' and 'square'. Those two "
        + "transformation are only supported now"
    )
|