_function_transformer.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350
  1. import warnings
  2. import numpy as np
  3. from ..base import BaseEstimator, TransformerMixin, _fit_context
  4. from ..utils._param_validation import StrOptions
  5. from ..utils.metaestimators import available_if
  6. from ..utils.validation import (
  7. _allclose_dense_sparse,
  8. _check_feature_names_in,
  9. check_array,
  10. )
  11. def _identity(X):
  12. """The identity function."""
  13. return X
  14. class FunctionTransformer(TransformerMixin, BaseEstimator):
  15. """Constructs a transformer from an arbitrary callable.
  16. A FunctionTransformer forwards its X (and optionally y) arguments to a
  17. user-defined function or function object and returns the result of this
  18. function. This is useful for stateless transformations such as taking the
  19. log of frequencies, doing custom scaling, etc.
  20. Note: If a lambda is used as the function, then the resulting
  21. transformer will not be pickleable.
  22. .. versionadded:: 0.17
  23. Read more in the :ref:`User Guide <function_transformer>`.
  24. Parameters
  25. ----------
  26. func : callable, default=None
  27. The callable to use for the transformation. This will be passed
  28. the same arguments as transform, with args and kwargs forwarded.
  29. If func is None, then func will be the identity function.
  30. inverse_func : callable, default=None
  31. The callable to use for the inverse transformation. This will be
  32. passed the same arguments as inverse transform, with args and
  33. kwargs forwarded. If inverse_func is None, then inverse_func
  34. will be the identity function.
  35. validate : bool, default=False
  36. Indicate that the input X array should be checked before calling
  37. ``func``. The possibilities are:
  38. - If False, there is no input validation.
  39. - If True, then X will be converted to a 2-dimensional NumPy array or
  40. sparse matrix. If the conversion is not possible an exception is
  41. raised.
  42. .. versionchanged:: 0.22
  43. The default of ``validate`` changed from True to False.
  44. accept_sparse : bool, default=False
  45. Indicate that func accepts a sparse matrix as input. If validate is
  46. False, this has no effect. Otherwise, if accept_sparse is false,
  47. sparse matrix inputs will cause an exception to be raised.
  48. check_inverse : bool, default=True
  49. Whether to check that or ``func`` followed by ``inverse_func`` leads to
  50. the original inputs. It can be used for a sanity check, raising a
  51. warning when the condition is not fulfilled.
  52. .. versionadded:: 0.20
  53. feature_names_out : callable, 'one-to-one' or None, default=None
  54. Determines the list of feature names that will be returned by the
  55. `get_feature_names_out` method. If it is 'one-to-one', then the output
  56. feature names will be equal to the input feature names. If it is a
  57. callable, then it must take two positional arguments: this
  58. `FunctionTransformer` (`self`) and an array-like of input feature names
  59. (`input_features`). It must return an array-like of output feature
  60. names. The `get_feature_names_out` method is only defined if
  61. `feature_names_out` is not None.
  62. See ``get_feature_names_out`` for more details.
  63. .. versionadded:: 1.1
  64. kw_args : dict, default=None
  65. Dictionary of additional keyword arguments to pass to func.
  66. .. versionadded:: 0.18
  67. inv_kw_args : dict, default=None
  68. Dictionary of additional keyword arguments to pass to inverse_func.
  69. .. versionadded:: 0.18
  70. Attributes
  71. ----------
  72. n_features_in_ : int
  73. Number of features seen during :term:`fit`.
  74. .. versionadded:: 0.24
  75. feature_names_in_ : ndarray of shape (`n_features_in_`,)
  76. Names of features seen during :term:`fit`. Defined only when `X` has feature
  77. names that are all strings.
  78. .. versionadded:: 1.0
  79. See Also
  80. --------
  81. MaxAbsScaler : Scale each feature by its maximum absolute value.
  82. StandardScaler : Standardize features by removing the mean and
  83. scaling to unit variance.
  84. LabelBinarizer : Binarize labels in a one-vs-all fashion.
  85. MultiLabelBinarizer : Transform between iterable of iterables
  86. and a multilabel format.
  87. Examples
  88. --------
  89. >>> import numpy as np
  90. >>> from sklearn.preprocessing import FunctionTransformer
  91. >>> transformer = FunctionTransformer(np.log1p)
  92. >>> X = np.array([[0, 1], [2, 3]])
  93. >>> transformer.transform(X)
  94. array([[0. , 0.6931...],
  95. [1.0986..., 1.3862...]])
  96. """
  97. _parameter_constraints: dict = {
  98. "func": [callable, None],
  99. "inverse_func": [callable, None],
  100. "validate": ["boolean"],
  101. "accept_sparse": ["boolean"],
  102. "check_inverse": ["boolean"],
  103. "feature_names_out": [callable, StrOptions({"one-to-one"}), None],
  104. "kw_args": [dict, None],
  105. "inv_kw_args": [dict, None],
  106. }
  107. def __init__(
  108. self,
  109. func=None,
  110. inverse_func=None,
  111. *,
  112. validate=False,
  113. accept_sparse=False,
  114. check_inverse=True,
  115. feature_names_out=None,
  116. kw_args=None,
  117. inv_kw_args=None,
  118. ):
  119. self.func = func
  120. self.inverse_func = inverse_func
  121. self.validate = validate
  122. self.accept_sparse = accept_sparse
  123. self.check_inverse = check_inverse
  124. self.feature_names_out = feature_names_out
  125. self.kw_args = kw_args
  126. self.inv_kw_args = inv_kw_args
  127. def _check_input(self, X, *, reset):
  128. if self.validate:
  129. return self._validate_data(X, accept_sparse=self.accept_sparse, reset=reset)
  130. elif reset:
  131. # Set feature_names_in_ and n_features_in_ even if validate=False
  132. # We run this only when reset==True to store the attributes but not
  133. # validate them, because validate=False
  134. self._check_n_features(X, reset=reset)
  135. self._check_feature_names(X, reset=reset)
  136. return X
  137. def _check_inverse_transform(self, X):
  138. """Check that func and inverse_func are the inverse."""
  139. idx_selected = slice(None, None, max(1, X.shape[0] // 100))
  140. X_round_trip = self.inverse_transform(self.transform(X[idx_selected]))
  141. if hasattr(X, "dtype"):
  142. dtypes = [X.dtype]
  143. elif hasattr(X, "dtypes"):
  144. # Dataframes can have multiple dtypes
  145. dtypes = X.dtypes
  146. if not all(np.issubdtype(d, np.number) for d in dtypes):
  147. raise ValueError(
  148. "'check_inverse' is only supported when all the elements in `X` is"
  149. " numerical."
  150. )
  151. if not _allclose_dense_sparse(X[idx_selected], X_round_trip):
  152. warnings.warn(
  153. (
  154. "The provided functions are not strictly"
  155. " inverse of each other. If you are sure you"
  156. " want to proceed regardless, set"
  157. " 'check_inverse=False'."
  158. ),
  159. UserWarning,
  160. )
  161. @_fit_context(prefer_skip_nested_validation=True)
  162. def fit(self, X, y=None):
  163. """Fit transformer by checking X.
  164. If ``validate`` is ``True``, ``X`` will be checked.
  165. Parameters
  166. ----------
  167. X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
  168. if `validate=True` else any object that `func` can handle
  169. Input array.
  170. y : Ignored
  171. Not used, present here for API consistency by convention.
  172. Returns
  173. -------
  174. self : object
  175. FunctionTransformer class instance.
  176. """
  177. X = self._check_input(X, reset=True)
  178. if self.check_inverse and not (self.func is None or self.inverse_func is None):
  179. self._check_inverse_transform(X)
  180. return self
  181. def transform(self, X):
  182. """Transform X using the forward function.
  183. Parameters
  184. ----------
  185. X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
  186. if `validate=True` else any object that `func` can handle
  187. Input array.
  188. Returns
  189. -------
  190. X_out : array-like, shape (n_samples, n_features)
  191. Transformed input.
  192. """
  193. X = self._check_input(X, reset=False)
  194. return self._transform(X, func=self.func, kw_args=self.kw_args)
  195. def inverse_transform(self, X):
  196. """Transform X using the inverse function.
  197. Parameters
  198. ----------
  199. X : {array-like, sparse-matrix} of shape (n_samples, n_features) \
  200. if `validate=True` else any object that `inverse_func` can handle
  201. Input array.
  202. Returns
  203. -------
  204. X_out : array-like, shape (n_samples, n_features)
  205. Transformed input.
  206. """
  207. if self.validate:
  208. X = check_array(X, accept_sparse=self.accept_sparse)
  209. return self._transform(X, func=self.inverse_func, kw_args=self.inv_kw_args)
  210. @available_if(lambda self: self.feature_names_out is not None)
  211. def get_feature_names_out(self, input_features=None):
  212. """Get output feature names for transformation.
  213. This method is only defined if `feature_names_out` is not None.
  214. Parameters
  215. ----------
  216. input_features : array-like of str or None, default=None
  217. Input feature names.
  218. - If `input_features` is None, then `feature_names_in_` is
  219. used as the input feature names. If `feature_names_in_` is not
  220. defined, then names are generated:
  221. `[x0, x1, ..., x(n_features_in_ - 1)]`.
  222. - If `input_features` is array-like, then `input_features` must
  223. match `feature_names_in_` if `feature_names_in_` is defined.
  224. Returns
  225. -------
  226. feature_names_out : ndarray of str objects
  227. Transformed feature names.
  228. - If `feature_names_out` is 'one-to-one', the input feature names
  229. are returned (see `input_features` above). This requires
  230. `feature_names_in_` and/or `n_features_in_` to be defined, which
  231. is done automatically if `validate=True`. Alternatively, you can
  232. set them in `func`.
  233. - If `feature_names_out` is a callable, then it is called with two
  234. arguments, `self` and `input_features`, and its return value is
  235. returned by this method.
  236. """
  237. if hasattr(self, "n_features_in_") or input_features is not None:
  238. input_features = _check_feature_names_in(self, input_features)
  239. if self.feature_names_out == "one-to-one":
  240. names_out = input_features
  241. elif callable(self.feature_names_out):
  242. names_out = self.feature_names_out(self, input_features)
  243. else:
  244. raise ValueError(
  245. f"feature_names_out={self.feature_names_out!r} is invalid. "
  246. 'It must either be "one-to-one" or a callable with two '
  247. "arguments: the function transformer and an array-like of "
  248. "input feature names. The callable must return an array-like "
  249. "of output feature names."
  250. )
  251. return np.asarray(names_out, dtype=object)
  252. def _transform(self, X, func=None, kw_args=None):
  253. if func is None:
  254. func = _identity
  255. return func(X, **(kw_args if kw_args else {}))
  256. def __sklearn_is_fitted__(self):
  257. """Return True since FunctionTransfomer is stateless."""
  258. return True
  259. def _more_tags(self):
  260. return {"no_validation": not self.validate, "stateless": True}
  261. def set_output(self, *, transform=None):
  262. """Set output container.
  263. See :ref:`sphx_glr_auto_examples_miscellaneous_plot_set_output.py`
  264. for an example on how to use the API.
  265. Parameters
  266. ----------
  267. transform : {"default", "pandas"}, default=None
  268. Configure output of `transform` and `fit_transform`.
  269. - `"default"`: Default output format of a transformer
  270. - `"pandas"`: DataFrame output
  271. - `None`: Transform configuration is unchanged
  272. Returns
  273. -------
  274. self : estimator instance
  275. Estimator instance.
  276. """
  277. if hasattr(super(), "set_output"):
  278. return super().set_output(transform=transform)
  279. if transform == "pandas" and self.feature_names_out is None:
  280. warnings.warn(
  281. 'With transform="pandas", `func` should return a DataFrame to follow'
  282. " the set_output API."
  283. )
  284. return self