_dict_vectorizer.py
# Authors: Lars Buitinck
#          Dan Blanchard <dblanchard@ets.org>
# License: BSD 3 clause

from array import array
from collections.abc import Iterable, Mapping
from numbers import Number
from operator import itemgetter

import numpy as np
import scipy.sparse as sp

from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import check_array
from ..utils.validation import check_is_fitted


class DictVectorizer(TransformerMixin, BaseEstimator):
    """Transforms lists of feature-value mappings to vectors.

    This transformer turns lists of mappings (dict-like objects) of feature
    names to feature values into Numpy arrays or scipy.sparse matrices for use
    with scikit-learn estimators.

    When feature values are strings, this transformer will do a binary one-hot
    (aka one-of-K) coding: one boolean-valued feature is constructed for each
    of the possible string values that the feature can take on. For instance,
    a feature "f" that can take on the values "ham" and "spam" will become two
    features in the output, one signifying "f=ham", the other "f=spam".

    If a feature value is a sequence or set of strings, this transformer
    will iterate over the values and will count the occurrences of each string
    value.

    However, note that this transformer will only do a binary one-hot encoding
    when feature values are of type string. If categorical features are
    represented as numeric values such as int or iterables of strings, the
    DictVectorizer can be followed by
    :class:`~sklearn.preprocessing.OneHotEncoder` to complete
    binary one-hot encoding.

    Features that do not occur in a sample (mapping) will have a zero value
    in the resulting array/matrix.

    For an efficiency comparison of the different feature extractors, see
    :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.

    Read more in the :ref:`User Guide <dict_feature_extraction>`.

    Parameters
    ----------
    dtype : dtype, default=np.float64
        The type of feature values. Passed to Numpy array/scipy.sparse matrix
        constructors as the dtype argument.

    separator : str, default="="
        Separator string used when constructing new features for one-hot
        coding.

    sparse : bool, default=True
        Whether transform should produce scipy.sparse matrices.

    sort : bool, default=True
        Whether ``feature_names_`` and ``vocabulary_`` should be
        sorted when fitting.

    Attributes
    ----------
    vocabulary_ : dict
        A dictionary mapping feature names to feature indices.

    feature_names_ : list
        A list of length n_features containing the feature names (e.g., "f=ham"
        and "f=spam").

    See Also
    --------
    FeatureHasher : Performs vectorization using only a hash function.
    sklearn.preprocessing.OrdinalEncoder : Handles nominal/categorical
        features encoded as columns of arbitrary data types.

    Examples
    --------
    >>> from sklearn.feature_extraction import DictVectorizer
    >>> v = DictVectorizer(sparse=False)
    >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
    >>> X = v.fit_transform(D)
    >>> X
    array([[2., 0., 1.],
           [0., 1., 3.]])
    >>> v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0},
    ...                            {'baz': 1.0, 'foo': 3.0}]
    True
    >>> v.transform({'foo': 4, 'unseen_feature': 3})
    array([[0., 0., 4.]])
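    >>> # Illustrative sketch of the counting behavior described above: an
    >>> # iterable of strings yields one "name=value" feature per distinct
    >>> # string, holding its occurrence count.
    >>> v2 = DictVectorizer(sparse=False)
    >>> v2.fit_transform([{'f': ['ham', 'spam', 'ham']}])
    array([[2., 1.]])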
  76. """
  77. _parameter_constraints: dict = {
  78. "dtype": "no_validation", # validation delegated to numpy,
  79. "separator": [str],
  80. "sparse": ["boolean"],
  81. "sort": ["boolean"],
  82. }
  83. def __init__(self, *, dtype=np.float64, separator="=", sparse=True, sort=True):
  84. self.dtype = dtype
  85. self.separator = separator
  86. self.sparse = sparse
  87. self.sort = sort
  88. def _add_iterable_element(
  89. self,
  90. f,
  91. v,
  92. feature_names,
  93. vocab,
  94. *,
  95. fitting=True,
  96. transforming=False,
  97. indices=None,
  98. values=None,
  99. ):
  100. """Add feature names for iterable of strings"""
        for vv in v:
            if isinstance(vv, str):
                feature_name = "%s%s%s" % (f, self.separator, vv)
                vv = 1
            else:
                raise TypeError(
                    f"Unsupported type {type(vv)} in iterable "
                    "value. Only iterables of strings are "
                    "supported."
                )
            if fitting and feature_name not in vocab:
                vocab[feature_name] = len(feature_names)
                feature_names.append(feature_name)

            if transforming and feature_name in vocab:
                indices.append(vocab[feature_name])
                values.append(self.dtype(vv))

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        """Learn a list of feature name -> indices mappings.

        Parameters
        ----------
        X : Mapping or iterable over Mappings
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).

            .. versionchanged:: 0.24
               Accepts multiple string values for one categorical feature.

        y : (ignored)
            Ignored parameter.

        Returns
        -------
        self : object
            DictVectorizer class instance.
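
        Examples
        --------
        >>> # Illustrative sketch: after fit, ``vocabulary_`` maps each
        >>> # feature name to its column index (sorted when ``sort=True``).
        >>> from sklearn.feature_extraction import DictVectorizer
        >>> v = DictVectorizer()
        >>> v.fit([{'foo': 1, 'bar': 2}]).vocabulary_
        {'bar': 0, 'foo': 1}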
  133. """
  134. feature_names = []
  135. vocab = {}
  136. for x in X:
  137. for f, v in x.items():
  138. if isinstance(v, str):
  139. feature_name = "%s%s%s" % (f, self.separator, v)
  140. elif isinstance(v, Number) or (v is None):
  141. feature_name = f
  142. elif isinstance(v, Mapping):
  143. raise TypeError(
  144. f"Unsupported value type {type(v)} "
  145. f"for {f}: {v}.\n"
  146. "Mapping objects are not supported."
  147. )
  148. elif isinstance(v, Iterable):
  149. feature_name = None
  150. self._add_iterable_element(f, v, feature_names, vocab)
  151. if feature_name is not None:
  152. if feature_name not in vocab:
  153. vocab[feature_name] = len(feature_names)
  154. feature_names.append(feature_name)
  155. if self.sort:
  156. feature_names.sort()
  157. vocab = {f: i for i, f in enumerate(feature_names)}
  158. self.feature_names_ = feature_names
  159. self.vocabulary_ = vocab
  160. return self

    def _transform(self, X, fitting):
        # Sanity check: Python's array has no way of explicitly requesting the
        # signed 32-bit integers that scipy.sparse needs, so we use the next
        # best thing: typecode "i" (int). However, if that gives larger or
        # smaller integers than 32-bit ones, np.frombuffer screws up.
        assert array("i").itemsize == 4, (
            "sizeof(int) != 4 on your platform; please report this at"
            " https://github.com/scikit-learn/scikit-learn/issues and"
            " include the output from platform.platform() in your bug report"
        )

        dtype = self.dtype
        if fitting:
            feature_names = []
            vocab = {}
        else:
            feature_names = self.feature_names_
            vocab = self.vocabulary_

        transforming = True

        # Process everything as sparse regardless of setting
        X = [X] if isinstance(X, Mapping) else X

        indices = array("i")
        indptr = [0]
        # XXX we could change values to an array.array as well, but it
        # would require (heuristic) conversion of dtype to typecode...
        values = []

        # Collect all the possible feature names and build the sparse matrix
        # at the same time.
        for x in X:
            for f, v in x.items():
                if isinstance(v, str):
                    feature_name = "%s%s%s" % (f, self.separator, v)
                    v = 1
                elif isinstance(v, Number) or (v is None):
                    feature_name = f
                elif not isinstance(v, Mapping) and isinstance(v, Iterable):
                    feature_name = None
                    self._add_iterable_element(
                        f,
                        v,
                        feature_names,
                        vocab,
                        fitting=fitting,
                        transforming=transforming,
                        indices=indices,
                        values=values,
                    )
                else:
                    raise TypeError(
                        f"Unsupported value type {type(v)} "
                        f"for {f}: {v}.\n"
                        f"{type(v)} objects are not supported."
                    )

                if feature_name is not None:
                    if fitting and feature_name not in vocab:
                        vocab[feature_name] = len(feature_names)
                        feature_names.append(feature_name)

                    if feature_name in vocab:
                        indices.append(vocab[feature_name])
                        values.append(self.dtype(v))

            indptr.append(len(indices))

        if len(indptr) == 1:
            raise ValueError("Sample sequence X is empty.")

        indices = np.frombuffer(indices, dtype=np.intc)
        shape = (len(indptr) - 1, len(vocab))
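
        # The three buffers follow scipy's CSR layout: row i's values live in
        # values[indptr[i]:indptr[i + 1]], at the columns listed in the
        # matching slice of indices.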
        result_matrix = sp.csr_matrix(
            (values, indices, indptr), shape=shape, dtype=dtype
        )

        # Sort everything if asked
        if fitting and self.sort:
            feature_names.sort()

            map_index = np.empty(len(feature_names), dtype=np.int32)
            for new_val, f in enumerate(feature_names):
                map_index[new_val] = vocab[f]
                vocab[f] = new_val
            result_matrix = result_matrix[:, map_index]

        if self.sparse:
            result_matrix.sort_indices()
        else:
            result_matrix = result_matrix.toarray()

        if fitting:
            self.feature_names_ = feature_names
            self.vocabulary_ = vocab

        return result_matrix

    @_fit_context(prefer_skip_nested_validation=True)
    def fit_transform(self, X, y=None):
        """Learn a list of feature name -> indices mappings and transform X.

        Like fit(X) followed by transform(X), but does not require
        materializing X in memory.

        Parameters
        ----------
        X : Mapping or iterable over Mappings
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).

            .. versionchanged:: 0.24
               Accepts multiple string values for one categorical feature.

        y : (ignored)
            Ignored parameter.

        Returns
        -------
        Xa : {array, sparse matrix}
            Feature vectors; always 2-d.
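
        Examples
        --------
        >>> # Illustrative sketch: because X is consumed in a single pass,
        >>> # it may be a one-shot iterable such as a generator.
        >>> from sklearn.feature_extraction import DictVectorizer
        >>> v = DictVectorizer(sparse=False)
        >>> v.fit_transform(d for d in [{'a': 1}, {'b': 2}])
        array([[1., 0.],
               [0., 2.]])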
  262. """
  263. return self._transform(X, fitting=True)
  264. def inverse_transform(self, X, dict_type=dict):
  265. """Transform array or sparse matrix X back to feature mappings.
  266. X must have been produced by this DictVectorizer's transform or
  267. fit_transform method; it may only have passed through transformers
  268. that preserve the number of features and their order.
  269. In the case of one-hot/one-of-K coding, the constructed feature
  270. names and values are returned rather than the original ones.
  271. Parameters
  272. ----------
  273. X : {array-like, sparse matrix} of shape (n_samples, n_features)
  274. Sample matrix.
  275. dict_type : type, default=dict
  276. Constructor for feature mappings. Must conform to the
  277. collections.Mapping API.
  278. Returns
  279. -------
  280. D : list of dict_type objects of shape (n_samples,)
  281. Feature mappings for the samples in X.
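
        Examples
        --------
        >>> # Illustrative sketch: one-hot coded columns invert to the
        >>> # constructed "name=value" features, not the original strings.
        >>> from sklearn.feature_extraction import DictVectorizer
        >>> v = DictVectorizer(sparse=False)
        >>> X = v.fit_transform([{'f': 'ham'}, {'f': 'spam'}])
        >>> v.inverse_transform(X) == [{'f=ham': 1.0}, {'f=spam': 1.0}]
        True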
  282. """
  283. # COO matrix is not subscriptable
  284. X = check_array(X, accept_sparse=["csr", "csc"])
  285. n_samples = X.shape[0]
  286. names = self.feature_names_
  287. dicts = [dict_type() for _ in range(n_samples)]
  288. if sp.issparse(X):
  289. for i, j in zip(*X.nonzero()):
  290. dicts[i][names[j]] = X[i, j]
  291. else:
  292. for i, d in enumerate(dicts):
  293. for j, v in enumerate(X[i, :]):
  294. if v != 0:
  295. d[names[j]] = X[i, j]
  296. return dicts

    def transform(self, X):
        """Transform feature->value dicts to array or sparse matrix.

        Named features not encountered during fit or fit_transform will be
        silently ignored.

        Parameters
        ----------
        X : Mapping or iterable over Mappings of shape (n_samples,)
            Dict(s) or Mapping(s) from feature names (arbitrary Python
            objects) to feature values (strings or convertible to dtype).

        Returns
        -------
        Xa : {array, sparse matrix}
            Feature vectors; always 2-d.
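
        Examples
        --------
        >>> # Illustrative sketch: unseen names are dropped; features seen
        >>> # during fit but absent from a sample stay 0.
        >>> from sklearn.feature_extraction import DictVectorizer
        >>> v = DictVectorizer(sparse=False).fit([{'a': 1, 'b': 2}])
        >>> v.transform([{'a': 3, 'c': 5}])
        array([[3., 0.]])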
  310. """
  311. return self._transform(X, fitting=False)
  312. def get_feature_names_out(self, input_features=None):
  313. """Get output feature names for transformation.
  314. Parameters
  315. ----------
  316. input_features : array-like of str or None, default=None
  317. Not used, present here for API consistency by convention.
  318. Returns
  319. -------
  320. feature_names_out : ndarray of str objects
  321. Transformed feature names.
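
        Examples
        --------
        >>> # Illustrative sketch: names come back as an object ndarray,
        >>> # with non-string names converted to str.
        >>> from sklearn.feature_extraction import DictVectorizer
        >>> v = DictVectorizer(sparse=False)
        >>> _ = v.fit([{'f': 'ham'}, {'f': 'spam'}])
        >>> v.get_feature_names_out()
        array(['f=ham', 'f=spam'], dtype=object)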
  322. """
  323. check_is_fitted(self, "feature_names_")
  324. if any(not isinstance(name, str) for name in self.feature_names_):
  325. feature_names = [str(name) for name in self.feature_names_]
  326. else:
  327. feature_names = self.feature_names_
  328. return np.asarray(feature_names, dtype=object)
  329. def restrict(self, support, indices=False):
  330. """Restrict the features to those in support using feature selection.
  331. This function modifies the estimator in-place.
  332. Parameters
  333. ----------
  334. support : array-like
  335. Boolean mask or list of indices (as returned by the get_support
  336. member of feature selectors).
  337. indices : bool, default=False
  338. Whether support is a list of indices.
  339. Returns
  340. -------
  341. self : object
  342. DictVectorizer class instance.
  343. Examples
  344. --------
  345. >>> from sklearn.feature_extraction import DictVectorizer
  346. >>> from sklearn.feature_selection import SelectKBest, chi2
  347. >>> v = DictVectorizer()
  348. >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
  349. >>> X = v.fit_transform(D)
  350. >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1])
  351. >>> v.get_feature_names_out()
  352. array(['bar', 'baz', 'foo'], ...)
  353. >>> v.restrict(support.get_support())
  354. DictVectorizer()
  355. >>> v.get_feature_names_out()
  356. array(['bar', 'foo'], ...)
  357. """
  358. if not indices:
  359. support = np.where(support)[0]
  360. names = self.feature_names_
  361. new_vocab = {}
  362. for i in support:
  363. new_vocab[names[i]] = len(new_vocab)
  364. self.vocabulary_ = new_vocab
  365. self.feature_names_ = [
  366. f for f, i in sorted(new_vocab.items(), key=itemgetter(1))
  367. ]
  368. return self
  369. def _more_tags(self):
  370. return {"X_types": ["dict"]}