# Authors: Lars Buitinck
#          Dan Blanchard <dblanchard@ets.org>
# License: BSD 3 clause

from array import array
from collections.abc import Iterable, Mapping
from numbers import Number
from operator import itemgetter

import numpy as np
import scipy.sparse as sp

from ..base import BaseEstimator, TransformerMixin, _fit_context
from ..utils import check_array
from ..utils.validation import check_is_fitted


class DictVectorizer(TransformerMixin, BaseEstimator):
    """Transforms lists of feature-value mappings to vectors.

    This transformer turns lists of mappings (dict-like objects) of feature
    names to feature values into Numpy arrays or scipy.sparse matrices for use
    with scikit-learn estimators.

    When feature values are strings, this transformer will do a binary one-hot
    (aka one-of-K) coding: one boolean-valued feature is constructed for each
    of the possible string values that the feature can take on. For instance,
    a feature "f" that can take on the values "ham" and "spam" will become two
    features in the output, one signifying "f=ham", the other "f=spam".

    If a feature value is a sequence or set of strings, this transformer
    will iterate over the values and will count the occurrences of each string
    value.

    However, note that this transformer will only do a binary one-hot encoding
    when feature values are of type string. If categorical features are
    represented as numeric values such as int or iterables of strings, the
    DictVectorizer can be followed by
    :class:`~sklearn.preprocessing.OneHotEncoder` to complete
    binary one-hot encoding.

    Features that do not occur in a sample (mapping) will have a zero value
    in the resulting array/matrix.

    For an efficiency comparison of the different feature extractors, see
    :ref:`sphx_glr_auto_examples_text_plot_hashing_vs_dict_vectorizer.py`.

    Read more in the :ref:`User Guide <dict_feature_extraction>`.

    Parameters
    ----------
    dtype : dtype, default=np.float64
        The type of feature values. Passed to Numpy array/scipy.sparse matrix
        constructors as the dtype argument.

    separator : str, default="="
        Separator string used when constructing new features for one-hot
        coding.

    sparse : bool, default=True
        Whether transform should produce scipy.sparse matrices.

    sort : bool, default=True
        Whether ``feature_names_`` and ``vocabulary_`` should be
        sorted when fitting.

    Attributes
    ----------
    vocabulary_ : dict
        A dictionary mapping feature names to feature indices.

    feature_names_ : list
        A list of length n_features containing the feature names (e.g., "f=ham"
        and "f=spam").

    See Also
    --------
    FeatureHasher : Performs vectorization using only a hash function.
    sklearn.preprocessing.OrdinalEncoder : Handles nominal/categorical
        features encoded as columns of arbitrary data types.

    Examples
    --------
    >>> from sklearn.feature_extraction import DictVectorizer
    >>> v = DictVectorizer(sparse=False)
    >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
    >>> X = v.fit_transform(D)
    >>> X
    array([[2., 0., 1.],
           [0., 1., 3.]])
    >>> v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0},
    ...                            {'baz': 1.0, 'foo': 3.0}]
    True
    >>> v.transform({'foo': 4, 'unseen_feature': 3})
    array([[0., 0., 4.]])
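
    String feature values are one-hot coded, as described above; ``v2`` here
    is just an illustrative name:

    >>> v2 = DictVectorizer(sparse=False)
    >>> v2.fit_transform([{'f': 'ham'}, {'f': 'spam'}])
    array([[1., 0.],
           [0., 1.]])
    >>> v2.get_feature_names_out()
    array(['f=ham', 'f=spam'], dtype=object)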
- """
- _parameter_constraints: dict = {
- "dtype": "no_validation", # validation delegated to numpy,
- "separator": [str],
- "sparse": ["boolean"],
- "sort": ["boolean"],
- }
- def __init__(self, *, dtype=np.float64, separator="=", sparse=True, sort=True):
- self.dtype = dtype
- self.separator = separator
- self.sparse = sparse
- self.sort = sort
- def _add_iterable_element(
- self,
- f,
- v,
- feature_names,
- vocab,
- *,
- fitting=True,
- transforming=False,
- indices=None,
- values=None,
- ):
- """Add feature names for iterable of strings"""
- for vv in v:
- if isinstance(vv, str):
- feature_name = "%s%s%s" % (f, self.separator, vv)
- vv = 1
- else:
- raise TypeError(
- f"Unsupported type {type(vv)} in iterable "
- "value. Only iterables of string are "
- "supported."
- )
- if fitting and feature_name not in vocab:
- vocab[feature_name] = len(feature_names)
- feature_names.append(feature_name)
- if transforming and feature_name in vocab:
- indices.append(vocab[feature_name])
- values.append(self.dtype(vv))
- @_fit_context(prefer_skip_nested_validation=True)
- def fit(self, X, y=None):
- """Learn a list of feature name -> indices mappings.
- Parameters
- ----------
- X : Mapping or iterable over Mappings
- Dict(s) or Mapping(s) from feature names (arbitrary Python
- objects) to feature values (strings or convertible to dtype).
- .. versionchanged:: 0.24
- Accepts multiple string values for one categorical feature.
- y : (ignored)
- Ignored parameter.
- Returns
- -------
- self : object
- DictVectorizer class instance.
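
        Examples
        --------
        A minimal sketch of the learned mapping (``v`` and ``D`` are
        illustrative names):

        >>> from sklearn.feature_extraction import DictVectorizer
        >>> v = DictVectorizer(sparse=False)
        >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
        >>> _ = v.fit(D)
        >>> v.feature_names_
        ['bar', 'baz', 'foo']
        >>> v.vocabulary_
        {'bar': 0, 'baz': 1, 'foo': 2}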
- """
- feature_names = []
- vocab = {}
- for x in X:
- for f, v in x.items():
- if isinstance(v, str):
- feature_name = "%s%s%s" % (f, self.separator, v)
- elif isinstance(v, Number) or (v is None):
- feature_name = f
- elif isinstance(v, Mapping):
- raise TypeError(
- f"Unsupported value type {type(v)} "
- f"for {f}: {v}.\n"
- "Mapping objects are not supported."
- )
- elif isinstance(v, Iterable):
- feature_name = None
- self._add_iterable_element(f, v, feature_names, vocab)
- if feature_name is not None:
- if feature_name not in vocab:
- vocab[feature_name] = len(feature_names)
- feature_names.append(feature_name)
- if self.sort:
- feature_names.sort()
- vocab = {f: i for i, f in enumerate(feature_names)}
- self.feature_names_ = feature_names
- self.vocabulary_ = vocab
- return self
- def _transform(self, X, fitting):
- # Sanity check: Python's array has no way of explicitly requesting the
- # signed 32-bit integers that scipy.sparse needs, so we use the next
- # best thing: typecode "i" (int). However, if that gives larger or
- # smaller integers than 32-bit ones, np.frombuffer screws up.
- assert array("i").itemsize == 4, (
- "sizeof(int) != 4 on your platform; please report this at"
- " https://github.com/scikit-learn/scikit-learn/issues and"
- " include the output from platform.platform() in your bug report"
- )
- dtype = self.dtype
- if fitting:
- feature_names = []
- vocab = {}
- else:
- feature_names = self.feature_names_
- vocab = self.vocabulary_
- transforming = True
- # Process everything as sparse regardless of setting
- X = [X] if isinstance(X, Mapping) else X
- indices = array("i")
- indptr = [0]
- # XXX we could change values to an array.array as well, but it
- # would require (heuristic) conversion of dtype to typecode...
- values = []
- # collect all the possible feature names and build sparse matrix at
- # same time
- for x in X:
- for f, v in x.items():
- if isinstance(v, str):
- feature_name = "%s%s%s" % (f, self.separator, v)
- v = 1
- elif isinstance(v, Number) or (v is None):
- feature_name = f
- elif not isinstance(v, Mapping) and isinstance(v, Iterable):
- feature_name = None
- self._add_iterable_element(
- f,
- v,
- feature_names,
- vocab,
- fitting=fitting,
- transforming=transforming,
- indices=indices,
- values=values,
- )
- else:
- raise TypeError(
- f"Unsupported value Type {type(v)} "
- f"for {f}: {v}.\n"
- f"{type(v)} objects are not supported."
- )
- if feature_name is not None:
- if fitting and feature_name not in vocab:
- vocab[feature_name] = len(feature_names)
- feature_names.append(feature_name)
- if feature_name in vocab:
- indices.append(vocab[feature_name])
- values.append(self.dtype(v))
- indptr.append(len(indices))
- if len(indptr) == 1:
- raise ValueError("Sample sequence X is empty.")
- indices = np.frombuffer(indices, dtype=np.intc)
- shape = (len(indptr) - 1, len(vocab))
- result_matrix = sp.csr_matrix(
- (values, indices, indptr), shape=shape, dtype=dtype
- )
- # Sort everything if asked
- if fitting and self.sort:
- feature_names.sort()
- map_index = np.empty(len(feature_names), dtype=np.int32)
- for new_val, f in enumerate(feature_names):
- map_index[new_val] = vocab[f]
- vocab[f] = new_val
- result_matrix = result_matrix[:, map_index]
- if self.sparse:
- result_matrix.sort_indices()
- else:
- result_matrix = result_matrix.toarray()
- if fitting:
- self.feature_names_ = feature_names
- self.vocabulary_ = vocab
- return result_matrix
- @_fit_context(prefer_skip_nested_validation=True)
- def fit_transform(self, X, y=None):
- """Learn a list of feature name -> indices mappings and transform X.
- Like fit(X) followed by transform(X), but does not require
- materializing X in memory.
- Parameters
- ----------
- X : Mapping or iterable over Mappings
- Dict(s) or Mapping(s) from feature names (arbitrary Python
- objects) to feature values (strings or convertible to dtype).
- .. versionchanged:: 0.24
- Accepts multiple string values for one categorical feature.
- y : (ignored)
- Ignored parameter.
- Returns
- -------
- Xa : {array, sparse matrix}
- Feature vectors; always 2-d.
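
        Examples
        --------
        A minimal sketch, mirroring the class-level example above:

        >>> from sklearn.feature_extraction import DictVectorizer
        >>> v = DictVectorizer(sparse=False)
        >>> v.fit_transform([{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}])
        array([[2., 0., 1.],
               [0., 1., 3.]])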
- """
- return self._transform(X, fitting=True)
- def inverse_transform(self, X, dict_type=dict):
- """Transform array or sparse matrix X back to feature mappings.
- X must have been produced by this DictVectorizer's transform or
- fit_transform method; it may only have passed through transformers
- that preserve the number of features and their order.
- In the case of one-hot/one-of-K coding, the constructed feature
- names and values are returned rather than the original ones.
- Parameters
- ----------
- X : {array-like, sparse matrix} of shape (n_samples, n_features)
- Sample matrix.
- dict_type : type, default=dict
- Constructor for feature mappings. Must conform to the
- collections.Mapping API.
- Returns
- -------
- D : list of dict_type objects of shape (n_samples,)
- Feature mappings for the samples in X.
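
        Examples
        --------
        A round-trip sketch mirroring the class-level example; comparing with
        ``==`` sidesteps dtype-dependent scalar reprs:

        >>> from sklearn.feature_extraction import DictVectorizer
        >>> v = DictVectorizer(sparse=False)
        >>> X = v.fit_transform([{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}])
        >>> v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0},
        ...                            {'baz': 1.0, 'foo': 3.0}]
        True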
- """
- # COO matrix is not subscriptable
- X = check_array(X, accept_sparse=["csr", "csc"])
- n_samples = X.shape[0]
- names = self.feature_names_
- dicts = [dict_type() for _ in range(n_samples)]
- if sp.issparse(X):
- for i, j in zip(*X.nonzero()):
- dicts[i][names[j]] = X[i, j]
- else:
- for i, d in enumerate(dicts):
- for j, v in enumerate(X[i, :]):
- if v != 0:
- d[names[j]] = X[i, j]
- return dicts
- def transform(self, X):
- """Transform feature->value dicts to array or sparse matrix.
- Named features not encountered during fit or fit_transform will be
- silently ignored.
- Parameters
- ----------
- X : Mapping or iterable over Mappings of shape (n_samples,)
- Dict(s) or Mapping(s) from feature names (arbitrary Python
- objects) to feature values (strings or convertible to dtype).
- Returns
- -------
- Xa : {array, sparse matrix}
- Feature vectors; always 2-d.
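
        Examples
        --------
        A minimal sketch; ``unseen_feature`` was not seen during fit and is
        silently ignored:

        >>> from sklearn.feature_extraction import DictVectorizer
        >>> v = DictVectorizer(sparse=False)
        >>> _ = v.fit([{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}])
        >>> v.transform({'foo': 4, 'unseen_feature': 3})
        array([[0., 0., 4.]])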
- """
- return self._transform(X, fitting=False)
- def get_feature_names_out(self, input_features=None):
- """Get output feature names for transformation.
- Parameters
- ----------
- input_features : array-like of str or None, default=None
- Not used, present here for API consistency by convention.
- Returns
- -------
- feature_names_out : ndarray of str objects
- Transformed feature names.
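
        Examples
        --------
        A minimal sketch (``v`` is an illustrative name):

        >>> from sklearn.feature_extraction import DictVectorizer
        >>> v = DictVectorizer(sparse=False)
        >>> _ = v.fit([{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}])
        >>> v.get_feature_names_out()
        array(['bar', 'baz', 'foo'], dtype=object)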
- """
- check_is_fitted(self, "feature_names_")
- if any(not isinstance(name, str) for name in self.feature_names_):
- feature_names = [str(name) for name in self.feature_names_]
- else:
- feature_names = self.feature_names_
- return np.asarray(feature_names, dtype=object)
- def restrict(self, support, indices=False):
- """Restrict the features to those in support using feature selection.
- This function modifies the estimator in-place.
- Parameters
- ----------
- support : array-like
- Boolean mask or list of indices (as returned by the get_support
- member of feature selectors).
- indices : bool, default=False
- Whether support is a list of indices.
- Returns
- -------
- self : object
- DictVectorizer class instance.
- Examples
- --------
- >>> from sklearn.feature_extraction import DictVectorizer
- >>> from sklearn.feature_selection import SelectKBest, chi2
- >>> v = DictVectorizer()
- >>> D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
- >>> X = v.fit_transform(D)
- >>> support = SelectKBest(chi2, k=2).fit(X, [0, 1])
- >>> v.get_feature_names_out()
- array(['bar', 'baz', 'foo'], ...)
- >>> v.restrict(support.get_support())
- DictVectorizer()
- >>> v.get_feature_names_out()
- array(['bar', 'foo'], ...)
- """
- if not indices:
- support = np.where(support)[0]
- names = self.feature_names_
- new_vocab = {}
- for i in support:
- new_vocab[names[i]] = len(new_vocab)
- self.vocabulary_ = new_vocab
- self.feature_names_ = [
- f for f, i in sorted(new_vocab.items(), key=itemgetter(1))
- ]
- return self
- def _more_tags(self):
- return {"X_types": ["dict"]}