feature_extraction_sequence_utils.py

# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Sequence feature extraction class for common feature extractors to preprocess sequences.
"""

from typing import Dict, List, Optional, Union

import numpy as np

from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from .utils import PaddingStrategy, TensorType, is_tf_tensor, is_torch_tensor, logging, to_numpy


logger = logging.get_logger(__name__)

class SequenceFeatureExtractor(FeatureExtractionMixin):
    """
    This is a general feature extraction class for speech recognition.

    Args:
        feature_size (`int`):
            The feature dimension of the extracted features.
        sampling_rate (`int`):
            The sampling rate at which the audio files should be digitized, expressed in hertz (Hz).
        padding_value (`float`):
            The value that is used to fill the padding values / vectors.
    """

    def __init__(self, feature_size: int, sampling_rate: int, padding_value: float, **kwargs):
        self.feature_size = feature_size
        self.sampling_rate = sampling_rate
        self.padding_value = padding_value

        self.padding_side = kwargs.pop("padding_side", "right")
        self.return_attention_mask = kwargs.pop("return_attention_mask", True)

        super().__init__(**kwargs)
    def pad(
        self,
        processed_features: Union[
            BatchFeature,
            List[BatchFeature],
            Dict[str, BatchFeature],
            Dict[str, List[BatchFeature]],
            List[Dict[str, BatchFeature]],
        ],
        padding: Union[bool, str, PaddingStrategy] = True,
        max_length: Optional[int] = None,
        truncation: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ) -> BatchFeature:
        """
        Pad input values / input vectors or a batch of input values / input vectors up to a predefined length or to
        the max sequence length in the batch.

        Padding side (left/right) and padding values are defined at the feature extractor level (with
        `self.padding_side`, `self.padding_value`).

        <Tip>

        If the `processed_features` passed are dictionaries of numpy arrays, PyTorch tensors or TensorFlow tensors,
        the result will use the same type unless you provide a different tensor type with `return_tensors`. In the
        case of PyTorch tensors, you will lose the specific device of your tensors however.

        </Tip>

        Args:
            processed_features ([`BatchFeature`], list of [`BatchFeature`], `Dict[str, List[float]]`, `Dict[str, List[List[float]]]` or `List[Dict[str, List[float]]]`):
                Processed inputs. Can represent one input ([`BatchFeature`] or `Dict[str, List[float]]`) or a batch of
                input values / vectors (list of [`BatchFeature`], *Dict[str, List[List[float]]]* or *List[Dict[str,
                List[float]]]*), so you can use this method during preprocessing as well as in a PyTorch Dataloader
                collate function (a usage sketch follows this method).

                Instead of `List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors);
                see the note above for the return type.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:

                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence is provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            pad_to_multiple_of (`int`, *optional*):
                If set, will pad the sequence to a multiple of the provided value.

                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Whether to return the attention mask. If left to the default, will return the attention mask according
                to the specific feature_extractor's default.

                [What are attention masks?](../glossary#attention-mask)
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors instead of lists of python numbers. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return Numpy `np.ndarray` objects.
        """
        # If we have a list of dicts, let's convert it to a dict of lists
        # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
        if isinstance(processed_features, (list, tuple)) and isinstance(processed_features[0], (dict, BatchFeature)):
            processed_features = {
                key: [example[key] for example in processed_features] for key in processed_features[0].keys()
            }

        # The model's main input name, usually `input_values`, has to be passed for padding
        if self.model_input_names[0] not in processed_features:
            raise ValueError(
                "You should supply an instance of `transformers.BatchFeature` or list of `transformers.BatchFeature`"
                f" to this method that includes {self.model_input_names[0]}, but you provided"
                f" {list(processed_features.keys())}"
            )

        required_input = processed_features[self.model_input_names[0]]
        return_attention_mask = (
            return_attention_mask if return_attention_mask is not None else self.return_attention_mask
        )

        if len(required_input) == 0:
            if return_attention_mask:
                processed_features["attention_mask"] = []
            return processed_features

        # If we have PyTorch/TF tensors or lists as inputs, we cast them as Numpy arrays
        # and rebuild them afterwards if no return_tensors is specified
        # Note that we lose the specific device the tensor may be on for PyTorch
        first_element = required_input[0]
        if isinstance(first_element, (list, tuple)):
            # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element.
            index = 0
            while len(required_input[index]) == 0:
                index += 1
            if index < len(required_input):
                first_element = required_input[index][0]

        if return_tensors is None:
            if is_tf_tensor(first_element):
                return_tensors = "tf"
            elif is_torch_tensor(first_element):
                return_tensors = "pt"
            elif isinstance(first_element, (int, float, list, tuple, np.ndarray)):
                return_tensors = "np"
            else:
                raise ValueError(
                    f"type of {first_element} unknown: {type(first_element)}. "
                    "Should be one of a python, numpy, pytorch or tensorflow object."
                )

        for key, value in processed_features.items():
            if isinstance(value[0], (int, float)):
                processed_features[key] = to_numpy(value)
            else:
                processed_features[key] = [to_numpy(v) for v in value]

        # Convert padding_strategy to PaddingStrategy
        padding_strategy = self._get_padding_strategies(padding=padding, max_length=max_length)

        required_input = processed_features[self.model_input_names[0]]

        batch_size = len(required_input)
        if not all(len(v) == batch_size for v in processed_features.values()):
            raise ValueError("Some items in the output dictionary have a different batch size than others.")

        truncated_inputs = []
        for i in range(batch_size):
            inputs = {k: v[i] for k, v in processed_features.items()}
            # truncation
            inputs_slice = self._truncate(
                inputs,
                max_length=max_length,
                pad_to_multiple_of=pad_to_multiple_of,
                truncation=truncation,
            )
            truncated_inputs.append(inputs_slice)

        if padding_strategy == PaddingStrategy.LONGEST:
            # make sure that `max_length` cannot be longer than the longest truncated length
            max_length = max(len(input_slice[self.model_input_names[0]]) for input_slice in truncated_inputs)
            padding_strategy = PaddingStrategy.MAX_LENGTH

        batch_outputs = {}
        for i in range(batch_size):
            # padding
            outputs = self._pad(
                truncated_inputs[i],
                max_length=max_length,
                padding_strategy=padding_strategy,
                pad_to_multiple_of=pad_to_multiple_of,
                return_attention_mask=return_attention_mask,
            )

            for key, value in outputs.items():
                if key not in batch_outputs:
                    batch_outputs[key] = []
                if value.dtype is np.dtype(np.float64):
                    value = value.astype(np.float32)
                batch_outputs[key].append(value)

        return BatchFeature(batch_outputs, tensor_type=return_tensors)
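
    # Usage sketch (illustration only, not part of the library): because `pad` also accepts a
    # list of per-example dicts, it can serve directly as a DataLoader collate function. This
    # assumes a concrete feature extractor `extractor` whose main input name is `input_values`:
    #
    #     from torch.utils.data import DataLoader
    #
    #     loader = DataLoader(
    #         dataset,
    #         batch_size=8,
    #         collate_fn=lambda features: extractor.pad(features, padding=True, return_tensors="pt"),
    #     )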
    def _pad(
        self,
        processed_features: Union[Dict[str, np.ndarray], BatchFeature],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            processed_features (`Union[Dict[str, np.ndarray], BatchFeature]`):
                Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch
                of input values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`)
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see below)
            padding_strategy (`PaddingStrategy`, *optional*, defaults to `PaddingStrategy.DO_NOT_PAD`):
                PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length
                - PaddingStrategy.DO_NOT_PAD: Do not pad (default)

                The feature_extractor padding sides are defined in `self.padding_side`:

                - 'left': pads on the left of the sequences
                - 'right': pads on the right of the sequences
            pad_to_multiple_of (`int`, *optional*):
                Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to
                enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs
                which benefit from having sequence lengths be a multiple of 128.
            return_attention_mask (`bool`, *optional*):
                Set to False to avoid returning the attention mask (default: set to model specifics)
        """
        required_input = processed_features[self.model_input_names[0]]

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) < max_length

        if return_attention_mask and "attention_mask" not in processed_features:
            processed_features["attention_mask"] = np.ones(len(required_input), dtype=np.int32)

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            if self.padding_side == "right":
                if return_attention_mask:
                    processed_features["attention_mask"] = np.pad(
                        processed_features["attention_mask"], (0, difference)
                    )
                padding_shape = ((0, difference), (0, 0)) if self.feature_size > 1 else (0, difference)
                processed_features[self.model_input_names[0]] = np.pad(
                    required_input, padding_shape, "constant", constant_values=self.padding_value
                )
            elif self.padding_side == "left":
                if return_attention_mask:
                    processed_features["attention_mask"] = np.pad(
                        processed_features["attention_mask"], (difference, 0)
                    )
                padding_shape = ((difference, 0), (0, 0)) if self.feature_size > 1 else (difference, 0)
                processed_features[self.model_input_names[0]] = np.pad(
                    required_input, padding_shape, "constant", constant_values=self.padding_value
                )
            else:
                raise ValueError("Invalid padding side: " + str(self.padding_side))

        return processed_features
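
    # Worked example (illustration only): with `pad_to_multiple_of=128`, a requested
    # `max_length` of 300 is rounded up to the next multiple of 128 before padding:
    #     ((300 // 128) + 1) * 128 == 384
    # so a 1-D sequence of length 300 would receive 84 trailing `padding_value` entries
    # (with the default `padding_side="right"`).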
    def _truncate(
        self,
        processed_features: Union[Dict[str, np.ndarray], BatchFeature],
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
        truncation: Optional[bool] = None,
    ):
        """
        Truncate inputs to predefined length or max length in the batch

        Args:
            processed_features (`Union[Dict[str, np.ndarray], BatchFeature]`):
                Dictionary of input values (`np.ndarray[float]`) / input vectors (`List[np.ndarray[float]]`) or batch
                of input values (`List[np.ndarray[int]]`) / input vectors (`List[np.ndarray[int]]`)
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see below)
            pad_to_multiple_of (`int`, *optional*):
                Integer if set will pad the sequence to a multiple of the provided value. This is especially useful to
                enable the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta), or on TPUs
                which benefit from having sequence lengths be a multiple of 128.
            truncation (`bool`, *optional*):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
        """
        if not truncation:
            return processed_features
        elif truncation and max_length is None:
            raise ValueError("When setting ``truncation=True``, make sure that ``max_length`` is defined.")

        required_input = processed_features[self.model_input_names[0]]

        # find `max_length` that fits `pad_to_multiple_of`
        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_truncated = len(required_input) > max_length

        if needs_to_be_truncated:
            processed_features[self.model_input_names[0]] = processed_features[self.model_input_names[0]][:max_length]
            if "attention_mask" in processed_features:
                processed_features["attention_mask"] = processed_features["attention_mask"][:max_length]

        return processed_features
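
    # Note (illustration only): truncation uses the same rounded-up `max_length` as padding,
    # so with `max_length=300` and `pad_to_multiple_of=128` a sequence of length 500 is cut
    # to 384 entries, not 300, and any existing attention mask is sliced to the same length.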
    def _get_padding_strategies(self, padding=False, max_length=None):
        """
        Find the correct padding strategy
        """
        # Get padding strategy
        if padding is not False:
            if padding is True:
                padding_strategy = PaddingStrategy.LONGEST  # Default to pad to the longest sequence in the batch
            elif not isinstance(padding, PaddingStrategy):
                padding_strategy = PaddingStrategy(padding)
            elif isinstance(padding, PaddingStrategy):
                padding_strategy = padding
        else:
            padding_strategy = PaddingStrategy.DO_NOT_PAD

        # Set max length if needed
        if max_length is None:
            if padding_strategy == PaddingStrategy.MAX_LENGTH:
                raise ValueError(
                    f"When setting ``padding={PaddingStrategy.MAX_LENGTH}``, make sure that max_length is defined"
                )

        # Test if we have a padding value
        if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.padding_value is None):
            raise ValueError(
                "Asking to pad but the feature_extractor does not have a padding value. Please select a value to use"
                " as `padding_value`. For example: `feature_extractor.padding_value = 0.0`."
            )

        return padding_strategy
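

# Minimal usage sketch (illustration only, not part of the library API). Concrete subclasses,
# e.g. the audio feature extractors, normally define `model_input_names` as a class attribute;
# here it is set on the instance so the base class can be exercised directly.
if __name__ == "__main__":
    extractor = SequenceFeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0)
    extractor.model_input_names = ["input_values"]  # hypothetical stand-in for a subclass attribute

    # Two variable-length 1-D feature sequences, e.g. short raw waveform snippets.
    features = [
        {"input_values": np.arange(5, dtype=np.float32)},
        {"input_values": np.arange(3, dtype=np.float32)},
    ]

    batch = extractor.pad(features, padding="longest", return_tensors="np")
    print(batch["input_values"].shape)  # (2, 5): the shorter sequence is right-padded with 0.0
    print(batch["attention_mask"])      # [[1 1 1 1 1]
                                        #  [1 1 1 0 0]]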