# feature_extraction_dac.py
  1. # coding=utf-8
  2. # Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved.
  3. #
  4. # Licensed under the Apache License, Version 2.0 (the "License");
  5. # you may not use this file except in compliance with the License.
  6. # You may obtain a copy of the License at
  7. #
  8. # http://www.apache.org/licenses/LICENSE-2.0
  9. #
  10. # Unless required by applicable law or agreed to in writing, software
  11. # distributed under the License is distributed on an "AS IS" BASIS,
  12. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. # See the License for the specific language governing permissions and
  14. # limitations under the License.
  15. """Feature extractor class for DAC"""
  16. from typing import List, Optional, Union
  17. import numpy as np
  18. from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
  19. from ...feature_extraction_utils import BatchFeature
  20. from ...utils import PaddingStrategy, TensorType, logging
  21. logger = logging.get_logger(__name__)
  22. class DacFeatureExtractor(SequenceFeatureExtractor):
  23. r"""
  24. Constructs an Dac feature extractor.
  25. This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
  26. most of the main methods. Users should refer to this superclass for more information regarding those methods.
  27. Args:
  28. feature_size (`int`, *optional*, defaults to 1):
  29. The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
  30. sampling_rate (`int`, *optional*, defaults to 16000):
  31. The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz).
  32. padding_value (`float`, *optional*, defaults to 0.0):
  33. The value that is used for padding.
  34. hop_length (`int`, *optional*, defaults to 512):
  35. Overlap length between successive windows.
  36. """
  37. model_input_names = ["input_values", "n_quantizers"]
  38. def __init__(
  39. self,
  40. feature_size: int = 1,
  41. sampling_rate: int = 16000,
  42. padding_value: float = 0.0,
  43. hop_length: int = 512,
  44. **kwargs,
  45. ):
  46. super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
  47. self.hop_length = hop_length
  48. def __call__(
  49. self,
  50. raw_audio: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
  51. padding: Optional[Union[bool, str, PaddingStrategy]] = None,
  52. truncation: Optional[bool] = False,
  53. max_length: Optional[int] = None,
  54. return_tensors: Optional[Union[str, TensorType]] = None,
  55. sampling_rate: Optional[int] = None,
  56. ) -> BatchFeature:
  57. """
  58. Main method to featurize and prepare for the model one or several sequence(s).
  59. Args:
  60. raw_audio (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
  61. The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
  62. values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
  63. `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
  64. (`feature_size = 2`).
  65. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
  66. Select a strategy to pad the returned sequences (according to the model's padding side and padding
  67. index) among:
  68. - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
  69. sequence if provided).
  70. - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
  71. acceptable input length for the model if that argument is not provided.
  72. - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
  73. lengths).
  74. truncation (`bool`, *optional*, defaults to `False`):
  75. Activates truncation to cut input sequences longer than `max_length` to `max_length`.
  76. max_length (`int`, *optional*):
  77. Maximum length of the returned list and optionally padding length (see above).
  78. return_tensors (`str` or [`~utils.TensorType`], *optional*, default to 'pt'):
  79. If set, will return tensors instead of list of python integers. Acceptable values are:
  80. - `'tf'`: Return TensorFlow `tf.constant` objects.
  81. - `'pt'`: Return PyTorch `torch.Tensor` objects.
  82. - `'np'`: Return Numpy `np.ndarray` objects.
  83. sampling_rate (`int`, *optional*):
  84. The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
  85. `sampling_rate` at the forward call to prevent silent errors.
  86. """
  87. if sampling_rate is not None:
  88. if sampling_rate != self.sampling_rate:
  89. raise ValueError(
  90. f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
  91. f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
  92. f" {self.sampling_rate} and not {sampling_rate}."
  93. )
  94. else:
  95. logger.warning(
  96. "It is strongly recommended to pass the `sampling_rate` argument to this function. "
  97. "Failing to do so can result in silent errors that might be hard to debug."
  98. )
  99. if padding and truncation:
  100. raise ValueError("Both padding and truncation were set. Make sure you only set one.")
  101. elif padding is None:
  102. # by default let's pad the inputs
  103. padding = True
  104. is_batched = bool(
  105. isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
  106. )
  107. if is_batched:
  108. raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
  109. elif not is_batched and not isinstance(raw_audio, np.ndarray):
  110. raw_audio = np.asarray(raw_audio, dtype=np.float32)
  111. elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
  112. raw_audio = raw_audio.astype(np.float32)
  113. # always return batch
  114. if not is_batched:
  115. raw_audio = [np.asarray(raw_audio).T]
  116. # verify inputs are valid
  117. for idx, example in enumerate(raw_audio):
  118. if example.ndim > 2:
  119. raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
  120. if self.feature_size == 1 and example.ndim != 1:
  121. raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
  122. if self.feature_size == 2:
  123. raise ValueError("Stereo audio isn't supported for now")
  124. input_values = BatchFeature({"input_values": raw_audio})
  125. # normal padding on batch
  126. padded_inputs = self.pad(
  127. input_values,
  128. max_length=max_length,
  129. truncation=truncation,
  130. padding=padding,
  131. return_attention_mask=False,
  132. pad_to_multiple_of=self.hop_length,
  133. )
  134. if padding:
  135. padded_inputs.input_values = padded_inputs.input_values[:, np.newaxis, :]
  136. input_values = []
  137. for example in padded_inputs.pop("input_values"):
  138. if self.feature_size == 1:
  139. example = example[..., None]
  140. input_values.append(example.T)
  141. padded_inputs["input_values"] = input_values
  142. if return_tensors is not None:
  143. padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
  144. return padded_inputs