audio_utils.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. # Copyright 2023 The HuggingFace Team. All rights reserved.
  2. import datetime
  3. import platform
  4. import subprocess
  5. from typing import Optional, Tuple, Union, List
  6. import numpy as np
  7. def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
  8. """
  9. Helper function to read an audio file through ffmpeg.
  10. """
  11. ar = f"{sampling_rate}"
  12. ac = "1"
  13. format_for_conversion = "f32le"
  14. ffmpeg_command = [
  15. "ffmpeg",
  16. "-i",
  17. "pipe:0",
  18. "-ac",
  19. ac,
  20. "-ar",
  21. ar,
  22. "-f",
  23. format_for_conversion,
  24. "-hide_banner",
  25. "-loglevel",
  26. "quiet",
  27. "pipe:1",
  28. ]
  29. try:
  30. with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
  31. output_stream = ffmpeg_process.communicate(bpayload)
  32. except FileNotFoundError as error:
  33. raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error
  34. out_bytes = output_stream[0]
  35. audio = np.frombuffer(out_bytes, np.float32)
  36. if audio.shape[0] == 0:
  37. raise ValueError(
  38. "Soundfile is either not in the correct format or is malformed. Ensure that the soundfile has "
  39. "a valid audio file extension (e.g. wav, flac or mp3) and is not corrupted. If reading from a remote "
  40. "URL, ensure that the URL is the full address to **download** the audio file."
  41. )
  42. return audio
  43. def ffmpeg_microphone(
  44. sampling_rate: int,
  45. chunk_length_s: float,
  46. format_for_conversion: str = "f32le",
  47. ffmpeg_input_device: Optional[str] = None,
  48. ffmpeg_additional_args: Optional[List[str]] = None,
  49. ):
  50. """
  51. Helper function to read audio from a microphone using ffmpeg. The default input device will be used unless another
  52. input device is specified using the `ffmpeg_input_device` argument. Uses 'alsa' on Linux, 'avfoundation' on MacOS and
  53. 'dshow' on Windows.
  54. Arguments:
  55. sampling_rate (`int`):
  56. The sampling_rate to use when reading the data from the microphone. Try using the model's sampling_rate to
  57. avoid resampling later.
  58. chunk_length_s (`float` or `int`):
  59. The length of the maximum chunk of audio to be sent returned.
  60. format_for_conversion (`str`, defaults to `f32le`):
  61. The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le`
  62. could also be used.
  63. ffmpeg_input_device (`str`, *optional*):
  64. The indentifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
  65. the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
  66. for how to specify and list input devices.
  67. ffmpeg_additional_args (`list[str]`, *optional*):
  68. Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background
  69. process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags
  70. with multiple arguments, use the following convention (eg ["flag", "arg1", "arg2]).
  71. Returns:
  72. A generator yielding audio chunks of `chunk_length_s` seconds as `bytes` objects of length
  73. `int(round(sampling_rate * chunk_length_s)) * size_of_sample`.
  74. """
  75. ar = f"{sampling_rate}"
  76. ac = "1"
  77. if format_for_conversion == "s16le":
  78. size_of_sample = 2
  79. elif format_for_conversion == "f32le":
  80. size_of_sample = 4
  81. else:
  82. raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")
  83. system = platform.system()
  84. if system == "Linux":
  85. format_ = "alsa"
  86. input_ = ffmpeg_input_device or "default"
  87. elif system == "Darwin":
  88. format_ = "avfoundation"
  89. input_ = ffmpeg_input_device or ":default"
  90. elif system == "Windows":
  91. format_ = "dshow"
  92. input_ = ffmpeg_input_device or _get_microphone_name()
  93. ffmpeg_additional_args = [] if ffmpeg_additional_args is None else ffmpeg_additional_args
  94. ffmpeg_command = [
  95. "ffmpeg",
  96. "-f",
  97. format_,
  98. "-i",
  99. input_,
  100. "-ac",
  101. ac,
  102. "-ar",
  103. ar,
  104. "-f",
  105. format_for_conversion,
  106. "-fflags",
  107. "nobuffer",
  108. "-hide_banner",
  109. "-loglevel",
  110. "quiet",
  111. "pipe:1",
  112. ]
  113. ffmpeg_command.extend(ffmpeg_additional_args)
  114. chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
  115. iterator = _ffmpeg_stream(ffmpeg_command, chunk_len)
  116. for item in iterator:
  117. yield item
  118. def ffmpeg_microphone_live(
  119. sampling_rate: int,
  120. chunk_length_s: float,
  121. stream_chunk_s: Optional[int] = None,
  122. stride_length_s: Optional[Union[Tuple[float, float], float]] = None,
  123. format_for_conversion: str = "f32le",
  124. ffmpeg_input_device: Optional[str] = None,
  125. ffmpeg_additional_args: Optional[List[str]] = None,
  126. ):
  127. """
  128. Helper function to read audio from a microphone using ffmpeg. This will output `partial` overlapping chunks starting
  129. from `stream_chunk_s` (if it is defined) until `chunk_length_s` is reached. It will make use of striding to avoid
  130. errors on the "sides" of the various chunks. The default input device will be used unless another input device is
  131. specified using the `ffmpeg_input_device` argument. Uses 'alsa' on Linux, 'avfoundation' on MacOS and 'dshow' on Windows.
  132. Arguments:
  133. sampling_rate (`int`):
  134. The sampling_rate to use when reading the data from the microphone. Try using the model's sampling_rate to
  135. avoid resampling later.
  136. chunk_length_s (`float` or `int`):
  137. The length of the maximum chunk of audio to be sent returned. This includes the eventual striding.
  138. stream_chunk_s (`float` or `int`):
  139. The length of the minimal temporary audio to be returned.
  140. stride_length_s (`float` or `int` or `(float, float)`, *optional*):
  141. The length of the striding to be used. Stride is used to provide context to a model on the (left, right) of
  142. an audio sample but without using that part to actually make the prediction. Setting this does not change
  143. the length of the chunk.
  144. format_for_conversion (`str`, *optional*, defaults to `f32le`):
  145. The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le`
  146. could also be used.
  147. ffmpeg_input_device (`str`, *optional*):
  148. The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
  149. the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
  150. for how to specify and list input devices.
  151. ffmpeg_additional_args (`list[str]`, *optional*):
  152. Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background
  153. process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags
  154. with multiple arguments, use the following convention (eg ["flag", "arg1", "arg2]).
  155. Return:
  156. A generator yielding dictionaries of the following form
  157. `{"sampling_rate": int, "raw": np.array(), "partial" bool}` With optionally a `"stride" (int, int)` key if
  158. `stride_length_s` is defined.
  159. `stride` and `raw` are all expressed in `samples`, and `partial` is a boolean saying if the current yield item
  160. is a whole chunk, or a partial temporary result to be later replaced by another larger chunk.
  161. """
  162. if stream_chunk_s is not None:
  163. chunk_s = stream_chunk_s
  164. else:
  165. chunk_s = chunk_length_s
  166. microphone = ffmpeg_microphone(
  167. sampling_rate,
  168. chunk_s,
  169. format_for_conversion=format_for_conversion,
  170. ffmpeg_input_device=ffmpeg_input_device,
  171. ffmpeg_additional_args=[] if ffmpeg_additional_args is None else ffmpeg_additional_args,
  172. )
  173. if format_for_conversion == "s16le":
  174. dtype = np.int16
  175. size_of_sample = 2
  176. elif format_for_conversion == "f32le":
  177. dtype = np.float32
  178. size_of_sample = 4
  179. else:
  180. raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")
  181. if stride_length_s is None:
  182. stride_length_s = chunk_length_s / 6
  183. chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
  184. if isinstance(stride_length_s, (int, float)):
  185. stride_length_s = [stride_length_s, stride_length_s]
  186. stride_left = int(round(sampling_rate * stride_length_s[0])) * size_of_sample
  187. stride_right = int(round(sampling_rate * stride_length_s[1])) * size_of_sample
  188. audio_time = datetime.datetime.now()
  189. delta = datetime.timedelta(seconds=chunk_s)
  190. for item in chunk_bytes_iter(microphone, chunk_len, stride=(stride_left, stride_right), stream=True):
  191. # Put everything back in numpy scale
  192. item["raw"] = np.frombuffer(item["raw"], dtype=dtype)
  193. item["stride"] = (
  194. item["stride"][0] // size_of_sample,
  195. item["stride"][1] // size_of_sample,
  196. )
  197. item["sampling_rate"] = sampling_rate
  198. audio_time += delta
  199. if datetime.datetime.now() > audio_time + 10 * delta:
  200. # We're late !! SKIP
  201. continue
  202. yield item
  203. def chunk_bytes_iter(iterator, chunk_len: int, stride: Tuple[int, int], stream: bool = False):
  204. """
  205. Reads raw bytes from an iterator and does chunks of length `chunk_len`. Optionally adds `stride` to each chunks to
  206. get overlaps. `stream` is used to return partial results even if a full `chunk_len` is not yet available.
  207. """
  208. acc = b""
  209. stride_left, stride_right = stride
  210. if stride_left + stride_right >= chunk_len:
  211. raise ValueError(
  212. f"Stride needs to be strictly smaller than chunk_len: ({stride_left}, {stride_right}) vs {chunk_len}"
  213. )
  214. _stride_left = 0
  215. for raw in iterator:
  216. acc += raw
  217. if stream and len(acc) < chunk_len:
  218. stride = (_stride_left, 0)
  219. yield {"raw": acc[:chunk_len], "stride": stride, "partial": True}
  220. else:
  221. while len(acc) >= chunk_len:
  222. # We are flushing the accumulator
  223. stride = (_stride_left, stride_right)
  224. item = {"raw": acc[:chunk_len], "stride": stride}
  225. if stream:
  226. item["partial"] = False
  227. yield item
  228. _stride_left = stride_left
  229. acc = acc[chunk_len - stride_left - stride_right :]
  230. # Last chunk
  231. if len(acc) > stride_left:
  232. item = {"raw": acc, "stride": (_stride_left, 0)}
  233. if stream:
  234. item["partial"] = False
  235. yield item
  236. def _ffmpeg_stream(ffmpeg_command, buflen: int):
  237. """
  238. Internal function to create the generator of data through ffmpeg
  239. """
  240. bufsize = 2**24 # 16Mo
  241. try:
  242. with subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, bufsize=bufsize) as ffmpeg_process:
  243. while True:
  244. raw = ffmpeg_process.stdout.read(buflen)
  245. if raw == b"":
  246. break
  247. yield raw
  248. except FileNotFoundError as error:
  249. raise ValueError("ffmpeg was not found but is required to stream audio files from filename") from error
  250. def _get_microphone_name():
  251. """
  252. Retrieve the microphone name in Windows .
  253. """
  254. command = ["ffmpeg", "-list_devices", "true", "-f", "dshow", "-i", ""]
  255. try:
  256. ffmpeg_devices = subprocess.run(command, text=True, stderr=subprocess.PIPE, encoding="utf-8")
  257. microphone_lines = [line for line in ffmpeg_devices.stderr.splitlines() if "(audio)" in line]
  258. if microphone_lines:
  259. microphone_name = microphone_lines[0].split('"')[1]
  260. print(f"Using microphone: {microphone_name}")
  261. return f"audio={microphone_name}"
  262. except FileNotFoundError:
  263. print("ffmpeg was not found. Please install it or make sure it is in your system PATH.")
  264. return "default"