| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127 |
- # coding=utf-8
- # Copyright 2023 The HuggingFace Inc. team and the librosa & torchaudio authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """
- Audio processing functions to extract features from audio waveforms. This code is pure numpy to support all frameworks
- and remove unnecessary dependencies.
- """
- import warnings
- from typing import List, Optional, Tuple, Union
- import numpy as np
- def hertz_to_mel(freq: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]:
- """
- Convert frequency from hertz to mels.
- Args:
- freq (`float` or `np.ndarray`):
- The frequency, or multiple frequencies, in hertz (Hz).
- mel_scale (`str`, *optional*, defaults to `"htk"`):
- The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`.
- Returns:
- `float` or `np.ndarray`: The frequencies on the mel scale.
- """
- if mel_scale not in ["slaney", "htk", "kaldi"]:
- raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".')
- if mel_scale == "htk":
- return 2595.0 * np.log10(1.0 + (freq / 700.0))
- elif mel_scale == "kaldi":
- return 1127.0 * np.log(1.0 + (freq / 700.0))
- min_log_hertz = 1000.0
- min_log_mel = 15.0
- logstep = 27.0 / np.log(6.4)
- mels = 3.0 * freq / 200.0
- if isinstance(freq, np.ndarray):
- log_region = freq >= min_log_hertz
- mels[log_region] = min_log_mel + np.log(freq[log_region] / min_log_hertz) * logstep
- elif freq >= min_log_hertz:
- mels = min_log_mel + np.log(freq / min_log_hertz) * logstep
- return mels
- def mel_to_hertz(mels: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]:
- """
- Convert frequency from mels to hertz.
- Args:
- mels (`float` or `np.ndarray`):
- The frequency, or multiple frequencies, in mels.
- mel_scale (`str`, *optional*, `"htk"`):
- The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`.
- Returns:
- `float` or `np.ndarray`: The frequencies in hertz.
- """
- if mel_scale not in ["slaney", "htk", "kaldi"]:
- raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".')
- if mel_scale == "htk":
- return 700.0 * (np.power(10, mels / 2595.0) - 1.0)
- elif mel_scale == "kaldi":
- return 700.0 * (np.exp(mels / 1127.0) - 1.0)
- min_log_hertz = 1000.0
- min_log_mel = 15.0
- logstep = np.log(6.4) / 27.0
- freq = 200.0 * mels / 3.0
- if isinstance(mels, np.ndarray):
- log_region = mels >= min_log_mel
- freq[log_region] = min_log_hertz * np.exp(logstep * (mels[log_region] - min_log_mel))
- elif mels >= min_log_mel:
- freq = min_log_hertz * np.exp(logstep * (mels - min_log_mel))
- return freq
- def hertz_to_octave(
- freq: Union[float, np.ndarray], tuning: Optional[float] = 0.0, bins_per_octave: Optional[int] = 12
- ):
- """
- Convert frequency from hertz to fractional octave numbers.
- Adapted from *librosa*.
- Args:
- freq (`float` or `np.ndarray`):
- The frequency, or multiple frequencies, in hertz (Hz).
- tuning (`float`, defaults to `0.`):
- Tuning deviation from the Stuttgart pitch (A440) in (fractional) bins per octave.
- bins_per_octave (`int`, defaults to `12`):
- Number of bins per octave.
- Returns:
- `float` or `np.ndarray`: The frequencies on the octave scale.
- """
- stuttgart_pitch = 440.0 * 2.0 ** (tuning / bins_per_octave)
- octave = np.log2(freq / (float(stuttgart_pitch) / 16))
- return octave
- def _create_triangular_filter_bank(fft_freqs: np.ndarray, filter_freqs: np.ndarray) -> np.ndarray:
- """
- Creates a triangular filter bank.
- Adapted from *torchaudio* and *librosa*.
- Args:
- fft_freqs (`np.ndarray` of shape `(num_frequency_bins,)`):
- Discrete frequencies of the FFT bins in Hz.
- filter_freqs (`np.ndarray` of shape `(num_mel_filters,)`):
- Center frequencies of the triangular filters to create, in Hz.
- Returns:
- `np.ndarray` of shape `(num_frequency_bins, num_mel_filters)`
- """
- filter_diff = np.diff(filter_freqs)
- slopes = np.expand_dims(filter_freqs, 0) - np.expand_dims(fft_freqs, 1)
- down_slopes = -slopes[:, :-2] / filter_diff[:-1]
- up_slopes = slopes[:, 2:] / filter_diff[1:]
- return np.maximum(np.zeros(1), np.minimum(down_slopes, up_slopes))
- def chroma_filter_bank(
- num_frequency_bins: int,
- num_chroma: int,
- sampling_rate: int,
- tuning: float = 0.0,
- power: Optional[float] = 2.0,
- weighting_parameters: Optional[Tuple[float]] = (5.0, 2),
- start_at_c_chroma: Optional[bool] = True,
- ):
- """
- Creates a chroma filter bank, i.e a linear transformation to project spectrogram bins onto chroma bins.
- Adapted from *librosa*.
- Args:
- num_frequency_bins (`int`):
- Number of frequencies used to compute the spectrogram (should be the same as in `stft`).
- num_chroma (`int`):
- Number of chroma bins (i.e pitch classes).
- sampling_rate (`float`):
- Sample rate of the audio waveform.
- tuning (`float`):
- Tuning deviation from A440 in fractions of a chroma bin.
- power (`float`, *optional*, defaults to 2.0):
- If 12.0, normalizes each column with their L2 norm. If 1.0, normalizes each column with their L1 norm.
- weighting_parameters (`Tuple[float]`, *optional*, defaults to `(5., 2.)`):
- If specified, apply a Gaussian weighting parameterized by the first element of the tuple being the center and
- the second element being the Gaussian half-width.
- start_at_c_chroma (`float`, *optional*, defaults to `True`):
- If True, the filter bank will start at the 'C' pitch class. Otherwise, it will start at 'A'.
- Returns:
- `np.ndarray` of shape `(num_frequency_bins, num_chroma)`
- """
- # Get the FFT bins, not counting the DC component
- frequencies = np.linspace(0, sampling_rate, num_frequency_bins, endpoint=False)[1:]
- freq_bins = num_chroma * hertz_to_octave(frequencies, tuning=tuning, bins_per_octave=num_chroma)
- # make up a value for the 0 Hz bin = 1.5 octaves below bin 1
- # (so chroma is 50% rotated from bin 1, and bin width is broad)
- freq_bins = np.concatenate(([freq_bins[0] - 1.5 * num_chroma], freq_bins))
- bins_width = np.concatenate((np.maximum(freq_bins[1:] - freq_bins[:-1], 1.0), [1]))
- chroma_filters = np.subtract.outer(freq_bins, np.arange(0, num_chroma, dtype="d")).T
- num_chroma2 = np.round(float(num_chroma) / 2)
- # Project into range -num_chroma/2 .. num_chroma/2
- # add on fixed offset of 10*num_chroma to ensure all values passed to
- # rem are positive
- chroma_filters = np.remainder(chroma_filters + num_chroma2 + 10 * num_chroma, num_chroma) - num_chroma2
- # Gaussian bumps - 2*D to make them narrower
- chroma_filters = np.exp(-0.5 * (2 * chroma_filters / np.tile(bins_width, (num_chroma, 1))) ** 2)
- # normalize each column
- if power is not None:
- chroma_filters = chroma_filters / np.sum(chroma_filters**power, axis=0, keepdims=True) ** (1.0 / power)
- # Maybe apply scaling for fft bins
- if weighting_parameters is not None:
- center, half_width = weighting_parameters
- chroma_filters *= np.tile(
- np.exp(-0.5 * (((freq_bins / num_chroma - center) / half_width) ** 2)),
- (num_chroma, 1),
- )
- if start_at_c_chroma:
- chroma_filters = np.roll(chroma_filters, -3 * (num_chroma // 12), axis=0)
- # remove aliasing columns, copy to ensure row-contiguity
- return np.ascontiguousarray(chroma_filters[:, : int(1 + num_frequency_bins / 2)])
- def mel_filter_bank(
- num_frequency_bins: int,
- num_mel_filters: int,
- min_frequency: float,
- max_frequency: float,
- sampling_rate: int,
- norm: Optional[str] = None,
- mel_scale: str = "htk",
- triangularize_in_mel_space: bool = False,
- ) -> np.ndarray:
- """
- Creates a frequency bin conversion matrix used to obtain a mel spectrogram. This is called a *mel filter bank*, and
- various implementation exist, which differ in the number of filters, the shape of the filters, the way the filters
- are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these
- features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency.
- Different banks of mel filters were introduced in the literature. The following variations are supported:
- - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHz and a speech
- bandwidth of `[0, 4600]` Hz.
- - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a speech
- bandwidth of `[0, 8000]` Hz. This assumes sampling rate ≥ 16 kHz.
- - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate of 16 kHz and
- speech bandwidth of `[133, 6854]` Hz. This version also includes area normalization.
- - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes a sampling rate of
- 12.5 kHz and speech bandwidth of `[0, 6250]` Hz.
- This code is adapted from *torchaudio* and *librosa*. Note that the default parameters of torchaudio's
- `melscale_fbanks` implement the `"htk"` filters while librosa uses the `"slaney"` implementation.
- Args:
- num_frequency_bins (`int`):
- Number of frequencies used to compute the spectrogram (should be the same as in `stft`).
- num_mel_filters (`int`):
- Number of mel filters to generate.
- min_frequency (`float`):
- Lowest frequency of interest in Hz.
- max_frequency (`float`):
- Highest frequency of interest in Hz. This should not exceed `sampling_rate / 2`.
- sampling_rate (`int`):
- Sample rate of the audio waveform.
- norm (`str`, *optional*):
- If `"slaney"`, divide the triangular mel weights by the width of the mel band (area normalization).
- mel_scale (`str`, *optional*, defaults to `"htk"`):
- The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`.
- triangularize_in_mel_space (`bool`, *optional*, defaults to `False`):
- If this option is enabled, the triangular filter is applied in mel space rather than frequency space. This
- should be set to `true` in order to get the same results as `torchaudio` when computing mel filters.
- Returns:
- `np.ndarray` of shape (`num_frequency_bins`, `num_mel_filters`): Triangular filter bank matrix. This is a
- projection matrix to go from a spectrogram to a mel spectrogram.
- """
- if norm is not None and norm != "slaney":
- raise ValueError('norm must be one of None or "slaney"')
- # center points of the triangular mel filters
- mel_min = hertz_to_mel(min_frequency, mel_scale=mel_scale)
- mel_max = hertz_to_mel(max_frequency, mel_scale=mel_scale)
- mel_freqs = np.linspace(mel_min, mel_max, num_mel_filters + 2)
- filter_freqs = mel_to_hertz(mel_freqs, mel_scale=mel_scale)
- if triangularize_in_mel_space:
- # frequencies of FFT bins in Hz, but filters triangularized in mel space
- fft_bin_width = sampling_rate / (num_frequency_bins * 2)
- fft_freqs = hertz_to_mel(fft_bin_width * np.arange(num_frequency_bins), mel_scale=mel_scale)
- filter_freqs = mel_freqs
- else:
- # frequencies of FFT bins in Hz
- fft_freqs = np.linspace(0, sampling_rate // 2, num_frequency_bins)
- mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs)
- if norm is not None and norm == "slaney":
- # Slaney-style mel is scaled to be approx constant energy per channel
- enorm = 2.0 / (filter_freqs[2 : num_mel_filters + 2] - filter_freqs[:num_mel_filters])
- mel_filters *= np.expand_dims(enorm, 0)
- if (mel_filters.max(axis=0) == 0.0).any():
- warnings.warn(
- "At least one mel filter has all zero values. "
- f"The value for `num_mel_filters` ({num_mel_filters}) may be set too high. "
- f"Or, the value for `num_frequency_bins` ({num_frequency_bins}) may be set too low."
- )
- return mel_filters
- def optimal_fft_length(window_length: int) -> int:
- """
- Finds the best FFT input size for a given `window_length`. This function takes a given window length and, if not
- already a power of two, rounds it up to the next power or two.
- The FFT algorithm works fastest when the length of the input is a power of two, which may be larger than the size
- of the window or analysis frame. For example, if the window is 400 samples, using an FFT input size of 512 samples
- is more optimal than an FFT size of 400 samples. Using a larger FFT size does not affect the detected frequencies,
- it simply gives a higher frequency resolution (i.e. the frequency bins are smaller).
- """
- return 2 ** int(np.ceil(np.log2(window_length)))
- def window_function(
- window_length: int,
- name: str = "hann",
- periodic: bool = True,
- frame_length: Optional[int] = None,
- center: bool = True,
- ) -> np.ndarray:
- """
- Returns an array containing the specified window. This window is intended to be used with `stft`.
- The following window types are supported:
- - `"boxcar"`: a rectangular window
- - `"hamming"`: the Hamming window
- - `"hann"`: the Hann window
- - `"povey"`: the Povey window
- Args:
- window_length (`int`):
- The length of the window in samples.
- name (`str`, *optional*, defaults to `"hann"`):
- The name of the window function.
- periodic (`bool`, *optional*, defaults to `True`):
- Whether the window is periodic or symmetric.
- frame_length (`int`, *optional*):
- The length of the analysis frames in samples. Provide a value for `frame_length` if the window is smaller
- than the frame length, so that it will be zero-padded.
- center (`bool`, *optional*, defaults to `True`):
- Whether to center the window inside the FFT buffer. Only used when `frame_length` is provided.
- Returns:
- `np.ndarray` of shape `(window_length,)` or `(frame_length,)` containing the window.
- """
- length = window_length + 1 if periodic else window_length
- if name == "boxcar":
- window = np.ones(length)
- elif name in ["hamming", "hamming_window"]:
- window = np.hamming(length)
- elif name in ["hann", "hann_window"]:
- window = np.hanning(length)
- elif name in ["povey"]:
- window = np.power(np.hanning(length), 0.85)
- else:
- raise ValueError(f"Unknown window function '{name}'")
- if periodic:
- window = window[:-1]
- if frame_length is None:
- return window
- if window_length > frame_length:
- raise ValueError(
- f"Length of the window ({window_length}) may not be larger than frame_length ({frame_length})"
- )
- padded_window = np.zeros(frame_length)
- offset = (frame_length - window_length) // 2 if center else 0
- padded_window[offset : offset + window_length] = window
- return padded_window
- # TODO This method does not support batching yet as we are mainly focused on inference.
- def spectrogram(
- waveform: np.ndarray,
- window: np.ndarray,
- frame_length: int,
- hop_length: int,
- fft_length: Optional[int] = None,
- power: Optional[float] = 1.0,
- center: bool = True,
- pad_mode: str = "reflect",
- onesided: bool = True,
- preemphasis: Optional[float] = None,
- mel_filters: Optional[np.ndarray] = None,
- mel_floor: float = 1e-10,
- log_mel: Optional[str] = None,
- reference: float = 1.0,
- min_value: float = 1e-10,
- db_range: Optional[float] = None,
- remove_dc_offset: Optional[bool] = None,
- dtype: np.dtype = np.float32,
- ) -> np.ndarray:
- """
- Calculates a spectrogram over one waveform using the Short-Time Fourier Transform.
- This function can create the following kinds of spectrograms:
- - amplitude spectrogram (`power = 1.0`)
- - power spectrogram (`power = 2.0`)
- - complex-valued spectrogram (`power = None`)
- - log spectrogram (use `log_mel` argument)
- - mel spectrogram (provide `mel_filters`)
- - log-mel spectrogram (provide `mel_filters` and `log_mel`)
- How this works:
- 1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length
- - hop_length` samples.
- 2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`.
- 3. The DFT is taken of each windowed frame.
- 4. The results are stacked into a spectrogram.
- We make a distinction between the following "blocks" of sample data, each of which may have a different lengths:
- - The analysis frame. This is the size of the time slices that the input waveform is split into.
- - The window. Each analysis frame is multiplied by the window to avoid spectral leakage.
- - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram.
- In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A
- padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame,
- typically the next power of two.
- Note: This function is not optimized for speed yet. It should be mostly compatible with `librosa.stft` and
- `torchaudio.functional.transforms.Spectrogram`, although it is more flexible due to the different ways spectrograms
- can be constructed.
- Args:
- waveform (`np.ndarray` of shape `(length,)`):
- The input waveform. This must be a single real-valued, mono waveform.
- window (`np.ndarray` of shape `(frame_length,)`):
- The windowing function to apply, including zero-padding if necessary. The actual window length may be
- shorter than `frame_length`, but we're assuming the array has already been zero-padded.
- frame_length (`int`):
- The length of the analysis frames in samples. With librosa this is always equal to `fft_length` but we also
- allow smaller sizes.
- hop_length (`int`):
- The stride between successive analysis frames in samples.
- fft_length (`int`, *optional*):
- The size of the FFT buffer in samples. This determines how many frequency bins the spectrogram will have.
- For optimal speed, this should be a power of two. If `None`, uses `frame_length`.
- power (`float`, *optional*, defaults to 1.0):
- If 1.0, returns the amplitude spectrogram. If 2.0, returns the power spectrogram. If `None`, returns
- complex numbers.
- center (`bool`, *optional*, defaults to `True`):
- Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame
- `t` will start at time `t * hop_length`.
- pad_mode (`str`, *optional*, defaults to `"reflect"`):
- Padding mode used when `center` is `True`. Possible values are: `"constant"` (pad with zeros), `"edge"`
- (pad with edge values), `"reflect"` (pads with mirrored values).
- onesided (`bool`, *optional*, defaults to `True`):
- If True, only computes the positive frequencies and returns a spectrogram containing `fft_length // 2 + 1`
- frequency bins. If False, also computes the negative frequencies and returns `fft_length` frequency bins.
- preemphasis (`float`, *optional*)
- Coefficient for a low-pass filter that applies pre-emphasis before the DFT.
- mel_filters (`np.ndarray` of shape `(num_freq_bins, num_mel_filters)`, *optional*):
- The mel filter bank. If supplied, applies a this filter bank to create a mel spectrogram.
- mel_floor (`float`, *optional*, defaults to 1e-10):
- Minimum value of mel frequency banks.
- log_mel (`str`, *optional*):
- How to convert the spectrogram to log scale. Possible options are: `None` (don't convert), `"log"` (take
- the natural logarithm) `"log10"` (take the base-10 logarithm), `"dB"` (convert to decibels). Can only be
- used when `power` is not `None`.
- reference (`float`, *optional*, defaults to 1.0):
- Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
- the loudest part to 0 dB. Must be greater than zero.
- min_value (`float`, *optional*, defaults to `1e-10`):
- The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
- `log(0)`. For a power spectrogram, the default of `1e-10` corresponds to a minimum of -100 dB. For an
- amplitude spectrogram, the value `1e-5` corresponds to -100 dB. Must be greater than zero.
- db_range (`float`, *optional*):
- Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
- peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
- remove_dc_offset (`bool`, *optional*):
- Subtract mean from waveform on each frame, applied before pre-emphasis. This should be set to `true` in
- order to get the same results as `torchaudio.compliance.kaldi.fbank` when computing mel filters.
- dtype (`np.dtype`, *optional*, defaults to `np.float32`):
- Data type of the spectrogram tensor. If `power` is None, this argument is ignored and the dtype will be
- `np.complex64`.
- Returns:
- `nd.array` containing a spectrogram of shape `(num_frequency_bins, length)` for a regular spectrogram or shape
- `(num_mel_filters, length)` for a mel spectrogram.
- """
- window_length = len(window)
- if fft_length is None:
- fft_length = frame_length
- if frame_length > fft_length:
- raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})")
- if window_length != frame_length:
- raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})")
- if hop_length <= 0:
- raise ValueError("hop_length must be greater than zero")
- if waveform.ndim != 1:
- raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}")
- if np.iscomplexobj(waveform):
- raise ValueError("Complex-valued input waveforms are not currently supported")
- if power is None and mel_filters is not None:
- raise ValueError(
- "You have provided `mel_filters` but `power` is `None`. Mel spectrogram computation is not yet supported for complex-valued spectrogram."
- "Specify `power` to fix this issue."
- )
- # center pad the waveform
- if center:
- padding = [(int(frame_length // 2), int(frame_length // 2))]
- waveform = np.pad(waveform, padding, mode=pad_mode)
- # promote to float64, since np.fft uses float64 internally
- waveform = waveform.astype(np.float64)
- window = window.astype(np.float64)
- # split waveform into frames of frame_length size
- num_frames = int(1 + np.floor((waveform.size - frame_length) / hop_length))
- num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length
- spectrogram = np.empty((num_frames, num_frequency_bins), dtype=np.complex64)
- # rfft is faster than fft
- fft_func = np.fft.rfft if onesided else np.fft.fft
- buffer = np.zeros(fft_length)
- timestep = 0
- for frame_idx in range(num_frames):
- buffer[:frame_length] = waveform[timestep : timestep + frame_length]
- if remove_dc_offset:
- buffer[:frame_length] = buffer[:frame_length] - buffer[:frame_length].mean()
- if preemphasis is not None:
- buffer[1:frame_length] -= preemphasis * buffer[: frame_length - 1]
- buffer[0] *= 1 - preemphasis
- buffer[:frame_length] *= window
- spectrogram[frame_idx] = fft_func(buffer)
- timestep += hop_length
- # note: ** is much faster than np.power
- if power is not None:
- spectrogram = np.abs(spectrogram, dtype=np.float64) ** power
- spectrogram = spectrogram.T
- if mel_filters is not None:
- spectrogram = np.maximum(mel_floor, np.dot(mel_filters.T, spectrogram))
- if power is not None and log_mel is not None:
- if log_mel == "log":
- spectrogram = np.log(spectrogram)
- elif log_mel == "log10":
- spectrogram = np.log10(spectrogram)
- elif log_mel == "dB":
- if power == 1.0:
- spectrogram = amplitude_to_db(spectrogram, reference, min_value, db_range)
- elif power == 2.0:
- spectrogram = power_to_db(spectrogram, reference, min_value, db_range)
- else:
- raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}")
- else:
- raise ValueError(f"Unknown log_mel option: {log_mel}")
- spectrogram = np.asarray(spectrogram, dtype)
- return spectrogram
- def spectrogram_batch(
- waveform_list: List[np.ndarray],
- window: np.ndarray,
- frame_length: int,
- hop_length: int,
- fft_length: Optional[int] = None,
- power: Optional[float] = 1.0,
- center: bool = True,
- pad_mode: str = "reflect",
- onesided: bool = True,
- preemphasis: Optional[float] = None,
- mel_filters: Optional[np.ndarray] = None,
- mel_floor: float = 1e-10,
- log_mel: Optional[str] = None,
- reference: float = 1.0,
- min_value: float = 1e-10,
- db_range: Optional[float] = None,
- remove_dc_offset: Optional[bool] = None,
- dtype: np.dtype = np.float32,
- ) -> List[np.ndarray]:
- """
- Calculates spectrograms for a list of waveforms using the Short-Time Fourier Transform, optimized for batch processing.
- This function extends the capabilities of the `spectrogram` function to handle multiple waveforms efficiently by leveraging broadcasting.
- It supports generating various types of spectrograms:
- - amplitude spectrogram (`power = 1.0`)
- - power spectrogram (`power = 2.0`)
- - complex-valued spectrogram (`power = None`)
- - log spectrogram (use `log_mel` argument)
- - mel spectrogram (provide `mel_filters`)
- - log-mel spectrogram (provide `mel_filters` and `log_mel`)
- How this works:
- 1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length
- - hop_length` samples.
- 2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`.
- 3. The DFT is taken of each windowed frame.
- 4. The results are stacked into a spectrogram.
- We make a distinction between the following "blocks" of sample data, each of which may have a different lengths:
- - The analysis frame. This is the size of the time slices that the input waveform is split into.
- - The window. Each analysis frame is multiplied by the window to avoid spectral leakage.
- - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram.
- In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A
- padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame,
- typically the next power of two.
- Note: This function is designed for efficient batch processing of multiple waveforms but retains compatibility with individual waveform processing methods like `librosa.stft`.
- Args:
- waveform_list (`List[np.ndarray]` with arrays of shape `(length,)`):
- The list of input waveforms, each a single-channel (mono) signal.
- window (`np.ndarray` of shape `(frame_length,)`):
- The windowing function to apply, including zero-padding if necessary.
- frame_length (`int`):
- The length of each frame for analysis.
- hop_length (`int`):
- The step size between successive frames.
- fft_length (`int`, *optional*):
- The size of the FFT buffer, defining frequency bin resolution.
- power (`float`, *optional*, defaults to 1.0):
- Determines the type of spectrogram: 1.0 for amplitude, 2.0 for power, None for complex.
- center (`bool`, *optional*, defaults to `True`):
- Whether to center-pad the waveform frames.
- pad_mode (`str`, *optional*, defaults to `"reflect"`):
- The padding strategy when `center` is `True`.
- onesided (`bool`, *optional*, defaults to `True`):
- If True, returns a one-sided spectrogram for real input signals.
- preemphasis (`float`, *optional*):
- Applies a pre-emphasis filter to each frame.
- mel_filters (`np.ndarray`, *optional*):
- Mel filter bank for converting to mel spectrogram.
- mel_floor (`float`, *optional*, defaults to 1e-10):
- Floor value for mel spectrogram to avoid log(0).
- log_mel (`str`, *optional*):
- Specifies log scaling strategy; options are None, "log", "log10", "dB".
- reference (`float`, *optional*, defaults to 1.0):
- Reference value for dB conversion in log_mel.
- min_value (`float`, *optional*, defaults to 1e-10):
- Minimum floor value for log scale conversions.
- db_range (`float`, *optional*):
- Dynamic range for dB scale spectrograms.
- remove_dc_offset (`bool`, *optional*):
- Whether to remove the DC offset from each frame.
- dtype (`np.dtype`, *optional*, defaults to `np.float32`):
- Data type of the output spectrogram.
- Returns:
- List[`np.ndarray`]: A list of spectrogram arrays, one for each input waveform.
- """
- window_length = len(window)
- if fft_length is None:
- fft_length = frame_length
- if frame_length > fft_length:
- raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})")
- if window_length != frame_length:
- raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})")
- if hop_length <= 0:
- raise ValueError("hop_length must be greater than zero")
- # Check the dimensions of the waveform
- for waveform in waveform_list:
- if waveform.ndim != 1:
- raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}")
- # Check if waveform is complex
- for waveform in waveform_list:
- if np.iscomplexobj(waveform):
- raise ValueError("Complex-valued input waveforms are not currently supported")
- # Center pad the waveform
- if center:
- padding = [(int(frame_length // 2), int(frame_length // 2))]
- waveform_list = [
- np.pad(
- waveform,
- padding,
- mode=pad_mode,
- )
- for waveform in waveform_list
- ]
- original_waveform_lengths = [
- len(waveform) for waveform in waveform_list
- ] # these lengths will be used to remove padding later
- # Batch pad the waveform
- max_length = max(original_waveform_lengths)
- padded_waveform_batch = np.array(
- [
- np.pad(waveform, (0, max_length - len(waveform)), mode="constant", constant_values=0)
- for waveform in waveform_list
- ],
- dtype=dtype,
- )
- # Promote to float64, since np.fft uses float64 internally
- padded_waveform_batch = padded_waveform_batch.astype(np.float64)
- window = window.astype(np.float64)
- # Split waveform into frames of frame_length size
- num_frames = int(1 + np.floor((padded_waveform_batch.shape[1] - frame_length) / hop_length))
- # these lengths will be used to remove padding later
- true_num_frames = [int(1 + np.floor((length - frame_length) / hop_length)) for length in original_waveform_lengths]
- num_batches = padded_waveform_batch.shape[0]
- num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length
- spectrogram = np.empty((num_batches, num_frames, num_frequency_bins), dtype=np.complex64)
- # rfft is faster than fft
- fft_func = np.fft.rfft if onesided else np.fft.fft
- buffer = np.zeros((num_batches, fft_length))
- for frame_idx in range(num_frames):
- timestep = frame_idx * hop_length
- buffer[:, :frame_length] = padded_waveform_batch[:, timestep : timestep + frame_length]
- if remove_dc_offset:
- buffer[:, :frame_length] -= buffer[:, :frame_length].mean(axis=1, keepdims=True)
- if preemphasis is not None:
- buffer[:, 1:frame_length] -= preemphasis * buffer[:, : frame_length - 1]
- buffer[:, 0] *= 1 - preemphasis
- buffer[:, :frame_length] *= window
- spectrogram[:, frame_idx] = fft_func(buffer)
- # Note: ** is much faster than np.power
- if power is not None:
- spectrogram = np.abs(spectrogram, dtype=np.float64) ** power
- # Apply mel filters if provided
- if mel_filters is not None:
- result = np.tensordot(spectrogram, mel_filters.T, axes=([2], [1]))
- spectrogram = np.maximum(mel_floor, result)
- # Convert to log scale if specified
- if power is not None and log_mel is not None:
- if log_mel == "log":
- spectrogram = np.log(spectrogram)
- elif log_mel == "log10":
- spectrogram = np.log10(spectrogram)
- elif log_mel == "dB":
- if power == 1.0:
- spectrogram = amplitude_to_db_batch(spectrogram, reference, min_value, db_range)
- elif power == 2.0:
- spectrogram = power_to_db_batch(spectrogram, reference, min_value, db_range)
- else:
- raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}")
- else:
- raise ValueError(f"Unknown log_mel option: {log_mel}")
- spectrogram = np.asarray(spectrogram, dtype)
- spectrogram_list = [spectrogram[i, : true_num_frames[i], :].T for i in range(len(true_num_frames))]
- return spectrogram_list
- def power_to_db(
- spectrogram: np.ndarray,
- reference: float = 1.0,
- min_value: float = 1e-10,
- db_range: Optional[float] = None,
- ) -> np.ndarray:
- """
- Converts a power spectrogram to the decibel scale. This computes `10 * log10(spectrogram / reference)`, using basic
- logarithm properties for numerical stability.
- The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a
- linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it.
- This means that large variations in energy may not sound all that different if the sound is loud to begin with.
- This compression operation makes the (mel) spectrogram features match more closely what humans actually hear.
- Based on the implementation of `librosa.power_to_db`.
- Args:
- spectrogram (`np.ndarray`):
- The input power (mel) spectrogram. Note that a power spectrogram has the amplitudes squared!
- reference (`float`, *optional*, defaults to 1.0):
- Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
- the loudest part to 0 dB. Must be greater than zero.
- min_value (`float`, *optional*, defaults to `1e-10`):
- The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
- `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero.
- db_range (`float`, *optional*):
- Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
- peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
- Returns:
- `np.ndarray`: the spectrogram in decibels
- """
- if reference <= 0.0:
- raise ValueError("reference must be greater than zero")
- if min_value <= 0.0:
- raise ValueError("min_value must be greater than zero")
- reference = max(min_value, reference)
- spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None)
- spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference))
- if db_range is not None:
- if db_range <= 0.0:
- raise ValueError("db_range must be greater than zero")
- spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None)
- return spectrogram
- def power_to_db_batch(
- spectrogram: np.ndarray,
- reference: float = 1.0,
- min_value: float = 1e-10,
- db_range: Optional[float] = None,
- ) -> np.ndarray:
- """
- Converts a batch of power spectrograms to the decibel scale. This computes `10 * log10(spectrogram / reference)`,
- using basic logarithm properties for numerical stability.
- This function supports batch processing, where each item in the batch is an individual power (mel) spectrogram.
- Args:
- spectrogram (`np.ndarray`):
- The input batch of power (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape).
- Note that a power spectrogram has the amplitudes squared!
- reference (`float`, *optional*, defaults to 1.0):
- Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
- the loudest part to 0 dB. Must be greater than zero.
- min_value (`float`, *optional*, defaults to `1e-10`):
- The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
- `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero.
- db_range (`float`, *optional*):
- Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
- peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
- Returns:
- `np.ndarray`: the batch of spectrograms in decibels
- """
- if reference <= 0.0:
- raise ValueError("reference must be greater than zero")
- if min_value <= 0.0:
- raise ValueError("min_value must be greater than zero")
- reference = max(min_value, reference)
- spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None)
- spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference))
- if db_range is not None:
- if db_range <= 0.0:
- raise ValueError("db_range must be greater than zero")
- # Apply db_range clipping per batch item
- max_values = spectrogram.max(axis=(1, 2), keepdims=True)
- spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None)
- return spectrogram
- def amplitude_to_db(
- spectrogram: np.ndarray,
- reference: float = 1.0,
- min_value: float = 1e-5,
- db_range: Optional[float] = None,
- ) -> np.ndarray:
- """
- Converts an amplitude spectrogram to the decibel scale. This computes `20 * log10(spectrogram / reference)`, using
- basic logarithm properties for numerical stability.
- The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a
- linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it.
- This means that large variations in energy may not sound all that different if the sound is loud to begin with.
- This compression operation makes the (mel) spectrogram features match more closely what humans actually hear.
- Args:
- spectrogram (`np.ndarray`):
- The input amplitude (mel) spectrogram.
- reference (`float`, *optional*, defaults to 1.0):
- Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
- the loudest part to 0 dB. Must be greater than zero.
- min_value (`float`, *optional*, defaults to `1e-5`):
- The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
- `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero.
- db_range (`float`, *optional*):
- Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
- peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
- Returns:
- `np.ndarray`: the spectrogram in decibels
- """
- if reference <= 0.0:
- raise ValueError("reference must be greater than zero")
- if min_value <= 0.0:
- raise ValueError("min_value must be greater than zero")
- reference = max(min_value, reference)
- spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None)
- spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference))
- if db_range is not None:
- if db_range <= 0.0:
- raise ValueError("db_range must be greater than zero")
- spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None)
- return spectrogram
- def amplitude_to_db_batch(
- spectrogram: np.ndarray, reference: float = 1.0, min_value: float = 1e-5, db_range: Optional[float] = None
- ) -> np.ndarray:
- """
- Converts a batch of amplitude spectrograms to the decibel scale. This computes `20 * log10(spectrogram / reference)`,
- using basic logarithm properties for numerical stability.
- The function supports batch processing, where each item in the batch is an individual amplitude (mel) spectrogram.
- Args:
- spectrogram (`np.ndarray`):
- The input batch of amplitude (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape).
- reference (`float`, *optional*, defaults to 1.0):
- Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set
- the loudest part to 0 dB. Must be greater than zero.
- min_value (`float`, *optional*, defaults to `1e-5`):
- The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking
- `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero.
- db_range (`float`, *optional*):
- Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the
- peak value and the smallest value will never be more than 80 dB. Must be greater than zero.
- Returns:
- `np.ndarray`: the batch of spectrograms in decibels
- """
- if reference <= 0.0:
- raise ValueError("reference must be greater than zero")
- if min_value <= 0.0:
- raise ValueError("min_value must be greater than zero")
- reference = max(min_value, reference)
- spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None)
- spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference))
- if db_range is not None:
- if db_range <= 0.0:
- raise ValueError("db_range must be greater than zero")
- # Apply db_range clipping per batch item
- max_values = spectrogram.max(axis=(1, 2), keepdims=True)
- spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None)
- return spectrogram
- ### deprecated functions below this line ###
- def get_mel_filter_banks(
- nb_frequency_bins: int,
- nb_mel_filters: int,
- frequency_min: float,
- frequency_max: float,
- sample_rate: int,
- norm: Optional[str] = None,
- mel_scale: str = "htk",
- ) -> np.array:
- warnings.warn(
- "The function `get_mel_filter_banks` is deprecated and will be removed in version 4.31.0 of Transformers",
- FutureWarning,
- )
- return mel_filter_bank(
- num_frequency_bins=nb_frequency_bins,
- num_mel_filters=nb_mel_filters,
- min_frequency=frequency_min,
- max_frequency=frequency_max,
- sampling_rate=sample_rate,
- norm=norm,
- mel_scale=mel_scale,
- )
- def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = 400, center: bool = True):
- """
- In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed
- segments called `frames`.
- The window length (window_length) defines how much of the signal is contained in each frame, while the hop length
- defines the step between the beginning of each new frame.
- Args:
- waveform (`np.array` of shape `(sample_length,)`):
- The raw waveform which will be split into smaller chunks.
- hop_length (`int`, *optional*, defaults to 160):
- Step between each window of the waveform.
- fft_window_size (`int`, *optional*, defaults to 400):
- Defines the size of the window.
- center (`bool`, defaults to `True`):
- Whether or not to center each frame around the middle of the frame. Centering is done by reflecting the
- waveform on the left and on the right.
- Return:
- framed_waveform (`np.array` of shape `(waveform.shape // hop_length , fft_window_size)`):
- The framed waveforms that can be fed to `np.fft`.
- """
- warnings.warn(
- "The function `fram_wave` is deprecated and will be removed in version 4.31.0 of Transformers",
- FutureWarning,
- )
- frames = []
- for i in range(0, waveform.shape[0] + 1, hop_length):
- if center:
- half_window = (fft_window_size - 1) // 2 + 1
- start = i - half_window if i > half_window else 0
- end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0]
- frame = waveform[start:end]
- if start == 0:
- padd_width = (-i + half_window, 0)
- frame = np.pad(frame, pad_width=padd_width, mode="reflect")
- elif end == waveform.shape[0]:
- padd_width = (0, (i - waveform.shape[0] + half_window))
- frame = np.pad(frame, pad_width=padd_width, mode="reflect")
- else:
- frame = waveform[i : i + fft_window_size]
- frame_width = frame.shape[0]
- if frame_width < waveform.shape[0]:
- frame = np.lib.pad(
- frame, pad_width=(0, fft_window_size - frame_width), mode="constant", constant_values=0
- )
- frames.append(frame)
- frames = np.stack(frames, 0)
- return frames
- def stft(frames: np.array, windowing_function: np.array, fft_window_size: int = None):
- """
- Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same results
- as `torch.stft`.
- Args:
- frames (`np.array` of dimension `(num_frames, fft_window_size)`):
- A framed audio signal obtained using `audio_utils.fram_wav`.
- windowing_function (`np.array` of dimension `(nb_frequency_bins, nb_mel_filters)`:
- A array representing the function that will be used to reduces the amplitude of the discontinuities at the
- boundaries of each frame when computing the STFT. Each frame will be multiplied by the windowing_function.
- For more information on the discontinuities, called *Spectral leakage*, refer to [this
- tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf
- fft_window_size (`int`, *optional*):
- Size of the window om which the Fourier transform is applied. This controls the frequency resolution of the
- spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. The number of
- frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to
- `(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionnally.
- Example:
- ```python
- >>> from transformers.audio_utils import stft, fram_wave
- >>> import numpy as np
- >>> audio = np.random.rand(50)
- >>> fft_window_size = 10
- >>> hop_length = 2
- >>> framed_audio = fram_wave(audio, hop_length, fft_window_size)
- >>> spectrogram = stft(framed_audio, np.hanning(fft_window_size + 1))
- ```
- Returns:
- spectrogram (`np.ndarray`):
- A spectrogram of shape `(num_frames, nb_frequency_bins)` obtained using the STFT algorithm
- """
- warnings.warn(
- "The function `stft` is deprecated and will be removed in version 4.31.0 of Transformers",
- FutureWarning,
- )
- frame_size = frames.shape[1]
- if fft_window_size is None:
- fft_window_size = frame_size
- if fft_window_size < frame_size:
- raise ValueError("FFT size must greater or equal the frame size")
- # number of FFT bins to store
- nb_frequency_bins = (fft_window_size >> 1) + 1
- spectrogram = np.empty((len(frames), nb_frequency_bins), dtype=np.complex64)
- fft_signal = np.zeros(fft_window_size)
- for f, frame in enumerate(frames):
- if windowing_function is not None:
- np.multiply(frame, windowing_function, out=fft_signal[:frame_size])
- else:
- fft_signal[:frame_size] = frame
- spectrogram[f] = np.fft.fft(fft_signal, axis=0)[:nb_frequency_bins]
- return spectrogram.T
|