Source code for dcase_models.data.features

import numpy as np
import librosa
import openl3
import inspect
import sys

from .feature_extractor import FeatureExtractor


__all__ = ['Spectrogram', 'MelSpectrogram',
           'Openl3', 'RawAudio', 'FramesAudio']


[docs]class Spectrogram(FeatureExtractor): """ Spectrogram feature extractor. Extracts the log-scaled spectrogram of the audio signals. The spectrogram is calculated over the whole audio signal and then is separated in overlapped sequences (frames) Notes ----- Based in librosa.core.stft function. Parameters ---------- n_fft : int, default=1024 Number of samples used for FFT calculation. Refer to librosa.core.stft for further information. pad_mode : str or None, default='reflect' Mode of padding applied to the audio signal. This argument is passed to librosa.util.fix_length for padding the signal. If pad_mode is None, no padding is applied. See Also -------- FeatureExtractor : FeatureExtractor base class. MelSpectrogram : MelSpectrogram feature extractor. Examples -------- Extract features of a given file >>> from dcase_models.data.features import Spectrogram >>> from dcase_models.util.files import example_audio_file >>> features = Spectrogram() >>> features_shape = features.get_shape() >>> print(features_shape) (21, 32, 513) >>> file_name = example_audio_file() >>> spectrogram = features.calculate(file_name) >>> print(spectrogram.shape) (3, 32, 513) Extract features for each file in a given dataset. >>> from dcase_models.data.datasets import ESC50 >>> dataset = ESC50('../datasets/ESC50') >>> features.extract(dataset) """
[docs] def __init__(self, sequence_time=1.0, sequence_hop_time=0.5, audio_win=1024, audio_hop=680, sr=22050, n_fft=1024, pad_mode='reflect'): super().__init__(sequence_time=sequence_time, sequence_hop_time=sequence_hop_time, audio_win=audio_win, audio_hop=audio_hop, sr=sr) self.n_fft = n_fft self.pad_mode = pad_mode
[docs] def calculate(self, file_name): audio = self.load_audio(file_name) # Padding if self.pad_mode is not None: audio = librosa.util.fix_length( audio, audio.shape[0] + librosa.core.frames_to_samples( self.sequence_frames, self.audio_hop, n_fft=self.n_fft), axis=0, mode=self.pad_mode ) # Spectrogram, shape (N_frames, N_freqs) stft = librosa.core.stft(audio, n_fft=self.n_fft, hop_length=self.audio_hop, win_length=self.audio_win, center=True) # Power spectrogram = np.abs(stft)**2 # Convert to db spectrogram = librosa.power_to_db(spectrogram) # Transpose time and freq dims, shape spectrogram = spectrogram.T # Convert to sequences (frames), # shape (N_sequences, N_sequence_frames, N_freqs) spectrogram = np.ascontiguousarray(spectrogram) spectrogram = librosa.util.frame( spectrogram, self.sequence_frames, self.sequence_hop, axis=0 ) return spectrogram
[docs]class MelSpectrogram(FeatureExtractor): """ MelSpectrogram feature extractor. Extracts the log-scaled mel-spectrogram of the audio signals. The mel-spectrogram is calculated over the whole audio signal and then is separated in overlapped sequences (frames). Notes ----- Based in `librosa.core.stft` and `librosa.filters.mel` functions. Parameters ---------- n_fft : int, default=1024 Number of samples used for FFT calculation. Refer to `librosa.core.stft` for further information. mel_bands : int, default=64 Number of mel bands. pad_mode : str or None, default='reflect' Mode of padding applied to the audio signal. This argument is passed to librosa.util.fix_length for padding the signal. If pad_mode is None, no padding is applied. kwargs Additional keyword arguments to `librosa.filters.mel`. See Also -------- FeatureExtractor : FeatureExtractor base class Spectrogram : Spectrogram features Examples -------- Extract features of a given file. >>> from dcase_models.data.features import MelSpectrogram >>> from dcase_models.util.files import example_audio_file >>> features = MelSpectrogram() >>> features_shape = features.get_shape() >>> print(features_shape) (21, 32, 64) >>> file_name = example_audio_file() >>> mel_spectrogram = features.calculate(file_name) >>> print(mel_spectrogram.shape) (3, 32, 64) Extract features for each file in a given dataset. >>> from dcase_models.data.datasets import ESC50 >>> dataset = ESC50('../datasets/ESC50') >>> features.extract(dataset) """
[docs] def __init__(self, sequence_time=1.0, sequence_hop_time=0.5, audio_win=1024, audio_hop=680, sr=22050, n_fft=1024, mel_bands=64, pad_mode='reflect', **kwargs): super().__init__(sequence_time=sequence_time, sequence_hop_time=sequence_hop_time, audio_win=audio_win, audio_hop=audio_hop, sr=sr) self.n_fft = n_fft self.pad_mode = pad_mode self.mel_bands = mel_bands kwargs.setdefault('htk', True) kwargs.setdefault('fmax', None) self.mel_basis = librosa.filters.mel( sr, n_fft, mel_bands, **kwargs)
[docs] def calculate(self, file_name): # Load audio audio = self.load_audio(file_name) # if len(audio) < self.audio_win: # return None # Pad audio signal if self.pad_mode is not None: audio = librosa.util.fix_length( audio, audio.shape[0] + librosa.core.frames_to_samples( self.sequence_frames, self.audio_hop, n_fft=self.n_fft), axis=0, mode=self.pad_mode ) # Get the spectrogram, shape (N_freqs, N_frames) stft = librosa.core.stft(audio, n_fft=self.n_fft, hop_length=self.audio_hop, win_length=self.audio_win, center=False) # Convert to power spectrogram = np.abs(stft)**2 # Convert to mel_spectrogram, shape (N_bands, N_frames) mel_spectrogram = self.mel_basis.dot(spectrogram) assert mel_spectrogram.shape[0] == self.mel_bands # Convert to db mel_spectrogram = librosa.power_to_db(mel_spectrogram) # Transpose time and freq dims, shape (N_frames, N_bands) mel_spectrogram = mel_spectrogram.T # Pad the mel_spectrogram, shape (N_frames', N_bands) # mel_spectrogram = librosa.util.fix_length( # mel_spectrogram, # mel_spectrogram.shape[0]+self.sequence_frames, # axis=0, mode='reflect' # ) # Convert to sequences (frames), # shape (N_sequences, N_sequence_frames, N_bands) mel_spectrogram = np.ascontiguousarray(mel_spectrogram) mel_spectrogram = librosa.util.frame( mel_spectrogram, self.sequence_frames, self.sequence_hop, axis=0 ) return mel_spectrogram
[docs]class Openl3(FeatureExtractor): """ Openl3 feature extractor. Based in openl3 library. Parameters ---------- content_type : {'music' or 'env'}, default='env' Type of content used to train the embedding model. Refer to openl3.core.get_audio_embedding. input_repr : {'linear', 'mel128', or 'mel256'} Spectrogram representation used for model. Refer to openl3.core.get_audio_embedding. embedding_size : {6144 or 512}, default=512 Embedding dimensionality. Refer to openl3.core.get_audio_embedding. pad_mode : str or None, default='reflect' Mode of padding applied to the audio signal. This argument is passed to librosa.util.fix_length for padding the signal. If pad_mode is None, no padding is applied. See Also -------- FeatureExtractor : FeatureExtractor base class Spectrogram : Spectrogram features Examples -------- Extract features of a given file. >>> from dcase_models.data.features import Openl3 >>> from dcase_models.util.files import example_audio_file >>> features = Openl3() >>> features_shape = features.get_shape() >>> print(features_shape) (20, 512) >>> file_name = example_audio_file() >>> mel_spectrogram = features.calculate(file_name) >>> print(mel_spectrogram.shape) (3, 512) Extract features for each file in a given dataset. >>> from dcase_models.data.datasets import ESC50 >>> dataset = ESC50('../datasets/ESC50') >>> features.extract(dataset) """
[docs] def __init__(self, sequence_time=1.0, sequence_hop_time=0.5, audio_win=1024, audio_hop=680, sr=22050, content_type="env", input_repr="mel256", embedding_size=512): super().__init__(sequence_time=sequence_time, sequence_hop_time=sequence_hop_time, audio_win=audio_win, audio_hop=audio_hop, sr=sr) self.content_type = content_type self.input_repr = input_repr self.embedding_size = embedding_size
[docs] def calculate(self, file_name): audio = self.load_audio(file_name, change_sampling_rate=False) emb, ts = openl3.get_audio_embedding( audio, self.sr, content_type=self.content_type, embedding_size=self.embedding_size, input_repr=self.input_repr, hop_size=self.sequence_hop_time, verbose=False ) return emb
[docs]class RawAudio(FeatureExtractor): """ RawAudio feature extractor. Load the audio signal and create sequences (overlapped windows) Parameters ---------- pad_mode : str or None, default='reflect' Mode of padding applied to the audio signal. This argument is passed to librosa.util.fix_length for padding the signal. If pad_mode is None, no padding is applied. """
[docs] def __init__(self, sequence_time=1.0, sequence_hop_time=0.5, audio_win=1024, audio_hop=680, sr=22050, pad_mode='reflect'): super().__init__(sequence_time=sequence_time, sequence_hop_time=sequence_hop_time, audio_win=audio_win, audio_hop=audio_hop, sr=sr) self.pad_mode = pad_mode self.sequence_samples = int(librosa.core.frames_to_samples( self.sequence_frames, audio_hop)) self.sequence_hop_samples = int(librosa.core.frames_to_samples( self.sequence_hop, audio_hop))
[docs] def calculate(self, file_name): audio = self.load_audio(file_name, change_sampling_rate=False) if self.pad_mode is not None: audio = librosa.util.fix_length( audio, audio.shape[0] + self.sequence_samples, axis=0, mode=self.pad_mode ) audio = np.ascontiguousarray(audio) audio_seqs = librosa.util.frame( audio, self.sequence_samples, self.sequence_hop_samples, axis=0 ) return audio_seqs
[docs]class FramesAudio(FeatureExtractor): """ FramesAudio feature extractor. Load the audio signal, convert it into time-short frames, and create sequences (overlapped windows). Parameters ---------- pad_mode : str or None, default='reflect' Mode of padding applied to the audio signal. This argument is passed to librosa.util.fix_length for padding the signal. If pad_mode is None, no padding is applied. """
[docs] def __init__(self, sequence_time=1.0, sequence_hop_time=0.5, audio_win=1024, audio_hop=680, sr=22050, n_fft=1024, pad_mode='reflect'): super().__init__(sequence_time=sequence_time, sequence_hop_time=sequence_hop_time, audio_win=audio_win, audio_hop=audio_hop, sr=sr) self.n_fft = n_fft self.pad_mode = pad_mode self.sequence_samples = librosa.core.frames_to_samples( self.sequence_frames, audio_hop, n_fft) self.sequence_hop_samples = librosa.core.frames_to_samples( self.sequence_hop, audio_hop, n_fft)
[docs] def calculate(self, file_name): audio = self.load_audio(file_name, change_sampling_rate=False) if self.pad_mode is not None: audio = librosa.util.fix_length( audio, audio.shape[0] + self.sequence_samples, axis=0, mode=self.pad_mode ) audio = np.ascontiguousarray(audio) audio_frames = librosa.util.frame( audio, self.audio_win, self.audio_hop, axis=0 ) # TODO: ADD WINDOWING audio_frames = np.ascontiguousarray(audio_frames) audio_seqs = librosa.util.frame( audio_frames, self.sequence_frames, self.sequence_hop, axis=0 ) return audio_seqs
def get_available_features(): available_features = {m[0]: m[1] for m in inspect.getmembers( sys.modules[__name__], inspect.isclass) if m[1].__module__ == __name__} return available_features