Source code for dcase_models.model.models

from functools import partial
import inspect
import sys

from keras.layers import GRU, Bidirectional
from keras.layers import TimeDistributed, Activation, Reshape
from keras.layers import GlobalAveragePooling2D
from keras.layers import GlobalMaxPooling2D
from keras.layers import Input, Lambda, Conv2D, MaxPooling2D
from keras.layers import Conv1D
from keras.layers import Dropout, Dense, Flatten
from keras.layers import BatchNormalization
from keras.models import Model
from keras.regularizers import l2
import keras.backend as K
from autopool import AutoPool1D

from .container import KerasModelContainer


# Names exported by ``from dcase_models.model.models import *``.
__all__ = ['MLP', 'SB_CNN', 'SB_CNN_SED', 'A_CRNN',
           'VGGish', 'SMel', 'MST']


class MLP(KerasModelContainer):
    """KerasModelContainer for a generic MLP model.

    Parameters
    ----------
    model, model_path, metrics
        Forwarded unchanged to ``KerasModelContainer.__init__``.
    n_classes : int, default=10
        Number of classes (dimension of the output).
    n_frames : int or None, default=64
        Length of the input (number of frames of each sequence).
        Use None to not use frame-level input and output. In this case the
        input has shape (None, n_freqs), the dense layers are applied
        directly (no TimeDistributed wrapper) and no temporal integration
        is performed.
    n_freqs : int, default=12
        Number of frequency bins. The model's input has shape
        (n_frames, n_freqs).
    hidden_layers_size : list of int, default=[128, 64]
        Dimension of each hidden layer. Note that the length of this list
        defines the number of hidden layers.
    dropout_rates : list of float, default=[0.5, 0.5]
        List of dropout rates used after each hidden layer. The length of
        this list must be equal to the length of hidden_layers_size.
        Use 0.0 (or negative) to not use dropout.
    hidden_activation : str, default='relu'
        Activation for hidden layers.
    l2_reg : float, default=1e-5
        Weight of the l2 regularizers. Use 0.0 to not use regularization.
    final_activation : str, default='softmax'
        Activation of the last layer.
    temporal_integration : {'mean', 'sum', 'autopool'}, default='mean'
        Temporal integration operation used after last layer. Any other
        value leaves the frame-level output untouched.
    kwargs
        Additional keyword arguments to `Dense layers`.

    Attributes
    ----------
    model : keras.models.Model
        Keras model.

    Examples
    --------
    >>> from dcase_models.model.models import MLP
    >>> model_container = MLP()
    >>> model_container.model.summary()
    _________________________________________________________________
    Layer (type)                 Output Shape              Param #
    =================================================================
    input (InputLayer)           (None, 64, 12)            0
    _________________________________________________________________
    time_distributed_1 (TimeDist (None, 64, 128)           1664
    _________________________________________________________________
    dropout_1 (Dropout)          (None, 64, 128)           0
    _________________________________________________________________
    time_distributed_2 (TimeDist (None, 64, 64)            8256
    _________________________________________________________________
    dropout_2 (Dropout)          (None, 64, 64)            0
    _________________________________________________________________
    time_distributed_3 (TimeDist (None, 64, 10)            650
    _________________________________________________________________
    temporal_integration (Lambda (None, 10)                0
    =================================================================
    Total params: 10,570
    Trainable params: 10,570
    Non-trainable params: 0
    _________________________________________________________________

    """

    def __init__(self, model=None, model_path=None,
                 metrics=['classification'], n_classes=10,
                 n_frames=64, n_freqs=12,
                 hidden_layers_size=[128, 64],
                 dropout_rates=[0.5, 0.5], hidden_activation='relu',
                 l2_reg=1e-5, final_activation='softmax',
                 temporal_integration='mean', **kwargs):
        """Store the hyper-parameters and initialize the container."""
        # The list defaults are kept for interface compatibility; this class
        # only reads them and never mutates them.
        self.n_classes = n_classes
        self.n_frames = n_frames
        self.n_freqs = n_freqs
        self.hidden_layers_size = hidden_layers_size
        self.dropout_rates = dropout_rates
        self.l2_reg = l2_reg
        self.temporal_integration = temporal_integration
        # Frame-level (TimeDistributed) mode is enabled unless n_frames is None.
        self.use_time_distributed = n_frames is not None
        self.hidden_activation = hidden_activation
        self.final_activation = final_activation
        self.kwargs = kwargs

        # NOTE(review): attributes are assigned before the base initializer,
        # presumably because it may trigger build() -- confirm in container.
        super().__init__(model=model, model_path=model_path,
                         model_name='MLP', metrics=metrics)

    def build(self):
        """Build the MLP Keras model and store it in ``self.model``.

        Raises
        ------
        ValueError
            If ``dropout_rates`` has fewer entries than
            ``hidden_layers_size``.
        """
        if len(self.dropout_rates) < len(self.hidden_layers_size):
            raise ValueError(
                'dropout_rates must provide one rate per hidden layer '
                '(got {} rates for {} layers)'.format(
                    len(self.dropout_rates), len(self.hidden_layers_size))
            )

        # Input
        if self.use_time_distributed:
            input_shape = (self.n_frames, self.n_freqs)
        else:
            # Must be a one-element tuple: `(self.n_freqs)` is a plain int,
            # which Keras does not accept as a shape.
            input_shape = (self.n_freqs,)

        inputs = Input(shape=input_shape, dtype='float32', name='input')

        # Hidden layers. `y` starts at the input so the model is still valid
        # when hidden_layers_size is empty.
        y = inputs
        layer_specs = zip(self.hidden_layers_size, self.dropout_rates)
        for idx, (units, rate) in enumerate(layer_specs, start=1):
            dense_layer = Dense(units,
                                activation=self.hidden_activation,
                                kernel_regularizer=l2(self.l2_reg),
                                name='dense_{}'.format(idx),
                                **self.kwargs)
            if self.use_time_distributed:
                y = TimeDistributed(dense_layer)(y)
            else:
                y = dense_layer(y)

            # Dropout (skipped for zero or negative rates)
            if rate > 0:
                y = Dropout(rate)(y)

        # Output layer
        dense_layer = Dense(self.n_classes,
                            activation=self.final_activation,
                            kernel_regularizer=l2(self.l2_reg),
                            name='output',
                            **self.kwargs)
        if self.use_time_distributed:
            y = TimeDistributed(dense_layer)(y)
        else:
            y = dense_layer(y)

        # Temporal integration only makes sense for frame-level output; on a
        # (batch, n_classes) tensor axis 1 is the class axis, and reducing it
        # would destroy the prediction.
        if self.use_time_distributed:
            if self.temporal_integration == 'mean':
                y = Lambda(lambda x: K.mean(x, 1),
                           name='temporal_integration')(y)
            elif self.temporal_integration == 'sum':
                y = Lambda(lambda x: K.sum(x, 1),
                           name='temporal_integration')(y)
            elif self.temporal_integration == 'autopool':
                y = AutoPool1D(axis=1, name='output')(y)

        # Create model
        self.model = Model(inputs=inputs, outputs=y, name='model')

        super().build()
class SB_CNN(KerasModelContainer):
    """KerasModelContainer for SB_CNN model.

    J. Salamon and J. P. Bello.
    "Deep Convolutional Neural Networks and Data Augmentation For
    Environmental Sound Classification".
    IEEE Signal Processing Letters, 24(3), pages 279 - 283.
    2017.

    Notes
    -----
    Code based on Salamon's implementation
    https://github.com/justinsalamon/scaper_waspaa2017

    Parameters
    ----------
    model, model_path, metrics
        Forwarded unchanged to ``KerasModelContainer.__init__``.
    n_classes : int, default=10
        Number of classes (dimension of the output).
    n_frames_cnn : int or None, default=64
        Length of the input (number of frames of each sequence).
    n_freq_cnn : int, default=128
        Number of frequency bins. The model's input has shape
        (n_frames, n_freqs).
    filter_size_cnn : tuple, default=(5,5)
        Kernel dimension for convolutional layers.
    pool_size_cnn : tuple, default=(2,2)
        Pooling dimension for maxpooling layers. The value is stored on the
        instance, but ``build`` currently uses the fixed pool sizes (2, 2)
        and (4, 2).
    n_dense_cnn : int, default=64
        Dimension of penultimate dense layer.
    n_channels : int, default=0
        Number of input channels

        0 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn)
        1 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 1)
        2 : stereo signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 2)
        n > 2 : multi-representations.
            Input shape = (n_frames_cnn, n_freq_cnn, n_channels)

    Attributes
    ----------
    model : keras.models.Model
        Keras model.

    Examples
    --------
    >>> from dcase_models.model.models import SB_CNN
    >>> model_container = SB_CNN()
    >>> model_container.model.summary()
    _________________________________________________________________
    Layer (type)                 Output Shape              Param #
    =================================================================
    input (InputLayer)           (None, 64, 128)           0
    _________________________________________________________________
    lambda (Lambda)              (None, 64, 128, 1)        0
    _________________________________________________________________
    conv1 (Conv2D)               (None, 60, 124, 24)       624
    _________________________________________________________________
    maxpool1 (MaxPooling2D)      (None, 30, 62, 24)        0
    _________________________________________________________________
    batchnorm1 (BatchNormalizati (None, 30, 62, 24)        96
    _________________________________________________________________
    conv2 (Conv2D)               (None, 26, 58, 48)        28848
    _________________________________________________________________
    maxpool2 (MaxPooling2D)      (None, 6, 29, 48)         0
    _________________________________________________________________
    batchnorm2 (BatchNormalizati (None, 6, 29, 48)         192
    _________________________________________________________________
    conv3 (Conv2D)               (None, 2, 25, 48)         57648
    _________________________________________________________________
    batchnorm3 (BatchNormalizati (None, 2, 25, 48)         192
    _________________________________________________________________
    flatten (Flatten)            (None, 2400)              0
    _________________________________________________________________
    dropout1 (Dropout)           (None, 2400)              0
    _________________________________________________________________
    dense1 (Dense)               (None, 64)                153664
    _________________________________________________________________
    dropout2 (Dropout)           (None, 64)                0
    _________________________________________________________________
    out (Dense)                  (None, 10)                650
    =================================================================
    Total params: 241,914
    Trainable params: 241,674
    Non-trainable params: 240
    _________________________________________________________________

    """

    def __init__(self, model=None, model_path=None,
                 metrics=['classification'], n_classes=10,
                 n_frames_cnn=64, n_freq_cnn=128,
                 filter_size_cnn=(5, 5), pool_size_cnn=(2, 2),
                 n_dense_cnn=64, n_channels=0):
        """Initialization of the SB-CNN model."""
        self.n_classes = n_classes
        self.n_frames_cnn = n_frames_cnn
        self.n_freq_cnn = n_freq_cnn
        self.filter_size_cnn = filter_size_cnn
        self.pool_size_cnn = pool_size_cnn
        self.n_dense_cnn = n_dense_cnn
        self.n_channels = n_channels

        # NOTE(review): attributes are assigned before the base initializer,
        # presumably because it may trigger build() -- confirm in container.
        super().__init__(
            model=model, model_path=model_path,
            model_name='SB_CNN', metrics=metrics
        )

    def build(self):
        """Builds the CNN Keras model according to the initialized parameters.

        The resulting model is stored in ``self.model``.
        """
        if self.n_channels == 0:
            # Mono input without a channel axis: append one for Conv2D.
            x = Input(shape=(self.n_frames_cnn, self.n_freq_cnn),
                      dtype='float32', name='input')
            y = Lambda(lambda x: K.expand_dims(x, -1), name='lambda')(x)
        else:
            x = Input(
                shape=(self.n_frames_cnn, self.n_freq_cnn, self.n_channels),
                dtype='float32', name='input'
            )
            # Identity layer so both branches expose a layer named 'lambda'.
            y = Lambda(lambda x: x, name='lambda')(x)

        # CONV 1
        y = Conv2D(24, self.filter_size_cnn, padding='valid',
                   activation='relu', name='conv1')(y)
        y = MaxPooling2D(pool_size=(2, 2), strides=None,
                         padding='valid', name='maxpool1')(y)
        y = BatchNormalization(name='batchnorm1')(y)

        # CONV 2
        y = Conv2D(48, self.filter_size_cnn, padding='valid',
                   activation='relu', name='conv2')(y)
        y = MaxPooling2D(pool_size=(4, 2), strides=None,
                         padding='valid', name='maxpool2')(y)
        y = BatchNormalization(name='batchnorm2')(y)

        # CONV 3 (no pooling after the last convolution)
        y = Conv2D(48, self.filter_size_cnn, padding='valid',
                   activation='relu', name='conv3')(y)
        y = BatchNormalization(name='batchnorm3')(y)

        # Flatten and dense layers
        y = Flatten(name='flatten')(y)
        y = Dropout(0.5, name='dropout1')(y)
        y = Dense(self.n_dense_cnn, activation='relu',
                  kernel_regularizer=l2(0.001),
                  bias_regularizer=l2(0.001), name='dense1')(y)
        y = Dropout(0.5, name='dropout2')(y)
        y = Dense(self.n_classes, activation='softmax',
                  kernel_regularizer=l2(0.001),
                  bias_regularizer=l2(0.001), name='out')(y)

        # Create the Keras model
        self.model = Model(inputs=x, outputs=y)

        # Every other container in this module finishes build() by
        # delegating to the base class; do the same here.
        super().build()

    def sub_model(self):
        """Return a model that outputs the activations of layer 'dense1'.

        Example of how to define a new model based on the original one: the
        returned model shares the input of ``self.model`` and ends at the
        penultimate dense layer.

        Returns
        -------
        keras.models.Model
            Model mapping ``self.model.input`` to the 'dense1' output.
        """
        new_model = Model(inputs=self.model.input,
                          outputs=self.model.get_layer('dense1').output)
        return new_model

    # def train(...):
    #     i.e. if you want to redefine the train function
class SB_CNN_SED(KerasModelContainer):
    """KerasModelContainer for SB_CNN_SED model.

    J. Salamon, D. MacConnell, M. Cartwright, P. Li, and J. P. Bello.
    "Scaper: A Library for Soundscape Synthesis and Augmentation".
    IEEE Workshop on Applications of Signal Processing to
    Audio and Acoustics (WASPAA).
    New Paltz, NY, USA, Oct. 2017

    Notes
    -----
    Code based on Salamon's implementation
    https://github.com/justinsalamon/scaper_waspaa2017

    Parameters
    ----------
    model, model_path, metrics
        Forwarded unchanged to ``KerasModelContainer.__init__``.
    n_classes : int, default=10
        Number of classes (dimension of the output).
    n_frames_cnn : int or None, default=64
        Length of the input (number of frames of each sequence).
    n_freq_cnn : int, default=128
        Number of frequency bins. The model's input has shape
        (n_frames, n_freqs).
    filter_size_cnn : tuple, default=(5,5)
        Kernel dimension for convolutional layers.
    pool_size_cnn : tuple, default=(2,2)
        Pooling dimension for maxpooling layers.
    large_cnn : bool, default=False
        If large_cnn is true, add another dense layer after the penultimate
        layer. In that case ``build`` also overrides ``n_filters_cnn`` and
        ``n_dense_cnn`` with 128.
    n_dense_cnn : int, default=64
        Dimension of penultimate dense layer.
    n_filters_cnn : int, default=64
        Number of filters of each convolutional layer.
    n_chanels : int, default=0
        Legacy (misspelled) name of ``n_channels``, kept so existing callers
        keep working. Ignored when ``n_channels`` is given.
    n_channels : int or None, default=None
        Number of input channels. None means "use ``n_chanels``".

        0 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn)
        1 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 1)
        2 : stereo signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 2)
        n > 2 : multi-representations.
            Input shape = (n_frames_cnn, n_freq_cnn, n_channels)

    Attributes
    ----------
    model : keras.models.Model
        Keras model.

    Examples
    --------
    >>> from dcase_models.model.models import SB_CNN_SED
    >>> model_container = SB_CNN_SED()
    >>> model_container.model.summary()
    _________________________________________________________________
    Layer (type)                 Output Shape              Param #
    =================================================================
    input_1 (InputLayer)         (None, 64, 128)           0
    _________________________________________________________________
    lambda_1 (Lambda)            (None, 64, 128, 1)        0
    _________________________________________________________________
    conv2d_1 (Conv2D)            (None, 60, 124, 64)       1664
    _________________________________________________________________
    max_pooling2d_1 (MaxPooling2 (None, 30, 62, 64)        0
    _________________________________________________________________
    batch_normalization_1 (Batch (None, 30, 62, 64)        256
    _________________________________________________________________
    conv2d_2 (Conv2D)            (None, 26, 58, 64)        102464
    _________________________________________________________________
    max_pooling2d_2 (MaxPooling2 (None, 13, 29, 64)        0
    _________________________________________________________________
    batch_normalization_2 (Batch (None, 13, 29, 64)        256
    _________________________________________________________________
    conv2d_3 (Conv2D)            (None, 9, 25, 64)         102464
    _________________________________________________________________
    batch_normalization_3 (Batch (None, 9, 25, 64)         256
    _________________________________________________________________
    flatten_1 (Flatten)          (None, 14400)             0
    _________________________________________________________________
    dropout_3 (Dropout)          (None, 14400)             0
    _________________________________________________________________
    dense_1 (Dense)              (None, 64)                921664
    _________________________________________________________________
    dropout_4 (Dropout)          (None, 64)                0
    _________________________________________________________________
    dense_2 (Dense)              (None, 10)                650
    =================================================================
    Total params: 1,129,674
    Trainable params: 1,129,290
    Non-trainable params: 384
    _________________________________________________________________

    """

    def __init__(self, model=None, model_path=None, metrics=['sed'],
                 n_classes=10, n_frames_cnn=64, n_freq_cnn=128,
                 filter_size_cnn=(5, 5), pool_size_cnn=(2, 2),
                 large_cnn=False, n_dense_cnn=64,
                 n_filters_cnn=64, n_chanels=0, n_channels=None):
        """Initialization of the SB-CNN-SED model."""
        # Accept the documented spelling while keeping the legacy keyword.
        if n_channels is None:
            n_channels = n_chanels

        self.n_classes = n_classes
        self.n_frames_cnn = n_frames_cnn
        self.n_freq_cnn = n_freq_cnn
        self.filter_size_cnn = filter_size_cnn
        self.pool_size_cnn = pool_size_cnn
        self.large_cnn = large_cnn
        self.n_dense_cnn = n_dense_cnn
        self.n_filters_cnn = n_filters_cnn
        # Both attribute spellings are kept in sync for backward
        # compatibility with code reading the legacy name.
        self.n_chanels = n_channels
        self.n_channels = n_channels

        # NOTE(review): attributes are assigned before the base initializer,
        # presumably because it may trigger build() -- confirm in container.
        super().__init__(model=model, model_path=model_path,
                         model_name='SB_CNN_SED', metrics=metrics)

    def build(self):
        """Builds the SB-CNN-SED Keras model and stores it in ``self.model``.

        When ``large_cnn`` is True, ``n_filters_cnn`` and ``n_dense_cnn``
        are overwritten with 128 on the instance before the layers are
        created, and one extra dropout + dense pair is added.
        """
        if self.large_cnn:
            self.n_filters_cnn = 128
            self.n_dense_cnn = 128

        # INPUT
        if self.n_channels == 0:
            # Mono input without a channel axis: append one for Conv2D.
            x = Input(shape=(self.n_frames_cnn, self.n_freq_cnn),
                      dtype='float32')
            y = Lambda(lambda x: K.expand_dims(x, -1))(x)
        else:
            x = Input(
                shape=(self.n_frames_cnn, self.n_freq_cnn, self.n_channels),
                dtype='float32'
            )
            # Identity layer, mirroring the other CNN containers, so both
            # branches have the same number of layers.
            y = Lambda(lambda x: x)(x)

        # CONV 1
        y = Conv2D(self.n_filters_cnn, self.filter_size_cnn,
                   padding='valid', activation='relu')(y)
        y = MaxPooling2D(pool_size=self.pool_size_cnn,
                         strides=None, padding='valid')(y)
        y = BatchNormalization()(y)

        # CONV 2
        y = Conv2D(self.n_filters_cnn, self.filter_size_cnn,
                   padding='valid', activation='relu')(y)
        y = MaxPooling2D(pool_size=self.pool_size_cnn,
                         strides=None, padding='valid')(y)
        y = BatchNormalization()(y)

        # CONV 3 (no max-pooling after the last convolution)
        y = Conv2D(self.n_filters_cnn, self.filter_size_cnn,
                   padding='valid', activation='relu')(y)
        y = BatchNormalization()(y)

        # Flatten for dense layers
        y = Flatten()(y)
        y = Dropout(0.5)(y)
        y = Dense(self.n_dense_cnn, activation='relu')(y)
        if self.large_cnn:
            y = Dropout(0.5)(y)
            y = Dense(self.n_dense_cnn, activation='relu')(y)
        y = Dropout(0.5)(y)
        y = Dense(self.n_classes, activation='sigmoid')(y)

        self.model = Model(inputs=x, outputs=y)

        super().build()
class A_CRNN(KerasModelContainer):
    """KerasModelContainer for A_CRNN model.

    S. Adavanne, P. Pertilä, T. Virtanen
    "Sound event detection using spatial features and
    convolutional recurrent neural network"
    International Conference on Acoustics, Speech, and Signal Processing.
    2017. https://arxiv.org/pdf/1706.02291.pdf

    Notes
    -----
    Code based on Adavanne's implementation
    https://github.com/sharathadavanne/sed-crnn

    Parameters
    ----------
    model, model_path, metrics
        Forwarded unchanged to ``KerasModelContainer.__init__``.
    n_classes : int, default=10
        Number of classes (dimension of the output).
    n_frames_cnn : int or None, default=64
        Length of the input (number of frames of each sequence).
    n_freq_cnn : int, default=128
        Number of frequency bins. The model's input has shape
        (n_frames, n_freqs).
    cnn_nb_filt : int, default=128
        Number of filters used in convolutional layers.
    cnn_pool_size : list, default=[5, 2, 2]
        Pooling dimension for maxpooling layers. One convolutional block is
        created per entry, and the pooling is applied with
        ``pool_size=(1, value)``.
    rnn_nb : list, default=[32, 32]
        Number of units in each recurrent layer.
    fc_nb : list, default=[32]
        Number of units in each dense layer.
    dropout_rate : float, default=0.5
        Dropout rate.
    n_channels : int, default=0
        Number of input channels

        0 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn)
        1 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 1)
        2 : stereo signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 2)
        n > 2 : multi-representations.
            Input shape = (n_frames_cnn, n_freq_cnn, n_channels)
    final_activation : str, default='softmax'
        Activation of the last layer.
    sed : bool, default=False
        If sed is True, the output is frame-level. If False the output is
        time averaged.
    bidirectional : bool, default=False
        If bidirectional is True, the recurrent layers are bidirectional.

    Attributes
    ----------
    model : keras.models.Model
        Keras model.

    Examples
    --------
    >>> from dcase_models.model.models import A_CRNN
    >>> model_container = A_CRNN()
    >>> model_container.model.summary()
    _________________________________________________________________
    Layer (type)                 Output Shape              Param #
    =================================================================
    input (InputLayer)           (None, 64, 128)           0
    _________________________________________________________________
    lambda (Lambda)              (None, 64, 128, 1)        0
    _________________________________________________________________
    conv2d_7 (Conv2D)            (None, 64, 128, 128)      1280
    _________________________________________________________________
    batch_normalization_7 (Batch (None, 64, 128, 128)      512
    _________________________________________________________________
    activation_4 (Activation)    (None, 64, 128, 128)      0
    _________________________________________________________________
    max_pooling2d_6 (MaxPooling2 (None, 64, 25, 128)       0
    _________________________________________________________________
    dropout_9 (Dropout)          (None, 64, 25, 128)       0
    _________________________________________________________________
    conv2d_8 (Conv2D)            (None, 64, 25, 128)       147584
    _________________________________________________________________
    batch_normalization_8 (Batch (None, 64, 25, 128)       100
    _________________________________________________________________
    activation_5 (Activation)    (None, 64, 25, 128)       0
    _________________________________________________________________
    max_pooling2d_7 (MaxPooling2 (None, 64, 12, 128)       0
    _________________________________________________________________
    dropout_10 (Dropout)         (None, 64, 12, 128)       0
    _________________________________________________________________
    conv2d_9 (Conv2D)            (None, 64, 12, 128)       147584
    _________________________________________________________________
    batch_normalization_9 (Batch (None, 64, 12, 128)       48
    _________________________________________________________________
    activation_6 (Activation)    (None, 64, 12, 128)       0
    _________________________________________________________________
    max_pooling2d_8 (MaxPooling2 (None, 64, 6, 128)        0
    _________________________________________________________________
    dropout_11 (Dropout)         (None, 64, 6, 128)        0
    _________________________________________________________________
    reshape_2 (Reshape)          (None, 64, 768)           0
    _________________________________________________________________
    gru_3 (GRU)                  (None, 64, 32)            76896
    _________________________________________________________________
    gru_4 (GRU)                  (None, 64, 32)            6240
    _________________________________________________________________
    time_distributed_6 (TimeDist (None, 64, 32)            1056
    _________________________________________________________________
    dropout_12 (Dropout)         (None, 64, 32)            0
    _________________________________________________________________
    time_distributed_7 (TimeDist (None, 64, 10)            330
    _________________________________________________________________
    mean (Lambda)                (None, 10)                0
    _________________________________________________________________
    strong_out (Activation)      (None, 10)                0
    =================================================================
    Total params: 381,630
    Trainable params: 381,300
    Non-trainable params: 330
    _________________________________________________________________

    """

    def __init__(self, model=None, model_path=None, metrics=['sed'],
                 n_classes=10, n_frames_cnn=64, n_freq_cnn=128,
                 cnn_nb_filt=128, cnn_pool_size=[5, 2, 2], rnn_nb=[32, 32],
                 fc_nb=[32], dropout_rate=0.5, n_channels=0,
                 final_activation='softmax', sed=False,
                 bidirectional=False):
        """Keep the hyper-parameters on the instance and set up the container."""
        # Input / output dimensions
        self.n_classes = n_classes
        self.n_frames_cnn = n_frames_cnn
        self.n_freq_cnn = n_freq_cnn
        # Convolutional, recurrent and dense stack configuration
        self.cnn_nb_filt = cnn_nb_filt
        self.cnn_pool_size = cnn_pool_size
        self.rnn_nb = rnn_nb
        self.fc_nb = fc_nb
        self.dropout_rate = dropout_rate
        self.n_channels = n_channels
        # Output behaviour
        self.final_activation = final_activation
        self.sed = sed
        self.bidirectional = bidirectional

        # NOTE(review): attributes are assigned before the base initializer,
        # presumably because it may trigger build() -- confirm in container.
        super().__init__(
            model=model, model_path=model_path,
            model_name='A_CRNN', metrics=metrics
        )

    def build(self):
        """Assemble the convolutional-recurrent network into ``self.model``."""

        def recurrent_layer(units):
            # One GRU returning full sequences, optionally wrapped so that it
            # runs in both directions with multiplicative merging.
            gru = GRU(units, activation='tanh',
                      dropout=self.dropout_rate,
                      recurrent_dropout=self.dropout_rate,
                      return_sequences=True)
            if self.bidirectional:
                return Bidirectional(gru, merge_mode='mul')
            return gru

        # Input, with a trailing channel axis added for mono spectrograms.
        if self.n_channels == 0:
            x = Input(shape=(self.n_frames_cnn, self.n_freq_cnn),
                      dtype='float32', name='input')
            features = Lambda(
                lambda t: K.expand_dims(t, -1), name='lambda')(x)
        else:
            x = Input(
                shape=(self.n_frames_cnn, self.n_freq_cnn, self.n_channels),
                dtype='float32', name='input'
            )
            features = Lambda(lambda t: t, name='lambda')(x)

        # Convolutional blocks: one per configured pooling factor.
        for pool in self.cnn_pool_size:
            features = Conv2D(filters=self.cnn_nb_filt,
                              kernel_size=(3, 3),
                              padding='same')(features)
            # NOTE(review): axis=2 is the n_freq_cnn axis of the input built
            # above (assuming channels-last data) -- confirm this is intended
            # rather than the channel axis.
            features = BatchNormalization(axis=2)(features)
            features = Activation('relu')(features)
            # Pooling factor (1, pool) leaves the frame axis untouched.
            features = MaxPooling2D(pool_size=(1, pool))(features)
            features = Dropout(self.dropout_rate)(features)

        # Merge the two trailing axes into one feature vector per frame.
        features = Reshape((self.n_frames_cnn, -1))(features)

        # Recurrent stack
        for units in self.rnn_nb:
            features = recurrent_layer(units)(features)

        # Frame-wise dense stack
        for units in self.fc_nb:
            features = TimeDistributed(Dense(units))(features)
            features = Dropout(self.dropout_rate)(features)

        features = TimeDistributed(Dense(self.n_classes))(features)

        # Clip-level output: average the frame-level scores over axis 1.
        if not self.sed:
            features = Lambda(lambda t: K.mean(t, 1), name='mean')(features)

        out = Activation(self.final_activation, name='strong_out')(features)

        self.model = Model(inputs=x, outputs=out)

        super().build()
class VGGish(KerasModelContainer):
    """KerasModelContainer for VGGish model

    Jort F. Gemmeke et al.
    Audio Set: An ontology and human-labeled dataset for audio events
    International Conference on Acoustics, Speech, and Signal Processing.
    New Orleans, LA, 2017.

    Notes
    -----
    https://research.google.com/audioset/
    Based on vggish-keras https://pypi.org/project/vggish-keras/

    Parameters
    ----------
    model, model_path, metrics
        Forwarded unchanged to ``KerasModelContainer.__init__``.
    n_frames_cnn : int or None, default=96
        Length of the input (number of frames of each sequence).
    n_freq_cnn : int, default=64
        Number of frequency bins. The model's input has shape
        (n_frames, n_freqs).
    n_classes : int, default=10
        Number of classes (dimension of the output). Stored on the instance;
        not read by ``build``.
    n_channels : int, default=0
        Number of input channels

        0 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn)
        1 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 1)
        2 : stereo signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 2)
        n > 2 : multi-representations.
            Input shape = (n_frames_cnn, n_freq_cnn, n_channels)
    embedding_size : int, default=128
        Number of units in the embeddings layer (used only when
        include_top is True).
    pooling : {'avg', 'max'}, default='avg'
        Use global average pooling or global max pooling (used only when
        include_top is False). Any other value disables the pooling.
    include_top : bool, default=False
        Include fully-connected layers.
    compress : bool, default=False
        Apply PCA. Stored on the instance; not read by ``build``.

    Attributes
    ----------
    model : keras.models.Model
        Keras model.

    Examples
    --------
    >>> from dcase_models.model.models import VGGish
    >>> model_container = VGGish()
    >>> model_container.model.summary()
    _________________________________________________________________
    Layer (type)                 Output Shape              Param #
    =================================================================
    input (InputLayer)           (None, 96, 64)            0
    _________________________________________________________________
    lambda (Lambda)              (None, 96, 64, 1)         0
    _________________________________________________________________
    conv1 (Conv2D)               (None, 96, 64, 64)        640
    _________________________________________________________________
    pool1 (MaxPooling2D)         (None, 48, 32, 64)        0
    _________________________________________________________________
    conv2 (Conv2D)               (None, 48, 32, 128)       73856
    _________________________________________________________________
    pool2 (MaxPooling2D)         (None, 24, 16, 128)       0
    _________________________________________________________________
    conv3/conv3_1 (Conv2D)       (None, 24, 16, 256)       295168
    _________________________________________________________________
    conv3/conv3_2 (Conv2D)       (None, 24, 16, 256)       590080
    _________________________________________________________________
    pool3 (MaxPooling2D)         (None, 12, 8, 256)        0
    _________________________________________________________________
    conv4/conv4_1 (Conv2D)       (None, 12, 8, 512)        1180160
    _________________________________________________________________
    conv4/conv4_2 (Conv2D)       (None, 12, 8, 512)        2359808
    _________________________________________________________________
    pool4 (MaxPooling2D)         (None, 6, 4, 512)         0
    _________________________________________________________________
    global_average_pooling2d_1 ( (None, 512)               0
    =================================================================
    Total params: 4,499,712
    Trainable params: 4,499,712
    Non-trainable params: 0
    _________________________________________________________________

    """

    def __init__(self, model=None, model_path=None,
                 metrics=['classification'], n_frames_cnn=96,
                 n_freq_cnn=64, n_classes=10, n_channels=0,
                 embedding_size=128, pooling='avg', include_top=False,
                 compress=False):
        """Keep the hyper-parameters on the instance and set up the container."""
        # Input / output dimensions
        self.n_frames_cnn = n_frames_cnn
        self.n_freq_cnn = n_freq_cnn
        self.n_classes = n_classes
        self.n_channels = n_channels
        # Head configuration
        self.embedding_size = embedding_size
        self.pooling = pooling
        self.include_top = include_top
        self.compress = compress

        # NOTE(review): attributes are assigned before the base initializer,
        # presumably because it may trigger build() -- confirm in container.
        super().__init__(
            model=model, model_path=model_path,
            model_name='VGGish', metrics=metrics
        )

    def build(self):
        """Assemble the VGGish network and store it in ``self.model``."""
        # Input, with a trailing channel axis added for mono spectrograms.
        if self.n_channels == 0:
            inputs = Input(shape=(self.n_frames_cnn, self.n_freq_cnn),
                           dtype='float32', name='input')
            x = Lambda(
                lambda t: K.expand_dims(t, -1), name='lambda'
            )(inputs)
        else:
            inputs = Input(
                shape=(self.n_frames_cnn, self.n_freq_cnn, self.n_channels),
                dtype='float32', name='input'
            )
            x = Lambda(lambda t: t, name='lambda')(inputs)

        # Settings shared by every convolution / pooling layer.
        conv_kwargs = dict(kernel_size=(3, 3), strides=(1, 1),
                           activation='relu', padding='same')
        pool_kwargs = dict(pool_size=(2, 2), strides=(2, 2), padding='same')

        # (filters, convolution layer names, pooling layer name) per block
        blocks = (
            (64, ('conv1',), 'pool1'),
            (128, ('conv2',), 'pool2'),
            (256, ('conv3/conv3_1', 'conv3/conv3_2'), 'pool3'),
            (512, ('conv4/conv4_1', 'conv4/conv4_2'), 'pool4'),
        )
        for n_filters, conv_names, pool_name in blocks:
            for conv_name in conv_names:
                x = Conv2D(n_filters, name=conv_name, **conv_kwargs)(x)
            x = MaxPooling2D(name=pool_name, **pool_kwargs)(x)

        if self.include_top:
            # Fully-connected head ending in the embedding layer.
            x = Flatten(name='flatten_')(x)
            for units, fc_name in ((4096, 'fc1/fc1_1'),
                                   (4096, 'fc1/fc1_2'),
                                   (self.embedding_size, 'fc2')):
                x = Dense(units, activation='relu', name=fc_name)(x)
        elif self.pooling == 'avg':
            x = GlobalAveragePooling2D()(x)
        elif self.pooling == 'max':
            x = GlobalMaxPooling2D()(x)

        # Create model
        self.model = Model(inputs, x, name='vggish_model')

        super().build()
class SMel(KerasModelContainer):
    """KerasModelContainer for SMel model.

    P. Zinemanas, P. Cancela, M. Rocamora.
    "End–to–end Convolutional Neural Networks for Sound Event Detection
    in Urban Environments"
    Proceedings of the 24th Conference of Open Innovations Association
    FRUCT, 3rd IEEE FRUCT International Workshop on Semantic Audio and the
    Internet of Things.
    Moscow, Russia, April 2019.

    Parameters
    ----------
    model, model_path, metrics
        Forwarded unchanged to ``KerasModelContainer.__init__``.
    mel_bands : int, default=128
        Number of mel bands.
    n_seqs : int, default=64
        Time dimension of the input.
    audio_win : int, default=1024
        Length of the audio window (number of samples of each frame).
    audio_hop : int, default=512
        Length of the hop size (in samples). Stored on the instance; not
        read by ``build``.
    alpha : int, default=1
        Multiplicative factor applied before the log (compression factor).
    scaler : tuple, list or None
        If scaler is not None, it is used to rescale the output:
        ``2 * ((x - scaler[0]) / (scaler[1] - scaler[0]) - 0.5)``.
    amin : float, default=1e-10 (-100 dB)
        Minimum value for dB calculation.

    Attributes
    ----------
    model : keras.models.Model
        Keras model.

    Examples
    --------
    >>> from dcase_models.model.models import SMel
    >>> model_container = SMel()
    >>> model_container.model.summary()
    _________________________________________________________________
    Layer (type)                 Output Shape              Param #
    =================================================================
    input_1 (InputLayer)         (None, 64, 1024)          0
    _________________________________________________________________
    lambda (Lambda)              (None, 64, 1024, 1)       0
    _________________________________________________________________
    time_distributed_1 (TimeDist (None, 64, 64, 128)       131200
    _________________________________________________________________
    lambda_1 (Lambda)            (None, 64, 64, 128)       0
    _________________________________________________________________
    lambda_2 (Lambda)            (None, 64, 128)           0
    _________________________________________________________________
    lambda_3 (Lambda)            (None, 64, 128)           0
    =================================================================
    Total params: 131,200
    Trainable params: 131,200
    Non-trainable params: 0
    _________________________________________________________________

    """

    def __init__(self, model=None, model_path=None,
                 metrics=['mean_squared_error'],
                 mel_bands=128, n_seqs=64,
                 audio_win=1024, audio_hop=512,
                 alpha=1, scaler=None, amin=1e-10):
        """Keep the hyper-parameters on the instance and set up the container."""
        # Framing of the input audio
        self.n_seqs = n_seqs
        self.audio_win = audio_win
        self.audio_hop = audio_hop
        # Filter bank size and log-compression / scaling settings
        self.mel_bands = mel_bands
        self.alpha = alpha
        self.scaler = scaler
        self.amin = amin

        # NOTE(review): attributes are assigned before the base initializer,
        # presumably because it may trigger build() -- confirm in container.
        super().__init__(model=model, model_path=model_path,
                         model_name='SMel', metrics=metrics)

    def build(self):
        """Assemble the SMel network and store it in ``self.model``."""
        frames = Input(shape=(self.n_seqs, self.audio_win), dtype='float32')

        # Add a trailing axis so each frame can be fed to Conv1D.
        expanded = Lambda(lambda t: K.expand_dims(t, -1),
                          name='lambda')(frames)

        # Learned filter bank applied independently to every frame; kernel
        # size and stride are fixed here rather than derived from the
        # constructor arguments.
        filter_bank = Conv1D(self.mel_bands, 1024, strides=16,
                             padding='same', use_bias=True)
        filtered = TimeDistributed(filter_bank)(expanded)

        # Square, then average over axis 2 scaled by the window length.
        squared = Lambda(lambda t: t*t)(filtered)
        energy = Lambda(
            lambda t: self.audio_win*K.mean(t, axis=2)
        )(squared)

        # Floor at amin and convert to decibels (10 * log10).
        output = Lambda(
            lambda t: 10*K.log(K.maximum(self.amin, t*self.alpha))/K.log(10.)
        )(energy)

        # Optional rescaling with the two values held in self.scaler.
        if self.scaler is not None:
            output = Lambda(
                lambda t: 2*((t-self.scaler[0]) /
                             (self.scaler[1]-self.scaler[0])-0.5)
            )(output)

        self.model = Model(inputs=frames, outputs=output)

        super().build()
class MST(KerasModelContainer):
    """KerasModelContainer for MST model.

    T. M. S. Tax, J. L. D. Antich, H. Purwins, and L. Maaløe.
    “Utilizing domain knowledge in end-to-end audio processing”
    31st Conference on Neural Information Processing Systems (NIPS).
    Long Beach, CA, USA, 2017.

    Parameters
    ----------
    model, model_path, metrics
        Forwarded unchanged to ``KerasModelContainer.__init__``.
    mel_bands : int, default=128
        Number of mel bands.
    sequence_samples : int, default=22050
        Number of samples of each input.
    audio_win : int, default=1024
        Length of the audio window (number of samples of each frame).
    audio_hop : int, default=512
        Length of the hop size (in samples).

    Attributes
    ----------
    model : keras.models.Model
        Keras model.

    Examples
    --------
    >>> from dcase_models.model.models import MST
    >>> model_container = MST()
    >>> model_container.model.summary()
    _________________________________________________________________
    Layer (type)                 Output Shape              Param #
    =================================================================
    input_2 (InputLayer)         (None, 22050)             0
    _________________________________________________________________
    lambda (Lambda)              (None, 22050, 1)          0
    _________________________________________________________________
    conv1d_2 (Conv1D)            (None, 44, 512)           524800
    _________________________________________________________________
    batch_normalization_1 (Batch (None, 44, 512)           2048
    _________________________________________________________________
    activation_1 (Activation)    (None, 44, 512)           0
    _________________________________________________________________
    conv1d_3 (Conv1D)            (None, 44, 256)           393472
    _________________________________________________________________
    batch_normalization_2 (Batch (None, 44, 256)           1024
    _________________________________________________________________
    activation_2 (Activation)    (None, 44, 256)           0
    _________________________________________________________________
    conv1d_4 (Conv1D)            (None, 44, 128)           98432
    _________________________________________________________________
    batch_normalization_3 (Batch (None, 44, 128)           512
    _________________________________________________________________
    activation_3 (Activation)    (None, 44, 128)           0
    =================================================================
    Total params: 1,020,288
    Trainable params: 1,018,496
    Non-trainable params: 1,792
    _________________________________________________________________

    """

    def __init__(self, model=None, model_path=None,
                 metrics=['mean_squared_error'],
                 mel_bands=128, sequence_samples=22050,
                 audio_win=1024, audio_hop=512):
        """Keep the hyper-parameters on the instance and set up the container."""
        # Input length and framing
        self.sequence_samples = sequence_samples
        self.audio_win = audio_win
        self.audio_hop = audio_hop
        # Size of the last convolutional layer
        self.mel_bands = mel_bands

        # NOTE(review): attributes are assigned before the base initializer,
        # presumably because it may trigger build() -- confirm in container.
        super().__init__(model=model, model_path=model_path,
                         model_name='MST', metrics=metrics)

    def build(self):
        """Assemble the MST network and store it in ``self.model``."""
        waveform = Input(shape=(self.sequence_samples, ), dtype='float32')

        # Add a trailing axis so the raw samples can be fed to Conv1D.
        features = Lambda(lambda t: K.expand_dims(t, -1),
                          name='lambda')(waveform)

        # (filters, kernel size, stride, activation) of each block. The first
        # block frames the signal with the window/hop sizes; the last one
        # outputs `mel_bands` channels through a tanh.
        conv_stack = (
            (512, self.audio_win, self.audio_hop, 'relu'),
            (256, 3, 1, 'relu'),
            (self.mel_bands, 3, 1, 'tanh'),
        )
        for n_filters, kernel_size, stride, activation in conv_stack:
            features = Conv1D(n_filters, kernel_size, strides=stride,
                              padding='same')(features)
            features = BatchNormalization()(features)
            features = Activation(activation)(features)

        self.model = Model(inputs=waveform, outputs=features)

        super().build()
class ConcatenatedModel(KerasModelContainer):
    """KerasModelContainer for concatenating models.

    The models are chained in order: the output of each model in
    ``model_list`` is fed as input to the next one.

    Parameters
    ----------
    model_list : list
        Non-empty list of model containers. Each element must expose a
        ``model`` attribute that can be called on a tensor; the first one
        must also provide ``model.input_shape``.
    model_path : str or None, default=None
        Forwarded unchanged to ``KerasModelContainer.__init__``.
    model_name : str, default='ConcatenatedModel'
        Forwarded unchanged to ``KerasModelContainer.__init__``.
    metrics : list of str, default=['sed']
        Forwarded unchanged to ``KerasModelContainer.__init__``.
    use_batch_norm : bool, default=False
        If True, a BatchNormalization layer is inserted after every model
        except the last one.

    Attributes
    ----------
    model : keras.models.Model
        Keras model.
    """

    def __init__(self, model_list, model_path=None,
                 model_name='ConcatenatedModel', metrics=['sed'],
                 use_batch_norm=False):
        """Initialization of ConcatenatedModel."""
        self.model_list = model_list
        self.use_batch_norm = use_batch_norm

        # NOTE(review): attributes are assigned before the base initializer,
        # presumably because it may trigger build() -- confirm in container.
        super().__init__(model=None, model_path=model_path,
                         model_name=model_name, metrics=metrics)

    def build(self):
        """Chain the models of ``model_list`` into a single Keras model.

        The input of the resulting model has the input shape of the first
        model in the list (batch axis excluded). The result is stored in
        ``self.model``.

        Raises
        ------
        IndexError
            If ``model_list`` is empty.
        """
        # Drop the leading (batch) dimension reported by Keras.
        input_shape = self.model_list[0].model.input_shape
        x = Input(shape=input_shape[1:], dtype='float32')

        y = x
        last_index = len(self.model_list) - 1
        for index, container in enumerate(self.model_list):
            y = container.model(y)
            # Normalize between consecutive models, never after the last.
            if self.use_batch_norm and index < last_index:
                y = BatchNormalization()(y)

        self.model = Model(inputs=x, outputs=y)

        super().build()


def get_available_models():
    """Return the classes defined in this module, indexed by name.

    Returns
    -------
    dict
        Maps each class name to the class object, for every class whose
        ``__module__`` is this module (classes that are merely imported
        here are excluded).
    """
    this_module = sys.modules[__name__]
    available_models = {
        name: cls
        for name, cls in inspect.getmembers(this_module, inspect.isclass)
        if cls.__module__ == __name__
    }

    return available_models