# Source code for dcase_models.model.models

from functools import partial
import inspect
import sys

from keras.layers import GRU, Bidirectional
from keras.layers import TimeDistributed, Activation, Reshape
from keras.layers import GlobalAveragePooling2D
from keras.layers import GlobalMaxPooling2D
from keras.layers import Input, Lambda, Conv2D, MaxPooling2D
from keras.layers import Conv1D
from keras.layers import Dropout, Dense, Flatten
from keras.layers import BatchNormalization
from keras.models import Model
from keras.regularizers import l2
import keras.backend as K
from autopool import AutoPool1D

from .container import KerasModelContainer


# Names exported by ``from dcase_models.model.models import *``.
__all__ = ['MLP', 'SB_CNN', 'SB_CNN_SED', 'A_CRNN',
           'VGGish', 'SMel', 'MST']


class MLP(KerasModelContainer):
    """ KerasModelContainer for a generic MLP model.

    Parameters
    ----------
    model : optional, default=None
        Forwarded unchanged to ``KerasModelContainer.__init__``.

    model_path : optional, default=None
        Forwarded unchanged to ``KerasModelContainer.__init__``.

    metrics : list of str, default=['classification']
        Forwarded unchanged to ``KerasModelContainer.__init__``.

    n_classes : int, default=10
        Number of classes (output dimension).

    n_frames : int or None, default=64
        Length of the input (number of frames of each sequence).
        Use None to not use frame-level input and output. In this case the
        input has shape (None, n_freqs).

    n_freqs : int, default=12
        Number of frequency bins. The model's input has shape
        (n_frames, n_freqs).

    hidden_layers_size : list of int, default=[128, 64]
        Dimension of each hidden layer. Note that the length of this list
        defines the number of hidden layers. An empty list builds a model
        with only the output layer.

    dropout_rates : list of float, default=[0.5, 0.5]
        List of dropout rates used after each hidden layer. The length of
        this list must be equal to the length of hidden_layers_size. Use 0.0
        (or negative) to not use dropout.

    hidden_activation : str, default='relu'
        Activation for hidden layers.

    l2_reg : float, default=1e-5
        Weight of the l2 regularizers. Use 0.0 to not use regularization.

    final_activation : str, default='softmax'
        Activation of the last layer.

    temporal_integration : {'mean', 'sum', 'autopool'}, default='mean'
        Temporal integration operation used after the last layer. It is only
        applied when ``n_frames`` is not None; any other value leaves the
        frame-level output unchanged.

    kwargs
        Additional keyword arguments passed to every `Dense` layer
        (hidden layers and output layer).


    Attributes
    ----------
    model : keras.models.Model
        Keras model.

    Examples
    --------
    >>> from dcase_models.model.models import MLP
    >>> model_container = MLP()
    >>> model_container.model.summary()
    _________________________________________________________________
    Layer (type)                 Output Shape              Param #
    =================================================================
    input (InputLayer)           (None, 64, 12)            0
    _________________________________________________________________
    time_distributed_1 (TimeDist (None, 64, 128)           1664
    _________________________________________________________________
    dropout_1 (Dropout)          (None, 64, 128)           0
    _________________________________________________________________
    time_distributed_2 (TimeDist (None, 64, 64)            8256
    _________________________________________________________________
    dropout_2 (Dropout)          (None, 64, 64)            0
    _________________________________________________________________
    time_distributed_3 (TimeDist (None, 64, 10)            650
    _________________________________________________________________
    temporal_integration (Lambda (None, 10)                0
    =================================================================
    Total params: 10,570
    Trainable params: 10,570
    Non-trainable params: 0
    _________________________________________________________________

    """

    def __init__(self, model=None, model_path=None,
                 metrics=['classification'], n_classes=10,
                 n_frames=64, n_freqs=12,
                 hidden_layers_size=[128, 64],
                 dropout_rates=[0.5, 0.5], hidden_activation='relu',
                 l2_reg=1e-5, final_activation='softmax',
                 temporal_integration='mean', **kwargs):
        """ Stores the hyper-parameters and initializes the container.

        The attributes are assigned before calling the parent constructor so
        that they are already available to it.
        """
        self.n_classes = n_classes
        self.n_frames = n_frames
        self.n_freqs = n_freqs
        # Copy the lists so the instance never aliases the shared default
        # list objects declared in the signature.
        self.hidden_layers_size = list(hidden_layers_size)
        self.dropout_rates = list(dropout_rates)
        self.l2_reg = l2_reg
        self.temporal_integration = temporal_integration
        # Frame-level (time distributed) processing is used only when the
        # number of frames is given.
        self.use_time_distributed = n_frames is not None
        self.hidden_activation = hidden_activation
        self.final_activation = final_activation
        self.kwargs = kwargs

        super().__init__(model=model, model_path=model_path,
                         model_name='MLP', metrics=metrics)

    def build(self):
        """ Builds the MLP Keras model according to the initialized parameters.

        The model is stored in ``self.model`` and ``super().build()`` is
        called afterwards.
        """
        # Input
        if self.use_time_distributed:
            input_shape = (self.n_frames, self.n_freqs)
        else:
            # Must be a one-element tuple: ``(self.n_freqs)`` is a bare int.
            input_shape = (self.n_freqs,)

        inputs = Input(shape=input_shape, dtype='float32', name='input')

        # Hidden layers. ``y`` starts at the input so that an empty
        # ``hidden_layers_size`` still yields a valid model.
        y = inputs
        for idx, n_units in enumerate(self.hidden_layers_size):
            dense_layer = Dense(n_units,
                                activation=self.hidden_activation,
                                kernel_regularizer=l2(self.l2_reg),
                                name='dense_{}'.format(idx + 1),
                                **self.kwargs)
            if self.use_time_distributed:
                y = TimeDistributed(dense_layer)(y)
            else:
                y = dense_layer(y)

            # Dropout (skipped for zero or negative rates)
            rate = self.dropout_rates[idx]
            if rate > 0:
                y = Dropout(rate)(y)

        # Output layer
        dense_layer = Dense(self.n_classes, activation=self.final_activation,
                            kernel_regularizer=l2(self.l2_reg),
                            name='output', **self.kwargs)

        if self.use_time_distributed:
            y = TimeDistributed(dense_layer)(y)
        else:
            y = dense_layer(y)

        # Temporal integration. Without a time axis, axis 1 is the class
        # axis, so reducing it would destroy the prediction.
        if self.use_time_distributed:
            if self.temporal_integration == 'mean':
                y = Lambda(lambda x: K.mean(x, 1),
                           name='temporal_integration')(y)
            elif self.temporal_integration == 'sum':
                y = Lambda(lambda x: K.sum(x, 1),
                           name='temporal_integration')(y)
            elif self.temporal_integration == 'autopool':
                y = AutoPool1D(axis=1, name='output')(y)

        # Create model
        self.model = Model(inputs=inputs, outputs=y, name='model')

        super().build()


class SB_CNN(KerasModelContainer):
    """ KerasModelContainer for SB_CNN model.

    J. Salamon and J. P. Bello.
    "Deep Convolutional Neural Networks and Data Augmentation
    For Environmental Sound Classification".
    IEEE Signal Processing Letters, 24(3), pages 279 - 283.
    2017.

    Notes
    -----
    Code based on Salamon's implementation
    https://github.com/justinsalamon/scaper_waspaa2017


    Parameters
    ----------
    model : optional, default=None
        Forwarded unchanged to ``KerasModelContainer.__init__``.

    model_path : optional, default=None
        Forwarded unchanged to ``KerasModelContainer.__init__``.

    metrics : list of str, default=['classification']
        Forwarded unchanged to ``KerasModelContainer.__init__``.

    n_classes : int, default=10
        Number of classes (output dimension).

    n_frames_cnn : int or None, default=64
        Length of the input (number of frames of each sequence).

    n_freq_cnn : int, default=128
        Number of frequency bins. The model's input has shape
        (n_frames_cnn, n_freq_cnn).

    filter_size_cnn : tuple, default=(5,5)
        Kernel dimension for convolutional layers.

    pool_size_cnn : tuple, default=(2,2)
        Pooling dimension for maxpooling layers. The value is stored on the
        instance, but ``build`` currently uses the fixed pool sizes (2, 2)
        and (4, 2).

    n_dense_cnn : int, default=64
        Dimension of penultimate dense layer.

    n_channels : int, default=0
        Number of input channels

        0 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn)
        1 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 1)
        2 : stereo signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 2)
        n > 2 : multi-representations.
            Input shape = (n_frames_cnn, n_freq_cnn, n_channels)


    Attributes
    ----------
    model : keras.models.Model
        Keras model.

    Examples
    --------
    >>> from dcase_models.model.models import SB_CNN
    >>> model_container = SB_CNN()
    >>> model_container.model.summary()
    _________________________________________________________________
    Layer (type)                 Output Shape              Param #
    =================================================================
    input (InputLayer)           (None, 64, 128)           0
    _________________________________________________________________
    lambda (Lambda)              (None, 64, 128, 1)        0
    _________________________________________________________________
    conv1 (Conv2D)               (None, 60, 124, 24)       624
    _________________________________________________________________
    maxpool1 (MaxPooling2D)      (None, 30, 62, 24)        0
    _________________________________________________________________
    batchnorm1 (BatchNormalizati (None, 30, 62, 24)        96
    _________________________________________________________________
    conv2 (Conv2D)               (None, 26, 58, 48)        28848
    _________________________________________________________________
    maxpool2 (MaxPooling2D)      (None, 6, 29, 48)         0
    _________________________________________________________________
    batchnorm2 (BatchNormalizati (None, 6, 29, 48)         192
    _________________________________________________________________
    conv3 (Conv2D)               (None, 2, 25, 48)         57648
    _________________________________________________________________
    batchnorm3 (BatchNormalizati (None, 2, 25, 48)         192
    _________________________________________________________________
    flatten (Flatten)            (None, 2400)              0
    _________________________________________________________________
    dropout1 (Dropout)           (None, 2400)              0
    _________________________________________________________________
    dense1 (Dense)               (None, 64)                153664
    _________________________________________________________________
    dropout2 (Dropout)           (None, 64)                0
    _________________________________________________________________
    out (Dense)                  (None, 10)                650
    =================================================================
    Total params: 241,914
    Trainable params: 241,674
    Non-trainable params: 240
    _________________________________________________________________
    """

    def __init__(self, model=None, model_path=None,
                 metrics=['classification'],
                 n_classes=10, n_frames_cnn=64,
                 n_freq_cnn=128, filter_size_cnn=(5, 5), pool_size_cnn=(2, 2),
                 n_dense_cnn=64, n_channels=0):
        """ Initialization of the SB-CNN model.

        The hyper-parameters are stored as attributes before the parent
        constructor is called.
        """
        self.n_classes = n_classes
        self.n_frames_cnn = n_frames_cnn
        self.n_freq_cnn = n_freq_cnn
        self.filter_size_cnn = filter_size_cnn
        self.pool_size_cnn = pool_size_cnn
        self.n_dense_cnn = n_dense_cnn
        self.n_channels = n_channels

        super().__init__(
            model=model, model_path=model_path,
            model_name='SB_CNN', metrics=metrics
        )

    def build(self):
        """ Builds the CNN Keras model according to the initialized parameters.

        The model is stored in ``self.model`` and ``super().build()`` is
        called afterwards, as the other containers in this module do.
        """
        # Input: a channel axis is appended when the input has none.
        if self.n_channels == 0:
            x = Input(shape=(self.n_frames_cnn, self.n_freq_cnn),
                      dtype='float32', name='input')
            y = Lambda(lambda t: K.expand_dims(t, -1), name='lambda')(x)
        else:
            x = Input(
                shape=(self.n_frames_cnn, self.n_freq_cnn, self.n_channels),
                dtype='float32', name='input'
            )
            # Identity layer so both branches expose a layer named 'lambda'.
            y = Lambda(lambda t: t, name='lambda')(x)

        # CONV 1
        y = Conv2D(24, self.filter_size_cnn, padding='valid',
                   activation='relu', name='conv1')(y)
        y = MaxPooling2D(pool_size=(2, 2), strides=None,
                         padding='valid', name='maxpool1')(y)
        y = BatchNormalization(name='batchnorm1')(y)

        # CONV 2
        y = Conv2D(48, self.filter_size_cnn, padding='valid',
                   activation='relu', name='conv2')(y)
        y = MaxPooling2D(pool_size=(4, 2), strides=None,
                         padding='valid', name='maxpool2')(y)
        y = BatchNormalization(name='batchnorm2')(y)

        # CONV 3
        y = Conv2D(48, self.filter_size_cnn, padding='valid',
                   activation='relu', name='conv3')(y)
        y = BatchNormalization(name='batchnorm3')(y)

        # Flatten and dense layers
        y = Flatten(name='flatten')(y)
        y = Dropout(0.5, name='dropout1')(y)
        y = Dense(self.n_dense_cnn, activation='relu',
                  kernel_regularizer=l2(0.001),
                  bias_regularizer=l2(0.001), name='dense1')(y)
        y = Dropout(0.5, name='dropout2')(y)
        y = Dense(self.n_classes, activation='softmax',
                  kernel_regularizer=l2(0.001),
                  bias_regularizer=l2(0.001), name='out')(y)

        # creates keras Model
        self.model = Model(inputs=x, outputs=y)

        super().build()

    def sub_model(self):
        """ Returns a model that outputs the penultimate dense layer.

        Example of how to define a new model based on the original one.

        Returns
        -------
        keras.models.Model
            Model that maps the input of ``self.model`` to the output of its
            'dense1' layer.
        """
        new_model = Model(inputs=self.model.input,
                          outputs=self.model.get_layer('dense1').output)
        return new_model

    # def train(...):  # i.e. if you want to redefine the train function


class SB_CNN_SED(KerasModelContainer):
    """ KerasModelContainer for SB_CNN_SED model.

    J. Salamon, D. MacConnell, M. Cartwright, P. Li, and J. P. Bello.
    "Scaper: A Library for Soundscape Synthesis and Augmentation".
    IEEE Workshop on Applications of Signal Processing to
    Audio and Acoustics (WASPAA).
    New Paltz, NY, USA, Oct. 2017

    Notes
    -----
    Code based on Salamon's implementation
    https://github.com/justinsalamon/scaper_waspaa2017

    Parameters
    ----------
    model : optional, default=None
        Forwarded unchanged to ``KerasModelContainer.__init__``.

    model_path : optional, default=None
        Forwarded unchanged to ``KerasModelContainer.__init__``.

    metrics : list of str, default=['sed']
        Forwarded unchanged to ``KerasModelContainer.__init__``.

    n_classes : int, default=10
        Number of classes (output dimension).

    n_frames_cnn : int or None, default=64
        Length of the input (number of frames of each sequence).

    n_freq_cnn : int, default=128
        Number of frequency bins. The model's input has shape
        (n_frames_cnn, n_freq_cnn).

    filter_size_cnn : tuple, default=(5,5)
        Kernel dimension for convolutional layers.

    pool_size_cnn : tuple, default=(2,2)
        Pooling dimension for maxpooling layers.

    large_cnn : bool, default=False
        If large_cnn is true, add another dense layer after the penultimate
        layer. In this case ``build`` also overrides ``n_filters_cnn`` and
        ``n_dense_cnn`` with 128.

    n_dense_cnn : int, default=64
        Dimension of penultimate dense layer.

    n_filters_cnn : int, default=64
        Number of filters used in each convolutional layer.

    n_chanels : int, default=0
        Number of input channels. Note the spelling of the keyword, which
        is kept for backward compatibility.

        0 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn)
        1 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 1)
        2 : stereo signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 2)
        n > 2 : multi-representations.
            Input shape = (n_frames_cnn, n_freq_cnn, n_chanels)


    Attributes
    ----------
    model : keras.models.Model
        Keras model.

    Examples
    --------
    >>> from dcase_models.model.models import SB_CNN_SED
    >>> model_container = SB_CNN_SED()
    >>> model_container.model.summary()
    _________________________________________________________________
    Layer (type)                 Output Shape              Param #
    =================================================================
    input_1 (InputLayer)         (None, 64, 128)           0
    _________________________________________________________________
    lambda_1 (Lambda)            (None, 64, 128, 1)        0
    _________________________________________________________________
    conv2d_1 (Conv2D)            (None, 60, 124, 64)       1664
    _________________________________________________________________
    max_pooling2d_1 (MaxPooling2 (None, 30, 62, 64)        0
    _________________________________________________________________
    batch_normalization_1 (Batch (None, 30, 62, 64)        256
    _________________________________________________________________
    conv2d_2 (Conv2D)            (None, 26, 58, 64)        102464
    _________________________________________________________________
    max_pooling2d_2 (MaxPooling2 (None, 13, 29, 64)        0
    _________________________________________________________________
    batch_normalization_2 (Batch (None, 13, 29, 64)        256
    _________________________________________________________________
    conv2d_3 (Conv2D)            (None, 9, 25, 64)         102464
    _________________________________________________________________
    batch_normalization_3 (Batch (None, 9, 25, 64)         256
    _________________________________________________________________
    flatten_1 (Flatten)          (None, 14400)             0
    _________________________________________________________________
    dropout_3 (Dropout)          (None, 14400)             0
    _________________________________________________________________
    dense_1 (Dense)              (None, 64)                921664
    _________________________________________________________________
    dropout_4 (Dropout)          (None, 64)                0
    _________________________________________________________________
    dense_2 (Dense)              (None, 10)                650
    =================================================================
    Total params: 1,129,674
    Trainable params: 1,129,290
    Non-trainable params: 384
    _________________________________________________________________

    """

    def __init__(self, model=None, model_path=None, metrics=['sed'],
                 n_classes=10, n_frames_cnn=64,
                 n_freq_cnn=128, filter_size_cnn=(5, 5), pool_size_cnn=(2, 2),
                 large_cnn=False, n_dense_cnn=64,
                 n_filters_cnn=64, n_chanels=0):
        """ Initialization of the SB-CNN-SED model.

        The hyper-parameters are stored as attributes before the parent
        constructor is called.
        """

        self.n_classes = n_classes
        self.n_frames_cnn = n_frames_cnn
        self.n_freq_cnn = n_freq_cnn
        self.filter_size_cnn = filter_size_cnn
        self.pool_size_cnn = pool_size_cnn
        self.large_cnn = large_cnn
        self.n_dense_cnn = n_dense_cnn
        self.n_filters_cnn = n_filters_cnn
        self.n_chanels = n_chanels

        super().__init__(model=model, model_path=model_path,
                         model_name='SB_CNN_SED', metrics=metrics)

    def build(self):
        """ Builds the CNN Keras model according to the initialized parameters.

        The model is stored in ``self.model`` and ``super().build()`` is
        called afterwards. When ``large_cnn`` is True, the attributes
        ``n_filters_cnn`` and ``n_dense_cnn`` are overwritten with 128.
        """
        if self.large_cnn:
            self.n_filters_cnn = 128
            self.n_dense_cnn = 128

        # INPUT: a channel axis is appended when the input has none,
        # otherwise the channels are taken from the input itself.
        if self.n_chanels == 0:
            x = Input(shape=(self.n_frames_cnn, self.n_freq_cnn),
                      dtype='float32')
            y = Lambda(lambda t: K.expand_dims(t, -1))(x)
        else:
            x = Input(
                shape=(self.n_frames_cnn, self.n_freq_cnn, self.n_chanels),
                dtype='float32'
            )
            # Identity layer so both branches have the same layer layout.
            y = Lambda(lambda t: t)(x)

        # CONV 1
        y = Conv2D(self.n_filters_cnn, self.filter_size_cnn, padding='valid',
                   activation='relu')(y)
        y = MaxPooling2D(pool_size=self.pool_size_cnn,
                         strides=None, padding='valid')(y)
        y = BatchNormalization()(y)

        # CONV 2
        y = Conv2D(self.n_filters_cnn, self.filter_size_cnn, padding='valid',
                   activation='relu')(y)
        y = MaxPooling2D(pool_size=self.pool_size_cnn,
                         strides=None, padding='valid')(y)
        y = BatchNormalization()(y)

        # CONV 3 (no max pooling after the last convolution)
        y = Conv2D(self.n_filters_cnn, self.filter_size_cnn, padding='valid',
                   activation='relu')(y)
        y = BatchNormalization()(y)

        # Flatten for dense layers
        y = Flatten()(y)
        y = Dropout(0.5)(y)
        y = Dense(self.n_dense_cnn, activation='relu')(y)
        if self.large_cnn:
            y = Dropout(0.5)(y)
            y = Dense(self.n_dense_cnn, activation='relu')(y)
        y = Dropout(0.5)(y)
        y = Dense(self.n_classes, activation='sigmoid')(y)

        self.model = Model(inputs=x, outputs=y)
        super().build()


class A_CRNN(KerasModelContainer):
    """ KerasModelContainer for A_CRNN model.

    S. Adavanne, P. Pertilä, T. Virtanen
    "Sound event detection using spatial features and
    convolutional recurrent neural network"
    International Conference on Acoustics, Speech, and Signal Processing.
    2017. https://arxiv.org/pdf/1706.02291.pdf

    Notes
    -----
    Code based on Adavanne's implementation
    https://github.com/sharathadavanne/sed-crnn

    Parameters
    ----------
    model : optional, default=None
        Forwarded unchanged to ``KerasModelContainer.__init__``.

    model_path : optional, default=None
        Forwarded unchanged to ``KerasModelContainer.__init__``.

    metrics : list of str, default=['sed']
        Forwarded unchanged to ``KerasModelContainer.__init__``.

    n_classes : int, default=10
        Number of classes (output dimension).

    n_frames_cnn : int or None, default=64
        Length of the input (number of frames of each sequence).

    n_freq_cnn : int, default=128
        Number of frequency bins. The model's input has shape
        (n_frames_cnn, n_freq_cnn).

    cnn_nb_filt : int, default=128
        Number of filters used in convolutional layers.

    cnn_pool_size : list of int, default=[5, 2, 2]
        Frequency pooling factor of each convolutional block. The length of
        this list defines the number of convolutional blocks.

    rnn_nb : list, default=[32, 32]
        Number of units in each recurrent layer.

    fc_nb : list, default=[32]
        Number of units in each dense layer.

    dropout_rate : float, default=0.5
        Dropout rate.

    n_channels : int, default=0
        Number of input channels

        0 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn)
        1 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 1)
        2 : stereo signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 2)
        n > 2 : multi-representations.
            Input shape = (n_frames_cnn, n_freq_cnn, n_channels)

    final_activation : str, default='softmax'
        Activation of the last layer.

    sed : bool, default=False
        If sed is True, the output is frame-level. If False the output is
        time averaged.

    bidirectional : bool, default=False
        If bidirectional is True, the recurrent layers are bidirectional.

    Attributes
    ----------
    model : keras.models.Model
        Keras model.

    Examples
    --------
    >>> from dcase_models.model.models import A_CRNN
    >>> model_container = A_CRNN()
    >>> model_container.model.summary()
    _________________________________________________________________
    Layer (type)                 Output Shape              Param #
    =================================================================
    input (InputLayer)           (None, 64, 128)           0
    _________________________________________________________________
    lambda (Lambda)              (None, 64, 128, 1)        0
    _________________________________________________________________
    conv2d_7 (Conv2D)            (None, 64, 128, 128)      1280
    _________________________________________________________________
    batch_normalization_7 (Batch (None, 64, 128, 128)      512
    _________________________________________________________________
    activation_4 (Activation)    (None, 64, 128, 128)      0
    _________________________________________________________________
    max_pooling2d_6 (MaxPooling2 (None, 64, 25, 128)       0
    _________________________________________________________________
    dropout_9 (Dropout)          (None, 64, 25, 128)       0
    _________________________________________________________________
    conv2d_8 (Conv2D)            (None, 64, 25, 128)       147584
    _________________________________________________________________
    batch_normalization_8 (Batch (None, 64, 25, 128)       100
    _________________________________________________________________
    activation_5 (Activation)    (None, 64, 25, 128)       0
    _________________________________________________________________
    max_pooling2d_7 (MaxPooling2 (None, 64, 12, 128)       0
    _________________________________________________________________
    dropout_10 (Dropout)         (None, 64, 12, 128)       0
    _________________________________________________________________
    conv2d_9 (Conv2D)            (None, 64, 12, 128)       147584
    _________________________________________________________________
    batch_normalization_9 (Batch (None, 64, 12, 128)       48
    _________________________________________________________________
    activation_6 (Activation)    (None, 64, 12, 128)       0
    _________________________________________________________________
    max_pooling2d_8 (MaxPooling2 (None, 64, 6, 128)        0
    _________________________________________________________________
    dropout_11 (Dropout)         (None, 64, 6, 128)        0
    _________________________________________________________________
    reshape_2 (Reshape)          (None, 64, 768)           0
    _________________________________________________________________
    gru_3 (GRU)                  (None, 64, 32)            76896
    _________________________________________________________________
    gru_4 (GRU)                  (None, 64, 32)            6240
    _________________________________________________________________
    time_distributed_6 (TimeDist (None, 64, 32)            1056
    _________________________________________________________________
    dropout_12 (Dropout)         (None, 64, 32)            0
    _________________________________________________________________
    time_distributed_7 (TimeDist (None, 64, 10)            330
    _________________________________________________________________
    mean (Lambda)                (None, 10)                0
    _________________________________________________________________
    strong_out (Activation)      (None, 10)                0
    =================================================================
    Total params: 381,630
    Trainable params: 381,300
    Non-trainable params: 330
    _________________________________________________________________

    """

    def __init__(self, model=None, model_path=None, metrics=['sed'],
                 n_classes=10, n_frames_cnn=64,
                 n_freq_cnn=128, cnn_nb_filt=128,
                 cnn_pool_size=[5, 2, 2], rnn_nb=[32, 32],
                 fc_nb=[32], dropout_rate=0.5, n_channels=0,
                 final_activation='softmax', sed=False,
                 bidirectional=False):
        """ Initialization of the A_CRNN model.

        The hyper-parameters are stored as attributes before the parent
        constructor is called.
        """
        # Input / output dimensions
        self.n_classes = n_classes
        self.n_frames_cnn = n_frames_cnn
        self.n_freq_cnn = n_freq_cnn
        # Convolutional, recurrent and dense stages
        self.cnn_nb_filt = cnn_nb_filt
        self.cnn_pool_size = cnn_pool_size
        self.rnn_nb = rnn_nb
        self.fc_nb = fc_nb
        self.dropout_rate = dropout_rate
        self.n_channels = n_channels
        # Output options
        self.final_activation = final_activation
        self.sed = sed
        self.bidirectional = bidirectional

        super().__init__(
            model=model, model_path=model_path,
            model_name='A_CRNN', metrics=metrics
        )

    def build(self):
        """ Builds the CRNN Keras model.

        The network stacks one convolutional block per entry of
        ``cnn_pool_size``, one GRU layer per entry of ``rnn_nb`` and one
        time-distributed dense layer per entry of ``fc_nb``. The model is
        stored in ``self.model`` and ``super().build()`` is called afterwards.
        """
        # Input: a channel axis is appended when the input has none.
        has_channel_axis = self.n_channels != 0
        if has_channel_axis:
            input_shape = (self.n_frames_cnn, self.n_freq_cnn,
                           self.n_channels)
        else:
            input_shape = (self.n_frames_cnn, self.n_freq_cnn)
        inputs = Input(shape=input_shape, dtype='float32', name='input')

        if has_channel_axis:
            features = Lambda(lambda t: t, name='lambda')(inputs)
        else:
            features = Lambda(
                lambda t: K.expand_dims(t, -1), name='lambda')(inputs)

        # Convolutional blocks. Pooling acts on the frequency axis only, so
        # the number of frames is preserved.
        for freq_pool in self.cnn_pool_size:
            features = Conv2D(filters=self.cnn_nb_filt, kernel_size=(3, 3),
                              padding='same')(features)
            # axis=2 is the frequency axis of the input layout used above.
            features = BatchNormalization(axis=2)(features)
            features = Activation('relu')(features)
            features = MaxPooling2D(pool_size=(1, freq_pool))(features)
            features = Dropout(self.dropout_rate)(features)

        # Merge the frequency and filter axes into one feature axis per frame.
        features = Reshape((self.n_frames_cnn, -1))(features)

        # Recurrent layers
        for n_units in self.rnn_nb:
            recurrent = GRU(n_units, activation='tanh',
                            dropout=self.dropout_rate,
                            recurrent_dropout=self.dropout_rate,
                            return_sequences=True)
            if self.bidirectional:
                recurrent = Bidirectional(recurrent, merge_mode='mul')
            features = recurrent(features)

        # Frame-level dense layers
        for n_units in self.fc_nb:
            features = TimeDistributed(Dense(n_units))(features)
            features = Dropout(self.dropout_rate)(features)

        scores = TimeDistributed(Dense(self.n_classes))(features)

        # Average over time unless frame-level output is requested.
        if not self.sed:
            scores = Lambda(lambda t: K.mean(t, 1), name='mean')(scores)
        outputs = Activation(self.final_activation, name='strong_out')(scores)

        self.model = Model(inputs=inputs, outputs=outputs)

        super().build()


[docs]class VGGish(KerasModelContainer):
    """ KerasModelContainer for VGGish model

    Jort F. Gemmeke et al.
    Audio Set: An ontology and human-labeled dataset for audio events
    International Conference on Acoustics, Speech, and Signal Processing.
    New Orleans, LA, 2017.

    Notes
    -----
    https://research.google.com/audioset/
    Based on vggish-keras https://pypi.org/project/vggish-keras/

    Parameters
    ----------
    n_frames_cnn : int or None, default=96
        Length of the input (number of frames of each sequence).

    n_freq_cnn : int, default=64
        Number of frequency bins. The model's input has shape
        (n_frames, n_freqs).

    n_classes : int, default=10
        Number of classes (dimmension output).

    n_channels : int, default=0
        Number of input channels

        0 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn)
        1 : mono signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 1)
        2 : stereo signals.
            Input shape = (n_frames_cnn, n_freq_cnn, 2)
        n > 2 : multi-representations.
            Input shape = (n_frames_cnn, n_freq_cnn, n_channels)

    embedding_size : int, default=128
        Number of units in the embeddings layer.

    pooling : {'avg', max}, default='avg'
        Use AveragePooling or Maxpooling.

    include_top : bool, default=False
        Include fully-connected layers.

    compress : bool, default=False
        Apply PCA.


    Attributes
    ----------
    model : keras.models.Model
        Keras model.

    Examples
    --------
    >>> from dcase_models.model.models import VGGish
    >>> model_container = VGGish()
    >>> model_container.model.summary()
    _________________________________________________________________
    Layer (type)                 Output Shape              Param #
    =================================================================
    input (InputLayer)           (None, 96, 64)            0
    _________________________________________________________________
    lambda (Lambda)              (None, 96, 64, 1)         0
    _________________________________________________________________
    conv1 (Conv2D)               (None, 96, 64, 64)        640
    _________________________________________________________________
    pool1 (MaxPooling2D)         (None, 48, 32, 64)        0
    _________________________________________________________________
    conv2 (Conv2D)               (None, 48, 32, 128)       73856
    _________________________________________________________________
    pool2 (MaxPooling2D)         (None, 24, 16, 128)       0
    _________________________________________________________________
    conv3/conv3_1 (Conv2D)       (None, 24, 16, 256)       295168
    _________________________________________________________________
    conv3/conv3_2 (Conv2D)       (None, 24, 16, 256)       590080
    _________________________________________________________________
    pool3 (MaxPooling2D)         (None, 12, 8, 256)        0
    _________________________________________________________________
    conv4/conv4_1 (Conv2D)       (None, 12, 8, 512)        1180160
    _________________________________________________________________
    conv4/conv4_2 (Conv2D)       (None, 12, 8, 512)        2359808
    _________________________________________________________________
    pool4 (MaxPooling2D)         (None, 6, 4, 512)         0
    _________________________________________________________________
    global_average_pooling2d_1 ( (None, 512)               0
    =================================================================
    Total params: 4,499,712
    Trainable params: 4,499,712
    Non-trainable params: 0
    _________________________________________________________________
    """

[docs]    def __init__(self, model=None, model_path=None, metrics=['classification'],
                 n_frames_cnn=96, n_freq_cnn=64, n_classes=10,
                 n_channels=0, embedding_size=128, pooling='avg',
                 include_top=False, compress=False):

        self.n_frames_cnn = n_frames_cnn
        self.n_freq_cnn = n_freq_cnn
        self.n_classes = n_classes
        self.n_channels = n_channels
        self.embedding_size = embedding_size
        self.pooling = pooling
        self.include_top = include_top
        self.compress = compress

        super().__init__(
            model=model, model_path=model_path,
            model_name='VGGish', metrics=metrics
        )

[docs]    def build(self):
        """ Builds the VGGish Keras model.
        """
        if self.n_channels == 0:
            inputs = Input(shape=(self.n_frames_cnn, self.n_freq_cnn),
                           dtype='float32', name='input')
            x = Lambda(
                lambda x: K.expand_dims(x, -1), name='lambda'
            )(inputs)
        else:
            inputs = Input(
                shape=(self.n_frames_cnn, self.n_freq_cnn, self.n_channels),
                dtype='float32', name='input'
            )
            x = Lambda(lambda x: x, name='lambda')(inputs)

        # setup layer params
        conv = partial(Conv2D, kernel_size=(3, 3), strides=(
            1, 1), activation='relu', padding='same')
        maxpool = partial(MaxPooling2D, pool_size=(2, 2),
                          strides=(2, 2), padding='same')

        # Block 1
        x = conv(64, name='conv1')(x)
        x = maxpool(name='pool1')(x)

        # Block 2
        x = conv(128, name='conv2')(x)
        x = maxpool(name='pool2')(x)

        # Block 3
        x = conv(256, name='conv3/conv3_1')(x)
        x = conv(256, name='conv3/conv3_2')(x)
        x = maxpool(name='pool3')(x)

        # Block 4
        x = conv(512, name='conv4/conv4_1')(x)
        x = conv(512, name='conv4/conv4_2')(x)
        x = maxpool(name='pool4')(x)

        if self.include_top:
            dense = partial(Dense, activation='relu')

            # FC block
            x = Flatten(name='flatten_')(x)
            x = dense(4096, name='fc1/fc1_1')(x)
            x = dense(4096, name='fc1/fc1_2')(x)
            x = dense(self.embedding_size, name='fc2')(x)

            # if compress:
            #    x = Postprocess()(x)
        else:
            globalpool = (
                GlobalAveragePooling2D() if self.pooling == 'avg' else
                GlobalMaxPooling2D() if self.pooling == 'max' else None)

            if globalpool:
                x = globalpool(x)

        # Create model
        self.model = Model(inputs, x, name='vggish_model')

        super().build()


class SMel(KerasModelContainer):
    """ KerasModelContainer for SMel model.

    P. Zinemanas, P. Cancela, M. Rocamora.
    "End–to–end Convolutional Neural Networks for Sound Event Detection
    in Urban Environments"
    Proceedings of the 24th Conference of Open Innovations Association FRUCT,
    3rd IEEE FRUCT International Workshop on Semantic Audio
    and the Internet of Things.
    Moscow, Russia, April 2019.

    Parameters
    ----------
    mel_bands : int, default=128
        Number of mel bands (number of filters of the convolutional layer).

    n_seqs : int, default=64
        Time dimension of the input (number of frames).

    audio_win : int, default=1024
        Length of the audio window (number of samples of each frame).

    audio_hop : int, default=512
        Length of the hop size (in samples).

    alpha : int, default=1
        Multiplicative factor applied before the logarithm
        (compression factor).

    scaler : tuple, list or None, default=None
        If not None, the output in dB is mapped linearly so that
        scaler[0] becomes -1 and scaler[1] becomes 1.

    amin : float, default=1e-10 (-100 dB)
        Minimum value for db calculation.

    Attributes
    ----------
    model : keras.models.Model
        Keras model.

    Examples
    --------
    >>> from dcase_models.model.models import SMel
    >>> model_container = SMel()
    >>> model_container.model.summary()
    _________________________________________________________________
    Layer (type)                 Output Shape              Param #
    =================================================================
    input_1 (InputLayer)         (None, 64, 1024)          0
    _________________________________________________________________
    lambda (Lambda)              (None, 64, 1024, 1)       0
    _________________________________________________________________
    time_distributed_1 (TimeDist (None, 64, 64, 128)       131200
    _________________________________________________________________
    lambda_1 (Lambda)            (None, 64, 64, 128)       0
    _________________________________________________________________
    lambda_2 (Lambda)            (None, 64, 128)           0
    _________________________________________________________________
    lambda_3 (Lambda)            (None, 64, 128)           0
    =================================================================
    Total params: 131,200
    Trainable params: 131,200
    Non-trainable params: 0
    _________________________________________________________________

    """

    def __init__(self, model=None, model_path=None,
                 metrics=['mean_squared_error'],
                 mel_bands=128, n_seqs=64,
                 audio_win=1024, audio_hop=512,
                 alpha=1, scaler=None, amin=1e-10):
        """ Initialization of the SMel container.

        Stores the hyper-parameters as attributes of the same name and
        forwards model, model_path and metrics to the parent initializer.
        """
        # Attributes are assigned before calling the parent initializer.
        # NOTE(review): presumably the parent triggers build(), which reads
        # them -- confirm in KerasModelContainer.
        self.mel_bands = mel_bands
        self.n_seqs = n_seqs
        self.audio_win = audio_win
        self.audio_hop = audio_hop
        self.alpha = alpha
        self.scaler = scaler
        self.amin = amin

        super().__init__(model=model, model_path=model_path,
                         model_name='SMel', metrics=metrics)

    def build(self):
        """ Builds the SMel Keras model.

        The input has shape (n_seqs, audio_win). Each frame is filtered by
        mel_bands 1D convolutional filters; the squared output is averaged
        along the filter-output axis and converted to dB. The resulting
        model is stored in self.model.
        """
        x = Input(shape=(self.n_seqs, self.audio_win), dtype='float32')

        # Append a channel axis so that Conv1D can process each frame.
        y = Lambda(lambda x: K.expand_dims(x, -1), name='lambda')(x)

        # The same bank of mel_bands filters is applied to every frame.
        # NOTE(review): the kernel size (1024) and the stride (16) are
        # hard-coded and do not follow audio_win -- confirm this is intended.
        y = TimeDistributed(
            Conv1D(
                self.mel_bands, 1024, strides=16, padding='same', use_bias=True
            ))(y)

        # Energy of each filter output: square, then mean over axis 2
        # scaled by audio_win.
        y = Lambda(lambda x: x*x)(y)

        y = Lambda(lambda x: self.audio_win*K.mean(x, axis=2))(y)

        # Conversion to dB: 10*log10(max(amin, alpha*x)).
        y = Lambda(
            lambda x: 10*K.log(K.maximum(self.amin, x*self.alpha))/K.log(10.)
        )(y)

        if self.scaler is not None:
            # Linear mapping of [scaler[0], scaler[1]] to [-1, 1].
            y = Lambda(
                lambda x: 2*((x-self.scaler[0]) /
                             (self.scaler[1]-self.scaler[0])-0.5)
            )(y)

        self.model = Model(inputs=x, outputs=y)

        super().build()


class MST(KerasModelContainer):
    """ KerasModelContainer for MST model.

    T. M. S. Tax, J. L. D. Antich, H. Purwins, and L. Maaløe.
    “Utilizing domain knowledge in end-to-end audio processing”
    31st Conference on Neural Information Processing Systems (NIPS).
    Long Beach, CA, USA, 2017.

    Parameters
    ----------
    mel_bands : int, default=128
        Number of mel bands (number of filters of the last convolutional
        layer).

    sequence_samples : int, default=22050
        Number of samples of each input.

    audio_win : int, default=1024
        Length of the audio window (number of samples of each frame).
        Used as the kernel size of the first convolutional layer.

    audio_hop : int, default=512
        Length of the hop size (in samples). Used as the stride of the
        first convolutional layer.


    Attributes
    ----------
    model : keras.models.Model
        Keras model.

    Examples
    --------
    >>> from dcase_models.model.models import MST
    >>> model_container = MST()
    >>> model_container.model.summary()
    _________________________________________________________________
    Layer (type)                 Output Shape              Param #
    =================================================================
    input_2 (InputLayer)         (None, 22050)             0
    _________________________________________________________________
    lambda (Lambda)              (None, 22050, 1)          0
    _________________________________________________________________
    conv1d_2 (Conv1D)            (None, 44, 512)           524800
    _________________________________________________________________
    batch_normalization_1 (Batch (None, 44, 512)           2048
    _________________________________________________________________
    activation_1 (Activation)    (None, 44, 512)           0
    _________________________________________________________________
    conv1d_3 (Conv1D)            (None, 44, 256)           393472
    _________________________________________________________________
    batch_normalization_2 (Batch (None, 44, 256)           1024
    _________________________________________________________________
    activation_2 (Activation)    (None, 44, 256)           0
    _________________________________________________________________
    conv1d_4 (Conv1D)            (None, 44, 128)           98432
    _________________________________________________________________
    batch_normalization_3 (Batch (None, 44, 128)           512
    _________________________________________________________________
    activation_3 (Activation)    (None, 44, 128)           0
    =================================================================
    Total params: 1,020,288
    Trainable params: 1,018,496
    Non-trainable params: 1,792
    _________________________________________________________________
    """

    def __init__(self, model=None, model_path=None,
                 metrics=['mean_squared_error'],
                 mel_bands=128, sequence_samples=22050,
                 audio_win=1024, audio_hop=512):
        """ Initialization of the MST container.

        Stores the hyper-parameters as attributes of the same name and
        forwards model, model_path and metrics to the parent initializer.
        """
        # Attributes are assigned before calling the parent initializer.
        # NOTE(review): presumably the parent triggers build(), which reads
        # them -- confirm in KerasModelContainer.
        self.mel_bands = mel_bands
        self.sequence_samples = sequence_samples
        self.audio_win = audio_win
        self.audio_hop = audio_hop

        super().__init__(model=model, model_path=model_path,
                         model_name='MST', metrics=metrics)

    def build(self):
        """ Builds the MST Keras model.

        The raw audio input of shape (sequence_samples, ) goes through
        three Conv1D + BatchNormalization + Activation stages. The first
        convolution frames the signal (kernel audio_win, stride audio_hop);
        the last one has mel_bands filters and a tanh activation. The
        resulting model is stored in self.model.
        """
        x = Input(shape=(self.sequence_samples, ), dtype='float32')

        # Append a channel axis so that Conv1D can process the waveform.
        y = Lambda(lambda x: K.expand_dims(x, -1), name='lambda')(x)

        # Stage 1: framing convolution (one output step every audio_hop
        # samples).
        y = Conv1D(512, self.audio_win,
                   strides=self.audio_hop, padding='same')(y)
        y = BatchNormalization()(y)
        y = Activation('relu')(y)

        # Stage 2
        y = Conv1D(256, 3, strides=1, padding='same')(y)
        y = BatchNormalization()(y)
        y = Activation('relu')(y)

        # Stage 3: one output channel per mel band.
        y = Conv1D(self.mel_bands, 3, strides=1, padding='same')(y)
        y = BatchNormalization()(y)
        y = Activation('tanh')(y)

        self.model = Model(inputs=x, outputs=y)

        super().build()


class ConcatenatedModel(KerasModelContainer):
    """ KerasModelContainer for concatenating models.

    The models of model_list are applied one after the other: the output
    of each model is the input of the next one.

    Parameters
    ----------
    model_list : list
        Model containers to chain, in order of application. Each element
        must expose a Keras model in its ``model`` attribute. The input
        shape of the first one defines the input of the concatenated model.

    model_path : str or None, default=None
        Forwarded to the parent initializer.

    model_name : str, default='ConcatenatedModel'
        Forwarded to the parent initializer.

    metrics : list of str, default=['sed']
        Forwarded to the parent initializer.

    use_batch_norm : bool, default=False
        If True, a BatchNormalization layer is inserted between
        consecutive models (not after the last one).

    Attributes
    ----------
    model : keras.models.Model
        Keras model.
    """

    def __init__(self, model_list, model_path=None,
                 model_name='ConcatenatedModel', metrics=['sed'],
                 use_batch_norm=False):
        """ Initialization of ConcatenatedModel.

        Stores model_list and use_batch_norm as attributes and delegates
        to the parent initializer with model=None.
        """
        # Attributes are assigned before calling the parent initializer.
        # NOTE(review): presumably the parent triggers build(), which reads
        # them -- confirm in KerasModelContainer.
        self.model_list = model_list
        self.use_batch_norm = use_batch_norm

        super().__init__(model=None, model_path=model_path,
                         model_name=model_name, metrics=metrics)

    def build(self):
        """ Builds the concatenated Keras model.

        Creates a new input with the input shape of the first model
        (without the batch axis), applies every model of model_list in
        order and stores the resulting model in self.model.
        """
        n_models = len(self.model_list)

        # Drop the batch axis of the first model's input shape.
        input_shape = self.model_list[0].model.input_shape
        x = Input(shape=input_shape[1:], dtype='float32')

        y = x
        for j, container in enumerate(self.model_list):
            y = container.model(y)
            # Normalize between models only, never after the last one.
            if self.use_batch_norm and (j < n_models - 1):
                y = BatchNormalization()(y)

        self.model = Model(inputs=x, outputs=y)
        super().build()


def get_available_models():
    """ Returns the classes defined in this module.

    Classes that are only imported into the module (i.e. whose
    ``__module__`` is a different module) are left out.

    Returns
    -------
    dict
        Dictionary that maps each class name to the class object.
    """
    this_module = sys.modules[__name__]

    available_models = {}
    for name, cls in inspect.getmembers(this_module, inspect.isclass):
        # Keep only the classes declared here, not the imported ones.
        if cls.__module__ == __name__:
            available_models[name] = cls

    return available_models