# Source code for dcase_models.util.metrics

# encoding: utf-8
"""Metric functions"""

# from scipy import interpolate
import numpy as np
from scipy.stats import mode
from dcase_models.util.events import event_roll_to_event_list
from dcase_models.util.events import tag_probabilities_to_tag_list
from sed_eval.sound_event import SegmentBasedMetrics
from sed_eval.scene import SceneClassificationMetrics
from sed_eval.audio_tag import AudioTaggingMetrics

# Small constant added to the denominators in ER and F1 so that the
# divisions stay defined when a count is zero.
eps = 1e-6


def predictions_temporal_integration(Y_predicted, type='sum'):
    """ Integrate temporal dimension.

    The first axis of `Y_predicted` is reduced with the selected
    operation.

    Parameters
    ----------
    Y_predicted : ndarray
        Signal to be integrated.
        e.g. shape (N_times, N_classes)
    type : str
        Type of integration ('sum', 'mean', 'max', 'mode').
        Any other value (including 'autopool', which is not implemented
        here) leaves the input untouched.

    Returns
    -------
    array
        Integrated signal.
        e.g. shape (N_classes,)
        For an unsupported `type` the input is returned unchanged.

    """
    if type == 'sum':
        return np.sum(Y_predicted, axis=0)
    if type == 'mean':
        return np.mean(Y_predicted, axis=0)
    if type == 'max':
        return np.max(Y_predicted, axis=0)
    if type == 'mode':
        n_dims = np.ndim(Y_predicted)
        most_common = np.asarray(mode(Y_predicted, axis=0)[0])
        # Depending on the SciPy version, `mode` either keeps the reduced
        # axis (length 1) or drops it. Only squeeze when it is still there,
        # otherwise np.squeeze raises on an axis whose length is not 1.
        if most_common.ndim == n_dims:
            most_common = np.squeeze(most_common, axis=0)
        return most_common
    return Y_predicted


def _predict_per_file(model, X_files):
    """Run ``model.predict`` on every file and keep the first output."""
    outputs = []
    for X in X_files:
        Y_predicted = model.predict(X)
        # if multiple outputs, select the first
        if isinstance(Y_predicted, list):
            Y_predicted = Y_predicted[0]
        outputs.append(Y_predicted)
    return outputs


def evaluate_metrics(model, data, metrics, **kwargs):
    """ Calculate metrics over files with different length

    Predictions are obtained file by file with ``model.predict`` and then
    every requested metric is called as
    ``metric(annotations, predictions, **kwargs)``.

    Parameters
    ----------
    model : keras Model
        model to get the predictions
    data : tuple or KerasDataGenerator
        Validation data for model evaluation
        (X_val, Y_val) or KerasDataGenerator

        X_val : list of ndarray
            Each element in list is a 3D array with the mel-spectrograms
            of one file. Shape of each element:
            (N_windows, N_hops, N_mel_bands)
            N_windows can be different in each file (element)
        Y_val : list ndarray
            Each element in the list is a 1D array with
            the annotations (one hot encoding).
            Shape of each element (N_classes,)

        Anything that is not a list or tuple is treated as a data
        generator: it must support ``len()`` and
        ``get_data_batch(batch_index)`` returning ``(X_val, Y_val)``.

    metrics : list
        List of metrics to apply.
        Each element can be a metric name or a function.
        Names are looked up among the functions of this module.
    **kwargs
        Extra keyword arguments forwarded to every metric function.

    Returns
    -------
    dict
        Dict with the results information.

        {'annotations' : [Y0, Y1, ...],
         'predictions' : [Yp0, Yp1, ...],
         metrics[0]: 0.1,
         metrics[1]: 0.54}

    Raises
    ------
    KeyError
        If a metric is given by name and no such name exists in this
        module.

    """
    results = {}

    if isinstance(data, (list, tuple)):
        X_val = data[0]
        annotations = data[1]
        predictions = _predict_per_file(model, X_val)
    else:
        # data type is DataGenerator
        predictions = []
        annotations = []
        for batch_index in range(len(data)):
            X_val, Y_val = data.get_data_batch(batch_index)
            predictions.extend(_predict_per_file(model, X_val))
            annotations.extend(Y_val)

    results['annotations'] = annotations
    results['predictions'] = predictions

    for metric in metrics:
        if callable(metric):
            metric_function = metric
        else:
            try:
                metric_function = globals()[metric]
            except KeyError:
                raise KeyError(
                    'Unknown metric %r: no such function in this module'
                    % (metric,))

        # The result is stored under the element of `metrics` itself
        # (the name, or the function object when a callable was given).
        results[metric] = metric_function(annotations, predictions, **kwargs)
    return results


def sed(Y_val, Y_predicted, sequence_time_sec=0.5,
        metric_resolution_sec=1.0, label_list=None):
    """ Calculate metrics for Sound Event Detection

    Each file is evaluated separately: predictions are binarized with a
    0.5 threshold, both event rolls are converted to event lists and fed
    to a single ``SegmentBasedMetrics`` accumulator.

    Parameters
    ----------
    Y_val : list of ndarray
        One ground-truth event roll per file.
        e.g. shape of each element: (N_times, N_classes)
    Y_predicted : list of ndarray
        One predicted event roll per file, in the same order as `Y_val`.
        e.g. shape of each element: (N_times, N_classes)
    sequence_time_sec : float
        Resolution of Y_val and Y_predicted.
    metric_resolution_sec : float
        Resolution of the metrics.
    label_list : list or None
        Label list. None is treated as an empty list.

    Returns
    -------
    sed_eval.sound_event.SegmentBasedMetrics
        Object with the SED results

    """
    # None instead of a shared mutable default list.
    if label_list is None:
        label_list = []

    seg_metrics = SegmentBasedMetrics(
        label_list, time_resolution=metric_resolution_sec
    )

    for i, y_true in enumerate(Y_val):
        # binarize the predicted probabilities
        pred = (Y_predicted[i] > 0.5).astype(int)

        event_list_val = event_roll_to_event_list(
            y_true, label_list, sequence_time_sec)
        event_list_pred = event_roll_to_event_list(
            pred, label_list, sequence_time_sec)

        seg_metrics.evaluate(event_list_val, event_list_pred)

    return seg_metrics


def classification(Y_val, Y_predicted, label_list=None):
    """ Calculate metrics for Audio Classification

    For each file the predictions are summed over the first axis and the
    class with the highest total is taken as the predicted label. The
    ground-truth label is the argmax of the first row of the annotation.

    Parameters
    ----------
    Y_val : list of ndarray
        One ground-truth array per file. Only the first row of each
        element is used.
        e.g. shape of each element: (N_times, N_classes)
    Y_predicted : list of ndarray
        One prediction array per file, in the same order as `Y_val`.
        e.g. shape of each element: (N_times, N_classes)
    label_list : list or None
        Label list. None is treated as an empty list.

    Returns
    -------
    sed_eval.scene.SceneClassificationMetrics
        Object with the classification results

    """
    # None instead of a shared mutable default list.
    if label_list is None:
        label_list = []

    acc_metrics = SceneClassificationMetrics(label_list)

    for i, y_true in enumerate(Y_val):
        # integrate over time, then pick the strongest class
        class_scores = np.sum(Y_predicted[i], axis=0)
        label_predicted = label_list[np.argmax(class_scores)]
        label_gt = label_list[np.argmax(y_true[0])]
        acc_metrics.evaluate(
            [{'scene_label': label_gt, 'file': ''}],
            [{'scene_label': label_predicted, 'file': ''}])

    return acc_metrics


def tagging(Y_val, Y_predicted, label_list=None):
    """ Calculate metrics for Audio Tagging

    For each file the predictions are averaged over the first axis and
    converted to a tag list with a 0.5 threshold. The ground-truth tag
    list is built from the first row of the annotation with the same
    threshold.

    Parameters
    ----------
    Y_val : list of ndarray
        One ground-truth array per file. Only the first row of each
        element is used.
        e.g. shape of each element: (N_times, N_classes)
    Y_predicted : list of ndarray
        One prediction array per file, in the same order as `Y_val`.
        e.g. shape of each element: (N_times, N_classes)
    label_list : list or None
        Label list. None is treated as an empty list.

    Returns
    -------
    sed_eval.audio_tag.AudioTaggingMetrics
        Object with the tagging results

    """
    # None instead of a shared mutable default list.
    if label_list is None:
        label_list = []

    tagging_metrics = AudioTaggingMetrics(label_list)

    for i, y_true in enumerate(Y_val):
        # average the predictions over time
        mean_pred = np.mean(Y_predicted[i], axis=0)

        tag_list_val = tag_probabilities_to_tag_list(
            y_true[0], label_list, threshold=0.5)
        tag_list_pred = tag_probabilities_to_tag_list(
            mean_pred, label_list, threshold=0.5)

        tagging_metrics.evaluate(
            [{'tags': tag_list_val, 'file': ''}],
            [{'tags': tag_list_pred, 'file': ''}])

    return tagging_metrics


def accuracy(Y_val, Y_predicted):
    """ Calculate the file-level classification accuracy

    For each file the predictions are summed over the first axis and the
    argmax of the result is compared with the argmax of the annotation.

    Parameters
    ----------
    Y_val : list of ndarray
        One annotation array per file.
    Y_predicted : list of ndarray
        One prediction array per file, in the same order as `Y_val`.
        e.g. shape of each element: (N_times, N_classes)

    Returns
    -------
    float
        Fraction of files whose predicted class matches the annotation.

    """
    n_files = len(Y_val)

    true_classes = np.zeros(n_files)
    predicted_classes = np.zeros(n_files)

    for idx in range(n_files):
        target = Y_val[idx]
        # temporal integration by sum over the first axis
        summed = np.sum(Y_predicted[idx], axis=0)
        predicted_classes[idx] = np.argmax(summed)
        true_classes[idx] = np.argmax(target)

    return np.mean(true_classes == predicted_classes)


def _pooled_segment_counts(Y_val, Y_predicted, sequence_time_sec,
                           metric_resolution_sec):
    """Return (Ntp, Nref, Nsys) pooled over all segments of all files."""
    # number of prediction frames averaged into one annotation segment
    ratio = int(np.round(metric_resolution_sec / sequence_time_sec))

    annotations = []
    predictions = []

    for i, y_true in enumerate(Y_val):
        y_true = np.asarray(y_true)
        pred = np.asarray(Y_predicted[i])

        if pred.shape[0] == y_true.shape[0]:
            y_pred = pred
        else:
            # Float buffer on purpose: inheriting an integer dtype from
            # the annotations would truncate the averaged probabilities
            # to 0 before they are thresholded.
            y_pred = np.zeros(y_true.shape, dtype=float)
            for j in range(len(y_true)):
                chunk = pred[j * ratio:(j + 1) * ratio]
                # segments with no prediction frames stay at zero
                if len(chunk) > 0:
                    y_pred[j] = np.mean(chunk, axis=0)

        annotations.append(y_true)
        predictions.append(y_pred)

    annotations = np.concatenate(annotations, axis=0)
    predictions = np.concatenate(predictions, axis=0)
    if annotations.shape != predictions.shape:
        raise ValueError(
            'Annotations and predictions have different shapes: '
            '%s vs %s' % (annotations.shape, predictions.shape))

    predictions = (predictions > 0.5).astype(int)
    n_tp = np.sum(predictions + annotations > 1)
    n_ref = np.sum(annotations)
    n_sys = np.sum(predictions)
    return n_tp, n_ref, n_sys


def ER(Y_val, Y_predicted, sequence_time_sec=0.5, metric_resolution_sec=1.0):
    """ Calculate the error rate

    When a file has a different number of prediction frames than
    annotation segments, consecutive groups of
    round(metric_resolution_sec / sequence_time_sec) frames are averaged
    into one segment. Predictions are binarized with a 0.5 threshold.
    Substitutions, deletions and insertions are derived from the totals
    pooled over every segment of every file, not segment by segment.

    Parameters
    ----------
    Y_val : list of ndarray
        One ground-truth array per file.
        e.g. shape of each element: (N_segments, N_classes)
    Y_predicted : list of ndarray
        One prediction array per file, in the same order as `Y_val`.
        e.g. shape of each element: (N_times, N_classes)
    sequence_time_sec : float
        Resolution of Y_predicted.
    metric_resolution_sec : float
        Resolution of the metrics.

    Returns
    -------
    float
        Error rate.

    Raises
    ------
    ValueError
        If annotations and predictions end up with different shapes,
        or if no file is given.

    """
    Ntp, Nref, Nsys = _pooled_segment_counts(
        Y_val, Y_predicted, sequence_time_sec, metric_resolution_sec)

    Sus = min(Nref, Nsys) - Ntp
    Del = max(0.0, Nref - Nsys)
    Ins = max(0.0, Nsys - Nref)

    error_rate = (Sus + Del + Ins) / float(Nref + eps)

    return error_rate


def F1(Y_val, Y_predicted, sequence_time_sec=0.5, metric_resolution_sec=1.0):
    """ Calculate the F1 score

    When a file has a different number of prediction frames than
    annotation segments, consecutive groups of
    round(metric_resolution_sec / sequence_time_sec) frames are averaged
    into one segment. Predictions are binarized with a 0.5 threshold and
    precision and recall are computed from the totals pooled over every
    segment of every file.

    Parameters
    ----------
    Y_val : list of ndarray
        One ground-truth array per file.
        e.g. shape of each element: (N_segments, N_classes)
    Y_predicted : list of ndarray
        One prediction array per file, in the same order as `Y_val`.
        e.g. shape of each element: (N_times, N_classes)
    sequence_time_sec : float
        Resolution of Y_predicted.
    metric_resolution_sec : float
        Resolution of the metrics.

    Returns
    -------
    float
        F1 score.

    Raises
    ------
    ValueError
        If annotations and predictions end up with different shapes,
        or if no file is given.

    """
    Ntp, Nref, Nsys = _pooled_segment_counts(
        Y_val, Y_predicted, sequence_time_sec, metric_resolution_sec)

    P = Ntp / float(Nsys + eps)
    R = Ntp / float(Nref + eps)

    Fmeasure = 2 * P * R / (P + R + eps)
    return Fmeasure