"""Gradio demo: frame-level sound event detection on uploaded audio using PANNs."""
import gradio as gr

import matplotlib.pyplot as plt
import numpy as np
import librosa
from panns_inference import SoundEventDetection, labels

def plot_sound_event_detection_result(framewise_output):
    """Plot the top-5 detected sound classes over time.

    Args:
      framewise_output: (time_steps, classes_num)

    Returns:
      The saved figure read back as an image array.
    """
    out_fig_path = 'sed.png'

    # Peak probability of each class over all frames: (classes_num,)
    classwise_output = np.max(framewise_output, axis=0)

    # Indices of the five most prominent classes.
    idxes = np.argsort(classwise_output)[::-1][0:5]

    ix_to_lb = {i: label for i, label in enumerate(labels)}
    lines = []
    for idx in idxes:
        line, = plt.plot(framewise_output[:, idx], label=ix_to_lb[idx])
        lines.append(line)

    plt.legend(handles=lines)
    plt.xlabel('Frames')
    plt.ylabel('Probability')
    plt.ylim(0, 1.)
    plt.savefig(out_fig_path)
    plt.close()
    print('Save fig to {}'.format(out_fig_path))
    return plt.imread(out_fig_path)
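
# Design note: the PNG round-trip above (savefig + imread) keeps the code simple.
# Below is a minimal sketch of rendering the same plot straight to an array with
# an Agg canvas, avoiding the temporary file; render_plot_to_array is a
# hypothetical helper, not part of this app.
#
# from matplotlib.backends.backend_agg import FigureCanvasAgg
# from matplotlib.figure import Figure
#
# def render_plot_to_array(framewise_output, top_k=5):
#     fig = Figure()
#     canvas = FigureCanvasAgg(fig)
#     ax = fig.add_subplot(111)
#     top = np.argsort(np.max(framewise_output, axis=0))[::-1][:top_k]
#     for idx in top:
#         ax.plot(framewise_output[:, idx], label=labels[idx])
#     ax.legend()
#     ax.set_xlabel('Frames')
#     ax.set_ylabel('Probability')
#     ax.set_ylim(0, 1.0)
#     canvas.draw()
#     return np.asarray(canvas.buffer_rgba())  # (height, width, 4) uint8 RGBA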

def pred(audio):
    """Run sound event detection on audio from the Gradio Audio component.

    Args:
      audio: (sample_rate, samples) tuple as provided by gr.Audio.
    """
    rate, y = audio
    device = 'cpu'  # 'cuda' | 'cpu'

    # Convert to float mono and resample to the 32 kHz rate expected by PANNs.
    y = y.astype(np.float32)
    y = librosa.to_mono(y.T)
    y = librosa.resample(y, orig_sr=rate, target_sr=32000)

    # Peak-normalize to [-1, 1].
    y = y / np.max(np.abs(y))

    y = y[None, :]  # (batch_size, segment_samples)

    # Clip-level audio tagging (panns_inference.AudioTagging) is also available,
    # but this demo only uses frame-level sound event detection.
    print('------ Sound event detection ------')
    sed = SoundEventDetection(checkpoint_path=None, device=device)
    framewise_output = sed.inference(y)  # (batch_size, time_steps, classes_num)

    return plot_sound_event_detection_result(framewise_output[0])


demo = gr.Interface(
    pred,
    gr.Audio(source="upload"),
    "image",
    examples=[
        "telephone_speech.wav",
        "ringtone.wav", "animals.wav",
    ],
)

demo.launch()
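
# Optional local smoke test (a sketch, not part of the app): exercises pred()
# without the web UI using a synthetic 1 kHz tone. The names below (sr, tone,
# img) are illustrative only, and running it downloads the PANNs checkpoint on
# first use. Uncomment and run instead of demo.launch() above.
#
# sr = 16000
# duration = 2.0
# t = np.linspace(0, duration, int(sr * duration), endpoint=False)
# tone = (0.5 * np.sin(2 * np.pi * 1000 * t)).astype(np.float32)
# img = pred((sr, tone))  # gr.Audio passes (sample_rate, samples) tuples
# print('output image shape:', img.shape)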