import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import librosa
from panns_inference import SoundEventDetection, labels


def plot_sound_event_detection_result(framewise_output):
    """Plot framewise probabilities of the top sound event classes.

    Args:
        framewise_output: (time_steps, classes_num)

    Returns:
        The rendered figure as a NumPy image array.
    """
    out_fig_path = 'sed.png'

    # Rank classes by their peak framewise probability and keep the top 5.
    classwise_output = np.max(framewise_output, axis=0)  # (classes_num,)
    idxes = np.argsort(classwise_output)[::-1][:5]

    ix_to_lb = {i: label for i, label in enumerate(labels)}
    lines = []
    for idx in idxes:
        line, = plt.plot(framewise_output[:, idx], label=ix_to_lb[idx])
        lines.append(line)

    plt.legend(handles=lines)
    plt.xlabel('Frames')
    plt.ylabel('Probability')
    plt.ylim(0, 1.)
    plt.savefig(out_fig_path)
    plt.close()
    print('Saved figure to {}'.format(out_fig_path))
    return plt.imread(out_fig_path)


def pred(audio):
    """Run PANNs sound event detection on audio from the Gradio component."""
    rate, y = audio  # gr.Audio yields a (sample_rate, samples) tuple
    device = 'cpu'  # 'cuda' | 'cpu'

    # Convert to mono float and resample to the 32 kHz rate PANNs expects.
    y = y.astype(np.float32)
    y = librosa.core.to_mono(y.T)
    y = librosa.core.resample(y, orig_sr=rate, target_sr=32000)

    # Peak-normalize; use the absolute maximum so signals whose largest
    # excursion is negative are scaled correctly.
    y = y / np.max(np.abs(y))

    y = y[None, :]  # (batch_size, segment_samples)

    # Audio tagging (clipwise output) is also available via
    # panns_inference.AudioTagging, but this demo only runs event detection.
    print('------ Sound event detection ------')
    sed = SoundEventDetection(checkpoint_path=None, device=device)
    framewise_output = sed.inference(y)  # (batch_size, time_steps, classes_num)
    return plot_sound_event_detection_result(framewise_output[0])


demo = gr.Interface(
    pred,
    gr.Audio(source="upload"),
    "image",
    examples=[
        "telephone_speech.wav",
        "ringtone.wav",
        "animals.wav",
    ],
    title="Sound Event Detection",
    description=(
        "This is a demo Hugging Face Space app for PANNs: Large-Scale "
        "Pretrained Audio Neural Networks for Audio Pattern Recognition. "
        "Please see the README for more details."
    ),
)

demo.launch()
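
# A minimal local smoke test, kept as comments so it never runs inside the
# Space. It assumes the soundfile package is installed and that one of the
# example wav files sits next to this script; both are assumptions for
# illustration, not requirements of the app. pred() takes the same
# (sample_rate, samples) tuple that gr.Audio produces:
#
#     import soundfile as sf
#     data, sr = sf.read("telephone_speech.wav", dtype="int16")
#     img = pred((sr, data))
#     plt.imsave("sed_test.png", img)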