import os
import matplotlib.pyplot as plt
import numpy as np
import librosa
import panns_inference
from panns_inference import AudioTagging, SoundEventDetection, labels


def print_audio_tagging_result(clipwise_output, top_k=10):
    """Print the most probable audio tagging classes.

    Args:
      clipwise_output: (classes_num,) per-class probabilities for one clip.
      top_k: int, number of highest-probability classes to print. Clamped
        to the number of available classes so short outputs do not raise.
    """
    # Indexes of the classes, most probable first.
    sorted_indexes = np.argsort(clipwise_output)[::-1]
    top_k = min(top_k, len(sorted_indexes))

    # Print audio tagging top probabilities
    for idx in sorted_indexes[:top_k]:
        print('{}: {:.3f}'.format(labels[idx], clipwise_output[idx]))


def plot_sound_event_detection_result(framewise_output,
                                      out_fig_path='results/sed_result.png',
                                      top_k=5):
    """Plot the framewise probabilities of the strongest sound events.

    The classes are ranked by their maximum probability over time, and the
    `top_k` highest ranked ones are drawn as one curve each.

    Args:
      framewise_output: (time_steps, classes_num)
      out_fig_path: str, path of the image file to write.
      top_k: int, number of classes to plot.
    """
    # dirname is '' for a bare file name, and os.makedirs('') would raise.
    out_dir = os.path.dirname(out_fig_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    classwise_output = np.max(framewise_output, axis=0)  # (classes_num,)
    idxes = np.argsort(classwise_output)[::-1][:top_k]

    # Draw on a dedicated figure: the implicit global pyplot figure may
    # already hold unrelated curves drawn earlier by the caller.
    fig, ax = plt.subplots()
    lines = []
    for idx in idxes:
        line, = ax.plot(framewise_output[:, idx], label=labels[idx])
        lines.append(line)

    ax.legend(handles=lines)
    ax.set_xlabel('Frames')
    ax.set_ylabel('Probability')
    ax.set_ylim(0, 1.)
    fig.savefig(out_fig_path)
    plt.close(fig)  # Release the figure so repeated calls do not accumulate.
    print('Save fig to {}'.format(out_fig_path))


if __name__ == '__main__':
    # Example of using panns_inference for audio tagging and sound event
    # detection.
    device = 'cpu'  # 'cuda' | 'cpu'
    audio_path = 'resources/R9_ZSCveAHg_7s.wav'
    (audio, _) = librosa.core.load(audio_path, sr=32000, mono=True)

    # Save the raw waveform on its own figure and close it, so that it does
    # not leak into the sound event detection plot created below.
    waveform_fig, waveform_ax = plt.subplots()
    waveform_ax.plot(audio)
    waveform_fig.savefig('sample.png')
    plt.close(waveform_fig)

    audio = audio[None, :]  # (batch_size, segment_samples)

    print('------ Audio tagging ------')
    at = AudioTagging(checkpoint_path=None, device=device)
    # clipwise_output: (batch_size, classes_num),
    # embedding: (batch_size, embedding_size)
    (clipwise_output, embedding) = at.inference(audio)

    print_audio_tagging_result(clipwise_output[0])

    print('------ Sound event detection ------')
    sed = SoundEventDetection(checkpoint_path=None, device=device)
    # framewise_output: (batch_size, time_steps, classes_num)
    framewise_output = sed.inference(audio)

    plot_sound_event_detection_result(framewise_output[0])