- Cnn14_mAP=0.431.pth +0 -3
- LICENSE.MIT +21 -0
- app.py +92 -0
- example.py +74 -0
- Cnn14_DecisionLevelMax.pth → panns_data/Cnn14_DecisionLevelMax.pth +0 -0
- class_labels_indices.csv → panns_data/class_labels_indices.csv +0 -0
- panns_inference/__init__.py +2 -0
- panns_inference/config.py +34 -0
- panns_inference/inference.py +66 -0
- panns_inference/models.py +178 -0
- panns_inference/pytorch_utils.py +48 -0
- requirements.txt +4 -0
- setup.py +23 -0
Cnn14_mAP=0.431.pth
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0dc499e40e9761ef5ea061ffc77697697f277f6a960894903df3ada000e34b31
size 327428481
LICENSE.MIT
ADDED
@@ -0,0 +1,21 @@
The MIT License

Copyright (c) 2010-2017 Google, Inc. http://angularjs.org

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
app.py
ADDED
@@ -0,0 +1,92 @@
import gradio as gr

#import os
import matplotlib.pyplot as plt
import numpy as np
import librosa
from panns_inference import SoundEventDetection, labels

def plot_sound_event_detection_result(framewise_output):
    """Visualization of sound event detection result.

    Args:
      framewise_output: (time_steps, classes_num)
    """
    out_fig_path = 'sed.png'
    #os.makedirs(os.path.dirname(out_fig_path), exist_ok=True)

    classwise_output = np.max(framewise_output, axis=0)  # (classes_num,)

    idxes = np.argsort(classwise_output)[::-1]
    idxes = idxes[0:5]

    ix_to_lb = {i : label for i, label in enumerate(labels)}
    lines = []
    for idx in idxes:
        line, = plt.plot(framewise_output[:, idx], label=ix_to_lb[idx])
        lines.append(line)

    plt.legend(handles=lines)
    plt.xlabel('Frames')
    plt.ylabel('Probability')
    plt.ylim(0, 1.)
    plt.savefig(out_fig_path)
    plt.close()
    print('Save fig to {}'.format(out_fig_path))
    # modified
    return plt.imread(out_fig_path)

def pred(audio):
    rate, y = audio
    device = 'cpu'  # 'cuda' | 'cpu'
    print('sample rate ', rate)
    print('shape ', y.shape)
    print('raw data', y)
    y = y.astype(np.float32)
    print('float', y)
    y = librosa.core.to_mono(y.T)
    print('shape ', y.shape)
    print('mono', y)
    y = librosa.core.resample(y, orig_sr=rate, target_sr=32000)
    print('shape ', y.shape)
    print('resampled', y)
    print(y.mean())
    print(y.std())
    #y = (y - y.mean())/y.std()
    y = y/y.max()
    print('normalized', y)
    #print(rate)
    plt.plot(y)
    plt.savefig('wave.png')
    plt.close()
    y = y[None, :]  # (batch_size, segment_samples)
    #print(y)

    #print('------ Audio tagging ------')
    #at = AudioTagging(checkpoint_path=None, device=device)
    #(clipwise_output, embedding) = at.inference(waveform)
    #"""clipwise_output: (batch_size, classes_num), embedding: (batch_size, embedding_size)"""

    #print_audio_tagging_result(clipwise_output[0])

    print('------ Sound event detection ------')
    sed = SoundEventDetection(checkpoint_path=None, device=device)
    framewise_output = sed.inference(y)
    """(batch_size, time_steps, classes_num)"""

    # modified
    return plot_sound_event_detection_result(framewise_output[0])


demo = gr.Interface(
    pred,
    gr.Audio(source="upload"),
    "image",
    examples=[
        "telephone_speech.wav",
        "ringtone.wav", "animals.wav",
    ],
    interpretation="default",
)

demo.launch()
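For a quick local check outside Gradio, pred() can be called with the same (sample_rate, waveform) tuple that gr.Audio passes in. A minimal sketch, assuming the soundfile package is available (it is not listed in requirements.txt) and one of the bundled example clips is on disk:

import soundfile as sf

data, rate = sf.read('telephone_speech.wav')   # data: (samples,) or (samples, channels)
image = pred((rate, data))                     # same tuple shape gr.Audio hands to pred()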
example.py
ADDED
@@ -0,0 +1,74 @@
import os
import matplotlib.pyplot as plt
import numpy as np
import librosa
import panns_inference
from panns_inference import AudioTagging, SoundEventDetection, labels

def print_audio_tagging_result(clipwise_output):
    """Visualization of audio tagging result.

    Args:
      clipwise_output: (classes_num,)
    """
    sorted_indexes = np.argsort(clipwise_output)[::-1]

    # Print audio tagging top probabilities
    for k in range(10):
        print('{}: {:.3f}'.format(np.array(labels)[sorted_indexes[k]],
            clipwise_output[sorted_indexes[k]]))


def plot_sound_event_detection_result(framewise_output):
    """Visualization of sound event detection result.

    Args:
      framewise_output: (time_steps, classes_num)
    """
    out_fig_path = 'results/sed_result.png'
    os.makedirs(os.path.dirname(out_fig_path), exist_ok=True)

    classwise_output = np.max(framewise_output, axis=0)  # (classes_num,)

    idxes = np.argsort(classwise_output)[::-1]
    idxes = idxes[0:5]

    ix_to_lb = {i : label for i, label in enumerate(labels)}
    lines = []
    for idx in idxes:
        line, = plt.plot(framewise_output[:, idx], label=ix_to_lb[idx])
        lines.append(line)

    plt.legend(handles=lines)
    plt.xlabel('Frames')
    plt.ylabel('Probability')
    plt.ylim(0, 1.)
    plt.savefig(out_fig_path)
    print('Save fig to {}'.format(out_fig_path))


if __name__ == '__main__':
    """Example of using panns_inference for audio tagging and sound event detection.
    """
    device = 'cpu'  # 'cuda' | 'cpu'
    audio_path = 'resources/R9_ZSCveAHg_7s.wav'
    (audio, _) = librosa.core.load(audio_path, sr=32000, mono=True)
    #print(audio)
    plt.plot(audio)
    plt.savefig('sample.png')
    audio = audio[None, :]  # (batch_size, segment_samples)
    #print(audio)

    print('------ Audio tagging ------')
    at = AudioTagging(checkpoint_path=None, device=device)
    (clipwise_output, embedding) = at.inference(audio)
    """clipwise_output: (batch_size, classes_num), embedding: (batch_size, embedding_size)"""

    print_audio_tagging_result(clipwise_output[0])

    print('------ Sound event detection ------')
    sed = SoundEventDetection(checkpoint_path=None, device=device)
    framewise_output = sed.inference(audio)
    """(batch_size, time_steps, classes_num)"""

    plot_sound_event_detection_result(framewise_output[0])
Cnn14_DecisionLevelMax.pth → panns_data/Cnn14_DecisionLevelMax.pth
RENAMED
File without changes
class_labels_indices.csv → panns_data/class_labels_indices.csv
RENAMED
File without changes
panns_inference/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .inference import SoundEventDetection
from .config import labels
panns_inference/config.py
ADDED
@@ -0,0 +1,34 @@
import os
#import numpy as np
import csv
#from pathlib import Path

sample_rate = 32000

labels_csv_path = 'panns_data/class_labels_indices.csv'  # changed to current directory

# Download labels if not exist
if not os.path.isfile(labels_csv_path):
    os.makedirs(os.path.dirname(labels_csv_path), exist_ok=True)
    os.system('wget -O "{}" "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv"'.format(labels_csv_path))

# Load label
with open(labels_csv_path, 'r') as f:
    reader = csv.reader(f, delimiter=',')
    lines = list(reader)

labels = []
ids = []    # Each label has a unique id such as "/m/068hy"
for i1 in range(1, len(lines)):
    id = lines[i1][1]
    label = lines[i1][2]
    ids.append(id)
    labels.append(label)

classes_num = len(labels)

lb_to_ix = {label : i for i, label in enumerate(labels)}
ix_to_lb = {i : label for i, label in enumerate(labels)}

id_to_ix = {id : i for i, id in enumerate(ids)}
ix_to_id = {i : id for i, id in enumerate(ids)}
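As a rough sanity check of the mappings this module builds (a sketch, assuming the package is importable and the CSV has been downloaded; the AudioSet label file is expected to contain 527 classes):

from panns_inference.config import labels, classes_num, lb_to_ix

print(classes_num)                      # expected: 527
print(labels[0], lb_to_ix[labels[0]])   # first label should map back to index 0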
panns_inference/inference.py
ADDED
@@ -0,0 +1,66 @@
import os
import torch

from .pytorch_utils import move_data_to_device
from .models import Cnn14_DecisionLevelMax
from .config import labels, classes_num

def create_folder(fd):
    if not os.path.exists(fd):
        os.makedirs(fd)

def get_filename(path):
    path = os.path.realpath(path)
    na_ext = path.split('/')[-1]
    na = os.path.splitext(na_ext)[0]
    return na

class SoundEventDetection(object):
    def __init__(self, model=None, checkpoint_path=None, device='cuda'):
        """Sound event detection inference wrapper.
        """
        if not checkpoint_path:
            checkpoint_path = 'panns_data/Cnn14_DecisionLevelMax.pth'  # moved to current directory
        print('Checkpoint path: {}'.format(checkpoint_path))

        if not os.path.exists(checkpoint_path) or os.path.getsize(checkpoint_path) < 3e8:
            create_folder(os.path.dirname(checkpoint_path))
            os.system('wget -O "{}" https://zenodo.org/record/3987831/files/Cnn14_DecisionLevelMax_mAP%3D0.385.pth?download=1'.format(checkpoint_path))

        if device == 'cuda' and torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'

        self.labels = labels
        self.classes_num = classes_num

        # Model
        if model is None:
            self.model = Cnn14_DecisionLevelMax(sample_rate=32000, window_size=1024,
                hop_size=320, mel_bins=64, fmin=50, fmax=14000,
                classes_num=self.classes_num)
        else:
            self.model = model

        checkpoint = torch.load(checkpoint_path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model'])

        # Parallel
        if 'cuda' in str(self.device):
            self.model.to(self.device)
            print('GPU number: {}'.format(torch.cuda.device_count()))
            self.model = torch.nn.DataParallel(self.model)
        else:
            print('Using CPU.')

    def inference(self, audio):
        audio = move_data_to_device(audio, self.device)

        with torch.no_grad():
            self.model.eval()
            output_dict = self.model(audio, None)

        framewise_output = output_dict['framewise_output'].data.cpu().numpy()

        return framewise_output
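A minimal usage sketch for the wrapper above (not part of the commit; assumes the checkpoint is already in panns_data/ or downloadable, and a clip already resampled to 32 kHz):

import numpy as np
from panns_inference import SoundEventDetection

waveform = np.zeros((1, 32000 * 10), dtype='float32')    # (batch_size, segment_samples)
sed = SoundEventDetection(checkpoint_path=None, device='cpu')
framewise_output = sed.inference(waveform)                # (batch_size, time_steps, classes_num)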
panns_inference/models.py
ADDED
@@ -0,0 +1,178 @@
"""This models.py contains selected models from:
https://github.com/qiuqiangkong/audioset_tagging_cnn/blob/master/pytorch/models.py
"""
#import os
#import sys
#import math
#import time
#import numpy as np
#import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation
from .pytorch_utils import do_mixup, interpolate, pad_framewise_output

def init_layer(layer):
    """Initialize a Linear or Convolutional layer. """
    nn.init.xavier_uniform_(layer.weight)

    if hasattr(layer, 'bias'):
        if layer.bias is not None:
            layer.bias.data.fill_(0.)


def init_bn(bn):
    """Initialize a Batchnorm layer. """
    bn.bias.data.fill_(0.)
    bn.weight.data.fill_(1.)


class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):

        super(ConvBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(3, 3), stride=(1, 1),
            padding=(1, 1), bias=False)

        self.conv2 = nn.Conv2d(in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=(3, 3), stride=(1, 1),
            padding=(1, 1), bias=False)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        init_layer(self.conv1)
        init_layer(self.conv2)
        init_bn(self.bn1)
        init_bn(self.bn2)


    def forward(self, input, pool_size=(2, 2), pool_type='avg'):

        x = input
        x = F.relu_(self.bn1(self.conv1(x)))
        x = F.relu_(self.bn2(self.conv2(x)))
        if pool_type == 'max':
            x = F.max_pool2d(x, kernel_size=pool_size)
        elif pool_type == 'avg':
            x = F.avg_pool2d(x, kernel_size=pool_size)
        elif pool_type == 'avg+max':
            x1 = F.avg_pool2d(x, kernel_size=pool_size)
            x2 = F.max_pool2d(x, kernel_size=pool_size)
            x = x1 + x2
        else:
            raise Exception('Incorrect argument!')

        return x


class Cnn14_DecisionLevelMax(nn.Module):
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
        fmax, classes_num):

        super(Cnn14_DecisionLevelMax, self).__init__()

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None
        self.interpolate_ratio = 32  # Downsampled ratio

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
            win_length=window_size, window=window, center=center, pad_mode=pad_mode,
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
            freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

        self.init_weight()

    def init_weight(self):
        init_bn(self.bn0)
        init_layer(self.fc1)
        init_layer(self.fc_audioset)

    def forward(self, input, mixup_lambda=None):
        """
        Input: (batch_size, data_length)"""

        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)

        frames_num = x.shape[2]

        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)

        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)

        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = torch.mean(x, dim=3)

        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2
        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        segmentwise_output = torch.sigmoid(self.fc_audioset(x))
        (clipwise_output, _) = torch.max(segmentwise_output, dim=1)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output, self.interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        output_dict = {'framewise_output': framewise_output,
            'clipwise_output': clipwise_output}

        return output_dict
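A forward-pass shape sketch for the model above (an illustration only, using zero input and the same hyperparameters inference.py passes in; 527 is the assumed AudioSet class count):

import torch
from panns_inference.models import Cnn14_DecisionLevelMax

model = Cnn14_DecisionLevelMax(sample_rate=32000, window_size=1024,
    hop_size=320, mel_bins=64, fmin=50, fmax=14000, classes_num=527)
model.eval()
with torch.no_grad():
    output_dict = model(torch.zeros(1, 32000), None)   # one second of silence
print(output_dict['framewise_output'].shape)   # (1, time_steps, 527)
print(output_dict['clipwise_output'].shape)    # (1, 527)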
panns_inference/pytorch_utils.py
ADDED
@@ -0,0 +1,48 @@
"""This pytorch_utils.py contains functions from:
https://github.com/qiuqiangkong/audioset_tagging_cnn/blob/master/pytorch/pytorch_utils.py
"""
import torch

def move_data_to_device(x, device):
    if 'float' in str(x.dtype):
        x = torch.Tensor(x)
    elif 'int' in str(x.dtype):
        x = torch.LongTensor(x)
    else:
        return x

    return x.to(device)

def interpolate(x, ratio):
    """Interpolate the prediction to compensate the downsampling operation in a
    CNN.

    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, ratio to upsample
    """
    (batch_size, time_steps, classes_num) = x.shape
    upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
    upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
    return upsampled

def pad_framewise_output(framewise_output, frames_num):
    """Pad framewise_output to the same length as input frames.
    Args:
      framewise_output: (batch_size, frames_num, classes_num)
      frames_num: int, number of frames to pad
    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    pad = framewise_output[:, -1 :, :].repeat(1, frames_num - framewise_output.shape[1], 1)
    """tensor for padding"""

    output = torch.cat((framewise_output, pad), dim=1)
    """(batch_size, frames_num, classes_num)"""

    return output

def do_mixup(x, mixup_lambda):
    out = x[0::2].transpose(0, -1) * mixup_lambda[0::2] + \
        x[1::2].transpose(0, -1) * mixup_lambda[1::2]
    return out.transpose(0, -1)
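A quick shape check for the two framewise helpers (a sketch with made-up sizes, not part of the commit):

import torch
from panns_inference.pytorch_utils import interpolate, pad_framewise_output

segmentwise = torch.rand(2, 10, 527)              # (batch_size, segments, classes_num)
framewise = interpolate(segmentwise, ratio=32)    # -> (2, 320, 527)
framewise = pad_framewise_output(framewise, 321)  # -> (2, 321, 527), last frame repeated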
requirements.txt
ADDED
@@ -0,0 +1,4 @@
torch
librosa
torchlibrosa
matplotlib
setup.py
ADDED
@@ -0,0 +1,23 @@
import setuptools

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="panns-inference",  # Replace with your own username
    version="0.0.7",
    author="Qiuqiang Kong",
    author_email="[email protected]",
    description="panns_inference: audio tagging and sound event detection inference toolbox",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/qiuqiangkong/panns_inference",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    install_requires=['matplotlib', 'librosa', 'torchlibrosa'],
    python_requires='>=3.6',
)