pcdarvin committed
Commit 5e740f6 · 1 Parent(s): d8b6c83
Cnn14_mAP=0.431.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0dc499e40e9761ef5ea061ffc77697697f277f6a960894903df3ada000e34b31
- size 327428481
LICENSE.MIT ADDED
@@ -0,0 +1,21 @@
+ The MIT License
+
+ Copyright (c) 2010-2017 Google, Inc. http://angularjs.org
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
app.py ADDED
@@ -0,0 +1,92 @@
+ import gradio as gr
+
+ #import os
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import librosa
+ from panns_inference import SoundEventDetection, labels
+
+ def plot_sound_event_detection_result(framewise_output):
+     """Visualization of sound event detection result.
+
+     Args:
+       framewise_output: (time_steps, classes_num)
+     """
+     out_fig_path = 'sed.png'
+     #os.makedirs(os.path.dirname(out_fig_path), exist_ok=True)
+
+     classwise_output = np.max(framewise_output, axis=0)  # (classes_num,)
+
+     idxes = np.argsort(classwise_output)[::-1]
+     idxes = idxes[0:5]
+
+     ix_to_lb = {i : label for i, label in enumerate(labels)}
+     lines = []
+     for idx in idxes:
+         line, = plt.plot(framewise_output[:, idx], label=ix_to_lb[idx])
+         lines.append(line)
+
+     plt.legend(handles=lines)
+     plt.xlabel('Frames')
+     plt.ylabel('Probability')
+     plt.ylim(0, 1.)
+     plt.savefig(out_fig_path)
+     plt.close()
+     print('Save fig to {}'.format(out_fig_path))
+     # modified
+     return plt.imread(out_fig_path)
+
+ def pred(audio):
+     rate, y = audio
+     device = 'cpu'  # 'cuda' | 'cpu'
+     print('sample rate ', rate)
+     print('shape ', y.shape)
+     print('raw data', y)
+     y = y.astype(np.float32)
+     print('float', y)
+     y = librosa.core.to_mono(y.T)
+     print('shape ', y.shape)
+     print('mono', y)
+     y = librosa.core.resample(y, orig_sr=rate, target_sr=32000)
+     print('shape ', y.shape)
+     print('resampled', y)
+     print(y.mean())
+     print(y.std())
+     #y = (y - y.mean())/y.std()
+     y = y/y.max()
+     print('normalized', y)
+     #print(rate)
+     plt.plot(y)
+     plt.savefig('wave.png')
+     plt.close()
+     y = y[None, :]  # (batch_size, segment_samples)
+     #print(y)
+
+     #print('------ Audio tagging ------')
+     #at = AudioTagging(checkpoint_path=None, device=device)
+     #(clipwise_output, embedding) = at.inference(waveform)
+     #"""clipwise_output: (batch_size, classes_num), embedding: (batch_size, embedding_size)"""
+
+     #print_audio_tagging_result(clipwise_output[0])
+
+     print('------ Sound event detection ------')
+     sed = SoundEventDetection(checkpoint_path=None, device=device)
+     framewise_output = sed.inference(y)
+     """(batch_size, time_steps, classes_num)"""
+
+     # modified
+     return plot_sound_event_detection_result(framewise_output[0])
+
+
+ demo = gr.Interface(
+     pred,
+     gr.Audio(source="upload"),
+     "image",
+     examples=[
+         "telephone_speech.wav",
+         "ringtone.wav", "animals.wav",
+     ],
+     interpretation="default",
+ )
+
+ demo.launch()
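The Gradio callback above receives audio as a (sample_rate, samples) tuple. For reference, a minimal smoke-test sketch that drives pred() outside the web UI; the file name 'test.wav' and the use of soundfile are assumptions, not part of the app:

# Sketch: assumes pred() from app.py above is in scope and 'test.wav' exists locally.
import soundfile as sf  # assumption: soundfile is available (it ships as a librosa dependency)

samples, rate = sf.read('test.wav', dtype='int16')  # mimic Gradio's (sample_rate, int16 array) payload
fig = pred((rate, samples))                         # pred() downmixes, resamples to 32 kHz, runs SED
print(fig.shape)                                    # RGBA array read back from sed.png via plt.imread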
example.py ADDED
@@ -0,0 +1,74 @@
+ import os
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import librosa
+ import panns_inference
+ from panns_inference import AudioTagging, SoundEventDetection, labels
+
+ def print_audio_tagging_result(clipwise_output):
+     """Visualization of audio tagging result.
+
+     Args:
+       clipwise_output: (classes_num,)
+     """
+     sorted_indexes = np.argsort(clipwise_output)[::-1]
+
+     # Print audio tagging top probabilities
+     for k in range(10):
+         print('{}: {:.3f}'.format(np.array(labels)[sorted_indexes[k]],
+             clipwise_output[sorted_indexes[k]]))
+
+
+ def plot_sound_event_detection_result(framewise_output):
+     """Visualization of sound event detection result.
+
+     Args:
+       framewise_output: (time_steps, classes_num)
+     """
+     out_fig_path = 'results/sed_result.png'
+     os.makedirs(os.path.dirname(out_fig_path), exist_ok=True)
+
+     classwise_output = np.max(framewise_output, axis=0)  # (classes_num,)
+
+     idxes = np.argsort(classwise_output)[::-1]
+     idxes = idxes[0:5]
+
+     ix_to_lb = {i : label for i, label in enumerate(labels)}
+     lines = []
+     for idx in idxes:
+         line, = plt.plot(framewise_output[:, idx], label=ix_to_lb[idx])
+         lines.append(line)
+
+     plt.legend(handles=lines)
+     plt.xlabel('Frames')
+     plt.ylabel('Probability')
+     plt.ylim(0, 1.)
+     plt.savefig(out_fig_path)
+     print('Save fig to {}'.format(out_fig_path))
+
+
+ if __name__ == '__main__':
+     """Example of using panns_inference for audio tagging and sound event detection.
+     """
+     device = 'cpu'  # 'cuda' | 'cpu'
+     audio_path = 'resources/R9_ZSCveAHg_7s.wav'
+     (audio, _) = librosa.core.load(audio_path, sr=32000, mono=True)
+     #print(audio)
+     plt.plot(audio)
+     plt.savefig('sample.png')
+     audio = audio[None, :]  # (batch_size, segment_samples)
+     #print(audio)
+
+     print('------ Audio tagging ------')
+     at = AudioTagging(checkpoint_path=None, device=device)
+     (clipwise_output, embedding) = at.inference(audio)
+     """clipwise_output: (batch_size, classes_num), embedding: (batch_size, embedding_size)"""
+
+     print_audio_tagging_result(clipwise_output[0])
+
+     print('------ Sound event detection ------')
+     sed = SoundEventDetection(checkpoint_path=None, device=device)
+     framewise_output = sed.inference(audio)
+     """(batch_size, time_steps, classes_num)"""
+
+     plot_sound_event_detection_result(framewise_output[0])
Cnn14_DecisionLevelMax.pth → panns_data/Cnn14_DecisionLevelMax.pth RENAMED
File without changes
class_labels_indices.csv → panns_data/class_labels_indices.csv RENAMED
File without changes
panns_inference/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .inference import SoundEventDetection
+ from .config import labels
panns_inference/config.py ADDED
@@ -0,0 +1,34 @@
+ import os
+ #import numpy as np
+ import csv
+ #from pathlib import Path
+
+ sample_rate = 32000
+
+ labels_csv_path = 'panns_data/class_labels_indices.csv'  # changed to current directory
+
+ # Download labels if not exist
+ if not os.path.isfile(labels_csv_path):
+     os.makedirs(os.path.dirname(labels_csv_path), exist_ok=True)
+     os.system('wget -O "{}" "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv"'.format(labels_csv_path))
+
+ # Load label
+ with open(labels_csv_path, 'r') as f:
+     reader = csv.reader(f, delimiter=',')
+     lines = list(reader)
+
+ labels = []
+ ids = []  # Each label has a unique id such as "/m/068hy"
+ for i1 in range(1, len(lines)):
+     id = lines[i1][1]
+     label = lines[i1][2]
+     ids.append(id)
+     labels.append(label)
+
+ classes_num = len(labels)
+
+ lb_to_ix = {label : i for i, label in enumerate(labels)}
+ ix_to_lb = {i : label for i, label in enumerate(labels)}
+
+ id_to_ix = {id : i for i, id in enumerate(ids)}
+ ix_to_id = {i : id for i, id in enumerate(ids)}
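config.py builds the AudioSet label list and index mappings at import time, downloading class_labels_indices.csv on first use. A short sketch of how these mappings are typically consumed; the printed values assume the standard AudioSet CSV, which has 527 classes with 'Speech' at index 0:

from panns_inference.config import labels, classes_num, lb_to_ix, ix_to_lb

print(classes_num)         # 527 for the standard AudioSet ontology
print(ix_to_lb[0])         # 'Speech'
print(lb_to_ix['Speech'])  # 0
print(labels[:3])          # first few display names parsed from the CSV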
panns_inference/inference.py ADDED
@@ -0,0 +1,66 @@
+ import os
+ import torch
+
+ from .pytorch_utils import move_data_to_device
+ from .models import Cnn14_DecisionLevelMax
+ from .config import labels, classes_num
+
+ def create_folder(fd):
+     if not os.path.exists(fd):
+         os.makedirs(fd)
+
+ def get_filename(path):
+     path = os.path.realpath(path)
+     na_ext = path.split('/')[-1]
+     na = os.path.splitext(na_ext)[0]
+     return na
+
+ class SoundEventDetection(object):
+     def __init__(self, model=None, checkpoint_path=None, device='cuda'):
+         """Sound event detection inference wrapper.
+         """
+         if not checkpoint_path:
+             checkpoint_path = 'panns_data/Cnn14_DecisionLevelMax.pth'  # moved to current directory
+         print('Checkpoint path: {}'.format(checkpoint_path))
+
+         if not os.path.exists(checkpoint_path) or os.path.getsize(checkpoint_path) < 3e8:
+             create_folder(os.path.dirname(checkpoint_path))
+             os.system('wget -O "{}" https://zenodo.org/record/3987831/files/Cnn14_DecisionLevelMax_mAP%3D0.385.pth?download=1'.format(checkpoint_path))
+
+         if device == 'cuda' and torch.cuda.is_available():
+             self.device = 'cuda'
+         else:
+             self.device = 'cpu'
+
+         self.labels = labels
+         self.classes_num = classes_num
+
+         # Model
+         if model is None:
+             self.model = Cnn14_DecisionLevelMax(sample_rate=32000, window_size=1024,
+                 hop_size=320, mel_bins=64, fmin=50, fmax=14000,
+                 classes_num=self.classes_num)
+         else:
+             self.model = model
+
+         checkpoint = torch.load(checkpoint_path, map_location=self.device)
+         self.model.load_state_dict(checkpoint['model'])
+
+         # Parallel
+         if 'cuda' in str(self.device):
+             self.model.to(self.device)
+             print('GPU number: {}'.format(torch.cuda.device_count()))
+             self.model = torch.nn.DataParallel(self.model)
+         else:
+             print('Using CPU.')
+
+     def inference(self, audio):
+         audio = move_data_to_device(audio, self.device)
+
+         with torch.no_grad():
+             self.model.eval()
+             output_dict = self.model(audio, None)
+
+         framewise_output = output_dict['framewise_output'].data.cpu().numpy()
+
+         return framewise_output
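For comparison with app.py and example.py above, a minimal sketch of calling SoundEventDetection directly: the wrapper expects float32 audio at 32 kHz shaped (batch_size, samples); random noise is used here only to exercise the shapes.

import numpy as np
from panns_inference import SoundEventDetection

waveform = np.random.randn(1, 32000 * 5).astype(np.float32)    # 5 s of noise, batch_size = 1
sed = SoundEventDetection(checkpoint_path=None, device='cpu')  # downloads the checkpoint if missing
framewise_output = sed.inference(waveform)                     # (batch_size, time_steps, classes_num)
print(framewise_output.shape)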
panns_inference/models.py ADDED
@@ -0,0 +1,178 @@
+ """This models.py contains selected models from:
+ https://github.com/qiuqiangkong/audioset_tagging_cnn/blob/master/pytorch/models.py
+ """
+ #import os
+ #import sys
+ #import math
+ #import time
+ #import numpy as np
+ #import matplotlib.pyplot as plt
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ from torchlibrosa.stft import Spectrogram, LogmelFilterBank
+ from torchlibrosa.augmentation import SpecAugmentation
+ from .pytorch_utils import do_mixup, interpolate, pad_framewise_output
+
+ def init_layer(layer):
+     """Initialize a Linear or Convolutional layer. """
+     nn.init.xavier_uniform_(layer.weight)
+
+     if hasattr(layer, 'bias'):
+         if layer.bias is not None:
+             layer.bias.data.fill_(0.)
+
+
+ def init_bn(bn):
+     """Initialize a Batchnorm layer. """
+     bn.bias.data.fill_(0.)
+     bn.weight.data.fill_(1.)
+
+
+ class ConvBlock(nn.Module):
+     def __init__(self, in_channels, out_channels):
+
+         super(ConvBlock, self).__init__()
+
+         self.conv1 = nn.Conv2d(in_channels=in_channels,
+                               out_channels=out_channels,
+                               kernel_size=(3, 3), stride=(1, 1),
+                               padding=(1, 1), bias=False)
+
+         self.conv2 = nn.Conv2d(in_channels=out_channels,
+                               out_channels=out_channels,
+                               kernel_size=(3, 3), stride=(1, 1),
+                               padding=(1, 1), bias=False)
+
+         self.bn1 = nn.BatchNorm2d(out_channels)
+         self.bn2 = nn.BatchNorm2d(out_channels)
+
+         self.init_weight()
+
+     def init_weight(self):
+         init_layer(self.conv1)
+         init_layer(self.conv2)
+         init_bn(self.bn1)
+         init_bn(self.bn2)
+
+
+     def forward(self, input, pool_size=(2, 2), pool_type='avg'):
+
+         x = input
+         x = F.relu_(self.bn1(self.conv1(x)))
+         x = F.relu_(self.bn2(self.conv2(x)))
+         if pool_type == 'max':
+             x = F.max_pool2d(x, kernel_size=pool_size)
+         elif pool_type == 'avg':
+             x = F.avg_pool2d(x, kernel_size=pool_size)
+         elif pool_type == 'avg+max':
+             x1 = F.avg_pool2d(x, kernel_size=pool_size)
+             x2 = F.max_pool2d(x, kernel_size=pool_size)
+             x = x1 + x2
+         else:
+             raise Exception('Incorrect argument!')
+
+         return x
+
+
+ class Cnn14_DecisionLevelMax(nn.Module):
+     def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
+         fmax, classes_num):
+
+         super(Cnn14_DecisionLevelMax, self).__init__()
+
+         window = 'hann'
+         center = True
+         pad_mode = 'reflect'
+         ref = 1.0
+         amin = 1e-10
+         top_db = None
+         self.interpolate_ratio = 32  # Downsampled ratio
+
+         # Spectrogram extractor
+         self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
+             win_length=window_size, window=window, center=center, pad_mode=pad_mode,
+             freeze_parameters=True)
+
+         # Logmel feature extractor
+         self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
+             n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
+             freeze_parameters=True)
+
+         # Spec augmenter
+         self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
+             freq_drop_width=8, freq_stripes_num=2)
+
+         self.bn0 = nn.BatchNorm2d(64)
+
+         self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
+         self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
+         self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
+         self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
+         self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
+         self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)
+
+         self.fc1 = nn.Linear(2048, 2048, bias=True)
+         self.fc_audioset = nn.Linear(2048, classes_num, bias=True)
+
+         self.init_weight()
+
+     def init_weight(self):
+         init_bn(self.bn0)
+         init_layer(self.fc1)
+         init_layer(self.fc_audioset)
+
+     def forward(self, input, mixup_lambda=None):
+         """
+         Input: (batch_size, data_length)"""
+
+         x = self.spectrogram_extractor(input)  # (batch_size, 1, time_steps, freq_bins)
+         x = self.logmel_extractor(x)  # (batch_size, 1, time_steps, mel_bins)
+
+         frames_num = x.shape[2]
+
+         x = x.transpose(1, 3)
+         x = self.bn0(x)
+         x = x.transpose(1, 3)
+
+         if self.training:
+             x = self.spec_augmenter(x)
+
+         # Mixup on spectrogram
+         if self.training and mixup_lambda is not None:
+             x = do_mixup(x, mixup_lambda)
+
+         x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
+         x = F.dropout(x, p=0.2, training=self.training)
+         x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
+         x = F.dropout(x, p=0.2, training=self.training)
+         x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
+         x = F.dropout(x, p=0.2, training=self.training)
+         x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
+         x = F.dropout(x, p=0.2, training=self.training)
+         x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
+         x = F.dropout(x, p=0.2, training=self.training)
+         x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
+         x = F.dropout(x, p=0.2, training=self.training)
+         x = torch.mean(x, dim=3)
+
+         x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
+         x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
+         x = x1 + x2
+         x = F.dropout(x, p=0.5, training=self.training)
+         x = x.transpose(1, 2)
+         x = F.relu_(self.fc1(x))
+         x = F.dropout(x, p=0.5, training=self.training)
+         segmentwise_output = torch.sigmoid(self.fc_audioset(x))
+         (clipwise_output, _) = torch.max(segmentwise_output, dim=1)
+
+         # Get framewise output
+         framewise_output = interpolate(segmentwise_output, self.interpolate_ratio)
+         framewise_output = pad_framewise_output(framewise_output, frames_num)
+
+         output_dict = {'framewise_output': framewise_output,
+             'clipwise_output': clipwise_output}
+
+         return output_dict
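Cnn14_DecisionLevelMax maps a raw waveform to per-frame class probabilities; with hop_size=320 at 32 kHz, each spectrogram frame (and hence each interpolated output frame) covers roughly 10 ms. A small shape-check sketch with randomly initialized weights (no checkpoint loaded):

import torch
from panns_inference.models import Cnn14_DecisionLevelMax

model = Cnn14_DecisionLevelMax(sample_rate=32000, window_size=1024, hop_size=320,
                               mel_bins=64, fmin=50, fmax=14000, classes_num=527)
model.eval()
with torch.no_grad():
    out = model(torch.randn(1, 32000))  # one second of random audio
print(out['framewise_output'].shape)    # (1, time_steps, 527)
print(out['clipwise_output'].shape)     # (1, 527)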
panns_inference/pytorch_utils.py ADDED
@@ -0,0 +1,48 @@
+ """This pytorch_utils.py contains functions from:
+ https://github.com/qiuqiangkong/audioset_tagging_cnn/blob/master/pytorch/pytorch_utils.py
+ """
+ import torch
+
+ def move_data_to_device(x, device):
+     if 'float' in str(x.dtype):
+         x = torch.Tensor(x)
+     elif 'int' in str(x.dtype):
+         x = torch.LongTensor(x)
+     else:
+         return x
+
+     return x.to(device)
+
+ def interpolate(x, ratio):
+     """Interpolate the prediction to compensate the downsampling operation in a
+     CNN.
+
+     Args:
+       x: (batch_size, time_steps, classes_num)
+       ratio: int, ratio to upsample
+     """
+     (batch_size, time_steps, classes_num) = x.shape
+     upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
+     upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
+     return upsampled
+
+ def pad_framewise_output(framewise_output, frames_num):
+     """Pad framewise_output to the same length as input frames.
+     Args:
+       framewise_output: (batch_size, frames_num, classes_num)
+       frames_num: int, number of frames to pad
+     Outputs:
+       output: (batch_size, frames_num, classes_num)
+     """
+     pad = framewise_output[:, -1 :, :].repeat(1, frames_num - framewise_output.shape[1], 1)
+     """tensor for padding"""
+
+     output = torch.cat((framewise_output, pad), dim=1)
+     """(batch_size, frames_num, classes_num)"""
+
+     return output
+
+ def do_mixup(x, mixup_lambda):
+     out = x[0::2].transpose(0, -1) * mixup_lambda[0::2] + \
+         x[1::2].transpose(0, -1) * mixup_lambda[1::2]
+     return out.transpose(0, -1)
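interpolate and pad_framewise_output undo the 32x temporal downsampling of the CNN so the framewise scores line up with the input spectrogram frames. A small shape sketch with toy tensors:

import torch
from panns_inference.pytorch_utils import interpolate, pad_framewise_output

segmentwise = torch.rand(1, 3, 527)               # (batch_size, time_steps, classes_num)
upsampled = interpolate(segmentwise, ratio=32)    # (1, 96, 527): each step repeated 32 times
framewise = pad_framewise_output(upsampled, 101)  # repeat the last frame up to 101 frames
print(upsampled.shape, framewise.shape)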
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ torch
+ librosa
+ torchlibrosa
+ matplotlib
setup.py ADDED
@@ -0,0 +1,23 @@
+ import setuptools
+
+ with open("README.md", "r") as fh:
+     long_description = fh.read()
+
+ setuptools.setup(
+     name="panns-inference",  # Replace with your own username
+     version="0.0.7",
+     author="Qiuqiang Kong",
+     author_email="[email protected]",
+     description="panns_inference: audio tagging and sound event detection inference toolbox",
+     long_description=long_description,
+     long_description_content_type="text/markdown",
+     url="https://github.com/qiuqiangkong/panns_inference",
+     packages=setuptools.find_packages(),
+     classifiers=[
+         "Programming Language :: Python :: 3",
+         "License :: OSI Approved :: MIT License",
+         "Operating System :: OS Independent",
+     ],
+     install_requires=['matplotlib', 'librosa', 'torchlibrosa'],
+     python_requires='>=3.6',
+ )