- Cnn14_mAP=0.431.pth +0 -3
- LICENSE.MIT +21 -0
- app.py +92 -0
- example.py +74 -0
- Cnn14_DecisionLevelMax.pth → panns_data/Cnn14_DecisionLevelMax.pth +0 -0
- class_labels_indices.csv → panns_data/class_labels_indices.csv +0 -0
- panns_inference/__init__.py +2 -0
- panns_inference/config.py +34 -0
- panns_inference/inference.py +66 -0
- panns_inference/models.py +178 -0
- panns_inference/pytorch_utils.py +48 -0
- requirements.txt +4 -0
- setup.py +23 -0
Cnn14_mAP=0.431.pth
DELETED
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0dc499e40e9761ef5ea061ffc77697697f277f6a960894903df3ada000e34b31
size 327428481
LICENSE.MIT
ADDED
@@ -0,0 +1,21 @@
The MIT License

Copyright (c) 2010-2017 Google, Inc. http://angularjs.org

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
app.py
ADDED
@@ -0,0 +1,92 @@
import gradio as gr

#import os
import matplotlib.pyplot as plt
import numpy as np
import librosa
from panns_inference import SoundEventDetection, labels

def plot_sound_event_detection_result(framewise_output):
    """Visualization of sound event detection result.

    Args:
      framewise_output: (time_steps, classes_num)
    """
    out_fig_path = 'sed.png'
    #os.makedirs(os.path.dirname(out_fig_path), exist_ok=True)

    classwise_output = np.max(framewise_output, axis=0)  # (classes_num,)

    idxes = np.argsort(classwise_output)[::-1]
    idxes = idxes[0:5]

    ix_to_lb = {i : label for i, label in enumerate(labels)}
    lines = []
    for idx in idxes:
        line, = plt.plot(framewise_output[:, idx], label=ix_to_lb[idx])
        lines.append(line)

    plt.legend(handles=lines)
    plt.xlabel('Frames')
    plt.ylabel('Probability')
    plt.ylim(0, 1.)
    plt.savefig(out_fig_path)
    plt.close()
    print('Save fig to {}'.format(out_fig_path))
    # modified
    return plt.imread(out_fig_path)

def pred(audio):
    rate, y = audio
    device = 'cpu'  # 'cuda' | 'cpu'
    print('sample rate ', rate)
    print('shape ', y.shape)
    print('raw data', y)
    y = y.astype(np.float32)
    print('float', y)
    y = librosa.core.to_mono(y.T)
    print('shape ', y.shape)
    print('mono', y)
    y = librosa.core.resample(y, orig_sr=rate, target_sr=32000)
    print('shape ', y.shape)
    print('resampled', y)
    print(y.mean())
    print(y.std())
    #y = (y - y.mean())/y.std()
    y = y/y.max()
    print('normalized', y)
    #print(rate)
    plt.plot(y)
    plt.savefig('wave.png')
    plt.close()
    y = y[None, :]  # (batch_size, segment_samples)
    #print(y)

    #print('------ Audio tagging ------')
    #at = AudioTagging(checkpoint_path=None, device=device)
    #(clipwise_output, embedding) = at.inference(waveform)
    #"""clipwise_output: (batch_size, classes_num), embedding: (batch_size, embedding_size)"""

    #print_audio_tagging_result(clipwise_output[0])

    print('------ Sound event detection ------')
    sed = SoundEventDetection(checkpoint_path=None, device=device)
    framewise_output = sed.inference(y)
    """(batch_size, time_steps, classes_num)"""

    # modified
    return plot_sound_event_detection_result(framewise_output[0])


demo = gr.Interface(
    pred,
    gr.Audio(source="upload"),
    "image",
    examples=[
        "telephone_speech.wav",
        "ringtone.wav", "animals.wav",
    ],
    interpretation="default",
)

demo.launch()
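For a quick local check outside Gradio, pred() can be called with the same (sample_rate, waveform) tuple that gr.Audio passes in. A minimal sketch, assuming the soundfile package is available (it is not listed in requirements.txt) and one of the bundled example clips is on disk:

import soundfile as sf

data, rate = sf.read('telephone_speech.wav')   # data: (samples,) or (samples, channels)
image = pred((rate, data))                     # same tuple shape gr.Audio hands to pred()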
example.py
ADDED
@@ -0,0 +1,74 @@
import os
import matplotlib.pyplot as plt
import numpy as np
import librosa
import panns_inference
from panns_inference import AudioTagging, SoundEventDetection, labels

def print_audio_tagging_result(clipwise_output):
    """Visualization of audio tagging result.

    Args:
      clipwise_output: (classes_num,)
    """
    sorted_indexes = np.argsort(clipwise_output)[::-1]

    # Print audio tagging top probabilities
    for k in range(10):
        print('{}: {:.3f}'.format(np.array(labels)[sorted_indexes[k]],
            clipwise_output[sorted_indexes[k]]))


def plot_sound_event_detection_result(framewise_output):
    """Visualization of sound event detection result.

    Args:
      framewise_output: (time_steps, classes_num)
    """
    out_fig_path = 'results/sed_result.png'
    os.makedirs(os.path.dirname(out_fig_path), exist_ok=True)

    classwise_output = np.max(framewise_output, axis=0)  # (classes_num,)

    idxes = np.argsort(classwise_output)[::-1]
    idxes = idxes[0:5]

    ix_to_lb = {i : label for i, label in enumerate(labels)}
    lines = []
    for idx in idxes:
        line, = plt.plot(framewise_output[:, idx], label=ix_to_lb[idx])
        lines.append(line)

    plt.legend(handles=lines)
    plt.xlabel('Frames')
    plt.ylabel('Probability')
    plt.ylim(0, 1.)
    plt.savefig(out_fig_path)
    print('Save fig to {}'.format(out_fig_path))


if __name__ == '__main__':
    """Example of using panns_inference for audio tagging and sound event detection.
    """
    device = 'cpu'  # 'cuda' | 'cpu'
    audio_path = 'resources/R9_ZSCveAHg_7s.wav'
    (audio, _) = librosa.core.load(audio_path, sr=32000, mono=True)
    #print(audio)
    plt.plot(audio)
    plt.savefig('sample.png')
    audio = audio[None, :]  # (batch_size, segment_samples)
    #print(audio)

    print('------ Audio tagging ------')
    at = AudioTagging(checkpoint_path=None, device=device)
    (clipwise_output, embedding) = at.inference(audio)
    """clipwise_output: (batch_size, classes_num), embedding: (batch_size, embedding_size)"""

    print_audio_tagging_result(clipwise_output[0])

    print('------ Sound event detection ------')
    sed = SoundEventDetection(checkpoint_path=None, device=device)
    framewise_output = sed.inference(audio)
    """(batch_size, time_steps, classes_num)"""

    plot_sound_event_detection_result(framewise_output[0])
Cnn14_DecisionLevelMax.pth → panns_data/Cnn14_DecisionLevelMax.pth
RENAMED
File without changes
class_labels_indices.csv → panns_data/class_labels_indices.csv
RENAMED
File without changes
panns_inference/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .inference import SoundEventDetection
from .config import labels
panns_inference/config.py
ADDED
@@ -0,0 +1,34 @@
import os
#import numpy as np
import csv
#from pathlib import Path

sample_rate = 32000

labels_csv_path = 'panns_data/class_labels_indices.csv'  # changed to current directory

# Download labels if not exist
if not os.path.isfile(labels_csv_path):
    os.makedirs(os.path.dirname(labels_csv_path), exist_ok=True)
    os.system('wget -O "{}" "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv"'.format(labels_csv_path))

# Load label
with open(labels_csv_path, 'r') as f:
    reader = csv.reader(f, delimiter=',')
    lines = list(reader)

labels = []
ids = []    # Each label has a unique id such as "/m/068hy"
for i1 in range(1, len(lines)):
    id = lines[i1][1]
    label = lines[i1][2]
    ids.append(id)
    labels.append(label)

classes_num = len(labels)

lb_to_ix = {label : i for i, label in enumerate(labels)}
ix_to_lb = {i : label for i, label in enumerate(labels)}

id_to_ix = {id : i for i, id in enumerate(ids)}
ix_to_id = {i : id for i, id in enumerate(ids)}
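As a rough sanity check of the mappings this module builds (a sketch, assuming the package is importable and the CSV has been downloaded; the AudioSet label file is expected to contain 527 classes):

from panns_inference.config import labels, classes_num, lb_to_ix

print(classes_num)                      # expected: 527
print(labels[0], lb_to_ix[labels[0]])   # first label should map back to index 0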
panns_inference/inference.py
ADDED
@@ -0,0 +1,66 @@
import os
import torch

from .pytorch_utils import move_data_to_device
from .models import Cnn14_DecisionLevelMax
from .config import labels, classes_num

def create_folder(fd):
    if not os.path.exists(fd):
        os.makedirs(fd)

def get_filename(path):
    path = os.path.realpath(path)
    na_ext = path.split('/')[-1]
    na = os.path.splitext(na_ext)[0]
    return na

class SoundEventDetection(object):
    def __init__(self, model=None, checkpoint_path=None, device='cuda'):
        """Sound event detection inference wrapper.
        """
        if not checkpoint_path:
            checkpoint_path = 'panns_data/Cnn14_DecisionLevelMax.pth'  # moved to current directory
        print('Checkpoint path: {}'.format(checkpoint_path))

        if not os.path.exists(checkpoint_path) or os.path.getsize(checkpoint_path) < 3e8:
            create_folder(os.path.dirname(checkpoint_path))
            os.system('wget -O "{}" https://zenodo.org/record/3987831/files/Cnn14_DecisionLevelMax_mAP%3D0.385.pth?download=1'.format(checkpoint_path))

        if device == 'cuda' and torch.cuda.is_available():
            self.device = 'cuda'
        else:
            self.device = 'cpu'

        self.labels = labels
        self.classes_num = classes_num

        # Model
        if model is None:
            self.model = Cnn14_DecisionLevelMax(sample_rate=32000, window_size=1024,
                hop_size=320, mel_bins=64, fmin=50, fmax=14000,
                classes_num=self.classes_num)
        else:
            self.model = model

        checkpoint = torch.load(checkpoint_path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model'])

        # Parallel
        if 'cuda' in str(self.device):
            self.model.to(self.device)
            print('GPU number: {}'.format(torch.cuda.device_count()))
            self.model = torch.nn.DataParallel(self.model)
        else:
            print('Using CPU.')

    def inference(self, audio):
        audio = move_data_to_device(audio, self.device)

        with torch.no_grad():
            self.model.eval()
            output_dict = self.model(audio, None)

        framewise_output = output_dict['framewise_output'].data.cpu().numpy()

        return framewise_output
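A minimal usage sketch for the wrapper above (not part of the commit; assumes the checkpoint is already in panns_data/ or downloadable, and a clip already resampled to 32 kHz):

import numpy as np
from panns_inference import SoundEventDetection

waveform = np.zeros((1, 32000 * 10), dtype='float32')    # (batch_size, segment_samples)
sed = SoundEventDetection(checkpoint_path=None, device='cpu')
framewise_output = sed.inference(waveform)                # (batch_size, time_steps, classes_num)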
panns_inference/models.py
ADDED
@@ -0,0 +1,178 @@
"""This models.py contains selected models from:
https://github.com/qiuqiangkong/audioset_tagging_cnn/blob/master/pytorch/models.py
"""
#import os
#import sys
#import math
#import time
#import numpy as np
#import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from torchlibrosa.stft import Spectrogram, LogmelFilterBank
from torchlibrosa.augmentation import SpecAugmentation
from .pytorch_utils import do_mixup, interpolate, pad_framewise_output

def init_layer(layer):
    """Initialize a Linear or Convolutional layer. """
    nn.init.xavier_uniform_(layer.weight)

    if hasattr(layer, 'bias'):
        if layer.bias is not None:
            layer.bias.data.fill_(0.)


def init_bn(bn):
    """Initialize a Batchnorm layer. """
    bn.bias.data.fill_(0.)
    bn.weight.data.fill_(1.)


class ConvBlock(nn.Module):
    def __init__(self, in_channels, out_channels):

        super(ConvBlock, self).__init__()

        self.conv1 = nn.Conv2d(in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=(3, 3), stride=(1, 1),
            padding=(1, 1), bias=False)

        self.conv2 = nn.Conv2d(in_channels=out_channels,
            out_channels=out_channels,
            kernel_size=(3, 3), stride=(1, 1),
            padding=(1, 1), bias=False)

        self.bn1 = nn.BatchNorm2d(out_channels)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.init_weight()

    def init_weight(self):
        init_layer(self.conv1)
        init_layer(self.conv2)
        init_bn(self.bn1)
        init_bn(self.bn2)


    def forward(self, input, pool_size=(2, 2), pool_type='avg'):

        x = input
        x = F.relu_(self.bn1(self.conv1(x)))
        x = F.relu_(self.bn2(self.conv2(x)))
        if pool_type == 'max':
            x = F.max_pool2d(x, kernel_size=pool_size)
        elif pool_type == 'avg':
            x = F.avg_pool2d(x, kernel_size=pool_size)
        elif pool_type == 'avg+max':
            x1 = F.avg_pool2d(x, kernel_size=pool_size)
            x2 = F.max_pool2d(x, kernel_size=pool_size)
            x = x1 + x2
        else:
            raise Exception('Incorrect argument!')

        return x


class Cnn14_DecisionLevelMax(nn.Module):
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin,
        fmax, classes_num):

        super(Cnn14_DecisionLevelMax, self).__init__()

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None
        self.interpolate_ratio = 32  # Downsampled ratio

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
            win_length=window_size, window=window, center=center, pad_mode=pad_mode,
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
            freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(64)

        self.conv_block1 = ConvBlock(in_channels=1, out_channels=64)
        self.conv_block2 = ConvBlock(in_channels=64, out_channels=128)
        self.conv_block3 = ConvBlock(in_channels=128, out_channels=256)
        self.conv_block4 = ConvBlock(in_channels=256, out_channels=512)
        self.conv_block5 = ConvBlock(in_channels=512, out_channels=1024)
        self.conv_block6 = ConvBlock(in_channels=1024, out_channels=2048)

        self.fc1 = nn.Linear(2048, 2048, bias=True)
        self.fc_audioset = nn.Linear(2048, classes_num, bias=True)

        self.init_weight()

    def init_weight(self):
        init_bn(self.bn0)
        init_layer(self.fc1)
        init_layer(self.fc_audioset)

    def forward(self, input, mixup_lambda=None):
        """
        Input: (batch_size, data_length)"""

        x = self.spectrogram_extractor(input)   # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)    # (batch_size, 1, time_steps, mel_bins)

        frames_num = x.shape[2]

        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)

        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)

        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)
        x = torch.mean(x, dim=3)

        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        x = x1 + x2
        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        segmentwise_output = torch.sigmoid(self.fc_audioset(x))
        (clipwise_output, _) = torch.max(segmentwise_output, dim=1)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output, self.interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        output_dict = {'framewise_output': framewise_output,
            'clipwise_output': clipwise_output}

        return output_dict
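A forward-pass shape sketch for the model above (an illustration only, using zero input and the same hyperparameters inference.py passes in; 527 is the assumed AudioSet class count):

import torch
from panns_inference.models import Cnn14_DecisionLevelMax

model = Cnn14_DecisionLevelMax(sample_rate=32000, window_size=1024,
    hop_size=320, mel_bins=64, fmin=50, fmax=14000, classes_num=527)
model.eval()
with torch.no_grad():
    output_dict = model(torch.zeros(1, 32000), None)   # one second of silence
print(output_dict['framewise_output'].shape)   # (1, time_steps, 527)
print(output_dict['clipwise_output'].shape)    # (1, 527)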
panns_inference/pytorch_utils.py
ADDED
@@ -0,0 +1,48 @@
"""This pytorch_utils.py contains functions from:
https://github.com/qiuqiangkong/audioset_tagging_cnn/blob/master/pytorch/pytorch_utils.py
"""
import torch

def move_data_to_device(x, device):
    if 'float' in str(x.dtype):
        x = torch.Tensor(x)
    elif 'int' in str(x.dtype):
        x = torch.LongTensor(x)
    else:
        return x

    return x.to(device)

def interpolate(x, ratio):
    """Interpolate the prediction to compensate the downsampling operation in a
    CNN.

    Args:
      x: (batch_size, time_steps, classes_num)
      ratio: int, ratio to upsample
    """
    (batch_size, time_steps, classes_num) = x.shape
    upsampled = x[:, :, None, :].repeat(1, 1, ratio, 1)
    upsampled = upsampled.reshape(batch_size, time_steps * ratio, classes_num)
    return upsampled

def pad_framewise_output(framewise_output, frames_num):
    """Pad framewise_output to the same length as input frames.
    Args:
      framewise_output: (batch_size, frames_num, classes_num)
      frames_num: int, number of frames to pad
    Outputs:
      output: (batch_size, frames_num, classes_num)
    """
    pad = framewise_output[:, -1 :, :].repeat(1, frames_num - framewise_output.shape[1], 1)
    """tensor for padding"""

    output = torch.cat((framewise_output, pad), dim=1)
    """(batch_size, frames_num, classes_num)"""

    return output

def do_mixup(x, mixup_lambda):
    out = x[0::2].transpose(0, -1) * mixup_lambda[0::2] + \
        x[1::2].transpose(0, -1) * mixup_lambda[1::2]
    return out.transpose(0, -1)
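A quick shape check for the two framewise helpers (a sketch with made-up sizes, not part of the commit):

import torch
from panns_inference.pytorch_utils import interpolate, pad_framewise_output

segmentwise = torch.rand(2, 10, 527)              # (batch_size, segments, classes_num)
framewise = interpolate(segmentwise, ratio=32)    # -> (2, 320, 527)
framewise = pad_framewise_output(framewise, 321)  # -> (2, 321, 527), last frame repeated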
requirements.txt
ADDED
@@ -0,0 +1,4 @@
torch
librosa
torchlibrosa
matplotlib
setup.py
ADDED
@@ -0,0 +1,23 @@
import setuptools

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="panns-inference",  # Replace with your own username
    version="0.0.7",
    author="Qiuqiang Kong",
    author_email="[email protected]",
    description="panns_inference: audio tagging and sound event detection inference toolbox",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/qiuqiangkong/panns_inference",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    install_requires=['matplotlib', 'librosa', 'torchlibrosa'],
    python_requires='>=3.6',
)