""" |
|
Module needed for pre-processing of uploaded audio |
|
Uses silero_vad for silence removal and librosa for image generation |
|
|
|
Author: Jakub Polnis |
|
Copyright: Copyright 2025, Jakub Polnis |
|
License: Apache 2.0 |
|
Email: [email protected] |
|
""" |
|
|
|
import io

import torch
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from silero_vad import (load_silero_vad,
                        read_audio,
                        get_speech_timestamps,
                        save_audio,
                        VADIterator,
                        collect_chunks)

# Load the Silero VAD model once at import time; set USE_ONNX to True to use the ONNX build.
USE_ONNX = False
model = load_silero_vad(onnx=USE_ONNX)

# Silero VAD operates on 16 kHz audio.
SAMPLING_RATE = 16000

def silero_vad_remove_silence(audio_file_path):
    """Load an audio file and return a tensor containing only the detected speech."""
    torch.set_num_threads(1)
    audio = read_audio(audio_file_path, sampling_rate=SAMPLING_RATE)

    # Each timestamp dict holds the 'start' and 'end' sample indices of a speech segment.
    speech_timestamps = get_speech_timestamps(audio, model, sampling_rate=SAMPLING_RATE)

    if not speech_timestamps:
        print(f"No speech detected in {audio_file_path}. Returning original audio.")
        return audio
    else:
        # Concatenate the detected speech chunks into a single tensor.
        processed_audio = collect_chunks(speech_timestamps, audio)
        return processed_audio

def create_mel_spectrograms(file_path, segment_duration, start_offset):
    """Split silence-trimmed audio into segments and render each as a mel-spectrogram PIL image.

    Note: start_offset is currently unused.
    """
    pil_images = []

    # Remove silence first, then work on the raw waveform as a NumPy array.
    processed_audio = silero_vad_remove_silence(file_path)
    y = processed_audio.numpy()
    sr = SAMPLING_RATE

    audio_duration = librosa.get_duration(y=y, sr=sr)
    segment_duration_samples = int(segment_duration * sr)

    # Trim or zero-pad the waveform to a whole number of seconds.
    rounded_duration = int(np.round(audio_duration))
    if len(y) > rounded_duration * sr:
        y = y[:rounded_duration * sr]
    elif len(y) < rounded_duration * sr:
        y = np.pad(y, (0, rounded_duration * sr - len(y)), mode='constant')

    # Segments start every second, so they overlap when segment_duration > 1 s.
    for i in range(rounded_duration):
        start_sample = i * sr
        end_sample = start_sample + segment_duration_samples

        # Skip the tail if a full segment no longer fits.
        if end_sample > len(y):
            continue

        y_segment = y[start_sample:end_sample]
        if len(y_segment) > 0:
            # Mel spectrogram in dB, rendered without axes at roughly 224x224 px.
            S = librosa.feature.melspectrogram(y=y_segment, sr=sr, n_mels=128, fmax=8000, center=True)
            S_dB = librosa.power_to_db(S, ref=np.max)

            fig, ax = plt.subplots(figsize=(224 / 100, 224 / 100))
            librosa.display.specshow(S_dB, sr=sr, fmax=8000, ax=ax)
            ax.set_xlim(0, S.shape[-1])
            ax.set_ylim(0, S.shape[0])
            ax.set_xticks([])
            ax.set_yticks([])
            ax.set_xticklabels([])
            ax.set_yticklabels([])

            # Render the figure to an in-memory PNG and hand it to PIL.
            buffer = io.BytesIO()
            fig.savefig(buffer, format='PNG', bbox_inches=None, pad_inches=0,
                        dpi=100, transparent=True)
            buffer.seek(0)
            pil_image = Image.open(buffer)
            pil_images.append(pil_image.copy())

            buffer.close()
            plt.close(fig)

    print(pil_images)
    return pil_images
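

# --- Minimal usage sketch (not part of the original module) ---
# Illustrates how the pipeline might be invoked end to end. The path
# "example.wav" and the segment_duration/start_offset values below are
# illustrative assumptions, not values taken from the project.
if __name__ == "__main__":
    demo_images = create_mel_spectrograms("example.wav", segment_duration=1.0, start_offset=0.0)
    print(f"Generated {len(demo_images)} mel-spectrogram image(s)")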