""" |
|
Module needed for pre-processing of uploaded audio |
|
Uses silero_vad for silence removal and librosa for image generation |
|
|
|
Author: Jakub Polnis |
|
Copyright: Copyright 2025, Jakub Polnis |
|
License: Apache 2.0 |
|
Email: [email protected] |
|
""" |
|
|
|
import io

import torch
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
from silero_vad import (load_silero_vad,
                        read_audio,
                        get_speech_timestamps,
                        save_audio,
                        VADIterator,
                        collect_chunks)

# Load the Silero VAD model once at import time; set USE_ONNX to True to use the ONNX build.
USE_ONNX = False
model = load_silero_vad(onnx=USE_ONNX)

# Silero VAD operates on 16 kHz audio.
SAMPLING_RATE = 16000

def silero_vad_remove_silence(audio_file_path):
    """Load an audio file and return a tensor containing only the detected speech."""
    torch.set_num_threads(1)
    audio = read_audio(audio_file_path, sampling_rate=SAMPLING_RATE)

    # Each timestamp dict holds the 'start' and 'end' sample indices of a speech segment.
    speech_timestamps = get_speech_timestamps(audio, model, sampling_rate=SAMPLING_RATE)

    if not speech_timestamps:
        print(f"No speech detected in {audio_file_path}. Returning original audio.")
        return audio
    else:
        # Concatenate the detected speech chunks into a single tensor.
        processed_audio = collect_chunks(speech_timestamps, audio)
        return processed_audio

def create_mel_spectrograms(file_path, segment_duration, start_offset):
    """Split silence-trimmed audio into segments and render each as a mel-spectrogram PIL image.

    Note: start_offset is currently unused.
    """
    pil_images = []

    # Remove silence first, then work on the raw waveform as a NumPy array.
    processed_audio = silero_vad_remove_silence(file_path)
    y = processed_audio.numpy()
    sr = SAMPLING_RATE

    audio_duration = librosa.get_duration(y=y, sr=sr)
    segment_duration_samples = int(segment_duration * sr)

    # Trim or zero-pad the waveform to a whole number of seconds.
    rounded_duration = int(np.round(audio_duration))
    if len(y) > rounded_duration * sr:
        y = y[:rounded_duration * sr]
    elif len(y) < rounded_duration * sr:
        y = np.pad(y, (0, rounded_duration * sr - len(y)), mode='constant')

    # Segments start every second, so they overlap when segment_duration > 1 s.
    for i in range(rounded_duration):
        start_sample = i * sr
        end_sample = start_sample + segment_duration_samples

        # Skip the tail if a full segment no longer fits.
        if end_sample > len(y):
            continue

        y_segment = y[start_sample:end_sample]
        if len(y_segment) > 0:
            # Mel spectrogram in dB, rendered without axes at roughly 224x224 px.
            S = librosa.feature.melspectrogram(y=y_segment, sr=sr, n_mels=128, fmax=8000, center=True)
            S_dB = librosa.power_to_db(S, ref=np.max)

            fig, ax = plt.subplots(figsize=(224 / 100, 224 / 100))
            librosa.display.specshow(S_dB, sr=sr, fmax=8000, ax=ax)
            ax.set_xlim(0, S.shape[-1])
            ax.set_ylim(0, S.shape[0])
            ax.set_xticks([])
            ax.set_yticks([])
            ax.set_xticklabels([])
            ax.set_yticklabels([])

            # Render the figure to an in-memory PNG and hand it to PIL.
            buffer = io.BytesIO()
            fig.savefig(buffer, format='PNG', bbox_inches=None, pad_inches=0,
                        dpi=100, transparent=True)
            buffer.seek(0)
            pil_image = Image.open(buffer)
            pil_images.append(pil_image.copy())

            buffer.close()
            plt.close(fig)

    print(pil_images)
    return pil_images
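

# --- Minimal usage sketch (not part of the original module) ---
# Illustrates how the pipeline might be invoked end to end. The path
# "example.wav" and the segment_duration/start_offset values below are
# illustrative assumptions, not values taken from the project.
if __name__ == "__main__":
    demo_images = create_mel_spectrograms("example.wav", segment_duration=1.0, start_offset=0.0)
    print(f"Generated {len(demo_images)} mel-spectrogram image(s)")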