import os
import torch
import torch.nn.functional as F
from ssl_ecapa_model import SSL_ECAPA_TDNN
from score import loadModel
from predict import loadWav
import gradio as gr
# Load the pretrained VoxSim WavLM-ECAPA scorer once at startup and switch to eval mode.
model = loadModel('voxsim_wavlm_ecapa.model')
model.eval()
def calc_voxsim(inp_path, ref_path):
    inp_wav = loadWav(inp_path, max_frames=0)
    ref_wav = loadWav(ref_path, max_frames=0)
    with torch.no_grad():
        # Extract speaker embeddings and L2-normalize them; the matrix product of
        # unit-norm embeddings is their cosine similarity.
        input_emb = F.normalize(model.forward(inp_wav), p=2, dim=1)
        ref_emb = F.normalize(model.forward(ref_wav), p=2, dim=1)
        score = torch.matmul(input_emb, ref_emb.T)
    return score.detach().cpu().numpy()
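
# Example (not part of the original app; paths are hypothetical): calc_voxsim can be
# called directly, without the Gradio UI. Because both embeddings are L2-normalized,
# the returned score is a cosine similarity, roughly in [-1, 1]; higher values mean
# the two recordings are more likely to come from the same speaker.
#
#   similarity = calc_voxsim("utterance_a.wav", "utterance_b.wav")
#   print(float(similarity))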
description = """
Voice similarity demo using the WavLM-ECAPA model trained on the VoxSim dataset.
This demo accepts only the .wav format and works best with audio sampled at 16 kHz.
Inference in this Space runs on a basic CPU and is therefore suboptimal. To obtain an accurate score, refer to the "[voxsim_trainer](https://github.com/kaistmm/voxsim_trainer)" repository and run the code via the CLI.
The paper is available [here](https://arxiv.org/abs/2407.18505).
"""
iface = gr.Interface(
    fn=calc_voxsim,
    inputs=[
        gr.Audio(label="Input Audio", type='filepath'),
        gr.Audio(label="Reference Audio", type='filepath')
    ],
    outputs="text",
    title="voice similarity with VoxSim",
    description=description,
).launch()
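
The description above notes that the model works best at 16 kHz. A minimal sketch, run separately from the app, of resampling an arbitrary .wav file to 16 kHz before scoring; it assumes torchaudio is available, and the file paths are hypothetical.

import torchaudio

def to_16k_wav(in_path, out_path):
    # Load the waveform and resample to 16 kHz if the source rate differs.
    wav, sr = torchaudio.load(in_path)
    if sr != 16000:
        wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=16000)
    torchaudio.save(out_path, wav, 16000)
    return out_path

print(calc_voxsim(to_16k_wav("raw_input.wav", "input_16k.wav"),
                  to_16k_wav("raw_reference.wav", "reference_16k.wav")))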