import os
import torch
import torch.nn.functional as F
from ssl_ecapa_model import SSL_ECAPA_TDNN
from score import loadModel
from predict import loadWav
import gradio as gr
# Load the pretrained VoxSim WavLM-ECAPA scorer once at startup and switch to eval mode.
model = loadModel('voxsim_wavlm_ecapa.model')
model.eval()
def calc_voxsim(inp_path, ref_path):
    inp_wav = loadWav(inp_path, max_frames=0)
    ref_wav = loadWav(ref_path, max_frames=0)
    with torch.no_grad():
        # Extract speaker embeddings and L2-normalize them; the matrix product of
        # unit-norm embeddings is their cosine similarity.
        input_emb = F.normalize(model.forward(inp_wav), p=2, dim=1)
        ref_emb = F.normalize(model.forward(ref_wav), p=2, dim=1)
        score = torch.matmul(input_emb, ref_emb.T)
    return score.detach().cpu().numpy()
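
# Example (not part of the original app; paths are hypothetical): calc_voxsim can be
# called directly, without the Gradio UI. Because both embeddings are L2-normalized,
# the returned score is a cosine similarity, roughly in [-1, 1]; higher values mean
# the two recordings are more likely to come from the same speaker.
#
#   similarity = calc_voxsim("utterance_a.wav", "utterance_b.wav")
#   print(float(similarity))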
description = """
Voice similarity demo using the WavLM-ECAPA model trained on the VoxSim dataset.
This demo accepts only the .wav format and works best with audio sampled at 16 kHz.
Inference in this Space runs on a basic CPU and is therefore suboptimal. To obtain an accurate score, refer to the "[voxsim_trainer](https://github.com/kaistmm/voxsim_trainer)" repository and run the code via the CLI.
The paper is available [here](https://arxiv.org/abs/2407.18505).
"""
iface = gr.Interface(
    fn=calc_voxsim,
    inputs=[
        gr.Audio(label="Input Audio", type='filepath'),
        gr.Audio(label="Reference Audio", type='filepath')
    ],
    outputs="text",
    title="voice similarity with VoxSim",
    description=description,
).launch()
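
The description above notes that the model works best at 16 kHz. A minimal sketch, run separately from the app, of resampling an arbitrary .wav file to 16 kHz before scoring; it assumes torchaudio is available, and the file paths are hypothetical.

import torchaudio

def to_16k_wav(in_path, out_path):
    # Load the waveform and resample to 16 kHz if the source rate differs.
    wav, sr = torchaudio.load(in_path)
    if sr != 16000:
        wav = torchaudio.functional.resample(wav, orig_freq=sr, new_freq=16000)
    torchaudio.save(out_path, wav, 16000)
    return out_path

print(calc_voxsim(to_16k_wav("raw_input.wav", "input_16k.wav"),
                  to_16k_wav("raw_reference.wav", "reference_16k.wav")))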