Spaces:
Running
Running
File size: 1,373 Bytes
08cc398 2216a22 08cc398 2216a22 08cc398 2216a22 f96e2ca 08cc398 2216a22 08cc398 2216a22 08cc398 2216a22 08cc398 2216a22 f96e2ca 2216a22 08cc398 2216a22 b5cf8a0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import os
import torch
import torch.nn.functional as F
from ssl_ecapa_model import SSL_ECAPA_TDNN
from score import loadModel
from predict import loadWav
import gradio as gr
# Load the pretrained WavLM-ECAPA similarity model and switch it to
# inference mode; nn.Module.eval() returns the module itself, so the
# load and mode switch chain into a single expression.
model = loadModel('voxsim_wavlm_ecapa.model').eval()
def calc_voxsim(inp_path, ref_path):
    """Compute the cosine similarity between two speech recordings.

    Args:
        inp_path: Filesystem path to the input .wav file.
        ref_path: Filesystem path to the reference .wav file.

    Returns:
        A Python float in [-1, 1]: the cosine similarity between the two
        speaker embeddings (higher means more similar voices).
    """
    # max_frames=0 presumably means "use the full utterance" — TODO confirm
    # against the loadWav implementation in predict.py.
    inp_wav = loadWav(inp_path, max_frames=0)
    ref_wav = loadWav(ref_path, max_frames=0)
    with torch.no_grad():
        # L2-normalize each embedding so the dot product below is exactly
        # the cosine similarity.
        input_emb = F.normalize(model.forward(inp_wav), p=2, dim=1)
        ref_emb = F.normalize(model.forward(ref_wav), p=2, dim=1)
        score = torch.matmul(input_emb, ref_emb.T)
    # score is a 1x1 matrix; .item() extracts the scalar so the Gradio
    # "text" output shows e.g. "0.87" instead of "[[0.87]]".
    return score.item()
# Markdown blurb rendered below the demo title in the Gradio UI.
# NOTE: this is user-facing runtime text, not a comment — keep wording intact.
description = """
Voice similarity demo using wavlm-ecapa model, which is trained on Voxsim dataset.
This demo only accepts .wav format. Best at 16 kHz sampling rate.
The inference process of this Spaces demo is suboptimal due to the limitations of a basic CPU. To obtain an accurate score, refer to the "[voxsim_trainer](https://github.com/kaistmm/voxsim_trainer)" repository and run the code via the CLI.
Paper is available [here](https://arxiv.org/abs/2407.18505)
"""
# Build the two-audio-input demo and start the Gradio server.
# Both inputs use type='filepath' so calc_voxsim receives .wav paths on disk.
# Fix: removed a stray trailing "|" after .launch() that made the file a
# syntax error (scrape/copy artifact).
iface = gr.Interface(
    fn=calc_voxsim,
    inputs=(
        gr.Audio(label="Input Audio", type='filepath'),
        gr.Audio(label="Reference Audio", type='filepath')
    ),
    outputs="text",
    title="voice similarity with VoxSim",
    description=description,
).launch()