import torch
import torch.nn.functional as F
from ssl_ecapa_model import SSL_ECAPA_TDNN
from score import loadModel
from predict import loadWav
import gradio as gr

# Load the pretrained WavLM-ECAPA similarity model and switch it to inference mode.
model = loadModel('voxsim_wavlm_ecapa.model')
model.eval()

def calc_voxsim(inp_path, ref_path):
    # Load both waveforms (max_frames=0 loads the full utterance).
    inp_wav = loadWav(inp_path, max_frames=0)
    ref_wav = loadWav(ref_path, max_frames=0)

    with torch.no_grad():
        # Extract speaker embeddings and L2-normalize them.
        input_emb = F.normalize(model.forward(inp_wav), p=2, dim=1)
        ref_emb = F.normalize(model.forward(ref_wav), p=2, dim=1)

        # Cosine similarity between the normalized embeddings.
        score = torch.matmul(input_emb, ref_emb.T)
        return score.detach().cpu().numpy()

description = """
Voice similarity demo using a WavLM-ECAPA model trained on the VoxSim dataset.
This demo only accepts .wav files; a 16 kHz sampling rate works best.
Inference in this Spaces demo runs on a basic CPU and is therefore slow and suboptimal. To obtain an accurate score, refer to the "[voxsim_trainer](https://github.com/kaistmm/voxsim_trainer)" repository and run the code via the CLI.

The paper is available [here](https://arxiv.org/abs/2407.18505).
"""

iface = gr.Interface(
    fn=calc_voxsim,
    inputs=[
        gr.Audio(label="Input Audio", type='filepath'),
        gr.Audio(label="Reference Audio", type='filepath')
    ],
    outputs="text",
    title="Voice similarity with VoxSim",
    description=description,
).launch()
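
# Example programmatic use (a rough sketch; the .wav paths below are hypothetical):
#   score = calc_voxsim("input.wav", "reference.wav")
#   print(score)  # higher values indicate more similar voices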