import torch
import torch.nn.functional as F
from ssl_ecapa_model import SSL_ECAPA_TDNN
from score import loadModel
from predict import loadWav
import gradio as gr

# Load the pretrained WavLM-ECAPA similarity model and switch it to inference mode.
model = loadModel('voxsim_wavlm_ecapa.model')
model.eval()

def calc_voxsim(inp_path, ref_path):
    # Load both waveforms (max_frames=0 loads the full utterance).
    inp_wav = loadWav(inp_path, max_frames=0)
    ref_wav = loadWav(ref_path, max_frames=0)

    with torch.no_grad():
        # Extract speaker embeddings and L2-normalize them.
        input_emb = F.normalize(model.forward(inp_wav), p=2, dim=1)
        ref_emb = F.normalize(model.forward(ref_wav), p=2, dim=1)

        # Cosine similarity between the normalized embeddings.
        score = torch.matmul(input_emb, ref_emb.T)
        return score.detach().cpu().numpy()

description = """
Voice similarity demo using a WavLM-ECAPA model trained on the VoxSim dataset.
This demo only accepts .wav files; a 16 kHz sampling rate works best.
Inference in this Spaces demo runs on a basic CPU and is therefore slow and suboptimal. To obtain an accurate score, refer to the "[voxsim_trainer](https://github.com/kaistmm/voxsim_trainer)" repository and run the code via the CLI.

The paper is available [here](https://arxiv.org/abs/2407.18505).
"""

iface = gr.Interface(
    fn=calc_voxsim,
    inputs=[
        gr.Audio(label="Input Audio", type='filepath'),
        gr.Audio(label="Reference Audio", type='filepath')
    ],
    outputs="text",
    title="Voice similarity with VoxSim",
    description=description,
).launch()
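
# Example programmatic use (a rough sketch; the .wav paths below are hypothetical):
#   score = calc_voxsim("input.wav", "reference.wav")
#   print(score)  # higher values indicate more similar voices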