# versa / universal_metrics.yaml
# ftshijt
# fix docker setup for sdk
# 6c509e2
# Universal Metrics Configuration for Versa
# This file contains the configuration for various universal metrics used in speech quality assessment.
# ViSQOL metric
# -- visqol: Virtual Speech Quality Objective Listener score
- name: visqol
  model: default
# Language identification with an ESPnet-OWSM model
# Other model_tag values can be taken from the ESPnet Hugging Face hub: https://huggingface.co/espnet
# The default model is `espnet/owsm_v3.1_ebf`.
# -- lid: the n-best language tags
- name: lid
  model_tag: default
  nbest: 1
# NOMAD metric (reference-based)
# -- nomad: score from the NOMAD reference-based model
- name: nomad
  model_cache: versa_cache/nomad_pt-models
# SRMR metrics
# -- srmr: speech-to-reverberation modulation energy ratio
- name: srmr
  n_cochlear_filters: 23
  low_freq: 125
  min_cf: 4
  max_cf: 128
  fast: true
  norm: false
# Emotion similarity computed with emo2vec
# -- emo2vec_similarity: the emotion similarity based on emo2vec
- name: emo2vec_similarity
# NORESQA metrics
# -- noresqa: non-matching reference based speech quality assessment
- name: noresqa
  # metric_type selects the output: 0 = NORESQA-score, 1 = NORESQA-MOS
  metric_type: 1
# pysepm metrics (one entry enables every score listed below)
# -- pysepm_fwsegsnr: frequency-weighted segmental SNR
# -- pysepm_llr: log-likelihood ratio
# -- pysepm_wss: weighted spectral slope
# -- pysepm_cd: cepstral distance objective speech quality measure
# -- pysepm_Csig, pysepm_Cbak, pysepm_Covl: composite objective speech quality
# -- pysepm_csii_high, pysepm_csii_mid, pysepm_csii_low: coherence and speech intelligibility index
# -- pysepm_ncm: normalized-covariance measure
- name: pysepm
# NISQA scores for speech quality assessment
# -- nisqa_mos_pred: NISQA MOS prediction
# -- nisqa_noi_pred: NISQA noise prediction
# -- nisqa_dis_pred: NISQA distortion prediction
# -- nisqa_col_pred: NISQA color prediction
# -- nisqa_loud_pred: NISQA loudness prediction
# NOTE(jiatong): the pretrained model can be downloaded with `./tools/setup_nisqa.sh`
- name: nisqa
  nisqa_model_path: ./tools/NISQA/weights/nisqa.tar
# Discrete speech metrics
# -- speech_bert: speech BERT score
# -- speech_bleu: speech BLEU score
# -- speech_token_distance: speech token distance score
- name: discrete_speech
# MCD and F0 metrics
# -- mcd: mel cepstral distortion
# -- f0_corr: F0 correlation
# -- f0_rmse: F0 root mean square error
- name: mcd_f0
  f0min: 40
  f0max: 800
  mcep_shift: 5
  mcep_fftl: 1024
  mcep_dim: 39
  mcep_alpha: 0.466
  seq_mismatch_tolerance: 0.1
  power_threshold: -20
  dtw: false
# An overall model on MOS-Bench from the Sheet toolkit
# -- sheet_ssqa: the MOS prediction from sheet_ssqa
- name: sheet_ssqa
# PESQ metric
# -- pesq: perceptual evaluation of speech quality
- name: pesq
# STOI metric
# -- stoi: short-time objective intelligibility
- name: stoi
# Pseudo-subjective (predicted MOS) metrics
# -- utmos: UT-MOS score
# -- dnsmos: DNS-MOS score
# -- plcmos: PLC-MOS score
# -- singmos: SingMOS score
# -- utmosv2: UTMOSv2 score
# -- aecmos: AEC-MOS score (not enabled in predictor_types below)
- name: pseudo_mos
  predictor_types:
    - utmos
    - dnsmos
    - plcmos
    - singmos
    - utmosv2
  # Per-predictor arguments, keyed by the predictor names listed above.
  predictor_args:
    utmos:
      fs: 16000
    dnsmos:
      fs: 16000
    plcmos:
      fs: 16000
    singmos:
      fs: 16000
    utmosv2:
      fs: 16000
# Word error rate with the OpenAI Whisper model
# -- whisper_wer: word error rate computed with openai-whisper
- name: whisper_wer
  model_tag: default
  beam_size: 1
  text_cleaner: whisper_basic
# ScoreQ metric (reference-based)
# -- scoreq_ref: score from the ScoreQ reference-based model
- name: scoreq_ref
  # Spelled `natural`, the same value the scoreq_nr entry in this file uses;
  # the previous value `natrual` was a typo and not a valid domain name.
  data_domain: natural
  model_cache: versa_cache/scoreq_pt-models
# ScoreQ metric (no reference required)
# -- scoreq_nr: score from the ScoreQ non-reference-based model
- name: scoreq_nr
  data_domain: natural
  model_cache: versa_cache/scoreq_pt-models
# Speech-enhancement-based metrics
# model_tag can be any ESPnet-SE Hugging Face repo
# -- se_si_snr: the SI-SNR from a reference speech enhancement model
- name: se_snr
  model_tag: default
# PAM: Prompting Audio-Language Models for Audio Quality Assessment
# https://github.com/soham97/PAM/tree/main
- name: pam
  repro: true
  cache_dir: versa_cache/pam
  io: soundfile

  # Text encoder settings
  text_model: gpt2
  text_len: 77
  transformer_embed_dim: 768
  freeze_text_encoder_weights: true

  # Audio encoder settings
  audioenc_name: HTSAT
  out_emb: 768
  sampling_rate: 44100
  duration: 7
  fmin: 50
  # Alternative value noted alongside this setting: 14000
  fmax: 8000
  # Alternative value noted alongside this setting: 1028
  n_fft: 1024
  hop_size: 320
  mel_bins: 64
  window_size: 1024

  # Projection space settings
  d_proj: 1024
  temperature: 0.003

  # Training and evaluation settings
  num_classes: 527
  batch_size: 1024
  demo: false
# Speaking rate calculation
# -- speaking_rate: correct matching words/character counts
# NOTE(review): this description is worded identically to the asr_match
# entry's; confirm what speaking_rate actually reports.
- name: speaking_rate
  model_tag: default
  beam_size: 1
  text_cleaner: whisper_basic
# Audiobox Aesthetics: unified automatic quality assessment for speech, music, and sound
- name: audiobox_aesthetics
  batch_size: 1
  cache_dir: versa_cache/audiobox
# ASR-match calculation
# -- asr_match_error_rate: correct matching words/character counts
- name: asr_match
  model_tag: default
  beam_size: 1
  text_cleaner: whisper_basic
# Speaker metrics
# -- spk_similarity: cosine similarity between speaker representations
- name: speaker
  model_tag: default
# ASVspoof metrics
# -- asvspoof_score: evaluate how the generated speech is likely to be classified by a deepfake classifier
- name: asvspoof_score
# Signal-level metrics
# -- sir: signal-to-interference ratio
# -- sar: signal-to-artifact ratio
# -- sdr: signal-to-distortion ratio
# -- ci-sdr: convolutive transfer function invariant signal-to-distortion ratio
# -- si-snri: scale-invariant signal-to-noise ratio improvement
- name: signal_metric