# Universal Metrics Configuration for VERSA
# This file contains the configuration for various universal metrics used in speech quality assessment.
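# Each top-level list entry below configures one metric: the `name` key selects the
# metric module and the remaining keys are passed to it as metric-specific options.
# (General reading of the list structure; commenting out an entry is expected to
# disable the corresponding metric.)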
# visqol metric
# -- visqol: Virtual Speech Quality Objective Listener (ViSQOL)
- name: visqol
model: default
# Language identification with the ESPnet-OWSM model
# More model tags can be found on the ESPnet Hugging Face page https://huggingface.co/espnet .
# The default model is `espnet/owsm_v3.1_ebf`.
# -- lid: the n-best predicted language tags
- name: lid
model_tag: default
nbest: 1
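# Illustrative variant (commented out): raise `nbest` to return the top-3
# predicted language tags instead of only the best one.
# - name: lid
#   model_tag: default
#   nbest: 3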
# nomad (reference-based) metric
# -- nomad: nomad reference-based model
- name: nomad
model_cache: versa_cache/nomad_pt-models
# srmr related metrics
# -- srmr: speech-to-reverberation modulation energy ratio
- name: srmr
n_cochlear_filters: 23
low_freq: 125
min_cf: 4
max_cf: 128
fast: True
norm: False
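# Note: SRMR is, roughly, the ratio of modulation energy in low modulation-frequency
# bands (speech-dominated) to that in high bands (reverberation-dominated); higher
# values indicate cleaner, less reverberant speech.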
# Emotion similarity calculated with emo2vec
# -- emo2vec_similarity: emotion similarity computed from emo2vec embeddings
- name: emo2vec_similarity
# noresqa related metrics
# -- noresqa: non-matching reference based speech quality assessment
- name: noresqa
metric_type: 1 #0: NORESQA-score, 1: NORESQA-MOS
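# Note: NORESQA scores quality against non-matching references, i.e. clean speech
# that does not have to be the paired ground-truth utterance.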
# pysepm related metrics
# -- pysepm_fwsegsnr: frequency-weighted segmental SNR
# -- pysepm_llr: Log likelihood ratio
# -- pysepm_wss: weighted spectral slope
# -- pysepm_cd: cepstral distance objective speech quality measure
# -- pysepm_Csig, pysepm_Cbak, pysepm_Covl: composite objective speech quality
# -- pysepm_csii_high, pysepm_csii_mid, pysepm_csii_low: coherence and speech intelligibility index
# -- pysepm_ncm: normalized-covariance measure
- name: pysepm
# nisqa score for speech quality assessment
# -- nisqa_mos_pred: NISQA MOS prediction
# -- nisqa_noi_pred: NISQA noisiness prediction
# -- nisqa_dis_pred: NISQA discontinuity prediction
# -- nisqa_col_pred: NISQA coloration prediction
# -- nisqa_loud_pred: NISQA loudness prediction
# NOTE(jiatong): the pretrained model can be downloaded with `./tools/setup_nisqa.sh`
- name: nisqa
nisqa_model_path: ./tools/NISQA/weights/nisqa.tar
# discrete speech metrics
# -- speech_bert: speech bert score
# -- speech_bleu: speech bleu score
# -- speech_token_distance: speech token distance score
- name: discrete_speech
# mcd f0 related metrics
# -- mcd: mel cepstral distortion
# -- f0_corr: f0 correlation
# -- f0_rmse: f0 root mean square error
- name: mcd_f0
f0min: 40
f0max: 800
mcep_shift: 5
mcep_fftl: 1024
mcep_dim: 39
mcep_alpha: 0.466
seq_mismatch_tolerance: 0.1
power_threshold: -20
dtw: false
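# Reference formulas (standard definitions, not toolkit-specific):
#   mcd     = (10 / ln 10) * sqrt(2 * sum_{d=1}^{D} (c_d - c'_d)^2), averaged over aligned frames (dB)
#   f0_rmse = sqrt(mean((f0_gen - f0_ref)^2)) over frames voiced in both signals
#   f0_corr = Pearson correlation between f0_gen and f0_ref over the same frames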
# An overall MOS prediction model trained on MOS-bench, from the Sheet toolkit
# -- sheet_ssqa: the MOS prediction from Sheet SSQA
- name: sheet_ssqa
# pesq related metrics
# -- pesq: perceptual evaluation of speech quality
- name: pesq
# stoi related metrics
# -- stoi: short-time objective intelligibility
- name: stoi
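# Note: higher is better for both metrics above; STOI is bounded in [0, 1], while
# PESQ yields a MOS-like score roughly in the 1.0-4.5 range.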
# pseudo subjective metrics
# -- utmos: UT-MOS score
# -- dnsmos: DNS-MOS score
# -- plcmos: PLC-MOS score
# -- aecmos: AEC-MOS score
# -- singmos: SingMOS score
# -- utmosv2: UTMOSv2 score
- name: pseudo_mos
predictor_types: ["utmos", "dnsmos", "plcmos", "singmos", "utmosv2"]
predictor_args:
utmos:
fs: 16000
dnsmos:
fs: 16000
plcmos:
fs: 16000
singmos:
fs: 16000
utmosv2:
fs: 16000
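# Illustrative variant (commented out): keep only UTMOS by listing a single predictor.
# - name: pseudo_mos
#   predictor_types: ["utmos"]
#   predictor_args:
#     utmos:
#       fs: 16000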
# Word error rate with OpenAI-Whisper model
# -- whisper_wer: word error rate of openai-whisper
- name: whisper_wer
model_tag: default
beam_size: 1
text_cleaner: whisper_basic
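# Reference formula (standard definition): WER = (S + D + I) / N, with S, D, I the
# substitutions, deletions, and insertions against a reference transcript of N words.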
# scoreq (reference-based) metric
# -- scoreq_ref: scoreq reference-based model
- name: scoreq_ref
data_domain: natural
model_cache: versa_cache/scoreq_pt-models
# scoreq (non-reference-based) metric
# -- scoreq_nr: scoreq non-reference-based model
- name: scoreq_nr
data_domain: natural
model_cache: versa_cache/scoreq_pt-models
# Speech Enhancement-based Metrics
# The model tag can be any ESPnet-SE Hugging Face repo
# -- se_si_snr: the SI-SNR from a reference speech enhancement model
- name: se_snr
model_tag: default
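# Reference formula (standard definition):
#   SI-SNR = 10 * log10(||s_t||^2 / ||e||^2), where s_t = (<x, s> / ||s||^2) * s is the
#   projection of the estimate x onto the reference s and e = x - s_t is the residual.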
# PAM: Prompting Audio-Language Models for Audio Quality Assessment
# https://github.com/soham97/PAM/tree/main
- name: pam
repro: true
cache_dir: versa_cache/pam
io: soundfile
# TEXT ENCODER CONFIG
text_model: 'gpt2'
text_len: 77
transformer_embed_dim: 768
freeze_text_encoder_weights: True
# AUDIO ENCODER CONFIG
audioenc_name: 'HTSAT'
out_emb: 768
sampling_rate: 44100
duration: 7
fmin: 50
fmax: 8000 #14000
n_fft: 1024 # 1028
hop_size: 320
mel_bins: 64
window_size: 1024
# PROJECTION SPACE CONFIG
d_proj: 1024
temperature: 0.003
# TRAINING AND EVALUATION CONFIG
num_classes: 527
batch_size: 1024
demo: False
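# Note: the encoder and projection settings above describe the underlying CLAP-style
# audio-language model (GPT-2 text encoder, HTSAT audio encoder) and should normally
# match the pretrained PAM checkpoint rather than be tuned per evaluation.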
# Speaking rate calculation
# -- speaking_rate: speaking rate computed from recognized word/character counts
- name: speaking_rate
model_tag: default
beam_size: 1
text_cleaner: whisper_basic
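# Reference definition (general): speaking rate = recognized word (or character)
# count divided by the audio duration in seconds.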
# Audiobox Aesthetics (Unified automatic quality assessment for speech, music, and sound.)
- name: audiobox_aesthetics
batch_size: 1
cache_dir: versa_cache/audiobox
# ASR-match calculation
# -- asr_match_error_rate: error rate between ASR transcripts of the generated and reference speech
- name: asr_match
model_tag: default
beam_size: 1
text_cleaner: whisper_basic
# speaker related metrics
# -- spk_similarity: speaker cosine similarity
- name: speaker
model_tag: default
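# Reference formula: spk_similarity = <e_gen, e_ref> / (||e_gen|| * ||e_ref||), the
# cosine similarity between speaker embeddings of the generated and reference speech.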
# asvspoof related metrics
# -- asvspoof_score: how likely the generated speech is to be classified as spoofed by a deepfake (anti-spoofing) classifier
- name: asvspoof_score
# signal related metrics
# -- sir: signal to interference ratio
# -- sar: signal to artifact ratio
# -- sdr: signal to distortion ratio
# -- ci-sdr: convolutive transfer function invariant signal to distortion ratio
# -- si-snri: scale-invariant signal to noise ratio improvement
- name: signal_metric
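# Note: the sdr/sir/sar values follow the BSS-eval style decomposition of the estimate
# into target, interference, and artifact components; for all metrics in this group,
# higher is better.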