File size: 5,614 Bytes
37d87af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c509e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# Universal Metrics Configuration for Versa
# This file contains the configuration for various universal metrics used in speech quality assessment.

# visqol metric
# -- visqol: Virtual Speech Quality Objective Listener (ViSQOL)
- name: visqol
  model: default

# Language identification with ESPnet-OWSM model
# Other model_tag values can be taken from the ESPnet Hugging Face hub: https://huggingface.co/espnet
# The default model is `espnet/owsm_v3.1_ebf`.
# -- lid: the n-best predicted language tag(s)
- name: lid
  model_tag: default
  nbest: 1

# nomad (reference-based) metric
# -- nomad: nomad reference-based model
- name: nomad
  # NOTE(review): presumably the local directory where pretrained weights are cached - confirm against the loader
  model_cache: versa_cache/nomad_pt-models

# srmr related metrics
# -- srmr: speech-to-reverberation modulation energy ratio
- name: srmr
  n_cochlear_filters: 23
  low_freq: 125
  min_cf: 4
  max_cf: 128
  # Booleans are written in canonical lowercase so that YAML 1.1 and YAML 1.2
  # loaders both resolve them to real booleans (matches `dtw: false` elsewhere in this file).
  fast: true
  norm: false

# Emotion similarity calculated based on emo2vec
# -- emo2vec_similarity: the emotion similarity with emo2vec
# No options are set for this entry.
- name: emo2vec_similarity

# noresqa related metrics
# -- noresqa: non-matching reference based speech quality assessment
- name: noresqa
  metric_type: 1  # 0: NORESQA-score, 1: NORESQA-MOS

# pysepm related metrics
# -- pysepm_fwsegsnr: frequency-weighted segmental SNR
# -- pysepm_llr: log-likelihood ratio
# -- pysepm_wss: weighted spectral slope
# -- pysepm_cd: cepstral distance objective speech quality measure
# -- pysepm_Csig, pysepm_Cbak, pysepm_Covl: composite objective speech quality
# -- pysepm_csii_high, pysepm_csii_mid, pysepm_csii_low: coherence and speech intelligibility index
# -- pysepm_ncm: normalized-covariance measure
# No options are set for this entry.
- name: pysepm

# nisqa score for speech quality assessment
# -- nisqa_mos_pred: NISQA MOS prediction
# -- nisqa_noi_pred: NISQA noisiness prediction
# -- nisqa_dis_pred: NISQA discontinuity prediction
# -- nisqa_col_pred: NISQA coloration prediction
# -- nisqa_loud_pred: NISQA loudness prediction
# NOTE(jiatong): the pretrained model can be downloaded with `./tools/setup_nisqa.sh`
- name: nisqa
  nisqa_model_path: ./tools/NISQA/weights/nisqa.tar

# discrete speech metrics
# -- speech_bert: speech BERT score
# -- speech_bleu: speech BLEU score
# -- speech_token_distance: speech token distance score
# No options are set for this entry.
- name: discrete_speech

# mcd f0 related metrics
# -- mcd: mel cepstral distortion
# -- f0_corr: f0 correlation
# -- f0_rmse: f0 root mean square error
- name: mcd_f0
  # NOTE(review): f0min/f0max look like the f0 search range bounds (presumably in Hz) - confirm
  f0min: 40
  f0max: 800
  # mcep_* keys configure the mel-cepstrum extraction used for mcd
  mcep_shift: 5
  mcep_fftl: 1024
  mcep_dim: 39
  mcep_alpha: 0.466
  seq_mismatch_tolerance: 0.1
  power_threshold: -20
  dtw: false
# An overall model on MOS-bench from the Sheet toolkit
# -- sheet_ssqa: the MOS prediction from sheet_ssqa
- name: sheet_ssqa

# pesq related metrics
# -- pesq: perceptual evaluation of speech quality
# No options are set for this entry.
- name: pesq

# stoi related metrics
# -- stoi: short-time objective intelligibility
# No options are set for this entry.
- name: stoi

# pseudo subjective metrics
# -- utmos: UT-MOS score
# -- dnsmos: DNS-MOS score
# -- plcmos: PLC-MOS score
# -- singmos: SingMOS score
# -- utmosv2: UT-MOS v2 score
# -- aecmos: AEC-MOS score (not enabled in predictor_types below)
- name: pseudo_mos
  predictor_types: ["utmos", "dnsmos", "plcmos", "singmos", "utmosv2"]
  # One sub-mapping per predictor listed in predictor_types; each sets fs to 16000.
  predictor_args:
    utmos:
      fs: 16000
    dnsmos:
      fs: 16000
    plcmos:
      fs: 16000
    singmos:
      fs: 16000
    utmosv2:
      fs: 16000

# Word error rate with the OpenAI Whisper model
# -- whisper_wer: word error rate of openai-whisper
- name: whisper_wer
  model_tag: default
  beam_size: 1
  text_cleaner: whisper_basic

# scoreq (reference-based) metric
# -- scoreq_ref: scoreq reference-based model
- name: scoreq_ref
  # Same domain value as the scoreq_nr entry; the previous spelling "natrual"
  # was a typo and would not match the intended domain.
  data_domain: natural
  model_cache: versa_cache/scoreq_pt-models

# scoreq (non-reference-based) metric
# -- scoreq_nr: scoreq non-reference-based model
- name: scoreq_nr
  data_domain: natural
  # Same cache path as the scoreq_ref entry.
  model_cache: versa_cache/scoreq_pt-models

# Speech Enhancement-based Metrics
# model_tag can be any ESPnet-SE Hugging Face repo
# -- se_si_snr: the SI-SNR from a reference speech enhancement model
- name: se_snr
  model_tag: default

# PAM: Prompting Audio-Language Models for Audio Quality Assessment
# https://github.com/soham97/PAM/tree/main
# Booleans below are written in canonical lowercase (as `repro` already was) so
# that YAML 1.1 and YAML 1.2 loaders both resolve them to real booleans.

- name: pam
  repro: true
  cache_dir: versa_cache/pam
  io: soundfile
  # TEXT ENCODER CONFIG
  text_model: 'gpt2'
  text_len: 77
  transformer_embed_dim: 768
  freeze_text_encoder_weights: true
  # AUDIO ENCODER CONFIG
  audioenc_name: 'HTSAT'
  out_emb: 768
  sampling_rate: 44100
  duration: 7
  fmin: 50
  fmax: 8000  # alternative value noted by the original author: 14000
  n_fft: 1024  # alternative value noted by the original author: 1028
  hop_size: 320
  mel_bins: 64
  window_size: 1024
  # PROJECTION SPACE CONFIG
  d_proj: 1024
  temperature: 0.003
  # TRAINING AND EVALUATION CONFIG
  num_classes: 527
  batch_size: 1024
  demo: false

# Speaking rate calculation
# -- speaking_rate: correct matching words/character counts
# NOTE(review): the description above is identical to the asr_match entry's - confirm it is accurate for speaking_rate
- name: speaking_rate
  model_tag: default
  beam_size: 1
  text_cleaner: whisper_basic

# Audiobox Aesthetics: unified automatic quality assessment for speech, music, and sound
- name: audiobox_aesthetics
  batch_size: 1
  cache_dir: versa_cache/audiobox

# ASR-match calculation
# -- asr_match_error_rate: correct matching words/character counts
- name: asr_match
  model_tag: default
  beam_size: 1
  text_cleaner: whisper_basic

# speaker related metrics
# -- spk_similarity: speaker cosine similarity
- name: speaker
  # `default` leaves the speaker model choice to the consumer of this config
  model_tag: default

# asvspoof related metrics
# -- asvspoof_score: evaluates how likely the generated speech is to be classified as spoofed by a deepfake classifier
- name: asvspoof_score

# signal related metrics
# -- sir: signal to interference ratio
# -- sar: signal to artifact ratio
# -- sdr: signal to distortion ratio
# -- ci-sdr: convolutive transfer function invariant signal to distortion ratio
# -- si-snri: scale-invariant signal to noise ratio improvement
# No options are set for this entry.
- name: signal_metric