# Source: Hugging Face Space ftshijt/versa (Space status: Sleeping)
# File size: 5,614 Bytes

# Universal Metrics Configuration for Versa
# This file contains the configuration for various universal metrics used in speech quality assessment.

# visqol metric
# -- visqol: Virtual Speech Quality Objective Listener (ViSQOL) score
- name: visqol
  model: default

# Language identification (language tag prediction) with ESPnet-OWSM model
# Other model_tag values can be taken from the ESPnet Hugging Face hub: https://huggingface.co/espnet
# The default model is `espnet/owsm_v3.1_ebf`.
# -- lid: the n-best language tags (nbest sets how many are kept)
- name: lid
  model_tag: default
  nbest: 1

# nomad (reference-based) metric
# -- nomad: score from the NOMAD reference-based model
# NOTE(review): model_cache is presumably the local directory holding the
# pretrained NOMAD weights — confirm against the metric implementation
- name: nomad
  model_cache: versa_cache/nomad_pt-models

# srmr related metrics
# -- srmr: speech-to-reverberation modulation energy ratio
# Booleans are written in canonical lowercase form (true/false); the parsed
# values are unchanged and now match the style of e.g. `dtw: false`.
- name: srmr
  n_cochlear_filters: 23
  low_freq: 125
  min_cf: 4
  max_cf: 128
  fast: true
  norm: false

# Emotion similarity calculated with emo2vec
# -- emo2vec_similarity: the emotion similarity computed with emo2vec
- name: emo2vec_similarity

# noresqa related metrics
# -- noresqa: non-matching reference based speech quality assessment
# metric_type selects which NORESQA output is reported (see inline legend)
- name: noresqa
  metric_type: 1 # 0: NORESQA-score, 1: NORESQA-MOS

# pysepm related metrics
# -- pysepm_fwsegsnr: frequency-weighted segmental SNR
# -- pysepm_llr: log-likelihood ratio
# -- pysepm_wss: weighted spectral slope
# -- pysepm_cd: cepstral distance objective speech quality measure
# -- pysepm_Csig, pysepm_Cbak, pysepm_Covl: composite objective speech quality
# -- pysepm_csii_high, pysepm_csii_mid, pysepm_csii_low: coherence and speech intelligibility index
# -- pysepm_ncm: normalized-covariance measure
- name: pysepm

# nisqa score for speech quality assessment
#  -- nisqa_mos_pred: NISQA MOS prediction
#  -- nisqa_noi_pred: NISQA noisiness prediction
#  -- nisqa_dis_pred: NISQA discontinuity prediction
#  -- nisqa_col_pred: NISQA coloration prediction
#  -- nisqa_loud_pred: NISQA loudness prediction
# NOTE(jiatong): the pretrained model can be downloaded with `./tools/setup_nisqa.sh`
- name: nisqa
  nisqa_model_path: ./tools/NISQA/weights/nisqa.tar

# discrete speech metrics
# -- speech_bert: speech BERT score
# -- speech_bleu: speech BLEU score
# -- speech_token_distance: speech token distance score
- name: discrete_speech

# mcd f0 related metrics
#  -- mcd: mel cepstral distortion
#  -- f0_corr: f0 (fundamental frequency) correlation
#  -- f0_rmse: f0 root mean square error
# NOTE(review): f0min/f0max are presumably in Hz and mcep_shift in ms — confirm
# against the metric implementation
- name: mcd_f0
  f0min: 40
  f0max: 800
  mcep_shift: 5
  mcep_fftl: 1024
  mcep_dim: 39
  mcep_alpha: 0.466
  seq_mismatch_tolerance: 0.1
  power_threshold: -20
  dtw: false

# An overall model trained on MOS-Bench from the Sheet toolkit
# -- sheet_ssqa: the MOS prediction from sheet_ssqa
- name: sheet_ssqa

# PESQ related metrics
# -- pesq: perceptual evaluation of speech quality
- name: pesq

# STOI related metrics
# -- stoi: short-time objective intelligibility
- name: stoi

# pseudo subjective metrics
# -- utmos: UT-MOS score
# -- dnsmos: DNS-MOS score
# -- plcmos: PLC-MOS score
# -- aecmos: AEC-MOS score (not enabled in predictor_types below)
# -- singmos: score from the singmos predictor
# -- utmosv2: score from the utmosv2 predictor
# Each enabled predictor has its own entry under predictor_args; all of them
# are configured with fs: 16000 here.
- name: pseudo_mos
  predictor_types: ["utmos", "dnsmos", "plcmos", "singmos", "utmosv2"]
  predictor_args:
    utmos:
      fs: 16000
    dnsmos:
      fs: 16000
    plcmos:
      fs: 16000
    singmos:
      fs: 16000
    utmosv2:
      fs: 16000

# Word error rate with the OpenAI Whisper model
# -- whisper_wer: word error rate computed with openai-whisper
- name: whisper_wer
  model_tag: default
  beam_size: 1
  text_cleaner: whisper_basic

# scoreq (reference-based) metric
# -- scoreq_ref: scoreq reference-based model
# data_domain uses the same spelling (`natural`) as the scoreq_nr entry; a
# misspelled domain name would not match any domain the model knows.
- name: scoreq_ref
  data_domain: natural
  model_cache: versa_cache/scoreq_pt-models

# scoreq (non-reference-based) metric
# -- scoreq_nr: scoreq non-reference-based (no-reference) model
- name: scoreq_nr
  data_domain: natural
  model_cache: versa_cache/scoreq_pt-models

# Speech enhancement-based metrics
# model_tag can be any ESPnet-SE Hugging Face repo
# -- se_si_snr: the SI-SNR from a reference speech enhancement model
- name: se_snr
  model_tag: default

# PAM: Prompting Audio-Language Models for Audio Quality Assessment
# https://github.com/soham97/PAM/tree/main
# Booleans are written in canonical lowercase form (true/false) throughout this
# entry, matching `repro: true`; the parsed values are unchanged.
- name: pam
  repro: true
  cache_dir: versa_cache/pam
  io: soundfile
  # TEXT ENCODER CONFIG
  text_model: 'gpt2'
  text_len: 77
  transformer_embed_dim: 768
  freeze_text_encoder_weights: true
  # AUDIO ENCODER CONFIG
  audioenc_name: 'HTSAT'
  out_emb: 768
  sampling_rate: 44100
  duration: 7
  fmin: 50
  fmax: 8000  # 14000
  n_fft: 1024  # 1028
  hop_size: 320
  mel_bins: 64
  window_size: 1024
  # PROJECTION SPACE CONFIG
  d_proj: 1024
  temperature: 0.003
  # TRAINING AND EVALUATION CONFIG
  num_classes: 527
  batch_size: 1024
  demo: false

# Speaking rate calculation
# -- speaking_rate: correct matching words/character counts
# NOTE(review): the description above is identical to the asr_match entry's and
# looks copy-pasted — confirm what speaking_rate actually reports
- name: speaking_rate
  model_tag: default
  beam_size: 1
  text_cleaner: whisper_basic

# Audiobox Aesthetics: unified automatic quality assessment for speech, music, and sound
- name: audiobox_aesthetics
  batch_size: 1
  cache_dir: versa_cache/audiobox

# ASR-match calculation
# -- asr_match_error_rate: correct matching words/character counts
- name: asr_match
  model_tag: default
  beam_size: 1
  text_cleaner: whisper_basic

# speaker related metrics
# -- spk_similarity: cosine similarity between speaker embeddings
- name: speaker
  model_tag: default

# asvspoof related metrics
# -- asvspoof_score: evaluates how the generated speech is likely to be classified by a deepfake classifier
- name: asvspoof_score

# signal related metrics
# -- sir: signal to interference ratio
# -- sar: signal to artifact ratio
# -- sdr: signal to distortion ratio
# -- ci-sdr: convolutive transfer function invariant signal to distortion ratio
# -- si-snri: scale-invariant signal to noise ratio improvement
- name: signal_metric