# Universal Metrics Configuration for Versa
# This file contains the configuration for various universal metrics used in speech quality assessment.
# visqol metric
# -- visqol: virtual speech quality objective listener
- name: visqol
  model: default
# Language identification with ESPnet-OWSM model
# More model_tag options are available on the ESPnet Hugging Face page https://huggingface.co/espnet .
# The default model is `espnet/owsm_v3.1_ebf`.
# -- lid: the n-best language identification tags
- name: lid
  model_tag: default
  nbest: 1
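# Example (commented out, for illustration only): pin the OWSM model named above
# explicitly and request the top-3 language tags instead of the single best one.
# - name: lid
#   model_tag: espnet/owsm_v3.1_ebf
#   nbest: 3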
# nomad (reference-based) metric
# -- nomad: nomad reference-based model
- name: nomad
  model_cache: versa_cache/nomad_pt-models
# srmr related metrics
# -- srmr: speech-to-reverberation modulation energy ratio
- name: srmr
  n_cochlear_filters: 23
  low_freq: 125
  min_cf: 4
  max_cf: 128
  fast: True
  norm: False
# Emotion similarity calculated based on emo2vec
# -- emo2vec_similarity: the emotion similarity with emo2vec
- name: emo2vec_similarity
# noresqa related metrics
# -- noresqa: non-matching reference-based speech quality assessment
- name: noresqa
  metric_type: 1  # 0: NORESQA-score, 1: NORESQA-MOS
# pysepm related metrics
# -- pysepm_fwsegsnr: frequency-weighted segmental SNR
# -- pysepm_llr: log-likelihood ratio
# -- pysepm_wss: weighted spectral slope
# -- pysepm_cd: cepstral distance objective speech quality measure
# -- pysepm_Csig, pysepm_Cbak, pysepm_Covl: composite objective speech quality
# -- pysepm_csii_high, pysepm_csii_mid, pysepm_csii_low: coherence and speech intelligibility index
# -- pysepm_ncm: normalized-covariance measure
- name: pysepm
# nisqa score for speech quality assessment
# -- nisqa_mos_pred: NISQA MOS prediction
# -- nisqa_noi_pred: NISQA noisiness prediction
# -- nisqa_dis_pred: NISQA discontinuity prediction
# -- nisqa_col_pred: NISQA coloration prediction
# -- nisqa_loud_pred: NISQA loudness prediction
# NOTE(jiatong): the pretrained model can be downloaded with `./tools/setup_nisqa.sh`
- name: nisqa
  nisqa_model_path: ./tools/NISQA/weights/nisqa.tar
# discrete speech metrics
# -- speech_bert: speech BERT score
# -- speech_bleu: speech BLEU score
# -- speech_token_distance: speech token distance score
- name: discrete_speech
# mcd f0 related metrics
# -- mcd: mel cepstral distortion
# -- f0_corr: f0 correlation
# -- f0_rmse: f0 root mean square error
- name: mcd_f0
  f0min: 40
  f0max: 800
  mcep_shift: 5
  mcep_fftl: 1024
  mcep_dim: 39
  mcep_alpha: 0.466
  seq_mismatch_tolerance: 0.1
  power_threshold: -20
  dtw: false
# An overall MOS prediction model trained on MOS-bench from the SHEET toolkit
# -- sheet_ssqa: the MOS prediction from the SHEET SSQA model
- name: sheet_ssqa
# pesq related metrics
# -- pesq: perceptual evaluation of speech quality
- name: pesq
# stoi related metrics
# -- stoi: short-time objective intelligibility
- name: stoi
# pseudo subjective metrics
# -- utmos: UT-MOS score
# -- dnsmos: DNS-MOS score
# -- plcmos: PLC-MOS score
# -- aecmos: AEC-MOS score
# -- singmos: SingMOS score
# -- utmosv2: UTMOSv2 score
- name: pseudo_mos
  predictor_types: ["utmos", "dnsmos", "plcmos", "singmos", "utmosv2"]
  predictor_args:
    utmos:
      fs: 16000
    dnsmos:
      fs: 16000
    plcmos:
      fs: 16000
    singmos:
      fs: 16000
    utmosv2:
      fs: 16000
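# Example (commented out, for illustration only): a lighter-weight variant that only
# runs UTMOS and DNS-MOS; trim predictor_types and predictor_args to the predictors you need.
# - name: pseudo_mos
#   predictor_types: ["utmos", "dnsmos"]
#   predictor_args:
#     utmos:
#       fs: 16000
#     dnsmos:
#       fs: 16000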
# Word error rate with OpenAI-Whisper model
# -- whisper_wer: word error rate of openai-whisper
- name: whisper_wer
  model_tag: default
  beam_size: 1
  text_cleaner: whisper_basic
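# Example (commented out, for illustration only): a larger beam size trades decoding
# speed for accuracy; the other fields keep the values used above.
# - name: whisper_wer
#   model_tag: default
#   beam_size: 5
#   text_cleaner: whisper_basic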
# scoreq (reference-based) metric
# -- scoreq_ref: scoreq reference-based model
- name: scoreq_ref
  data_domain: natural
  model_cache: versa_cache/scoreq_pt-models
# scoreq (non-reference-based) metric
# -- scoreq_nr: scoreq non-reference-based model
- name: scoreq_nr
  data_domain: natural
  model_cache: versa_cache/scoreq_pt-models
# Speech Enhancement-based Metrics
# model_tag can be any ESPnet-SE Hugging Face repo
# -- se_si_snr: the SI-SNR from a reference speech enhancement model
- name: se_snr
  model_tag: default
# PAM: Prompting Audio-Language Models for Audio Quality Assessment
# https://github.com/soham97/PAM/tree/main
- name: pam
  repro: true
  cache_dir: versa_cache/pam
  io: soundfile
  # TEXT ENCODER CONFIG
  text_model: 'gpt2'
  text_len: 77
  transformer_embed_dim: 768
  freeze_text_encoder_weights: True
  # AUDIO ENCODER CONFIG
  audioenc_name: 'HTSAT'
  out_emb: 768
  sampling_rate: 44100
  duration: 7
  fmin: 50
  fmax: 8000  # 14000
  n_fft: 1024  # 1028
  hop_size: 320
  mel_bins: 64
  window_size: 1024
  # PROJECTION SPACE CONFIG
  d_proj: 1024
  temperature: 0.003
  # TRAINING AND EVALUATION CONFIG
  num_classes: 527
  batch_size: 1024
  demo: False
# Speaking rate calculation
# -- speaking_rate: speaking rate estimated from recognized word/character counts
- name: speaking_rate
  model_tag: default
  beam_size: 1
  text_cleaner: whisper_basic
# Audiobox Aesthetics (unified automatic quality assessment for speech, music, and sound)
- name: audiobox_aesthetics
  batch_size: 1
  cache_dir: versa_cache/audiobox
# ASR-match calculation
# -- asr_match_error_rate: error rate between recognized words/characters of the generated and reference speech
- name: asr_match
  model_tag: default
  beam_size: 1
  text_cleaner: whisper_basic
# speaker related metrics
# -- spk_similarity: speaker cosine similarity
- name: speaker
  model_tag: default
# asvspoof related metrics
# -- asvspoof_score: how likely the generated speech is to be classified as spoofed by a deepfake classifier
- name: asvspoof_score
# signal related metrics
# -- sir: signal-to-interference ratio
# -- sar: signal-to-artifact ratio
# -- sdr: signal-to-distortion ratio
# -- ci-sdr: convolutive transfer function invariant signal-to-distortion ratio
# -- si-snri: scale-invariant signal-to-noise ratio improvement
- name: signal_metric
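# Illustrative usage (a sketch, not part of this file): a config like this is passed to
# the VERSA scorer. The script path and flag names below are assumptions; check the
# VERSA README for the exact interface of your installed version.
#   python versa/bin/scorer.py \
#       --score_config universal.yaml \
#       --gt <ground_truth_wav_dir_or_scp> \
#       --pred <generated_wav_dir_or_scp> \
#       --output_file <result_file>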