ftshijt committed on
Commit
6c509e2
·
1 Parent(s): 37d87af

fix docker setup for sdk

Browse files
Files changed (2) hide show
  1. README.md +4 -4
  2. universal_metrics.yaml +46 -1
README.md CHANGED
@@ -3,10 +3,10 @@ title: VERSA Speech & Audio Evaluation Demo
3
  emoji: 🎙️
4
  colorFrom: blue
5
  colorTo: indigo
6
- sdk: gradio
7
- sdk_version: 5.25.0
8
- app_file: app.py
9
  pinned: false
 
 
10
  license: apache-2.0
11
  ---
12
 
@@ -51,4 +51,4 @@ If you use VERSA in your research, please cite:
51
  primaryClass={cs.SD},
52
  url={https://arxiv.org/abs/2412.17667},
53
  }
54
- ```
 
3
  emoji: 🎙️
4
  colorFrom: blue
5
  colorTo: indigo
6
+ sdk: docker
 
 
7
  pinned: false
8
+ license: mit
9
+ hf_oauth: false
10
  license: apache-2.0
11
  ---
12
 
 
51
  primaryClass={cs.SD},
52
  url={https://arxiv.org/abs/2412.17667},
53
  }
54
+ ```
universal_metrics.yaml CHANGED
@@ -155,4 +155,49 @@
155
  fmin: 50
156
  fmax: 8000 #14000
157
  n_fft: 1024 # 1028
158
- hop_size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  fmin: 50
156
  fmax: 8000 #14000
157
  n_fft: 1024 # 1028
158
+ hop_size: 320
159
+ mel_bins: 64
160
+ window_size: 1024
161
+ # PROJECTION SPACE CONFIG
162
+ d_proj: 1024
163
+ temperature: 0.003
164
+ # TRAINING AND EVALUATION CONFIG
165
+ num_classes: 527
166
+ batch_size: 1024
167
+ demo: False
168
+
169
+ # Speaking rate calculation
170
+ # --speaking_rate: correct matching words/character counts
171
+ - name: speaking_rate
172
+ model_tag: default
173
+ beam_size: 1
174
+ text_cleaner: whisper_basic
175
+
176
+ # Audiobox Aesthetics (Unified automatic quality assessment for speech, music, and sound.)
177
+ - name: audiobox_aesthetics
178
+ batch_size: 1
179
+ cache_dir: versa_cache/audiobox
180
+
181
+ # ASR-match calculation
182
+ # --asr_match_error_rate: correct matching words/character counts
183
+ - name: asr_match
184
+ model_tag: default
185
+ beam_size: 1
186
+ text_cleaner: whisper_basic
187
+
188
+ # speaker related metrics
189
+ # -- spk_similarity: speaker cosine similarity
190
+ - name: speaker
191
+ model_tag: default
192
+
193
+ # asvspoof related metrics
194
+ # -- asvspoof_score: evaluate how the generated speech is likely to be classified by a deepfake classifier
195
+ - name: asvspoof_score
196
+
197
+ # signal related metrics
198
+ # -- sir: signal to interference ratio
199
+ # -- sar: signal to artifact ratio
200
+ # -- sdr: signal to distortion ratio
201
+ # -- ci-sdr: convolutive transfer function invariant signal to distortion ratio
202
+ # -- si-snri: scale-invariant signal to noise ratio improvement
203
+ - name: signal_metric