Spaces:
Sleeping
Sleeping
ftshijt
committed on
Commit
·
6c509e2
1
Parent(s):
37d87af
fix docker setup for sdk
Browse files- README.md +4 -4
- universal_metrics.yaml +46 -1
README.md
CHANGED
@@ -3,10 +3,10 @@ title: VERSA Speech & Audio Evaluation Demo
|
|
3 |
emoji: 🎙️
|
4 |
colorFrom: blue
|
5 |
colorTo: indigo
|
6 |
-
sdk:
|
7 |
-
sdk_version: 5.25.0
|
8 |
-
app_file: app.py
|
9 |
pinned: false
|
|
|
|
|
10 |
license: apache-2.0
|
11 |
---
|
12 |
|
@@ -51,4 +51,4 @@ If you use VERSA in your research, please cite:
|
|
51 |
primaryClass={cs.SD},
|
52 |
url={https://arxiv.org/abs/2412.17667},
|
53 |
}
|
54 |
-
```
|
|
|
3 |
emoji: 🎙️
|
4 |
colorFrom: blue
|
5 |
colorTo: indigo
|
6 |
+
sdk: docker
|
|
|
|
|
7 |
pinned: false
|
8 |
+
license: mit
|
9 |
+
hf_oauth: false
|
10 |
license: apache-2.0
|
11 |
---
|
12 |
|
|
|
51 |
primaryClass={cs.SD},
|
52 |
url={https://arxiv.org/abs/2412.17667},
|
53 |
}
|
54 |
+
```
|
universal_metrics.yaml
CHANGED
@@ -155,4 +155,49 @@
|
|
155 |
fmin: 50
|
156 |
fmax: 8000 #14000
|
157 |
n_fft: 1024 # 1028
|
158 |
-
hop_size
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
155 |
fmin: 50
|
156 |
fmax: 8000 #14000
|
157 |
n_fft: 1024 # 1028
|
158 |
+
hop_size: 320
|
159 |
+
mel_bins: 64
|
160 |
+
window_size: 1024
|
161 |
+
# PROJECTION SPACE CONFIG
|
162 |
+
d_proj: 1024
|
163 |
+
temperature: 0.003
|
164 |
+
# TRAINING AND EVALUATION CONFIG
|
165 |
+
num_classes: 527
|
166 |
+
batch_size: 1024
|
167 |
+
demo: False
|
168 |
+
|
169 |
+
# Speaking rate calculation
|
170 |
+
# --speaking_rate: correct matching words/character counts
|
171 |
+
- name: speaking_rate
|
172 |
+
model_tag: default
|
173 |
+
beam_size: 1
|
174 |
+
text_cleaner: whisper_basic
|
175 |
+
|
176 |
+
# Audiobox Aesthetics (Unified automatic quality assessment for speech, music, and sound.)
|
177 |
+
- name: audiobox_aesthetics
|
178 |
+
batch_size: 1
|
179 |
+
cache_dir: versa_cache/audiobox
|
180 |
+
|
181 |
+
# ASR-match calculation
|
182 |
+
# --asr_match_error_rate: correct matching words/character counts
|
183 |
+
- name: asr_match
|
184 |
+
model_tag: default
|
185 |
+
beam_size: 1
|
186 |
+
text_cleaner: whisper_basic
|
187 |
+
|
188 |
+
# speaker related metrics
|
189 |
+
# -- spk_similarity: speaker cosine similarity
|
190 |
+
- name: speaker
|
191 |
+
model_tag: default
|
192 |
+
|
193 |
+
# asvspoof related metrics
|
194 |
+
# -- asvspoof_score: evaluate how the generated speech is likely to be classified by a deepfake classifier
|
195 |
+
- name: asvspoof_score
|
196 |
+
|
197 |
+
# signal related metrics
|
198 |
+
# -- sir: signal to interference ratio
|
199 |
+
# -- sar: signal to artifact ratio
|
200 |
+
# -- sdr: signal to distortion ratio
|
201 |
+
# -- ci-sdr: scale-invariant signal to distortion ratio
|
202 |
+
# -- si-snri: scale-invariant signal to noise ratio improvement
|
203 |
+
- name: signal_metric
|