Update README.md
README.md CHANGED
@@ -43,18 +43,16 @@

### Performance

We conducted a comprehensive evaluation of Qwen2.5-Omni, which demonstrates strong performance across all modalities when compared with similarly sized single-modality models and with closed-source models such as Qwen2.5-VL-7B, Qwen2-Audio, and Gemini-1.5-Pro. In tasks that require integrating multiple modalities, such as OmniBench, Qwen2.5-Omni achieves state-of-the-art performance. In single-modality tasks, it also excels in speech recognition (Common Voice), translation (CoVoST2), audio understanding (MMAU), image reasoning (MMMU, MMStar), video understanding (MVBench), and speech generation (Seed-tts-eval and subjective naturalness).

<p align="center">
    <img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/bar.png" width="80%"/>
</p>
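To make the numbers below concrete, here is a minimal sketch of querying the model across modalities; the Quickstart below covers full setup. It assumes a `transformers` build with Qwen2.5-Omni support plus the `qwen-omni-utils` helper package, and follows the official usage example, so treat exact class names and flags as assumptions if your installed version differs.

```python
# Hedged sketch: multimodal inference with Qwen2.5-Omni-7B.
# Assumes transformers with Qwen2.5-Omni support and qwen-omni-utils installed.
import soundfile as sf
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto"
)
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

# A conversation mixing a video (with its audio track) and a text question.
conversation = [
    {"role": "user", "content": [
        {"type": "video", "video": "/path/to/video.mp4"},  # placeholder path
        {"type": "text", "text": "Describe what you see and hear."},
    ]},
]

text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)
inputs = processor(text=text, audio=audios, images=images, videos=videos,
                   return_tensors="pt", padding=True, use_audio_in_video=True)
inputs = inputs.to(model.device).to(model.dtype)

# The model returns text ids plus a waveform carrying the spoken answer.
text_ids, audio = model.generate(**inputs, use_audio_in_video=True)
print(processor.batch_decode(text_ids, skip_special_tokens=True)[0])
sf.write("answer.wav", audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)
```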
<details>
<summary>Multimodality -> Text</summary>

<table class="tg"><thead>
<tr>
<th class="tg-0lax">Datasets</th>
<th class="tg-0lax">Model</th>
@@ -76,7 +74,7 @@
</tr>
<tr>
<td class="tg-0lax">video-SALMONN</td>
<td class="tg-0lax">34.11%|31.70%|<strong>56.60%</strong>|35.64%</td>
</tr>
<tr>
<td class="tg-0lax">UnifiedIO2-xlarge</td>
@@ -84,23 +82,19 @@
</tr>
<tr>
<td class="tg-0lax">UnifiedIO2-xxlarge</td>
<td class="tg-0lax">34.24%|36.98%|24.53%|33.98%</td>
</tr>
<tr>
<td class="tg-0lax">MiniCPM-o</td>
<td class="tg-0lax">-|-|-|40.50%</td>
</tr>
<tr>
<td class="tg-0lax">Baichuan-Omni-1.5</td>
<td class="tg-0lax">-|-|-|42.90%</td>
</tr>
<tr>
<td class="tg-0lax">Qwen2.5-Omni-7B</td>
<td class="tg-0lax"><strong>55.25%</strong>|<strong>60.00%</strong>|52.83%|<strong>56.13%</strong></td>
</tr>
</tbody></table>
</details>
@@ -109,16 +103,8 @@
<details>
<summary>Audio -> Text</summary>

<table class="tg"><thead>
<tr>
<th class="tg-0lax">Datasets</th>
<th class="tg-0lax">Model</th>
@@ -151,7 +137,7 @@
</tr>
<tr>
<td class="tg-0lax">Seed-ASR-Multilingual</td>
<td class="tg-0lax">-|-|<strong>1.6</strong>|<strong>2.8</strong></td>
</tr>
<tr>
<td class="tg-0lax">MiniCPM-o</td>
@@ -167,7 +153,7 @@
</tr>
<tr>
<td class="tg-0lax">Qwen2-Audio</td>
<td class="tg-0lax"><strong>1.3</strong>|<strong>3.4</strong>|<strong>1.6</strong>|3.6</td>
</tr>
<tr>
<td class="tg-0lax">Qwen2.5-Omni-7B</td>
@@ -176,7 +162,7 @@
<tr>
<td class="tg-0lax" rowspan="4">Common Voice 15<br>en | zh | yue | fr</td>
<td class="tg-0lax">Whisper-large-v3</td>
<td class="tg-0lax">9.3|12.8|10.9|10.8</td>
</tr>
<tr>
<td class="tg-0lax">MinMo</td>
@@ -184,11 +170,11 @@
</tr>
<tr>
<td class="tg-0lax">Qwen2-Audio</td>
<td class="tg-0lax">8.6|6.9|<strong>5.9</strong>|9.6</td>
</tr>
<tr>
<td class="tg-0lax">Qwen2.5-Omni-7B</td>
<td class="tg-0lax"><strong>7.6</strong>|<strong>5.2</strong>|7.3|<strong>7.5</strong></td>
</tr>
<tr>
<td class="tg-0lax" rowspan="7">Fleurs<br>zh | en</td>
@@ -197,7 +183,7 @@
</tr>
<tr>
<td class="tg-0lax">Seed-ASR-Multilingual</td>
<td class="tg-0lax">-|<strong>3.4</strong></td>
</tr>
<tr>
<td class="tg-0lax">Megrez-3B-Omni</td>
@@ -217,12 +203,12 @@
</tr>
<tr>
<td class="tg-0lax">Qwen2.5-Omni-7B</td>
<td class="tg-0lax"><strong>3.0</strong>|4.1</td>
</tr>
<tr>
<td class="tg-0lax" rowspan="5">Wenetspeech<br>test-net | test-meeting</td>
<td class="tg-0lax">Seed-ASR-Chinese</td>
<td class="tg-0lax"><strong>4.7|5.7</strong></td>
</tr>
<tr>
<td class="tg-0lax">Megrez-3B-Omni</td>
@@ -247,7 +233,7 @@
</tr>
<tr>
<td class="tg-0lax">Llama-3-70B</td>
<td class="tg-0lax"><strong>5.7</strong></td>
</tr>
<tr>
<td class="tg-0lax">Qwen2.5-Omni-7B</td>
@@ -271,11 +257,11 @@
</tr>
<tr>
<td class="tg-0lax">MiniCPM-o</td>
<td class="tg-0lax">-|-|<strong>48.2</strong>|27.2</td>
</tr>
<tr>
<td class="tg-0lax">MinMo</td>
<td class="tg-0lax">-|<strong>39.9</strong>|46.7|26.0</td>
</tr>
<tr>
<td class="tg-0lax">Qwen-Audio</td>
@@ -287,7 +273,7 @@
</tr>
<tr>
<td class="tg-0lax">Qwen2.5-Omni-7B</td>
<td class="tg-0lax"><strong>30.2</strong>|37.7|41.4|<strong>29.4</strong></td>
</tr>
<tr>
<td class="tg-9j4x" colspan="3">SER</td>
@@ -311,7 +297,7 @@
</tr>
<tr>
<td class="tg-0lax">Qwen2.5-Omni-7B</td>
<td class="tg-0lax"><strong>0.570</strong></td>
</tr>
<tr>
<td class="tg-9j4x" colspan="3">VSC</td>
@@ -331,11 +317,11 @@
</tr>
<tr>
<td class="tg-0lax">Qwen2-Audio</td>
<td class="tg-0lax"><strong>0.939</strong></td>
</tr>
<tr>
<td class="tg-0lax">Qwen2.5-Omni-7B</td>
<td class="tg-0lax"><strong>0.939</strong></td>
</tr>
<tr>
<td class="tg-9j4x" colspan="3">Music</td>
@@ -347,16 +333,16 @@
</tr>
<tr>
<td class="tg-0lax">Qwen2.5-Omni-7B</td>
<td class="tg-0lax"><strong>0.88</strong></td>
</tr>
<tr>
<td class="tg-0lax" rowspan="2">MusicCaps</td>
<td class="tg-0lax">LP-MusicCaps</td>
<td class="tg-0lax">0.291|0.149|0.089|<strong>0.061</strong>|<strong>0.129</strong>|0.130</td>
</tr>
<tr>
<td class="tg-0lax">Qwen2.5-Omni-7B</td>
<td class="tg-0lax"><strong>0.328</strong>|<strong>0.162</strong>|<strong>0.090</strong>|0.055|0.127|<strong>0.225</strong></td>
</tr>
<tr>
<td class="tg-9j4x" colspan="3">Audio Reasoning</td>
@@ -368,11 +354,11 @@
</tr>
<tr>
<td class="tg-0lax">Qwen2-Audio</td>
<td class="tg-0lax">54.95|50.98|42.04|49.20</td>
</tr>
<tr>
<td class="tg-0lax">Qwen2.5-Omni-7B</td>
<td class="tg-0lax"><strong>67.87|69.16|59.76|65.60</strong></td>
</tr>
<tr>
<td class="tg-9j4x" colspan="3">Voice Chatting</td>
@@ -380,7 +366,7 @@
<tr>
<td class="tg-0lax" rowspan="8">VoiceBench<br>AlpacaEval | CommonEval | SD-QA | MMSU</td>
<td class="tg-0lax">Ultravox-v0.4.1-LLaMA-3.1-8B</td>
<td class="tg-0lax"><strong>4.55</strong>|3.90|53.35|47.17</td>
</tr>
<tr>
<td class="tg-0lax">MERaLiON</td>
@@ -396,7 +382,7 @@
</tr>
<tr>
<td class="tg-0lax">MiniCPM-o</td>
<td class="tg-0lax">4.42|<strong>4.15</strong>|50.72|54.78</td>
</tr>
<tr>
<td class="tg-0lax">Baichuan-Omni-1.5</td>
@@ -408,12 +394,12 @@
</tr>
<tr>
<td class="tg-0lax">Qwen2.5-Omni-7B</td>
<td class="tg-0lax">4.49|3.93|<strong>55.71</strong>|<strong>61.32</strong></td>
</tr>
<tr>
<td class="tg-0lax" rowspan="8">VoiceBench<br>OpenBookQA | IFEval | AdvBench | Avg</td>
<td class="tg-0lax">Ultravox-v0.4.1-LLaMA-3.1-8B</td>
<td class="tg-0lax">65.27|<strong>66.88</strong>|98.46|71.45</td>
</tr>
<tr>
<td class="tg-0lax">MERaLiON</td>
@@ -441,7 +427,7 @@
</tr>
<tr>
<td class="tg-0lax">Qwen2.5-Omni-7B</td>
<td class="tg-0lax"><strong>81.10</strong>|52.87|<strong>99.42</strong>|<strong>74.12</strong></td>
</tr>
</tbody></table>
</details>
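The ASR rows above (Common Voice 15, Fleurs, Wenetspeech) report error rates, so lower is better. The exact text normalization behind these numbers isn't specified here; purely to illustrate the metric, a minimal word-error-rate check with the `jiwer` package (the normalization choices are an assumption, not the benchmarks' official pipeline):

```python
# Hedged sketch: word error rate (WER) as used for the ASR rows above.
# Real benchmark pipelines apply dataset-specific text normalizers.
import string

import jiwer

def normalize(s: str) -> str:
    # Simple lowercase + punctuation stripping; an assumption, not the
    # official normalizer for these benchmarks.
    return s.lower().translate(str.maketrans("", "", string.punctuation))

reference = "The quick brown fox jumps over the lazy dog."
hypothesis = "the quick brown fox jumped over a lazy dog"

wer = jiwer.wer(normalize(reference), normalize(hypothesis))
print(f"WER: {wer:.1%}")  # 2 substitutions / 9 reference words ~= 22.2%
```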
@@ -473,16 +459,16 @@

| Dataset                  | Qwen2.5-Omni-7B | Qwen2.5-VL-7B | Grounding DINO | Gemini 1.5 Pro |
|--------------------------|--------------|---------------|----------------|----------------|
| Refcoco<sub>val</sub>    | 90.5 | 90.0 | **90.6** | 73.2 |
| Refcoco<sub>textA</sub>  | **93.5** | 92.5 | 93.2 | 72.9 |
| Refcoco<sub>textB</sub>  | 86.6 | 85.4 | **88.2** | 74.6 |
| Refcoco+<sub>val</sub>   | 85.4 | 84.2 | **88.2** | 62.5 |
| Refcoco+<sub>textA</sub> | **91.0** | 89.1 | 89.0 | 63.9 |
| Refcoco+<sub>textB</sub> | **79.3** | 76.9 | 75.9 | 65.0 |
| Refcocog+<sub>val</sub>  | **87.4** | 87.2 | 86.1 | 75.2 |
| Refcocog+<sub>test</sub> | **87.9** | 87.2 | 87.0 | 76.2 |
| ODinW                    | 42.4 | 37.3 | **55.0** | 36.7 |
| PointGrounding           | 66.5 | **67.3** | - | - |
</details>
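The referring-expression rows (Refcoco/Refcoco+/Refcocog) are conventionally scored as Acc@0.5: a predicted box counts as a hit when its IoU with the annotated box exceeds 0.5. That protocol is the field's standard rather than something this table spells out; a self-contained sketch of the metric:

```python
def iou(a, b):
    """Intersection-over-union of two (x1, y1, x2, y2) boxes."""
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union

def acc_at_iou(preds, gts, thr=0.5):
    """Fraction of predicted boxes whose IoU with the target exceeds thr."""
    return sum(iou(p, g) > thr for p, g in zip(preds, gts)) / len(gts)

# Toy check: one hit, one miss -> 0.5.
preds = [(10, 10, 50, 50), (0, 0, 20, 20)]
gts = [(12, 8, 48, 52), (30, 30, 60, 60)]
print(acc_at_iou(preds, gts))
```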
@@ -491,26 +477,17 @@

| Dataset                     | Qwen2.5-Omni-7B | Other Best | Qwen2.5-VL-7B | GPT-4o-mini |
|-----------------------------|--------------|------------|---------------|-------------|
| Video-MME<sub>w/o sub</sub> | 64.3 | 63.9 | **65.1** | 64.8 |
| Video-MME<sub>w sub</sub>   | **72.4** | 67.9 | 71.6 | - |
| MVBench                     | **70.3** | 67.2 | 69.6 | - |
| EgoSchema<sub>test</sub>    | **68.6** | 63.2 | 65.0 | - |
</details>

<details>
<summary>Zero-shot Speech Generation</summary>

<table class="tg"><thead>
<tr>
<th class="tg-0lax">Datasets</th>
<th class="tg-0lax">Model</th>
@@ -527,7 +504,7 @@
</tr>
<tr>
<td class="tg-0lax">Seed-TTS_RL</td>
<td class="tg-0lax"><strong>1.00</strong> | 1.94 | <strong>6.42</strong></td>
</tr>
<tr>
<td class="tg-0lax">MaskGCT</td>
@@ -539,7 +516,7 @@
</tr>
<tr>
<td class="tg-0lax">F5-TTS</td>
<td class="tg-0lax">1.56 | <strong>1.83</strong> | 8.67</td>
</tr>
<tr>
<td class="tg-0lax">CosyVoice 2</td>
@@ -567,7 +544,7 @@
</tr>
<tr>
<td class="tg-0lax">Seed-TTS_RL</td>
<td class="tg-0lax"><strong>0.801</strong> | <strong>0.766</strong> | <strong>0.782</strong></td>
</tr>
<tr>
<td class="tg-0lax">MaskGCT</td>
@@ -611,10 +588,10 @@
| GPQA | 30.8 | **36.4** | 34.3 | 32.8 | 32.8 |
| MATH | 71.5 | **75.5** | 52.9 | 51.9 | 44.3 |
| GSM8K | 88.7 | **91.6** | 85.7 | 84.5 | 76.7 |
| HumanEval | 78.7 | **84.8** | 79.9 | 72.6 | 68.9 |
| MBPP | 73.2 | **79.2** | 67.2 | 69.6 | 74.9 |
| MultiPL-E | 65.8 | **70.4** | 59.1 | 50.7 | 53.4 |
| LiveCodeBench<sub>2305-2409</sub> | 24.6 | **28.7** | 23.9 | 8.3 | 18.9 |
</details>
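For the coding rows above (HumanEval, MBPP, MultiPL-E, LiveCodeBench), scores are pass rates: a sampled completion only counts if it passes the task's unit tests. The benchmarks ship their own sandboxed harnesses; purely to illustrate the idea, an unsandboxed pass@1 sketch (do not run on untrusted completions):

```python
# Hedged sketch of pass@1 scoring for HumanEval-style tasks.
import subprocess
import sys
import tempfile

def passes(candidate: str, tests: str, timeout: float = 10.0) -> bool:
    """Run one completion plus its unit tests in a fresh interpreter."""
    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
        f.write(candidate + "\n\n" + tests)
        path = f.name
    try:
        return subprocess.run([sys.executable, path], capture_output=True,
                              timeout=timeout).returncode == 0
    except subprocess.TimeoutExpired:
        return False

# Toy task: one of two sampled completions passes -> pass@1 = 50%.
tests = "assert add(2, 3) == 5"
samples = ["def add(a, b):\n    return a + b",
           "def add(a, b):\n    return a - b"]
print(f"pass@1 = {sum(passes(s, tests) for s in samples) / len(samples):.0%}")
```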

## Quickstart