xiongwang committed on
Commit
fbeb40e
·
verified ·
1 Parent(s): bb2b272

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +59 -82
README.md CHANGED
@@ -43,18 +43,16 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
43
 
44
  ### Performance
45
 
 
 
 
 
 
 
46
  <details>
47
  <summary>Multimodality -> Text</summary>
48
 
49
- <style type="text/css">
50
- .tg {border-collapse:collapse;border-spacing:0;}
51
- .tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
52
- overflow:hidden;padding:10px 5px;word-break:normal;}
53
- .tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
54
- font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
55
- .tg .tg-0lax{text-align:left;vertical-align:top}
56
- </style>
57
- <table class=""><thead>
58
  <tr>
59
  <th class="tg-0lax">Datasets</th>
60
  <th class="tg-0lax">Model</th>
@@ -76,7 +74,7 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
76
  </tr>
77
  <tr>
78
  <td class="tg-0lax">video-SALMONN</td>
79
- <td class="tg-0lax">34.11%|31.70%|<span style="font-weight:bold">56.60%</span>|35.64%</td>
80
  </tr>
81
  <tr>
82
  <td class="tg-0lax">UnifiedIO2-xlarge</td>
@@ -84,23 +82,19 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
84
  </tr>
85
  <tr>
86
  <td class="tg-0lax">UnifiedIO2-xxlarge</td>
87
- <td class="tg-0lax">34.24%|36.98%|29.25%|38.00%</td>
88
- </tr>
89
- <tr>
90
- <td class="tg-0lax">MiniCPM-o</td>
91
  <td class="tg-0lax">34.24%|36.98%|24.53%|33.98%</td>
92
  </tr>
93
  <tr>
94
- <td class="tg-0lax">Baichuan-Omni-1.5</td>
95
  <td class="tg-0lax">-|-|-|40.50%</td>
96
  </tr>
97
  <tr>
98
- <td class="tg-0lax">Qwen2-Audio</td>
99
  <td class="tg-0lax">-|-|-|42.90%</td>
100
  </tr>
101
  <tr>
102
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
103
- <td class="tg-0lax"><span style="font-weight:bold">55.25%</span>|<span style="font-weight:bold">60.00%</span>|52.83%|<span style="font-weight:bold">56.13%</span></td>
104
  </tr>
105
  </tbody></table>
106
  </details>
@@ -109,16 +103,8 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
109
  <details>
110
  <summary>Audio -> Text</summary>
111
 
112
- <style type="text/css">
113
- .tg {border-collapse:collapse;border-spacing:0;}
114
- .tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
115
- overflow:hidden;padding:10px 5px;word-break:normal;}
116
- .tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
117
- font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
118
- .tg .tg-9j4x{font-style:italic;font-weight:bold;text-align:center;text-decoration:underline;vertical-align:top}
119
- .tg .tg-0lax{text-align:left;vertical-align:top}
120
- </style>
121
- <table class=""><thead>
122
  <tr>
123
  <th class="tg-0lax">Datasets</th>
124
  <th class="tg-0lax">Model</th>
@@ -151,7 +137,7 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
151
  </tr>
152
  <tr>
153
  <td class="tg-0lax">Seed-ASR-Multilingual</td>
154
- <td class="tg-0lax">-|-|<span style="font-weight:bold">1.6</span>|<span style="font-weight:bold">2.8</span></td>
155
  </tr>
156
  <tr>
157
  <td class="tg-0lax">MiniCPM-o</td>
@@ -167,7 +153,7 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
167
  </tr>
168
  <tr>
169
  <td class="tg-0lax">Qwen2-Audio</td>
170
- <td class="tg-0lax"><span style="font-weight:bold">1.3</span>|<span style="font-weight:bold">3.4</span>|<span style="font-weight:bold">1.6</span>|3.6</td>
171
  </tr>
172
  <tr>
173
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
@@ -176,7 +162,7 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
176
  <tr>
177
  <td class="tg-0lax" rowspan="4">Common Voice 15<br>en | zh | yue | fr</td>
178
  <td class="tg-0lax">Whisper-large-v3</td>
179
- <td class="tg-0lax">9.8|12.8|10.9|10.8</td>
180
  </tr>
181
  <tr>
182
  <td class="tg-0lax">MinMo</td>
@@ -184,11 +170,11 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
184
  </tr>
185
  <tr>
186
  <td class="tg-0lax">Qwen2-Audio</td>
187
- <td class="tg-0lax">8.6|6.9|<span style="font-weight:bold">5.9</span>|9.6</td>
188
  </tr>
189
  <tr>
190
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
191
- <td class="tg-0lax"><span style="font-weight:bold">7.6</span>|<span style="font-weight:bold">5.2</span>|7.3|<span style="font-weight:bold">7.5</span></td>
192
  </tr>
193
  <tr>
194
  <td class="tg-0lax" rowspan="7">Fleurs<br>zh | en</td>
@@ -197,7 +183,7 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
197
  </tr>
198
  <tr>
199
  <td class="tg-0lax">Seed-ASR-Multilingual</td>
200
- <td class="tg-0lax">-|<span style="font-weight:bold">3.4</span></td>
201
  </tr>
202
  <tr>
203
  <td class="tg-0lax">Megrez-3B-Omni</td>
@@ -217,12 +203,12 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
217
  </tr>
218
  <tr>
219
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
220
- <td class="tg-0lax"><span style="font-weight:bold">3.0</span>|4.1</td>
221
  </tr>
222
  <tr>
223
  <td class="tg-0lax" rowspan="5">Wenetspeech<br>test-net | test-meeting</td>
224
  <td class="tg-0lax">Seed-ASR-Chinese</td>
225
- <td class="tg-0lax"><span style="font-weight:bold">4.7|5.7</span></td>
226
  </tr>
227
  <tr>
228
  <td class="tg-0lax">Megrez-3B-Omni</td>
@@ -247,7 +233,7 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
247
  </tr>
248
  <tr>
249
  <td class="tg-0lax">Llama-3-70B</td>
250
- <td class="tg-0lax"><span style="font-weight:bold">5.7</span></td>
251
  </tr>
252
  <tr>
253
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
@@ -271,11 +257,11 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
271
  </tr>
272
  <tr>
273
  <td class="tg-0lax">MiniCPM-o</td>
274
- <td class="tg-0lax">-|-|<span style="font-weight:bold">48.2</span>|27.2</td>
275
  </tr>
276
  <tr>
277
  <td class="tg-0lax">MinMo</td>
278
- <td class="tg-0lax">-|<span style="font-weight:bold">39.9</span>|46.7|26.0</td>
279
  </tr>
280
  <tr>
281
  <td class="tg-0lax">Qwen-Audio</td>
@@ -287,7 +273,7 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
287
  </tr>
288
  <tr>
289
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
290
- <td class="tg-0lax"><span style="font-weight:bold">30.2</span>|37.7|41.4|<span style="font-weight:bold">29.4</span></td>
291
  </tr>
292
  <tr>
293
  <td class="tg-9j4x" colspan="3">SER</td>
@@ -311,7 +297,7 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
311
  </tr>
312
  <tr>
313
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
314
- <td class="tg-0lax"><span style="font-weight:bold">0.570</span></td>
315
  </tr>
316
  <tr>
317
  <td class="tg-9j4x" colspan="3">VSC</td>
@@ -331,11 +317,11 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
331
  </tr>
332
  <tr>
333
  <td class="tg-0lax">Qwen2-Audio</td>
334
- <td class="tg-0lax"><span style="font-weight:bold">0.939</span></td>
335
  </tr>
336
  <tr>
337
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
338
- <td class="tg-0lax"><span style="font-weight:bold">0.939</span></td>
339
  </tr>
340
  <tr>
341
  <td class="tg-9j4x" colspan="3">Music</td>
@@ -347,16 +333,16 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
347
  </tr>
348
  <tr>
349
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
350
- <td class="tg-0lax"><span style="font-weight:bold">0.88</span></td>
351
  </tr>
352
  <tr>
353
  <td class="tg-0lax" rowspan="2">MusicCaps</td>
354
  <td class="tg-0lax">LP-MusicCaps</td>
355
- <td class="tg-0lax">0.291|0.149|0.089|<span style="font-weight:bold">0.061</span>|<span style="font-weight:bold">0.129</span>|0.130</td>
356
  </tr>
357
  <tr>
358
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
359
- <td class="tg-0lax"><span style="font-weight:bold">0.328</span>|<span style="font-weight:bold">0.162</span>|<span style="font-weight:bold">0.090</span>|0.055|0.127|<span style="font-weight:bold">0.225</span></td>
360
  </tr>
361
  <tr>
362
  <td class="tg-9j4x" colspan="3">Audio Reasoning</td>
@@ -368,11 +354,11 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
368
  </tr>
369
  <tr>
370
  <td class="tg-0lax">Qwen2-Audio</td>
371
- <td class="tg-0lax">54.95|50.98|42.04|49.20.5</td>
372
  </tr>
373
  <tr>
374
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
375
- <td class="tg-0lax"><span style="font-weight:bold">67.87|69.16|59.76|65.60</span></td>
376
  </tr>
377
  <tr>
378
  <td class="tg-9j4x" colspan="3">Voice Chatting</td>
@@ -380,7 +366,7 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
380
  <tr>
381
  <td class="tg-0lax" rowspan="8">VoiceBench<br>AlpacaEval | CommonEval | SD-QA | MMSU</td>
382
  <td class="tg-0lax">Ultravox-v0.4.1-LLaMA-3.1-8B</td>
383
- <td class="tg-0lax"><span style="font-weight:bold">4.55</span>|3.90|53.35|47.17</td>
384
  </tr>
385
  <tr>
386
  <td class="tg-0lax">MERaLiON</td>
@@ -396,7 +382,7 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
396
  </tr>
397
  <tr>
398
  <td class="tg-0lax">MiniCPM-o</td>
399
- <td class="tg-0lax">4.42|<span style="font-weight:bold">4.15</span>|50.72|54.78</td>
400
  </tr>
401
  <tr>
402
  <td class="tg-0lax">Baichuan-Omni-1.5</td>
@@ -408,12 +394,12 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
408
  </tr>
409
  <tr>
410
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
411
- <td class="tg-0lax">4.49|3.93|<span style="font-weight:bold">55.71</span>|<span style="font-weight:bold">61.32</span></td>
412
  </tr>
413
  <tr>
414
  <td class="tg-0lax" rowspan="8">VoiceBench<br>OpenBookQA | IFEval | AdvBench | Avg</td>
415
  <td class="tg-0lax">Ultravox-v0.4.1-LLaMA-3.1-8B</td>
416
- <td class="tg-0lax">65.27|<span style="font-weight:bold">66.88</span>|98.46|71.45</td>
417
  </tr>
418
  <tr>
419
  <td class="tg-0lax">MERaLiON</td>
@@ -441,7 +427,7 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
441
  </tr>
442
  <tr>
443
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
444
- <td class="tg-0lax"><span style="font-weight:bold">81.10</span>|52.87|<span style="font-weight:bold">99.42</span>|<span style="font-weight:bold">74.12</span></td>
445
  </tr>
446
  </tbody></table>
447
  </details>
@@ -473,16 +459,16 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
473
 
474
  | Dataset | Qwen2.5-Omni-7B | Qwen2.5-VL-7B | Grounding DINO | Gemini 1.5 Pro |
475
  |--------------------------|--------------|---------------|----------------|----------------|
476
- | Refcoco<sub>val</sub> | **90.6** | 90.0 | **90.6** | 73.2 |
477
- | Refcoco<sub>textA</sub> | **93.4** | 92.5 | 93.2 | 72.9 |
478
- | Refcoco<sub>textB</sub> | 86.8 | 85.4 | **88.2** | 74.6 |
479
- | Refcoco+<sub>val</sub> | 85.3 | 84.2 | **88.2** | 62.5 |
480
  | Refcoco+<sub>textA</sub> | **91.0** | 89.1 | 89.0 | 63.9 |
481
- | Refcoco+<sub>textB</sub> | **79.2** | 76.9 | 75.9 | 65.0 |
482
- | Refcocog+<sub>val</sub> | **87.6** | 87.2 | 86.1 | 75.2 |
483
- | Refcocog+<sub>test</sub> | **88.0** | 87.2 | 87.0 | 76.2 |
484
  | ODinW | 42.4 | 37.3 | **55.0** | 36.7 |
485
- | PointGrounding | 65.3 | **67.3** | - | - |
486
  </details>
487
 
488
 
@@ -491,26 +477,17 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
491
 
492
  | Dataset | Qwen2.5-Omni-7B | Other Best | Qwen2.5-VL-7B | GPT-4o-mini |
493
  |-----------------------------|--------------|------------|---------------|-------------|
494
- | Video-MME<sub>w/o sub</sub> | **65.9** | 63.9 | 65.1 | 64.8 |
495
- | Video-MME<sub>w sub</sub> | **72.9** | 67.9 | 71.6 | - |
496
- | MVBench | 68.6 | 67.2 | **69.6** | - |
497
- | EgoSchema<sub>test</sub> | **69.6** | 63.2 | 65.0 | - |
498
  </details>
499
 
500
-
501
  <details>
502
  <summary>Zero-shot Speech Generation</summary>
503
 
504
- <style type="text/css">
505
- .tg {border-collapse:collapse;border-spacing:0;}
506
- .tg td{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
507
- overflow:hidden;padding:10px 5px;word-break:normal;}
508
- .tg th{border-color:black;border-style:solid;border-width:1px;font-family:Arial, sans-serif;font-size:14px;
509
- font-weight:normal;overflow:hidden;padding:10px 5px;word-break:normal;}
510
- .tg .tg-9j4x{font-style:italic;font-weight:bold;text-align:center;text-decoration:underline;vertical-align:top}
511
- .tg .tg-0lax{text-align:left;vertical-align:top}
512
- </style>
513
- <table class=""><thead>
514
  <tr>
515
  <th class="tg-0lax">Datasets</th>
516
  <th class="tg-0lax">Model</th>
@@ -527,7 +504,7 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
527
  </tr>
528
  <tr>
529
  <td class="tg-0lax">Seed-TTS_RL</td>
530
- <td class="tg-0lax"><span style="font-weight:bold">1.00</span> | 1.94 | <span style="font-weight:bold">6.42</span></td>
531
  </tr>
532
  <tr>
533
  <td class="tg-0lax">MaskGCT</td>
@@ -539,7 +516,7 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
539
  </tr>
540
  <tr>
541
  <td class="tg-0lax">F5-TTS</td>
542
- <td class="tg-0lax">1.56 | <span style="font-weight:bold">1.83</span> | 8.67</td>
543
  </tr>
544
  <tr>
545
  <td class="tg-0lax">CosyVoice 2</td>
@@ -567,7 +544,7 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
567
  </tr>
568
  <tr>
569
  <td class="tg-0lax">Seed-TTS_RL</td>
570
- <td class="tg-0lax"><span style="font-weight:bold">0.801</span> | <span style="font-weight:bold">0.766</span> | <span style="font-weight:bold">0.782</span></td>
571
  </tr>
572
  <tr>
573
  <td class="tg-0lax">MaskGCT</td>
@@ -611,10 +588,10 @@ Qwen2.5-Omni is an end-to-end multimodal model designed to perceive diverse moda
611
  | GPQA | 30.8 | **36.4** | 34.3 | 32.8 | 32.8 |
612
  | MATH | 71.5 | **75.5** | 52.9 | 51.9 | 44.3 |
613
  | GSM8K | 88.7 | **91.6** | 85.7 | 84.5 | 76.7 |
614
- | HumanEval | 79.9 | **84.8** | 79.9 | 72.6 | 68.9 |
615
- | MBPP | 73.7 | **79.2** | 67.2 | 69.6 | 74.9 |
616
- | MultiPL-E | 67.0 | **70.4** | 59.1 | 50.7 | 53.4 |
617
- | LiveCodeBench<sub>2305-2409</sub> | 25.2 | **28.7** | 23.9 | 8.3 | 18.9 |
618
  </details>
619
 
620
  ## Quickstart
 
43
 
44
  ### Performance
45
 
46
+ We conducted a comprehensive evaluation of Qwen2.5-Omni. It demonstrates strong performance across all modalities when compared with similarly sized single-modality models such as Qwen2.5-VL-7B and Qwen2-Audio, and with closed-source models such as Gemini-1.5-Pro. In tasks requiring the integration of multiple modalities, such as OmniBench, Qwen2.5-Omni achieves state-of-the-art performance. Furthermore, in single-modality tasks, it excels in areas including speech recognition (Common Voice), translation (CoVoST2), audio understanding (MMAU), image reasoning (MMMU, MMStar), video understanding (MVBench), and speech generation (Seed-tts-eval and subjective naturalness).
47
+
48
+ <p align="center">
49
+ <img src="https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/bar.png" width="80%"/>
50
+ </p>
51
+
52
  <details>
53
  <summary>Multimodality -> Text</summary>
54
 
55
+ <table class="tg"><thead>
 
 
 
 
 
 
 
 
56
  <tr>
57
  <th class="tg-0lax">Datasets</th>
58
  <th class="tg-0lax">Model</th>
 
74
  </tr>
75
  <tr>
76
  <td class="tg-0lax">video-SALMONN</td>
77
+ <td class="tg-0lax">34.11%|31.70%|<strong>56.60%</strong>|35.64%</td>
78
  </tr>
79
  <tr>
80
  <td class="tg-0lax">UnifiedIO2-xlarge</td>
 
82
  </tr>
83
  <tr>
84
  <td class="tg-0lax">UnifiedIO2-xxlarge</td>
 
 
 
 
85
  <td class="tg-0lax">34.24%|36.98%|24.53%|33.98%</td>
86
  </tr>
87
  <tr>
88
+ <td class="tg-0lax">MiniCPM-o</td>
89
  <td class="tg-0lax">-|-|-|40.50%</td>
90
  </tr>
91
  <tr>
92
+ <td class="tg-0lax">Baichuan-Omni-1.5</td>
93
  <td class="tg-0lax">-|-|-|42.90%</td>
94
  </tr>
95
  <tr>
96
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
97
+ <td class="tg-0lax"><strong>55.25%</strong>|<strong>60.00%</strong>|52.83%|<strong>56.13%</strong></td>
98
  </tr>
99
  </tbody></table>
100
  </details>
 
103
  <details>
104
  <summary>Audio -> Text</summary>
105
 
106
+
107
+ <table class="tg"><thead>
 
 
 
 
 
 
 
 
108
  <tr>
109
  <th class="tg-0lax">Datasets</th>
110
  <th class="tg-0lax">Model</th>
 
137
  </tr>
138
  <tr>
139
  <td class="tg-0lax">Seed-ASR-Multilingual</td>
140
+ <td class="tg-0lax">-|-|<strong>1.6</strong>|<strong>2.8</strong></td>
141
  </tr>
142
  <tr>
143
  <td class="tg-0lax">MiniCPM-o</td>
 
153
  </tr>
154
  <tr>
155
  <td class="tg-0lax">Qwen2-Audio</td>
156
+ <td class="tg-0lax"><strong>1.3</strong>|<strong>3.4</strong>|<strong>1.6</strong>|3.6</td>
157
  </tr>
158
  <tr>
159
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
 
162
  <tr>
163
  <td class="tg-0lax" rowspan="4">Common Voice 15<br>en | zh | yue | fr</td>
164
  <td class="tg-0lax">Whisper-large-v3</td>
165
+ <td class="tg-0lax">9.3|12.8|10.9|10.8</td>
166
  </tr>
167
  <tr>
168
  <td class="tg-0lax">MinMo</td>
 
170
  </tr>
171
  <tr>
172
  <td class="tg-0lax">Qwen2-Audio</td>
173
+ <td class="tg-0lax">8.6|6.9|<strong>5.9</strong>|9.6</td>
174
  </tr>
175
  <tr>
176
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
177
+ <td class="tg-0lax"><strong>7.6</strong>|<strong>5.2</strong>|7.3|<strong>7.5</strong></td>
178
  </tr>
179
  <tr>
180
  <td class="tg-0lax" rowspan="7">Fleurs<br>zh | en</td>
 
183
  </tr>
184
  <tr>
185
  <td class="tg-0lax">Seed-ASR-Multilingual</td>
186
+ <td class="tg-0lax">-|<strong>3.4</strong></td>
187
  </tr>
188
  <tr>
189
  <td class="tg-0lax">Megrez-3B-Omni</td>
 
203
  </tr>
204
  <tr>
205
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
206
+ <td class="tg-0lax"><strong>3.0</strong>|4.1</td>
207
  </tr>
208
  <tr>
209
  <td class="tg-0lax" rowspan="5">Wenetspeech<br>test-net | test-meeting</td>
210
  <td class="tg-0lax">Seed-ASR-Chinese</td>
211
+ <td class="tg-0lax"><strong>4.7|5.7</strong></td>
212
  </tr>
213
  <tr>
214
  <td class="tg-0lax">Megrez-3B-Omni</td>
 
233
  </tr>
234
  <tr>
235
  <td class="tg-0lax">Llama-3-70B</td>
236
+ <td class="tg-0lax"><strong>5.7</strong></td>
237
  </tr>
238
  <tr>
239
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
 
257
  </tr>
258
  <tr>
259
  <td class="tg-0lax">MiniCPM-o</td>
260
+ <td class="tg-0lax">-|-|<strong>48.2</strong>|27.2</td>
261
  </tr>
262
  <tr>
263
  <td class="tg-0lax">MinMo</td>
264
+ <td class="tg-0lax">-|<strong>39.9</strong>|46.7|26.0</td>
265
  </tr>
266
  <tr>
267
  <td class="tg-0lax">Qwen-Audio</td>
 
273
  </tr>
274
  <tr>
275
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
276
+ <td class="tg-0lax"><strong>30.2</strong>|37.7|41.4|<strong>29.4</strong></td>
277
  </tr>
278
  <tr>
279
  <td class="tg-9j4x" colspan="3">SER</td>
 
297
  </tr>
298
  <tr>
299
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
300
+ <td class="tg-0lax"><strong>0.570</strong></td>
301
  </tr>
302
  <tr>
303
  <td class="tg-9j4x" colspan="3">VSC</td>
 
317
  </tr>
318
  <tr>
319
  <td class="tg-0lax">Qwen2-Audio</td>
320
+ <td class="tg-0lax"><strong>0.939</strong></td>
321
  </tr>
322
  <tr>
323
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
324
+ <td class="tg-0lax"><strong>0.939</strong></td>
325
  </tr>
326
  <tr>
327
  <td class="tg-9j4x" colspan="3">Music</td>
 
333
  </tr>
334
  <tr>
335
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
336
+ <td class="tg-0lax"><strong>0.88</strong></td>
337
  </tr>
338
  <tr>
339
  <td class="tg-0lax" rowspan="2">MusicCaps</td>
340
  <td class="tg-0lax">LP-MusicCaps</td>
341
+ <td class="tg-0lax">0.291|0.149|0.089|<strong>0.061</strong>|<strong>0.129</strong>|0.130</td>
342
  </tr>
343
  <tr>
344
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
345
+ <td class="tg-0lax"><strong>0.328</strong>|<strong>0.162</strong>|<strong>0.090</strong>|0.055|0.127|<strong>0.225</strong></td>
346
  </tr>
347
  <tr>
348
  <td class="tg-9j4x" colspan="3">Audio Reasoning</td>
 
354
  </tr>
355
  <tr>
356
  <td class="tg-0lax">Qwen2-Audio</td>
357
+ <td class="tg-0lax">54.95|50.98|42.04|49.20</td>
358
  </tr>
359
  <tr>
360
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
361
+ <td class="tg-0lax"><strong>67.87|69.16|59.76|65.60</strong></td>
362
  </tr>
363
  <tr>
364
  <td class="tg-9j4x" colspan="3">Voice Chatting</td>
 
366
  <tr>
367
  <td class="tg-0lax" rowspan="8">VoiceBench<br>AlpacaEval | CommonEval | SD-QA | MMSU</td>
368
  <td class="tg-0lax">Ultravox-v0.4.1-LLaMA-3.1-8B</td>
369
+ <td class="tg-0lax"><strong>4.55</strong>|3.90|53.35|47.17</td>
370
  </tr>
371
  <tr>
372
  <td class="tg-0lax">MERaLiON</td>
 
382
  </tr>
383
  <tr>
384
  <td class="tg-0lax">MiniCPM-o</td>
385
+ <td class="tg-0lax">4.42|<strong>4.15</strong>|50.72|54.78</td>
386
  </tr>
387
  <tr>
388
  <td class="tg-0lax">Baichuan-Omni-1.5</td>
 
394
  </tr>
395
  <tr>
396
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
397
+ <td class="tg-0lax">4.49|3.93|<strong>55.71</strong>|<strong>61.32</strong></td>
398
  </tr>
399
  <tr>
400
  <td class="tg-0lax" rowspan="8">VoiceBench<br>OpenBookQA | IFEval | AdvBench | Avg</td>
401
  <td class="tg-0lax">Ultravox-v0.4.1-LLaMA-3.1-8B</td>
402
+ <td class="tg-0lax">65.27|<strong>66.88</strong>|98.46|71.45</td>
403
  </tr>
404
  <tr>
405
  <td class="tg-0lax">MERaLiON</td>
 
427
  </tr>
428
  <tr>
429
  <td class="tg-0lax">Qwen2.5-Omni-7B</td>
430
+ <td class="tg-0lax"><strong>81.10</strong>|52.87|<strong>99.42</strong>|<strong>74.12</strong></td>
431
  </tr>
432
  </tbody></table>
433
  </details>
 
459
 
460
  | Dataset | Qwen2.5-Omni-7B | Qwen2.5-VL-7B | Grounding DINO | Gemini 1.5 Pro |
461
  |--------------------------|--------------|---------------|----------------|----------------|
462
+ | Refcoco<sub>val</sub> | 90.5 | 90.0 | **90.6** | 73.2 |
463
+ | Refcoco<sub>testA</sub> | **93.5** | 92.5 | 93.2 | 72.9 |
464
+ | Refcoco<sub>testB</sub> | 86.6 | 85.4 | **88.2** | 74.6 |
465
+ | Refcoco+<sub>val</sub> | 85.4 | 84.2 | **88.2** | 62.5 |
466
  | Refcoco+<sub>testA</sub> | **91.0** | 89.1 | 89.0 | 63.9 |
467
+ | Refcoco+<sub>testB</sub> | **79.3** | 76.9 | 75.9 | 65.0 |
468
+ | Refcocog+<sub>val</sub> | **87.4** | 87.2 | 86.1 | 75.2 |
469
+ | Refcocog+<sub>test</sub> | **87.9** | 87.2 | 87.0 | 76.2 |
470
  | ODinW | 42.4 | 37.3 | **55.0** | 36.7 |
471
+ | PointGrounding | 66.5 | **67.3** | - | - |
472
  </details>
473
 
474
 
 
477
 
478
  | Dataset | Qwen2.5-Omni-7B | Other Best | Qwen2.5-VL-7B | GPT-4o-mini |
479
  |-----------------------------|--------------|------------|---------------|-------------|
480
+ | Video-MME<sub>w/o sub</sub> | 64.3 | 63.9 | **65.1** | 64.8 |
481
+ | Video-MME<sub>w sub</sub> | **72.4** | 67.9 | 71.6 | - |
482
+ | MVBench | **70.3** | 67.2 | 69.6 | - |
483
+ | EgoSchema<sub>test</sub> | **68.6** | 63.2 | 65.0 | - |
484
  </details>
485
 
 
486
  <details>
487
  <summary>Zero-shot Speech Generation</summary>
488
 
489
+
490
+ <table class="tg"><thead>
 
 
 
 
 
 
 
 
491
  <tr>
492
  <th class="tg-0lax">Datasets</th>
493
  <th class="tg-0lax">Model</th>
 
504
  </tr>
505
  <tr>
506
  <td class="tg-0lax">Seed-TTS_RL</td>
507
+ <td class="tg-0lax"><strong>1.00</strong> | 1.94 | <strong>6.42</strong></td>
508
  </tr>
509
  <tr>
510
  <td class="tg-0lax">MaskGCT</td>
 
516
  </tr>
517
  <tr>
518
  <td class="tg-0lax">F5-TTS</td>
519
+ <td class="tg-0lax">1.56 | <strong>1.83</strong> | 8.67</td>
520
  </tr>
521
  <tr>
522
  <td class="tg-0lax">CosyVoice 2</td>
 
544
  </tr>
545
  <tr>
546
  <td class="tg-0lax">Seed-TTS_RL</td>
547
+ <td class="tg-0lax"><strong>0.801</strong> | <strong>0.766</strong> | <strong>0.782</strong></td>
548
  </tr>
549
  <tr>
550
  <td class="tg-0lax">MaskGCT</td>
 
588
  | GPQA | 30.8 | **36.4** | 34.3 | 32.8 | 32.8 |
589
  | MATH | 71.5 | **75.5** | 52.9 | 51.9 | 44.3 |
590
  | GSM8K | 88.7 | **91.6** | 85.7 | 84.5 | 76.7 |
591
+ | HumanEval | 78.7 | **84.8** | 79.9 | 72.6 | 68.9 |
592
+ | MBPP | 73.2 | **79.2** | 67.2 | 69.6 | 74.9 |
593
+ | MultiPL-E | 65.8 | **70.4** | 59.1 | 50.7 | 53.4 |
594
+ | LiveCodeBench<sub>2305-2409</sub> | 24.6 | **28.7** | 23.9 | 8.3 | 18.9 |
595
  </details>
596
 
597
  ## Quickstart