nm-research committed on
Commit
2fb57dc
·
verified ·
1 Parent(s): 9037a40

Add reasoning evals

Browse files
Files changed (1) hide show
  1. README.md +25 -0
README.md CHANGED
@@ -176,6 +176,31 @@ lm_eval \
176
  </thead>
177
  <tbody>
178
  <tr>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
179
  <td rowspan="7"><b>OpenLLM V1</b></td>
180
  <td>ARC-Challenge (Acc-Norm, 25-shot)</td>
181
  <td>63.65</td>
 
176
  </thead>
177
  <tbody>
178
  <tr>
179
+ <td rowspan="4"><b>Reasoning</b></td>
180
+ <td>AIME 2024 (pass@1)</td>
181
+ <td>67.83</td>
182
+ <td>67.78</td>
183
+ <td>99.93%</td>
184
+ </tr>
185
+ <tr>
186
+ <td>MATH-500 (pass@1)</td>
187
+ <td>95.29</td>
188
+ <td>95.27</td>
189
+ <td>99.98%</td>
190
+ </tr>
191
+ <tr>
192
+ <td>GPQA Diamond (pass@1)</td>
193
+ <td>65.57</td>
194
+ <td>65.01</td>
195
+ <td>99.15%</td>
196
+ </tr>
197
+ <tr>
198
+ <td><b>Average Score</b></td>
199
+ <td><b>76.23</b></td>
200
+ <td><b>76.02</b></td>
201
+ <td><b>99.72%</b></td>
202
+ </tr>
203
+ <tr>
204
  <td rowspan="7"><b>OpenLLM V1</b></td>
205
  <td>ARC-Challenge (Acc-Norm, 25-shot)</td>
206
  <td>63.65</td>