nm-research committed on
Commit
fb3637a
·
verified ·
1 Parent(s): 1aee80e

Add reasoning evals

Browse files
Files changed (1) hide show
  1. README.md +25 -0
README.md CHANGED
@@ -158,6 +158,31 @@ lm_eval \
158
  </thead>
159
  <tbody>
160
  <tr>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  <td rowspan="7"><b>OpenLLM V1</b></td>
162
  <td>ARC-Challenge (Acc-Norm, 25-shot)</td>
163
  <td>63.65</td>
 
158
  </thead>
159
  <tbody>
160
  <tr>
161
+ <td rowspan="4"><b>Reasoning</b></td>
162
+ <td>AIME 2024 (pass@1)</td>
163
+ <td>67.83</td>
164
+ <td>69.17</td>
165
+ <td>101.98%</td>
166
+ </tr>
167
+ <tr>
168
+ <td>MATH-500 (pass@1)</td>
169
+ <td>95.29</td>
170
+ <td>95.14</td>
171
+ <td>99.84%</td>
172
+ </tr>
173
+ <tr>
174
+ <td>GPQA Diamond (pass@1)</td>
175
+ <td>65.57</td>
176
+ <td>65.15</td>
177
+ <td>99.36%</td>
178
+ </tr>
179
+ <tr>
180
+ <td><b>Average Score</b></td>
181
+ <td><b>76.23</b></td>
182
+ <td><b>76.49</b></td>
183
+ <td><b>100.34%</b></td>
184
+ </tr>
185
+ <tr>
186
  <td rowspan="7"><b>OpenLLM V1</b></td>
187
  <td>ARC-Challenge (Acc-Norm, 25-shot)</td>
188
  <td>63.65</td>