ranarag committed on
Commit
85233b2
·
verified ·
1 Parent(s): de4d392

Update README.md

Browse files

added BBH scores.

Files changed (1) hide show
  1. README.md +14 -13
README.md CHANGED
@@ -190,8 +190,8 @@ By redesigning a common household item like the plastic bottle, we can create a
190
  <th style="text-align:center; background-color: #001d6c; color: white;">MMLU</th>
191
  <th style="text-align:center; background-color: #001d6c; color: white;">PopQA</th>
192
  <th style="text-align:center; background-color: #001d6c; color: white;">TruthfulQA</th>
193
- <!-- <th style="text-align:center; background-color: #001d6c; color: white;">BigBenchHard</th> -->
194
- <th style="text-align:center; background-color: #001d6c; color: white;">DROP<sup id="fnref2"><a href="#fn2">2</a></sup></th>
195
  <th style="text-align:center; background-color: #001d6c; color: white;">GSM8K</th>
196
  <th style="text-align:center; background-color: #001d6c; color: white;">HumanEval</th>
197
  <th style="text-align:center; background-color: #001d6c; color: white;">HumanEval+</th>
@@ -206,7 +206,7 @@ By redesigning a common household item like the plastic bottle, we can create a
206
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.11</td>
207
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.55</td>
208
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.79</td>
209
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">54.46</td> -->
210
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.99</td>
211
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.55</td>
212
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.45</td>
@@ -221,7 +221,7 @@ By redesigning a common household item like the plastic bottle, we can create a
221
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.18</td>
222
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.56</td>
223
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.8</td>
224
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.27</td> -->
225
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">23.84</td>
226
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.02</td>
227
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.13</td>
@@ -236,7 +236,7 @@ By redesigning a common household item like the plastic bottle, we can create a
236
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 55.88 </td>
237
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 18.4 </td>
238
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 58.97 </td>
239
- <!-- <td style="text-align:center; background-color: #DAE8FF; color: black;"> 52.51 </td> -->
240
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 44.33 </td>
241
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 72.48 </td>
242
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.51 </td>
@@ -252,7 +252,7 @@ By redesigning a common household item like the plastic bottle, we can create a
252
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.15</td>
253
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.79</td>
254
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.79</td>
255
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.66</td> -->
256
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">71.23</td>
257
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">83.24</td>
258
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.32</td>
@@ -268,7 +268,7 @@ By redesigning a common household item like the plastic bottle, we can create a
268
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">45.80</td>
269
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">13.25</td>
270
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.43</td>
271
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.71</td> -->
272
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">49.73</td>
273
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.18</td>
274
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.54</td>
@@ -284,7 +284,7 @@ By redesigning a common household item like the plastic bottle, we can create a
284
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">74.30</td>
285
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">18.12</td>
286
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">63.06</td>
287
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">70.40</td> -->
288
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.06</td>
289
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">84.46</td>
290
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">93.35</td>
@@ -300,7 +300,7 @@ By redesigning a common household item like the plastic bottle, we can create a
300
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.72</td>
301
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">9.94</td>
302
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.14</td>
303
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.04</td> -->
304
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">51.78</td>
305
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.47</td>
306
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.89</td>
@@ -315,7 +315,7 @@ By redesigning a common household item like the plastic bottle, we can create a
315
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.77</td>
316
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.7</td>
317
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.84</td>
318
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">68.55</td> -->
319
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.57</td>
320
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.15</td>
321
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.63</td>
@@ -331,7 +331,7 @@ By redesigning a common household item like the plastic bottle, we can create a
331
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.79</td>
332
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.04</td>
333
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.92</td>
334
- <!-- <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.77</td> -->
335
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.29</td>
336
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">81.65</td>
337
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.35</td>
@@ -346,7 +346,7 @@ By redesigning a common household item like the plastic bottle, we can create a
346
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 65.54 </td>
347
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 26.17 </td>
348
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 66.86 </td>
349
- <!-- <td style="text-align:center; background-color: #DAE8FF; color: black;"> 59.01 </td> -->
350
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 59.36 </td>
351
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.89 </td>
352
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 89.73 </td>
@@ -414,7 +414,8 @@ Granite-3.3-8B-Instruct builds upon Granite-3.3-8B-Base, leveraging both permiss
414
  - 💡 Learn about the latest Granite learning resources: https://ibm.biz/granite-learning-resources
415
 
416
  <p><a href="#fnref1" title="Jump back to reference">[1]</a> Evaluated using <a href="https://github.com/allenai/olmes">OLMES</a> (except AttaQ and Arena-Hard scores)</p>
417
- <p><a href="#fnref2" title="Jump back to reference">[2]</a> Modified the implementation to handle some of the issues mentioned <a href="https://huggingface.co/blog/open-llm-leaderboard-drop">here</a></p>
 
418
  <!-- ## Citation
419
  <!-- ## Citation
420
  ```
 
190
  <th style="text-align:center; background-color: #001d6c; color: white;">MMLU</th>
191
  <th style="text-align:center; background-color: #001d6c; color: white;">PopQA</th>
192
  <th style="text-align:center; background-color: #001d6c; color: white;">TruthfulQA</th>
193
+ <th style="text-align:center; background-color: #001d6c; color: white;">BigBenchHard<sup id="fnref2"><a href="#fn2">2</a></sup></th>
194
+ <th style="text-align:center; background-color: #001d6c; color: white;">DROP<sup id="fnref3"><a href="#fn3">3</a></sup></th>
195
  <th style="text-align:center; background-color: #001d6c; color: white;">GSM8K</th>
196
  <th style="text-align:center; background-color: #001d6c; color: white;">HumanEval</th>
197
  <th style="text-align:center; background-color: #001d6c; color: white;">HumanEval+</th>
 
206
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.11</td>
207
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.55</td>
208
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.79</td>
209
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">61.82</td>
210
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.99</td>
211
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.55</td>
212
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.45</td>
 
221
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.18</td>
222
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.56</td>
223
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.8</td>
224
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">61.39</td>
225
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">23.84</td>
226
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.02</td>
227
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.13</td>
 
236
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 55.88 </td>
237
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 18.4 </td>
238
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 58.97 </td>
239
+ <td style="text-align:center; background-color: #DAE8FF; color: black;"> 63.91 </td>
240
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 44.33 </td>
241
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 72.48 </td>
242
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.51 </td>
 
252
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.15</td>
253
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.79</td>
254
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.79</td>
255
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">73.43</td>
256
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">71.23</td>
257
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">83.24</td>
258
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.32</td>
 
268
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">45.80</td>
269
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">13.25</td>
270
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.43</td>
271
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.39</td>
272
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">49.73</td>
273
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.18</td>
274
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.54</td>
 
284
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">74.30</td>
285
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">18.12</td>
286
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">63.06</td>
287
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.19</td>
288
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.06</td>
289
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">84.46</td>
290
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">93.35</td>
 
300
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.72</td>
301
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">9.94</td>
302
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.14</td>
303
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.38</td>
304
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">51.78</td>
305
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.47</td>
306
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.89</td>
 
315
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.77</td>
316
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.7</td>
317
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.84</td>
318
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.87</td>
319
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.57</td>
320
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.15</td>
321
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.63</td>
 
331
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.79</td>
332
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.04</td>
333
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.92</td>
334
+ <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">71.86</td>
335
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.29</td>
336
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">81.65</td>
337
  <td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.35</td>
 
346
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 65.54 </td>
347
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 26.17 </td>
348
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 66.86 </td>
349
+ <td style="text-align:center; background-color: #DAE8FF; color: black;"> 69.13 </td>
350
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 59.36 </td>
351
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.89 </td>
352
  <td style="text-align:center; background-color: #DAE8FF; color: black;"> 89.73 </td>
 
414
  - 💡 Learn about the latest Granite learning resources: https://ibm.biz/granite-learning-resources
415
 
416
  <p><a href="#fnref1" title="Jump back to reference">[1]</a> Evaluated using <a href="https://github.com/allenai/olmes">OLMES</a> (except AttaQ and Arena-Hard scores)</p>
417
+ <p><a href="#fnref2" title="Jump back to reference">[2]</a> Added regex for more efficient answer extraction.</p>
418
+ <p><a href="#fnref3" title="Jump back to reference">[3]</a> Modified the implementation to handle some of the issues mentioned <a href="https://huggingface.co/blog/open-llm-leaderboard-drop">here</a></p>
419
  <!-- ## Citation
420
  <!-- ## Citation
421
  ```