Update README.md
Browse files
Added BBH scores.
README.md
CHANGED
@@ -190,8 +190,8 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
190 |
<th style="text-align:center; background-color: #001d6c; color: white;">MMLU</th>
|
191 |
<th style="text-align:center; background-color: #001d6c; color: white;">PopQA</th>
|
192 |
<th style="text-align:center; background-color: #001d6c; color: white;">TruthfulQA</th>
|
193 |
-
|
194 |
-
<th style="text-align:center; background-color: #001d6c; color: white;">DROP<sup id="fnref2"><a href="#fn2">2</a></sup></th>
|
195 |
<th style="text-align:center; background-color: #001d6c; color: white;">GSM8K</th>
|
196 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval</th>
|
197 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval+</th>
|
@@ -206,7 +206,7 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
206 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.11</td>
|
207 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.55</td>
|
208 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.79</td>
|
209 |
-
|
210 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.99</td>
|
211 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.55</td>
|
212 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.45</td>
|
@@ -221,7 +221,7 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
221 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.18</td>
|
222 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.56</td>
|
223 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.8</td>
|
224 |
-
|
225 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">23.84</td>
|
226 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.02</td>
|
227 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.13</td>
|
@@ -236,7 +236,7 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
236 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 55.88 </td>
|
237 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 18.4 </td>
|
238 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 58.97 </td>
|
239 |
-
|
240 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 44.33 </td>
|
241 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 72.48 </td>
|
242 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.51 </td>
|
@@ -252,7 +252,7 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
252 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.15</td>
|
253 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.79</td>
|
254 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.79</td>
|
255 |
-
|
256 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">71.23</td>
|
257 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">83.24</td>
|
258 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.32</td>
|
@@ -268,7 +268,7 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
268 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">45.80</td>
|
269 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">13.25</td>
|
270 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.43</td>
|
271 |
-
|
272 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">49.73</td>
|
273 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.18</td>
|
274 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.54</td>
|
@@ -284,7 +284,7 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
284 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">74.30</td>
|
285 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">18.12</td>
|
286 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">63.06</td>
|
287 |
-
|
288 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.06</td>
|
289 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">84.46</td>
|
290 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">93.35</td>
|
@@ -300,7 +300,7 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
300 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.72</td>
|
301 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">9.94</td>
|
302 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.14</td>
|
303 |
-
|
304 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">51.78</td>
|
305 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.47</td>
|
306 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.89</td>
|
@@ -315,7 +315,7 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
315 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.77</td>
|
316 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.7</td>
|
317 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.84</td>
|
318 |
-
|
319 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.57</td>
|
320 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.15</td>
|
321 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.63</td>
|
@@ -331,7 +331,7 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
331 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.79</td>
|
332 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.04</td>
|
333 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.92</td>
|
334 |
-
|
335 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.29</td>
|
336 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">81.65</td>
|
337 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.35</td>
|
@@ -346,7 +346,7 @@ By redesigning a common household item like the plastic bottle, we can create a
|
|
346 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 65.54 </td>
|
347 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 26.17 </td>
|
348 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 66.86 </td>
|
349 |
-
|
350 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 59.36 </td>
|
351 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.89 </td>
|
352 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 89.73 </td>
|
@@ -414,7 +414,8 @@ Granite-3.3-8B-Instruct builds upon Granite-3.3-8B-Base, leveraging both permiss
|
|
414 |
- 💡 Learn about the latest Granite learning resources: https://ibm.biz/granite-learning-resources
|
415 |
|
416 |
<p><a href="#fnref1" title="Jump back to reference">[1]</a> Evaluated using <a href="https://github.com/allenai/olmes">OLMES</a> (except AttaQ and Arena-Hard scores)</p>
|
417 |
-
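<p><a href="#fnref2" title="Jump back to reference">[2]</a> Modified the implementation to handle some of the issues mentioned <a href="https://huggingface.co/blog/open-llm-leaderboard-drop">here</a></p>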
|
|
|
418 |
<!-- ## Citation
|
419 |
<!-- ## Citation
|
420 |
```
|
|
|
190 |
<th style="text-align:center; background-color: #001d6c; color: white;">MMLU</th>
|
191 |
<th style="text-align:center; background-color: #001d6c; color: white;">PopQA</th>
|
192 |
<th style="text-align:center; background-color: #001d6c; color: white;">TruthfulQA</th>
|
193 |
+
<th style="text-align:center; background-color: #001d6c; color: white;">BigBenchHard<sup id="fnref2"><a href="#fn2">2</a></sup></th>
|
194 |
+
<th style="text-align:center; background-color: #001d6c; color: white;">DROP<sup id="fnref3"><a href="#fn3">3</a></sup></th>
|
195 |
<th style="text-align:center; background-color: #001d6c; color: white;">GSM8K</th>
|
196 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval</th>
|
197 |
<th style="text-align:center; background-color: #001d6c; color: white;">HumanEval+</th>
|
|
|
206 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.11</td>
|
207 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.55</td>
|
208 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.79</td>
|
209 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">61.82</td>
|
210 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.99</td>
|
211 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.55</td>
|
212 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.45</td>
|
|
|
221 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">57.18</td>
|
222 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">20.56</td>
|
223 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">59.8</td>
|
224 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">61.39</td>
|
225 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">23.84</td>
|
226 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.02</td>
|
227 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">80.13</td>
|
|
|
236 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 55.88 </td>
|
237 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 18.4 </td>
|
238 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 58.97 </td>
|
239 |
+
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 63.91 </td>
|
240 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 44.33 </td>
|
241 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 72.48 </td>
|
242 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.51 </td>
|
|
|
252 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.15</td>
|
253 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.79</td>
|
254 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">52.79</td>
|
255 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">73.43</td>
|
256 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">71.23</td>
|
257 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">83.24</td>
|
258 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">85.32</td>
|
|
|
268 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">45.80</td>
|
269 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">13.25</td>
|
270 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.43</td>
|
271 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.39</td>
|
272 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">49.73</td>
|
273 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">72.18</td>
|
274 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.54</td>
|
|
|
284 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">74.30</td>
|
285 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">18.12</td>
|
286 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">63.06</td>
|
287 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.19</td>
|
288 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">64.06</td>
|
289 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">84.46</td>
|
290 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">93.35</td>
|
|
|
300 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">50.72</td>
|
301 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">9.94</td>
|
302 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">47.14</td>
|
303 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">67.38</td>
|
304 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">51.78</td>
|
305 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">78.47</td>
|
306 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.89</td>
|
|
|
315 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.77</td>
|
316 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.7</td>
|
317 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">65.84</td>
|
318 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">69.87</td>
|
319 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.57</td>
|
320 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">79.15</td>
|
321 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.63</td>
|
|
|
331 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.79</td>
|
332 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">28.04</td>
|
333 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">66.92</td>
|
334 |
+
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">71.86</td>
|
335 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">58.29</td>
|
336 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">81.65</td>
|
337 |
<td style="text-align:center; background-color: #FFFFFF; color: #2D2D2D;">89.35</td>
|
|
|
346 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 65.54 </td>
|
347 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 26.17 </td>
|
348 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 66.86 </td>
|
349 |
+
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 69.13 </td>
|
350 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 59.36 </td>
|
351 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 80.89 </td>
|
352 |
<td style="text-align:center; background-color: #DAE8FF; color: black;"> 89.73 </td>
|
|
|
414 |
- 💡 Learn about the latest Granite learning resources: https://ibm.biz/granite-learning-resources
|
415 |
|
416 |
<p><a href="#fnref1" title="Jump back to reference">[1]</a> Evaluated using <a href="https://github.com/allenai/olmes">OLMES</a> (except AttaQ and Arena-Hard scores)</p>
|
417 |
+
<p><a href="#fnref2" title="Jump back to reference">[2]</a> Added regex for more efficient answer extraction.</p>
|
418 |
+
<p><a href="#fnref3" title="Jump back to reference">[3]</a> Modified the implementation to handle some of the issues mentioned <a href="https://huggingface.co/blog/open-llm-leaderboard-drop">here</a></p>
|
419 |
<!-- ## Citation
|
420 |
<!-- ## Citation
|
421 |
```
|