Konstantin Chernyshev committed · 148c1e7
1 Parent(s): 5a252e0

feat: add u-math vs mu-math

Files changed:
- README.md (+1 -1)
- app.py (+43 -2)
- src/populate.py (+120 -61)
README.md
CHANGED
@@ -42,4 +42,4 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
 You'll find
 - the main table' columns names and properties in `src/display/utils.py`
 - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py
CHANGED
@@ -1,4 +1,5 @@
 import os
+from typing import Any
 
 import gradio as gr
 import pandas as pd
@@ -9,9 +10,11 @@ from src.about import CITATION_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TIT
 from src.populate import (
     MU_MATH_COLUMNS_DICT,
     U_MATH_COLUMNS_DICT,
+    U_MATH_AND_MU_MATH_COLUMNS_DICT,
     Field,
     get_mu_math_leaderboard_df,
     get_u_math_leaderboard_df,
+    get_joined_leaderboard_df,
 )
 
 
@@ -24,6 +27,7 @@ def restart_space():
 
 LEADERBOARD_U_MATH_DF = get_u_math_leaderboard_df()
 LEADERBOARD_MU_MATH_DF = get_mu_math_leaderboard_df()
+LEADERBOARD_U_MATH_MU_MATH_JOINED_DF = get_joined_leaderboard_df()
 
 
 def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) -> gr.components.Component:
@@ -79,6 +83,15 @@ def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) ->
         filtered_df = full_df[full_df[columns_dict["model_size_symbol"].pretty_name] == query_symbol]
         return filtered_df[current_df.columns]
 
+    def filter_dataframe_by_model_family(
+        full_df: pd.DataFrame, current_df: pd.DataFrame, filter_name: str,
+    ) -> pd.DataFrame:
+        if filter_name == "All":
+            return full_df[current_df.columns]
+        else:
+            filtered_df = full_df[full_df[columns_dict["model_family"].pretty_name] == filter_name]
+            return filtered_df[current_df.columns]
+
     with gr.Column() as col:
         # Add the controls
         with gr.Accordion("➡️ See All Columns", open=False):
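Note: the model-family filter reuses the pattern of the size filter just above it: the untouched dataframe lives in a hidden `gr.Dataframe`, the visible one carries the user's current column selection, and each control's callback re-derives the visible rows from the hidden copy. A minimal, self-contained sketch of that pattern (generic column names, not the leaderboard's actual schema):

```python
import gradio as gr
import pandas as pd

df = pd.DataFrame({
    "Model": ["a-7b", "b-70b", "c-8b"],
    "Family": ["A", "B", "A"],
    "Score": [51.2, 63.4, 48.9],
})

def filter_by_family(full_df: pd.DataFrame, current_df: pd.DataFrame, family: str) -> pd.DataFrame:
    # "All" restores every row; otherwise keep only the chosen family.
    # Columns come from the visible dataframe so column toggles survive filtering.
    if family == "All":
        return full_df[current_df.columns]
    return full_df[full_df["Family"] == family][current_df.columns]

with gr.Blocks() as demo:
    selector = gr.Radio(choices=["All"] + sorted(df["Family"].unique()), value="All", label="Family")
    hidden = gr.Dataframe(value=df, visible=False)  # untouched full copy
    visible = gr.Dataframe(value=df)                # what the user sees and toggles
    selector.change(fn=filter_by_family, inputs=[hidden, visible, selector], outputs=[visible])

demo.launch()
```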
@@ -129,6 +142,14 @@ def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) ->
                 interactive=True,
             )
 
+            model_family_filter_selector = gr.Radio(
+                label="Filter model families:",
+                choices=["All"] + list(dataframe[columns_dict["model_family"].pretty_name].unique()),
+                value="All",
+                elem_id="model-family-filter",
+                interactive=True,
+            )
+
         # create the hidden and visible dataframes to display
         hidden_leaderboard_df = gr.components.Dataframe(
             value=dataframe,
@@ -163,6 +184,11 @@ def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) ->
         inputs=[hidden_leaderboard_df, leaderboard_df, model_size_filter_selector],
         outputs=[leaderboard_df],
     )
+    model_family_filter_selector.change(
+        fn=filter_dataframe_by_model_family,
+        inputs=[hidden_leaderboard_df, leaderboard_df, model_family_filter_selector],
+        outputs=[leaderboard_df],
+    )
     for tag, button in all_tags.items():
         button.click(
             fn=filter_dataframe_by_selected_tag_columns,
@@ -206,7 +232,22 @@ with demo:
             tooltip=[MU_MATH_COLUMNS_DICT["full_model_name"].pretty_name, MU_MATH_COLUMNS_DICT["mu_math_f1"].pretty_name],
         )
 
-        with gr.TabItem("📝 About", elem_id="about-tab-table", id=2):
+        with gr.TabItem("📊 U-MATH vs μ-MATH", elem_id="u-math-vs-mu-math-tab-table", id=2):
+            leaderboard_aggregated = init_leaderboard(LEADERBOARD_U_MATH_MU_MATH_JOINED_DF, U_MATH_AND_MU_MATH_COLUMNS_DICT)
+            gr.ScatterPlot(
+                value=LEADERBOARD_U_MATH_MU_MATH_JOINED_DF,
+                title="U-MATH Accuracy (Solving) vs μ-MATH F1 Score (Judging)",
+                x=U_MATH_AND_MU_MATH_COLUMNS_DICT["u_math_acc"].pretty_name,
+                y=U_MATH_AND_MU_MATH_COLUMNS_DICT["mu_math_f1"].pretty_name,
+                color=U_MATH_AND_MU_MATH_COLUMNS_DICT["model_family"].pretty_name,
+                tooltip=[
+                    U_MATH_AND_MU_MATH_COLUMNS_DICT["full_model_name"].pretty_name,
+                    U_MATH_AND_MU_MATH_COLUMNS_DICT["u_math_text_acc"].pretty_name,
+                    U_MATH_AND_MU_MATH_COLUMNS_DICT["u_math_visual_acc"].pretty_name,
+                ],
+            )
+
+        with gr.TabItem("📝 About", elem_id="about-tab-table", id=3):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
     citation_button = gr.Textbox(
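Note: this new tab is the point of the commit: it puts solving (U-MATH accuracy) and judging (μ-MATH F1) for the same model on one scatter plot, colored by model family, with per-model tooltips. `gr.ScatterPlot` is addressed purely by column names, which is why everything is routed through the pretty names in `U_MATH_AND_MU_MATH_COLUMNS_DICT`. A stripped-down sketch with made-up data:

```python
import gradio as gr
import pandas as pd

# Hypothetical joined leaderboard: one row per model, one column per metric.
joined = pd.DataFrame({
    "Model": ["a-7b", "b-70b", "c-8b"],
    "U-MATH Acc": [22.1, 45.7, 30.4],
    "μ-MATH F1": [55.0, 72.3, 61.8],
    "Family": ["A", "B", "A"],
})

with gr.Blocks() as demo:
    gr.ScatterPlot(
        value=joined,
        x="U-MATH Acc",     # solving skill
        y="μ-MATH F1",      # judging skill
        color="Family",     # one color per model family
        tooltip=["Model"],  # shown on hover
        title="Solving vs judging",
    )

demo.launch()
```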
@@ -222,4 +263,4 @@ scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=60 * 60)
 scheduler.start()
 # demo.queue(default_concurrency_limit=40).launch(ssr_mode=False)
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch(allowed_paths=[".cache"])
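Note: one easy-to-miss change is that `launch()` now passes `allowed_paths=[".cache"]`. In Gradio, `allowed_paths` whitelists directories the server may serve files from; presumably assets cached under `.cache` (such as rendered plot files) need to be reachable by the browser. In isolation:

```python
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("…")

# allowed_paths whitelists extra directories Gradio is permitted to serve
# files from; paths outside the whitelist stay blocked.
demo.queue(default_concurrency_limit=40).launch(allowed_paths=[".cache"])
```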
src/populate.py
CHANGED
@@ -3,26 +3,36 @@ import os
 from dataclasses import dataclass, field
 
 import pandas as pd
-from huggingface_hub import model_info
+from huggingface_hub import model_info, ModelInfo
 from transformers import AutoConfig
 
 
 UNKNOWN_MODEL_SHOW_SIZE = 150
 
 
-def is_model_on_hub(
-    model_name: str, revision: str, token: str = None, trust_remote_code=False
-) -> tuple[bool, str | None, str | None]:
-    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
+def get_hf_model_info_card_or_none(model_name: str) -> ModelInfo | None:
     try:
-
-
-
-        return
+        info = model_info(repo_id=model_name)
+        return info
+    except Exception:
+        return None
+
+
+def get_hf_hub_config_or_none(model_name: str) -> AutoConfig | None:
+    try:
+        config = AutoConfig.from_pretrained(model_name, revision="main", trust_remote_code=True)
+        return config
     except Exception:
-        return
+        return None
 
 
+def get_hf_model_info_card_or_none(model_name: str) -> ModelInfo | None:
+    try:
+        info = model_info(repo_id=model_name)
+        return info
+    except Exception:
+        return None
+
 def model_size_to_symbol(model_size_in_b_params: int | None) -> str:
     """Converts model size to a symbol"""
     if model_size_in_b_params is None or model_size_in_b_params == 0 or not model_size_in_b_params:
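Note: the single on-hub check is replaced by two independent, failure-tolerant lookups: one for the repo's `ModelInfo` card and one for its `AutoConfig`. Each swallows every exception and returns `None`, so a model with a readable card but an unloadable config (or vice versa) still yields partial metadata, at the cost of silently conflating "gated", "network error", and "does not exist". A hedged usage sketch (the model id is illustrative; the helpers are assumed importable from `src.populate`):

```python
from src.populate import get_hf_hub_config_or_none, get_hf_model_info_card_or_none

model_name = "org/some-model"  # illustrative id, not from the leaderboard data

config = get_hf_hub_config_or_none(model_name)     # AutoConfig | None
info = get_hf_model_info_card_or_none(model_name)  # ModelInfo | None

# Each piece of metadata degrades independently instead of failing together.
architecture = "Unknown"
if config and getattr(config, "architectures", None):
    architecture = ";".join(config.architectures)

num_params = None
if info is not None:
    try:
        # Mirrors the commit: safetensors metadata may be absent or shaped differently.
        num_params = round(info.safetensors["total"] / 1e9, 1)
    except Exception:
        pass

print(architecture, num_params)
```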
@@ -67,23 +77,31 @@ def get_hf_data_by_model_name(model_name: str) -> dict:
     elif 'deepseek' in model_name.lower():
         model_family = "DeepSeek"
 
-
-
+    print(model_name, model_family)
+    model_config = get_hf_hub_config_or_none(model_name)
+    model_info_card = get_hf_model_info_card_or_none(model_name)
+    print('model_config', type(model_config))
+    print('model_info_card', type(model_info_card))
+
+    # If model name is a path, try to get the model name from the hub
+    if '/' in model_name:
         print(f"Model {model_name} is not on the hub, try unsloth/...")
         model_name = "unsloth/" + model_name.split("/")[-1]
-
+        if not model_config:
+            model_config = get_hf_hub_config_or_none(model_name)
+        if not model_info_card:
+            model_info_card = get_hf_model_info_card_or_none(model_name)
 
     architecture = "Unknown"
-    if model_config
+    if model_config:
         architectures = getattr(model_config, "architectures", None)
         if architectures:
             architecture = ";".join(architectures)
 
     num_params = None
-    if
-    info = model_info(repo_id=model_name)
+    if model_info_card:
         try:
-            num_params = round(
+            num_params = round(model_info_card.safetensors["total"] / 1e9, 1)
         except Exception as e:
             print("SafeTensors not found in", model_name, e)
             if 'Pixtral-12B' in model_name:
@@ -94,15 +112,13 @@ def get_hf_data_by_model_name(model_name: str) -> dict:
     print("num_params", model_name, num_params)
 
     model_url = None
-    if
+    if model_config or model_info_card:
         model_url = f"https://huggingface.co/{model_name}"
 
     model_license = "Unknown"
-    if
-
-
-        model_license = info.card_data["license_name"]
-        model_license_link = info.card_data["license_link"]
+    if model_info_card:
+        model_license = model_info_card.card_data["license_name"]
+        model_license_link = model_info_card.card_data["license_link"]
         if model_license_link:
             model_license = f"[{model_license}]({model_license_link})"
         if not model_license:
@@ -110,7 +126,7 @@ def get_hf_data_by_model_name(model_name: str) -> dict:
 
     return {
         "model_architecture": architecture,
-        "model_type": "Open-Weights" if
+        "model_type": "Open-Weights" if model_info_card else "Proprietary",
         "model_size": num_params if num_params else None,
         "model_url": model_url,
         "model_license": model_license,
@@ -131,7 +147,7 @@
 MODEL_COLUMNS_DICT = {
     "model_type_symbol": Field("T", "str", never_hidden=True),
     "model_size_symbol": Field("S", "str", never_hidden=True),
-    "full_model_name": Field("Full Model Name", "markdown", fully_hidden=True),
+    "full_model_name": Field("Full Model Name", "markdown", fully_hidden=True, displayed_by_default=False),
     "model_name": Field("Model Name", "markdown", never_hidden=True),
     "model_type": Field("Type", "str", displayed_by_default=False),
     "model_size": Field("#Params (B)", "number", displayed_by_default=False),
@@ -210,6 +226,21 @@ MU_MATH_COLUMNS_DICT = {
     "Qwen2.5-72B-Instruct_ppv": Field("Qwen2.5-72B Subset PPV", "number", displayed_by_default=False),
     "Qwen2.5-72B-Instruct_npv": Field("Qwen2.5-72B Subset NPV", "number", displayed_by_default=False),
 }
+U_MATH_AND_MU_MATH_COLUMNS_DICT = {
+    "u_math_rank": Field("U-MATH Rank", "number", never_hidden=True),
+    "mu_math_rank": Field("μ-MATH Rank", "number", never_hidden=True),
+    **MODEL_COLUMNS_DICT,
+    "u_math_acc": Field("U-MATH Acc", "number", tags=["main", "u_math", "mu_math"]),
+    "u_math_text_acc": Field("U-MATH Text Acc", "number", displayed_by_default=False, tags=["u_math"]),
+    "u_math_visual_acc": Field("U-MATH Visual Acc", "number", displayed_by_default=False, tags=["u_math"]),
+    "judge_model_name": Field("Judge Model Name", "markdown", displayed_by_default=False),
+    "extract_model_name": Field("Extract Model Name", "markdown", displayed_by_default=False),
+    "mu_math_f1": Field("μ-MATH F1", "number", tags=["main", "u_math", "mu_math"]),
+    "mu_math_tpr": Field("μ-MATH TPR", "number", displayed_by_default=False, tags=["mu_math"]),
+    "mu_math_tnr": Field("μ-MATH TNR", "number", displayed_by_default=False, tags=["mu_math"]),
+    "mu_math_ppv": Field("μ-MATH PPV", "number", displayed_by_default=False, tags=["mu_math"]),
+    "mu_math_npv": Field("μ-MATH NPV", "number", displayed_by_default=False, tags=["mu_math"]),
+}
 
 
 def load_json_data(json_path: str, main_col: str | None = None) -> pd.DataFrame:
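Note: `Field`, which app.py imports from `src.populate`, is what these column dictionaries are built from. Its definition is outside this diff, but populate.py's own `from dataclasses import dataclass, field` import plus the call sites (`Field("U-MATH Acc", "number", tags=[...])`, `never_hidden=True`, `fully_hidden=True`, `displayed_by_default=False`) pin down its rough shape. A hypothetical reconstruction, inferred only from those call sites and possibly differing in details:

```python
from dataclasses import dataclass, field

@dataclass
class Field:
    pretty_name: str                   # column header shown in the UI
    type: str = "str"                  # Gradio dataframe column datatype
    displayed_by_default: bool = True  # visible before the user toggles columns
    never_hidden: bool = False         # pinned columns (rank, model name, symbols)
    fully_hidden: bool = False         # kept only in the hidden dataframe
    tags: list[str] = field(default_factory=list)  # drives the tag filter buttons
```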
@@ -226,10 +257,30 @@ def load_json_data(json_path: str, main_col: str | None = None) -> pd.DataFrame:
     return df
 
 
-def get_u_math_leaderboard_df() -> pd.DataFrame:
+def get_model_meta_info_df(model_full_names: list[str]) -> pd.DataFrame:
+    """Given a list of model names, returns a dataframe with model meta info"""
+    model_to_meta_dict = {
+        model_name: get_hf_data_by_model_name(model_name) for model_name in model_full_names
+    }
+
+    df = pd.DataFrame.from_dict(model_to_meta_dict, orient="index")
+    df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
+    df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
+    df["model_size_including_unknown"] = df["model_size"].apply(lambda x: x if x and pd.notna(x) else UNKNOWN_MODEL_SHOW_SIZE).astype(float)
+    df["full_model_name"] = df.index
+    df = df.reset_index(drop=True)
+    df["model_name"] = df["full_model_name"].apply(
+        lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
+    )
+
+    return df
+
+
+def get_u_math_leaderboard_df(use_pretty_names: bool = True, add_meta: bool = True) -> pd.DataFrame:
     """Creates a dataframe from json with U-MATH eval results"""
     json_path = os.path.join("data", "u_math_eval_results.json")
     df = load_json_data(json_path)
+    df = df.rename(columns={"model_name": "full_model_name"})
 
     # flatten list [x, y, z] in columns as ["_acc", "_text_acc", "_visual_acc"] suffixes for columns
     for col in [
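Note: `get_model_meta_info_df` replaces two near-identical column-by-column population blocks (visible as deletions in the next hunk) with one meta frame that each builder attaches via a left join on `full_model_name`. In toy form, the merge each builder now performs:

```python
import pandas as pd

# Toy illustration, not leaderboard data: eval results keyed by full_model_name
# pick up metadata columns through a left join, keeping every result row.
results = pd.DataFrame({"full_model_name": ["a-7b", "b-70b"], "u_math_acc": [22.1, 45.7]})
meta = pd.DataFrame({"full_model_name": ["a-7b", "b-70b"], "model_family": ["A", "B"]})

df = pd.merge(results, meta, on=["full_model_name"], how="left")
print(df)
#   full_model_name  u_math_acc model_family
# 0            a-7b        22.1            A
# 1           b-70b        45.7            B
```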
@@ -251,33 +302,23 @@ def get_u_math_leaderboard_df() -> pd.DataFrame:
     df["rank"] = range(1, len(df) + 1)
 
     # populate with model info
-    model_to_meta_dict = {
-        model_name: get_hf_data_by_model_name(model_name) for model_name in df["model_name"].unique()
-    }
-    df["model_architecture"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_architecture"])
-    df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
-    df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
-    df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
-    df["model_family"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_family"])
-    df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
-    df["model_size_including_unknown"] = df["model_size"].apply(lambda x: x if x and pd.notna(x) else UNKNOWN_MODEL_SHOW_SIZE).astype(float)
-    df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
-    df["full_model_name"] = df["model_name"]
-    df["model_name"] = df["model_name"].apply(
-        lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
-    )
+    if add_meta:
+        df_meta = get_model_meta_info_df(df["full_model_name"].unique())
+        df = pd.merge(df, df_meta, on=["full_model_name"], how="left")
 
     # convert to pretty names and sort columns by order in dict
-    df = df[U_MATH_COLUMNS_DICT.keys()]
-    df = df.rename(columns={key: col.pretty_name for key, col in U_MATH_COLUMNS_DICT.items() if key in df.columns})
+    if use_pretty_names:
+        df = df[U_MATH_COLUMNS_DICT.keys()]
+        df = df.rename(columns={key: col.pretty_name for key, col in U_MATH_COLUMNS_DICT.items() if key in df.columns})
 
     return df
 
 
-def get_mu_math_leaderboard_df() -> pd.DataFrame:
+def get_mu_math_leaderboard_df(use_pretty_names: bool = True, add_meta: bool = True) -> pd.DataFrame:
     """Creates a dataframe from json with mu-MATH eval results"""
     json_path = os.path.join("data", "mu_math_eval_results.json")
     df = load_json_data(json_path)
+    df = df.rename(columns={"model_name": "full_model_name"})
 
     # Calculate columns with prefixes f1, tpr, tnr, ppv, npv
     for col in ["mu_math", "GPT-4o", "Gemini-1.5-Pro", "Llama-3.1-70B-Instruct", "Qwen2.5-72B-Instruct"]:
@@ -306,24 +347,42 @@ def get_mu_math_leaderboard_df() -> pd.DataFrame:
     df["rank"] = range(1, len(df) + 1)
 
     # populate with model info
-    model_to_meta_dict = {
-        model_name: get_hf_data_by_model_name(model_name) for model_name in df["model_name"].unique()
-    }
-    df["model_architecture"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_architecture"])
-    df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
-    df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
-    df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
-    df["model_family"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_family"])
-    df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
-    df["model_size_including_unknown"] = df["model_size"].apply(lambda x: x if x and pd.notna(x) else UNKNOWN_MODEL_SHOW_SIZE).astype(float)
-    df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
-    df["full_model_name"] = df["model_name"]
-    df["model_name"] = df["model_name"].apply(
-        lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
-    )
+    if add_meta:
+        df_meta = get_model_meta_info_df(df["full_model_name"].unique())
+        df = pd.merge(df, df_meta, on=["full_model_name"], how="left")
+
+    # convert to pretty names and sort columns by order in dict
+    if use_pretty_names:
+        df = df[MU_MATH_COLUMNS_DICT.keys()]
+        df = df.rename(columns={key: col.pretty_name for key, col in MU_MATH_COLUMNS_DICT.items() if key in df.columns})
+
+    return df
+
+
+def get_joined_leaderboard_df(use_pretty_names: bool = True, add_meta: bool = True) -> pd.DataFrame:
+    """Creates a dataframe from json with U-MATH and mu-MATH eval results"""
+    u_math_df = get_u_math_leaderboard_df(use_pretty_names=False, add_meta=False)
+    u_math_df = u_math_df.rename(columns={"rank": "u_math_rank"})
+    mu_math_df = get_mu_math_leaderboard_df(use_pretty_names=False, add_meta=False)
+    mu_math_df = mu_math_df.rename(columns={"rank": "mu_math_rank"})
+    assert set(u_math_df.columns).intersection(set(mu_math_df.columns)) == {"full_model_name"}, f"Columns overlap in {u_math_df.columns} and {mu_math_df.columns}"
+
+    # merge U-MATH and mu-MATH dataframes
+    df = pd.merge(u_math_df, mu_math_df, on=["full_model_name"], how="inner", suffixes=("", ""))
+
+    # sort by rank on u_math
+    df = df.sort_values(by=["u_math_rank"], ascending=True)
+
+    # add meta info
+    if add_meta:
+        df_meta = get_model_meta_info_df(df["full_model_name"].unique())
+        df = pd.merge(df, df_meta, on=["full_model_name"], how="left")
 
     # convert to pretty names and sort columns by order in dict
-    df = df[MU_MATH_COLUMNS_DICT.keys()]
-    df = df.rename(columns={key: col.pretty_name for key, col in MU_MATH_COLUMNS_DICT.items() if key in df.columns})
+    if use_pretty_names:
+        df = df[U_MATH_AND_MU_MATH_COLUMNS_DICT.keys()]
+        df = df.rename(
+            columns={key: col.pretty_name for key, col in U_MATH_AND_MU_MATH_COLUMNS_DICT.items() if key in df.columns}
+        )
 
     return df
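Note: `get_joined_leaderboard_df` is the data side of the new tab: it rebuilds both per-benchmark frames without meta or pretty names, renames each `rank` column so both survive side by side, asserts that `full_model_name` is the only shared column, and inner-joins, so only models evaluated on both benchmarks appear, ordered by their U-MATH rank. The same logic on toy frames:

```python
import pandas as pd

u_math = pd.DataFrame({"full_model_name": ["a-7b", "b-70b"],
                       "u_math_rank": [2, 1], "u_math_acc": [22.1, 45.7]})
mu_math = pd.DataFrame({"full_model_name": ["b-70b", "c-8b"],
                        "mu_math_rank": [1, 2], "mu_math_f1": [72.3, 61.8]})

# Inner join: "a-7b" (no judging run) and "c-8b" (no solving run) drop out.
joined = pd.merge(u_math, mu_math, on=["full_model_name"], how="inner")
joined = joined.sort_values(by=["u_math_rank"], ascending=True)
print(joined)
#   full_model_name  u_math_rank  u_math_acc  mu_math_rank  mu_math_f1
# 0           b-70b            1        45.7             1        72.3
```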