Konstantin Chernyshev committed
Commit 148c1e7 · 1 Parent(s): 5a252e0

feat: add u-math vs mu-math

Files changed (3):
  1. README.md +1 -1
  2. app.py +43 -2
  3. src/populate.py +120 -61
README.md CHANGED
@@ -42,4 +42,4 @@ If you encounter problem on the space, don't hesitate to restart it to remove th
  You'll find
  - the main table' columns names and properties in `src/display/utils.py`
  - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
- - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+ - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
app.py CHANGED
@@ -1,4 +1,5 @@
  import os
+ from typing import Any

  import gradio as gr
  import pandas as pd
@@ -9,9 +10,11 @@ from src.about import CITATION_TEXT, INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT, TIT
  from src.populate import (
  MU_MATH_COLUMNS_DICT,
  U_MATH_COLUMNS_DICT,
+ U_MATH_AND_MU_MATH_COLUMNS_DICT,
  Field,
  get_mu_math_leaderboard_df,
  get_u_math_leaderboard_df,
+ get_joined_leaderboard_df,
  )


@@ -24,6 +27,7 @@ def restart_space():

  LEADERBOARD_U_MATH_DF = get_u_math_leaderboard_df()
  LEADERBOARD_MU_MATH_DF = get_mu_math_leaderboard_df()
+ LEADERBOARD_U_MATH_MU_MATH_JOINED_DF = get_joined_leaderboard_df()


  def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) -> gr.components.Component:
@@ -79,6 +83,15 @@ def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) ->
  filtered_df = full_df[full_df[columns_dict["model_size_symbol"].pretty_name] == query_symbol]
  return filtered_df[current_df.columns]

+ def filter_dataframe_by_model_family(
+ full_df: pd.DataFrame, current_df: pd.DataFrame, filter_name: str,
+ ) -> pd.DataFrame:
+ if filter_name == "All":
+ return full_df[current_df.columns]
+ else:
+ filtered_df = full_df[full_df[columns_dict["model_family"].pretty_name] == filter_name]
+ return filtered_df[current_df.columns]
+
  with gr.Column() as col:
  # Add the controls
  with gr.Accordion("➡️ See All Columns", open=False):
@@ -129,6 +142,14 @@ def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) ->
  interactive=True,
  )

+ model_family_filter_selector = gr.Radio(
+ label="Filter model families:",
+ choices=["All"] + list(dataframe[columns_dict["model_family"].pretty_name].unique()),
+ value="All",
+ elem_id="model-family-filter",
+ interactive=True,
+ )
+
  # create the hidden and visible dataframes to display
  hidden_leaderboard_df = gr.components.Dataframe(
  value=dataframe,
@@ -163,6 +184,11 @@ def init_leaderboard(dataframe: pd.DataFrame, columns_dict: dict[str, Field]) ->
  inputs=[hidden_leaderboard_df, leaderboard_df, model_size_filter_selector],
  outputs=[leaderboard_df],
  )
+ model_family_filter_selector.change(
+ fn=filter_dataframe_by_model_family,
+ inputs=[hidden_leaderboard_df, leaderboard_df, model_family_filter_selector],
+ outputs=[leaderboard_df],
+ )
  for tag, button in all_tags.items():
  button.click(
  fn=filter_dataframe_by_selected_tag_columns,
@@ -206,7 +232,22 @@ with demo:
  tooltip=[MU_MATH_COLUMNS_DICT["full_model_name"].pretty_name, MU_MATH_COLUMNS_DICT["mu_math_f1"].pretty_name],
  )

- with gr.TabItem("📝 About", elem_id="about-tab-table", id=2):
+ with gr.TabItem("📊 U-MATH vs μ-MATH", elem_id="u-math-vs-mu-math-tab-table", id=2):
+ leaderboard_aggregated = init_leaderboard(LEADERBOARD_U_MATH_MU_MATH_JOINED_DF, U_MATH_AND_MU_MATH_COLUMNS_DICT)
+ gr.ScatterPlot(
+ value=LEADERBOARD_U_MATH_MU_MATH_JOINED_DF,
+ title="U-MATH Accuracy (Solving) vs μ-MATH F1 Score (Judging)",
+ x=U_MATH_AND_MU_MATH_COLUMNS_DICT["u_math_acc"].pretty_name,
+ y=U_MATH_AND_MU_MATH_COLUMNS_DICT["mu_math_f1"].pretty_name,
+ color=U_MATH_AND_MU_MATH_COLUMNS_DICT["model_family"].pretty_name,
+ tooltip=[
+ U_MATH_AND_MU_MATH_COLUMNS_DICT["full_model_name"].pretty_name,
+ U_MATH_AND_MU_MATH_COLUMNS_DICT["u_math_text_acc"].pretty_name,
+ U_MATH_AND_MU_MATH_COLUMNS_DICT["u_math_visual_acc"].pretty_name,
+ ],
+ )
+
+ with gr.TabItem("📝 About", elem_id="about-tab-table", id=3):
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

  citation_button = gr.Textbox(
@@ -222,4 +263,4 @@ scheduler = BackgroundScheduler()
  scheduler.add_job(restart_space, "interval", seconds=60 * 60)
  scheduler.start()
  # demo.queue(default_concurrency_limit=40).launch(ssr_mode=False)
- demo.queue(default_concurrency_limit=40).launch()
+ demo.queue(default_concurrency_limit=40).launch(allowed_paths=[".cache"])
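The family-filter wiring above is spread across three hunks. As a standalone illustration of the same pattern (not the Space's code: the toy dataframe, simplified column names, and helper names here are hypothetical, and the `columns_dict` indirection is omitted), the mechanism is a `gr.Radio` whose `change` event re-filters a hidden copy of the full table:

```python
import gradio as gr
import pandas as pd

# Hypothetical toy leaderboard; the real Space builds this from eval results.
df = pd.DataFrame({
    "Model Name": ["model-a", "model-b", "model-c"],
    "Model Family": ["Qwen", "Llama", "Qwen"],
    "U-MATH Acc": [0.61, 0.55, 0.48],
})


def filter_by_family(full_df: pd.DataFrame, family: str) -> pd.DataFrame:
    # "All" restores the unfiltered table, mirroring the branch in the commit.
    if family == "All":
        return full_df
    return full_df[full_df["Model Family"] == family]


with gr.Blocks() as demo:
    family_selector = gr.Radio(
        label="Filter model families:",
        choices=["All"] + sorted(df["Model Family"].unique()),
        value="All",
        interactive=True,
    )
    hidden_df = gr.Dataframe(value=df, visible=False)  # full table, never shown
    visible_df = gr.Dataframe(value=df)                # what the user sees
    family_selector.change(
        fn=filter_by_family,
        inputs=[hidden_df, family_selector],
        outputs=[visible_df],
    )

if __name__ == "__main__":
    demo.launch()
```

Keeping the unfiltered frame in a hidden component means each radio change filters from the complete data rather than from an already-filtered view.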
src/populate.py CHANGED
@@ -3,26 +3,36 @@ import os
  from dataclasses import dataclass, field

  import pandas as pd
- from huggingface_hub import model_info
+ from huggingface_hub import model_info, ModelInfo
  from transformers import AutoConfig


  UNKNOWN_MODEL_SHOW_SIZE = 150


- def is_model_on_hub(
- model_name: str, revision: str, token: str = None, trust_remote_code=False
- ) -> tuple[bool, str | None, str | None]:
- """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
+ def get_hf_model_info_card_or_none(model_name: str) -> ModelInfo | None:
  try:
- config = AutoConfig.from_pretrained(
- model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
- )
- return True, None, config
+ info = model_info(repo_id=model_name)
+ return info
+ except Exception:
+ return None
+
+
+ def get_hf_hub_config_or_none(model_name: str) -> AutoConfig | None:
+ try:
+ config = AutoConfig.from_pretrained(model_name, revision="main", trust_remote_code=True)
+ return config
  except Exception:
- return False, "was not found on hub!", None
+ return None


+ def get_hf_model_info_card_or_none(model_name: str) -> ModelInfo | None:
+ try:
+ info = model_info(repo_id=model_name)
+ return info
+ except Exception:
+ return None
+
  def model_size_to_symbol(model_size_in_b_params: int | None) -> str:
  """Converts model size to a symbol"""
  if model_size_in_b_params is None or model_size_in_b_params == 0 or not model_size_in_b_params:
@@ -67,23 +77,31 @@ def get_hf_data_by_model_name(model_name: str) -> dict:
  elif 'deepseek' in model_name.lower():
  model_family = "DeepSeek"

- still_on_hub, _, model_config = is_model_on_hub(model_name, "main", trust_remote_code=True)
- if not still_on_hub and '/' in model_name:
+ print(model_name, model_family)
+ model_config = get_hf_hub_config_or_none(model_name)
+ model_info_card = get_hf_model_info_card_or_none(model_name)
+ print('model_config', type(model_config))
+ print('model_info_card', type(model_info_card))
+
+ # If model name is a path, try to get the model name from the hub
+ if '/' in model_name:
  print(f"Model {model_name} is not on the hub, try unsloth/...")
  model_name = "unsloth/" + model_name.split("/")[-1]
- still_on_hub, _, model_config = is_model_on_hub(model_name, "main", trust_remote_code=True)
+ if not model_config:
+ model_config = get_hf_hub_config_or_none(model_name)
+ if not model_info_card:
+ model_info_card = get_hf_model_info_card_or_none(model_name)

  architecture = "Unknown"
- if model_config is not None:
+ if model_config:
  architectures = getattr(model_config, "architectures", None)
  if architectures:
  architecture = ";".join(architectures)

  num_params = None
- if still_on_hub:
- info = model_info(repo_id=model_name)
+ if model_info_card:
  try:
- num_params = round(info.safetensors["total"] / 1e9, 1)
+ num_params = round(model_info_card.safetensors["total"] / 1e9, 1)
  except Exception as e:
  print("SafeTensors not found in", model_name, e)
  if 'Pixtral-12B' in model_name:
@@ -94,15 +112,13 @@ def get_hf_data_by_model_name(model_name: str) -> dict:
  print("num_params", model_name, num_params)

  model_url = None
- if still_on_hub:
+ if model_config or model_info_card:
  model_url = f"https://huggingface.co/{model_name}"

  model_license = "Unknown"
- if model_config is not None:
- info = model_info(repo_id=model_name)
- # print(info.card_data)
- model_license = info.card_data["license_name"]
- model_license_link = info.card_data["license_link"]
+ if model_info_card:
+ model_license = model_info_card.card_data["license_name"]
+ model_license_link = model_info_card.card_data["license_link"]
  if model_license_link:
  model_license = f"[{model_license}]({model_license_link})"
  if not model_license:
@@ -110,7 +126,7 @@ def get_hf_data_by_model_name(model_name: str) -> dict:

  return {
  "model_architecture": architecture,
- "model_type": "Open-Weights" if still_on_hub else "Proprietary",
+ "model_type": "Open-Weights" if model_info_card else "Proprietary",
  "model_size": num_params if num_params else None,
  "model_url": model_url,
  "model_license": model_license,
@@ -131,7 +147,7 @@ class Field:
  MODEL_COLUMNS_DICT = {
  "model_type_symbol": Field("T", "str", never_hidden=True),
  "model_size_symbol": Field("S", "str", never_hidden=True),
- "full_model_name": Field("Full Model Name", "markdown", fully_hidden=True),
+ "full_model_name": Field("Full Model Name", "markdown", fully_hidden=True, displayed_by_default=False),
  "model_name": Field("Model Name", "markdown", never_hidden=True),
  "model_type": Field("Type", "str", displayed_by_default=False),
  "model_size": Field("#Params (B)", "number", displayed_by_default=False),
@@ -210,6 +226,21 @@ MU_MATH_COLUMNS_DICT = {
  "Qwen2.5-72B-Instruct_ppv": Field("Qwen2.5-72B Subset PPV", "number", displayed_by_default=False),
  "Qwen2.5-72B-Instruct_npv": Field("Qwen2.5-72B Subset NPV", "number", displayed_by_default=False),
  }
+ U_MATH_AND_MU_MATH_COLUMNS_DICT = {
+ "u_math_rank": Field("U-MATH Rank", "number", never_hidden=True),
+ "mu_math_rank": Field("μ-MATH Rank", "number", never_hidden=True),
+ **MODEL_COLUMNS_DICT,
+ "u_math_acc": Field("U-MATH Acc", "number", tags=["main", "u_math", "mu_math"]),
+ "u_math_text_acc": Field("U-MATH Text Acc", "number", displayed_by_default=False, tags=["u_math"]),
+ "u_math_visual_acc": Field("U-MATH Visual Acc", "number", displayed_by_default=False, tags=["u_math"]),
+ "judge_model_name": Field("Judge Model Name", "markdown", displayed_by_default=False),
+ "extract_model_name": Field("Extract Model Name", "markdown", displayed_by_default=False),
+ "mu_math_f1": Field("μ-MATH F1", "number", tags=["main", "u_math", "mu_math"]),
+ "mu_math_tpr": Field("μ-MATH TPR", "number", displayed_by_default=False, tags=["mu_math"]),
+ "mu_math_tnr": Field("μ-MATH TNR", "number", displayed_by_default=False, tags=["mu_math"]),
+ "mu_math_ppv": Field("μ-MATH PPV", "number", displayed_by_default=False, tags=["mu_math"]),
+ "mu_math_npv": Field("μ-MATH NPV", "number", displayed_by_default=False, tags=["mu_math"]),
+ }


  def load_json_data(json_path: str, main_col: str | None = None) -> pd.DataFrame:
@@ -226,10 +257,30 @@ def load_json_data(json_path: str, main_col: str | None = None) -> pd.DataFrame:
  return df


- def get_u_math_leaderboard_df() -> pd.DataFrame:
+ def get_model_meta_info_df(model_full_names: list[str]) -> pd.DataFrame:
+ """Given a list of model names, returns a dataframe with model meta info"""
+ model_to_meta_dict = {
+ model_name: get_hf_data_by_model_name(model_name) for model_name in model_full_names
+ }
+
+ df = pd.DataFrame.from_dict(model_to_meta_dict, orient="index")
+ df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
+ df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
+ df["model_size_including_unknown"] = df["model_size"].apply(lambda x: x if x and pd.notna(x) else UNKNOWN_MODEL_SHOW_SIZE).astype(float)
+ df["full_model_name"] = df.index
+ df = df.reset_index(drop=True)
+ df["model_name"] = df["full_model_name"].apply(
+ lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
+ )
+
+ return df
+
+
+ def get_u_math_leaderboard_df(use_pretty_names: bool = True, add_meta: bool = True) -> pd.DataFrame:
  """Creates a dataframe from json with U-MATH eval results"""
  json_path = os.path.join("data", "u_math_eval_results.json")
  df = load_json_data(json_path)
+ df = df.rename(columns={"model_name": "full_model_name"})

  # flatten list [x, y, z] in columns as ["_acc", "_text_acc", "_visual_acc"] suffixes for columns
  for col in [
@@ -251,33 +302,23 @@ def get_u_math_leaderboard_df() -> pd.DataFrame:
  df["rank"] = range(1, len(df) + 1)

  # populate with model info
- model_to_meta_dict = {
- model_name: get_hf_data_by_model_name(model_name) for model_name in df["model_name"].unique()
- }
- df["model_architecture"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_architecture"])
- df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
- df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
- df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
- df["model_family"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_family"])
- df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
- df["model_size_including_unknown"] = df["model_size"].apply(lambda x: x if x and pd.notna(x) else UNKNOWN_MODEL_SHOW_SIZE).astype(float)
- df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
- df["full_model_name"] = df["model_name"]
- df["model_name"] = df["model_name"].apply(
- lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
- )
+ if add_meta:
+ df_meta = get_model_meta_info_df(df["full_model_name"].unique())
+ df = pd.merge(df, df_meta, on=["full_model_name"], how="left")

  # convert to pretty names and sort columns by order in dict
- df = df[U_MATH_COLUMNS_DICT.keys()]
- df = df.rename(columns={key: col.pretty_name for key, col in U_MATH_COLUMNS_DICT.items() if key in df.columns})
+ if use_pretty_names:
+ df = df[U_MATH_COLUMNS_DICT.keys()]
+ df = df.rename(columns={key: col.pretty_name for key, col in U_MATH_COLUMNS_DICT.items() if key in df.columns})

  return df


- def get_mu_math_leaderboard_df() -> pd.DataFrame:
+ def get_mu_math_leaderboard_df(use_pretty_names: bool = True, add_meta: bool = True) -> pd.DataFrame:
  """Creates a dataframe from json with mu-MATH eval results"""
  json_path = os.path.join("data", "mu_math_eval_results.json")
  df = load_json_data(json_path)
+ df = df.rename(columns={"model_name": "full_model_name"})

  # Calculate columns with prefixes f1, tpr, tnr, ppv, npv
  for col in ["mu_math", "GPT-4o", "Gemini-1.5-Pro", "Llama-3.1-70B-Instruct", "Qwen2.5-72B-Instruct"]:
@@ -306,24 +347,42 @@ def get_mu_math_leaderboard_df() -> pd.DataFrame:
  df["rank"] = range(1, len(df) + 1)

  # populate with model info
- model_to_meta_dict = {
- model_name: get_hf_data_by_model_name(model_name) for model_name in df["model_name"].unique()
- }
- df["model_architecture"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_architecture"])
- df["model_license"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_license"])
- df["model_type"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_type"])
- df["model_type_symbol"] = df["model_type"].apply(model_type_to_symbol)
- df["model_family"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_family"])
- df["model_size"] = df["model_name"].apply(lambda x: model_to_meta_dict[x]["model_size"])
- df["model_size_including_unknown"] = df["model_size"].apply(lambda x: x if x and pd.notna(x) else UNKNOWN_MODEL_SHOW_SIZE).astype(float)
- df["model_size_symbol"] = df["model_size"].apply(model_size_to_symbol)
- df["full_model_name"] = df["model_name"]
- df["model_name"] = df["model_name"].apply(
- lambda x: f"[{x}]({url})" if (url := model_to_meta_dict[x]["model_url"]) else x
- )
+ if add_meta:
+ df_meta = get_model_meta_info_df(df["full_model_name"].unique())
+ df = pd.merge(df, df_meta, on=["full_model_name"], how="left")
+
+ # convert to pretty names and sort columns by order in dict
+ if use_pretty_names:
+ df = df[MU_MATH_COLUMNS_DICT.keys()]
+ df = df.rename(columns={key: col.pretty_name for key, col in MU_MATH_COLUMNS_DICT.items() if key in df.columns})
+
+ return df
+
+
+ def get_joined_leaderboard_df(use_pretty_names: bool = True, add_meta: bool = True) -> pd.DataFrame:
+ """Creates a dataframe from json with U-MATH and mu-MATH eval results"""
+ u_math_df = get_u_math_leaderboard_df(use_pretty_names=False, add_meta=False)
+ u_math_df = u_math_df.rename(columns={"rank": "u_math_rank"})
+ mu_math_df = get_mu_math_leaderboard_df(use_pretty_names=False, add_meta=False)
+ mu_math_df = mu_math_df.rename(columns={"rank": "mu_math_rank"})
+ assert set(u_math_df.columns).intersection(set(mu_math_df.columns)) == {"full_model_name"}, f"Columns overlap in {u_math_df.columns} and {mu_math_df.columns}"
+
+ # merge U-MATH and mu-MATH dataframes
+ df = pd.merge(u_math_df, mu_math_df, on=["full_model_name"], how="inner", suffixes=("", ""))
+
+ # sort by rank on u_math
+ df = df.sort_values(by=["u_math_rank"], ascending=True)
+
+ # add meta info
+ if add_meta:
+ df_meta = get_model_meta_info_df(df["full_model_name"].unique())
+ df = pd.merge(df, df_meta, on=["full_model_name"], how="left")

  # convert to pretty names and sort columns by order in dict
- df = df[MU_MATH_COLUMNS_DICT.keys()]
- df = df.rename(columns={key: col.pretty_name for key, col in MU_MATH_COLUMNS_DICT.items() if key in df.columns})
+ if use_pretty_names:
+ df = df[U_MATH_AND_MU_MATH_COLUMNS_DICT.keys()]
+ df = df.rename(
+ columns={key: col.pretty_name for key, col in U_MATH_AND_MU_MATH_COLUMNS_DICT.items() if key in df.columns}
+ )

  return df
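The join performed by the new `get_joined_leaderboard_df` can be reproduced on toy data. The sketch below is illustrative only (the model names and scores are hypothetical); it mirrors the commit's invariant that the two per-benchmark frames share only `full_model_name`, then inner-joins them and sorts by U-MATH rank before any metadata is attached.

```python
import pandas as pd

# Toy stand-ins for the per-benchmark frames returned with
# use_pretty_names=False, add_meta=False (assumed shapes, not real results).
u_math_df = pd.DataFrame({
    "full_model_name": ["org/model-a", "org/model-b"],
    "u_math_rank": [1, 2],
    "u_math_acc": [0.61, 0.55],
})
mu_math_df = pd.DataFrame({
    "full_model_name": ["org/model-b", "org/model-a"],
    "mu_math_rank": [1, 2],
    "mu_math_f1": [0.84, 0.80],
})

# Same invariant the commit asserts: the only shared column is the join key.
assert set(u_math_df.columns) & set(mu_math_df.columns) == {"full_model_name"}

# Inner join on the full model name, then order by U-MATH rank,
# as get_joined_leaderboard_df does before attaching model metadata.
joined = pd.merge(u_math_df, mu_math_df, on=["full_model_name"], how="inner")
joined = joined.sort_values(by=["u_math_rank"], ascending=True)
print(joined)
```

Because the join key is the raw model name rather than the markdown link, the metadata merge (and the link formatting it produces) can safely happen after the two result frames are combined.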