tathagataraha committed on
Commit 7f29568 · 1 Parent(s): 930ed8c
Files changed (1)
  1. app.py +17 -113
app.py CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
+import time
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -96,30 +97,38 @@ except Exception:
 # Span based results
 # changes to be made here
 
+start_time = time.time()
+
 _, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
 harness_datasets_leaderboard_df = harness_datasets_original_df.copy()
+print("Closed ended English results loaded")
 
 _, open_ended_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OPEN_ENDED_COLS, OPEN_ENDED_BENCHMARK_COLS, "score", "open_ended")
 open_ended_leaderboard_df = open_ended_original_df.copy()
+print("Open ended English results loaded")
 
 _, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
 med_safety_leaderboard_df = med_safety_original_df.copy()
+print("Med safety results loaded")
 
 _, medical_summarization_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDICAL_SUMMARIZATION_COLS, MEDICAL_SUMMARIZATION_BENCHMARK_COLS, "score", "medical_summarization")
 medical_summarization_leaderboard_df = medical_summarization_original_df.copy()
+print("Medical summarization results loaded")
 
 _, aci_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ACI_COLS, ACI_BENCHMARK_COLS, "score", "aci")
 aci_leaderboard_df = aci_original_df.copy()
+print("ACI results loaded")
 
 _, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
 soap_leaderboard_df = soap_original_df.copy()
+print("SOAP results loaded")
 
 _, healthbench_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_COLS, HEALTHBENCH_BENCHMARK_COLS, "score", "healthbench")
 healthbench_leaderboard_df = healthbench_original_df.copy()
 
 _, healthbench_hard_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_HARD_COLS, HEALTHBENCH_HARD_BENCHMARK_COLS, "score", "healthbench_hard")
 healthbench_hard_leaderboard_df = healthbench_hard_original_df.copy()
-
+print("Healthbench results loaded")
 
 _, open_ended_arabic_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, "score", "open_ended_arabic")
 _, open_ended_french_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedFrench_COLS, OpenEndedFrench_BENCHMARK_COLS, "score", "open_ended_french")
@@ -136,8 +145,14 @@ open_ended_portuguese_leaderboard_df = open_ended_portuguese_df.copy()
 open_ended_romanian_leaderboard_df = open_ended_romanian_df.copy()
 open_ended_greek_leaderboard_df = open_ended_greek_df.copy()
 open_ended_spanish_leaderboard_df = open_ended_spanish_df.copy()
+print("Open ended multilingual results loaded")
+
 closed_ended_multilingual_leaderboard_df = closed_ended_multilingual_df.copy()
+print("Closed ended multilingual results loaded")
 
+end_time = time.time()
+total_time = end_time - start_time
+print(f"Total time taken to load all results: {total_time:.2f} seconds")
 
 # breakpoint()
 # # Token based results
@@ -154,7 +169,7 @@ closed_ended_multilingual_leaderboard_df = closed_ended_multilingual_df.copy()
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-breakpoint()
+# breakpoint()
 def update_df(shown_columns, subset="datasets"):
     # changes to be made here
     if subset == "datasets":
@@ -1323,117 +1338,6 @@ with demo:
                     queue=True,
                 )
 
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                            show_label=False,
-                            elem_id="search-bar",
-                        )
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
-                            value=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True,
-                        )
-                    # with gr.Row():
-                    #     deleted_models_visibility = gr.Checkbox(
-                    #         value=False, label="Show gated/private/deleted models", interactive=True
-                    #     )
-                with gr.Column(min_width=320):
-                    # with gr.Box(elem_id="box-filter"):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Model Types",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    # filter_columns_architecture = gr.CheckboxGroup(
-                    #     label="Architecture Types",
-                    #     choices=[i.value.name for i in ModelArch],
-                    #     value=[i.value.name for i in ModelArch],
-                    #     interactive=True,
-                    #     elem_id="filter-columns-architecture",
-                    # )
-                    filter_domain_specific = gr.CheckboxGroup(
-                        label="Domain Specificity",
-                        choices=["🏥 Clinical models", "Generic models"],
-                        value=["🏥 Clinical models", "Generic models"],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
-                    )
-
-            datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
-
-            leaderboard_table = gr.components.Dataframe(
-                value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=datasets_original_df[DATASET_COLS],
-                headers=DATASET_COLS,
-                datatype=TYPES,
-                visible=False,
-            )
-
-
-            search_bar.submit(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    search_bar,
-                    filter_columns_type,
-                    filter_domain_specific,
-                    filter_columns_size
-                    # filter_columns_architecture
-                ],
-                leaderboard_table,
-            )
-            for selector in [
-                shown_columns,
-                filter_columns_type,
-                filter_domain_specific,
-                # filter_columns_architecture,
-                filter_columns_size,
-                # deleted_models_visibility,
-            ]:
-                selector.change(
-                    update_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        search_bar,
-                        filter_columns_type,
-                        filter_domain_specific,
-                        filter_columns_size
-                        # filter_columns_architecture,
-                    ],
-                    leaderboard_table,
-                )
-
-
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=7):
             gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
             gr.HTML(FIVE_PILLAR_DIAGRAM)
 
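For context on the change itself: the commit brackets the sequential get_leaderboard_df calls with wall-clock timing and a per-section log line. A minimal standalone sketch of the same pattern, using a hypothetical timed() context manager that is not part of this commit:

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Log the wall-clock duration of the enclosed block, even if it raises.
    start = time.time()
    try:
        yield
    finally:
        print(f"{label} loaded in {time.time() - start:.2f} seconds")

# Usage mirroring the commit's logging (get_leaderboard_df as defined in the Space):
# with timed("Closed ended English results"):
#     _, harness_datasets_original_df = get_leaderboard_df(...)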