Commit · 7f29568
Parent(s): 930ed8c
Merged
app.py CHANGED
@@ -4,6 +4,7 @@ import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
+import time
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -96,30 +97,38 @@ except Exception:
 # Span based results
 # changes to be made here
 
+start_time = time.time()
+
 _, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
 harness_datasets_leaderboard_df = harness_datasets_original_df.copy()
+print("Closed ended English results loaded")
 
 _, open_ended_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OPEN_ENDED_COLS, OPEN_ENDED_BENCHMARK_COLS, "score", "open_ended")
 open_ended_leaderboard_df = open_ended_original_df.copy()
+print("Open ended English results loaded")
 
 _, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
 med_safety_leaderboard_df = med_safety_original_df.copy()
+print("Med safety results loaded")
 
 _, medical_summarization_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDICAL_SUMMARIZATION_COLS, MEDICAL_SUMMARIZATION_BENCHMARK_COLS, "score", "medical_summarization")
 medical_summarization_leaderboard_df = medical_summarization_original_df.copy()
+print("Medical summarization results loaded")
 
 _, aci_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ACI_COLS, ACI_BENCHMARK_COLS, "score", "aci")
 aci_leaderboard_df = aci_original_df.copy()
+print("ACI results loaded")
 
 _, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
 soap_leaderboard_df = soap_original_df.copy()
+print("SOAP results loaded")
 
 _, healthbench_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_COLS, HEALTHBENCH_BENCHMARK_COLS, "score", "healthbench")
 healthbench_leaderboard_df = healthbench_original_df.copy()
 
 _, healthbench_hard_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_HARD_COLS, HEALTHBENCH_HARD_BENCHMARK_COLS, "score", "healthbench_hard")
 healthbench_hard_leaderboard_df = healthbench_hard_original_df.copy()
-
+print("Healthbench results loaded")
 
 _, open_ended_arabic_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, "score", "open_ended_arabic")
 _, open_ended_french_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedFrench_COLS, OpenEndedFrench_BENCHMARK_COLS, "score", "open_ended_french")
@@ -136,8 +145,14 @@ open_ended_portuguese_leaderboard_df = open_ended_portuguese_df.copy()
 open_ended_romanian_leaderboard_df = open_ended_romanian_df.copy()
 open_ended_greek_leaderboard_df = open_ended_greek_df.copy()
 open_ended_spanish_leaderboard_df = open_ended_spanish_df.copy()
+print("Open ended multilingual results loaded")
+
 closed_ended_multilingual_leaderboard_df = closed_ended_multilingual_df.copy()
+print("Closed ended multilingual results loaded")
 
+end_time = time.time()
+total_time = end_time - start_time
+print(f"Total time taken to load all results: {total_time:.2f} seconds")
 
 # breakpoint()
 # # Token based results
@@ -154,7 +169,7 @@ closed_ended_multilingual_leaderboard_df = closed_ended_multilingual_df.copy()
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-breakpoint()
+# breakpoint()
 def update_df(shown_columns, subset="datasets"):
     # changes to be made here
     if subset == "datasets":
@@ -1323,117 +1338,6 @@ with demo:
                     queue=True,
                 )
 
-            with gr.Row():
-                with gr.Column():
-                    with gr.Row():
-                        search_bar = gr.Textbox(
-                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                            show_label=False,
-                            elem_id="search-bar",
-                        )
-                    with gr.Row():
-                        shown_columns = gr.CheckboxGroup(
-                            choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
-                            value=[
-                                c.name
-                                for c in fields(AutoEvalColumn)
-                                if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)
-                            ],
-                            label="Select columns to show",
-                            elem_id="column-select",
-                            interactive=True,
-                        )
-                    # with gr.Row():
-                    #     deleted_models_visibility = gr.Checkbox(
-                    #         value=False, label="Show gated/private/deleted models", interactive=True
-                    #     )
-                with gr.Column(min_width=320):
-                    # with gr.Box(elem_id="box-filter"):
-                    filter_columns_type = gr.CheckboxGroup(
-                        label="Model Types",
-                        choices=[t.to_str() for t in ModelType],
-                        value=[t.to_str() for t in ModelType],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    # filter_columns_architecture = gr.CheckboxGroup(
-                    #     label="Architecture Types",
-                    #     choices=[i.value.name for i in ModelArch],
-                    #     value=[i.value.name for i in ModelArch],
-                    #     interactive=True,
-                    #     elem_id="filter-columns-architecture",
-                    # )
-                    filter_domain_specific = gr.CheckboxGroup(
-                        label="Domain Specificity",
-                        choices=["🏥 Clinical models", "Generic models"],
-                        value=["🏥 Clinical models", "Generic models"],
-                        interactive=True,
-                        elem_id="filter-columns-type",
-                    )
-                    filter_columns_size = gr.CheckboxGroup(
-                        label="Model sizes (in billions of parameters)",
-                        choices=list(NUMERIC_INTERVALS.keys()),
-                        value=list(NUMERIC_INTERVALS.keys()),
-                        interactive=True,
-                        elem_id="filter-columns-size",
-                    )
-
-            datasets_leaderboard_df, datasets_original_df = update_df(shown_columns.value, subset="datasets")
-
-            leaderboard_table = gr.components.Dataframe(
-                value=datasets_leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value],
-                headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
-                datatype=TYPES,
-                elem_id="leaderboard-table",
-                interactive=False,
-                visible=True,
-            )
-
-            # Dummy leaderboard for handling the case when the user uses backspace key
-            hidden_leaderboard_table_for_search = gr.components.Dataframe(
-                value=datasets_original_df[DATASET_COLS],
-                headers=DATASET_COLS,
-                datatype=TYPES,
-                visible=False,
-            )
-
-
-            search_bar.submit(
-                update_table,
-                [
-                    hidden_leaderboard_table_for_search,
-                    shown_columns,
-                    search_bar,
-                    filter_columns_type,
-                    filter_domain_specific,
-                    filter_columns_size
-                    # filter_columns_architecture
-                ],
-                leaderboard_table,
-            )
-            for selector in [
-                shown_columns,
-                filter_columns_type,
-                filter_domain_specific,
-                # filter_columns_architecture,
-                filter_columns_size,
-                # deleted_models_visibility,
-            ]:
-                selector.change(
-                    update_table,
-                    [
-                        hidden_leaderboard_table_for_search,
-                        shown_columns,
-                        search_bar,
-                        filter_columns_type,
-                        filter_domain_specific,
-                        filter_columns_size
-                        # filter_columns_architecture,
-                    ],
-                    leaderboard_table,
-                    queue=True,
-                )
-
     with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=7):
         gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
         gr.HTML(FIVE_PILLAR_DIAGRAM)
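The change above instruments module load time: it brackets the sequential get_leaderboard_df calls with time.time() readings and prints a progress line after each subset loads. Below is a minimal sketch of the same pattern factored into a reusable context manager; the timed helper is hypothetical and not part of app.py, and the sleep call merely stands in for one of the loading calls.

# Sketch only: a context-manager version of the start/end/print timing
# added in this commit. `timed` is hypothetical, not part of app.py.
import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    # Take a wall-clock reading before the wrapped block runs...
    start = time.time()
    try:
        yield
    finally:
        # ...and report the elapsed time even if the block raises.
        print(f"{label} loaded in {time.time() - start:.2f} seconds")

# Usage mirroring the commit's instrumentation:
with timed("Closed ended English results"):
    time.sleep(0.1)  # stand-in for a get_leaderboard_df(...) call

This keeps the per-subset progress output while removing the repeated start/end/print boilerplate, and it still reports a duration if a loading step fails.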