Update app.py with new features from arena
app.py
CHANGED
@@ -4,17 +4,12 @@ import glob
 4 |   import pickle
 5 |   import traceback
 6 |   import numpy as np
 7 | - from datetime import datetime
 8 |
 9 |   import pandas as pd
10 |   import gradio as gr
11 |   import numpy as np
12 |
13 |
14 | - basic_component_values = [None] * 6
15 | - leader_component_values = [None] * 5
16 | -
17 | -
18 |   promo_banner = """
19 |   <div style="background-color: #ffcc00; color: black; padding: 10px; text-align: center; font-weight: bold; font-size: 18px; border: 2px solid #000;">
20 |       USE THE LATEST VERSIONS OF THE BEST CHATBOTS IN RUSSIAN FOR FREE
665 |     theme.text_size = text_size
666 |     theme.set(
667 |         button_large_text_size="40px",
668 |         button_small_text_size="40px",
669 |         button_large_text_weight="1000",
670 |         button_small_text_weight="1000",
671 |         button_shadow="*shadow_drop_lg",
672 |         button_shadow_hover="*shadow_drop_lg",
673 |         checkbox_label_shadow="*shadow_drop_lg",
674 |         button_shadow_active="*shadow_inset",
675 |         button_secondary_background_fill="*primary_300",
676 |         button_secondary_background_fill_dark="*primary_700",
677 |         button_secondary_background_fill_hover="*primary_200",
678 |         button_secondary_background_fill_hover_dark="*primary_500",
679 |         button_secondary_text_color="*primary_800",
680 |         button_secondary_text_color_dark="white",
681 |     )
682 |
683 |     with gr.Blocks(
684 |         title="LLM arena: leaderboard",
685 |         theme=theme,
686 |         css=block_css,
687 |     ) as demo:
688 |         build_leaderboard_tab(
689 |             elo_results_file, leaderboard_table_file, show_plot=True, mirror=True
690 |         )
691 |     return demo
692 |
718 |     padding-bottom: 6px;
719 | }
720 | #leaderboard_dataframe td {
721 |     line-height: 0.1em;
722 | }
723 | #about_markdown .prose {
724 |     font-size: 110% !important;
725 | }
726 | #ack_markdown .prose {
727 |     font-size: 110% !important;
728 | }
729 | #chatbot .prose {
730 |     font-size: 105% !important;
731 | }
732 | .sponsor-image-about img {
733 |     margin: 0 20px;
734 |     margin-top: 20px;
735 |     height: 40px;
736 |     max-height: 100%;
737 |     width: auto;
738 |     float: left;
739 | }
740 |
741 | .chatbot h1, h2, h3 {
742 |     margin-top: 8px; /* Adjust the value as needed */
743 |     margin-bottom: 0px; /* Adjust the value as needed */
744 |     padding-bottom: 0px;
745 | }
746 |
753 | .chatbot h3 {
754 |     font-size: 110%;
755 | }
756 | .chatbot p:not(:first-child) {
757 |     margin-top: 8px;
758 | }
759 |
760 | .typing {
761 |     display: inline-block;
762 | }
763 |
765 |     display: inline-block;
766 |     width: 7px;
767 |     height: 1em;
768 |     background-color: black;
769 |     vertical-align: middle;
770 |     animation: blink 1s infinite;
771 | }
772 |
773 | .dark .cursor {
774 |     display: inline-block;
775 |     width: 7px;
776 |     height: 1em;
777 |     background-color: white;
778 |     vertical-align: middle;
779 |     animation: blink 1s infinite;
780 | }
781 |
803 | if __name__ == "__main__":
804 |     parser = argparse.ArgumentParser()
805 |     parser.add_argument("--share", action="store_true")
806 |     parser.add_argument("--host", default="0.0.0.0")
807 |     parser.add_argument("--port", type=int, default=7860)
808 |     args = parser.parse_args()
810 |     elo_result_files = glob.glob("elo_results_*.pkl")
811 |     elo_result_files.sort(key=lambda x: int(x[12:-4]))
812 |     elo_result_file = elo_result_files[-1]
814 |     leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
815 |     leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
816 |     leaderboard_table_file = leaderboard_table_files[-1]
818 |     demo = build_demo(elo_result_file, leaderboard_table_file)
4 | import pickle
5 | import traceback
6 | import numpy as np
7 |
8 | import pandas as pd
9 | import gradio as gr
10 | import numpy as np
11 |
12 |
13 | promo_banner = """
14 | <div style="background-color: #ffcc00; color: black; padding: 10px; text-align: center; font-weight: bold; font-size: 18px; border: 2px solid #000;">
15 |     USE THE LATEST VERSIONS OF THE BEST CHATBOTS IN RUSSIAN FOR FREE
18 |
19 | deprecated_model_name = [
20 |     "GigaChat 3.1.25.3",
21 |     "GigaChat-Pro 2.2.25.3",
22 |     "saiga_llama3_8b_v6",
23 |     "saiga_phi3_medium",
24 |     "GigaChat-Plus 3.1.25.3",
25 |     "GigaChat-Pro 4.0.26.8",
26 |     "GigaChat 4.0.26.8",
27 |     "xAI: Grok 2",
28 |     "GigaChat-Pro 4.0.26.15",
29 |     "GigaChat 4.0.26.15",
30 |     "YandexGPT Experimental", "yandex-gpt-arena",
31 |     "RefalMachine/ruadapt_llama3_instruct_lep_saiga_kto_ablitirated"
32 | ]
33 |
34 | models_10b = [
35 |     "saiga_llama3_8b_v7",
36 |     "Vikhrmodels/Vikhr-YandexGPT-5-Lite-8B-it",
37 |     "T-lite-instruct-0.1",
38 |     "t-tech/T-lite-it-1.0",
39 |     "LLaMA-3 Chat (8B)",
40 |     "Llama 3.1 8B Instruct Turbo",
41 |     "MTSAIR/Cotype-Nano"
42 | ]
43 |
44 |
45 | def make_default_md_1():
46 |     leaderboard_md = f"""
47 | # 🏆 LLM Arena in Russian: Leaderboard
51 | """
52 |     return leaderboard_md
53 |
54 | def make_default_md_2():
55 |     leaderboard_md = f"""
56 |
57 | The LLM Arena platform is an open crowdsourcing platform for evaluating large language models (LLM) in Russian. We collect pairwise comparisons from people to rank LLMs using the Bradley-Terry model and display model ratings on the Elo scale.
58 | Chatbot Arena in Russian depends on community participation, so please contribute by casting your vote!
59 |
60 | - To **add your model** to the comparison, contact us on TG: [Group](https://t.me/+bFEOl-Bdmok4NGUy)
61 | - If you **found a bug** or **have a suggestion**, contact us: [Roman](https://t.me/roman_kucev)
62 | - You can **contribute your vote** at [llmarena.ru](https://llmarena.ru/)!
63 | """
64 |     return leaderboard_md
65 |
66 |
67 | def make_arena_leaderboard_md(arena_df, last_updated_time):
68 |     # Using version from monitor.py (translated)
69 |     total_votes = sum(arena_df["num_battles"]) if not arena_df.empty else 0
70 |     total_models = len(arena_df)
71 |     space = " "  # Using HTML space
72 |
73 |     leaderboard_md = f"""
74 | Total # of models: **{total_models}**.{space} Total # of votes: **{"{:,}".format(total_votes)}**.{space} Last updated: {last_updated_time}.
81 |
82 |
83 | def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="site_visitors/medium_prompts:style control"):
84 |     total_votes = sum(arena_df["num_battles"]) if not arena_df.empty else 0
85 |     total_models = len(arena_df)
86 |     space = " "
87 |     total_subset_votes = sum(arena_subset_df["num_battles"]) if not arena_subset_df.empty else 0
88 |     total_subset_models = len(arena_subset_df)
89 |
90 |     perc_models = round(total_subset_models / total_models * 100) if total_models > 0 else 0
91 |     perc_votes = round(total_subset_votes / total_votes * 100) if total_votes > 0 else 0
92 |
93 |     leaderboard_md = f"""### {cat_name_to_explanation.get(name, name)}
94 | #### {space} #models: **{total_subset_models} ({perc_models}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({perc_votes}%)**{space}
95 | """
96 |     return leaderboard_md
97 |
98 | def model_hyperlink(model_name, link):
99 |     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
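The Bradley-Terry/Elo wording in `make_default_md_2` above refers to how the arena's ratings are produced upstream; this app only renders precomputed ratings loaded from the Elo pickle. Purely as an illustrative sketch (not part of this change, and not the arena's actual fitting code; the helper name and toy battle list are hypothetical), pairwise battle outcomes can be turned into Bradley-Terry strengths on an Elo-like scale along these lines:

import numpy as np

def fit_bradley_terry(battles, n_models, lr=0.1, iters=2000, scale=400, base=1000):
    """Gradient ascent on the Bradley-Terry log-likelihood; battles = (winner, loser) index pairs."""
    theta = np.zeros(n_models)
    for _ in range(iters):
        grad = np.zeros(n_models)
        for w, l in battles:
            p_win = 1.0 / (1.0 + np.exp(theta[l] - theta[w]))  # P(winner beats loser)
            grad[w] += 1.0 - p_win
            grad[l] -= 1.0 - p_win
        theta += lr * grad / max(len(battles), 1)
    # Convert natural-log strengths to the familiar Elo-like scale (base 10, 400-point spread).
    return base + scale / np.log(10) * (theta - theta.mean())

# Example: model 0 beats model 1 three times and loses once.
print(fit_bradley_terry([(0, 1), (0, 1), (0, 1), (1, 0)], n_models=2))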
|
100 |
|
101 |
|
102 |
+
102 | def filter_deprecated_models_plots(fig, hidden_models=None, limit_to_top=25):
103 |     """
104 |     Filters Plotly plots to show only top N models and optionally removes specific models.
105 |
106 |     Args:
107 |         fig: The Plotly figure object.
108 |         hidden_models (list, optional): A list of model names to remove. Defaults to None.
109 |         limit_to_top (int, optional): Limit display to top N models (0 or None means no limit). Defaults to 25.
110 |
111 |     Returns:
112 |         Plotly figure: The filtered figure object or the original if filtering fails or is not applicable.
113 |     """
114 |     if fig is None:
115 |         return None
116 |
117 |     # Check if the figure has data
118 |     if not hasattr(fig, 'data') or len(fig.data) == 0:
119 |         return fig
120 |
121 |     # Check if data has a type attribute
122 |     if not hasattr(fig.data[0], 'type'):
123 |         return fig
124 |
125 |     # Check minimum number of models after initial hidden_models filtering
126 |     models_to_check = []
127 |     if hasattr(fig.data[0], 'x'):
128 |         models_to_check = fig.data[0].x
129 |     elif hasattr(fig.data[0], 'y'):  # For some types like bar, X axis might be numeric
130 |         models_to_check = fig.data[0].y
131 |
132 |     if hidden_models is not None and models_to_check.any():
133 |         available_models = [x for x in models_to_check if x not in hidden_models]
134 |         # print(f"Available models before top N: {len(available_models)}")  # Debug
135 |         if len(available_models) <= 2:  # If less than 3 models remain before top_n
136 |             # print(f"Warning: Too few models left after initial filtering ({len(available_models)}), returning original plot.")
137 |             return fig  # Return the original plot if too few models
138 |
139 |     if limit_to_top is not None and limit_to_top <= 0:
140 |         limit_to_top = None
141 |
142 |     try:
143 |         # Work on a deep copy to avoid modifying the original figure object
144 |         fig_copy = pickle.loads(pickle.dumps(fig))
145 |         data = fig_copy.data[0]
146 |
147 |         if data.type == 'heatmap':
148 |             # Apply hidden models filter
149 |             mask_x = ~np.isin(data.x, hidden_models) if hidden_models is not None else np.ones_like(data.x, dtype=bool)
150 |             mask_y = ~np.isin(data.y, hidden_models) if hidden_models is not None else np.ones_like(data.y, dtype=bool)
151 |
152 |             # Get initially filtered X and Y arrays
153 |             filtered_x = np.array(data.x)[mask_x]
154 |             filtered_y = np.array(data.y)[mask_y]
155 |
156 |             # Apply top N limit (assuming the order is already by rank/rating)
157 |             if limit_to_top is not None and len(filtered_x) > limit_to_top:
158 |                 top_models = filtered_x[:limit_to_top]
159 |                 # Create new masks based on the top models relative to the *original* data axes
160 |                 mask_x = np.isin(data.x, top_models)
161 |                 mask_y = np.isin(data.y, top_models)
162 |                 # Get final filtered axes
163 |                 filtered_x = np.array(data.x)[mask_x]
164 |                 filtered_y = np.array(data.y)[mask_y]
165 |             elif len(filtered_x) <= 2:  # If <=2 models remain after filtering
166 |                 return fig  # Return original
167 |
168 |             # Update the heatmap data
169 |             data.x = filtered_x
170 |             data.y = filtered_y
171 |             # Important: Indexing 'z' must use masks derived from the *original* data order
172 |             z_original = np.array(fig.data[0].z)
173 |             data.z = z_original[np.ix_(mask_y, mask_x)]
174 |
175 |         elif data.type == 'scatter':
176 |             trace = data
177 |             # Apply hidden models filter
178 |             mask = ~np.isin(trace.x, hidden_models) if hidden_models is not None else np.ones_like(trace.x, dtype=bool)
179 |
180 |             # Get initially filtered arrays
181 |             current_x = np.array(trace.x)[mask]
182 |             current_y = np.array(trace.y)[mask]
183 |             current_text = np.array(trace.text)[mask] if hasattr(trace, 'text') and trace.text is not None else None
184 |             # Handle error bars safely
185 |             current_error_y_array = np.array(trace.error_y['array'])[mask] if 'error_y' in trace and 'array' in trace.error_y and trace.error_y['array'] is not None else None
186 |             current_error_y_arrayminus = np.array(trace.error_y['arrayminus'])[mask] if 'error_y' in trace and 'arrayminus' in trace.error_y and trace.error_y['arrayminus'] is not None else None
187 |
188 |             # Apply top N limit
189 |             if limit_to_top is not None and len(current_x) > limit_to_top:
190 |                 # Sort by y-value (rating) descending to find the top N
191 |                 sort_indices = np.argsort(-current_y)[:limit_to_top]
192 |                 current_x = current_x[sort_indices]
193 |                 current_y = current_y[sort_indices]
194 |                 if current_text is not None:
195 |                     current_text = current_text[sort_indices]
196 |                 if current_error_y_array is not None:
197 |                     current_error_y_array = current_error_y_array[sort_indices]
198 |                 if current_error_y_arrayminus is not None:
199 |                     current_error_y_arrayminus = current_error_y_arrayminus[sort_indices]
200 |             elif len(current_x) <= 2:  # If <=2 models remain after filtering
201 |                 return fig  # Return original
202 |
203 |             # Update the scatter trace data
204 |             trace.x, trace.y = current_x, current_y
205 |             if current_text is not None:
206 |                 trace.text = current_text
207 |             # Update error bars if they exist
208 |             if current_error_y_array is not None:
209 |                 # Ensure error_y exists before assigning
210 |                 if 'error_y' not in trace: trace.error_y = {}
211 |                 trace.error_y['array'] = current_error_y_array
212 |             if current_error_y_arrayminus is not None:
213 |                 if 'error_y' not in trace: trace.error_y = {}
214 |                 trace.error_y['arrayminus'] = current_error_y_arrayminus
215 |
216 |         elif data.type == 'bar':
217 |             trace = data
218 |             # Apply hidden models filter
219 |             mask = ~np.isin(trace.x, hidden_models) if hidden_models is not None else np.ones_like(trace.x, dtype=bool)
220 |
221 |             # Get initially filtered arrays
222 |             current_x = np.array(trace.x)[mask]
223 |             current_y = np.array(trace.y)[mask]
224 |
225 |             # Apply top N limit
226 |             if limit_to_top is not None and len(current_x) > limit_to_top:
227 |                 # Sort by y-value (rating) descending
228 |                 sort_indices = np.argsort(-current_y)[:limit_to_top]
229 |                 current_x = current_x[sort_indices]
230 |                 current_y = current_y[sort_indices]
231 |             elif len(current_x) <= 2:  # If <=2 models remain after filtering
232 |                 return fig  # Return original
233 |
234 |             # Update the bar trace data
235 |             trace.x, trace.y = current_x, current_y
236 |
237 |         return fig_copy
238 |
239 |     except Exception as e:
240 |         print(f"Error filtering plot: {e}")
241 |         traceback.print_exc()
242 |         return fig  # Return original figure on error
243 |
244 |
|
245 |
245 | def load_leaderboard_table_csv(filename, add_hyperlink=True):
246 |     lines = open(filename).readlines()
248 |     rows = []
249 |     for i in range(1, len(lines)):
250 |         row = [v.strip() for v in lines[i].split(",")]
251 |         item = {}  # Create dictionary once per row
252 |         for h, v in zip(heads, row):
253 |             if h == "Arena Elo rating":
254 |                 if v != "-":
255 |                     try:
256 |                         v = int(ast.literal_eval(v))
257 |                     except:
258 |                         v = np.nan  # Handle parsing errors
259 |                 else:
260 |                     v = np.nan
261 |             item[h] = v
262 |         if add_hyperlink and "Model" in item and "Link" in item:  # Check keys exist
263 |             # Check for empty/missing link
264 |             if item["Link"] and item["Link"] != "-":
265 |                 item["Model"] = model_hyperlink(item["Model"], item["Link"])
266 |             # Otherwise, keep the model name as is
267 |         rows.append(item)
268 |     return rows
269 |
270 |
271 | def create_ranking_str(ranking, ranking_difference):
272 |     # Convert rank to int before comparison
273 |     try:
274 |         # Ensure rank and difference are treated as numbers
275 |         ranking_val = int(float(ranking))  # Handle potential float input
276 |         ranking_difference_val = int(float(ranking_difference))
277 |         if ranking_difference_val > 0:
278 |             return f"{ranking_val} ↑"
279 |         elif ranking_difference_val < 0:
280 |             return f"{ranking_val} ↓"
281 |         else:
282 |             return f"{ranking_val}"
283 |     except (ValueError, TypeError):  # Handle cases where rank is not numeric
284 |         return str(ranking)
285 |
|
287 |
287 | def recompute_final_ranking(arena_df):
288 |     ranking = {}
289 |     if arena_df.empty:
290 |         return []
291 |
292 |     model_indices = arena_df.index
293 |     # Ensure CI columns exist before trying to access them
294 |     if "rating_q025" not in arena_df.columns or "rating_q975" not in arena_df.columns:
295 |         print("Warning: Confidence interval columns ('rating_q025', 'rating_q975') not found in DataFrame. Cannot compute UB Rank.")
296 |         # Return NaN or simple rank based on order
297 |         return [np.nan] * len(model_indices)  # Or range(1, len(model_indices) + 1)
298 |
299 |     ratings_q025 = arena_df["rating_q025"].to_dict()
300 |     ratings_q975 = arena_df["rating_q975"].to_dict()
301 |
302 |     for model_a in model_indices:
303 |         rank = 1
304 |         rating_a_q975 = ratings_q975.get(model_a)
305 |         # Skip if model A has no CI data
306 |         if pd.isna(rating_a_q975):
307 |             ranking[model_a] = np.nan  # Or assign max rank + 1
308 |             continue
309 |
310 |         for model_b in model_indices:
311 |             if model_a == model_b:
312 |                 continue
313 |
314 |             rating_b_q025 = ratings_q025.get(model_b)
315 |             # Skip comparison if model B has no CI data
316 |             if pd.isna(rating_b_q025):
317 |                 continue
318 |
319 |             # Check if B is statistically better than A
320 |             if rating_b_q025 > rating_a_q975:
321 |                 rank += 1
322 |         ranking[model_a] = rank
323 |     return list(ranking.values())
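As a quick illustration of the upper-bound ranking rule implemented above (not part of the diff; the tiny DataFrame below is made up): a model's rank equals 1 plus the number of models whose lower confidence bound lies above its upper bound.

import pandas as pd

_demo = pd.DataFrame(
    {
        "rating": [1300, 1250, 1200],
        "rating_q025": [1280, 1230, 1150],
        "rating_q975": [1320, 1290, 1240],
    },
    index=["model_a", "model_b", "model_c"],
)
# model_c's upper bound (1240) lies below model_a's lower bound (1280), so model_c is ranked
# behind it; no lower bound exceeds model_a's or model_b's upper bound, so both get rank 1.
print(recompute_final_ranking(_demo))  # -> [1, 1, 2]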
|
324 |
|
325 |
|
326 |
326 | def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
327 |     """
328 |     Generates the leaderboard table data.
329 |     'use_cache' parameter removed.
330 |     """
331 |     # print(f'Calculating get_arena_table')  # Debug
332 |
333 |     # Create copies to avoid modifying original DataFrames
334 |     arena_df_processed = arena_df.copy()
335 |     if arena_subset_df is not None:
336 |         arena_subset_df_processed = arena_subset_df.copy()
337 |     else:
338 |         arena_subset_df_processed = None
339 |
340 |     # Sort by rating initially to have a stable order before ranking
341 |     arena_df_processed = arena_df_processed.sort_values(by=["rating"], ascending=False)
342 |     # Compute 'final_ranking' based on CIs if possible
343 |     if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns:
344 |         arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed)
345 |         arena_df_processed = arena_df_processed.sort_values(
346 |             by=["final_ranking", "rating"], ascending=[True, False]
347 |         )
348 |     else:
349 |         # Fallback to simple ordering if CI columns are missing
350 |         arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1)
351 |
352 |     if hidden_models:
353 |         arena_df_processed = arena_df_processed[~arena_df_processed.index.isin(hidden_models)].copy()
354 |         # Recompute ranks for the filtered view
355 |         if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns:
356 |             arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed)
357 |             # Re-sort based on new ranks
358 |             arena_df_processed = arena_df_processed.sort_values(
359 |                 by=["final_ranking", "rating"], ascending=[True, False]
360 |             )
361 |         else:
362 |             arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1)
363 |
364 |
365 |     if arena_subset_df_processed is not None:
366 |         # Filter subset by hidden_models first
367 |         if hidden_models:
368 |             arena_subset_df_processed = arena_subset_df_processed[~arena_subset_df_processed.index.isin(hidden_models)].copy()
369 |
370 |         # Ensure models in the subset are also present in the (filtered) main view
371 |         arena_subset_df_processed = arena_subset_df_processed[arena_subset_df_processed.index.isin(arena_df_processed.index)]
372 |
373 |         # Proceed only if subset is not empty and has CI columns
374 |         if not arena_subset_df_processed.empty and "rating_q025" in arena_subset_df_processed.columns and "rating_q975" in arena_subset_df_processed.columns:
375 |             # Rank within the subset
376 |             arena_subset_df_processed = arena_subset_df_processed.sort_values(by=["rating"], ascending=False)
377 |             arena_subset_df_processed["final_ranking_subset"] = recompute_final_ranking(arena_subset_df_processed)  # Rank within category
378 |
379 |             # Filter the main processed DF to only include models from the subset
380 |             # 'final_ranking' here represents the rank *among these models* in the baseline category view
381 |             arena_df_for_join = arena_df_processed[arena_df_processed.index.isin(arena_subset_df_processed.index)][["final_ranking", "rating"]].copy()
382 |             arena_df_for_join.rename(columns={"final_ranking": "final_ranking_baseline"}, inplace=True)
383 |
384 |
385 |             # Join the subset ranks and baseline ranks
386 |             arena_df_combined = arena_subset_df_processed[["final_ranking_subset", "rating"]].join(
387 |                 arena_df_for_join["final_ranking_baseline"], how="inner"
388 |             )
389 |
390 |             # Calculate rank difference
391 |             arena_df_combined["ranking_difference"] = arena_df_combined["final_ranking_baseline"] - arena_df_combined["final_ranking_subset"]
392 |
393 |             # Sort by subset rank and rating
394 |             arena_df_combined = arena_df_combined.sort_values(
395 |                 by=["final_ranking_subset", "rating"], ascending=[True, False]
396 |             )
397 |
398 |             # Format the rank string with delta for display
399 |             arena_df_combined["display_ranking"] = arena_df_combined.apply(
400 |                 lambda x: create_ranking_str(x["final_ranking_subset"], x["ranking_difference"]),
401 |                 axis=1,
402 |             )
403 |             arena_df_processed = arena_df_processed.loc[arena_df_combined.index]  # Reorder arena_df_processed
404 |
405 |             columns_to_join = ["display_ranking", "ranking_difference", "final_ranking_subset"]
406 |             columns_to_join = [col for col in columns_to_join if col in arena_df_combined.columns]
407 |             arena_df_processed = arena_df_processed.join(arena_df_combined[columns_to_join], how="inner")
408 |
409 |
410 |             # Now sorting should work as the column exists
411 |             # Use the subset rank for final sorting if subset is active
412 |             # Check if 'final_ranking_subset' was successfully joined before sorting
413 |             if "final_ranking_subset" in arena_df_processed.columns:
414 |                 arena_df_processed.sort_values(by=["final_ranking_subset", "rating"], ascending=[True, False], inplace=True)
415 |             else:
416 |                 # Fallback sort if join failed for some reason
417 |                 arena_df_processed.sort_values(by=["rating"], ascending=False, inplace=True)
418 |
419 |
420 |         else:
421 |             # If subset is empty or lacks CI, disable subset logic
422 |             arena_subset_df_processed = None
423 |             # Use the baseline ranking as the display ranking
424 |             arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str)
425 |             arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True)
426 |
427 |
428 |     else:
429 |         # If no subset is used, display ranking is just the final rank from the main DF
430 |         arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str)
431 |         # Ensure it's sorted correctly
432 |         arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True)
433 |
434 |
435 |     values = []
436 |     # Iterate using the final sorted index of arena_df_processed
437 |     for model_key in arena_df_processed.index:
438 |         row_data = arena_df_processed.loc[model_key]
439 |         # Find model metadata
440 |         model_info = model_table_df[model_table_df["key"] == model_key]
441 |         if model_info.empty:
442 |             # print(f"Warning: Model key '{model_key}' not found in model_table_df. Skipping.")
443 |             continue  # Skip if no metadata
444 |
445 |         row = []
446 |         # Rank (Display)
447 |         row.append(row_data.get("display_ranking", ""))  # Use the calculated display rank
448 |
449 |         # Delta (only if subset was processed successfully)
450 |         if arena_subset_df_processed is not None:
451 |             row.append(row_data.get("ranking_difference", 0))
452 |
453 |         # Model Name (hyperlink applied during loading)
454 |         row.append(model_info["Model"].values[0])
455 |
456 |         # Arena Elo
457 |         row.append(round(row_data["rating"]))
458 |
459 |         # 95% CI
460 |         # Check for NaN before calculation
461 |         upper_rating = row_data.get("rating_q975")
462 |         lower_rating = row_data.get("rating_q025")
463 |         current_rating = row_data.get("rating")
464 |         upper_diff = round(upper_rating - current_rating) if pd.notna(upper_rating) and pd.notna(current_rating) else '?'
465 |         lower_diff = round(current_rating - lower_rating) if pd.notna(current_rating) and pd.notna(lower_rating) else '?'
466 |         row.append(f"+{upper_diff}/-{lower_diff}")
467 |
468 |
469 |         # Votes
470 |         row.append(round(row_data["num_battles"]))
471 |
472 |         # Organization
473 |         row.append(model_info["Organization"].values[0])
474 |
475 |         # License
476 |         row.append(model_info["License"].values[0])
477 |
478 |         # Knowledge Cutoff
479 |         cutoff_date = model_info["Knowledge cutoff date"].values[0]
480 |         row.append("Unknown" if cutoff_date == "-" else cutoff_date)
481 |
482 |         values.append(row)
483 |
484 |     return values
485 |
486 |
|
487 |
487 | key_to_category_name = {
488 |     # Mapping from internal key to display name (kept English for consistency)
489 |     "full": "Overall",  # Might not be used if filtered out later
490 |     "crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
491 |     "site_visitors/medium_prompts": "site_visitors/medium_prompts",
492 |     "site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style_control"  # Use underscore for display consistency if needed
493 | }
494 | cat_name_to_explanation = {
495 |     # Translated explanations for display
496 |     "Overall": "All queries",
497 |     "crowdsourcing/simple_prompts": "Queries collected via crowdsourcing. Mostly simple ones.",
498 |     "site_visitors/medium_prompts": "Queries from website visitors. Contain more complex prompts.",
499 |     "site_visitors/medium_prompts:style_control": "Queries from website visitors. Contain more complex prompts. [Reduced stylistic influence](https://lmsys.org/blog/2024-08-28-style-control/) of the response on the rating."
500 | }
501 | cat_name_to_baseline = {
502 |     # Baseline category for comparison (if needed, seems unused now but kept)
503 |     # "Hard Prompts (English)": "English",
504 | }
505 |
506 | actual_categories = [
507 |     # Categories available in the dropdown (use the *keys* from key_to_category_name)
508 |     # "Overall",  # Removed
509 |     # "crowdsourcing/simple_prompts",  # Removed
510 |     "site_visitors/medium_prompts",
511 |     "site_visitors/medium_prompts:style control"
512 | ]
513 | # Default selected category key
514 | req_cat_key = "site_visitors/medium_prompts:style control"
515 | selected_category_key = req_cat_key if req_cat_key in actual_categories else ("site_visitors/medium_prompts" if "site_visitors/medium_prompts" in actual_categories else (actual_categories[0] if actual_categories else None))
516 | # Get the display name for the selected category
517 | selected_category_display_name = key_to_category_name.get(selected_category_key, selected_category_key)  # Fallback to key if not found
518 |
519 |
def read_elo_file(elo_results_file, leaderboard_table_file):
|
521 |
+
# Version from monitor.py, but no lazy_load or caching
|
522 |
+
print('Reading Elo file...')
|
523 |
arena_dfs = {}
|
524 |
category_elo_results = {}
|
525 |
+
last_updated_time = "N/A" # Default value
|
526 |
+
elo_results = {} # Default value
|
527 |
+
model_table_df = pd.DataFrame() # Default value
|
528 |
+
|
529 |
+
try:
|
530 |
+
# Use context manager for file operations
|
531 |
+
with open(elo_results_file, "rb") as fin:
|
532 |
+
elo_results = pickle.load(fin)
|
533 |
+
|
534 |
+
# Try to get last updated time from primary or fallback categories
|
535 |
+
main_cat_key = "site_visitors/medium_prompts:style control"
|
536 |
+
fallback_cat_key_1 = "site_visitors/medium_prompts"
|
537 |
+
fallback_cat_key_2 = "full" # Another fallback
|
538 |
+
|
539 |
+
if main_cat_key in elo_results and "last_updated_datetime" in elo_results[main_cat_key]:
|
540 |
+
last_updated_time = elo_results[main_cat_key]["last_updated_datetime"].split(" ")[0]
|
541 |
+
elif fallback_cat_key_1 in elo_results and "last_updated_datetime" in elo_results[fallback_cat_key_1]:
|
542 |
+
last_updated_time = elo_results[fallback_cat_key_1]["last_updated_datetime"].split(" ")[0]
|
543 |
+
elif fallback_cat_key_2 in elo_results and "last_updated_datetime" in elo_results[fallback_cat_key_2]:
|
544 |
+
last_updated_time = elo_results[fallback_cat_key_2]["last_updated_datetime"].split(" ")[0]
|
545 |
+
|
546 |
+
# Iterate through defined category keys
|
547 |
+
for key in key_to_category_name.keys():
|
548 |
+
display_name = key_to_category_name[key] # Get the display name
|
549 |
+
if key in elo_results:
|
550 |
+
# Check for required data within the category result
|
551 |
+
if "leaderboard_table_df" in elo_results[key] and isinstance(elo_results[key]["leaderboard_table_df"], pd.DataFrame):
|
552 |
+
df = elo_results[key]["leaderboard_table_df"]
|
553 |
+
# Filter by number of battles > 200
|
554 |
+
# Store using the *display_name* as the key for consistency with dropdown/UI
|
555 |
+
arena_dfs[display_name] = df[df["num_battles"] > 200].copy()
|
556 |
+
category_elo_results[display_name] = elo_results[key]
|
557 |
+
# else:
|
558 |
+
# print(f"Warning: 'leaderboard_table_df' not found or not a DataFrame for key '{key}'")
|
559 |
+
# else:
|
560 |
+
# print(f"Warning: Key '{key}' not found in elo_results")
|
561 |
+
|
562 |
+
# Load model metadata CSV
|
563 |
+
data = load_leaderboard_table_csv(leaderboard_table_file)
|
564 |
+
model_table_df = pd.DataFrame(data)
|
565 |
|
566 |
+
except FileNotFoundError:
|
567 |
+
print(f"Error: Elo results file not found at {elo_results_file}")
|
568 |
+
# Return empty structures
|
569 |
+
except Exception as e:
|
570 |
+
print(f"Error reading elo file: {e}")
|
571 |
+
traceback.print_exc()
|
572 |
+
# Return empty structures
|
573 |
|
574 |
+
# Ensure correct data types are returned even on error
|
575 |
return last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df
|
576 |
|
577 |
|
578 |
def build_leaderboard_tab(
|
579 |
elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
|
580 |
):
|
581 |
+
# Load data once during build time
|
582 |
+
try:
|
583 |
+
last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
|
584 |
+
except Exception as e:
|
585 |
+
print(f"Failed to load initial data: {e}")
|
586 |
+
# Set empty defaults to prevent app crash
|
587 |
+
last_updated_time = "Error"
|
588 |
+
arena_dfs = {}
|
589 |
+
category_elo_results = {}
|
590 |
+
elo_results = {}
|
591 |
+
model_table_df = pd.DataFrame()
|
592 |
+
|
593 |
+
# Get data for the default selected category
|
594 |
+
# Use the *display name* derived from the selected key
|
595 |
+
if selected_category_display_name in arena_dfs:
|
596 |
+
arena_df = arena_dfs[selected_category_display_name]
|
597 |
+
elo_subset_results_init = category_elo_results[selected_category_display_name]
|
598 |
+
p1_init = elo_subset_results_init.get("win_fraction_heatmap")
|
599 |
+
p2_init = elo_subset_results_init.get("battle_count_heatmap")
|
600 |
+
p3_init = elo_subset_results_init.get("bootstrap_elo_rating")
|
601 |
+
p4_init = elo_subset_results_init.get("average_win_rate_bar")
|
602 |
+
else:
|
603 |
+
# Fallback if default category is missing
|
604 |
+
fallback_cat_display_name = None
|
605 |
+
if actual_categories:
|
606 |
+
# Try the first actual category's display name
|
607 |
+
first_cat_key = actual_categories[0]
|
608 |
+
fallback_cat_display_name = key_to_category_name.get(first_cat_key, first_cat_key)
|
609 |
+
|
610 |
+
if fallback_cat_display_name and fallback_cat_display_name in arena_dfs:
|
611 |
+
print(f"Warning: Selected category '{selected_category_display_name}' not found. Falling back to '{fallback_cat_display_name}'.")
|
612 |
+
arena_df = arena_dfs[fallback_cat_display_name]
|
613 |
+
elo_subset_results_init = category_elo_results[fallback_cat_display_name]
|
614 |
+
p1_init = elo_subset_results_init.get("win_fraction_heatmap")
|
615 |
+
p2_init = elo_subset_results_init.get("battle_count_heatmap")
|
616 |
+
p3_init = elo_subset_results_init.get("bootstrap_elo_rating")
|
617 |
+
p4_init = elo_subset_results_init.get("average_win_rate_bar")
|
618 |
+
else:
|
619 |
+
print(f"Warning: Default category '{selected_category_display_name}' and fallback categories not found in data.")
|
620 |
+
arena_df = pd.DataFrame() # Empty DataFrame
|
621 |
+
p1_init, p2_init, p3_init, p4_init = None, None, None, None
|
622 |
+
|
623 |
+
# Apply initial filtering to plots
|
624 |
+
p1_init = filter_deprecated_models_plots(p1_init, hidden_models=deprecated_model_name)
|
625 |
+
p2_init = filter_deprecated_models_plots(p2_init, hidden_models=deprecated_model_name)
|
626 |
+
p3_init = filter_deprecated_models_plots(p3_init, hidden_models=deprecated_model_name)
|
627 |
+
p4_init = filter_deprecated_models_plots(p4_init, hidden_models=deprecated_model_name)
|
628 |
+
|
629 |
+
default_md = make_default_md_1() # Parameters removed
|
630 |
+
default_md_2 = make_default_md_2() # Parameters removed
|
631 |
+
|
632 |
with gr.Row():
|
633 |
with gr.Column(scale=4):
|
634 |
+
# Removed Vote button
|
635 |
md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
|
636 |
with gr.Column(scale=1):
|
637 |
vote_button = gr.Button("Vote!", link="https://llmarena.ru")
|
638 |
md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")
|
|
|
|
|
|
|
639 |
|
640 |
+
# Generate initial table data
|
641 |
+
if not arena_df.empty and not model_table_df.empty:
|
642 |
+
# Pass the baseline DF and the model table; initially no subset difference is shown
|
643 |
+
arena_table_vals_init = get_arena_table(arena_df, model_table_df, hidden_models=deprecated_model_name)
|
644 |
+
else:
|
645 |
+
arena_table_vals_init = []
|
646 |
+
|
647 |
+
# Single "Arena" tab
|
648 |
+
with gr.Tab("Arena", id=0): # Removed Tabs() as only one tab
|
649 |
+
md_arena = make_arena_leaderboard_md(arena_df, last_updated_time)
|
650 |
+
lb_description = gr.Markdown(md_arena, elem_id="leaderboard_markdown")
|
651 |
+
|
652 |
+
with gr.Row():
|
653 |
+
with gr.Column(scale=2):
|
654 |
+
# Use *display names* for choices if they differ significantly from keys,
|
655 |
+
# but here keys are descriptive enough. Callback receives the *key*.
|
656 |
+
category_dropdown = gr.Dropdown(
|
657 |
+
# Choices should be the *keys* corresponding to display names
|
658 |
+
choices=actual_categories,
|
659 |
+
value=selected_category_key, # Use the key for the default value
|
660 |
+
label="Category", # Translated
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
661 |
)
|
662 |
|
663 |
+
with gr.Column(scale=2):
|
664 |
+
category_checkbox = gr.CheckboxGroup(
|
665 |
+
# Use user-friendly translated labels
|
666 |
+
["Show Deprecated", "Only <10B Models"], # Adjusted label for clarity
|
667 |
+
label="Apply Filter",
|
668 |
+
info="",
|
669 |
+
value=[], # Filters off by default
|
670 |
)
|
671 |
|
672 |
+
# Category details
|
673 |
+
default_category_details = make_category_arena_leaderboard_md(
|
674 |
+
arena_df, arena_df, name=selected_category_display_name # Pass arena_df twice for initial display
|
675 |
+
) if not arena_df.empty else "No data for category"
|
676 |
|
677 |
+
with gr.Column(scale=4, variant="panel"):
|
678 |
+
category_deets = gr.Markdown(
|
679 |
+
default_category_details, elem_id="category_deets"
|
680 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
681 |
|
682 |
+
# DataFrame for displaying the table
|
683 |
+
# Initial view doesn't have 'Delta' column
|
684 |
+
arena_vals = pd.DataFrame(
|
685 |
+
arena_table_vals_init,
|
686 |
columns=[
|
687 |
+
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
|
688 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
689 |
]
|
690 |
+
) if arena_table_vals_init else pd.DataFrame(columns=[ # Handle empty initial data
|
691 |
+
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
|
692 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
693 |
+
])
|
694 |
+
|
695 |
+
# Sort by Elo for initial display
|
696 |
+
if "Arena Elo" in arena_vals.columns:
|
697 |
+
arena_vals = arena_vals.sort_values(by="Arena Elo", ascending=False)
|
698 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
699 |
|
700 |
+
elo_display_df = gr.Dataframe(
|
701 |
+
headers=[ # Translated headers
|
702 |
+
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
|
703 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
704 |
+
],
|
705 |
+
datatype=[
|
706 |
+
"str", "markdown", "number", "str",
|
707 |
+
"number", "str", "str", "str"
|
708 |
+
],
|
709 |
+
value=arena_vals.style, # Apply Pandas styling if needed
|
710 |
+
elem_id="arena_leaderboard_dataframe",
|
711 |
+
height=700,
|
712 |
+
column_widths=[70, 190, 100, 100, 90, 130, 150, 100], # Widths from monitor.py
|
713 |
+
wrap=True,
|
714 |
)
|
715 |
|
716 |
+
gr.Markdown(elem_id="leaderboard_markdown") # Empty markdown for spacing
|
|
|
717 |
|
718 |
+
plot_1, plot_2, plot_3, plot_4 = None, None, None, None # Initialize plot variables
|
719 |
+
more_stats_md = None # Initialize markdown variable
|
720 |
+
if show_plot:
|
721 |
+
more_stats_md = gr.Markdown(
|
722 |
+
f"""## More Statistics for Chatbot Arena""", # Translated
|
723 |
+
elem_id="leaderboard_header_markdown",
|
724 |
+
)
|
725 |
+
with gr.Row(elem_id="leaderboard_bars"): # Use ID from monitor.py
|
726 |
+
with gr.Column():
|
727 |
+
gr.Markdown( # Translated title
|
728 |
+
"#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
|
729 |
+
elem_id="plot-title",
|
730 |
+
)
|
731 |
+
plot_3 = gr.Plot(p3_init, show_label=False) # Use initial data
|
732 |
+
with gr.Column():
|
733 |
+
gr.Markdown( # Translated title
|
734 |
+
"#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
|
735 |
+
elem_id="plot-title",
|
736 |
+
)
|
737 |
+
plot_4 = gr.Plot(p4_init, show_label=False) # Use initial data
|
738 |
+
with gr.Row(elem_id="leaderboard_plots"): # Use ID from monitor.py
|
739 |
+
with gr.Column():
|
740 |
+
gr.Markdown( # Translated title
|
741 |
+
"#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
|
742 |
+
elem_id="plot-title",
|
743 |
+
)
|
744 |
+
plot_1 = gr.Plot(
|
745 |
+
p1_init, show_label=False, elem_id="plot-container" # Use initial data
|
746 |
+
)
|
747 |
+
with gr.Column():
|
748 |
+
gr.Markdown( # Translated title
|
749 |
+
"#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
|
750 |
+
elem_id="plot-title",
|
751 |
+
)
|
752 |
+
plot_2 = gr.Plot(p2_init, show_label=False) # Use initial data
|
753 |
|
754 |
+
def update_leaderboard_df(arena_table_vals):
|
755 |
+
# Add error handling for empty or incorrect data
|
756 |
+
# Expects 9 columns when Delta is present
|
757 |
+
if not arena_table_vals or not isinstance(arena_table_vals, list) or not arena_table_vals[0] or len(arena_table_vals[0]) != 9:
|
758 |
+
print("Warning: Invalid data for styling in update_leaderboard_df. Returning empty DataFrame.")
|
759 |
+
# Return an empty styled DataFrame to avoid Gradio errors
|
760 |
+
empty_styled = pd.DataFrame(columns=[
|
761 |
+
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
|
762 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
763 |
+
]).style
|
764 |
+
return empty_styled
|
765 |
|
766 |
+
try:
|
767 |
+
elo_dataframe = pd.DataFrame(
|
768 |
+
arena_table_vals,
|
769 |
+
columns=[
|
770 |
+
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
|
771 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
772 |
],
|
773 |
)
|
774 |
+
|
775 |
+
def highlight_max(s):
|
776 |
+
# Check rank string for arrows
|
777 |
+
return [
|
778 |
+
"color: green; font-weight: bold" if "↑" in str(v) else
|
779 |
+
"color: red; font-weight: bold" if "↓" in str(v) else ""
|
780 |
+
for v in s
|
781 |
+
]
|
782 |
+
|
783 |
+
def highlight_rank_max(s):
|
784 |
+
# Check Delta value (ensure it's numeric)
|
785 |
+
return [
|
786 |
+
"color: green; font-weight: bold" if isinstance(v, (int, float)) and v > 0 else
|
787 |
+
"color: red; font-weight: bold" if isinstance(v, (int, float)) and v < 0 else ""
|
788 |
+
for v in s
|
789 |
+
]
|
790 |
+
# Apply styles
|
791 |
+
styled_df = elo_dataframe.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
|
792 |
+
highlight_rank_max, subset=["Delta"]
|
793 |
)
|
794 |
+
return styled_df
|
795 |
|
796 |
+
except Exception as e:
|
797 |
+
print(f"Error applying styles in update_leaderboard_df: {e}")
|
798 |
+
traceback.print_exc()
|
799 |
+
# On error, return the DataFrame without highlight styling
|
800 |
+
return pd.DataFrame(arena_table_vals, columns=[
|
801 |
+
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
|
802 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
803 |
+
]).style
|
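# --- Illustrative aside (not part of app.py): a minimal, standalone sketch of the
# --- Styler-based highlighting used in update_leaderboard_df, applied to toy data.
# --- Column values and model names here are assumptions for demonstration only.
import pandas as pd

toy = pd.DataFrame({
    "Rank* (UB)": ["1 ↑", "2", "3 ↓"],
    "Delta": [2, 0, -1],
    "Model": ["model-a", "model-b", "model-c"],
})

def highlight_arrows(s):
    # Same rule as highlight_max above: color cells by arrow direction in the rank string.
    return ["color: green; font-weight: bold" if "↑" in str(v)
            else "color: red; font-weight: bold" if "↓" in str(v) else "" for v in s]

def highlight_delta(s):
    # Same rule as highlight_rank_max above: color numeric deltas by sign.
    return ["color: green; font-weight: bold" if isinstance(v, (int, float)) and v > 0
            else "color: red; font-weight: bold" if isinstance(v, (int, float)) and v < 0 else "" for v in s]

styled = toy.style.apply(highlight_arrows, subset=["Rank* (UB)"]).apply(highlight_delta, subset=["Delta"])
# `styled` is a pandas Styler and can be passed to gr.Dataframe(value=...), as done above.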
804 |
+
|
805 |
+
def update_leaderboard_and_plots(category_key, filters): # Receives category *key* from dropdown
|
806 |
+
# No caching
|
807 |
+
# Reload data on each call
|
808 |
+
try:
|
809 |
+
current_last_updated_time, current_arena_dfs, current_category_elo_results, _, current_model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
|
810 |
+
except Exception as e:
|
811 |
+
print(f"Error reloading data in callback: {e}")
|
812 |
+
# Return empty updates to prevent UI crash
|
813 |
+
empty_df_update = gr.Dataframe(value=pd.DataFrame().style) # Empty DataFrame
|
814 |
+
empty_plot_update = gr.Plot(value=None) # Empty Plot
|
815 |
+
empty_md_update = gr.Markdown(value="Error loading data.") # Error Markdown
|
816 |
+
# Match the number of outputs expected by the .change() call
|
817 |
+
num_plots = 4 if show_plot else 0
|
818 |
+
return [empty_df_update] + [empty_plot_update] * num_plots + [empty_md_update, empty_md_update]
|
819 |
+
|
820 |
+
|
821 |
+
# Use the display name corresponding to the selected key
|
822 |
+
category_display_name = key_to_category_name.get(category_key, category_key)
|
823 |
+
|
824 |
+
# Check if data exists for the selected category (using display name as key now)
|
825 |
+
if not current_arena_dfs or category_display_name not in current_arena_dfs or category_display_name not in current_category_elo_results or current_model_table_df.empty:
|
826 |
+
print(f"Warning: Data missing for category '{category_display_name}' (key: '{category_key}') after reload.")
|
827 |
+
empty_df_update = gr.Dataframe(value=pd.DataFrame().style)
|
828 |
+
empty_plot_update = gr.Plot(value=None)
|
829 |
+
empty_md_update = gr.Markdown(value=f"No data available for category: {category_display_name}")
|
830 |
+
num_plots = 4 if show_plot else 0
|
831 |
+
# Match the number of outputs
|
832 |
+
return [empty_df_update] + [empty_plot_update] * num_plots + [empty_md_update, empty_md_update]
|
833 |
+
|
834 |
+
# Get the specific data slices using the display name
|
835 |
+
arena_subset_df = current_arena_dfs[category_display_name]
|
836 |
+
elo_subset_results = current_category_elo_results[category_display_name]
|
837 |
+
|
838 |
+
# Use the hardcoded baseline key, get its display name
|
839 |
+
baseline_key = "site_visitors/medium_prompts:style control"
|
840 |
+
baseline_display_name = key_to_category_name.get(baseline_key, baseline_key)
|
841 |
+
|
842 |
+
# Fallback if baseline is missing
|
843 |
+
if baseline_display_name not in current_arena_dfs:
|
844 |
+
print(f"Warning: Baseline category '{baseline_display_name}' not found. Using selected category '{category_display_name}' as baseline.")
|
845 |
+
baseline_display_name = category_display_name # Fallback to the selected category itself
|
846 |
+
|
847 |
+
arena_df_baseline = current_arena_dfs[baseline_display_name]
|
848 |
+
|
849 |
+
|
850 |
+
hidden_models_list = None # Default: show all
|
851 |
+
# Check filter labels (must match the translated CheckboxGroup choices)
|
852 |
+
if "Show Deprecated" not in filters:
|
853 |
+
hidden_models_list = deprecated_model_name.copy() # Hide deprecated
|
854 |
+
|
855 |
+
if "Only <10B Models" in filters:
|
856 |
+
# Get all models currently in the baseline view
|
857 |
+
all_models_in_view = arena_df_baseline.index.tolist()
|
858 |
+
# Find models *not* in the allowed list
|
859 |
+
models_to_hide = [model for model in all_models_in_view if model not in models_10b]
|
860 |
+
|
861 |
+
if hidden_models_list is None: # If deprecated are not hidden
|
862 |
+
hidden_models_list = models_to_hide
|
863 |
+
else: # If deprecated are already hidden, add the non-<10B ones
|
864 |
+
# Use set to avoid duplicates
|
865 |
+
hidden_models_list = list(set(hidden_models_list + models_to_hide))
|
866 |
+
|
867 |
+
arena_table_values = get_arena_table(
|
868 |
+
arena_df_baseline, # Use the determined baseline DataFrame
|
869 |
+
current_model_table_df,
|
870 |
+
# Pass subset only if it's different from the baseline
|
871 |
+
arena_subset_df=(arena_subset_df if category_display_name != baseline_display_name else None),
|
872 |
+
hidden_models=hidden_models_list
|
873 |
)
|
874 |
|
875 |
+
dataframe_update = None
|
876 |
+
# Show Delta column only if category is not the baseline and data exists
|
877 |
+
if category_display_name != baseline_display_name and arena_table_values:
|
878 |
+
styled_arena_values = update_leaderboard_df(arena_table_values) # Apply styling with Delta
|
879 |
+
# Check if styling was successful
|
880 |
+
if isinstance(styled_arena_values, pd.io.formats.style.Styler) and not styled_arena_values.data.empty:
|
881 |
+
dataframe_update = gr.Dataframe(
|
882 |
+
headers=[ # Headers including Delta
|
883 |
+
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
|
884 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
885 |
+
],
|
886 |
+
datatype=[
|
887 |
+
"str", "number", "markdown", "number", "str",
|
888 |
+
"number", "str", "str", "str"
|
889 |
+
],
|
890 |
+
value=styled_arena_values, # Pass the Styler object
|
891 |
+
elem_id="arena_leaderboard_dataframe",
|
892 |
+
height=700,
|
893 |
+
column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100], # Widths with Delta
|
894 |
+
wrap=True,
|
895 |
+
)
|
896 |
+
else: # Handle styling failure
|
897 |
+
dataframe_update = gr.Dataframe(value=pd.DataFrame().style) # Empty update
|
898 |
+
|
899 |
+
else: # Baseline category or no data for Delta
|
900 |
+
# Ensure data exists before creating DataFrame
|
901 |
+
if arena_table_values:
|
902 |
+
# Create DataFrame without Delta column from the raw values
|
903 |
+
df_no_delta = pd.DataFrame(arena_table_values, columns=[
|
904 |
+
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
|
905 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
906 |
+
])
|
907 |
+
dataframe_update = gr.Dataframe(
|
908 |
+
headers=[ # Headers without Delta
|
909 |
+
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
|
910 |
+
"Votes", "Organization", "License", "Knowledge Cutoff"
|
911 |
+
],
|
912 |
+
datatype=[
|
913 |
+
"str", "markdown", "number", "str", "number",
|
914 |
+
"str", "str", "str"
|
915 |
+
],
|
916 |
+
value=df_no_delta.style, # Apply basic Pandas styling
|
917 |
+
elem_id="arena_leaderboard_dataframe",
|
918 |
+
height=700,
|
919 |
+
column_widths=[70, 190, 100, 100, 90, 130, 150, 100], # Widths without Delta
|
920 |
+
wrap=True,
|
921 |
+
)
|
922 |
+
else:
|
923 |
+
dataframe_update = gr.Dataframe(value=pd.DataFrame().style) # Empty update
|
924 |
+
|
925 |
+
plot_updates = [gr.Plot(value=None)] * 4 # Default empty plot updates
|
926 |
+
if show_plot:
|
927 |
+
p1_updated = elo_subset_results.get("win_fraction_heatmap")
|
928 |
+
p2_updated = elo_subset_results.get("battle_count_heatmap")
|
929 |
+
p3_updated = elo_subset_results.get("bootstrap_elo_rating")
|
930 |
+
p4_updated = elo_subset_results.get("average_win_rate_bar")
|
931 |
+
|
932 |
+
# Filter plots
|
933 |
+
p1_filtered = filter_deprecated_models_plots(p1_updated, hidden_models=hidden_models_list)
|
934 |
+
p2_filtered = filter_deprecated_models_plots(p2_updated, hidden_models=hidden_models_list)
|
935 |
+
p3_filtered = filter_deprecated_models_plots(p3_updated, hidden_models=hidden_models_list)
|
936 |
+
p4_filtered = filter_deprecated_models_plots(p4_updated, hidden_models=hidden_models_list)
|
937 |
+
plot_updates = [p1_filtered, p2_filtered, p3_filtered, p4_filtered]
|
938 |
+
|
939 |
+
|
940 |
+
more_stats_md_updated_text = f"""## More Statistics for Chatbot Arena - {category_display_name} """ if show_plot else ""
|
941 |
+
more_stats_md_update = gr.Markdown(value=more_stats_md_updated_text)
|
942 |
+
|
943 |
+
# Use baseline DF for total counts, subset DF for category-specific counts
|
944 |
+
category_details_md_updated_text = make_category_arena_leaderboard_md(
|
945 |
+
arena_df_baseline, arena_subset_df, name=category_display_name # Pass display name
|
946 |
+
)
|
947 |
+
category_deets_update = gr.Markdown(value=category_details_md_updated_text)
|
948 |
|
949 |
+
# Return updates in the correct order matching outputs list
|
950 |
+
# Order: df, p1, p2, p3, p4, more_stats_md, category_deets
|
951 |
+
return [dataframe_update] + plot_updates + [more_stats_md_update, category_deets_update]
|
952 |
|
953 |
|
954 |
+
# Define output components (must exist in the UI build)
|
955 |
+
outputs_list = [elo_display_df]
|
956 |
+
if show_plot:
|
957 |
+
# Add plot components if they exist
|
958 |
+
outputs_list.extend([plot_1, plot_2, plot_3, plot_4])
|
959 |
+
# Add markdown component if it exists
|
960 |
+
if more_stats_md: outputs_list.append(more_stats_md)
|
961 |
+
else: outputs_list.append(gr.Markdown(visible=False)) # Placeholder if MD wasn't created
|
962 |
+
else:
|
963 |
+
# Add placeholders if plots/MD are not shown
|
964 |
+
outputs_list.extend([gr.Plot(visible=False)] * 4)
|
965 |
+
outputs_list.append(gr.Markdown(visible=False))
|
966 |
+
outputs_list.append(category_deets) # Always update category details
|
967 |
+
|
968 |
+
# Attach change listeners
|
969 |
+
category_dropdown.change(
|
970 |
+
fn=update_leaderboard_and_plots,
|
971 |
+
inputs=[category_dropdown, category_checkbox],
|
972 |
+
outputs=outputs_list
|
973 |
+
)
|
974 |
+
category_checkbox.change(
|
975 |
+
fn=update_leaderboard_and_plots, # Use the same function
|
976 |
+
inputs=[category_dropdown, category_checkbox],
|
977 |
+
outputs=outputs_list
|
978 |
+
)
|
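# --- Illustrative aside (not part of app.py): a minimal sketch of the contract the two
# --- .change() listeners above rely on: the callback must return one update per entry
# --- in `outputs`, in the same order. Component and function names here are hypothetical.
import gradio as gr

def refresh(category, filters):
    table_md = f"Table for {category} with filters {filters}"
    details_md = f"Details for {category}"
    return [table_md, details_md]  # order matches `outputs` below

with gr.Blocks() as sketch:
    cat = gr.Dropdown(choices=["A", "B"], value="A", label="Category")
    box = gr.CheckboxGroup(["Show Deprecated"], label="Apply Filter")
    table = gr.Markdown()
    details = gr.Markdown()
    cat.change(fn=refresh, inputs=[cat, box], outputs=[table, details])
    box.change(fn=refresh, inputs=[cat, box], outputs=[table, details])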
979 |
|
980 |
|
981 |
+
return_components = [md_1, md_2, lb_description, category_deets, elo_display_df]
|
982 |
+
if show_plot:
|
983 |
+
# Add plots if they were created
|
984 |
+
return_components.extend([plot_1, plot_2, plot_3, plot_4])
|
985 |
+
# Add the extra stats markdown if it was created
|
986 |
+
if more_stats_md: return_components.append(more_stats_md)
|
987 |
|
988 |
|
989 |
+
return return_components
|
990 |
|
991 |
|
992 |
+
def build_demo(elo_results_file, leaderboard_table_file):
|
993 |
+
# Assumes block_css is available or defined elsewhere
|
994 |
+
try:
|
995 |
+
from fastchat.serve.gradio_web_server import block_css
|
996 |
+
except ImportError:
|
997 |
+
print("Warning: fastchat.serve.gradio_web_server.block_css not found. Using fallback CSS.")
|
998 |
+
# Define a minimal fallback CSS or copy the content here
|
999 |
+
block_css = """
|
1000 |
+
/* Add minimal CSS rules here if needed */
|
1001 |
+
#arena_leaderboard_dataframe table { font-size: 105%; }
|
1002 |
+
#leaderboard_markdown .prose { font-size: 110% !important; }
|
1003 |
+
.app { max-width: 100% !important; padding: 20px !important; }
|
1004 |
+
a { color: #1976D2; text-decoration: none; }
|
1005 |
+
a:hover { color: #63A4FF; text-decoration: underline; }
|
1006 |
+
"""
|
1007 |
|
1008 |
+
text_size = gr.themes.sizes.text_lg
|
1009 |
+
# Assumes theme.json is present
|
1010 |
+
try:
|
1011 |
+
theme = gr.themes.Default.load("theme.json")
|
1012 |
+
except Exception:
|
1013 |
+
print("Warning: theme.json not found. Using default Gradio theme.")
|
1014 |
+
theme = gr.themes.Default(text_size=text_size) # Fallback theme
|
1015 |
+
|
1016 |
+
if hasattr(theme, 'text_size'): theme.text_size = text_size
|
1017 |
+
# Apply custom settings if theme object supports it
|
1018 |
+
if hasattr(theme, 'set'):
|
1019 |
+
theme.set(
|
1020 |
+
button_large_text_size="40px",
|
1021 |
+
button_small_text_size="40px",
|
1022 |
+
button_large_text_weight="1000",
|
1023 |
+
button_small_text_weight="1000",
|
1024 |
+
button_shadow="*shadow_drop_lg",
|
1025 |
+
button_shadow_hover="*shadow_drop_lg",
|
1026 |
+
checkbox_label_shadow="*shadow_drop_lg",
|
1027 |
+
button_shadow_active="*shadow_inset",
|
1028 |
+
button_secondary_background_fill="*primary_300",
|
1029 |
+
button_secondary_background_fill_dark="*primary_700",
|
1030 |
+
button_secondary_background_fill_hover="*primary_200",
|
1031 |
+
button_secondary_background_fill_hover_dark="*primary_500",
|
1032 |
+
button_secondary_text_color="*primary_800",
|
1033 |
+
button_secondary_text_color_dark="white",
|
1034 |
+
)
|
1035 |
|
1036 |
+
with gr.Blocks(
|
1037 |
+
title="LLM Arena: Leaderboard", # Translated title
|
1038 |
+
theme=theme,
|
1039 |
+
css=block_css, # Use loaded or fallback CSS
|
1040 |
+
) as demo:
|
1041 |
+
# Build only the leaderboard tab content
|
1042 |
+
# show_plot=True to display plots
|
1043 |
+
leader_components = build_leaderboard_tab(
|
1044 |
+
elo_results_file, leaderboard_table_file, show_plot=True, mirror=False
|
1045 |
+
)
|
1046 |
+
return demo
|
1047 |
|
1048 |
|
1049 |
if __name__ == "__main__":
|
1050 |
parser = argparse.ArgumentParser()
|
1051 |
+
parser.add_argument("--share", action="store_true", default=False) # Default False for HF
|
1052 |
parser.add_argument("--host", default="0.0.0.0")
|
1053 |
parser.add_argument("--port", type=int, default=7860)
|
1054 |
+
# Removed args specific to monitor.py
|
1055 |
args = parser.parse_args()
|
1056 |
+
try:
|
1057 |
+
elo_result_files = glob.glob("elo_results_*.pkl")
|
1058 |
+
if not elo_result_files:
|
1059 |
+
raise FileNotFoundError("No elo_results_*.pkl files found.")
|
1060 |
+
# More robust sorting extracting the number
|
1061 |
+
elo_result_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
|
1062 |
+
elo_result_file = elo_result_files[-1]
|
1063 |
+
print(f"Using Elo results file: {elo_result_file}")
|
1064 |
+
except Exception as e:
|
1065 |
+
print(f"Error finding Elo results file: {e}")
|
1066 |
+
print("Please ensure a file matching 'elo_results_NUMBER.pkl' exists.")
|
1067 |
+
exit(1) # Exit if file not found
|
1068 |
+
|
1069 |
+
try:
|
1070 |
+
leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
|
1071 |
+
if not leaderboard_table_files:
|
1072 |
+
raise FileNotFoundError("No leaderboard_table_*.csv files found.")
|
1073 |
+
leaderboard_table_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
|
1074 |
+
leaderboard_table_file = leaderboard_table_files[-1]
|
1075 |
+
print(f"Using leaderboard table file: {leaderboard_table_file}")
|
1076 |
+
except Exception as e:
|
1077 |
+
print(f"Error finding leaderboard table file: {e}")
|
1078 |
+
print("Please ensure a file matching 'leaderboard_table_NUMBER.csv' exists.")
|
1079 |
+
exit(1) # Exit if file not found
|
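# --- Illustrative aside (not part of app.py): a quick check of the filename-sort
# --- assumption used above, on hypothetical file names.
files = ["elo_results_20240101.pkl", "elo_results_20231215.pkl", "elo_results_20240301.pkl"]
files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
assert files[-1] == "elo_results_20240301.pkl"  # the largest numeric suffix is picked last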
1080 |
|
1081 |
|
1082 |
demo = build_demo(elo_result_file, leaderboard_table_file)
|
1083 |
+
# Launch with args
|
1084 |
+
demo.launch(
|
1085 |
+
server_name=args.host,
|
1086 |
+
server_port=args.port,
|
1087 |
+
share=args.share,
|
1088 |
+
show_api=False
|
1089 |
+
)
|