LLMArena committed on
Commit c32157e · verified · 1 Parent(s): 1575c35

Update app.py with new features from arena

Files changed (1)
  1. app.py +895 -625
app.py CHANGED
@@ -4,17 +4,12 @@ import glob
4
  import pickle
5
  import traceback
6
  import numpy as np
7
- from datetime import datetime
8
 
9
  import pandas as pd
10
  import gradio as gr
11
  import numpy as np
12
 
13
 
14
- basic_component_values = [None] * 6
15
- leader_component_values = [None] * 5
16
-
17
-
18
  promo_banner = """
19
  <div style="background-color: #ffcc00; color: black; padding: 10px; text-align: center; font-weight: bold; font-size: 18px; border: 2px solid #000;">
20
  USE THE LATEST VERSIONS OF THE BEST CHATBOTS IN RUSSIAN FOR FREE
@@ -23,18 +18,30 @@ promo_banner = """
23
 
24
  deprecated_model_name = [
25
  "GigaChat 3.1.25.3",
26
- "GigaChat-Pro 2.2.25.3",
27
  "saiga_llama3_8b_v6",
28
  "saiga_phi3_medium",
29
  "GigaChat-Plus 3.1.25.3",
30
  "GigaChat-Pro 4.0.26.8",
31
  "GigaChat 4.0.26.8",
32
- "xAI: Grok 2",
33
  "GigaChat-Pro 4.0.26.15",
34
  "GigaChat 4.0.26.15",
35
- "YandexGPT Experimental", "yandex-gpt-arena"
36
  ]
37
 
 
38
  def make_default_md_1():
39
  leaderboard_md = f"""
40
  # 🏆 LLM Arena in Russian: Leaderboard
@@ -44,24 +51,24 @@ def make_default_md_1():
44
  """
45
  return leaderboard_md
46
 
47
-
48
  def make_default_md_2():
49
  leaderboard_md = f"""
50
-
51
  The LLM Arena platform is an open crowdsourcing platform for evaluating large language models (LLM) in Russian. We collect pairwise comparisons from people to rank LLMs using the Bradley-Terry model and display model ratings on the Elo scale.
52
  Chatbot Arena in Russian depends on community participation, so please contribute by casting your vote!
 
53
  - To **add your model** to the comparison, contact us on TG: [Group](https://t.me/+bFEOl-Bdmok4NGUy)
54
  - If you **found a bug** or **have a suggestion**, contact us: [Roman](https://t.me/roman_kucev)
55
  - You can **contribute your vote** at [llmarena.ru](https://llmarena.ru/)!
56
  """
57
-
58
  return leaderboard_md
59
 
60
 
61
  def make_arena_leaderboard_md(arena_df, last_updated_time):
62
- total_votes = sum(arena_df["num_battles"])
 
63
  total_models = len(arena_df)
64
- space = " "
65
 
66
  leaderboard_md = f"""
67
  Total # of models: **{total_models}**.{space} Total # of votes: **{"{:,}".format(total_votes)}**.{space} Last updated: {last_updated_time}.
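
The description above says the ratings come from pairwise human votes fit with the Bradley-Terry model and shown on an Elo scale. That fit is not part of app.py itself (the ratings arrive precomputed in the `elo_results_*.pkl` file this app reads); as a rough standalone sketch of the idea, using synthetic win counts rather than the arena's real fitting code:

```python
import numpy as np

# Hypothetical pairwise results: wins[i, j] = times model i beat model j.
models = ["model_a", "model_b", "model_c"]
wins = np.array([[0., 6., 8.],
                 [4., 0., 7.],
                 [2., 3., 0.]])

n_games = wins + wins.T                 # battles per pair
strength = np.ones(len(models))         # Bradley-Terry strengths

for _ in range(200):                    # simple MM iterations
    denom = n_games / (strength[:, None] + strength[None, :])
    np.fill_diagonal(denom, 0.0)
    strength = wins.sum(axis=1) / denom.sum(axis=1)
    strength /= strength.mean()         # BT is only identified up to a scale

elo_like = 1000 + 400 * np.log10(strength)   # map onto an Elo-style scale
for name, r in sorted(zip(models, elo_like), key=lambda t: -t[1]):
    print(f"{name}: {r:.0f}")
```

Style control and category filtering, mentioned elsewhere in this file, also happen upstream of app.py; the sketch only shows the rating-scale conversion.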
@@ -74,57 +81,166 @@ See Figure 1 below for a visualization of the confidence intervals of model rati
74
 
75
 
76
  def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="site_visitors/medium_prompts:style control"):
77
- total_votes = sum(arena_df["num_battles"])
78
  total_models = len(arena_df)
79
- space = " "
80
- total_subset_votes = sum(arena_subset_df["num_battles"])
81
  total_subset_models = len(arena_subset_df)
82
- leaderboard_md = f"""### {cat_name_to_explanation[name]}
83
- #### {space} #models: **{total_subset_models} ({round(total_subset_models/total_models *100)}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({round(total_subset_votes/total_votes * 100)}%)**{space}
84
  """
85
  return leaderboard_md
86
 
87
-
88
  def model_hyperlink(model_name, link):
89
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
90
 
91
 
92
- def filter_deprecated_models_plots(fig, hidden_models=None):
93
  """
94
- Removes deprecated models from a Plotly figure.
95
 
96
  Args:
97
  fig: The Plotly figure object.
98
- hidden_models: A list of model names to remove.
99
  """
100
  if fig is None:
101
- return
102
-
103
- if hidden_models is None:
104
  return fig
105
 
106
- if fig.data[0].type == 'heatmap':
107
- data = fig.data[0]
108
- mask_x = ~np.isin(data.x, hidden_models)
109
- mask_y = ~np.isin(data.y, hidden_models)
110
- data.update({
111
- 'x': np.array(data.x)[mask_x],
112
- 'y': np.array(data.y)[mask_y],
113
- 'z': np.array(data.z)[np.ix_(mask_y, mask_x)]
114
- })
115
- elif fig.data[0].type == 'scatter':
116
- trace = fig.data[0]
117
- mask = ~np.isin(trace.x, hidden_models)
118
- trace.x, trace.y, trace.text = np.array(trace.x)[mask], np.array(trace.y)[mask], np.array(trace.text)[mask]
119
- for key in ['array', 'arrayminus']:
120
- if key in trace.error_y:
121
- trace.error_y[key] = trace.error_y[key][mask]
122
- elif fig.data[0].type == 'bar':
123
- mask = ~np.isin(fig.data[0].x, hidden_models)
124
- fig.data[0].x = fig.data[0].x[mask]
125
- fig.data[0].y = fig.data[0].y[mask]
126
-
127
- return fig
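
The heatmap branch above drops rows and columns belonging to hidden models and then subsets `z` with `np.ix_`; a tiny standalone illustration of that indexing, with made-up arrays:

```python
import numpy as np

x = np.array(["model_a", "model_b", "model_c"])
y = np.array(["model_a", "model_b", "model_c"])
z = np.arange(9).reshape(3, 3)            # stand-in for a win-fraction grid

mask_x = ~np.isin(x, ["model_b"])         # [ True, False,  True]
mask_y = ~np.isin(y, ["model_b"])

# np.ix_ forms an open mesh, so only rows/columns of surviving models remain.
print(z[np.ix_(mask_y, mask_x)])          # [[0 2]
                                          #  [6 8]]
```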
128
 
129
  def load_leaderboard_table_csv(filename, add_hyperlink=True):
130
  lines = open(filename).readlines()
@@ -132,688 +248,842 @@ def load_leaderboard_table_csv(filename, add_hyperlink=True):
132
  rows = []
133
  for i in range(1, len(lines)):
134
  row = [v.strip() for v in lines[i].split(",")]
135
- for j in range(len(heads)):
136
- item = {}
137
- for h, v in zip(heads, row):
138
- if h == "Arena Elo rating":
139
- if v != "-":
140
  v = int(ast.literal_eval(v))
141
- else:
142
- v = np.nan
143
- elif h == "MMLU":
144
- if v != "-":
145
- v = round(ast.literal_eval(v) * 100, 1)
146
- else:
147
- v = np.nan
148
- elif h == "MT-bench (win rate %)":
149
- if v != "-":
150
- v = round(ast.literal_eval(v[:-1]), 1)
151
- else:
152
- v = np.nan
153
- elif h == "MT-bench (score)":
154
- if v != "-":
155
- v = round(ast.literal_eval(v), 2)
156
- else:
157
- v = np.nan
158
- item[h] = v
159
- if add_hyperlink:
160
  item["Model"] = model_hyperlink(item["Model"], item["Link"])
 
161
  rows.append(item)
162
-
163
  return rows
164
 
165
 
166
  def create_ranking_str(ranking, ranking_difference):
167
- if ranking_difference > 0:
168
- return f"{int(ranking)} \u2191"
169
- elif ranking_difference < 0:
170
- return f"{int(ranking)} \u2193"
171
- else:
172
- return f"{int(ranking)}"
173
 
174
 
175
  def recompute_final_ranking(arena_df):
176
- # compute ranking based on CI
177
  ranking = {}
178
- for i, model_a in enumerate(arena_df.index):
179
- ranking[model_a] = 1
180
- for j, model_b in enumerate(arena_df.index):
181
- if i == j:
182
  continue
183
- if (
184
- arena_df.loc[model_b]["rating_q025"]
185
- > arena_df.loc[model_a]["rating_q975"]
186
- ):
187
- ranking[model_a] += 1
188
  return list(ranking.values())
189
 
190
 
191
  def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
192
- # Apply hidden_models filter first
193
  if hidden_models:
194
- arena_df = arena_df[~arena_df.index.isin(hidden_models)].copy()
195
 
196
- arena_df = arena_df.sort_values(
197
- by=["final_ranking", "rating"], ascending=[True, False]
198
- )
199
 
200
- arena_df["final_ranking"] = recompute_final_ranking(arena_df)
201
- arena_df = arena_df.sort_values(
202
- by=["final_ranking", "rating"], ascending=[True, False]
203
- )
204
 
205
- # sort by rating
206
- if arena_subset_df is not None:
207
- # filter out models not in the arena_df
208
- arena_subset_df = arena_subset_df[arena_subset_df.index.isin(arena_df.index)]
209
- arena_subset_df = arena_subset_df.sort_values(by=["rating"], ascending=False)
210
- arena_subset_df["final_ranking"] = recompute_final_ranking(arena_subset_df)
211
- # keep only the models in the subset in arena_df and recompute final_ranking
212
- arena_df = arena_df[arena_df.index.isin(arena_subset_df.index)]
213
- # recompute final ranking
214
- arena_df["final_ranking"] = recompute_final_ranking(arena_df)
215
-
216
- # assign ranking by the order
217
- arena_subset_df["final_ranking_no_tie"] = range(1, len(arena_subset_df) + 1)
218
- arena_df["final_ranking_no_tie"] = range(1, len(arena_df) + 1)
219
- # join arena_df and arena_subset_df on index
220
- arena_df = arena_subset_df.join(
221
- arena_df["final_ranking"], rsuffix="_global", how="inner"
222
- )
223
- arena_df["ranking_difference"] = (
224
- arena_df["final_ranking_global"] - arena_df["final_ranking"]
225
- )
226
 
227
- arena_df = arena_df.sort_values(
228
- by=["final_ranking", "rating"], ascending=[True, False]
229
- )
230
- arena_df["final_ranking"] = arena_df.apply(
231
- lambda x: create_ranking_str(x["final_ranking"], x["ranking_difference"]),
232
- axis=1,
233
- )
234
 
235
- arena_df["final_ranking"] = arena_df["final_ranking"].astype(str)
236
 
237
- values = []
238
- for i in range(len(arena_df)):
239
- row = []
240
- model_key = arena_df.index[i]
241
- try:
242
- model_name = model_table_df[model_table_df["key"] == model_key][
243
- "Model"
244
- ].values[0]
245
- ranking = arena_df.iloc[i].get("final_ranking") or i + 1
246
- row.append(ranking)
247
- if arena_subset_df is not None:
248
- row.append(arena_df.iloc[i].get("ranking_difference") or 0)
249
- row.append(model_name)
250
- row.append(round(arena_df.iloc[i]["rating"]))
251
- upper_diff = round(
252
- arena_df.iloc[i]["rating_q975"] - arena_df.iloc[i]["rating"]
253
- )
254
- lower_diff = round(
255
- arena_df.iloc[i]["rating"] - arena_df.iloc[i]["rating_q025"]
256
  )
257
- row.append(f"+{upper_diff}/-{lower_diff}")
258
- row.append(round(arena_df.iloc[i]["num_battles"]))
259
- row.append(
260
- model_table_df[model_table_df["key"] == model_key][
261
- "Organization"
262
- ].values[0]
 
263
  )
264
- row.append(
265
- model_table_df[model_table_df["key"] == model_key]["License"].values[0]
266
  )
267
- cutoff_date = model_table_df[model_table_df["key"] == model_key][
268
- "Knowledge cutoff date"
269
- ].values[0]
270
- if cutoff_date == "-":
271
- row.append("Unknown")
272
  else:
273
- row.append(cutoff_date)
274
- values.append(row)
275
- except Exception as e:
276
- traceback.print_exc()
277
- print(f"{model_key} - {e}")
278
  return values
279
 
280
 
281
  key_to_category_name = {
282
- "full": "Overall",
 
283
  "crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
284
  "site_visitors/medium_prompts": "site_visitors/medium_prompts",
285
- "site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style control"
286
  }
287
  cat_name_to_explanation = {
 
288
  "Overall": "All queries",
289
- "crowdsourcing/simple_prompts": "Queries collected through crowdsourcing. Mostly simple ones.",
290
  "site_visitors/medium_prompts": "Queries from website visitors. Contain more complex prompts.",
291
- "site_visitors/medium_prompts:style control": "Queries from website visitors. Contain more complex prompts. [Reduced stylistic influence](https://lmsys.org/blog/2024-08-28-style-control/) of the response on the rating."
292
  }
293
  cat_name_to_baseline = {
294
- "Hard Prompts (English)": "English",
 
295
  }
296
 
297
  actual_categories = [
298
- # "Overall",
299
- # "crowdsourcing/simple_prompts",
 
300
  "site_visitors/medium_prompts",
301
  "site_visitors/medium_prompts:style control"
302
  ]
303
- req_cat = "site_visitors/medium_prompts:style control"
304
- # selected_category = req_cat if req_cat in actual_categories else "Overall"
305
- selected_category = req_cat if req_cat in actual_categories else "site_visitors/medium_prompts:style control"
 
 
306
 
307
 
308
  def read_elo_file(elo_results_file, leaderboard_table_file):
 
 
309
  arena_dfs = {}
310
  category_elo_results = {}
311
- with open(elo_results_file, "rb") as fin:
312
- elo_results = pickle.load(fin)
313
- last_updated_time = None
314
- if selected_category in elo_results:
315
- last_updated_time = elo_results[selected_category]["last_updated_datetime"].split(
316
- " "
317
- )[0]
318
- for k in key_to_category_name.keys():
319
- if k not in elo_results:
320
- continue
321
- arena_dfs[key_to_category_name[k]] = elo_results[k][
322
- "leaderboard_table_df"
323
- ]
324
- category_elo_results[key_to_category_name[k]] = elo_results[k]
325
-
326
- data = load_leaderboard_table_csv(leaderboard_table_file)
327
-
328
 
329
- model_table_df = pd.DataFrame(data)
330
 
 
331
  return last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df
332
 
333
 
334
  def build_leaderboard_tab(
335
  elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
336
  ):
337
- arena_dfs = {}
338
- arena_df = pd.DataFrame()
339
- category_elo_results = {}
340
-
341
- last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
342
-
343
- arena_df = arena_dfs[selected_category]
344
-
345
- p1 = category_elo_results[selected_category]["win_fraction_heatmap"]
346
- p2 = category_elo_results[selected_category]["battle_count_heatmap"]
347
- p3 = category_elo_results[selected_category]["bootstrap_elo_rating"]
348
- p4 = category_elo_results[selected_category]["average_win_rate_bar"]
349
- # arena_df = arena_dfs["Overall"]
350
- default_md = make_default_md_1()
351
- default_md_2 = make_default_md_2()
352
-
353
  with gr.Row():
354
  with gr.Column(scale=4):
 
355
  md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
356
  with gr.Column(scale=1):
357
  vote_button = gr.Button("Vote!", link="https://llmarena.ru")
358
  md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")
359
-
360
- if leaderboard_table_file:
361
- data = load_leaderboard_table_csv(leaderboard_table_file)
362
 
363
- model_table_df = pd.DataFrame(data)
364
-
365
- with gr.Tabs() as tabs:
366
- arena_table_vals = get_arena_table(arena_df, model_table_df, hidden_models=deprecated_model_name)
367
-
368
- with gr.Tab("Arena", id=0):
369
- md = make_arena_leaderboard_md(arena_dfs[selected_category], last_updated_time)
370
-
371
- lb_description = gr.Markdown(md, elem_id="leaderboard_markdown")
372
- with gr.Row():
373
- with gr.Column(scale=2):
374
- category_dropdown = gr.Dropdown(
375
- choices=actual_categories,
376
- value=selected_category,
377
- label="Category",
378
- )
379
-
380
- with gr.Column(scale=2):
381
- category_checkbox = gr.CheckboxGroup(
382
- ["Deprecated"],
383
- label="Filter",
384
- value=[],
385
- info="",
386
- )
387
- default_category_details = make_category_arena_leaderboard_md(
388
- arena_df, arena_df, name=selected_category
389
- )
390
-
391
- with gr.Column(scale=4, variant="panel"):
392
- category_deets = gr.Markdown(
393
- default_category_details, elem_id="category_deets"
394
- )
395
-
396
- arena_vals = pd.DataFrame(
397
- arena_table_vals,
398
- columns=[
399
- "Rank* (UB)",
400
- "Model",
401
- "Arena Elo",
402
- "95% CI",
403
- "Votes",
404
- "Organization",
405
- "License",
406
- "Knowledge Cutoff",
407
- ],
408
- )
409
- elo_display_df = gr.Dataframe(
410
- headers=[
411
- "Rank* (UB)",
412
- "Model",
413
- "Arena Elo",
414
- "95% CI",
415
- "Votes",
416
- "Organization",
417
- "License",
418
- "Knowledge Cutoff",
419
- ],
420
- datatype=[
421
- "str",
422
- "markdown",
423
- "number",
424
- "str",
425
- "number",
426
- "str",
427
- "str",
428
- "str",
429
- ],
430
- value=arena_vals.style,
431
- elem_id="arena_leaderboard_dataframe",
432
- height=700,
433
- column_widths=[70, 190, 100, 100, 90, 130, 150, 100],
434
- wrap=True,
435
  )
436
 
437
- gr.Markdown(
438
- elem_id="leaderboard_markdown",
439
  )
440
 
441
- leader_component_values[:] = [default_md, p1, p2, p3, p4]
442
 
443
- if show_plot:
444
- more_stats_md = gr.Markdown(
445
- f"""## More statistics on Chatbot Arena""",
446
- elem_id="leaderboard_header_markdown",
447
- )
448
- with gr.Row():
449
- with gr.Column():
450
- gr.Markdown(
451
- "#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
452
- elem_id="plot-title",
453
- )
454
- plot_3 = gr.Plot(p3, show_label=False)
455
- with gr.Column():
456
- gr.Markdown(
457
- "#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
458
- elem_id="plot-title",
459
- )
460
- plot_4 = gr.Plot(p4, show_label=False)
461
- with gr.Row():
462
- with gr.Column():
463
- gr.Markdown(
464
- "#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
465
- elem_id="plot-title",
466
- )
467
- plot_1 = gr.Plot(
468
- p1, show_label=False, elem_id="plot-container"
469
- )
470
- with gr.Column():
471
- gr.Markdown(
472
- "#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
473
- elem_id="plot-title",
474
- )
475
- plot_2 = gr.Plot(p2, show_label=False)
476
-
477
- if not show_plot:
478
- gr.Markdown(
479
- """
480
- """,
481
- elem_id="leaderboard_markdown",
482
- )
483
- else:
484
- pass
485
 
486
- def update_leaderboard_df(arena_table_vals):
487
- elo_datarame = pd.DataFrame(
488
- arena_table_vals,
 
489
  columns=[
490
- "Rank* (UB)",
491
- "Delta",
492
- "Model",
493
- "Arena Elo",
494
- "95% CI",
495
- "Votes",
496
- "Organization",
497
- "License",
498
- "Knowledge Cutoff",
499
- ],
500
- )
501
-
502
- def highlight_max(s):
503
- return [
504
- "color: green; font-weight: bold"
505
- if "\u2191" in v
506
- else "color: red; font-weight: bold"
507
- if "\u2193" in v
508
- else ""
509
- for v in s
510
  ]
 
 
 
 
 
 
 
 
511
 
512
- def highlight_rank_max(s):
513
- return [
514
- "color: green; font-weight: bold"
515
- if v > 0
516
- else "color: red; font-weight: bold"
517
- if v < 0
518
- else ""
519
- for v in s
520
- ]
521
 
522
- return elo_datarame.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
523
- highlight_rank_max, subset=["Delta"]
 
524
  )
525
 
526
- def update_leaderboard_and_plots(category, filters):
527
- _, arena_dfs, category_elo_results, _, model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
528
 
529
- arena_subset_df = arena_dfs[category]
530
- arena_subset_df = arena_subset_df[arena_subset_df["num_battles"] > 200]
531
- elo_subset_results = category_elo_results[category]
532
 
533
- baseline_category = cat_name_to_baseline.get(category, selected_category)
534
- arena_df = arena_dfs[baseline_category]
535
- arena_values = get_arena_table(
536
- arena_df,
537
- model_table_df,
538
- arena_subset_df=arena_subset_df if category != "Overall" else None,
539
- hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
540
- )
541
 
542
- # Filter plots based on deprecated models
543
- p1 = filter_deprecated_models_plots(
544
- elo_subset_results["win_fraction_heatmap"],
545
- hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
546
- )
547
- p2 = filter_deprecated_models_plots(
548
- elo_subset_results["battle_count_heatmap"],
549
- hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
550
- )
551
- p3 = filter_deprecated_models_plots(
552
- elo_subset_results["bootstrap_elo_rating"],
553
- hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
554
- )
555
- p4 = filter_deprecated_models_plots(
556
- elo_subset_results["average_win_rate_bar"],
557
- hidden_models=(None if len(filters) > 0 and "Deprecated" in filters else deprecated_model_name)
558
- )
559
- if category != "Overall":
560
- arena_values = update_leaderboard_df(arena_values)
561
- arena_values = gr.Dataframe(
562
- headers=[
563
- "Rank* (UB)",
564
- "Delta",
565
- "Model",
566
- "Arena Elo",
567
- "95% CI",
568
- "Votes",
569
- "Organization",
570
- "License",
571
- "Knowledge Cutoff",
572
- ],
573
- datatype=[
574
- "str",
575
- "number",
576
- "markdown",
577
- "number",
578
- "str",
579
- "number",
580
- "str",
581
- "str",
582
- "str",
583
  ],
584
- value=arena_values,
585
- elem_id="arena_leaderboard_dataframe",
586
- height=700,
587
- column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100],
588
- wrap=True,
589
  )
590
- else:
591
- arena_values = gr.Dataframe(
592
- headers=[
593
- "Rank* (UB)",
594
- "Model",
595
- "Arena Elo",
596
- "95% CI",
597
- "Votes",
598
- "Organization",
599
- "License",
600
- "Knowledge Cutoff",
601
- ],
602
- datatype=[
603
- "str",
604
- "markdown",
605
- "number",
606
- "str",
607
- "number",
608
- "str",
609
- "str",
610
- "str",
611
- ],
612
- value=arena_values,
613
- elem_id="arena_leaderboard_dataframe",
614
- height=700,
615
- column_widths=[70, 190, 100, 100, 90, 140, 150, 100],
616
- wrap=True,
617
  )
 
618
 
619
- p1 = elo_subset_results["win_fraction_heatmap"]
620
- p2 = elo_subset_results["battle_count_heatmap"]
621
- p3 = elo_subset_results["bootstrap_elo_rating"]
622
- p4 = elo_subset_results["average_win_rate_bar"]
623
- more_stats_md = f"""## More Statistics for Chatbot Arena - {category}
624
- """
625
- leaderboard_md = make_category_arena_leaderboard_md(
626
- arena_df, arena_subset_df, name=category
627
- )
628
- return arena_values, p1, p2, p3, p4, more_stats_md, leaderboard_md
629
-
630
- if leaderboard_table_file:
631
- category_dropdown.change(
632
- fn=update_leaderboard_and_plots,
633
- inputs=[category_dropdown, category_checkbox],
634
- outputs=[
635
- elo_display_df,
636
- plot_1,
637
- plot_2,
638
- plot_3,
639
- plot_4,
640
- more_stats_md,
641
- category_deets,
642
- ],
643
- )
644
- category_checkbox.change(
645
- update_leaderboard_and_plots,
646
- inputs=[category_dropdown, category_checkbox],
647
- outputs=[
648
- elo_display_df,
649
- plot_1,
650
- plot_2,
651
- plot_3,
652
- plot_4,
653
- more_stats_md,
654
- category_deets,
655
- ],
656
  )
657
- if show_plot and leaderboard_table_file:
658
- return [md_1, md_2, lb_description, category_deets, elo_display_df, plot_1, plot_2, plot_3, plot_4]
659
- return [md_1]
660
 
661
 
662
- def build_demo(elo_results_file, leaderboard_table_file):
663
- text_size = gr.themes.sizes.text_lg
664
- theme = gr.themes.Default.load("theme.json")
665
- theme.text_size = text_size
666
- theme.set(
667
- button_large_text_size="40px",
668
- button_small_text_size="40px",
669
- button_large_text_weight="1000",
670
- button_small_text_weight="1000",
671
- button_shadow="*shadow_drop_lg",
672
- button_shadow_hover="*shadow_drop_lg",
673
- checkbox_label_shadow="*shadow_drop_lg",
674
- button_shadow_active="*shadow_inset",
675
- button_secondary_background_fill="*primary_300",
676
- button_secondary_background_fill_dark="*primary_700",
677
- button_secondary_background_fill_hover="*primary_200",
678
- button_secondary_background_fill_hover_dark="*primary_500",
679
- button_secondary_text_color="*primary_800",
680
- button_secondary_text_color_dark="white",
681
- )
682
 
683
- with gr.Blocks(
684
- title="LLM arena: leaderboard",
685
- theme=theme,
686
- css=block_css,
687
- ) as demo:
688
- build_leaderboard_tab(
689
- elo_results_file, leaderboard_table_file, show_plot=True, mirror=True
690
- )
691
- return demo
692
 
693
- block_css = """
694
- #notice_markdown .prose {
695
- font-size: 110% !important;
696
- }
697
- #notice_markdown th {
698
- display: none;
699
- }
700
- #notice_markdown td {
701
- padding-top: 6px;
702
- padding-bottom: 6px;
703
- }
704
- #arena_leaderboard_dataframe table {
705
- font-size: 110%;
706
- }
707
- #full_leaderboard_dataframe table {
708
- font-size: 110%;
709
- }
710
- #model_description_markdown {
711
- font-size: 110% !important;
712
- }
713
- #leaderboard_markdown .prose {
714
- font-size: 110% !important;
715
- }
716
- #leaderboard_markdown td {
717
- padding-top: 6px;
718
- padding-bottom: 6px;
719
- }
720
- #leaderboard_dataframe td {
721
- line-height: 0.1em;
722
- }
723
- #about_markdown .prose {
724
- font-size: 110% !important;
725
- }
726
- #ack_markdown .prose {
727
- font-size: 110% !important;
728
- }
729
- #chatbot .prose {
730
- font-size: 105% !important;
731
- }
732
- .sponsor-image-about img {
733
- margin: 0 20px;
734
- margin-top: 20px;
735
- height: 40px;
736
- max-height: 100%;
737
- width: auto;
738
- float: left;
739
- }
740
 
741
- .chatbot h1, h2, h3 {
742
- margin-top: 8px; /* Adjust the value as needed */
743
- margin-bottom: 0px; /* Adjust the value as needed */
744
- padding-bottom: 0px;
745
- }
746
 
747
- .chatbot h1 {
748
- font-size: 130%;
749
- }
750
- .chatbot h2 {
751
- font-size: 120%;
752
- }
753
- .chatbot h3 {
754
- font-size: 110%;
755
- }
756
- .chatbot p:not(:first-child) {
757
- margin-top: 8px;
758
- }
759
 
760
- .typing {
761
- display: inline-block;
762
- }
763
 
764
- .cursor {
765
- display: inline-block;
766
- width: 7px;
767
- height: 1em;
768
- background-color: black;
769
- vertical-align: middle;
770
- animation: blink 1s infinite;
771
- }
772
 
773
- .dark .cursor {
774
- display: inline-block;
775
- width: 7px;
776
- height: 1em;
777
- background-color: white;
778
- vertical-align: middle;
779
- animation: blink 1s infinite;
780
- }
781
 
782
- @keyframes blink {
783
- 0%, 50% { opacity: 1; }
784
- 50.1%, 100% { opacity: 0; }
785
- }
786
 
787
- .app {
788
- max-width: 100% !important;
789
- padding: 20px !important;
790
- }
791
 
792
- a {
793
- color: #1976D2; /* Your current link color, a shade of blue */
794
- text-decoration: none; /* Removes underline from links */
795
- }
796
- a:hover {
797
- color: #63A4FF; /* This can be any color you choose for hover */
798
- text-decoration: underline; /* Adds underline on hover */
799
- }
800
- """
801
 
802
 
803
  if __name__ == "__main__":
804
  parser = argparse.ArgumentParser()
805
- parser.add_argument("--share", action="store_true")
806
  parser.add_argument("--host", default="0.0.0.0")
807
  parser.add_argument("--port", type=int, default=7860)
 
808
  args = parser.parse_args()
809
 
810
- elo_result_files = glob.glob("elo_results_*.pkl")
811
- elo_result_files.sort(key=lambda x: int(x[12:-4]))
812
- elo_result_file = elo_result_files[-1]
813
-
814
- leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
815
- leaderboard_table_files.sort(key=lambda x: int(x[18:-4]))
816
- leaderboard_table_file = leaderboard_table_files[-1]
817
 
818
  demo = build_demo(elo_result_file, leaderboard_table_file)
819
- demo.launch(show_api=False)
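
The `__main__` block above picks the newest results file by slicing the numeric stamp out of the filename and sorting on it; a quick illustration of what `x[12:-4]` extracts, using a made-up filename:

```python
fname = "elo_results_20240101.pkl"                 # hypothetical file name
stamp = fname[len("elo_results_"):-len(".pkl")]    # same as fname[12:-4]
print(stamp, int(stamp))                           # "20240101" 20240101 -> used as the sort key
```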
4
  import pickle
5
  import traceback
6
  import numpy as np
 
7
 
8
  import pandas as pd
9
  import gradio as gr
10
  import numpy as np
11
 
12
13
  promo_banner = """
14
  <div style="background-color: #ffcc00; color: black; padding: 10px; text-align: center; font-weight: bold; font-size: 18px; border: 2px solid #000;">
15
  USE THE LATEST VERSIONS OF THE BEST CHATBOTS IN RUSSIAN FOR FREE
 
18
 
19
  deprecated_model_name = [
20
  "GigaChat 3.1.25.3",
21
+ "GigaChat-Pro 2.2.25.3",
22
  "saiga_llama3_8b_v6",
23
  "saiga_phi3_medium",
24
  "GigaChat-Plus 3.1.25.3",
25
  "GigaChat-Pro 4.0.26.8",
26
  "GigaChat 4.0.26.8",
27
+ "xAI: Grok 2",
28
  "GigaChat-Pro 4.0.26.15",
29
  "GigaChat 4.0.26.15",
30
+ "YandexGPT Experimental", "yandex-gpt-arena",
31
+ "RefalMachine/ruadapt_llama3_instruct_lep_saiga_kto_ablitirated"
32
+ ]
33
+
34
+ models_10b = [
35
+ "saiga_llama3_8b_v7",
36
+ "Vikhrmodels/Vikhr-YandexGPT-5-Lite-8B-it",
37
+ "T-lite-instruct-0.1",
38
+ "t-tech/T-lite-it-1.0",
39
+ "LLaMA-3 Chat (8B)",
40
+ "Llama 3.1 8B Instruct Turbo",
41
+ "MTSAIR/Cotype-Nano"
42
  ]
43
 
44
+
45
  def make_default_md_1():
46
  leaderboard_md = f"""
47
  # 🏆 LLM Arena in Russian: Leaderboard
 
51
  """
52
  return leaderboard_md
53
 
 
54
  def make_default_md_2():
55
  leaderboard_md = f"""
56
+
57
  The LLM Arena platform is an open crowdsourcing platform for evaluating large language models (LLM) in Russian. We collect pairwise comparisons from people to rank LLMs using the Bradley-Terry model and display model ratings on the Elo scale.
58
  Chatbot Arena in Russian depends on community participation, so please contribute by casting your vote!
59
+
60
  - To **add your model** to the comparison, contact us on TG: [Group](https://t.me/+bFEOl-Bdmok4NGUy)
61
  - If you **found a bug** or **have a suggestion**, contact us: [Roman](https://t.me/roman_kucev)
62
  - You can **contribute your vote** at [llmarena.ru](https://llmarena.ru/)!
63
  """
 
64
  return leaderboard_md
65
 
66
 
67
  def make_arena_leaderboard_md(arena_df, last_updated_time):
68
+ # Using version from monitor.py (translated)
69
+ total_votes = sum(arena_df["num_battles"]) if not arena_df.empty else 0
70
  total_models = len(arena_df)
71
+ space = "   " # Using HTML space
72
 
73
  leaderboard_md = f"""
74
  Total # of models: **{total_models}**.{space} Total # of votes: **{"{:,}".format(total_votes)}**.{space} Last updated: {last_updated_time}.
 
81
 
82
 
83
  def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="site_visitors/medium_prompts:style control"):
84
+ total_votes = sum(arena_df["num_battles"]) if not arena_df.empty else 0
85
  total_models = len(arena_df)
86
+ space = "   "
87
+ total_subset_votes = sum(arena_subset_df["num_battles"]) if not arena_subset_df.empty else 0
88
  total_subset_models = len(arena_subset_df)
89
+
90
+ perc_models = round(total_subset_models / total_models * 100) if total_models > 0 else 0
91
+ perc_votes = round(total_subset_votes / total_votes * 100) if total_votes > 0 else 0
92
+
93
+ leaderboard_md = f"""### {cat_name_to_explanation.get(name, name)}
94
+ #### {space} #models: **{total_subset_models} ({perc_models}%)** {space} #votes: **{"{:,}".format(total_subset_votes)} ({perc_votes}%)**{space}
95
  """
96
  return leaderboard_md
97
 
 
98
  def model_hyperlink(model_name, link):
99
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
100
 
101
 
102
+ def filter_deprecated_models_plots(fig, hidden_models=None, limit_to_top=25):
103
  """
104
+ Filters Plotly plots to show only top N models and optionally removes specific models.
105
 
106
  Args:
107
  fig: The Plotly figure object.
108
+ hidden_models (list, optional): A list of model names to remove. Defaults to None.
109
+ limit_to_top (int, optional): Limit display to top N models (0 or None means no limit). Defaults to 25.
110
+
111
+ Returns:
112
+ Plotly figure: The filtered figure object or the original if filtering fails or is not applicable.
113
  """
114
  if fig is None:
115
+ return None
116
+
117
+ # Check if the figure has data
118
+ if not hasattr(fig, 'data') or len(fig.data) == 0:
119
+ return fig
120
+
121
+ # Check if data has a type attribute
122
+ if not hasattr(fig.data[0], 'type'):
123
  return fig
124
 
125
+ # Check minimum number of models after initial hidden_models filtering
126
+ models_to_check = []
127
+ if hasattr(fig.data[0], 'x'):
128
+ models_to_check = fig.data[0].x
129
+ elif hasattr(fig.data[0], 'y'): # For some types like bar, X axis might be numeric
130
+ models_to_check = fig.data[0].y
131
+
132
+ if hidden_models is not None and models_to_check.any():
133
+ available_models = [x for x in models_to_check if x not in hidden_models]
134
+ # print(f"Available models before top N: {len(available_models)}") # Debug
135
+ if len(available_models) <= 2: # If less than 3 models remain before top_n
136
+ # print(f"Warning: Too few models left after initial filtering ({len(available_models)}), returning original plot.")
137
+ return fig # Return the original plot if too few models
138
+
139
+ if limit_to_top is not None and limit_to_top <= 0:
140
+ limit_to_top = None
141
+
142
+ try:
143
+ # Work on a deep copy to avoid modifying the original figure object
144
+ fig_copy = pickle.loads(pickle.dumps(fig))
145
+ data = fig_copy.data[0]
146
+
147
+ if data.type == 'heatmap':
148
+ # Apply hidden models filter
149
+ mask_x = ~np.isin(data.x, hidden_models) if hidden_models is not None else np.ones_like(data.x, dtype=bool)
150
+ mask_y = ~np.isin(data.y, hidden_models) if hidden_models is not None else np.ones_like(data.y, dtype=bool)
151
+
152
+ # Get initially filtered X and Y arrays
153
+ filtered_x = np.array(data.x)[mask_x]
154
+ filtered_y = np.array(data.y)[mask_y]
155
+
156
+ # Apply top N limit (assuming the order is already by rank/rating)
157
+ if limit_to_top is not None and len(filtered_x) > limit_to_top:
158
+ top_models = filtered_x[:limit_to_top]
159
+ # Create new masks based on the top models relative to the *original* data axes
160
+ mask_x = np.isin(data.x, top_models)
161
+ mask_y = np.isin(data.y, top_models)
162
+ # Get final filtered axes
163
+ filtered_x = np.array(data.x)[mask_x]
164
+ filtered_y = np.array(data.y)[mask_y]
165
+ elif len(filtered_x) <= 2: # If <=2 models remain after filtering
166
+ return fig # Return original
167
+
168
+ # Update the heatmap data
169
+ data.x = filtered_x
170
+ data.y = filtered_y
171
+ # Important: Indexing 'z' must use masks derived from the *original* data order
172
+ z_original = np.array(fig.data[0].z)
173
+ data.z = z_original[np.ix_(mask_y, mask_x)]
174
+
175
+ elif data.type == 'scatter':
176
+ trace = data
177
+ # Apply hidden models filter
178
+ mask = ~np.isin(trace.x, hidden_models) if hidden_models is not None else np.ones_like(trace.x, dtype=bool)
179
+
180
+ # Get initially filtered arrays
181
+ current_x = np.array(trace.x)[mask]
182
+ current_y = np.array(trace.y)[mask]
183
+ current_text = np.array(trace.text)[mask] if hasattr(trace, 'text') and trace.text is not None else None
184
+ # Handle error bars safely
185
+ current_error_y_array = np.array(trace.error_y['array'])[mask] if 'error_y' in trace and 'array' in trace.error_y and trace.error_y['array'] is not None else None
186
+ current_error_y_arrayminus = np.array(trace.error_y['arrayminus'])[mask] if 'error_y' in trace and 'arrayminus' in trace.error_y and trace.error_y['arrayminus'] is not None else None
187
+
188
+ # Apply top N limit
189
+ if limit_to_top is not None and len(current_x) > limit_to_top:
190
+ # Sort by y-value (rating) descending to find the top N
191
+ sort_indices = np.argsort(-current_y)[:limit_to_top]
192
+ current_x = current_x[sort_indices]
193
+ current_y = current_y[sort_indices]
194
+ if current_text is not None:
195
+ current_text = current_text[sort_indices]
196
+ if current_error_y_array is not None:
197
+ current_error_y_array = current_error_y_array[sort_indices]
198
+ if current_error_y_arrayminus is not None:
199
+ current_error_y_arrayminus = current_error_y_arrayminus[sort_indices]
200
+ elif len(current_x) <= 2: # If <=2 models remain after filtering
201
+ return fig # Return original
202
+
203
+ # Update the scatter trace data
204
+ trace.x, trace.y = current_x, current_y
205
+ if current_text is not None:
206
+ trace.text = current_text
207
+ # Update error bars if they exist
208
+ if current_error_y_array is not None:
209
+ # Ensure error_y exists before assigning
210
+ if 'error_y' not in trace: trace.error_y = {}
211
+ trace.error_y['array'] = current_error_y_array
212
+ if current_error_y_arrayminus is not None:
213
+ if 'error_y' not in trace: trace.error_y = {}
214
+ trace.error_y['arrayminus'] = current_error_y_arrayminus
215
+
216
+ elif data.type == 'bar':
217
+ trace = data
218
+ # Apply hidden models filter
219
+ mask = ~np.isin(trace.x, hidden_models) if hidden_models is not None else np.ones_like(trace.x, dtype=bool)
220
+
221
+ # Get initially filtered arrays
222
+ current_x = np.array(trace.x)[mask]
223
+ current_y = np.array(trace.y)[mask]
224
+
225
+ # Apply top N limit
226
+ if limit_to_top is not None and len(current_x) > limit_to_top:
227
+ # Sort by y-value (rating) descending
228
+ sort_indices = np.argsort(-current_y)[:limit_to_top]
229
+ current_x = current_x[sort_indices]
230
+ current_y = current_y[sort_indices]
231
+ elif len(current_x) <= 2: # If <=2 models remain after filtering
232
+ return fig # Return original
233
+
234
+ # Update the bar trace data
235
+ trace.x, trace.y = current_x, current_y
236
+
237
+ return fig_copy
238
+
239
+ except Exception as e:
240
+ print(f"Error filtering plot: {e}")
241
+ traceback.print_exc()
242
+ return fig # Return original figure on error
243
+
244
 
245
  def load_leaderboard_table_csv(filename, add_hyperlink=True):
246
  lines = open(filename).readlines()
 
248
  rows = []
249
  for i in range(1, len(lines)):
250
  row = [v.strip() for v in lines[i].split(",")]
251
+ item = {} # Create dictionary once per row
252
+ for h, v in zip(heads, row):
253
+ if h == "Arena Elo rating":
254
+ if v != "-":
255
+ try:
256
  v = int(ast.literal_eval(v))
257
+ except:
258
+ v = np.nan # Handle parsing errors
259
+ else:
260
+ v = np.nan
261
+ item[h] = v
262
+ if add_hyperlink and "Model" in item and "Link" in item: # Check keys exist
263
+ # Check for empty/missing link
264
+ if item["Link"] and item["Link"] != "-":
265
  item["Model"] = model_hyperlink(item["Model"], item["Link"])
266
+ # Otherwise, keep the model name as is
267
  rows.append(item)
 
268
  return rows
269
 
270
 
271
  def create_ranking_str(ranking, ranking_difference):
272
+ # Convert rank to int before comparison
273
+ try:
274
+ # Ensure rank and difference are treated as numbers
275
+ ranking_val = int(float(ranking)) # Handle potential float input
276
+ ranking_difference_val = int(float(ranking_difference))
277
+ if ranking_difference_val > 0:
278
+ return f"{ranking_val} ↑"
279
+ elif ranking_difference_val < 0:
280
+ return f"{ranking_val} ↓"
281
+ else:
282
+ return f"{ranking_val}"
283
+ except (ValueError, TypeError): # Handle cases where rank is not numeric
284
+ return str(ranking)
285
 
286
 
287
  def recompute_final_ranking(arena_df):
 
288
  ranking = {}
289
+ if arena_df.empty:
290
+ return []
291
+
292
+ model_indices = arena_df.index
293
+ # Ensure CI columns exist before trying to access them
294
+ if "rating_q025" not in arena_df.columns or "rating_q975" not in arena_df.columns:
295
+ print("Warning: Confidence interval columns ('rating_q025', 'rating_q975') not found in DataFrame. Cannot compute UB Rank.")
296
+ # Return NaN or simple rank based on order
297
+ return [np.nan] * len(model_indices) # Or range(1, len(model_indices) + 1)
298
+
299
+ ratings_q025 = arena_df["rating_q025"].to_dict()
300
+ ratings_q975 = arena_df["rating_q975"].to_dict()
301
+
302
+ for model_a in model_indices:
303
+ rank = 1
304
+ rating_a_q975 = ratings_q975.get(model_a)
305
+ # Skip if model A has no CI data
306
+ if pd.isna(rating_a_q975):
307
+ ranking[model_a] = np.nan # Or assign max rank + 1
308
+ continue
309
+
310
+ for model_b in model_indices:
311
+ if model_a == model_b:
312
  continue
313
+
314
+ rating_b_q025 = ratings_q025.get(model_b)
315
+ # Skip comparison if model B has no CI data
316
+ if pd.isna(rating_b_q025):
317
+ continue
318
+
319
+ # Check if B is statistically better than A
320
+ if rating_b_q025 > rating_a_q975:
321
+ rank += 1
322
+ ranking[model_a] = rank
323
  return list(ranking.values())
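
To make the upper-bound rule above concrete (a model's rank is 1 plus the number of models whose lower confidence bound clears its upper bound), here is a made-up three-model frame; the numbers are illustrative only:

```python
import pandas as pd

df = pd.DataFrame(
    {
        "rating":      [1210, 1180, 1100],
        "rating_q025": [1190, 1150, 1080],
        "rating_q975": [1230, 1210, 1120],
    },
    index=["model_a", "model_b", "model_c"],
)

# model_a: no rival's lower bound exceeds 1230          -> rank 1
# model_b: its CI overlaps model_a's                    -> rank 1 (statistical tie)
# model_c: both 1190 and 1150 exceed its 1120 upper CI  -> rank 3
print(recompute_final_ranking(df))   # [1, 1, 3]
```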
324
 
325
 
326
  def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
327
+ """
328
+ Generates the leaderboard table data.
329
+ 'use_cache' parameter removed.
330
+ """
331
+ # print(f'Calculating get_arena_table') # Debug
332
+
333
+ # Create copies to avoid modifying original DataFrames
334
+ arena_df_processed = arena_df.copy()
335
+ if arena_subset_df is not None:
336
+ arena_subset_df_processed = arena_subset_df.copy()
337
+ else:
338
+ arena_subset_df_processed = None
339
+
340
+ # Sort by rating initially to have a stable order before ranking
341
+ arena_df_processed = arena_df_processed.sort_values(by=["rating"], ascending=False)
342
+ # Compute 'final_ranking' based on CIs if possible
343
+ if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns:
344
+ arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed)
345
+ arena_df_processed = arena_df_processed.sort_values(
346
+ by=["final_ranking", "rating"], ascending=[True, False]
347
+ )
348
+ else:
349
+ # Fallback to simple ordering if CI columns are missing
350
+ arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1)
351
+
352
  if hidden_models:
353
+ arena_df_processed = arena_df_processed[~arena_df_processed.index.isin(hidden_models)].copy()
354
+ # Recompute ranks for the filtered view
355
+ if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns:
356
+ arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed)
357
+ # Re-sort based on new ranks
358
+ arena_df_processed = arena_df_processed.sort_values(
359
+ by=["final_ranking", "rating"], ascending=[True, False]
360
+ )
361
+ else:
362
+ arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1)
363
 
 
 
 
364
 
365
+ if arena_subset_df_processed is not None:
366
+ # Filter subset by hidden_models first
367
+ if hidden_models:
368
+ arena_subset_df_processed = arena_subset_df_processed[~arena_subset_df_processed.index.isin(hidden_models)].copy()
369
 
370
+ # Ensure models in the subset are also present in the (filtered) main view
371
+ arena_subset_df_processed = arena_subset_df_processed[arena_subset_df_processed.index.isin(arena_df_processed.index)]
372
 
373
+ # Proceed only if subset is not empty and has CI columns
374
+ if not arena_subset_df_processed.empty and "rating_q025" in arena_subset_df_processed.columns and "rating_q975" in arena_subset_df_processed.columns:
375
+ # Rank within the subset
376
+ arena_subset_df_processed = arena_subset_df_processed.sort_values(by=["rating"], ascending=False)
377
+ arena_subset_df_processed["final_ranking_subset"] = recompute_final_ranking(arena_subset_df_processed) # Rank within category
 
 
378
 
379
+ # Filter the main processed DF to only include models from the subset
380
+ # 'final_ranking' here represents the rank *among these models* in the baseline category view
381
+ arena_df_for_join = arena_df_processed[arena_df_processed.index.isin(arena_subset_df_processed.index)][["final_ranking", "rating"]].copy()
382
+ arena_df_for_join.rename(columns={"final_ranking": "final_ranking_baseline"}, inplace=True)
383
 
384
+
385
+ # Join the subset ranks and baseline ranks
386
+ arena_df_combined = arena_subset_df_processed[["final_ranking_subset", "rating"]].join(
387
+ arena_df_for_join["final_ranking_baseline"], how="inner"
388
  )
389
+
390
+ # Calculate rank difference
391
+ arena_df_combined["ranking_difference"] = arena_df_combined["final_ranking_baseline"] - arena_df_combined["final_ranking_subset"]
392
+
393
+ # Sort by subset rank and rating
394
+ arena_df_combined = arena_df_combined.sort_values(
395
+ by=["final_ranking_subset", "rating"], ascending=[True, False]
396
  )
397
+
398
+ # Format the rank string with delta for display
399
+ arena_df_combined["display_ranking"] = arena_df_combined.apply(
400
+ lambda x: create_ranking_str(x["final_ranking_subset"], x["ranking_difference"]),
401
+ axis=1,
402
  )
403
+ arena_df_processed = arena_df_processed.loc[arena_df_combined.index] # Reorder arena_df_processed
404
+
405
+ columns_to_join = ["display_ranking", "ranking_difference", "final_ranking_subset"]
406
+ columns_to_join = [col for col in columns_to_join if col in arena_df_combined.columns]
407
+ arena_df_processed = arena_df_processed.join(arena_df_combined[columns_to_join], how="inner")
408
+
409
+
410
+ # Now sorting should work as the column exists
411
+ # Use the subset rank for final sorting if subset is active
412
+ # Check if 'final_ranking_subset' was successfully joined before sorting
413
+ if "final_ranking_subset" in arena_df_processed.columns:
414
+ arena_df_processed.sort_values(by=["final_ranking_subset", "rating"], ascending=[True, False], inplace=True)
415
  else:
416
+ # Fallback sort if join failed for some reason
417
+ arena_df_processed.sort_values(by=["rating"], ascending=False, inplace=True)
418
+
419
+
420
+ else:
421
+ # If subset is empty or lacks CI, disable subset logic
422
+ arena_subset_df_processed = None
423
+ # Use the baseline ranking as the display ranking
424
+ arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str)
425
+ arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True)
426
+
427
+
428
+ else:
429
+ # If no subset is used, display ranking is just the final rank from the main DF
430
+ arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str)
431
+ # Ensure it's sorted correctly
432
+ arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True)
433
+
434
+
435
+ values = []
436
+ # Iterate using the final sorted index of arena_df_processed
437
+ for model_key in arena_df_processed.index:
438
+ row_data = arena_df_processed.loc[model_key]
439
+ # Find model metadata
440
+ model_info = model_table_df[model_table_df["key"] == model_key]
441
+ if model_info.empty:
442
+ # print(f"Warning: Model key '{model_key}' not found in model_table_df. Skipping.")
443
+ continue # Skip if no metadata
444
+
445
+ row = []
446
+ # Rank (Display)
447
+ row.append(row_data.get("display_ranking", "")) # Use the calculated display rank
448
+
449
+ # Delta (only if subset was processed successfully)
450
+ if arena_subset_df_processed is not None:
451
+ row.append(row_data.get("ranking_difference", 0))
452
+
453
+ # Model Name (hyperlink applied during loading)
454
+ row.append(model_info["Model"].values[0])
455
+
456
+ # Arena Elo
457
+ row.append(round(row_data["rating"]))
458
+
459
+ # 95% CI
460
+ # Check for NaN before calculation
461
+ upper_rating = row_data.get("rating_q975")
462
+ lower_rating = row_data.get("rating_q025")
463
+ current_rating = row_data.get("rating")
464
+ upper_diff = round(upper_rating - current_rating) if pd.notna(upper_rating) and pd.notna(current_rating) else '?'
465
+ lower_diff = round(current_rating - lower_rating) if pd.notna(current_rating) and pd.notna(lower_rating) else '?'
466
+ row.append(f"+{upper_diff}/-{lower_diff}")
467
+
468
+
469
+ # Votes
470
+ row.append(round(row_data["num_battles"]))
471
+
472
+ # Organization
473
+ row.append(model_info["Organization"].values[0])
474
+
475
+ # License
476
+ row.append(model_info["License"].values[0])
477
+
478
+ # Knowledge Cutoff
479
+ cutoff_date = model_info["Knowledge cutoff date"].values[0]
480
+ row.append("Unknown" if cutoff_date == "-" else cutoff_date)
481
+
482
+ values.append(row)
483
+
484
  return values
485
 
486
 
487
  key_to_category_name = {
488
+ # Mapping from internal key to display name (kept English for consistency)
489
+ "full": "Overall", # Might not be used if filtered out later
490
  "crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
491
  "site_visitors/medium_prompts": "site_visitors/medium_prompts",
492
+ "site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style_control" # Use underscore for display consistency if needed
493
  }
494
  cat_name_to_explanation = {
495
+ # Translated explanations for display
496
  "Overall": "All queries",
497
+ "crowdsourcing/simple_prompts": "Queries collected via crowdsourcing. Mostly simple ones.",
498
  "site_visitors/medium_prompts": "Queries from website visitors. Contain more complex prompts.",
499
+ "site_visitors/medium_prompts:style_control": "Queries from website visitors. Contain more complex prompts. [Reduced stylistic influence](https://lmsys.org/blog/2024-08-28-style-control/) of the response on the rating."
500
  }
501
  cat_name_to_baseline = {
502
+ # Baseline category for comparison (if needed, seems unused now but kept)
503
+ # "Hard Prompts (English)": "English",
504
  }
505
 
506
  actual_categories = [
507
+ # Categories available in the dropdown (use the *keys* from key_to_category_name)
508
+ # "Overall", # Removed
509
+ # "crowdsourcing/simple_prompts", # Removed
510
  "site_visitors/medium_prompts",
511
  "site_visitors/medium_prompts:style control"
512
  ]
513
+ # Default selected category key
514
+ req_cat_key = "site_visitors/medium_prompts:style control"
515
+ selected_category_key = req_cat_key if req_cat_key in actual_categories else ("site_visitors/medium_prompts" if "site_visitors/medium_prompts" in actual_categories else (actual_categories[0] if actual_categories else None))
516
+ # Get the display name for the selected category
517
+ selected_category_display_name = key_to_category_name.get(selected_category_key, selected_category_key) # Fallback to key if not found
518
 
519
 
520
  def read_elo_file(elo_results_file, leaderboard_table_file):
521
+ # Version from monitor.py, but no lazy_load or caching
522
+ print('Reading Elo file...')
523
  arena_dfs = {}
524
  category_elo_results = {}
525
+ last_updated_time = "N/A" # Default value
526
+ elo_results = {} # Default value
527
+ model_table_df = pd.DataFrame() # Default value
528
+
529
+ try:
530
+ # Use context manager for file operations
531
+ with open(elo_results_file, "rb") as fin:
532
+ elo_results = pickle.load(fin)
533
+
534
+ # Try to get last updated time from primary or fallback categories
535
+ main_cat_key = "site_visitors/medium_prompts:style control"
536
+ fallback_cat_key_1 = "site_visitors/medium_prompts"
537
+ fallback_cat_key_2 = "full" # Another fallback
538
+
539
+ if main_cat_key in elo_results and "last_updated_datetime" in elo_results[main_cat_key]:
540
+ last_updated_time = elo_results[main_cat_key]["last_updated_datetime"].split(" ")[0]
541
+ elif fallback_cat_key_1 in elo_results and "last_updated_datetime" in elo_results[fallback_cat_key_1]:
542
+ last_updated_time = elo_results[fallback_cat_key_1]["last_updated_datetime"].split(" ")[0]
543
+ elif fallback_cat_key_2 in elo_results and "last_updated_datetime" in elo_results[fallback_cat_key_2]:
544
+ last_updated_time = elo_results[fallback_cat_key_2]["last_updated_datetime"].split(" ")[0]
545
+
546
+ # Iterate through defined category keys
547
+ for key in key_to_category_name.keys():
548
+ display_name = key_to_category_name[key] # Get the display name
549
+ if key in elo_results:
550
+ # Check for required data within the category result
551
+ if "leaderboard_table_df" in elo_results[key] and isinstance(elo_results[key]["leaderboard_table_df"], pd.DataFrame):
552
+ df = elo_results[key]["leaderboard_table_df"]
553
+ # Filter by number of battles > 200
554
+ # Store using the *display_name* as the key for consistency with dropdown/UI
555
+ arena_dfs[display_name] = df[df["num_battles"] > 200].copy()
556
+ category_elo_results[display_name] = elo_results[key]
557
+ # else:
558
+ # print(f"Warning: 'leaderboard_table_df' not found or not a DataFrame for key '{key}'")
559
+ # else:
560
+ # print(f"Warning: Key '{key}' not found in elo_results")
561
+
562
+ # Load model metadata CSV
563
+ data = load_leaderboard_table_csv(leaderboard_table_file)
564
+ model_table_df = pd.DataFrame(data)
565
 
566
+ except FileNotFoundError:
567
+ print(f"Error: Elo results file not found at {elo_results_file}")
568
+ # Return empty structures
569
+ except Exception as e:
570
+ print(f"Error reading elo file: {e}")
571
+ traceback.print_exc()
572
+ # Return empty structures
573
 
574
+ # Ensure correct data types are returned even on error
575
  return last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df
576
 
577
 
578
  def build_leaderboard_tab(
579
  elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
580
  ):
581
+ # Load data once during build time
582
+ try:
583
+ last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
584
+ except Exception as e:
585
+ print(f"Failed to load initial data: {e}")
586
+ # Set empty defaults to prevent app crash
587
+ last_updated_time = "Error"
588
+ arena_dfs = {}
589
+ category_elo_results = {}
590
+ elo_results = {}
591
+ model_table_df = pd.DataFrame()
592
+
593
+ # Get data for the default selected category
594
+ # Use the *display name* derived from the selected key
595
+ if selected_category_display_name in arena_dfs:
596
+ arena_df = arena_dfs[selected_category_display_name]
597
+ elo_subset_results_init = category_elo_results[selected_category_display_name]
598
+ p1_init = elo_subset_results_init.get("win_fraction_heatmap")
599
+ p2_init = elo_subset_results_init.get("battle_count_heatmap")
600
+ p3_init = elo_subset_results_init.get("bootstrap_elo_rating")
601
+ p4_init = elo_subset_results_init.get("average_win_rate_bar")
602
+ else:
603
+ # Fallback if default category is missing
604
+ fallback_cat_display_name = None
605
+ if actual_categories:
606
+ # Try the first actual category's display name
607
+ first_cat_key = actual_categories[0]
608
+ fallback_cat_display_name = key_to_category_name.get(first_cat_key, first_cat_key)
609
+
610
+ if fallback_cat_display_name and fallback_cat_display_name in arena_dfs:
611
+ print(f"Warning: Selected category '{selected_category_display_name}' not found. Falling back to '{fallback_cat_display_name}'.")
612
+ arena_df = arena_dfs[fallback_cat_display_name]
613
+ elo_subset_results_init = category_elo_results[fallback_cat_display_name]
614
+ p1_init = elo_subset_results_init.get("win_fraction_heatmap")
615
+ p2_init = elo_subset_results_init.get("battle_count_heatmap")
616
+ p3_init = elo_subset_results_init.get("bootstrap_elo_rating")
617
+ p4_init = elo_subset_results_init.get("average_win_rate_bar")
618
+ else:
619
+ print(f"Warning: Default category '{selected_category_display_name}' and fallback categories not found in data.")
620
+ arena_df = pd.DataFrame() # Empty DataFrame
621
+ p1_init, p2_init, p3_init, p4_init = None, None, None, None
622
+
623
+ # Apply initial filtering to plots
624
+ p1_init = filter_deprecated_models_plots(p1_init, hidden_models=deprecated_model_name)
625
+ p2_init = filter_deprecated_models_plots(p2_init, hidden_models=deprecated_model_name)
626
+ p3_init = filter_deprecated_models_plots(p3_init, hidden_models=deprecated_model_name)
627
+ p4_init = filter_deprecated_models_plots(p4_init, hidden_models=deprecated_model_name)
628
+
629
+ default_md = make_default_md_1() # Parameters removed
630
+ default_md_2 = make_default_md_2() # Parameters removed
631
+
632
  with gr.Row():
633
  with gr.Column(scale=4):
634
+ # Removed Vote button
635
  md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
636
  with gr.Column(scale=1):
637
  vote_button = gr.Button("Vote!", link="https://llmarena.ru")
638
  md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")
 
 
 
639
 
640
+ # Generate initial table data
641
+ if not arena_df.empty and not model_table_df.empty:
642
+ # Pass the baseline DF and the model table; initially no subset difference is shown
643
+ arena_table_vals_init = get_arena_table(arena_df, model_table_df, hidden_models=deprecated_model_name)
644
+ else:
645
+ arena_table_vals_init = []
646
+
647
+ # Single "Arena" tab
648
+ with gr.Tab("Arena", id=0): # Removed Tabs() as only one tab
649
+ md_arena = make_arena_leaderboard_md(arena_df, last_updated_time)
650
+ lb_description = gr.Markdown(md_arena, elem_id="leaderboard_markdown")
651
+
652
+ with gr.Row():
653
+ with gr.Column(scale=2):
654
+ # Use *display names* for choices if they differ significantly from keys,
655
+ # but here keys are descriptive enough. Callback receives the *key*.
656
+ category_dropdown = gr.Dropdown(
657
+ # Choices should be the *keys* corresponding to display names
658
+ choices=actual_categories,
659
+ value=selected_category_key, # Use the key for the default value
660
+ label="Category", # Translated
661
  )
662
 
663
+ with gr.Column(scale=2):
664
+ category_checkbox = gr.CheckboxGroup(
665
+ # Use user-friendly translated labels
666
+ ["Show Deprecated", "Only <10B Models"], # Adjusted label for clarity
667
+ label="Apply Filter",
668
+ info="",
669
+ value=[], # Filters off by default
670
  )
+
+            # Category details
+            default_category_details = make_category_arena_leaderboard_md(
+                arena_df, arena_df, name=selected_category_display_name  # Pass arena_df twice for initial display
+            ) if not arena_df.empty else "No data for category"
+
+            with gr.Column(scale=4, variant="panel"):
+                category_deets = gr.Markdown(
+                    default_category_details, elem_id="category_deets"
+                )
+        # DataFrame for displaying the table
+        # Initial view doesn't have 'Delta' column
+        arena_vals = pd.DataFrame(
+            arena_table_vals_init,
             columns=[
+                "Rank* (UB)", "Model", "Arena Elo", "95% CI",
+                "Votes", "Organization", "License", "Knowledge Cutoff"
             ]
+        ) if arena_table_vals_init else pd.DataFrame(columns=[  # Handle empty initial data
+            "Rank* (UB)", "Model", "Arena Elo", "95% CI",
+            "Votes", "Organization", "License", "Knowledge Cutoff"
+        ])
+
+        # Sort by Elo for initial display
+        if "Arena Elo" in arena_vals.columns:
+            arena_vals = arena_vals.sort_values(by="Arena Elo", ascending=False)
+        elo_display_df = gr.Dataframe(
+            headers=[  # Translated headers
+                "Rank* (UB)", "Model", "Arena Elo", "95% CI",
+                "Votes", "Organization", "License", "Knowledge Cutoff"
+            ],
+            datatype=[
+                "str", "markdown", "number", "str",
+                "number", "str", "str", "str"
+            ],
+            value=arena_vals.style,  # Apply Pandas styling if needed
+            elem_id="arena_leaderboard_dataframe",
+            height=700,
+            column_widths=[70, 190, 100, 100, 90, 130, 150, 100],  # Widths from monitor.py
+            wrap=True,
         )
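+        # Note: value=arena_vals.style hands gr.Dataframe a pandas Styler object; this assumes a
+        # Gradio version that renders Styler values. If that is not the case, passing the plain
+        # DataFrame (value=arena_vals) is a safe fallback.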
+
+        gr.Markdown(elem_id="leaderboard_markdown")  # Empty markdown for spacing
 
+        plot_1, plot_2, plot_3, plot_4 = None, None, None, None  # Initialize plot variables
+        more_stats_md = None  # Initialize markdown variable
+        if show_plot:
+            more_stats_md = gr.Markdown(
+                f"""## More Statistics for Chatbot Arena""",  # Translated
+                elem_id="leaderboard_header_markdown",
+            )
+            with gr.Row(elem_id="leaderboard_bars"):  # Use ID from monitor.py
+                with gr.Column():
+                    gr.Markdown(  # Translated title
+                        "#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
+                        elem_id="plot-title",
+                    )
+                    plot_3 = gr.Plot(p3_init, show_label=False)  # Use initial data
+                with gr.Column():
+                    gr.Markdown(  # Translated title
+                        "#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
+                        elem_id="plot-title",
+                    )
+                    plot_4 = gr.Plot(p4_init, show_label=False)  # Use initial data
+            with gr.Row(elem_id="leaderboard_plots"):  # Use ID from monitor.py
+                with gr.Column():
+                    gr.Markdown(  # Translated title
+                        "#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
+                        elem_id="plot-title",
+                    )
+                    plot_1 = gr.Plot(
+                        p1_init, show_label=False, elem_id="plot-container"  # Use initial data
+                    )
+                with gr.Column():
+                    gr.Markdown(  # Translated title
+                        "#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
+                        elem_id="plot-title",
+                    )
+                    plot_2 = gr.Plot(p2_init, show_label=False)  # Use initial data
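+        # Plot/figure mapping used above: plot_3 -> Figure 1 (bootstrap CIs), plot_4 -> Figure 2
+        # (average win rate), plot_1 -> Figure 3 (win-fraction heatmap), plot_2 -> Figure 4 (battle counts).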
 
+        def update_leaderboard_df(arena_table_vals):
+            # Add error handling for empty or incorrect data
+            # Expects 9 columns when Delta is present
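+            # Hypothetical example of a single expected row (illustrative values only):
+            #   ["1 ↑", 2, "<a href='...'>model-x</a>", 1234, "+5/-4", 10500, "OrgName", "MIT", "2024/04"]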
+            if not arena_table_vals or not isinstance(arena_table_vals, list) or not arena_table_vals[0] or len(arena_table_vals[0]) != 9:
+                print("Warning: Invalid data for styling in update_leaderboard_df. Returning empty DataFrame.")
+                # Return an empty styled DataFrame to avoid Gradio errors
+                empty_styled = pd.DataFrame(columns=[
+                    "Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
+                    "Votes", "Organization", "License", "Knowledge Cutoff"
+                ]).style
+                return empty_styled
+
+            try:
+                elo_dataframe = pd.DataFrame(
+                    arena_table_vals,
+                    columns=[
+                        "Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
+                        "Votes", "Organization", "License", "Knowledge Cutoff"
                     ],
                 )
+
+                def highlight_max(s):
+                    # Check rank string for arrows
+                    return [
+                        "color: green; font-weight: bold" if "↑" in str(v) else
+                        "color: red; font-weight: bold" if "↓" in str(v) else ""
+                        for v in s
+                    ]
+
+                def highlight_rank_max(s):
+                    # Check Delta value (ensure it's numeric)
+                    return [
+                        "color: green; font-weight: bold" if isinstance(v, (int, float)) and v > 0 else
+                        "color: red; font-weight: bold" if isinstance(v, (int, float)) and v < 0 else ""
+                        for v in s
+                    ]
+
+                # Apply styles
+                styled_df = elo_dataframe.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
+                    highlight_rank_max, subset=["Delta"]
                 )
+                return styled_df
+
+            except Exception as e:
+                print(f"Error applying styles in update_leaderboard_df: {e}")
+                traceback.print_exc()
+                # Return unstyled DataFrame on error
+                return pd.DataFrame(arena_table_vals, columns=[
+                    "Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
+                    "Votes", "Organization", "License", "Knowledge Cutoff"
+                ]).style
+
+        def update_leaderboard_and_plots(category_key, filters):  # Receives category *key* from dropdown
+            # No caching
+            # Reload data on each call
+            try:
+                current_last_updated_time, current_arena_dfs, current_category_elo_results, _, current_model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
+            except Exception as e:
+                print(f"Error reloading data in callback: {e}")
+                # Return empty updates to prevent UI crash
+                empty_df_update = gr.Dataframe(value=pd.DataFrame().style)  # Empty DataFrame
+                empty_plot_update = gr.Plot(value=None)  # Empty Plot
+                empty_md_update = gr.Markdown(value="Error loading data.")  # Error Markdown
+                # Match the number of outputs expected by the .change() call
+                num_plots = 4 if show_plot else 0
+                return [empty_df_update] + [empty_plot_update] * num_plots + [empty_md_update, empty_md_update]
+
+            # Use the display name corresponding to the selected key
+            category_display_name = key_to_category_name.get(category_key, category_key)
+
+            # Check if data exists for the selected category (using display name as key now)
+            if not current_arena_dfs or category_display_name not in current_arena_dfs or category_display_name not in current_category_elo_results or current_model_table_df.empty:
+                print(f"Warning: Data missing for category '{category_display_name}' (key: '{category_key}') after reload.")
+                empty_df_update = gr.Dataframe(value=pd.DataFrame().style)
+                empty_plot_update = gr.Plot(value=None)
+                empty_md_update = gr.Markdown(value=f"No data available for category: {category_display_name}")
+                num_plots = 4 if show_plot else 0
+                # Match the number of outputs
+                return [empty_df_update] + [empty_plot_update] * num_plots + [empty_md_update, empty_md_update]
+
+            # Get the specific data slices using the display name
+            arena_subset_df = current_arena_dfs[category_display_name]
+            elo_subset_results = current_category_elo_results[category_display_name]
+
+            # Use the hardcoded baseline key, get its display name
+            baseline_key = "site_visitors/medium_prompts:style control"
+            baseline_display_name = key_to_category_name.get(baseline_key, baseline_key)
+
+            # Fallback if baseline is missing
+            if baseline_display_name not in current_arena_dfs:
+                print(f"Warning: Baseline category '{baseline_display_name}' not found. Using selected category '{category_display_name}' as baseline.")
+                baseline_display_name = category_display_name  # Fallback to the selected category itself
+
+            arena_df_baseline = current_arena_dfs[baseline_display_name]
+
+            hidden_models_list = None  # Default: show all
+            # Check filter labels (must match the translated CheckboxGroup choices)
+            if "Show Deprecated" not in filters:
+                hidden_models_list = deprecated_model_name.copy()  # Hide deprecated
+
+            if "Only <10B Models" in filters:
+                # Get all models currently in the baseline view
+                all_models_in_view = arena_df_baseline.index.tolist()
+                # Find models *not* in the allowed list
+                models_to_hide = [model for model in all_models_in_view if model not in models_10b]
+
+                if hidden_models_list is None:  # If deprecated are not hidden
+                    hidden_models_list = models_to_hide
+                else:  # If deprecated are already hidden, add the non-<10B ones
+                    # Use set to avoid duplicates
+                    hidden_models_list = list(set(hidden_models_list + models_to_hide))
+
+            arena_table_values = get_arena_table(
+                arena_df_baseline,  # Use the determined baseline DataFrame
+                current_model_table_df,
+                # Pass subset only if it's different from the baseline
+                arena_subset_df=(arena_subset_df if category_display_name != baseline_display_name else None),
+                hidden_models=hidden_models_list
             )
+            dataframe_update = None
+            # Show Delta column only if category is not the baseline and data exists
+            if category_display_name != baseline_display_name and arena_table_values:
+                styled_arena_values = update_leaderboard_df(arena_table_values)  # Apply styling with Delta
+                # Check if styling was successful
+                if isinstance(styled_arena_values, pd.io.formats.style.Styler) and not styled_arena_values.data.empty:
+                    dataframe_update = gr.Dataframe(
+                        headers=[  # Headers including Delta
+                            "Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
+                            "Votes", "Organization", "License", "Knowledge Cutoff"
+                        ],
+                        datatype=[
+                            "str", "number", "markdown", "number", "str",
+                            "number", "str", "str", "str"
+                        ],
+                        value=styled_arena_values,  # Pass the Styler object
+                        elem_id="arena_leaderboard_dataframe",
+                        height=700,
+                        column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100],  # Widths with Delta
+                        wrap=True,
+                    )
+                else:  # Handle styling failure
+                    dataframe_update = gr.Dataframe(value=pd.DataFrame().style)  # Empty update
+
+            else:  # Baseline category or no data for Delta
+                # Ensure data exists before creating DataFrame
+                if arena_table_values:
+                    # Create DataFrame without Delta column from the raw values
+                    df_no_delta = pd.DataFrame(arena_table_values, columns=[
+                        "Rank* (UB)", "Model", "Arena Elo", "95% CI",
+                        "Votes", "Organization", "License", "Knowledge Cutoff"
+                    ])
+                    dataframe_update = gr.Dataframe(
+                        headers=[  # Headers without Delta
+                            "Rank* (UB)", "Model", "Arena Elo", "95% CI",
+                            "Votes", "Organization", "License", "Knowledge Cutoff"
+                        ],
+                        datatype=[
+                            "str", "markdown", "number", "str", "number",
+                            "str", "str", "str"
+                        ],
+                        value=df_no_delta.style,  # Apply basic Pandas styling
+                        elem_id="arena_leaderboard_dataframe",
+                        height=700,
+                        column_widths=[70, 190, 100, 100, 90, 130, 150, 100],  # Widths without Delta
+                        wrap=True,
+                    )
+                else:
+                    dataframe_update = gr.Dataframe(value=pd.DataFrame().style)  # Empty update
+
+            plot_updates = [gr.Plot(value=None)] * 4  # Default empty plot updates
+            if show_plot:
+                p1_updated = elo_subset_results.get("win_fraction_heatmap")
+                p2_updated = elo_subset_results.get("battle_count_heatmap")
+                p3_updated = elo_subset_results.get("bootstrap_elo_rating")
+                p4_updated = elo_subset_results.get("average_win_rate_bar")
+
+                # Filter plots
+                p1_filtered = filter_deprecated_models_plots(p1_updated, hidden_models=hidden_models_list)
+                p2_filtered = filter_deprecated_models_plots(p2_updated, hidden_models=hidden_models_list)
+                p3_filtered = filter_deprecated_models_plots(p3_updated, hidden_models=hidden_models_list)
+                p4_filtered = filter_deprecated_models_plots(p4_updated, hidden_models=hidden_models_list)
+                plot_updates = [p1_filtered, p2_filtered, p3_filtered, p4_filtered]
+
+            more_stats_md_updated_text = f"""## More Statistics for Chatbot Arena - {category_display_name} """ if show_plot else ""
+            more_stats_md_update = gr.Markdown(value=more_stats_md_updated_text)
+
+            # Use baseline DF for total counts, subset DF for category-specific counts
+            category_details_md_updated_text = make_category_arena_leaderboard_md(
+                arena_df_baseline, arena_subset_df, name=category_display_name  # Pass display name
+            )
+            category_deets_update = gr.Markdown(value=category_details_md_updated_text)
+
+            # Return updates in the correct order matching outputs list
+            # Order: df, p1, p2, p3, p4, more_stats_md, category_deets
+            return [dataframe_update] + plot_updates + [more_stats_md_update, category_deets_update]
+
+        # Define output components (must exist in the UI build)
+        outputs_list = [elo_display_df]
+        if show_plot:
+            # Add plot components if they exist
+            outputs_list.extend([plot_1, plot_2, plot_3, plot_4])
+            # Add markdown component if it exists
+            if more_stats_md: outputs_list.append(more_stats_md)
+            else: outputs_list.append(gr.Markdown(visible=False))  # Placeholder if MD wasn't created
+        else:
+            # Add placeholders if plots/MD are not shown
+            outputs_list.extend([gr.Plot(visible=False)] * 4)
+            outputs_list.append(gr.Markdown(visible=False))
+        outputs_list.append(category_deets)  # Always update category details
+
+        # Attach change listeners
+        category_dropdown.change(
+            fn=update_leaderboard_and_plots,
+            inputs=[category_dropdown, category_checkbox],
+            outputs=outputs_list
+        )
+        category_checkbox.change(
+            fn=update_leaderboard_and_plots,  # Use the same function
+            inputs=[category_dropdown, category_checkbox],
+            outputs=outputs_list
+        )
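+        # Note: outputs_list is ordered as [elo_display_df, plot_1..plot_4, more_stats_md, category_deets],
+        # matching the return order of update_leaderboard_and_plots above.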
+    return_components = [md_1, md_2, lb_description, category_deets, elo_display_df]
+    if show_plot:
+        # Add plots if they were created
+        return_components.extend([plot_1, plot_2, plot_3, plot_4])
+        # Add the extra stats markdown if it was created
+        if more_stats_md: return_components.append(more_stats_md)
+
+    return return_components
+
+
+def build_demo(elo_results_file, leaderboard_table_file):
+    # Assumes block_css is available or defined elsewhere
+    try:
+        from fastchat.serve.gradio_web_server import block_css
+    except ImportError:
+        print("Warning: fastchat.serve.gradio_web_server.block_css not found. Using fallback CSS.")
+        # Define a minimal fallback CSS or copy the content here
+        block_css = """
+        /* Add minimal CSS rules here if needed */
+        #arena_leaderboard_dataframe table { font-size: 105%; }
+        #leaderboard_markdown .prose { font-size: 110% !important; }
+        .app { max-width: 100% !important; padding: 20px !important; }
+        a { color: #1976D2; text-decoration: none; }
+        a:hover { color: #63A4FF; text-decoration: underline; }
+        """
+
+    text_size = gr.themes.sizes.text_lg
+    # Assumes theme.json is present
+    try:
+        theme = gr.themes.Default.load("theme.json")
+    except Exception:
+        print("Warning: theme.json not found. Using default Gradio theme.")
+        theme = gr.themes.Default(text_size=text_size)  # Fallback theme
+
+    if hasattr(theme, 'text_size'): theme.text_size = text_size
+    # Apply custom settings if theme object supports it
+    if hasattr(theme, 'set'):
+        theme.set(
+            button_large_text_size="40px",
+            button_small_text_size="40px",
+            button_large_text_weight="1000",
+            button_small_text_weight="1000",
+            button_shadow="*shadow_drop_lg",
+            button_shadow_hover="*shadow_drop_lg",
+            checkbox_label_shadow="*shadow_drop_lg",
+            button_shadow_active="*shadow_inset",
+            button_secondary_background_fill="*primary_300",
+            button_secondary_background_fill_dark="*primary_700",
+            button_secondary_background_fill_hover="*primary_200",
+            button_secondary_background_fill_hover_dark="*primary_500",
+            button_secondary_text_color="*primary_800",
+            button_secondary_text_color_dark="white",
+        )
+
+    with gr.Blocks(
+        title="LLM Arena: Leaderboard",  # Translated title
+        theme=theme,
+        css=block_css,  # Use loaded or fallback CSS
+    ) as demo:
+        # Build only the leaderboard tab content
+        # show_plot=True to display plots
+        leader_components = build_leaderboard_tab(
+            elo_results_file, leaderboard_table_file, show_plot=True, mirror=False
+        )
+    return demo
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
+    parser.add_argument("--share", action="store_true", default=False)  # Default False for HF
     parser.add_argument("--host", default="0.0.0.0")
     parser.add_argument("--port", type=int, default=7860)
+    # Removed args specific to monitor.py
     args = parser.parse_args()
+    try:
+        elo_result_files = glob.glob("elo_results_*.pkl")
+        if not elo_result_files:
+            raise FileNotFoundError("No elo_results_*.pkl files found.")
+        # More robust sorting extracting the number
+        elo_result_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
+        elo_result_file = elo_result_files[-1]
+        print(f"Using Elo results file: {elo_result_file}")
+    except Exception as e:
+        print(f"Error finding Elo results file: {e}")
+        print("Please ensure a file matching 'elo_results_NUMBER.pkl' exists.")
+        exit(1)  # Exit if file not found
+
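+    # For example (hypothetical filenames), elo_results_20240101.pkl and elo_results_20240301.pkl
+    # would be ordered by their numeric suffix, so the most recent snapshot is selected.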
+    try:
+        leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
+        if not leaderboard_table_files:
+            raise FileNotFoundError("No leaderboard_table_*.csv files found.")
+        leaderboard_table_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
+        leaderboard_table_file = leaderboard_table_files[-1]
+        print(f"Using leaderboard table file: {leaderboard_table_file}")
+    except Exception as e:
+        print(f"Error finding leaderboard table file: {e}")
+        print("Please ensure a file matching 'leaderboard_table_NUMBER.csv' exists.")
+        exit(1)  # Exit if file not found
+
     demo = build_demo(elo_result_file, leaderboard_table_file)
+    # Launch with args
+    demo.launch(
+        server_name=args.host,
+        server_port=args.port,
+        share=args.share,
+        show_api=False
+    )
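+    # Example local run (flags are optional; defaults come from the argparse setup above):
+    #   python app.py --host 0.0.0.0 --port 7860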