|
import argparse
import ast
import glob
import pickle
import traceback

import gradio as gr
import numpy as np
import pandas as pd
|
|
|
|
|
promo_banner = """ |
|
<div style="background-color: #ffcc00; color: black; padding: 10px; text-align: center; font-weight: bold; font-size: 18px; border: 2px solid #000;"> |
|
USE THE LATEST VERSIONS OF THE BEST CHATBOTS IN RUSSIAN FOR FREE |
|
</div> |
|
""" |
|
|
|
deprecated_model_name = [ |
|
"GigaChat 3.1.25.3", |
|
"GigaChat-Pro 2.2.25.3", |
|
"saiga_llama3_8b_v6", |
|
"saiga_phi3_medium", |
|
"GigaChat-Plus 3.1.25.3", |
|
"GigaChat-Pro 4.0.26.8", |
|
"GigaChat 4.0.26.8", |
|
"xAI: Grok 2", |
|
"GigaChat-Pro 4.0.26.15", |
|
"GigaChat 4.0.26.15", |
|
"YandexGPT Experimental", "yandex-gpt-arena", |
|
"RefalMachine/ruadapt_llama3_instruct_lep_saiga_kto_ablitirated" |
|
] |
|
|
|
models_10b = [ |
|
"saiga_llama3_8b_v7", |
|
"Vikhrmodels/Vikhr-YandexGPT-5-Lite-8B-it", |
|
"T-lite-instruct-0.1", |
|
"t-tech/T-lite-it-1.0", |
|
"LLaMA-3 Chat (8B)", |
|
"Llama 3.1 8B Instruct Turbo", |
|
"MTSAIR/Cotype-Nano" |
|
] |
|
|
|
|
|
def make_default_md_1(): |
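    """Return the page header markdown, including the promo banner."""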
|
leaderboard_md = f""" |
|
# 🏆 LLM Arena in Russian: Leaderboard
|
|
|
{promo_banner} |
|
|
|
""" |
|
return leaderboard_md |
|
|
|
def make_default_md_2(): |
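    """Return the introductory markdown describing the arena and how to contribute."""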
|
leaderboard_md = f""" |
|
|
|
The LLM Arena platform is an open crowdsourcing platform for evaluating large language models (LLMs) in Russian. We collect pairwise comparisons from people to rank LLMs with the Bradley-Terry model and display the resulting ratings on the Elo scale.
|
Chatbot Arena in Russian depends on community participation, so please contribute by casting your vote! |
|
|
|
- To **add your model** to the comparison, contact us on TG: [Group](https://t.me/+bFEOl-Bdmok4NGUy) |
|
- If you **found a bug** or **have a suggestion**, contact us: [Roman](https://t.me/roman_kucev) |
|
- You can **contribute your vote** at [llmarena.ru](https://llmarena.ru/)! |
|
""" |
|
return leaderboard_md |
|
|
|
|
|
def make_arena_leaderboard_md(arena_df, last_updated_time): |
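    """Build the arena summary markdown: totals, last-updated date, and the Rank (UB) footnote."""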
|
|
|
total_votes = sum(arena_df["num_battles"]) if not arena_df.empty else 0 |
|
total_models = len(arena_df) |
|
space = "Β Β Β " |
|
|
|
leaderboard_md = f""" |
|
Total # of models: **{total_models}**.{space} Total # of votes: **{total_votes:,}**.{space} Last updated: {last_updated_time}.
|
|
|
***Rank (UB)**: model rank (upper bound), computed as one plus the number of models that are statistically better than the target model.
|
Model A is statistically better than Model B when the lower bound of Model A's rating is higher than the upper bound of Model B's rating (with a 95% confidence interval). |
|
See Figure 1 below for a visualization of the confidence intervals of model ratings. |
|
""" |
|
return leaderboard_md |
|
|
|
|
|
def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="site_visitors/medium_prompts:style control"): |
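    """Build the per-category summary markdown: category explanation plus model/vote counts and their share of the overall totals."""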
|
total_votes = sum(arena_df["num_battles"]) if not arena_df.empty else 0 |
|
total_models = len(arena_df) |
|
space = "Β Β Β " |
|
total_subset_votes = sum(arena_subset_df["num_battles"]) if not arena_subset_df.empty else 0 |
|
total_subset_models = len(arena_subset_df) |
|
|
|
perc_models = round(total_subset_models / total_models * 100) if total_models > 0 else 0 |
|
perc_votes = round(total_subset_votes / total_votes * 100) if total_votes > 0 else 0 |
|
|
|
leaderboard_md = f"""### {cat_name_to_explanation.get(name, name)} |
|
#### {space} #models: **{total_subset_models} ({perc_models}%)** {space} #votes: **{total_subset_votes:,} ({perc_votes}%)**{space}
|
""" |
|
return leaderboard_md |
|
|
|
def model_hyperlink(model_name, link): |
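    """Wrap the model name in an HTML anchor pointing to its page."""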
|
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>' |
|
|
|
|
|
def filter_deprecated_models_plots(fig, hidden_models=None, limit_to_top=25): |
|
""" |
|
Filters Plotly plots to show only top N models and optionally removes specific models. |
|
|
|
Args: |
|
fig: The Plotly figure object. |
|
hidden_models (list, optional): A list of model names to remove. Defaults to None. |
|
limit_to_top (int, optional): Limit display to top N models (0 or None means no limit). Defaults to 25. |
|
|
|
Returns: |
|
Plotly figure: The filtered figure object or the original if filtering fails or is not applicable. |
|
""" |
|
if fig is None: |
|
return None |
|
|
|
|
|
if not hasattr(fig, 'data') or len(fig.data) == 0: |
|
return fig |
|
|
|
|
|
if not hasattr(fig.data[0], 'type'): |
|
return fig |
|
|
|
|
|
models_to_check = [] |
|
if hasattr(fig.data[0], 'x'): |
|
models_to_check = fig.data[0].x |
|
elif hasattr(fig.data[0], 'y'): |
|
models_to_check = fig.data[0].y |
|
|
|
    if hidden_models is not None and len(models_to_check) > 0:
|
available_models = [x for x in models_to_check if x not in hidden_models] |
|
|
|
if len(available_models) <= 2: |
|
|
|
return fig |
|
|
|
if limit_to_top is not None and limit_to_top <= 0: |
|
limit_to_top = None |
|
|
|
try: |
|
|
|
fig_copy = pickle.loads(pickle.dumps(fig)) |
|
data = fig_copy.data[0] |
|
|
|
if data.type == 'heatmap': |
|
|
|
mask_x = ~np.isin(data.x, hidden_models) if hidden_models is not None else np.ones_like(data.x, dtype=bool) |
|
mask_y = ~np.isin(data.y, hidden_models) if hidden_models is not None else np.ones_like(data.y, dtype=bool) |
|
|
|
|
|
filtered_x = np.array(data.x)[mask_x] |
|
filtered_y = np.array(data.y)[mask_y] |
|
|
|
|
|
if limit_to_top is not None and len(filtered_x) > limit_to_top: |
|
top_models = filtered_x[:limit_to_top] |
|
|
|
mask_x = np.isin(data.x, top_models) |
|
mask_y = np.isin(data.y, top_models) |
|
|
|
filtered_x = np.array(data.x)[mask_x] |
|
filtered_y = np.array(data.y)[mask_y] |
|
elif len(filtered_x) <= 2: |
|
return fig |
|
|
|
|
|
data.x = filtered_x |
|
data.y = filtered_y |
|
|
|
z_original = np.array(fig.data[0].z) |
|
data.z = z_original[np.ix_(mask_y, mask_x)] |
|
|
|
elif data.type == 'scatter': |
|
trace = data |
|
|
|
mask = ~np.isin(trace.x, hidden_models) if hidden_models is not None else np.ones_like(trace.x, dtype=bool) |
|
|
|
|
|
current_x = np.array(trace.x)[mask] |
|
current_y = np.array(trace.y)[mask] |
|
current_text = np.array(trace.text)[mask] if hasattr(trace, 'text') and trace.text is not None else None |
|
|
|
current_error_y_array = np.array(trace.error_y['array'])[mask] if 'error_y' in trace and 'array' in trace.error_y and trace.error_y['array'] is not None else None |
|
current_error_y_arrayminus = np.array(trace.error_y['arrayminus'])[mask] if 'error_y' in trace and 'arrayminus' in trace.error_y and trace.error_y['arrayminus'] is not None else None |
|
|
|
|
|
if limit_to_top is not None and len(current_x) > limit_to_top: |
|
|
|
sort_indices = np.argsort(-current_y)[:limit_to_top] |
|
current_x = current_x[sort_indices] |
|
current_y = current_y[sort_indices] |
|
if current_text is not None: |
|
current_text = current_text[sort_indices] |
|
if current_error_y_array is not None: |
|
current_error_y_array = current_error_y_array[sort_indices] |
|
if current_error_y_arrayminus is not None: |
|
current_error_y_arrayminus = current_error_y_arrayminus[sort_indices] |
|
elif len(current_x) <= 2: |
|
return fig |
|
|
|
|
|
trace.x, trace.y = current_x, current_y |
|
if current_text is not None: |
|
trace.text = current_text |
|
|
|
if current_error_y_array is not None: |
|
|
|
if 'error_y' not in trace: trace.error_y = {} |
|
trace.error_y['array'] = current_error_y_array |
|
if current_error_y_arrayminus is not None: |
|
if 'error_y' not in trace: trace.error_y = {} |
|
trace.error_y['arrayminus'] = current_error_y_arrayminus |
|
|
|
elif data.type == 'bar': |
|
trace = data |
|
|
|
mask = ~np.isin(trace.x, hidden_models) if hidden_models is not None else np.ones_like(trace.x, dtype=bool) |
|
|
|
|
|
current_x = np.array(trace.x)[mask] |
|
current_y = np.array(trace.y)[mask] |
|
|
|
|
|
if limit_to_top is not None and len(current_x) > limit_to_top: |
|
|
|
sort_indices = np.argsort(-current_y)[:limit_to_top] |
|
current_x = current_x[sort_indices] |
|
current_y = current_y[sort_indices] |
|
elif len(current_x) <= 2: |
|
return fig |
|
|
|
|
|
trace.x, trace.y = current_x, current_y |
|
|
|
return fig_copy |
|
|
|
except Exception as e: |
|
print(f"Error filtering plot: {e}") |
|
traceback.print_exc() |
|
return fig |
|
|
|
|
|
def load_leaderboard_table_csv(filename, add_hyperlink=True): |
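    """
    Parse the leaderboard CSV into a list of row dicts, optionally turning the
    Model column into an HTML hyperlink. Uses a plain comma split, so field
    values must not contain embedded commas.
    """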
|
    with open(filename) as fin:
        lines = fin.readlines()
    heads = [v.strip() for v in lines[0].split(",")]
|
rows = [] |
|
for i in range(1, len(lines)): |
|
row = [v.strip() for v in lines[i].split(",")] |
|
item = {} |
|
for h, v in zip(heads, row): |
|
if h == "Arena Elo rating": |
|
if v != "-": |
|
try: |
|
v = int(ast.literal_eval(v)) |
|
                    except (ValueError, SyntaxError):
|
v = np.nan |
|
else: |
|
v = np.nan |
|
item[h] = v |
|
if add_hyperlink and "Model" in item and "Link" in item: |
|
|
|
if item["Link"] and item["Link"] != "-": |
|
item["Model"] = model_hyperlink(item["Model"], item["Link"]) |
|
|
|
rows.append(item) |
|
return rows |
|
|
|
|
|
def create_ranking_str(ranking, ranking_difference): |
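    """Format a rank with an up/down arrow indicating movement versus the baseline ranking."""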
|
|
|
try: |
|
|
|
ranking_val = int(float(ranking)) |
|
ranking_difference_val = int(float(ranking_difference)) |
|
        if ranking_difference_val > 0:
            return f"{ranking_val} ↑"
        elif ranking_difference_val < 0:
            return f"{ranking_val} ↓"
        else:
            return f"{ranking_val}"
|
except (ValueError, TypeError): |
|
return str(ranking) |
|
|
|
|
|
def recompute_final_ranking(arena_df): |
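    """
    Recompute the upper-bound (UB) rank for every model: 1 plus the number of
    models whose lower rating bound (rating_q025) exceeds this model's upper
    rating bound (rating_q975).
    """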
|
ranking = {} |
|
if arena_df.empty: |
|
return [] |
|
|
|
model_indices = arena_df.index |
|
|
|
if "rating_q025" not in arena_df.columns or "rating_q975" not in arena_df.columns: |
|
print("Warning: Confidence interval columns ('rating_q025', 'rating_q975') not found in DataFrame. Cannot compute UB Rank.") |
|
|
|
return [np.nan] * len(model_indices) |
|
|
|
ratings_q025 = arena_df["rating_q025"].to_dict() |
|
ratings_q975 = arena_df["rating_q975"].to_dict() |
|
|
|
for model_a in model_indices: |
|
rank = 1 |
|
rating_a_q975 = ratings_q975.get(model_a) |
|
|
|
if pd.isna(rating_a_q975): |
|
ranking[model_a] = np.nan |
|
continue |
|
|
|
for model_b in model_indices: |
|
if model_a == model_b: |
|
continue |
|
|
|
rating_b_q025 = ratings_q025.get(model_b) |
|
|
|
if pd.isna(rating_b_q025): |
|
continue |
|
|
|
|
|
if rating_b_q025 > rating_a_q975: |
|
rank += 1 |
|
ranking[model_a] = rank |
|
return list(ranking.values()) |
|
|
|
|
|
def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None): |
|
""" |
|
Generates the leaderboard table data. |
|
'use_cache' parameter removed. |
|
""" |
|
|
|
|
|
|
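    # Work on copies so the caller's DataFrames are never mutated.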
|
arena_df_processed = arena_df.copy() |
|
if arena_subset_df is not None: |
|
arena_subset_df_processed = arena_subset_df.copy() |
|
else: |
|
arena_subset_df_processed = None |
|
|
|
|
|
arena_df_processed = arena_df_processed.sort_values(by=["rating"], ascending=False) |
|
|
|
if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns: |
|
arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed) |
|
arena_df_processed = arena_df_processed.sort_values( |
|
by=["final_ranking", "rating"], ascending=[True, False] |
|
) |
|
else: |
|
|
|
arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1) |
|
|
|
if hidden_models: |
|
arena_df_processed = arena_df_processed[~arena_df_processed.index.isin(hidden_models)].copy() |
|
|
|
if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns: |
|
arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed) |
|
|
|
arena_df_processed = arena_df_processed.sort_values( |
|
by=["final_ranking", "rating"], ascending=[True, False] |
|
) |
|
else: |
|
arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1) |
|
|
|
|
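    # When a category subset is selected, rank within the subset and show the change versus the baseline rank.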
|
if arena_subset_df_processed is not None: |
|
|
|
if hidden_models: |
|
arena_subset_df_processed = arena_subset_df_processed[~arena_subset_df_processed.index.isin(hidden_models)].copy() |
|
|
|
|
|
arena_subset_df_processed = arena_subset_df_processed[arena_subset_df_processed.index.isin(arena_df_processed.index)] |
|
|
|
|
|
if not arena_subset_df_processed.empty and "rating_q025" in arena_subset_df_processed.columns and "rating_q975" in arena_subset_df_processed.columns: |
|
|
|
arena_subset_df_processed = arena_subset_df_processed.sort_values(by=["rating"], ascending=False) |
|
arena_subset_df_processed["final_ranking_subset"] = recompute_final_ranking(arena_subset_df_processed) |
|
|
|
|
|
|
|
arena_df_for_join = arena_df_processed[arena_df_processed.index.isin(arena_subset_df_processed.index)][["final_ranking", "rating"]].copy() |
|
arena_df_for_join.rename(columns={"final_ranking": "final_ranking_baseline"}, inplace=True) |
|
|
|
|
|
|
|
arena_df_combined = arena_subset_df_processed[["final_ranking_subset", "rating"]].join( |
|
arena_df_for_join["final_ranking_baseline"], how="inner" |
|
) |
|
|
|
|
|
arena_df_combined["ranking_difference"] = arena_df_combined["final_ranking_baseline"] - arena_df_combined["final_ranking_subset"] |
|
|
|
|
|
arena_df_combined = arena_df_combined.sort_values( |
|
by=["final_ranking_subset", "rating"], ascending=[True, False] |
|
) |
|
|
|
|
|
arena_df_combined["display_ranking"] = arena_df_combined.apply( |
|
lambda x: create_ranking_str(x["final_ranking_subset"], x["ranking_difference"]), |
|
axis=1, |
|
) |
|
arena_df_processed = arena_df_processed.loc[arena_df_combined.index] |
|
|
|
columns_to_join = ["display_ranking", "ranking_difference", "final_ranking_subset"] |
|
columns_to_join = [col for col in columns_to_join if col in arena_df_combined.columns] |
|
arena_df_processed = arena_df_processed.join(arena_df_combined[columns_to_join], how="inner") |
|
|
|
|
|
|
|
|
|
|
|
if "final_ranking_subset" in arena_df_processed.columns: |
|
arena_df_processed.sort_values(by=["final_ranking_subset", "rating"], ascending=[True, False], inplace=True) |
|
else: |
|
|
|
arena_df_processed.sort_values(by=["rating"], ascending=False, inplace=True) |
|
|
|
|
|
else: |
|
|
|
arena_subset_df_processed = None |
|
|
|
arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str) |
|
arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True) |
|
|
|
|
|
else: |
|
|
|
arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str) |
|
|
|
arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True) |
|
|
|
|
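    # Assemble the display rows: rank, optional delta, model link, rating, CI, votes, and metadata.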
|
values = [] |
|
|
|
for model_key in arena_df_processed.index: |
|
row_data = arena_df_processed.loc[model_key] |
|
|
|
model_info = model_table_df[model_table_df["key"] == model_key] |
|
if model_info.empty: |
|
|
|
continue |
|
|
|
row = [] |
|
|
|
row.append(row_data.get("display_ranking", "")) |
|
|
|
|
|
if arena_subset_df_processed is not None: |
|
row.append(row_data.get("ranking_difference", 0)) |
|
|
|
|
|
row.append(model_info["Model"].values[0]) |
|
|
|
|
|
row.append(round(row_data["rating"])) |
|
|
|
|
|
|
|
upper_rating = row_data.get("rating_q975") |
|
lower_rating = row_data.get("rating_q025") |
|
current_rating = row_data.get("rating") |
|
upper_diff = round(upper_rating - current_rating) if pd.notna(upper_rating) and pd.notna(current_rating) else '?' |
|
lower_diff = round(current_rating - lower_rating) if pd.notna(current_rating) and pd.notna(lower_rating) else '?' |
|
row.append(f"+{upper_diff}/-{lower_diff}") |
|
|
|
|
|
|
|
row.append(round(row_data["num_battles"])) |
|
|
|
|
|
row.append(model_info["Organization"].values[0]) |
|
|
|
|
|
row.append(model_info["License"].values[0]) |
|
|
|
|
|
cutoff_date = model_info["Knowledge cutoff date"].values[0] |
|
row.append("Unknown" if cutoff_date == "-" else cutoff_date) |
|
|
|
values.append(row) |
|
|
|
return values |
|
|
|
|
|
key_to_category_name = { |
|
|
|
"full": "Overall", |
|
"crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts", |
|
"site_visitors/medium_prompts": "site_visitors/medium_prompts", |
|
"site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style_control" |
|
} |
|
cat_name_to_explanation = { |
|
|
|
"Overall": "All queries", |
|
"crowdsourcing/simple_prompts": "Queries collected via crowdsourcing. Mostly simple ones.", |
|
"site_visitors/medium_prompts": "Queries from website visitors. Contain more complex prompts.", |
|
"site_visitors/medium_prompts:style_control": "Queries from website visitors. Contain more complex prompts. [Reduced stylistic influence](https://lmsys.org/blog/2024-08-28-style-control/) of the response on the rating." |
|
} |
|
cat_name_to_baseline = { |
|
|
|
|
|
} |
|
|
|
actual_categories = [ |
|
|
|
|
|
|
|
"site_visitors/medium_prompts", |
|
"site_visitors/medium_prompts:style control" |
|
] |
|
|
|
req_cat_key = "site_visitors/medium_prompts:style control" |
|
if req_cat_key in actual_categories:
    selected_category_key = req_cat_key
elif "site_visitors/medium_prompts" in actual_categories:
    selected_category_key = "site_visitors/medium_prompts"
else:
    selected_category_key = actual_categories[0] if actual_categories else None
|
|
|
selected_category_display_name = key_to_category_name.get(selected_category_key, selected_category_key) |
|
|
|
|
|
def read_elo_file(elo_results_file, leaderboard_table_file): |
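    """
    Load the pickled Elo results and the leaderboard CSV.

    Returns (last_updated_time, arena_dfs, category_elo_results, elo_results,
    model_table_df); arena_dfs maps category display names to leaderboards
    filtered to models with more than 200 battles.
    """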
|
|
|
print('Reading Elo file...') |
|
arena_dfs = {} |
|
category_elo_results = {} |
|
last_updated_time = "N/A" |
|
elo_results = {} |
|
model_table_df = pd.DataFrame() |
|
|
|
try: |
|
|
|
with open(elo_results_file, "rb") as fin: |
|
elo_results = pickle.load(fin) |
|
|
|
|
|
main_cat_key = "site_visitors/medium_prompts:style control" |
|
fallback_cat_key_1 = "site_visitors/medium_prompts" |
|
fallback_cat_key_2 = "full" |
|
|
|
        for cat_key in (main_cat_key, fallback_cat_key_1, fallback_cat_key_2):
            if cat_key in elo_results and "last_updated_datetime" in elo_results[cat_key]:
                last_updated_time = elo_results[cat_key]["last_updated_datetime"].split(" ")[0]
                break
|
|
|
|
|
for key in key_to_category_name.keys(): |
|
display_name = key_to_category_name[key] |
|
if key in elo_results: |
|
|
|
if "leaderboard_table_df" in elo_results[key] and isinstance(elo_results[key]["leaderboard_table_df"], pd.DataFrame): |
|
df = elo_results[key]["leaderboard_table_df"] |
|
|
|
|
|
arena_dfs[display_name] = df[df["num_battles"] > 200].copy() |
|
category_elo_results[display_name] = elo_results[key] |
|
|
|
|
|
|
|
|
|
|
|
|
|
data = load_leaderboard_table_csv(leaderboard_table_file) |
|
model_table_df = pd.DataFrame(data) |
|
|
|
except FileNotFoundError: |
|
print(f"Error: Elo results file not found at {elo_results_file}") |
|
|
|
except Exception as e: |
|
print(f"Error reading elo file: {e}") |
|
traceback.print_exc() |
|
|
|
|
|
|
|
return last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df |
|
|
|
|
|
def build_leaderboard_tab( |
|
elo_results_file, leaderboard_table_file, show_plot=False, mirror=False |
|
): |
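    """
    Build the leaderboard tab: intro markdown, category controls, the ranking
    table, and (optionally) the statistics plots, wired to update callbacks.
    """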
|
|
|
try: |
|
last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df = read_elo_file(elo_results_file, leaderboard_table_file) |
|
except Exception as e: |
|
print(f"Failed to load initial data: {e}") |
|
|
|
last_updated_time = "Error" |
|
arena_dfs = {} |
|
category_elo_results = {} |
|
elo_results = {} |
|
model_table_df = pd.DataFrame() |
|
|
|
|
|
|
|
if selected_category_display_name in arena_dfs: |
|
arena_df = arena_dfs[selected_category_display_name] |
|
elo_subset_results_init = category_elo_results[selected_category_display_name] |
|
p1_init = elo_subset_results_init.get("win_fraction_heatmap") |
|
p2_init = elo_subset_results_init.get("battle_count_heatmap") |
|
p3_init = elo_subset_results_init.get("bootstrap_elo_rating") |
|
p4_init = elo_subset_results_init.get("average_win_rate_bar") |
|
else: |
|
|
|
fallback_cat_display_name = None |
|
if actual_categories: |
|
|
|
first_cat_key = actual_categories[0] |
|
fallback_cat_display_name = key_to_category_name.get(first_cat_key, first_cat_key) |
|
|
|
if fallback_cat_display_name and fallback_cat_display_name in arena_dfs: |
|
print(f"Warning: Selected category '{selected_category_display_name}' not found. Falling back to '{fallback_cat_display_name}'.") |
|
arena_df = arena_dfs[fallback_cat_display_name] |
|
elo_subset_results_init = category_elo_results[fallback_cat_display_name] |
|
p1_init = elo_subset_results_init.get("win_fraction_heatmap") |
|
p2_init = elo_subset_results_init.get("battle_count_heatmap") |
|
p3_init = elo_subset_results_init.get("bootstrap_elo_rating") |
|
p4_init = elo_subset_results_init.get("average_win_rate_bar") |
|
else: |
|
print(f"Warning: Default category '{selected_category_display_name}' and fallback categories not found in data.") |
|
arena_df = pd.DataFrame() |
|
p1_init, p2_init, p3_init, p4_init = None, None, None, None |
|
|
|
|
|
p1_init = filter_deprecated_models_plots(p1_init, hidden_models=deprecated_model_name) |
|
p2_init = filter_deprecated_models_plots(p2_init, hidden_models=deprecated_model_name) |
|
p3_init = filter_deprecated_models_plots(p3_init, hidden_models=deprecated_model_name) |
|
p4_init = filter_deprecated_models_plots(p4_init, hidden_models=deprecated_model_name) |
|
|
|
default_md = make_default_md_1() |
|
default_md_2 = make_default_md_2() |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=4): |
|
|
|
md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown") |
|
with gr.Column(scale=1): |
|
vote_button = gr.Button("Vote!", link="https://llmarena.ru") |
|
md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown") |
|
|
|
|
|
if not arena_df.empty and not model_table_df.empty: |
|
|
|
arena_table_vals_init = get_arena_table(arena_df, model_table_df, hidden_models=deprecated_model_name) |
|
else: |
|
arena_table_vals_init = [] |
|
|
|
|
|
with gr.Tab("Arena", id=0): |
|
md_arena = make_arena_leaderboard_md(arena_df, last_updated_time) |
|
lb_description = gr.Markdown(md_arena, elem_id="leaderboard_markdown") |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=2): |
|
|
|
|
|
category_dropdown = gr.Dropdown( |
|
|
|
choices=actual_categories, |
|
value=selected_category_key, |
|
label="Category", |
|
) |
|
|
|
with gr.Column(scale=2): |
|
category_checkbox = gr.CheckboxGroup( |
|
|
|
["Show Deprecated", "Only <10B Models"], |
|
label="Apply Filter", |
|
info="", |
|
value=[], |
|
) |
|
|
|
|
|
default_category_details = make_category_arena_leaderboard_md( |
|
arena_df, arena_df, name=selected_category_display_name |
|
) if not arena_df.empty else "No data for category" |
|
|
|
with gr.Column(scale=4, variant="panel"): |
|
category_deets = gr.Markdown( |
|
default_category_details, elem_id="category_deets" |
|
) |
|
|
|
|
|
|
|
arena_vals = pd.DataFrame( |
|
arena_table_vals_init, |
|
columns=[ |
|
"Rank* (UB)", "Model", "Arena Elo", "95% CI", |
|
"Votes", "Organization", "License", "Knowledge Cutoff" |
|
] |
|
) if arena_table_vals_init else pd.DataFrame(columns=[ |
|
"Rank* (UB)", "Model", "Arena Elo", "95% CI", |
|
"Votes", "Organization", "License", "Knowledge Cutoff" |
|
]) |
|
|
|
|
|
if "Arena Elo" in arena_vals.columns: |
|
arena_vals = arena_vals.sort_values(by="Arena Elo", ascending=False) |
|
|
|
|
|
elo_display_df = gr.Dataframe( |
|
headers=[ |
|
"Rank* (UB)", "Model", "Arena Elo", "95% CI", |
|
"Votes", "Organization", "License", "Knowledge Cutoff" |
|
], |
|
datatype=[ |
|
"str", "markdown", "number", "str", |
|
"number", "str", "str", "str" |
|
], |
|
value=arena_vals.style, |
|
elem_id="arena_leaderboard_dataframe", |
|
height=700, |
|
column_widths=[70, 190, 100, 100, 90, 130, 150, 100], |
|
wrap=True, |
|
) |
|
|
|
gr.Markdown(elem_id="leaderboard_markdown") |
|
|
|
plot_1, plot_2, plot_3, plot_4 = None, None, None, None |
|
more_stats_md = None |
|
if show_plot: |
|
more_stats_md = gr.Markdown( |
|
f"""## More Statistics for Chatbot Arena""", |
|
elem_id="leaderboard_header_markdown", |
|
) |
|
with gr.Row(elem_id="leaderboard_bars"): |
|
with gr.Column(): |
|
gr.Markdown( |
|
"#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)", |
|
elem_id="plot-title", |
|
) |
|
plot_3 = gr.Plot(p3_init, show_label=False) |
|
with gr.Column(): |
|
gr.Markdown( |
|
"#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)", |
|
elem_id="plot-title", |
|
) |
|
plot_4 = gr.Plot(p4_init, show_label=False) |
|
with gr.Row(elem_id="leaderboard_plots"): |
|
with gr.Column(): |
|
gr.Markdown( |
|
"#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles", |
|
elem_id="plot-title", |
|
) |
|
plot_1 = gr.Plot( |
|
p1_init, show_label=False, elem_id="plot-container" |
|
) |
|
with gr.Column(): |
|
gr.Markdown( |
|
"#### Figure 4: Battle Count for Each Combination of Models (without Ties)", |
|
elem_id="plot-title", |
|
) |
|
plot_2 = gr.Plot(p2_init, show_label=False) |
|
|
|
def update_leaderboard_df(arena_table_vals): |
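            """Return a styled DataFrame for rows that include the Delta column, coloring rank arrows and rank changes."""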
|
|
|
|
|
if not arena_table_vals or not isinstance(arena_table_vals, list) or not arena_table_vals[0] or len(arena_table_vals[0]) != 9: |
|
print("Warning: Invalid data for styling in update_leaderboard_df. Returning empty DataFrame.") |
|
|
|
empty_styled = pd.DataFrame(columns=[ |
|
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI", |
|
"Votes", "Organization", "License", "Knowledge Cutoff" |
|
]).style |
|
return empty_styled |
|
|
|
try: |
|
                elo_dataframe = pd.DataFrame(
|
arena_table_vals, |
|
columns=[ |
|
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI", |
|
"Votes", "Organization", "License", "Knowledge Cutoff" |
|
], |
|
) |
|
|
|
def highlight_max(s): |
|
|
|
                    return [
                        "color: green; font-weight: bold" if "↑" in str(v) else
                        "color: red; font-weight: bold" if "↓" in str(v) else ""
                        for v in s
                    ]
|
|
|
def highlight_rank_max(s): |
|
|
|
return [ |
|
"color: green; font-weight: bold" if isinstance(v, (int, float)) and v > 0 else |
|
"color: red; font-weight: bold" if isinstance(v, (int, float)) and v < 0 else "" |
|
for v in s |
|
] |
|
|
|
                styled_df = elo_dataframe.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
|
highlight_rank_max, subset=["Delta"] |
|
) |
|
return styled_df |
|
|
|
except Exception as e: |
|
print(f"Error applying styles in update_leaderboard_df: {e}") |
|
traceback.print_exc() |
|
|
|
return pd.DataFrame(arena_table_vals, columns=[ |
|
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI", |
|
"Votes", "Organization", "License", "Knowledge Cutoff" |
|
]).style |
|
|
|
def update_leaderboard_and_plots(category_key, filters): |
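            """Reload the data files and rebuild the table, plots, and markdown blocks for the selected category and filters."""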
|
|
|
|
|
try: |
|
current_last_updated_time, current_arena_dfs, current_category_elo_results, _, current_model_table_df = read_elo_file(elo_results_file, leaderboard_table_file) |
|
except Exception as e: |
|
print(f"Error reloading data in callback: {e}") |
|
|
|
empty_df_update = gr.Dataframe(value=pd.DataFrame().style) |
|
empty_plot_update = gr.Plot(value=None) |
|
empty_md_update = gr.Markdown(value="Error loading data.") |
|
|
|
                # outputs_list always reserves four plot slots, so return four plot updates even when plots are hidden
                num_plots = 4
|
return [empty_df_update] + [empty_plot_update] * num_plots + [empty_md_update, empty_md_update] |
|
|
|
|
|
|
|
category_display_name = key_to_category_name.get(category_key, category_key) |
|
|
|
|
|
if not current_arena_dfs or category_display_name not in current_arena_dfs or category_display_name not in current_category_elo_results or current_model_table_df.empty: |
|
print(f"Warning: Data missing for category '{category_display_name}' (key: '{category_key}') after reload.") |
|
empty_df_update = gr.Dataframe(value=pd.DataFrame().style) |
|
empty_plot_update = gr.Plot(value=None) |
|
empty_md_update = gr.Markdown(value=f"No data available for category: {category_display_name}") |
|
                num_plots = 4
|
|
|
return [empty_df_update] + [empty_plot_update] * num_plots + [empty_md_update, empty_md_update] |
|
|
|
|
|
arena_subset_df = current_arena_dfs[category_display_name] |
|
elo_subset_results = current_category_elo_results[category_display_name] |
|
|
|
|
|
baseline_key = "site_visitors/medium_prompts:style control" |
|
baseline_display_name = key_to_category_name.get(baseline_key, baseline_key) |
|
|
|
|
|
if baseline_display_name not in current_arena_dfs: |
|
print(f"Warning: Baseline category '{baseline_display_name}' not found. Using selected category '{category_display_name}' as baseline.") |
|
baseline_display_name = category_display_name |
|
|
|
arena_df_baseline = current_arena_dfs[baseline_display_name] |
|
|
|
|
|
hidden_models_list = None |
|
|
|
if "Show Deprecated" not in filters: |
|
hidden_models_list = deprecated_model_name.copy() |
|
|
|
if "Only <10B Models" in filters: |
|
|
|
all_models_in_view = arena_df_baseline.index.tolist() |
|
|
|
models_to_hide = [model for model in all_models_in_view if model not in models_10b] |
|
|
|
if hidden_models_list is None: |
|
hidden_models_list = models_to_hide |
|
else: |
|
|
|
hidden_models_list = list(set(hidden_models_list + models_to_hide)) |
|
|
|
arena_table_values = get_arena_table( |
|
arena_df_baseline, |
|
current_model_table_df, |
|
|
|
arena_subset_df=(arena_subset_df if category_display_name != baseline_display_name else None), |
|
hidden_models=hidden_models_list |
|
) |
|
|
|
dataframe_update = None |
|
|
|
if category_display_name != baseline_display_name and arena_table_values: |
|
styled_arena_values = update_leaderboard_df(arena_table_values) |
|
|
|
if isinstance(styled_arena_values, pd.io.formats.style.Styler) and not styled_arena_values.data.empty: |
|
dataframe_update = gr.Dataframe( |
|
headers=[ |
|
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI", |
|
"Votes", "Organization", "License", "Knowledge Cutoff" |
|
], |
|
datatype=[ |
|
"str", "number", "markdown", "number", "str", |
|
"number", "str", "str", "str" |
|
], |
|
value=styled_arena_values, |
|
elem_id="arena_leaderboard_dataframe", |
|
height=700, |
|
column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100], |
|
wrap=True, |
|
) |
|
else: |
|
dataframe_update = gr.Dataframe(value=pd.DataFrame().style) |
|
|
|
else: |
|
|
|
if arena_table_values: |
|
|
|
df_no_delta = pd.DataFrame(arena_table_values, columns=[ |
|
"Rank* (UB)", "Model", "Arena Elo", "95% CI", |
|
"Votes", "Organization", "License", "Knowledge Cutoff" |
|
]) |
|
dataframe_update = gr.Dataframe( |
|
headers=[ |
|
"Rank* (UB)", "Model", "Arena Elo", "95% CI", |
|
"Votes", "Organization", "License", "Knowledge Cutoff" |
|
], |
|
datatype=[ |
|
"str", "markdown", "number", "str", "number", |
|
"str", "str", "str" |
|
], |
|
value=df_no_delta.style, |
|
elem_id="arena_leaderboard_dataframe", |
|
height=700, |
|
column_widths=[70, 190, 100, 100, 90, 130, 150, 100], |
|
wrap=True, |
|
) |
|
else: |
|
dataframe_update = gr.Dataframe(value=pd.DataFrame().style) |
|
|
|
plot_updates = [gr.Plot(value=None)] * 4 |
|
if show_plot: |
|
p1_updated = elo_subset_results.get("win_fraction_heatmap") |
|
p2_updated = elo_subset_results.get("battle_count_heatmap") |
|
p3_updated = elo_subset_results.get("bootstrap_elo_rating") |
|
p4_updated = elo_subset_results.get("average_win_rate_bar") |
|
|
|
|
|
p1_filtered = filter_deprecated_models_plots(p1_updated, hidden_models=hidden_models_list) |
|
p2_filtered = filter_deprecated_models_plots(p2_updated, hidden_models=hidden_models_list) |
|
p3_filtered = filter_deprecated_models_plots(p3_updated, hidden_models=hidden_models_list) |
|
p4_filtered = filter_deprecated_models_plots(p4_updated, hidden_models=hidden_models_list) |
|
plot_updates = [p1_filtered, p2_filtered, p3_filtered, p4_filtered] |
|
|
|
|
|
more_stats_md_updated_text = f"""## More Statistics for Chatbot Arena - {category_display_name} """ if show_plot else "" |
|
more_stats_md_update = gr.Markdown(value=more_stats_md_updated_text) |
|
|
|
|
|
category_details_md_updated_text = make_category_arena_leaderboard_md( |
|
arena_df_baseline, arena_subset_df, name=category_display_name |
|
) |
|
category_deets_update = gr.Markdown(value=category_details_md_updated_text) |
|
|
|
|
|
|
|
return [dataframe_update] + plot_updates + [more_stats_md_update, category_deets_update] |
|
|
|
|
|
|
|
outputs_list = [elo_display_df] |
|
if show_plot: |
|
|
|
outputs_list.extend([plot_1, plot_2, plot_3, plot_4]) |
|
|
|
if more_stats_md: outputs_list.append(more_stats_md) |
|
else: outputs_list.append(gr.Markdown(visible=False)) |
|
else: |
|
|
|
outputs_list.extend([gr.Plot(visible=False)] * 4) |
|
outputs_list.append(gr.Markdown(visible=False)) |
|
outputs_list.append(category_deets) |
|
|
|
|
|
category_dropdown.change( |
|
fn=update_leaderboard_and_plots, |
|
inputs=[category_dropdown, category_checkbox], |
|
outputs=outputs_list |
|
) |
|
category_checkbox.change( |
|
fn=update_leaderboard_and_plots, |
|
inputs=[category_dropdown, category_checkbox], |
|
outputs=outputs_list |
|
) |
|
|
|
|
|
return_components = [md_1, md_2, lb_description, category_deets, elo_display_df] |
|
if show_plot: |
|
|
|
return_components.extend([plot_1, plot_2, plot_3, plot_4]) |
|
|
|
if more_stats_md: return_components.append(more_stats_md) |
|
|
|
|
|
return return_components |
|
|
|
|
|
def build_demo(elo_results_file, leaderboard_table_file): |
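    """Assemble the Gradio Blocks app: CSS, theme, and the leaderboard tab."""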
|
|
|
try: |
|
from fastchat.serve.gradio_web_server import block_css |
|
except ImportError: |
|
print("Warning: fastchat.serve.gradio_web_server.block_css not found. Using fallback CSS.") |
|
|
|
block_css = """ |
|
/* Add minimal CSS rules here if needed */ |
|
#arena_leaderboard_dataframe table { font-size: 105%; } |
|
#leaderboard_markdown .prose { font-size: 110% !important; } |
|
.app { max-width: 100% !important; padding: 20px !important; } |
|
a { color: #1976D2; text-decoration: none; } |
|
a:hover { color: #63A4FF; text-decoration: underline; } |
|
""" |
|
|
|
text_size = gr.themes.sizes.text_lg |
|
|
|
try: |
|
theme = gr.themes.Default.load("theme.json") |
|
    except Exception:
|
print("Warning: theme.json not found. Using default Gradio theme.") |
|
theme = gr.themes.Default(text_size=text_size) |
|
|
|
if hasattr(theme, 'text_size'): theme.text_size = text_size |
|
|
|
if hasattr(theme, 'set'): |
|
theme.set( |
|
button_large_text_size="40px", |
|
button_small_text_size="40px", |
|
button_large_text_weight="1000", |
|
button_small_text_weight="1000", |
|
button_shadow="*shadow_drop_lg", |
|
button_shadow_hover="*shadow_drop_lg", |
|
checkbox_label_shadow="*shadow_drop_lg", |
|
button_shadow_active="*shadow_inset", |
|
button_secondary_background_fill="*primary_300", |
|
button_secondary_background_fill_dark="*primary_700", |
|
button_secondary_background_fill_hover="*primary_200", |
|
button_secondary_background_fill_hover_dark="*primary_500", |
|
button_secondary_text_color="*primary_800", |
|
button_secondary_text_color_dark="white", |
|
) |
|
|
|
with gr.Blocks( |
|
title="LLM Arena: Leaderboard", |
|
theme=theme, |
|
css=block_css, |
|
) as demo: |
|
|
|
|
|
leader_components = build_leaderboard_tab( |
|
elo_results_file, leaderboard_table_file, show_plot=True, mirror=False |
|
) |
|
return demo |
|
|
|
|
|
if __name__ == "__main__": |
|
parser = argparse.ArgumentParser() |
|
parser.add_argument("--share", action="store_true", default=False) |
|
parser.add_argument("--host", default="0.0.0.0") |
|
parser.add_argument("--port", type=int, default=7860) |
|
|
|
args = parser.parse_args() |
|
try: |
|
elo_result_files = glob.glob("elo_results_*.pkl") |
|
if not elo_result_files: |
|
raise FileNotFoundError("No elo_results_*.pkl files found.") |
|
|
|
elo_result_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0])) |
|
elo_result_file = elo_result_files[-1] |
|
print(f"Using Elo results file: {elo_result_file}") |
|
except Exception as e: |
|
print(f"Error finding Elo results file: {e}") |
|
print("Please ensure a file matching 'elo_results_NUMBER.pkl' exists.") |
|
exit(1) |
|
|
|
try: |
|
leaderboard_table_files = glob.glob("leaderboard_table_*.csv") |
|
if not leaderboard_table_files: |
|
raise FileNotFoundError("No leaderboard_table_*.csv files found.") |
|
leaderboard_table_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0])) |
|
leaderboard_table_file = leaderboard_table_files[-1] |
|
print(f"Using leaderboard table file: {leaderboard_table_file}") |
|
except Exception as e: |
|
print(f"Error finding leaderboard table file: {e}") |
|
print("Please ensure a file matching 'leaderboard_table_NUMBER.csv' exists.") |
|
exit(1) |
|
|
|
|
|
demo = build_demo(elo_result_file, leaderboard_table_file) |
|
|
|
demo.launch( |
|
server_name=args.host, |
|
server_port=args.port, |
|
share=args.share, |
|
show_api=False |
|
) |