import argparse
import ast
import glob
import pickle
import traceback
import numpy as np
import pandas as pd
import gradio as gr
promo_banner = """
USE THE LATEST VERSIONS OF THE BEST CHATBOTS IN RUSSIAN FOR FREE
"""
deprecated_model_name = [
"GigaChat 3.1.25.3",
"GigaChat-Pro 2.2.25.3",
"saiga_llama3_8b_v6",
"saiga_phi3_medium",
"GigaChat-Plus 3.1.25.3",
"GigaChat-Pro 4.0.26.8",
"GigaChat 4.0.26.8",
"xAI: Grok 2",
"GigaChat-Pro 4.0.26.15",
"GigaChat 4.0.26.15",
"YandexGPT Experimental", "yandex-gpt-arena",
"RefalMachine/ruadapt_llama3_instruct_lep_saiga_kto_ablitirated"
]
models_10b = [
"saiga_llama3_8b_v7",
"Vikhrmodels/Vikhr-YandexGPT-5-Lite-8B-it",
"T-lite-instruct-0.1",
"t-tech/T-lite-it-1.0",
"LLaMA-3 Chat (8B)",
"Llama 3.1 8B Instruct Turbo",
"MTSAIR/Cotype-Nano"
]
def make_default_md_1():
leaderboard_md = f"""
# 🏆 LLM Arena in Russian: Leaderboard
{promo_banner}
"""
return leaderboard_md
def make_default_md_2():
leaderboard_md = f"""
LLM Arena is an open crowdsourced platform for evaluating large language models (LLMs) in Russian. We collect pairwise comparisons from people, rank LLMs with the Bradley-Terry model, and display model ratings on the Elo scale.
Chatbot Arena in Russian depends on community participation, so please contribute by casting your vote!
- To **add your model** to the comparison, contact us on TG: [Group](https://t.me/+bFEOl-Bdmok4NGUy)
- If you **found a bug** or **have a suggestion**, contact us: [Roman](https://t.me/roman_kucev)
- You can **contribute your vote** at [llmarena.ru](https://llmarena.ru/)!
"""
return leaderboard_md
def make_arena_leaderboard_md(arena_df, last_updated_time):
# Using version from monitor.py (translated)
total_votes = sum(arena_df["num_battles"]) if not arena_df.empty else 0
total_models = len(arena_df)
space = " " # Using HTML space
leaderboard_md = f"""
Total # of models: **{total_models}**.{space} Total # of votes: **{total_votes:,}**.{space} Last updated: {last_updated_time}.
***Rank (UB)**: the model's rank (upper bound), determined as one plus the number of models that are statistically better than the target model.
Model A is statistically better than Model B when the lower bound of Model A's rating is higher than the upper bound of Model B's rating (with a 95% confidence interval).
See Figure 1 below for a visualization of the confidence intervals of model ratings.
"""
return leaderboard_md
def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="site_visitors/medium_prompts:style control"):
total_votes = sum(arena_df["num_battles"]) if not arena_df.empty else 0
total_models = len(arena_df)
space = " "
total_subset_votes = sum(arena_subset_df["num_battles"]) if not arena_subset_df.empty else 0
total_subset_models = len(arena_subset_df)
perc_models = round(total_subset_models / total_models * 100) if total_models > 0 else 0
perc_votes = round(total_subset_votes / total_votes * 100) if total_votes > 0 else 0
leaderboard_md = f"""### {cat_name_to_explanation.get(name, name)}
#### {space} #models: **{total_subset_models} ({perc_models}%)** {space} #votes: **{total_subset_votes:,} ({perc_votes}%)**{space}
"""
return leaderboard_md
def model_hyperlink(model_name, link):
    # Render the model name as an HTML link; the table's "markdown" column type displays it as a clickable anchor.
    return f'<a target="_blank" href="{link}">{model_name}</a>'
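# Example (hypothetical name/link): model_hyperlink("SomeModel", "https://example.com")
# returns '<a target="_blank" href="https://example.com">SomeModel</a>'.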
def filter_deprecated_models_plots(fig, hidden_models=None, limit_to_top=25):
"""
Filters Plotly plots to show only top N models and optionally removes specific models.
Args:
fig: The Plotly figure object.
hidden_models (list, optional): A list of model names to remove. Defaults to None.
limit_to_top (int, optional): Limit display to top N models (0 or None means no limit). Defaults to 25.
Returns:
Plotly figure: The filtered figure object or the original if filtering fails or is not applicable.
"""
if fig is None:
return None
# Check if the figure has data
if not hasattr(fig, 'data') or len(fig.data) == 0:
return fig
# Check if data has a type attribute
if not hasattr(fig.data[0], 'type'):
return fig
# Check minimum number of models after initial hidden_models filtering
    models_to_check = []
    if getattr(fig.data[0], 'x', None) is not None:
        models_to_check = fig.data[0].x
    elif getattr(fig.data[0], 'y', None) is not None:  # Fall back to y when model names are not on the x axis
        models_to_check = fig.data[0].y
    if hidden_models is not None and len(models_to_check) > 0:
available_models = [x for x in models_to_check if x not in hidden_models]
# print(f"Available models before top N: {len(available_models)}") # Debug
if len(available_models) <= 2: # If less than 3 models remain before top_n
# print(f"Warning: Too few models left after initial filtering ({len(available_models)}), returning original plot.")
return fig # Return the original plot if too few models
if limit_to_top is not None and limit_to_top <= 0:
limit_to_top = None
try:
# Work on a deep copy to avoid modifying the original figure object
fig_copy = pickle.loads(pickle.dumps(fig))
data = fig_copy.data[0]
if data.type == 'heatmap':
# Apply hidden models filter
mask_x = ~np.isin(data.x, hidden_models) if hidden_models is not None else np.ones_like(data.x, dtype=bool)
mask_y = ~np.isin(data.y, hidden_models) if hidden_models is not None else np.ones_like(data.y, dtype=bool)
# Get initially filtered X and Y arrays
filtered_x = np.array(data.x)[mask_x]
filtered_y = np.array(data.y)[mask_y]
# Apply top N limit (assuming the order is already by rank/rating)
if limit_to_top is not None and len(filtered_x) > limit_to_top:
top_models = filtered_x[:limit_to_top]
# Create new masks based on the top models relative to the *original* data axes
mask_x = np.isin(data.x, top_models)
mask_y = np.isin(data.y, top_models)
# Get final filtered axes
filtered_x = np.array(data.x)[mask_x]
filtered_y = np.array(data.y)[mask_y]
elif len(filtered_x) <= 2: # If <=2 models remain after filtering
return fig # Return original
# Update the heatmap data
data.x = filtered_x
data.y = filtered_y
# Important: Indexing 'z' must use masks derived from the *original* data order
z_original = np.array(fig.data[0].z)
data.z = z_original[np.ix_(mask_y, mask_x)]
elif data.type == 'scatter':
trace = data
# Apply hidden models filter
mask = ~np.isin(trace.x, hidden_models) if hidden_models is not None else np.ones_like(trace.x, dtype=bool)
# Get initially filtered arrays
current_x = np.array(trace.x)[mask]
current_y = np.array(trace.y)[mask]
current_text = np.array(trace.text)[mask] if hasattr(trace, 'text') and trace.text is not None else None
# Handle error bars safely
current_error_y_array = np.array(trace.error_y['array'])[mask] if 'error_y' in trace and 'array' in trace.error_y and trace.error_y['array'] is not None else None
current_error_y_arrayminus = np.array(trace.error_y['arrayminus'])[mask] if 'error_y' in trace and 'arrayminus' in trace.error_y and trace.error_y['arrayminus'] is not None else None
# Apply top N limit
if limit_to_top is not None and len(current_x) > limit_to_top:
# Sort by y-value (rating) descending to find the top N
sort_indices = np.argsort(-current_y)[:limit_to_top]
current_x = current_x[sort_indices]
current_y = current_y[sort_indices]
if current_text is not None:
current_text = current_text[sort_indices]
if current_error_y_array is not None:
current_error_y_array = current_error_y_array[sort_indices]
if current_error_y_arrayminus is not None:
current_error_y_arrayminus = current_error_y_arrayminus[sort_indices]
elif len(current_x) <= 2: # If <=2 models remain after filtering
return fig # Return original
# Update the scatter trace data
trace.x, trace.y = current_x, current_y
if current_text is not None:
trace.text = current_text
# Update error bars if they exist
if current_error_y_array is not None:
# Ensure error_y exists before assigning
if 'error_y' not in trace: trace.error_y = {}
trace.error_y['array'] = current_error_y_array
if current_error_y_arrayminus is not None:
if 'error_y' not in trace: trace.error_y = {}
trace.error_y['arrayminus'] = current_error_y_arrayminus
elif data.type == 'bar':
trace = data
# Apply hidden models filter
mask = ~np.isin(trace.x, hidden_models) if hidden_models is not None else np.ones_like(trace.x, dtype=bool)
# Get initially filtered arrays
current_x = np.array(trace.x)[mask]
current_y = np.array(trace.y)[mask]
# Apply top N limit
if limit_to_top is not None and len(current_x) > limit_to_top:
# Sort by y-value (rating) descending
sort_indices = np.argsort(-current_y)[:limit_to_top]
current_x = current_x[sort_indices]
current_y = current_y[sort_indices]
elif len(current_x) <= 2: # If <=2 models remain after filtering
return fig # Return original
# Update the bar trace data
trace.x, trace.y = current_x, current_y
return fig_copy
except Exception as e:
print(f"Error filtering plot: {e}")
traceback.print_exc()
return fig # Return original figure on error
def load_leaderboard_table_csv(filename, add_hyperlink=True):
    # Naive CSV parsing (no quoting support): fields are assumed to never contain commas.
    with open(filename, encoding="utf-8") as fin:
        lines = fin.readlines()
    heads = [v.strip() for v in lines[0].split(",")]
rows = []
for i in range(1, len(lines)):
row = [v.strip() for v in lines[i].split(",")]
item = {} # Create dictionary once per row
for h, v in zip(heads, row):
if h == "Arena Elo rating":
if v != "-":
try:
v = int(ast.literal_eval(v))
                    except (ValueError, SyntaxError, TypeError):
v = np.nan # Handle parsing errors
else:
v = np.nan
item[h] = v
if add_hyperlink and "Model" in item and "Link" in item: # Check keys exist
# Check for empty/missing link
if item["Link"] and item["Link"] != "-":
item["Model"] = model_hyperlink(item["Model"], item["Link"])
# Otherwise, keep the model name as is
rows.append(item)
return rows
def create_ranking_str(ranking, ranking_difference):
# Convert rank to int before comparison
try:
# Ensure rank and difference are treated as numbers
ranking_val = int(float(ranking)) # Handle potential float input
ranking_difference_val = int(float(ranking_difference))
if ranking_difference_val > 0:
return f"{ranking_val} ↑"
elif ranking_difference_val < 0:
return f"{ranking_val} ↓"
else:
return f"{ranking_val}"
except (ValueError, TypeError): # Handle cases where rank is not numeric
return str(ranking)
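# Examples: create_ranking_str(3, 2) -> "3 ↑"; create_ranking_str(5, -1) -> "5 ↓";
# a non-numeric rank such as create_ranking_str("N/A", 0) falls back to "N/A".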
def recompute_final_ranking(arena_df):
ranking = {}
if arena_df.empty:
return []
model_indices = arena_df.index
# Ensure CI columns exist before trying to access them
if "rating_q025" not in arena_df.columns or "rating_q975" not in arena_df.columns:
print("Warning: Confidence interval columns ('rating_q025', 'rating_q975') not found in DataFrame. Cannot compute UB Rank.")
# Return NaN or simple rank based on order
return [np.nan] * len(model_indices) # Or range(1, len(model_indices) + 1)
ratings_q025 = arena_df["rating_q025"].to_dict()
ratings_q975 = arena_df["rating_q975"].to_dict()
for model_a in model_indices:
rank = 1
rating_a_q975 = ratings_q975.get(model_a)
# Skip if model A has no CI data
if pd.isna(rating_a_q975):
ranking[model_a] = np.nan # Or assign max rank + 1
continue
for model_b in model_indices:
if model_a == model_b:
continue
rating_b_q025 = ratings_q025.get(model_b)
# Skip comparison if model B has no CI data
if pd.isna(rating_b_q025):
continue
# Check if B is statistically better than A
if rating_b_q025 > rating_a_q975:
rank += 1
ranking[model_a] = rank
return list(ranking.values())
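# Worked example (hypothetical ratings): with 95% CIs A = [1480, 1520], B = [1390, 1410],
# C = [1380, 1412], A's lower bound exceeds both other upper bounds, so A gets rank 1,
# while B and C each get rank 2 (only A is statistically better than either of them).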
def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
"""
Generates the leaderboard table data.
'use_cache' parameter removed.
"""
# print(f'Calculating get_arena_table') # Debug
# Create copies to avoid modifying original DataFrames
arena_df_processed = arena_df.copy()
if arena_subset_df is not None:
arena_subset_df_processed = arena_subset_df.copy()
else:
arena_subset_df_processed = None
# Sort by rating initially to have a stable order before ranking
arena_df_processed = arena_df_processed.sort_values(by=["rating"], ascending=False)
# Compute 'final_ranking' based on CIs if possible
if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns:
arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed)
arena_df_processed = arena_df_processed.sort_values(
by=["final_ranking", "rating"], ascending=[True, False]
)
else:
# Fallback to simple ordering if CI columns are missing
arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1)
if hidden_models:
arena_df_processed = arena_df_processed[~arena_df_processed.index.isin(hidden_models)].copy()
# Recompute ranks for the filtered view
if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns:
arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed)
# Re-sort based on new ranks
arena_df_processed = arena_df_processed.sort_values(
by=["final_ranking", "rating"], ascending=[True, False]
)
else:
arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1)
if arena_subset_df_processed is not None:
# Filter subset by hidden_models first
if hidden_models:
arena_subset_df_processed = arena_subset_df_processed[~arena_subset_df_processed.index.isin(hidden_models)].copy()
# Ensure models in the subset are also present in the (filtered) main view
arena_subset_df_processed = arena_subset_df_processed[arena_subset_df_processed.index.isin(arena_df_processed.index)]
# Proceed only if subset is not empty and has CI columns
if not arena_subset_df_processed.empty and "rating_q025" in arena_subset_df_processed.columns and "rating_q975" in arena_subset_df_processed.columns:
# Rank within the subset
arena_subset_df_processed = arena_subset_df_processed.sort_values(by=["rating"], ascending=False)
arena_subset_df_processed["final_ranking_subset"] = recompute_final_ranking(arena_subset_df_processed) # Rank within category
# Filter the main processed DF to only include models from the subset
# 'final_ranking' here represents the rank *among these models* in the baseline category view
arena_df_for_join = arena_df_processed[arena_df_processed.index.isin(arena_subset_df_processed.index)][["final_ranking", "rating"]].copy()
arena_df_for_join.rename(columns={"final_ranking": "final_ranking_baseline"}, inplace=True)
# Join the subset ranks and baseline ranks
arena_df_combined = arena_subset_df_processed[["final_ranking_subset", "rating"]].join(
arena_df_for_join["final_ranking_baseline"], how="inner"
)
# Calculate rank difference
arena_df_combined["ranking_difference"] = arena_df_combined["final_ranking_baseline"] - arena_df_combined["final_ranking_subset"]
# Sort by subset rank and rating
arena_df_combined = arena_df_combined.sort_values(
by=["final_ranking_subset", "rating"], ascending=[True, False]
)
# Format the rank string with delta for display
arena_df_combined["display_ranking"] = arena_df_combined.apply(
lambda x: create_ranking_str(x["final_ranking_subset"], x["ranking_difference"]),
axis=1,
)
arena_df_processed = arena_df_processed.loc[arena_df_combined.index] # Reorder arena_df_processed
columns_to_join = ["display_ranking", "ranking_difference", "final_ranking_subset"]
columns_to_join = [col for col in columns_to_join if col in arena_df_combined.columns]
arena_df_processed = arena_df_processed.join(arena_df_combined[columns_to_join], how="inner")
# Now sorting should work as the column exists
# Use the subset rank for final sorting if subset is active
# Check if 'final_ranking_subset' was successfully joined before sorting
if "final_ranking_subset" in arena_df_processed.columns:
arena_df_processed.sort_values(by=["final_ranking_subset", "rating"], ascending=[True, False], inplace=True)
else:
# Fallback sort if join failed for some reason
arena_df_processed.sort_values(by=["rating"], ascending=False, inplace=True)
else:
# If subset is empty or lacks CI, disable subset logic
arena_subset_df_processed = None
# Use the baseline ranking as the display ranking
arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str)
arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True)
else:
# If no subset is used, display ranking is just the final rank from the main DF
arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str)
# Ensure it's sorted correctly
arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True)
values = []
# Iterate using the final sorted index of arena_df_processed
for model_key in arena_df_processed.index:
row_data = arena_df_processed.loc[model_key]
# Find model metadata
model_info = model_table_df[model_table_df["key"] == model_key]
if model_info.empty:
# print(f"Warning: Model key '{model_key}' not found in model_table_df. Skipping.")
continue # Skip if no metadata
row = []
# Rank (Display)
row.append(row_data.get("display_ranking", "")) # Use the calculated display rank
# Delta (only if subset was processed successfully)
if arena_subset_df_processed is not None:
row.append(row_data.get("ranking_difference", 0))
# Model Name (hyperlink applied during loading)
row.append(model_info["Model"].values[0])
# Arena Elo
row.append(round(row_data["rating"]))
# 95% CI
# Check for NaN before calculation
upper_rating = row_data.get("rating_q975")
lower_rating = row_data.get("rating_q025")
current_rating = row_data.get("rating")
upper_diff = round(upper_rating - current_rating) if pd.notna(upper_rating) and pd.notna(current_rating) else '?'
lower_diff = round(current_rating - lower_rating) if pd.notna(current_rating) and pd.notna(lower_rating) else '?'
row.append(f"+{upper_diff}/-{lower_diff}")
# Votes
row.append(round(row_data["num_battles"]))
# Organization
row.append(model_info["Organization"].values[0])
# License
row.append(model_info["License"].values[0])
# Knowledge Cutoff
cutoff_date = model_info["Knowledge cutoff date"].values[0]
row.append("Unknown" if cutoff_date == "-" else cutoff_date)
values.append(row)
return values
key_to_category_name = {
# Mapping from internal key to display name (kept English for consistency)
"full": "Overall", # Might not be used if filtered out later
"crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
"site_visitors/medium_prompts": "site_visitors/medium_prompts",
"site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style_control" # Use underscore for display consistency if needed
}
cat_name_to_explanation = {
# Translated explanations for display
"Overall": "All queries",
"crowdsourcing/simple_prompts": "Queries collected via crowdsourcing. Mostly simple ones.",
"site_visitors/medium_prompts": "Queries from website visitors. Contain more complex prompts.",
"site_visitors/medium_prompts:style_control": "Queries from website visitors. Contain more complex prompts. [Reduced stylistic influence](https://lmsys.org/blog/2024-08-28-style-control/) of the response on the rating."
}
cat_name_to_baseline = {
    # Baseline category overrides for comparison (currently unused, kept for possible future use)
# "Hard Prompts (English)": "English",
}
actual_categories = [
# Categories available in the dropdown (use the *keys* from key_to_category_name)
# "Overall", # Removed
# "crowdsourcing/simple_prompts", # Removed
"site_visitors/medium_prompts",
"site_visitors/medium_prompts:style control"
]
# Default selected category key
req_cat_key = "site_visitors/medium_prompts:style control"
selected_category_key = req_cat_key if req_cat_key in actual_categories else ("site_visitors/medium_prompts" if "site_visitors/medium_prompts" in actual_categories else (actual_categories[0] if actual_categories else None))
# Get the display name for the selected category
selected_category_display_name = key_to_category_name.get(selected_category_key, selected_category_key) # Fallback to key if not found
def read_elo_file(elo_results_file, leaderboard_table_file):
# Version from monitor.py, but no lazy_load or caching
print('Reading Elo file...')
arena_dfs = {}
category_elo_results = {}
last_updated_time = "N/A" # Default value
elo_results = {} # Default value
model_table_df = pd.DataFrame() # Default value
try:
# Use context manager for file operations
with open(elo_results_file, "rb") as fin:
elo_results = pickle.load(fin)
# Try to get last updated time from primary or fallback categories
main_cat_key = "site_visitors/medium_prompts:style control"
fallback_cat_key_1 = "site_visitors/medium_prompts"
fallback_cat_key_2 = "full" # Another fallback
if main_cat_key in elo_results and "last_updated_datetime" in elo_results[main_cat_key]:
last_updated_time = elo_results[main_cat_key]["last_updated_datetime"].split(" ")[0]
elif fallback_cat_key_1 in elo_results and "last_updated_datetime" in elo_results[fallback_cat_key_1]:
last_updated_time = elo_results[fallback_cat_key_1]["last_updated_datetime"].split(" ")[0]
elif fallback_cat_key_2 in elo_results and "last_updated_datetime" in elo_results[fallback_cat_key_2]:
last_updated_time = elo_results[fallback_cat_key_2]["last_updated_datetime"].split(" ")[0]
# Iterate through defined category keys
for key in key_to_category_name.keys():
display_name = key_to_category_name[key] # Get the display name
if key in elo_results:
# Check for required data within the category result
if "leaderboard_table_df" in elo_results[key] and isinstance(elo_results[key]["leaderboard_table_df"], pd.DataFrame):
df = elo_results[key]["leaderboard_table_df"]
# Filter by number of battles > 200
# Store using the *display_name* as the key for consistency with dropdown/UI
arena_dfs[display_name] = df[df["num_battles"] > 200].copy()
category_elo_results[display_name] = elo_results[key]
# else:
# print(f"Warning: 'leaderboard_table_df' not found or not a DataFrame for key '{key}'")
# else:
# print(f"Warning: Key '{key}' not found in elo_results")
# Load model metadata CSV
data = load_leaderboard_table_csv(leaderboard_table_file)
model_table_df = pd.DataFrame(data)
except FileNotFoundError:
print(f"Error: Elo results file not found at {elo_results_file}")
# Return empty structures
except Exception as e:
print(f"Error reading elo file: {e}")
traceback.print_exc()
# Return empty structures
# Ensure correct data types are returned even on error
return last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df
def build_leaderboard_tab(
elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
):
# Load data once during build time
try:
last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
except Exception as e:
print(f"Failed to load initial data: {e}")
# Set empty defaults to prevent app crash
last_updated_time = "Error"
arena_dfs = {}
category_elo_results = {}
elo_results = {}
model_table_df = pd.DataFrame()
# Get data for the default selected category
# Use the *display name* derived from the selected key
if selected_category_display_name in arena_dfs:
arena_df = arena_dfs[selected_category_display_name]
elo_subset_results_init = category_elo_results[selected_category_display_name]
p1_init = elo_subset_results_init.get("win_fraction_heatmap")
p2_init = elo_subset_results_init.get("battle_count_heatmap")
p3_init = elo_subset_results_init.get("bootstrap_elo_rating")
p4_init = elo_subset_results_init.get("average_win_rate_bar")
else:
# Fallback if default category is missing
fallback_cat_display_name = None
if actual_categories:
# Try the first actual category's display name
first_cat_key = actual_categories[0]
fallback_cat_display_name = key_to_category_name.get(first_cat_key, first_cat_key)
if fallback_cat_display_name and fallback_cat_display_name in arena_dfs:
print(f"Warning: Selected category '{selected_category_display_name}' not found. Falling back to '{fallback_cat_display_name}'.")
arena_df = arena_dfs[fallback_cat_display_name]
elo_subset_results_init = category_elo_results[fallback_cat_display_name]
p1_init = elo_subset_results_init.get("win_fraction_heatmap")
p2_init = elo_subset_results_init.get("battle_count_heatmap")
p3_init = elo_subset_results_init.get("bootstrap_elo_rating")
p4_init = elo_subset_results_init.get("average_win_rate_bar")
else:
print(f"Warning: Default category '{selected_category_display_name}' and fallback categories not found in data.")
arena_df = pd.DataFrame() # Empty DataFrame
p1_init, p2_init, p3_init, p4_init = None, None, None, None
# Apply initial filtering to plots
p1_init = filter_deprecated_models_plots(p1_init, hidden_models=deprecated_model_name)
p2_init = filter_deprecated_models_plots(p2_init, hidden_models=deprecated_model_name)
p3_init = filter_deprecated_models_plots(p3_init, hidden_models=deprecated_model_name)
p4_init = filter_deprecated_models_plots(p4_init, hidden_models=deprecated_model_name)
default_md = make_default_md_1() # Parameters removed
default_md_2 = make_default_md_2() # Parameters removed
with gr.Row():
with gr.Column(scale=4):
            # Header markdown; the "Vote!" button sits in the narrow column to the right
md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
with gr.Column(scale=1):
vote_button = gr.Button("Vote!", link="https://llmarena.ru")
md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")
# Generate initial table data
if not arena_df.empty and not model_table_df.empty:
# Pass the baseline DF and the model table; initially no subset difference is shown
arena_table_vals_init = get_arena_table(arena_df, model_table_df, hidden_models=deprecated_model_name)
else:
arena_table_vals_init = []
# Single "Arena" tab
with gr.Tab("Arena", id=0): # Removed Tabs() as only one tab
md_arena = make_arena_leaderboard_md(arena_df, last_updated_time)
lb_description = gr.Markdown(md_arena, elem_id="leaderboard_markdown")
with gr.Row():
with gr.Column(scale=2):
# Use *display names* for choices if they differ significantly from keys,
# but here keys are descriptive enough. Callback receives the *key*.
category_dropdown = gr.Dropdown(
# Choices should be the *keys* corresponding to display names
choices=actual_categories,
value=selected_category_key, # Use the key for the default value
label="Category", # Translated
)
with gr.Column(scale=2):
category_checkbox = gr.CheckboxGroup(
# Use user-friendly translated labels
["Show Deprecated", "Only <10B Models"], # Adjusted label for clarity
label="Apply Filter",
info="",
value=[], # Filters off by default
)
# Category details
default_category_details = make_category_arena_leaderboard_md(
arena_df, arena_df, name=selected_category_display_name # Pass arena_df twice for initial display
) if not arena_df.empty else "No data for category"
with gr.Column(scale=4, variant="panel"):
category_deets = gr.Markdown(
default_category_details, elem_id="category_deets"
)
# DataFrame for displaying the table
# Initial view doesn't have 'Delta' column
arena_vals = pd.DataFrame(
arena_table_vals_init,
columns=[
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
]
) if arena_table_vals_init else pd.DataFrame(columns=[ # Handle empty initial data
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
])
# Sort by Elo for initial display
if "Arena Elo" in arena_vals.columns:
arena_vals = arena_vals.sort_values(by="Arena Elo", ascending=False)
elo_display_df = gr.Dataframe(
headers=[ # Translated headers
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
],
datatype=[
"str", "markdown", "number", "str",
"number", "str", "str", "str"
],
value=arena_vals.style, # Apply Pandas styling if needed
elem_id="arena_leaderboard_dataframe",
height=700,
column_widths=[70, 190, 100, 100, 90, 130, 150, 100], # Widths from monitor.py
wrap=True,
)
gr.Markdown(elem_id="leaderboard_markdown") # Empty markdown for spacing
plot_1, plot_2, plot_3, plot_4 = None, None, None, None # Initialize plot variables
more_stats_md = None # Initialize markdown variable
if show_plot:
more_stats_md = gr.Markdown(
f"""## More Statistics for Chatbot Arena""", # Translated
elem_id="leaderboard_header_markdown",
)
with gr.Row(elem_id="leaderboard_bars"): # Use ID from monitor.py
with gr.Column():
gr.Markdown( # Translated title
"#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
elem_id="plot-title",
)
plot_3 = gr.Plot(p3_init, show_label=False) # Use initial data
with gr.Column():
gr.Markdown( # Translated title
"#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
elem_id="plot-title",
)
plot_4 = gr.Plot(p4_init, show_label=False) # Use initial data
with gr.Row(elem_id="leaderboard_plots"): # Use ID from monitor.py
with gr.Column():
gr.Markdown( # Translated title
"#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
elem_id="plot-title",
)
plot_1 = gr.Plot(
p1_init, show_label=False, elem_id="plot-container" # Use initial data
)
with gr.Column():
gr.Markdown( # Translated title
"#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
elem_id="plot-title",
)
plot_2 = gr.Plot(p2_init, show_label=False) # Use initial data
def update_leaderboard_df(arena_table_vals):
# Add error handling for empty or incorrect data
# Expects 9 columns when Delta is present
if not arena_table_vals or not isinstance(arena_table_vals, list) or not arena_table_vals[0] or len(arena_table_vals[0]) != 9:
print("Warning: Invalid data for styling in update_leaderboard_df. Returning empty DataFrame.")
# Return an empty styled DataFrame to avoid Gradio errors
empty_styled = pd.DataFrame(columns=[
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
]).style
return empty_styled
try:
                elo_dataframe = pd.DataFrame(
arena_table_vals,
columns=[
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
],
)
def highlight_max(s):
# Check rank string for arrows
return [
"color: green; font-weight: bold" if "↑" in str(v) else
"color: red; font-weight: bold" if "↓" in str(v) else ""
for v in s
]
def highlight_rank_max(s):
# Check Delta value (ensure it's numeric)
return [
"color: green; font-weight: bold" if isinstance(v, (int, float)) and v > 0 else
"color: red; font-weight: bold" if isinstance(v, (int, float)) and v < 0 else ""
for v in s
]
# Apply styles
                styled_df = elo_dataframe.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
highlight_rank_max, subset=["Delta"]
)
return styled_df
except Exception as e:
print(f"Error applying styles in update_leaderboard_df: {e}")
traceback.print_exc()
# Return unstyled DataFrame on error
return pd.DataFrame(arena_table_vals, columns=[
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
]).style
def update_leaderboard_and_plots(category_key, filters): # Receives category *key* from dropdown
# No caching
# Reload data on each call
try:
current_last_updated_time, current_arena_dfs, current_category_elo_results, _, current_model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
except Exception as e:
print(f"Error reloading data in callback: {e}")
# Return empty updates to prevent UI crash
empty_df_update = gr.Dataframe(value=pd.DataFrame().style) # Empty DataFrame
empty_plot_update = gr.Plot(value=None) # Empty Plot
empty_md_update = gr.Markdown(value="Error loading data.") # Error Markdown
                # The outputs always include the dataframe, four plots, and two markdown blocks
                return [empty_df_update] + [empty_plot_update] * 4 + [empty_md_update, empty_md_update]
# Use the display name corresponding to the selected key
category_display_name = key_to_category_name.get(category_key, category_key)
# Check if data exists for the selected category (using display name as key now)
if not current_arena_dfs or category_display_name not in current_arena_dfs or category_display_name not in current_category_elo_results or current_model_table_df.empty:
print(f"Warning: Data missing for category '{category_display_name}' (key: '{category_key}') after reload.")
empty_df_update = gr.Dataframe(value=pd.DataFrame().style)
empty_plot_update = gr.Plot(value=None)
empty_md_update = gr.Markdown(value=f"No data available for category: {category_display_name}")
                # Match the number of outputs (dataframe, four plots, two markdown blocks)
                return [empty_df_update] + [empty_plot_update] * 4 + [empty_md_update, empty_md_update]
# Get the specific data slices using the display name
arena_subset_df = current_arena_dfs[category_display_name]
elo_subset_results = current_category_elo_results[category_display_name]
# Use the hardcoded baseline key, get its display name
baseline_key = "site_visitors/medium_prompts:style control"
baseline_display_name = key_to_category_name.get(baseline_key, baseline_key)
# Fallback if baseline is missing
if baseline_display_name not in current_arena_dfs:
print(f"Warning: Baseline category '{baseline_display_name}' not found. Using selected category '{category_display_name}' as baseline.")
baseline_display_name = category_display_name # Fallback to the selected category itself
arena_df_baseline = current_arena_dfs[baseline_display_name]
hidden_models_list = None # Default: show all
# Check filter labels (must match the translated CheckboxGroup choices)
if "Show Deprecated" not in filters:
hidden_models_list = deprecated_model_name.copy() # Hide deprecated
if "Only <10B Models" in filters:
# Get all models currently in the baseline view
all_models_in_view = arena_df_baseline.index.tolist()
# Find models *not* in the allowed list
models_to_hide = [model for model in all_models_in_view if model not in models_10b]
if hidden_models_list is None: # If deprecated are not hidden
hidden_models_list = models_to_hide
else: # If deprecated are already hidden, add the non-<10B ones
# Use set to avoid duplicates
hidden_models_list = list(set(hidden_models_list + models_to_hide))
arena_table_values = get_arena_table(
arena_df_baseline, # Use the determined baseline DataFrame
current_model_table_df,
# Pass subset only if it's different from the baseline
arena_subset_df=(arena_subset_df if category_display_name != baseline_display_name else None),
hidden_models=hidden_models_list
)
dataframe_update = None
# Show Delta column only if category is not the baseline and data exists
if category_display_name != baseline_display_name and arena_table_values:
styled_arena_values = update_leaderboard_df(arena_table_values) # Apply styling with Delta
# Check if styling was successful
if isinstance(styled_arena_values, pd.io.formats.style.Styler) and not styled_arena_values.data.empty:
dataframe_update = gr.Dataframe(
headers=[ # Headers including Delta
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
],
datatype=[
"str", "number", "markdown", "number", "str",
"number", "str", "str", "str"
],
value=styled_arena_values, # Pass the Styler object
elem_id="arena_leaderboard_dataframe",
height=700,
column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100], # Widths with Delta
wrap=True,
)
else: # Handle styling failure
dataframe_update = gr.Dataframe(value=pd.DataFrame().style) # Empty update
else: # Baseline category or no data for Delta
# Ensure data exists before creating DataFrame
if arena_table_values:
# Create DataFrame without Delta column from the raw values
df_no_delta = pd.DataFrame(arena_table_values, columns=[
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
])
dataframe_update = gr.Dataframe(
headers=[ # Headers without Delta
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
],
datatype=[
"str", "markdown", "number", "str", "number",
"str", "str", "str"
],
value=df_no_delta.style, # Apply basic Pandas styling
elem_id="arena_leaderboard_dataframe",
height=700,
column_widths=[70, 190, 100, 100, 90, 130, 150, 100], # Widths without Delta
wrap=True,
)
else:
dataframe_update = gr.Dataframe(value=pd.DataFrame().style) # Empty update
plot_updates = [gr.Plot(value=None)] * 4 # Default empty plot updates
if show_plot:
p1_updated = elo_subset_results.get("win_fraction_heatmap")
p2_updated = elo_subset_results.get("battle_count_heatmap")
p3_updated = elo_subset_results.get("bootstrap_elo_rating")
p4_updated = elo_subset_results.get("average_win_rate_bar")
# Filter plots
p1_filtered = filter_deprecated_models_plots(p1_updated, hidden_models=hidden_models_list)
p2_filtered = filter_deprecated_models_plots(p2_updated, hidden_models=hidden_models_list)
p3_filtered = filter_deprecated_models_plots(p3_updated, hidden_models=hidden_models_list)
p4_filtered = filter_deprecated_models_plots(p4_updated, hidden_models=hidden_models_list)
plot_updates = [p1_filtered, p2_filtered, p3_filtered, p4_filtered]
more_stats_md_updated_text = f"""## More Statistics for Chatbot Arena - {category_display_name} """ if show_plot else ""
more_stats_md_update = gr.Markdown(value=more_stats_md_updated_text)
# Use baseline DF for total counts, subset DF for category-specific counts
category_details_md_updated_text = make_category_arena_leaderboard_md(
arena_df_baseline, arena_subset_df, name=category_display_name # Pass display name
)
category_deets_update = gr.Markdown(value=category_details_md_updated_text)
# Return updates in the correct order matching outputs list
# Order: df, p1, p2, p3, p4, more_stats_md, category_deets
return [dataframe_update] + plot_updates + [more_stats_md_update, category_deets_update]
# Define output components (must exist in the UI build)
outputs_list = [elo_display_df]
if show_plot:
# Add plot components if they exist
outputs_list.extend([plot_1, plot_2, plot_3, plot_4])
# Add markdown component if it exists
if more_stats_md: outputs_list.append(more_stats_md)
else: outputs_list.append(gr.Markdown(visible=False)) # Placeholder if MD wasn't created
else:
# Add placeholders if plots/MD are not shown
outputs_list.extend([gr.Plot(visible=False)] * 4)
outputs_list.append(gr.Markdown(visible=False))
outputs_list.append(category_deets) # Always update category details
# Attach change listeners
category_dropdown.change(
fn=update_leaderboard_and_plots,
inputs=[category_dropdown, category_checkbox],
outputs=outputs_list
)
category_checkbox.change(
fn=update_leaderboard_and_plots, # Use the same function
inputs=[category_dropdown, category_checkbox],
outputs=outputs_list
)
return_components = [md_1, md_2, lb_description, category_deets, elo_display_df]
if show_plot:
# Add plots if they were created
return_components.extend([plot_1, plot_2, plot_3, plot_4])
# Add the extra stats markdown if it was created
if more_stats_md: return_components.append(more_stats_md)
return return_components
def build_demo(elo_results_file, leaderboard_table_file):
# Assumes block_css is available or defined elsewhere
try:
from fastchat.serve.gradio_web_server import block_css
except ImportError:
print("Warning: fastchat.serve.gradio_web_server.block_css not found. Using fallback CSS.")
# Define a minimal fallback CSS or copy the content here
block_css = """
/* Add minimal CSS rules here if needed */
#arena_leaderboard_dataframe table { font-size: 105%; }
#leaderboard_markdown .prose { font-size: 110% !important; }
.app { max-width: 100% !important; padding: 20px !important; }
a { color: #1976D2; text-decoration: none; }
a:hover { color: #63A4FF; text-decoration: underline; }
"""
text_size = gr.themes.sizes.text_lg
# Assumes theme.json is present
try:
theme = gr.themes.Default.load("theme.json")
    except Exception:
print("Warning: theme.json not found. Using default Gradio theme.")
theme = gr.themes.Default(text_size=text_size) # Fallback theme
if hasattr(theme, 'text_size'): theme.text_size = text_size
# Apply custom settings if theme object supports it
if hasattr(theme, 'set'):
theme.set(
button_large_text_size="40px",
button_small_text_size="40px",
button_large_text_weight="1000",
button_small_text_weight="1000",
button_shadow="*shadow_drop_lg",
button_shadow_hover="*shadow_drop_lg",
checkbox_label_shadow="*shadow_drop_lg",
button_shadow_active="*shadow_inset",
button_secondary_background_fill="*primary_300",
button_secondary_background_fill_dark="*primary_700",
button_secondary_background_fill_hover="*primary_200",
button_secondary_background_fill_hover_dark="*primary_500",
button_secondary_text_color="*primary_800",
button_secondary_text_color_dark="white",
)
with gr.Blocks(
title="LLM Arena: Leaderboard", # Translated title
theme=theme,
css=block_css, # Use loaded or fallback CSS
) as demo:
# Build only the leaderboard tab content
# show_plot=True to display plots
leader_components = build_leaderboard_tab(
elo_results_file, leaderboard_table_file, show_plot=True, mirror=False
)
return demo
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--share", action="store_true", default=False) # Default False for HF
parser.add_argument("--host", default="0.0.0.0")
parser.add_argument("--port", type=int, default=7860)
# Removed args specific to monitor.py
args = parser.parse_args()
try:
elo_result_files = glob.glob("elo_results_*.pkl")
if not elo_result_files:
raise FileNotFoundError("No elo_results_*.pkl files found.")
# More robust sorting extracting the number
elo_result_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
elo_result_file = elo_result_files[-1]
print(f"Using Elo results file: {elo_result_file}")
except Exception as e:
print(f"Error finding Elo results file: {e}")
print("Please ensure a file matching 'elo_results_NUMBER.pkl' exists.")
exit(1) # Exit if file not found
try:
leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
if not leaderboard_table_files:
raise FileNotFoundError("No leaderboard_table_*.csv files found.")
leaderboard_table_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
leaderboard_table_file = leaderboard_table_files[-1]
print(f"Using leaderboard table file: {leaderboard_table_file}")
except Exception as e:
print(f"Error finding leaderboard table file: {e}")
print("Please ensure a file matching 'leaderboard_table_NUMBER.csv' exists.")
exit(1) # Exit if file not found
demo = build_demo(elo_result_file, leaderboard_table_file)
# Launch with args
demo.launch(
server_name=args.host,
server_port=args.port,
share=args.share,
show_api=False
)