import argparse
import ast
import glob
import pickle
import traceback
import numpy as np
import pandas as pd
import gradio as gr
promo_banner = """
USE THE LATEST VERSIONS OF THE BEST CHATBOTS IN RUSSIAN FOR FREE
"""
deprecated_model_name = [
"GigaChat 3.1.25.3",
"GigaChat-Pro 2.2.25.3",
"saiga_llama3_8b_v6",
"saiga_phi3_medium",
"GigaChat-Plus 3.1.25.3",
"GigaChat-Pro 4.0.26.8",
"GigaChat 4.0.26.8",
"xAI: Grok 2",
"GigaChat-Pro 4.0.26.15",
"GigaChat 4.0.26.15",
"YandexGPT Experimental", "yandex-gpt-arena",
"RefalMachine/ruadapt_llama3_instruct_lep_saiga_kto_ablitirated"
]
models_10b = [
"saiga_llama3_8b_v7",
"Vikhrmodels/Vikhr-YandexGPT-5-Lite-8B-it",
"T-lite-instruct-0.1",
"t-tech/T-lite-it-1.0",
"LLaMA-3 Chat (8B)",
"Llama 3.1 8B Instruct Turbo",
"MTSAIR/Cotype-Nano"
]
def make_default_md_1():
leaderboard_md = f"""
# 🏆 LLM Arena in Russian: Leaderboard
{promo_banner}
"""
return leaderboard_md
def make_default_md_2():
leaderboard_md = f"""
LLM Arena is an open crowdsourced platform for evaluating large language models (LLMs) in Russian. We collect pairwise comparisons from people, rank LLMs with the Bradley-Terry model, and display model ratings on the Elo scale.
Chatbot Arena in Russian depends on community participation, so please contribute by casting your vote!
- To **add your model** to the comparison, contact us on TG: [Group](https://t.me/+bFEOl-Bdmok4NGUy)
- If you **found a bug** or **have a suggestion**, contact us: [Roman](https://t.me/roman_kucev)
- You can **contribute your vote** at [llmarena.ru](https://llmarena.ru/)!
"""
return leaderboard_md
def make_arena_leaderboard_md(arena_df, last_updated_time):
# Using version from monitor.py (translated)
total_votes = sum(arena_df["num_battles"]) if not arena_df.empty else 0
total_models = len(arena_df)
space = " " # Using HTML space
leaderboard_md = f"""
Total # of models: **{total_models}**.{space} Total # of votes: **{total_votes:,}**.{space} Last updated: {last_updated_time}.
***Rank (UB)**: the model's rank (upper bound), determined as one plus the number of models that are statistically better than the target model.
Model A is statistically better than Model B when the lower bound of Model A's rating is higher than the upper bound of Model B's rating (with a 95% confidence interval).
See Figure 1 below for a visualization of the confidence intervals of model ratings.
"""
return leaderboard_md
def make_category_arena_leaderboard_md(arena_df, arena_subset_df, name="site_visitors/medium_prompts:style control"):
total_votes = sum(arena_df["num_battles"]) if not arena_df.empty else 0
total_models = len(arena_df)
space = " "
total_subset_votes = sum(arena_subset_df["num_battles"]) if not arena_subset_df.empty else 0
total_subset_models = len(arena_subset_df)
perc_models = round(total_subset_models / total_models * 100) if total_models > 0 else 0
perc_votes = round(total_subset_votes / total_votes * 100) if total_votes > 0 else 0
leaderboard_md = f"""### {cat_name_to_explanation.get(name, name)}
#### {space} #models: **{total_subset_models} ({perc_models}%)** {space} #votes: **{total_subset_votes:,} ({perc_votes}%)**{space}
"""
return leaderboard_md
def model_hyperlink(model_name, link):
    # Render the model name as an HTML link; the table's "markdown" column type displays it as a clickable anchor.
    return f'<a target="_blank" href="{link}">{model_name}</a>'
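# Example (hypothetical name/link): model_hyperlink("SomeModel", "https://example.com")
# returns '<a target="_blank" href="https://example.com">SomeModel</a>'.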
def filter_deprecated_models_plots(fig, hidden_models=None, limit_to_top=25):
"""
Filters Plotly plots to show only top N models and optionally removes specific models.
Args:
fig: The Plotly figure object.
hidden_models (list, optional): A list of model names to remove. Defaults to None.
limit_to_top (int, optional): Limit display to top N models (0 or None means no limit). Defaults to 25.
Returns:
Plotly figure: The filtered figure object or the original if filtering fails or is not applicable.
"""
if fig is None:
return None
# Check if the figure has data
if not hasattr(fig, 'data') or len(fig.data) == 0:
return fig
# Check if data has a type attribute
if not hasattr(fig.data[0], 'type'):
return fig
# Check minimum number of models after initial hidden_models filtering
    models_to_check = []
    if getattr(fig.data[0], 'x', None) is not None:
        models_to_check = fig.data[0].x
    elif getattr(fig.data[0], 'y', None) is not None:  # Fall back to y when model names are not on the x axis
        models_to_check = fig.data[0].y
    if hidden_models is not None and len(models_to_check) > 0:
available_models = [x for x in models_to_check if x not in hidden_models]
# print(f"Available models before top N: {len(available_models)}") # Debug
if len(available_models) <= 2: # If less than 3 models remain before top_n
# print(f"Warning: Too few models left after initial filtering ({len(available_models)}), returning original plot.")
return fig # Return the original plot if too few models
if limit_to_top is not None and limit_to_top <= 0:
limit_to_top = None
try:
# Work on a deep copy to avoid modifying the original figure object
fig_copy = pickle.loads(pickle.dumps(fig))
data = fig_copy.data[0]
if data.type == 'heatmap':
# Apply hidden models filter
mask_x = ~np.isin(data.x, hidden_models) if hidden_models is not None else np.ones_like(data.x, dtype=bool)
mask_y = ~np.isin(data.y, hidden_models) if hidden_models is not None else np.ones_like(data.y, dtype=bool)
# Get initially filtered X and Y arrays
filtered_x = np.array(data.x)[mask_x]
filtered_y = np.array(data.y)[mask_y]
# Apply top N limit (assuming the order is already by rank/rating)
if limit_to_top is not None and len(filtered_x) > limit_to_top:
top_models = filtered_x[:limit_to_top]
# Create new masks based on the top models relative to the *original* data axes
mask_x = np.isin(data.x, top_models)
mask_y = np.isin(data.y, top_models)
# Get final filtered axes
filtered_x = np.array(data.x)[mask_x]
filtered_y = np.array(data.y)[mask_y]
elif len(filtered_x) <= 2: # If <=2 models remain after filtering
return fig # Return original
# Update the heatmap data
data.x = filtered_x
data.y = filtered_y
# Important: Indexing 'z' must use masks derived from the *original* data order
z_original = np.array(fig.data[0].z)
data.z = z_original[np.ix_(mask_y, mask_x)]
elif data.type == 'scatter':
trace = data
# Apply hidden models filter
mask = ~np.isin(trace.x, hidden_models) if hidden_models is not None else np.ones_like(trace.x, dtype=bool)
# Get initially filtered arrays
current_x = np.array(trace.x)[mask]
current_y = np.array(trace.y)[mask]
current_text = np.array(trace.text)[mask] if hasattr(trace, 'text') and trace.text is not None else None
# Handle error bars safely
current_error_y_array = np.array(trace.error_y['array'])[mask] if 'error_y' in trace and 'array' in trace.error_y and trace.error_y['array'] is not None else None
current_error_y_arrayminus = np.array(trace.error_y['arrayminus'])[mask] if 'error_y' in trace and 'arrayminus' in trace.error_y and trace.error_y['arrayminus'] is not None else None
# Apply top N limit
if limit_to_top is not None and len(current_x) > limit_to_top:
# Sort by y-value (rating) descending to find the top N
sort_indices = np.argsort(-current_y)[:limit_to_top]
current_x = current_x[sort_indices]
current_y = current_y[sort_indices]
if current_text is not None:
current_text = current_text[sort_indices]
if current_error_y_array is not None:
current_error_y_array = current_error_y_array[sort_indices]
if current_error_y_arrayminus is not None:
current_error_y_arrayminus = current_error_y_arrayminus[sort_indices]
elif len(current_x) <= 2: # If <=2 models remain after filtering
return fig # Return original
# Update the scatter trace data
trace.x, trace.y = current_x, current_y
if current_text is not None:
trace.text = current_text
# Update error bars if they exist
if current_error_y_array is not None:
# Ensure error_y exists before assigning
if 'error_y' not in trace: trace.error_y = {}
trace.error_y['array'] = current_error_y_array
if current_error_y_arrayminus is not None:
if 'error_y' not in trace: trace.error_y = {}
trace.error_y['arrayminus'] = current_error_y_arrayminus
elif data.type == 'bar':
trace = data
# Apply hidden models filter
mask = ~np.isin(trace.x, hidden_models) if hidden_models is not None else np.ones_like(trace.x, dtype=bool)
# Get initially filtered arrays
current_x = np.array(trace.x)[mask]
current_y = np.array(trace.y)[mask]
# Apply top N limit
if limit_to_top is not None and len(current_x) > limit_to_top:
# Sort by y-value (rating) descending
sort_indices = np.argsort(-current_y)[:limit_to_top]
current_x = current_x[sort_indices]
current_y = current_y[sort_indices]
elif len(current_x) <= 2: # If <=2 models remain after filtering
return fig # Return original
# Update the bar trace data
trace.x, trace.y = current_x, current_y
return fig_copy
except Exception as e:
print(f"Error filtering plot: {e}")
traceback.print_exc()
return fig # Return original figure on error
def load_leaderboard_table_csv(filename, add_hyperlink=True):
    # Naive CSV parsing (no quoting support): fields are assumed to never contain commas.
    with open(filename, encoding="utf-8") as fin:
        lines = fin.readlines()
    heads = [v.strip() for v in lines[0].split(",")]
rows = []
for i in range(1, len(lines)):
row = [v.strip() for v in lines[i].split(",")]
item = {} # Create dictionary once per row
for h, v in zip(heads, row):
if h == "Arena Elo rating":
if v != "-":
try:
v = int(ast.literal_eval(v))
                    except (ValueError, SyntaxError, TypeError):
v = np.nan # Handle parsing errors
else:
v = np.nan
item[h] = v
if add_hyperlink and "Model" in item and "Link" in item: # Check keys exist
# Check for empty/missing link
if item["Link"] and item["Link"] != "-":
item["Model"] = model_hyperlink(item["Model"], item["Link"])
# Otherwise, keep the model name as is
rows.append(item)
return rows
def create_ranking_str(ranking, ranking_difference):
# Convert rank to int before comparison
try:
# Ensure rank and difference are treated as numbers
ranking_val = int(float(ranking)) # Handle potential float input
ranking_difference_val = int(float(ranking_difference))
if ranking_difference_val > 0:
return f"{ranking_val} ↑"
elif ranking_difference_val < 0:
return f"{ranking_val} ↓"
else:
return f"{ranking_val}"
except (ValueError, TypeError): # Handle cases where rank is not numeric
return str(ranking)
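# Examples: create_ranking_str(3, 2) -> "3 ↑"; create_ranking_str(5, -1) -> "5 ↓";
# a non-numeric rank such as create_ranking_str("N/A", 0) falls back to "N/A".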
def recompute_final_ranking(arena_df):
ranking = {}
if arena_df.empty:
return []
model_indices = arena_df.index
# Ensure CI columns exist before trying to access them
if "rating_q025" not in arena_df.columns or "rating_q975" not in arena_df.columns:
print("Warning: Confidence interval columns ('rating_q025', 'rating_q975') not found in DataFrame. Cannot compute UB Rank.")
# Return NaN or simple rank based on order
return [np.nan] * len(model_indices) # Or range(1, len(model_indices) + 1)
ratings_q025 = arena_df["rating_q025"].to_dict()
ratings_q975 = arena_df["rating_q975"].to_dict()
for model_a in model_indices:
rank = 1
rating_a_q975 = ratings_q975.get(model_a)
# Skip if model A has no CI data
if pd.isna(rating_a_q975):
ranking[model_a] = np.nan # Or assign max rank + 1
continue
for model_b in model_indices:
if model_a == model_b:
continue
rating_b_q025 = ratings_q025.get(model_b)
# Skip comparison if model B has no CI data
if pd.isna(rating_b_q025):
continue
# Check if B is statistically better than A
if rating_b_q025 > rating_a_q975:
rank += 1
ranking[model_a] = rank
return list(ranking.values())
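# Worked example (hypothetical ratings): with 95% CIs A = [1480, 1520], B = [1390, 1410],
# C = [1380, 1412], A's lower bound exceeds both other upper bounds, so A gets rank 1,
# while B and C each get rank 2 (only A is statistically better than either of them).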
def get_arena_table(arena_df, model_table_df, arena_subset_df=None, hidden_models=None):
"""
Generates the leaderboard table data.
'use_cache' parameter removed.
"""
# print(f'Calculating get_arena_table') # Debug
# Create copies to avoid modifying original DataFrames
arena_df_processed = arena_df.copy()
if arena_subset_df is not None:
arena_subset_df_processed = arena_subset_df.copy()
else:
arena_subset_df_processed = None
# Sort by rating initially to have a stable order before ranking
arena_df_processed = arena_df_processed.sort_values(by=["rating"], ascending=False)
# Compute 'final_ranking' based on CIs if possible
if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns:
arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed)
arena_df_processed = arena_df_processed.sort_values(
by=["final_ranking", "rating"], ascending=[True, False]
)
else:
# Fallback to simple ordering if CI columns are missing
arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1)
if hidden_models:
arena_df_processed = arena_df_processed[~arena_df_processed.index.isin(hidden_models)].copy()
# Recompute ranks for the filtered view
if "rating_q025" in arena_df_processed.columns and "rating_q975" in arena_df_processed.columns:
arena_df_processed["final_ranking"] = recompute_final_ranking(arena_df_processed)
# Re-sort based on new ranks
arena_df_processed = arena_df_processed.sort_values(
by=["final_ranking", "rating"], ascending=[True, False]
)
else:
arena_df_processed["final_ranking"] = range(1, len(arena_df_processed) + 1)
if arena_subset_df_processed is not None:
# Filter subset by hidden_models first
if hidden_models:
arena_subset_df_processed = arena_subset_df_processed[~arena_subset_df_processed.index.isin(hidden_models)].copy()
# Ensure models in the subset are also present in the (filtered) main view
arena_subset_df_processed = arena_subset_df_processed[arena_subset_df_processed.index.isin(arena_df_processed.index)]
# Proceed only if subset is not empty and has CI columns
if not arena_subset_df_processed.empty and "rating_q025" in arena_subset_df_processed.columns and "rating_q975" in arena_subset_df_processed.columns:
# Rank within the subset
arena_subset_df_processed = arena_subset_df_processed.sort_values(by=["rating"], ascending=False)
arena_subset_df_processed["final_ranking_subset"] = recompute_final_ranking(arena_subset_df_processed) # Rank within category
# Filter the main processed DF to only include models from the subset
# 'final_ranking' here represents the rank *among these models* in the baseline category view
arena_df_for_join = arena_df_processed[arena_df_processed.index.isin(arena_subset_df_processed.index)][["final_ranking", "rating"]].copy()
arena_df_for_join.rename(columns={"final_ranking": "final_ranking_baseline"}, inplace=True)
# Join the subset ranks and baseline ranks
arena_df_combined = arena_subset_df_processed[["final_ranking_subset", "rating"]].join(
arena_df_for_join["final_ranking_baseline"], how="inner"
)
# Calculate rank difference
arena_df_combined["ranking_difference"] = arena_df_combined["final_ranking_baseline"] - arena_df_combined["final_ranking_subset"]
# Sort by subset rank and rating
arena_df_combined = arena_df_combined.sort_values(
by=["final_ranking_subset", "rating"], ascending=[True, False]
)
# Format the rank string with delta for display
arena_df_combined["display_ranking"] = arena_df_combined.apply(
lambda x: create_ranking_str(x["final_ranking_subset"], x["ranking_difference"]),
axis=1,
)
arena_df_processed = arena_df_processed.loc[arena_df_combined.index] # Reorder arena_df_processed
columns_to_join = ["display_ranking", "ranking_difference", "final_ranking_subset"]
columns_to_join = [col for col in columns_to_join if col in arena_df_combined.columns]
arena_df_processed = arena_df_processed.join(arena_df_combined[columns_to_join], how="inner")
# Now sorting should work as the column exists
# Use the subset rank for final sorting if subset is active
# Check if 'final_ranking_subset' was successfully joined before sorting
if "final_ranking_subset" in arena_df_processed.columns:
arena_df_processed.sort_values(by=["final_ranking_subset", "rating"], ascending=[True, False], inplace=True)
else:
# Fallback sort if join failed for some reason
arena_df_processed.sort_values(by=["rating"], ascending=False, inplace=True)
else:
# If subset is empty or lacks CI, disable subset logic
arena_subset_df_processed = None
# Use the baseline ranking as the display ranking
arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str)
arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True)
else:
# If no subset is used, display ranking is just the final rank from the main DF
arena_df_processed["display_ranking"] = arena_df_processed["final_ranking"].astype(str)
# Ensure it's sorted correctly
arena_df_processed.sort_values(by=["final_ranking", "rating"], ascending=[True, False], inplace=True)
values = []
# Iterate using the final sorted index of arena_df_processed
for model_key in arena_df_processed.index:
row_data = arena_df_processed.loc[model_key]
# Find model metadata
model_info = model_table_df[model_table_df["key"] == model_key]
if model_info.empty:
# print(f"Warning: Model key '{model_key}' not found in model_table_df. Skipping.")
continue # Skip if no metadata
row = []
# Rank (Display)
row.append(row_data.get("display_ranking", "")) # Use the calculated display rank
# Delta (only if subset was processed successfully)
if arena_subset_df_processed is not None:
row.append(row_data.get("ranking_difference", 0))
# Model Name (hyperlink applied during loading)
row.append(model_info["Model"].values[0])
# Arena Elo
row.append(round(row_data["rating"]))
# 95% CI
# Check for NaN before calculation
upper_rating = row_data.get("rating_q975")
lower_rating = row_data.get("rating_q025")
current_rating = row_data.get("rating")
upper_diff = round(upper_rating - current_rating) if pd.notna(upper_rating) and pd.notna(current_rating) else '?'
lower_diff = round(current_rating - lower_rating) if pd.notna(current_rating) and pd.notna(lower_rating) else '?'
row.append(f"+{upper_diff}/-{lower_diff}")
# Votes
row.append(round(row_data["num_battles"]))
# Organization
row.append(model_info["Organization"].values[0])
# License
row.append(model_info["License"].values[0])
# Knowledge Cutoff
cutoff_date = model_info["Knowledge cutoff date"].values[0]
row.append("Unknown" if cutoff_date == "-" else cutoff_date)
values.append(row)
return values
key_to_category_name = {
# Mapping from internal key to display name (kept English for consistency)
"full": "Overall", # Might not be used if filtered out later
"crowdsourcing/simple_prompts": "crowdsourcing/simple_prompts",
"site_visitors/medium_prompts": "site_visitors/medium_prompts",
"site_visitors/medium_prompts:style control": "site_visitors/medium_prompts:style_control" # Use underscore for display consistency if needed
}
cat_name_to_explanation = {
# Translated explanations for display
"Overall": "All queries",
"crowdsourcing/simple_prompts": "Queries collected via crowdsourcing. Mostly simple ones.",
"site_visitors/medium_prompts": "Queries from website visitors. Contain more complex prompts.",
"site_visitors/medium_prompts:style_control": "Queries from website visitors. Contain more complex prompts. [Reduced stylistic influence](https://lmsys.org/blog/2024-08-28-style-control/) of the response on the rating."
}
cat_name_to_baseline = {
    # Baseline category overrides for comparison (currently unused, kept for possible future use)
# "Hard Prompts (English)": "English",
}
actual_categories = [
# Categories available in the dropdown (use the *keys* from key_to_category_name)
# "Overall", # Removed
# "crowdsourcing/simple_prompts", # Removed
"site_visitors/medium_prompts",
"site_visitors/medium_prompts:style control"
]
# Default selected category key
req_cat_key = "site_visitors/medium_prompts:style control"
selected_category_key = req_cat_key if req_cat_key in actual_categories else ("site_visitors/medium_prompts" if "site_visitors/medium_prompts" in actual_categories else (actual_categories[0] if actual_categories else None))
# Get the display name for the selected category
selected_category_display_name = key_to_category_name.get(selected_category_key, selected_category_key) # Fallback to key if not found
def read_elo_file(elo_results_file, leaderboard_table_file):
# Version from monitor.py, but no lazy_load or caching
print('Reading Elo file...')
arena_dfs = {}
category_elo_results = {}
last_updated_time = "N/A" # Default value
elo_results = {} # Default value
model_table_df = pd.DataFrame() # Default value
try:
# Use context manager for file operations
with open(elo_results_file, "rb") as fin:
elo_results = pickle.load(fin)
# Try to get last updated time from primary or fallback categories
main_cat_key = "site_visitors/medium_prompts:style control"
fallback_cat_key_1 = "site_visitors/medium_prompts"
fallback_cat_key_2 = "full" # Another fallback
if main_cat_key in elo_results and "last_updated_datetime" in elo_results[main_cat_key]:
last_updated_time = elo_results[main_cat_key]["last_updated_datetime"].split(" ")[0]
elif fallback_cat_key_1 in elo_results and "last_updated_datetime" in elo_results[fallback_cat_key_1]:
last_updated_time = elo_results[fallback_cat_key_1]["last_updated_datetime"].split(" ")[0]
elif fallback_cat_key_2 in elo_results and "last_updated_datetime" in elo_results[fallback_cat_key_2]:
last_updated_time = elo_results[fallback_cat_key_2]["last_updated_datetime"].split(" ")[0]
# Iterate through defined category keys
for key in key_to_category_name.keys():
display_name = key_to_category_name[key] # Get the display name
if key in elo_results:
# Check for required data within the category result
if "leaderboard_table_df" in elo_results[key] and isinstance(elo_results[key]["leaderboard_table_df"], pd.DataFrame):
df = elo_results[key]["leaderboard_table_df"]
# Filter by number of battles > 200
# Store using the *display_name* as the key for consistency with dropdown/UI
arena_dfs[display_name] = df[df["num_battles"] > 200].copy()
category_elo_results[display_name] = elo_results[key]
# else:
# print(f"Warning: 'leaderboard_table_df' not found or not a DataFrame for key '{key}'")
# else:
# print(f"Warning: Key '{key}' not found in elo_results")
# Load model metadata CSV
data = load_leaderboard_table_csv(leaderboard_table_file)
model_table_df = pd.DataFrame(data)
except FileNotFoundError:
print(f"Error: Elo results file not found at {elo_results_file}")
# Return empty structures
except Exception as e:
print(f"Error reading elo file: {e}")
traceback.print_exc()
# Return empty structures
# Ensure correct data types are returned even on error
return last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df
def build_leaderboard_tab(
elo_results_file, leaderboard_table_file, show_plot=False, mirror=False
):
# Load data once during build time
try:
last_updated_time, arena_dfs, category_elo_results, elo_results, model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
except Exception as e:
print(f"Failed to load initial data: {e}")
# Set empty defaults to prevent app crash
last_updated_time = "Error"
arena_dfs = {}
category_elo_results = {}
elo_results = {}
model_table_df = pd.DataFrame()
# Get data for the default selected category
# Use the *display name* derived from the selected key
if selected_category_display_name in arena_dfs:
arena_df = arena_dfs[selected_category_display_name]
elo_subset_results_init = category_elo_results[selected_category_display_name]
p1_init = elo_subset_results_init.get("win_fraction_heatmap")
p2_init = elo_subset_results_init.get("battle_count_heatmap")
p3_init = elo_subset_results_init.get("bootstrap_elo_rating")
p4_init = elo_subset_results_init.get("average_win_rate_bar")
else:
# Fallback if default category is missing
fallback_cat_display_name = None
if actual_categories:
# Try the first actual category's display name
first_cat_key = actual_categories[0]
fallback_cat_display_name = key_to_category_name.get(first_cat_key, first_cat_key)
if fallback_cat_display_name and fallback_cat_display_name in arena_dfs:
print(f"Warning: Selected category '{selected_category_display_name}' not found. Falling back to '{fallback_cat_display_name}'.")
arena_df = arena_dfs[fallback_cat_display_name]
elo_subset_results_init = category_elo_results[fallback_cat_display_name]
p1_init = elo_subset_results_init.get("win_fraction_heatmap")
p2_init = elo_subset_results_init.get("battle_count_heatmap")
p3_init = elo_subset_results_init.get("bootstrap_elo_rating")
p4_init = elo_subset_results_init.get("average_win_rate_bar")
else:
print(f"Warning: Default category '{selected_category_display_name}' and fallback categories not found in data.")
arena_df = pd.DataFrame() # Empty DataFrame
p1_init, p2_init, p3_init, p4_init = None, None, None, None
# Apply initial filtering to plots
p1_init = filter_deprecated_models_plots(p1_init, hidden_models=deprecated_model_name)
p2_init = filter_deprecated_models_plots(p2_init, hidden_models=deprecated_model_name)
p3_init = filter_deprecated_models_plots(p3_init, hidden_models=deprecated_model_name)
p4_init = filter_deprecated_models_plots(p4_init, hidden_models=deprecated_model_name)
default_md = make_default_md_1() # Parameters removed
default_md_2 = make_default_md_2() # Parameters removed
with gr.Row():
with gr.Column(scale=4):
            # Header markdown; the "Vote!" button sits in the narrow column to the right
md_1 = gr.Markdown(default_md, elem_id="leaderboard_markdown")
with gr.Column(scale=1):
vote_button = gr.Button("Vote!", link="https://llmarena.ru")
md_2 = gr.Markdown(default_md_2, elem_id="leaderboard_markdown")
# Generate initial table data
if not arena_df.empty and not model_table_df.empty:
# Pass the baseline DF and the model table; initially no subset difference is shown
arena_table_vals_init = get_arena_table(arena_df, model_table_df, hidden_models=deprecated_model_name)
else:
arena_table_vals_init = []
# Single "Arena" tab
with gr.Tab("Arena", id=0): # Removed Tabs() as only one tab
md_arena = make_arena_leaderboard_md(arena_df, last_updated_time)
lb_description = gr.Markdown(md_arena, elem_id="leaderboard_markdown")
with gr.Row():
with gr.Column(scale=2):
# Use *display names* for choices if they differ significantly from keys,
# but here keys are descriptive enough. Callback receives the *key*.
category_dropdown = gr.Dropdown(
# Choices should be the *keys* corresponding to display names
choices=actual_categories,
value=selected_category_key, # Use the key for the default value
label="Category", # Translated
)
with gr.Column(scale=2):
category_checkbox = gr.CheckboxGroup(
# Use user-friendly translated labels
["Show Deprecated", "Only <10B Models"], # Adjusted label for clarity
label="Apply Filter",
info="",
value=[], # Filters off by default
)
# Category details
default_category_details = make_category_arena_leaderboard_md(
arena_df, arena_df, name=selected_category_display_name # Pass arena_df twice for initial display
) if not arena_df.empty else "No data for category"
with gr.Column(scale=4, variant="panel"):
category_deets = gr.Markdown(
default_category_details, elem_id="category_deets"
)
# DataFrame for displaying the table
# Initial view doesn't have 'Delta' column
arena_vals = pd.DataFrame(
arena_table_vals_init,
columns=[
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
]
) if arena_table_vals_init else pd.DataFrame(columns=[ # Handle empty initial data
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
])
# Sort by Elo for initial display
if "Arena Elo" in arena_vals.columns:
arena_vals = arena_vals.sort_values(by="Arena Elo", ascending=False)
elo_display_df = gr.Dataframe(
headers=[ # Translated headers
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
],
datatype=[
"str", "markdown", "number", "str",
"number", "str", "str", "str"
],
value=arena_vals.style, # Apply Pandas styling if needed
elem_id="arena_leaderboard_dataframe",
height=700,
column_widths=[70, 190, 100, 100, 90, 130, 150, 100], # Widths from monitor.py
wrap=True,
)
gr.Markdown(elem_id="leaderboard_markdown") # Empty markdown for spacing
plot_1, plot_2, plot_3, plot_4 = None, None, None, None # Initialize plot variables
more_stats_md = None # Initialize markdown variable
if show_plot:
more_stats_md = gr.Markdown(
f"""## More Statistics for Chatbot Arena""", # Translated
elem_id="leaderboard_header_markdown",
)
with gr.Row(elem_id="leaderboard_bars"): # Use ID from monitor.py
with gr.Column():
gr.Markdown( # Translated title
"#### Figure 1: Confidence Intervals on Model Strength (via Bootstrapping)",
elem_id="plot-title",
)
plot_3 = gr.Plot(p3_init, show_label=False) # Use initial data
with gr.Column():
gr.Markdown( # Translated title
"#### Figure 2: Average Win Rate Against All Other Models (Assuming Uniform Sampling and No Ties)",
elem_id="plot-title",
)
plot_4 = gr.Plot(p4_init, show_label=False) # Use initial data
with gr.Row(elem_id="leaderboard_plots"): # Use ID from monitor.py
with gr.Column():
gr.Markdown( # Translated title
"#### Figure 3: Fraction of Model A Wins for All Non-tied A vs. B Battles",
elem_id="plot-title",
)
plot_1 = gr.Plot(
p1_init, show_label=False, elem_id="plot-container" # Use initial data
)
with gr.Column():
gr.Markdown( # Translated title
"#### Figure 4: Battle Count for Each Combination of Models (without Ties)",
elem_id="plot-title",
)
plot_2 = gr.Plot(p2_init, show_label=False) # Use initial data
def update_leaderboard_df(arena_table_vals):
# Add error handling for empty or incorrect data
# Expects 9 columns when Delta is present
if not arena_table_vals or not isinstance(arena_table_vals, list) or not arena_table_vals[0] or len(arena_table_vals[0]) != 9:
print("Warning: Invalid data for styling in update_leaderboard_df. Returning empty DataFrame.")
# Return an empty styled DataFrame to avoid Gradio errors
empty_styled = pd.DataFrame(columns=[
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
]).style
return empty_styled
try:
                elo_dataframe = pd.DataFrame(
arena_table_vals,
columns=[
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
],
)
def highlight_max(s):
# Check rank string for arrows
return [
"color: green; font-weight: bold" if "↑" in str(v) else
"color: red; font-weight: bold" if "↓" in str(v) else ""
for v in s
]
def highlight_rank_max(s):
# Check Delta value (ensure it's numeric)
return [
"color: green; font-weight: bold" if isinstance(v, (int, float)) and v > 0 else
"color: red; font-weight: bold" if isinstance(v, (int, float)) and v < 0 else ""
for v in s
]
# Apply styles
                styled_df = elo_dataframe.style.apply(highlight_max, subset=["Rank* (UB)"]).apply(
highlight_rank_max, subset=["Delta"]
)
return styled_df
except Exception as e:
print(f"Error applying styles in update_leaderboard_df: {e}")
traceback.print_exc()
# Return unstyled DataFrame on error
return pd.DataFrame(arena_table_vals, columns=[
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
]).style
def update_leaderboard_and_plots(category_key, filters): # Receives category *key* from dropdown
# No caching
# Reload data on each call
try:
current_last_updated_time, current_arena_dfs, current_category_elo_results, _, current_model_table_df = read_elo_file(elo_results_file, leaderboard_table_file)
except Exception as e:
print(f"Error reloading data in callback: {e}")
# Return empty updates to prevent UI crash
empty_df_update = gr.Dataframe(value=pd.DataFrame().style) # Empty DataFrame
empty_plot_update = gr.Plot(value=None) # Empty Plot
empty_md_update = gr.Markdown(value="Error loading data.") # Error Markdown
                # The outputs always include the dataframe, four plots, and two markdown blocks
                return [empty_df_update] + [empty_plot_update] * 4 + [empty_md_update, empty_md_update]
# Use the display name corresponding to the selected key
category_display_name = key_to_category_name.get(category_key, category_key)
# Check if data exists for the selected category (using display name as key now)
if not current_arena_dfs or category_display_name not in current_arena_dfs or category_display_name not in current_category_elo_results or current_model_table_df.empty:
print(f"Warning: Data missing for category '{category_display_name}' (key: '{category_key}') after reload.")
empty_df_update = gr.Dataframe(value=pd.DataFrame().style)
empty_plot_update = gr.Plot(value=None)
empty_md_update = gr.Markdown(value=f"No data available for category: {category_display_name}")
                # Match the number of outputs (dataframe, four plots, two markdown blocks)
                return [empty_df_update] + [empty_plot_update] * 4 + [empty_md_update, empty_md_update]
# Get the specific data slices using the display name
arena_subset_df = current_arena_dfs[category_display_name]
elo_subset_results = current_category_elo_results[category_display_name]
# Use the hardcoded baseline key, get its display name
baseline_key = "site_visitors/medium_prompts:style control"
baseline_display_name = key_to_category_name.get(baseline_key, baseline_key)
# Fallback if baseline is missing
if baseline_display_name not in current_arena_dfs:
print(f"Warning: Baseline category '{baseline_display_name}' not found. Using selected category '{category_display_name}' as baseline.")
baseline_display_name = category_display_name # Fallback to the selected category itself
arena_df_baseline = current_arena_dfs[baseline_display_name]
hidden_models_list = None # Default: show all
# Check filter labels (must match the translated CheckboxGroup choices)
if "Show Deprecated" not in filters:
hidden_models_list = deprecated_model_name.copy() # Hide deprecated
if "Only <10B Models" in filters:
# Get all models currently in the baseline view
all_models_in_view = arena_df_baseline.index.tolist()
# Find models *not* in the allowed list
models_to_hide = [model for model in all_models_in_view if model not in models_10b]
if hidden_models_list is None: # If deprecated are not hidden
hidden_models_list = models_to_hide
else: # If deprecated are already hidden, add the non-<10B ones
# Use set to avoid duplicates
hidden_models_list = list(set(hidden_models_list + models_to_hide))
arena_table_values = get_arena_table(
arena_df_baseline, # Use the determined baseline DataFrame
current_model_table_df,
# Pass subset only if it's different from the baseline
arena_subset_df=(arena_subset_df if category_display_name != baseline_display_name else None),
hidden_models=hidden_models_list
)
dataframe_update = None
# Show Delta column only if category is not the baseline and data exists
if category_display_name != baseline_display_name and arena_table_values:
styled_arena_values = update_leaderboard_df(arena_table_values) # Apply styling with Delta
# Check if styling was successful
if isinstance(styled_arena_values, pd.io.formats.style.Styler) and not styled_arena_values.data.empty:
dataframe_update = gr.Dataframe(
headers=[ # Headers including Delta
"Rank* (UB)", "Delta", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
],
datatype=[
"str", "number", "markdown", "number", "str",
"number", "str", "str", "str"
],
value=styled_arena_values, # Pass the Styler object
elem_id="arena_leaderboard_dataframe",
height=700,
column_widths=[70, 70, 200, 90, 100, 90, 120, 150, 100], # Widths with Delta
wrap=True,
)
else: # Handle styling failure
dataframe_update = gr.Dataframe(value=pd.DataFrame().style) # Empty update
else: # Baseline category or no data for Delta
# Ensure data exists before creating DataFrame
if arena_table_values:
# Create DataFrame without Delta column from the raw values
df_no_delta = pd.DataFrame(arena_table_values, columns=[
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
])
dataframe_update = gr.Dataframe(
headers=[ # Headers without Delta
"Rank* (UB)", "Model", "Arena Elo", "95% CI",
"Votes", "Organization", "License", "Knowledge Cutoff"
],
datatype=[
"str", "markdown", "number", "str", "number",
"str", "str", "str"
],
value=df_no_delta.style, # Apply basic Pandas styling
elem_id="arena_leaderboard_dataframe",
height=700,
column_widths=[70, 190, 100, 100, 90, 130, 150, 100], # Widths without Delta
wrap=True,
)
else:
dataframe_update = gr.Dataframe(value=pd.DataFrame().style) # Empty update
plot_updates = [gr.Plot(value=None)] * 4 # Default empty plot updates
if show_plot:
p1_updated = elo_subset_results.get("win_fraction_heatmap")
p2_updated = elo_subset_results.get("battle_count_heatmap")
p3_updated = elo_subset_results.get("bootstrap_elo_rating")
p4_updated = elo_subset_results.get("average_win_rate_bar")
# Filter plots
p1_filtered = filter_deprecated_models_plots(p1_updated, hidden_models=hidden_models_list)
p2_filtered = filter_deprecated_models_plots(p2_updated, hidden_models=hidden_models_list)
p3_filtered = filter_deprecated_models_plots(p3_updated, hidden_models=hidden_models_list)
p4_filtered = filter_deprecated_models_plots(p4_updated, hidden_models=hidden_models_list)
plot_updates = [p1_filtered, p2_filtered, p3_filtered, p4_filtered]
more_stats_md_updated_text = f"""## More Statistics for Chatbot Arena - {category_display_name} """ if show_plot else ""
more_stats_md_update = gr.Markdown(value=more_stats_md_updated_text)
# Use baseline DF for total counts, subset DF for category-specific counts
category_details_md_updated_text = make_category_arena_leaderboard_md(
arena_df_baseline, arena_subset_df, name=category_display_name # Pass display name
)
category_deets_update = gr.Markdown(value=category_details_md_updated_text)
# Return updates in the correct order matching outputs list
# Order: df, p1, p2, p3, p4, more_stats_md, category_deets
return [dataframe_update] + plot_updates + [more_stats_md_update, category_deets_update]
# Define output components (must exist in the UI build)
outputs_list = [elo_display_df]
if show_plot:
# Add plot components if they exist
outputs_list.extend([plot_1, plot_2, plot_3, plot_4])
# Add markdown component if it exists
if more_stats_md: outputs_list.append(more_stats_md)
else: outputs_list.append(gr.Markdown(visible=False)) # Placeholder if MD wasn't created
else:
# Add placeholders if plots/MD are not shown
outputs_list.extend([gr.Plot(visible=False)] * 4)
outputs_list.append(gr.Markdown(visible=False))
outputs_list.append(category_deets) # Always update category details
# Attach change listeners
category_dropdown.change(
fn=update_leaderboard_and_plots,
inputs=[category_dropdown, category_checkbox],
outputs=outputs_list
)
category_checkbox.change(
fn=update_leaderboard_and_plots, # Use the same function
inputs=[category_dropdown, category_checkbox],
outputs=outputs_list
)
return_components = [md_1, md_2, lb_description, category_deets, elo_display_df]
if show_plot:
# Add plots if they were created
return_components.extend([plot_1, plot_2, plot_3, plot_4])
# Add the extra stats markdown if it was created
if more_stats_md: return_components.append(more_stats_md)
return return_components
def build_demo(elo_results_file, leaderboard_table_file):
# Assumes block_css is available or defined elsewhere
try:
from fastchat.serve.gradio_web_server import block_css
except ImportError:
print("Warning: fastchat.serve.gradio_web_server.block_css not found. Using fallback CSS.")
# Define a minimal fallback CSS or copy the content here
block_css = """
/* Add minimal CSS rules here if needed */
#arena_leaderboard_dataframe table { font-size: 105%; }
#leaderboard_markdown .prose { font-size: 110% !important; }
.app { max-width: 100% !important; padding: 20px !important; }
a { color: #1976D2; text-decoration: none; }
a:hover { color: #63A4FF; text-decoration: underline; }
"""
text_size = gr.themes.sizes.text_lg
# Assumes theme.json is present
try:
theme = gr.themes.Default.load("theme.json")
    except Exception:
print("Warning: theme.json not found. Using default Gradio theme.")
theme = gr.themes.Default(text_size=text_size) # Fallback theme
if hasattr(theme, 'text_size'): theme.text_size = text_size
# Apply custom settings if theme object supports it
if hasattr(theme, 'set'):
theme.set(
button_large_text_size="40px",
button_small_text_size="40px",
button_large_text_weight="1000",
button_small_text_weight="1000",
button_shadow="*shadow_drop_lg",
button_shadow_hover="*shadow_drop_lg",
checkbox_label_shadow="*shadow_drop_lg",
button_shadow_active="*shadow_inset",
button_secondary_background_fill="*primary_300",
button_secondary_background_fill_dark="*primary_700",
button_secondary_background_fill_hover="*primary_200",
button_secondary_background_fill_hover_dark="*primary_500",
button_secondary_text_color="*primary_800",
button_secondary_text_color_dark="white",
)
with gr.Blocks(
title="LLM Arena: Leaderboard", # Translated title
theme=theme,
css=block_css, # Use loaded or fallback CSS
) as demo:
# Build only the leaderboard tab content
# show_plot=True to display plots
leader_components = build_leaderboard_tab(
elo_results_file, leaderboard_table_file, show_plot=True, mirror=False
)
return demo
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--share", action="store_true", default=False) # Default False for HF
parser.add_argument("--host", default="0.0.0.0")
parser.add_argument("--port", type=int, default=7860)
# Removed args specific to monitor.py
args = parser.parse_args()
try:
elo_result_files = glob.glob("elo_results_*.pkl")
if not elo_result_files:
raise FileNotFoundError("No elo_results_*.pkl files found.")
# More robust sorting extracting the number
elo_result_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
elo_result_file = elo_result_files[-1]
print(f"Using Elo results file: {elo_result_file}")
except Exception as e:
print(f"Error finding Elo results file: {e}")
print("Please ensure a file matching 'elo_results_NUMBER.pkl' exists.")
exit(1) # Exit if file not found
try:
leaderboard_table_files = glob.glob("leaderboard_table_*.csv")
if not leaderboard_table_files:
raise FileNotFoundError("No leaderboard_table_*.csv files found.")
leaderboard_table_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))
leaderboard_table_file = leaderboard_table_files[-1]
print(f"Using leaderboard table file: {leaderboard_table_file}")
except Exception as e:
print(f"Error finding leaderboard table file: {e}")
print("Please ensure a file matching 'leaderboard_table_NUMBER.csv' exists.")
exit(1) # Exit if file not found
demo = build_demo(elo_result_file, leaderboard_table_file)
# Launch with args
demo.launch(
server_name=args.host,
server_port=args.port,
share=args.share,
show_api=False
)