import os
import json
import pandas as pd
import glob
import gradio as gr

# Cache for loaded data
data_cache = {}


# Load data functions with caching
def load_jsonl(file_path):
    """Load a JSONL file into a pandas DataFrame with caching."""
    if file_path in data_cache:
        return data_cache[file_path]

    if not os.path.exists(file_path):
        return pd.DataFrame()

    try:
        df = pd.read_json(file_path, lines=True)
        data_cache[file_path] = df
        return df
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return pd.DataFrame()


def get_available_benchmarks():
    """Get list of available benchmarks in data directory."""
    return [dir_name for dir_name in os.listdir("data")
            if os.path.isdir(os.path.join("data", dir_name))]


def get_categories(benchmark):
    """Get list of categories for a given benchmark."""
    questions = load_jsonl(f"data/{benchmark}/question.jsonl")
    if questions.empty:
        return []
    return sorted(questions['category'].unique().tolist())


def get_languages(benchmark):
    """Get list of languages available in the benchmark."""
    questions = load_jsonl(f"data/{benchmark}/question.jsonl")
    if questions.empty or 'language' not in questions.columns:
        return ["English"]  # Default if no language column
    return sorted(questions['language'].unique().tolist())


def get_judges(benchmark):
    """Get list of available judges for a benchmark."""
    judgment_dir = f"data/{benchmark}/model_judgment"
    if not os.path.exists(judgment_dir):
        return []
    return [dir_name for dir_name in os.listdir(judgment_dir)
            if os.path.isdir(os.path.join(judgment_dir, dir_name))]


def get_models(benchmark, judge):
    """Get list of models that have judgments by the specified judge."""
    if not judge:
        return []
    judgment_dir = f"data/{benchmark}/model_judgment/{judge}"
    if not os.path.exists(judgment_dir):
        return []
    return [os.path.splitext(os.path.basename(file))[0]
            for file in glob.glob(f"{judgment_dir}/*.jsonl")]


def get_questions(benchmark, category=None, language=None):
    """Get questions with category and language filters if provided."""
    questions = load_jsonl(f"data/{benchmark}/question.jsonl")
    if questions.empty:
        return []

    # Apply category filter if provided
    if category and category != "All":
        questions = questions[questions['category'] == category]

    # Apply language filter if provided and column exists
    if language and language != "All" and 'language' in questions.columns:
        questions = questions[questions['language'] == language]

    # Create list of question previews with their UIDs
    question_previews = [
        (row['uid'],
         row['prompt'][:100] + "..." if len(row['prompt']) > 100 else row['prompt'])
        for _, row in questions.iterrows()
    ]
    return question_previews
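
# Expected on-disk layout, inferred from the path strings and field accesses in
# this script (only the fields actually read below are guaranteed):
#
#   data/<benchmark>/question.jsonl                        -> {"uid", "category", "prompt", optional "language"}
#   data/<benchmark>/model_answer/<model>.jsonl            -> {"uid", "messages": [user_msg, assistant_msg, ...]}
#   data/<benchmark>/model_judgment/<judge>/<model>.jsonl  -> {"uid", "baseline", "games": [{"score", "judgment"}, ...]}
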
def get_model_answer(benchmark, model, uid):
    """Get a model's answer for a specific question."""
    model_answers = load_jsonl(f"data/{benchmark}/model_answer/{model}.jsonl")
    if model_answers.empty:
        return "No answer found"

    answer = model_answers[model_answers['uid'] == uid]
    if answer.empty:
        return "No answer found"

    # Extract the actual answer from the messages
    try:
        messages = answer.iloc[0]['messages']
        if len(messages) < 2:
            return "No answer found"

        # The assistant's message should be the second one
        assistant_msg = messages[1]
        if 'role' in assistant_msg and assistant_msg['role'] == 'assistant':
            content = assistant_msg['content']
            # Handle different content formats
            if isinstance(content, dict) and 'answer' in content:
                return content['answer']
            elif isinstance(content, str):
                return content
            else:
                return str(content)
        else:
            return "Invalid message format"
    except Exception as e:
        return f"Error extracting answer: {str(e)}"


def get_judgment(benchmark, judge, model, uid):
    """Get judgment for a specific model and question."""
    judgments = load_jsonl(f"data/{benchmark}/model_judgment/{judge}/{model}.jsonl")
    if judgments.empty:
        return None, None

    judgment = judgments[judgments['uid'] == uid]
    if judgment.empty:
        return None, None

    games = judgment.iloc[0]['games']
    if len(games) < 2:
        return (games[0] if games else None), None
    return games[0], games[1]  # First game, second game


def format_judgment(game):
    """Format judgment for display."""
    if not game:
        return "No judgment available"

    score = game.get('score', 'No score')

    # Try to get judgment text
    judgment = game.get('judgment', {})
    if isinstance(judgment, dict) and 'answer' in judgment:
        judgment_text = judgment['answer']
    else:
        judgment_text = str(judgment)

    return f"### Score: {score}\n\n{judgment_text}"


# Gradio interface functions
def update_categories(benchmark):
    """Update category dropdown based on selected benchmark."""
    categories = ["All"] + get_categories(benchmark)
    return gr.Dropdown(choices=categories, value="All")


def update_languages(benchmark):
    """Update language dropdown based on selected benchmark."""
    languages = ["All"] + get_languages(benchmark)
    default = "English" if "English" in languages else languages[0]
    return gr.Dropdown(choices=languages, value=default)


def update_judges(benchmark):
    """Update judge dropdown based on selected benchmark."""
    judges = get_judges(benchmark)
    default = judges[0] if judges else None
    return gr.Dropdown(choices=judges, value=default)


def update_models(benchmark, judge):
    """Update model dropdown based on selected benchmark and judge."""
    models = get_models(benchmark, judge)
    default = models[0] if models else None
    return gr.Dropdown(choices=models, value=default)


def update_questions(benchmark, category, language):
    """Update question dropdown based on selected benchmark, category and language."""
    question_list = get_questions(benchmark, category, language)
    if not question_list:
        return gr.Dropdown(choices=[], value=None), {}

    # Create a dictionary mapping previews to UIDs to ensure we can look up UIDs from previews
    question_dict = {q[1]: q[0] for q in question_list}
    question_options = list(question_dict.keys())
    default = question_options[0] if question_options else None
    return gr.Dropdown(choices=question_options, value=default), question_dict
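
# question_dict maps each (possibly truncated) prompt preview shown in the
# dropdown back to its question uid, e.g. (hypothetical values):
#   {"Explain the difference between ...": "7f2c1a"}
# display_content() below uses this mapping to recover the uid of the
# currently selected preview.
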
def display_content(benchmark, category, language, judge, model, question, question_dict):
    """Display the question, answers, and judgments."""
    if not question or not question_dict or question not in question_dict:
        return ("No question selected", "No baseline answer", "No model answer",
                "No judgment", "No judgment")

    uid = question_dict[question]

    # Load the question text
    questions_df = load_jsonl(f"data/{benchmark}/question.jsonl")
    question_row = questions_df[questions_df['uid'] == uid]
    if question_row.empty:
        return ("Question not found", "No baseline answer", "No model answer",
                "No judgment", "No judgment")
    question_text = question_row.iloc[0]['prompt']

    # Load judgments and identify baseline model
    judgments = load_jsonl(f"data/{benchmark}/model_judgment/{judge}/{model}.jsonl")
    judgment_row = judgments[judgments['uid'] == uid]
    if judgment_row.empty:
        return question_text, "No baseline answer", "No model answer", "No judgment", "No judgment"
    baseline_model = judgment_row.iloc[0]['baseline']

    # Get answers
    baseline_answer = get_model_answer(benchmark, baseline_model, uid)
    model_answer = get_model_answer(benchmark, model, uid)

    # Get judgments
    game1, game2 = get_judgment(benchmark, judge, model, uid)
    judgment1 = format_judgment(game1)
    judgment2 = format_judgment(game2)

    return question_text, baseline_answer, model_answer, judgment1, judgment2


# Initialize app components based on selected benchmark
def init_app(benchmark):
    categories = ["All"] + get_categories(benchmark)
    default_category = "All"

    languages = ["All"] + get_languages(benchmark)
    default_language = "English" if "English" in languages else languages[0]

    judges = get_judges(benchmark)
    default_judge = judges[0] if judges else None

    models = get_models(benchmark, default_judge) if default_judge else []
    default_model = models[0] if models else None

    question_list = get_questions(benchmark, default_category, default_language)
    question_dict = {q[1]: q[0] for q in question_list}
    question_options = list(question_dict.keys())
    default_question = question_options[0] if question_options else None

    # Get initial display content
    if default_question and default_model and default_judge:
        question_text, baseline_ans, model_ans, judgment1, judgment2 = display_content(
            benchmark, default_category, default_language, default_judge,
            default_model, default_question, question_dict
        )
    else:
        question_text = "No question available"
        baseline_ans = "No baseline answer"
        model_ans = "No model answer"
        judgment1 = "No judgment"
        judgment2 = "No judgment"

    return (
        gr.Dropdown(choices=categories, value=default_category),
        gr.Dropdown(choices=languages, value=default_language),
        gr.Dropdown(choices=judges, value=default_judge),
        gr.Dropdown(choices=models, value=default_model),
        gr.Dropdown(choices=question_options, value=default_question),
        question_dict,
        question_text,
        baseline_ans,
        model_ans,
        judgment1,
        judgment2
    )


# Function to go to the next question
def next_question(benchmark, category, language, current_question, question_dict):
    question_list = get_questions(benchmark, category, language)
    previews = [q[1] for q in question_list]
    if current_question not in previews:
        return gr.Dropdown(value=previews[0] if previews else None)
    current_idx = previews.index(current_question)
    next_idx = (current_idx + 1) % len(previews)
    return gr.Dropdown(value=previews[next_idx])
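
# The tuple returned by init_app() is consumed positionally as init_data[i] in
# create_app() below:
#   0: category dropdown   1: language dropdown   2: judge dropdown
#   3: model dropdown      4: question dropdown   5: question preview -> uid dict
#   6: question text       7: baseline answer     8: model answer
#   9: judgment (game 1)  10: judgment (game 2)
# The same ordering is reused as the output list of benchmark_dropdown.change.
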
# Create Gradio app
def create_app():
    benchmarks = get_available_benchmarks()
    default_benchmark = "arena-hard-v2.0" if "arena-hard-v2.0" in benchmarks else benchmarks[0]

    # Initialize data for the default benchmark
    init_data = init_app(default_benchmark)

    with gr.Blocks() as app:
        gr.Markdown(
            '''# Arena-Hard-Auto Benchmark Viewer

Arena-Hard-Auto is an automatic evaluation tool for instruction-tuned LLMs. It has the highest correlation and separability with LMArena (Chatbot Arena) among popular open-ended LLM benchmarks. If you are curious to see how well your model might perform on LMArena before deploying, we recommend trying Arena-Hard-Auto's newest evaluation set, **Arena-Hard-v2.0-Preview**.

**Repo:** https://github.com/lmarena/arena-hard-auto

**Paper:** https://arxiv.org/abs/2406.11939
'''
        )

        with gr.Row():
            with gr.Column():
                benchmark_dropdown = gr.Dropdown(
                    choices=benchmarks,
                    value=default_benchmark,
                    label="Benchmark"
                )
                category_dropdown = gr.Dropdown(
                    choices=init_data[0].choices,
                    value=init_data[0].value,
                    label="Category"
                )
                language_dropdown = gr.Dropdown(
                    choices=init_data[1].choices,
                    value=init_data[1].value,
                    label="Language"
                )
            with gr.Column():
                judge_dropdown = gr.Dropdown(
                    choices=init_data[2].choices,
                    value=init_data[2].value,
                    label="Judge Model"
                )
                model_dropdown = gr.Dropdown(
                    label="Model to Evaluate",
                    choices=init_data[3].choices,
                    value=init_data[3].value,
                )

        question_dict = gr.State(init_data[5])
        question_dropdown = gr.Dropdown(
            choices=init_data[4].choices,
            value=init_data[4].value,
            label="Select Question"
        )

        # Add a next question button
        next_button = gr.Button("Next Question")

        # Display the question
        gr.Markdown("---")
        question_display = gr.Markdown(value="### Question\n\n" + init_data[6])

        with gr.Tabs():
            with gr.TabItem("Game 1: Baseline (A) vs Model (B)"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Baseline (A)")
                        baseline_answer1 = gr.Markdown(value=init_data[7])
                    with gr.Column():
                        gr.Markdown("### Model (B)")
                        model_answer1 = gr.Markdown(value=init_data[8])
                gr.Markdown("---")
                gr.Markdown("### Judgment")
                judgment1 = gr.Markdown(value=init_data[9])
            with gr.TabItem("Game 2: Model (A) vs Baseline (B)"):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("### Model (A)")
                        model_answer2 = gr.Markdown(value=init_data[8])
                    with gr.Column():
                        gr.Markdown("### Baseline (B)")
                        baseline_answer2 = gr.Markdown(value=init_data[7])
                gr.Markdown("---")
                gr.Markdown("### Judgment")
                judgment2 = gr.Markdown(value=init_data[10])

        gr.Markdown("---")
        gr.Markdown("### Citation")
        gr.Markdown("If you find this tool useful, please cite the following papers:")
        gr.Markdown(
            '''```bibtex
@article{li2024crowdsourced,
    title={From Crowdsourced Data to High-Quality Benchmarks: Arena-Hard and BenchBuilder Pipeline},
    author={Li, Tianle and Chiang, Wei-Lin and Frick, Evan and Dunlap, Lisa and Wu, Tianhao and Zhu, Banghua and Gonzalez, Joseph E and Stoica, Ion},
    journal={arXiv preprint arXiv:2406.11939},
    year={2024}
}
@misc{arenahard2024,
    title = {From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline},
    url = {https://lmsys.org/blog/2024-04-19-arena-hard/},
    author = {Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica},
    month = {April},
    year = {2024}
}
```'''
        )
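
        # Event wiring pattern used below: every control change re-computes the
        # Game 1 panes (question, baseline/model answers, both judgments), and a
        # trailing .then() copies the same answers into the Game 2 tab, where
        # the model and baseline positions are swapped.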
        # Set up event handlers
        benchmark_dropdown.change(
            fn=init_app,
            inputs=benchmark_dropdown,
            outputs=[
                category_dropdown, language_dropdown, judge_dropdown, model_dropdown,
                question_dropdown, question_dict, question_display,
                baseline_answer1, model_answer1, judgment1, judgment2
            ]
        ).then(
            fn=lambda model, baseline: (model, baseline),
            inputs=[model_answer1, baseline_answer1],
            outputs=[model_answer2, baseline_answer2]
        )

        # Update questions when category changes
        category_dropdown.change(
            fn=update_questions,
            inputs=[benchmark_dropdown, category_dropdown, language_dropdown],
            outputs=[question_dropdown, question_dict]
        ).then(
            fn=display_content,
            inputs=[benchmark_dropdown, category_dropdown, language_dropdown,
                    judge_dropdown, model_dropdown, question_dropdown, question_dict],
            outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
        ).then(
            fn=lambda model, baseline: (model, baseline),
            inputs=[model_answer1, baseline_answer1],
            outputs=[model_answer2, baseline_answer2]
        )

        # Update questions when language changes
        language_dropdown.change(
            fn=update_questions,
            inputs=[benchmark_dropdown, category_dropdown, language_dropdown],
            outputs=[question_dropdown, question_dict]
        ).then(
            fn=display_content,
            inputs=[benchmark_dropdown, category_dropdown, language_dropdown,
                    judge_dropdown, model_dropdown, question_dropdown, question_dict],
            outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
        ).then(
            fn=lambda model, baseline: (model, baseline),
            inputs=[model_answer1, baseline_answer1],
            outputs=[model_answer2, baseline_answer2]
        )

        # Update models when judge changes
        judge_dropdown.change(
            fn=update_models,
            inputs=[benchmark_dropdown, judge_dropdown],
            outputs=model_dropdown
        ).then(
            fn=display_content,
            inputs=[benchmark_dropdown, category_dropdown, language_dropdown,
                    judge_dropdown, model_dropdown, question_dropdown, question_dict],
            outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
        ).then(
            fn=lambda model, baseline: (model, baseline),
            inputs=[model_answer1, baseline_answer1],
            outputs=[model_answer2, baseline_answer2]
        )

        # Display content when model changes
        model_dropdown.change(
            fn=display_content,
            inputs=[benchmark_dropdown, category_dropdown, language_dropdown,
                    judge_dropdown, model_dropdown, question_dropdown, question_dict],
            outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
        ).then(
            fn=lambda model, baseline: (model, baseline),
            inputs=[model_answer1, baseline_answer1],
            outputs=[model_answer2, baseline_answer2]
        )

        # Display content when question changes
        question_dropdown.change(
            fn=display_content,
            inputs=[benchmark_dropdown, category_dropdown, language_dropdown,
                    judge_dropdown, model_dropdown, question_dropdown, question_dict],
            outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
        ).then(
            fn=lambda model, baseline: (model, baseline),
            inputs=[model_answer1, baseline_answer1],
            outputs=[model_answer2, baseline_answer2]
        )

        # Handle next question button
        next_button.click(
            fn=next_question,
            inputs=[benchmark_dropdown, category_dropdown, language_dropdown,
                    question_dropdown, question_dict],
            outputs=question_dropdown
        )

    return app


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="0.0.0.0")
    parser.add_argument("--port", type=int)
    parser.add_argument("--share", action="store_true")
    args = parser.parse_args()

    app = create_app()
    app.launch(server_name=args.host, server_port=args.port, share=args.share)
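
# Example invocation (assuming this script is saved as app.py; the filename is
# not specified in the source):
#   python app.py                      # serve on 0.0.0.0 on Gradio's default port
#   python app.py --port 7860 --share  # pick a port and create a public share link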