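# Gradio viewer for Arena-Hard-Auto benchmark data: browse questions, model answers,
# and pairwise judgments stored as JSONL files under a local data/ directory.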
import os
import json
import pandas as pd
import glob
import gradio as gr
# Cache for loaded data
data_cache = {}
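# Expected on-disk layout, inferred from the paths used below:
#   data/<benchmark>/question.jsonl                         fields: uid, category, prompt, optional language
#   data/<benchmark>/model_answer/<model>.jsonl             fields: uid, messages
#   data/<benchmark>/model_judgment/<judge>/<model>.jsonl   fields: uid, baseline, games
# Loaded DataFrames are cached in data_cache for the lifetime of the process and are
# never invalidated, so restart the app after regenerating any of these files.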
# Load data functions with caching
def load_jsonl(file_path):
"""Load a JSONL file into a pandas DataFrame with caching."""
if file_path in data_cache:
return data_cache[file_path]
if not os.path.exists(file_path):
return pd.DataFrame()
try:
df = pd.read_json(file_path, lines=True)
data_cache[file_path] = df
return df
except Exception as e:
print(f"Error loading {file_path}: {e}")
return pd.DataFrame()
def get_available_benchmarks():
"""Get list of available benchmarks in data directory."""
return [dir_name for dir_name in os.listdir("data")
if os.path.isdir(os.path.join("data", dir_name))]
def get_categories(benchmark):
"""Get list of categories for a given benchmark."""
questions = load_jsonl(f"data/{benchmark}/question.jsonl")
if questions.empty:
return []
return sorted(questions['category'].unique().tolist())
def get_languages(benchmark):
"""Get list of languages available in the benchmark."""
questions = load_jsonl(f"data/{benchmark}/question.jsonl")
if questions.empty or 'language' not in questions.columns:
return ["English"] # Default if no language column
return sorted(questions['language'].unique().tolist())
def get_judges(benchmark):
"""Get list of available judges for a benchmark."""
judgment_dir = f"data/{benchmark}/model_judgment"
if not os.path.exists(judgment_dir):
return []
return [dir_name for dir_name in os.listdir(judgment_dir)
if os.path.isdir(os.path.join(judgment_dir, dir_name))]
def get_models(benchmark, judge):
"""Get list of models that have judgments by the specified judge."""
if not judge:
return []
judgment_dir = f"data/{benchmark}/model_judgment/{judge}"
if not os.path.exists(judgment_dir):
return []
return [os.path.splitext(os.path.basename(file))[0]
for file in glob.glob(f"{judgment_dir}/*.jsonl")]
def get_questions(benchmark, category=None, language=None):
"""Get questions with category and language filters if provided."""
questions = load_jsonl(f"data/{benchmark}/question.jsonl")
if questions.empty:
return []
# Apply category filter if provided
if category and category != "All":
questions = questions[questions['category'] == category]
# Apply language filter if provided and column exists
if language and language != "All" and 'language' in questions.columns:
questions = questions[questions['language'] == language]
# Create list of question previews with their UIDs
    question_previews = [
        (row['uid'], (row['prompt'][:100] + "...") if len(row['prompt']) > 100 else row['prompt'])
        for _, row in questions.iterrows()
    ]
return question_previews
def get_model_answer(benchmark, model, uid):
"""Get a model's answer for a specific question."""
model_answers = load_jsonl(f"data/{benchmark}/model_answer/{model}.jsonl")
if model_answers.empty:
return "No answer found"
answer = model_answers[model_answers['uid'] == uid]
if answer.empty:
return "No answer found"
# Extract the actual answer from the messages
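    # 'messages' is assumed to be a [user, assistant] turn list, e.g.
    # [{"role": "user", ...}, {"role": "assistant", "content": <str or {"answer": ...}>}].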
try:
messages = answer.iloc[0]['messages']
if len(messages) < 2:
return "No answer found"
# The assistant's message should be the second one
assistant_msg = messages[1]
if 'role' in assistant_msg and assistant_msg['role'] == 'assistant':
content = assistant_msg['content']
# Handle different content formats
if isinstance(content, dict) and 'answer' in content:
return content['answer']
elif isinstance(content, str):
return content
else:
return str(content)
else:
return "Invalid message format"
except Exception as e:
return f"Error extracting answer: {str(e)}"
def get_judgment(benchmark, judge, model, uid):
"""Get judgment for a specific model and question."""
judgments = load_jsonl(f"data/{benchmark}/model_judgment/{judge}/{model}.jsonl")
if judgments.empty:
return None, None
judgment = judgments[judgments['uid'] == uid]
if judgment.empty:
return None, None
games = judgment.iloc[0]['games']
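    # 'games' is expected to hold two pairwise comparisons per question: game 1 with the
    # baseline as assistant A, game 2 with the positions swapped (matching the two tabs
    # built in create_app below).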
if len(games) < 2:
return games[0] if games else None, None
return games[0], games[1] # First game, second game
def format_judgment(game):
"""Format judgment for display."""
if not game:
return "No judgment available"
score = game.get('score', 'No score')
# Try to get judgment text
judgment = game.get('judgment', {})
if isinstance(judgment, dict) and 'answer' in judgment:
judgment_text = judgment['answer']
else:
judgment_text = str(judgment)
return f"### Score: {score}\n\n{judgment_text}"
# Gradio interface functions
def update_categories(benchmark):
"""Update category dropdown based on selected benchmark."""
categories = ["All"] + get_categories(benchmark)
return gr.Dropdown(choices=categories, value="All")
def update_languages(benchmark):
"""Update language dropdown based on selected benchmark."""
languages = ["All"] + get_languages(benchmark)
default = "English" if "English" in languages else languages[0]
return gr.Dropdown(choices=languages, value=default)
def update_judges(benchmark):
"""Update judge dropdown based on selected benchmark."""
judges = get_judges(benchmark)
default = judges[0] if judges else None
return gr.Dropdown(choices=judges, value=default)
def update_models(benchmark, judge):
"""Update model dropdown based on selected benchmark and judge."""
models = get_models(benchmark, judge)
default = models[0] if models else None
return gr.Dropdown(choices=models, value=default)
def update_questions(benchmark, category, language):
"""Update question dropdown based on selected benchmark, category and language."""
question_list = get_questions(benchmark, category, language)
if not question_list:
return gr.Dropdown(choices=[], value=None), {}
# Create a dictionary mapping previews to UIDs to ensure we can look up UIDs from previews
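    # Note: if two prompts share the same truncated preview, the later entry overwrites the earlier one.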
question_dict = {q[1]: q[0] for q in question_list}
question_options = list(question_dict.keys())
default = question_options[0] if question_options else None
return gr.Dropdown(choices=question_options, value=default), question_dict
def display_content(benchmark, category, language, judge, model, question, question_dict):
"""Display the question, answers, and judgments."""
if not question or not question_dict or question not in question_dict:
return "No question selected", "No baseline answer", "No model answer", "No judgment", "No judgment"
uid = question_dict[question]
# Load the question text
questions_df = load_jsonl(f"data/{benchmark}/question.jsonl")
question_row = questions_df[questions_df['uid'] == uid]
if question_row.empty:
return "Question not found", "No baseline answer", "No model answer", "No judgment", "No judgment"
    question_text = "### Question\n\n" + question_row.iloc[0]['prompt']  # keep the header shown on first render
# Load judgments and identify baseline model
judgments = load_jsonl(f"data/{benchmark}/model_judgment/{judge}/{model}.jsonl")
judgment_row = judgments[judgments['uid'] == uid]
if judgment_row.empty:
return question_text, "No baseline answer", "No model answer", "No judgment", "No judgment"
baseline_model = judgment_row.iloc[0]['baseline']
# Get answers
baseline_answer = get_model_answer(benchmark, baseline_model, uid)
model_answer = get_model_answer(benchmark, model, uid)
# Get judgments
game1, game2 = get_judgment(benchmark, judge, model, uid)
judgment1 = format_judgment(game1)
judgment2 = format_judgment(game2)
return question_text, baseline_answer, model_answer, judgment1, judgment2
# Initialize app components based on selected benchmark
def init_app(benchmark):
categories = ["All"] + get_categories(benchmark)
default_category = "All"
languages = ["All"] + get_languages(benchmark)
default_language = "English" if "English" in languages else languages[0]
judges = get_judges(benchmark)
default_judge = judges[0] if judges else None
models = get_models(benchmark, default_judge) if default_judge else []
default_model = models[0] if models else None
question_list = get_questions(benchmark, default_category, default_language)
question_dict = {q[1]: q[0] for q in question_list}
question_options = list(question_dict.keys())
default_question = question_options[0] if question_options else None
# Get initial display content
if default_question and default_model and default_judge:
question_text, baseline_ans, model_ans, judgment1, judgment2 = display_content(
benchmark, default_category, default_language, default_judge, default_model, default_question, question_dict
)
else:
question_text = "No question available"
baseline_ans = "No baseline answer"
model_ans = "No model answer"
judgment1 = "No judgment"
judgment2 = "No judgment"
return (
gr.Dropdown(choices=categories, value=default_category),
gr.Dropdown(choices=languages, value=default_language),
gr.Dropdown(choices=judges, value=default_judge),
gr.Dropdown(choices=models, value=default_model),
gr.Dropdown(choices=question_options, value=default_question),
question_dict,
question_text,
baseline_ans, model_ans,
judgment1, judgment2
)
# Function to go to the next question
def next_question(benchmark, category, language, current_question, question_dict):
question_list = get_questions(benchmark, category, language)
previews = [q[1] for q in question_list]
if current_question not in previews:
return gr.Dropdown(value=previews[0] if previews else None)
current_idx = previews.index(current_question)
next_idx = (current_idx + 1) % len(previews)
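    # Wrap around to the first question after the last one. Returning a new value for the
    # dropdown also fires its .change handler, which refreshes the displayed content.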
return gr.Dropdown(value=previews[next_idx])
# Create Gradio app
def create_app():
benchmarks = get_available_benchmarks()
default_benchmark = "arena-hard-v2.0" if "arena-hard-v2.0" in benchmarks else benchmarks[0]
# Initialize data for the default benchmark
init_data = init_app(default_benchmark)
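    # init_data packs, in order: category/language/judge/model/question dropdowns, the
    # question_dict mapping, the question text, baseline and model answers, and the two
    # judgments (indices 0-10); these are unpacked positionally below.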
with gr.Blocks() as app:
gr.Markdown(
'''# Arena-Hard-Auto Benchmark Viewer
Arena-Hard-Auto is an automatic evaluation tool for instruction-tuned LLMs. Among popular open-ended LLM benchmarks, it has the highest correlation with LMArena (Chatbot Arena) and the strongest separability between models. If you are curious how well your model might perform on LMArena before deploying, we recommend trying Arena-Hard-Auto's newest evaluation set, **Arena-Hard-v2.0-Preview**.
**Repo:** https://github.com/lmarena/arena-hard-auto
**Paper:** https://arxiv.org/abs/2406.11939
'''
)
with gr.Row():
with gr.Column():
benchmark_dropdown = gr.Dropdown(
choices=benchmarks,
value=default_benchmark,
label="Benchmark"
)
category_dropdown = gr.Dropdown(
choices=init_data[0].choices,
value=init_data[0].value,
label="Category"
)
language_dropdown = gr.Dropdown(
choices=init_data[1].choices,
value=init_data[1].value,
label="Language"
)
with gr.Column():
judge_dropdown = gr.Dropdown(
choices=init_data[2].choices,
value=init_data[2].value,
label="Judge Model"
)
model_dropdown = gr.Dropdown(
label="Model to Evaluate",
choices=init_data[3].choices,
value=init_data[3].value,
)
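        # gr.State carries the preview -> uid mapping between callbacks so the selected
        # preview string can be resolved back to a question uid.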
question_dict = gr.State(init_data[5])
question_dropdown = gr.Dropdown(
choices=init_data[4].choices,
value=init_data[4].value,
label="Select Question"
)
# Add a next question button
next_button = gr.Button("Next Question")
# Display the question
gr.Markdown("---")
        question_display = gr.Markdown(value=init_data[6])
with gr.Tabs():
with gr.TabItem("Game 1: Baseline (A) vs Model (B)"):
with gr.Row():
with gr.Column():
gr.Markdown("### Baseline (A)")
baseline_answer1 = gr.Markdown(value=init_data[7])
with gr.Column():
gr.Markdown("### Model (B)")
model_answer1 = gr.Markdown(value=init_data[8])
gr.Markdown("---")
gr.Markdown("### Judgment")
judgment1 = gr.Markdown(value=init_data[9])
with gr.TabItem("Game 2: Model (A) vs Baseline (B)"):
with gr.Row():
with gr.Column():
gr.Markdown("### Model (A)")
model_answer2 = gr.Markdown(value=init_data[8])
with gr.Column():
gr.Markdown("### Baseline (B)")
baseline_answer2 = gr.Markdown(value=init_data[7])
gr.Markdown("---")
gr.Markdown("### Judgment")
judgment2 = gr.Markdown(value=init_data[10])
gr.Markdown("---")
gr.Markdown("### Citation")
gr.Markdown("If you find this tool useful, please cite the following papers:")
gr.Markdown(
'''```bibtex
@article{li2024crowdsourced,
title={From Crowdsourced Data to High-Quality Benchmarks: Arena-Hard and BenchBuilder Pipeline},
author={Li, Tianle and Chiang, Wei-Lin and Frick, Evan and Dunlap, Lisa and Wu, Tianhao and Zhu, Banghua and Gonzalez, Joseph E and Stoica, Ion},
journal={arXiv preprint arXiv:2406.11939},
year={2024}
}
@misc{arenahard2024,
title = {From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline},
url = {https://lmsys.org/blog/2024-04-19-arena-hard/},
    author = {Tianle Li* and Wei-Lin Chiang* and Evan Frick and Lisa Dunlap and Banghua Zhu and Joseph E. Gonzalez and Ion Stoica},
month = {April},
year = {2024}
}
```''')
# Set up event handlers
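        # Each handler refreshes the Game 1 panes first; the chained .then() then mirrors
        # the same answers into the Game 2 tab with the model/baseline positions swapped.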
benchmark_dropdown.change(
fn=init_app,
inputs=benchmark_dropdown,
outputs=[
category_dropdown, language_dropdown, judge_dropdown, model_dropdown,
question_dropdown, question_dict,
question_display,
baseline_answer1, model_answer1,
judgment1, judgment2
]
).then(
fn=lambda model, baseline: (model, baseline),
inputs=[model_answer1, baseline_answer1],
outputs=[model_answer2, baseline_answer2]
)
# Update questions when category changes
category_dropdown.change(
fn=update_questions,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown],
outputs=[question_dropdown, question_dict]
).then(
fn=display_content,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
).then(
fn=lambda model, baseline: (model, baseline),
inputs=[model_answer1, baseline_answer1],
outputs=[model_answer2, baseline_answer2]
)
# Update questions when language changes
language_dropdown.change(
fn=update_questions,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown],
outputs=[question_dropdown, question_dict]
).then(
fn=display_content,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
).then(
fn=lambda model, baseline: (model, baseline),
inputs=[model_answer1, baseline_answer1],
outputs=[model_answer2, baseline_answer2]
)
# Update models when judge changes
judge_dropdown.change(
fn=update_models,
inputs=[benchmark_dropdown, judge_dropdown],
outputs=model_dropdown
).then(
fn=display_content,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
).then(
fn=lambda model, baseline: (model, baseline),
inputs=[model_answer1, baseline_answer1],
outputs=[model_answer2, baseline_answer2]
)
# Display content when model changes
model_dropdown.change(
fn=display_content,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
).then(
fn=lambda model, baseline: (model, baseline),
inputs=[model_answer1, baseline_answer1],
outputs=[model_answer2, baseline_answer2]
)
# Display content when question changes
question_dropdown.change(
fn=display_content,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
).then(
fn=lambda model, baseline: (model, baseline),
inputs=[model_answer1, baseline_answer1],
outputs=[model_answer2, baseline_answer2]
)
# Handle next question button
next_button.click(
fn=next_question,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown, question_dropdown, question_dict],
outputs=question_dropdown
)
return app
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="0.0.0.0")
parser.add_argument("--port", type=int)
parser.add_argument("--share", action="store_true")
args = parser.parse_args()
app = create_app()
app.launch(server_name=args.host, server_port=args.port, share=args.share)
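    # Example invocations (assuming this file is saved as app.py):
    #   python app.py                       # bind 0.0.0.0, let Gradio choose the port
    #   python app.py --port 8080 --share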