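# Gradio viewer for Arena-Hard-Auto benchmark data: browse questions, model answers,
# and pairwise judgments stored as JSONL files under a local data/ directory.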
import os
import json
import pandas as pd
import glob
import gradio as gr
# Cache for loaded data
data_cache = {}
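# Expected on-disk layout, inferred from the paths used below:
#   data/<benchmark>/question.jsonl                         fields: uid, category, prompt, optional language
#   data/<benchmark>/model_answer/<model>.jsonl             fields: uid, messages
#   data/<benchmark>/model_judgment/<judge>/<model>.jsonl   fields: uid, baseline, games
# Loaded DataFrames are cached in data_cache for the lifetime of the process and are
# never invalidated, so restart the app after regenerating any of these files.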
# Load data functions with caching
def load_jsonl(file_path):
"""Load a JSONL file into a pandas DataFrame with caching."""
if file_path in data_cache:
return data_cache[file_path]
if not os.path.exists(file_path):
return pd.DataFrame()
try:
df = pd.read_json(file_path, lines=True)
data_cache[file_path] = df
return df
except Exception as e:
print(f"Error loading {file_path}: {e}")
return pd.DataFrame()
def get_available_benchmarks():
"""Get list of available benchmarks in data directory."""
return [dir_name for dir_name in os.listdir("data")
if os.path.isdir(os.path.join("data", dir_name))]
def get_categories(benchmark):
"""Get list of categories for a given benchmark."""
questions = load_jsonl(f"data/{benchmark}/question.jsonl")
if questions.empty:
return []
return sorted(questions['category'].unique().tolist())
def get_languages(benchmark):
"""Get list of languages available in the benchmark."""
questions = load_jsonl(f"data/{benchmark}/question.jsonl")
if questions.empty or 'language' not in questions.columns:
return ["English"] # Default if no language column
return sorted(questions['language'].unique().tolist())
def get_judges(benchmark):
"""Get list of available judges for a benchmark."""
judgment_dir = f"data/{benchmark}/model_judgment"
if not os.path.exists(judgment_dir):
return []
return [dir_name for dir_name in os.listdir(judgment_dir)
if os.path.isdir(os.path.join(judgment_dir, dir_name))]
def get_models(benchmark, judge):
"""Get list of models that have judgments by the specified judge."""
if not judge:
return []
judgment_dir = f"data/{benchmark}/model_judgment/{judge}"
if not os.path.exists(judgment_dir):
return []
return [os.path.splitext(os.path.basename(file))[0]
for file in glob.glob(f"{judgment_dir}/*.jsonl")]
def get_questions(benchmark, category=None, language=None):
"""Get questions with category and language filters if provided."""
questions = load_jsonl(f"data/{benchmark}/question.jsonl")
if questions.empty:
return []
# Apply category filter if provided
if category and category != "All":
questions = questions[questions['category'] == category]
# Apply language filter if provided and column exists
if language and language != "All" and 'language' in questions.columns:
questions = questions[questions['language'] == language]
# Create list of question previews with their UIDs
    question_previews = [
        (row['uid'], (row['prompt'][:100] + "...") if len(row['prompt']) > 100 else row['prompt'])
        for _, row in questions.iterrows()
    ]
return question_previews
def get_model_answer(benchmark, model, uid):
"""Get a model's answer for a specific question."""
model_answers = load_jsonl(f"data/{benchmark}/model_answer/{model}.jsonl")
if model_answers.empty:
return "No answer found"
answer = model_answers[model_answers['uid'] == uid]
if answer.empty:
return "No answer found"
# Extract the actual answer from the messages
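    # 'messages' is assumed to be a [user, assistant] turn list, e.g.
    # [{"role": "user", ...}, {"role": "assistant", "content": <str or {"answer": ...}>}].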
try:
messages = answer.iloc[0]['messages']
if len(messages) < 2:
return "No answer found"
# The assistant's message should be the second one
assistant_msg = messages[1]
if 'role' in assistant_msg and assistant_msg['role'] == 'assistant':
content = assistant_msg['content']
# Handle different content formats
if isinstance(content, dict) and 'answer' in content:
return content['answer']
elif isinstance(content, str):
return content
else:
return str(content)
else:
return "Invalid message format"
except Exception as e:
return f"Error extracting answer: {str(e)}"
def get_judgment(benchmark, judge, model, uid):
"""Get judgment for a specific model and question."""
judgments = load_jsonl(f"data/{benchmark}/model_judgment/{judge}/{model}.jsonl")
if judgments.empty:
return None, None
judgment = judgments[judgments['uid'] == uid]
if judgment.empty:
return None, None
games = judgment.iloc[0]['games']
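    # 'games' is expected to hold two pairwise comparisons per question: game 1 with the
    # baseline as assistant A, game 2 with the positions swapped (matching the two tabs
    # built in create_app below).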
if len(games) < 2:
return games[0] if games else None, None
return games[0], games[1] # First game, second game
def format_judgment(game):
"""Format judgment for display."""
if not game:
return "No judgment available"
score = game.get('score', 'No score')
# Try to get judgment text
judgment = game.get('judgment', {})
if isinstance(judgment, dict) and 'answer' in judgment:
judgment_text = judgment['answer']
else:
judgment_text = str(judgment)
return f"### Score: {score}\n\n{judgment_text}"
# Gradio interface functions
def update_categories(benchmark):
"""Update category dropdown based on selected benchmark."""
categories = ["All"] + get_categories(benchmark)
return gr.Dropdown(choices=categories, value="All")
def update_languages(benchmark):
"""Update language dropdown based on selected benchmark."""
languages = ["All"] + get_languages(benchmark)
default = "English" if "English" in languages else languages[0]
return gr.Dropdown(choices=languages, value=default)
def update_judges(benchmark):
"""Update judge dropdown based on selected benchmark."""
judges = get_judges(benchmark)
default = judges[0] if judges else None
return gr.Dropdown(choices=judges, value=default)
def update_models(benchmark, judge):
"""Update model dropdown based on selected benchmark and judge."""
models = get_models(benchmark, judge)
default = models[0] if models else None
return gr.Dropdown(choices=models, value=default)
def update_questions(benchmark, category, language):
"""Update question dropdown based on selected benchmark, category and language."""
question_list = get_questions(benchmark, category, language)
if not question_list:
return gr.Dropdown(choices=[], value=None), {}
# Create a dictionary mapping previews to UIDs to ensure we can look up UIDs from previews
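    # Note: if two prompts share the same truncated preview, the later entry overwrites the earlier one.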
question_dict = {q[1]: q[0] for q in question_list}
question_options = list(question_dict.keys())
default = question_options[0] if question_options else None
return gr.Dropdown(choices=question_options, value=default), question_dict
def display_content(benchmark, category, language, judge, model, question, question_dict):
"""Display the question, answers, and judgments."""
if not question or not question_dict or question not in question_dict:
return "No question selected", "No baseline answer", "No model answer", "No judgment", "No judgment"
uid = question_dict[question]
# Load the question text
questions_df = load_jsonl(f"data/{benchmark}/question.jsonl")
question_row = questions_df[questions_df['uid'] == uid]
if question_row.empty:
return "Question not found", "No baseline answer", "No model answer", "No judgment", "No judgment"
    question_text = "### Question\n\n" + question_row.iloc[0]['prompt']  # keep the header shown on first render
# Load judgments and identify baseline model
judgments = load_jsonl(f"data/{benchmark}/model_judgment/{judge}/{model}.jsonl")
judgment_row = judgments[judgments['uid'] == uid]
if judgment_row.empty:
return question_text, "No baseline answer", "No model answer", "No judgment", "No judgment"
baseline_model = judgment_row.iloc[0]['baseline']
# Get answers
baseline_answer = get_model_answer(benchmark, baseline_model, uid)
model_answer = get_model_answer(benchmark, model, uid)
# Get judgments
game1, game2 = get_judgment(benchmark, judge, model, uid)
judgment1 = format_judgment(game1)
judgment2 = format_judgment(game2)
return question_text, baseline_answer, model_answer, judgment1, judgment2
# Initialize app components based on selected benchmark
def init_app(benchmark):
categories = ["All"] + get_categories(benchmark)
default_category = "All"
languages = ["All"] + get_languages(benchmark)
default_language = "English" if "English" in languages else languages[0]
judges = get_judges(benchmark)
default_judge = judges[0] if judges else None
models = get_models(benchmark, default_judge) if default_judge else []
default_model = models[0] if models else None
question_list = get_questions(benchmark, default_category, default_language)
question_dict = {q[1]: q[0] for q in question_list}
question_options = list(question_dict.keys())
default_question = question_options[0] if question_options else None
# Get initial display content
if default_question and default_model and default_judge:
question_text, baseline_ans, model_ans, judgment1, judgment2 = display_content(
benchmark, default_category, default_language, default_judge, default_model, default_question, question_dict
)
else:
question_text = "No question available"
baseline_ans = "No baseline answer"
model_ans = "No model answer"
judgment1 = "No judgment"
judgment2 = "No judgment"
return (
gr.Dropdown(choices=categories, value=default_category),
gr.Dropdown(choices=languages, value=default_language),
gr.Dropdown(choices=judges, value=default_judge),
gr.Dropdown(choices=models, value=default_model),
gr.Dropdown(choices=question_options, value=default_question),
question_dict,
question_text,
baseline_ans, model_ans,
judgment1, judgment2
)
# Function to go to the next question
def next_question(benchmark, category, language, current_question, question_dict):
question_list = get_questions(benchmark, category, language)
previews = [q[1] for q in question_list]
if current_question not in previews:
return gr.Dropdown(value=previews[0] if previews else None)
current_idx = previews.index(current_question)
next_idx = (current_idx + 1) % len(previews)
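    # Wrap around to the first question after the last one. Returning a new value for the
    # dropdown also fires its .change handler, which refreshes the displayed content.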
return gr.Dropdown(value=previews[next_idx])
# Create Gradio app
def create_app():
benchmarks = get_available_benchmarks()
default_benchmark = "arena-hard-v2.0" if "arena-hard-v2.0" in benchmarks else benchmarks[0]
# Initialize data for the default benchmark
init_data = init_app(default_benchmark)
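    # init_data packs, in order: category/language/judge/model/question dropdowns, the
    # question_dict mapping, the question text, baseline and model answers, and the two
    # judgments (indices 0-10); these are unpacked positionally below.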
with gr.Blocks() as app:
gr.Markdown(
'''# Arena-Hard-Auto Benchmark Viewer
Arena-Hard-Auto is an automatic evaluation tool for instruction-tuned LLMs. Among popular open-ended LLM benchmarks, it has the highest correlation with LMArena (Chatbot Arena) and the strongest separability between models. If you are curious how well your model might perform on LMArena before deploying, we recommend trying Arena-Hard-Auto's newest evaluation set, **Arena-Hard-v2.0-Preview**.
**Repo:** https://github.com/lmarena/arena-hard-auto
**Paper:** https://arxiv.org/abs/2406.11939
'''
)
with gr.Row():
with gr.Column():
benchmark_dropdown = gr.Dropdown(
choices=benchmarks,
value=default_benchmark,
label="Benchmark"
)
category_dropdown = gr.Dropdown(
choices=init_data[0].choices,
value=init_data[0].value,
label="Category"
)
language_dropdown = gr.Dropdown(
choices=init_data[1].choices,
value=init_data[1].value,
label="Language"
)
with gr.Column():
judge_dropdown = gr.Dropdown(
choices=init_data[2].choices,
value=init_data[2].value,
label="Judge Model"
)
model_dropdown = gr.Dropdown(
label="Model to Evaluate",
choices=init_data[3].choices,
value=init_data[3].value,
)
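        # gr.State carries the preview -> uid mapping between callbacks so the selected
        # preview string can be resolved back to a question uid.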
question_dict = gr.State(init_data[5])
question_dropdown = gr.Dropdown(
choices=init_data[4].choices,
value=init_data[4].value,
label="Select Question"
)
# Add a next question button
next_button = gr.Button("Next Question")
# Display the question
gr.Markdown("---")
        question_display = gr.Markdown(value=init_data[6])
with gr.Tabs():
with gr.TabItem("Game 1: Baseline (A) vs Model (B)"):
with gr.Row():
with gr.Column():
gr.Markdown("### Baseline (A)")
baseline_answer1 = gr.Markdown(value=init_data[7])
with gr.Column():
gr.Markdown("### Model (B)")
model_answer1 = gr.Markdown(value=init_data[8])
gr.Markdown("---")
gr.Markdown("### Judgment")
judgment1 = gr.Markdown(value=init_data[9])
with gr.TabItem("Game 2: Model (A) vs Baseline (B)"):
with gr.Row():
with gr.Column():
gr.Markdown("### Model (A)")
model_answer2 = gr.Markdown(value=init_data[8])
with gr.Column():
gr.Markdown("### Baseline (B)")
baseline_answer2 = gr.Markdown(value=init_data[7])
gr.Markdown("---")
gr.Markdown("### Judgment")
judgment2 = gr.Markdown(value=init_data[10])
gr.Markdown("---")
gr.Markdown("### Citation")
gr.Markdown("If you find this tool useful, please cite the following papers:")
gr.Markdown(
'''```bibtex
@article{li2024crowdsourced,
title={From Crowdsourced Data to High-Quality Benchmarks: Arena-Hard and BenchBuilder Pipeline},
author={Li, Tianle and Chiang, Wei-Lin and Frick, Evan and Dunlap, Lisa and Wu, Tianhao and Zhu, Banghua and Gonzalez, Joseph E and Stoica, Ion},
journal={arXiv preprint arXiv:2406.11939},
year={2024}
}
@misc{arenahard2024,
title = {From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline},
url = {https://lmsys.org/blog/2024-04-19-arena-hard/},
    author = {Tianle Li* and Wei-Lin Chiang* and Evan Frick and Lisa Dunlap and Banghua Zhu and Joseph E. Gonzalez and Ion Stoica},
month = {April},
year = {2024}
}
```''')
# Set up event handlers
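        # Each handler refreshes the Game 1 panes first; the chained .then() then mirrors
        # the same answers into the Game 2 tab with the model/baseline positions swapped.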
benchmark_dropdown.change(
fn=init_app,
inputs=benchmark_dropdown,
outputs=[
category_dropdown, language_dropdown, judge_dropdown, model_dropdown,
question_dropdown, question_dict,
question_display,
baseline_answer1, model_answer1,
judgment1, judgment2
]
).then(
fn=lambda model, baseline: (model, baseline),
inputs=[model_answer1, baseline_answer1],
outputs=[model_answer2, baseline_answer2]
)
# Update questions when category changes
category_dropdown.change(
fn=update_questions,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown],
outputs=[question_dropdown, question_dict]
).then(
fn=display_content,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
).then(
fn=lambda model, baseline: (model, baseline),
inputs=[model_answer1, baseline_answer1],
outputs=[model_answer2, baseline_answer2]
)
# Update questions when language changes
language_dropdown.change(
fn=update_questions,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown],
outputs=[question_dropdown, question_dict]
).then(
fn=display_content,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
).then(
fn=lambda model, baseline: (model, baseline),
inputs=[model_answer1, baseline_answer1],
outputs=[model_answer2, baseline_answer2]
)
# Update models when judge changes
judge_dropdown.change(
fn=update_models,
inputs=[benchmark_dropdown, judge_dropdown],
outputs=model_dropdown
).then(
fn=display_content,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
).then(
fn=lambda model, baseline: (model, baseline),
inputs=[model_answer1, baseline_answer1],
outputs=[model_answer2, baseline_answer2]
)
# Display content when model changes
model_dropdown.change(
fn=display_content,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
).then(
fn=lambda model, baseline: (model, baseline),
inputs=[model_answer1, baseline_answer1],
outputs=[model_answer2, baseline_answer2]
)
# Display content when question changes
question_dropdown.change(
fn=display_content,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
).then(
fn=lambda model, baseline: (model, baseline),
inputs=[model_answer1, baseline_answer1],
outputs=[model_answer2, baseline_answer2]
)
# Handle next question button
next_button.click(
fn=next_question,
inputs=[benchmark_dropdown, category_dropdown, language_dropdown, question_dropdown, question_dict],
outputs=question_dropdown
)
return app
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="0.0.0.0")
parser.add_argument("--port", type=int)
parser.add_argument("--share", action="store_true")
args = parser.parse_args()
app = create_app()
app.launch(server_name=args.host, server_port=args.port, share=args.share)
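    # Example invocations (assuming this file is saved as app.py):
    #   python app.py                       # bind 0.0.0.0, let Gradio choose the port
    #   python app.py --port 8080 --share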