Timmli committed
Commit ad39573 · Parent: 3c1e115

v2.0 release

Files changed (30)
  1. .gitattributes +1 -0
  2. app.py +496 -0
  3. data/arena-hard-v0.1/model_answer/claude-3-5-sonnet-20240620.jsonl +3 -0
  4. data/arena-hard-v0.1/model_answer/claude-3-haiku-20240307.jsonl +3 -0
  5. data/arena-hard-v0.1/model_answer/claude-3-opus-20240229.jsonl +3 -0
  6. data/arena-hard-v0.1/model_answer/claude-3-sonnet-20240229.jsonl +3 -0
  7. data/arena-hard-v0.1/model_answer/gemma-2-27b-it.jsonl +3 -0
  8. data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl +3 -0
  9. data/arena-hard-v0.1/model_answer/gpt-4-0613.jsonl +3 -0
  10. data/arena-hard-v0.1/model_answer/gpt-4o-2024-05-13.jsonl +3 -0
  11. data/arena-hard-v0.1/model_answer/gpt-4o-mini-2024-07-18.jsonl +3 -0
  12. data/arena-hard-v0.1/model_answer/llama-3.1-70b-instruct.jsonl +3 -0
  13. data/arena-hard-v0.1/model_answer/llama-3.1-8b-instruct.jsonl +3 -0
  14. data/arena-hard-v0.1/model_answer/o1-mini-2024-09-12.jsonl +3 -0
  15. data/arena-hard-v0.1/model_answer/o1-preview-2024-09-12.jsonl +3 -0
  16. data/arena-hard-v0.1/model_answer/qwen2.5-72b-instruct.jsonl +3 -0
  17. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-5-sonnet-20240620.jsonl +3 -0
  18. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-haiku-20240307.jsonl +3 -0
  19. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-opus-20240229.jsonl +3 -0
  20. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-sonnet-20240229.jsonl +3 -0
  21. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gemma-2-27b-it.jsonl +3 -0
  22. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-4-0613.jsonl +3 -0
  23. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-4o-2024-05-13.jsonl +3 -0
  24. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-4o-mini-2024-07-18.jsonl +3 -0
  25. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/llama-3.1-70b-instruct.jsonl +3 -0
  26. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/llama-3.1-8b-instruct.jsonl +3 -0
  27. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/o1-mini-2024-09-12.jsonl +3 -0
  28. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/o1-preview-2024-09-12.jsonl +3 -0
  29. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/qwen2.5-72b-instruct.jsonl +3 -0
  30. data/arena-hard-v0.1/question.jsonl +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.jsonl filter=lfs diff=lfs merge=lfs -text
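The added rule, the kind written by `git lfs track "*.jsonl"`, makes Git LFS store every `.jsonl` file as a small pointer stub rather than the full blob, which is why the data files added below show only three pointer lines each.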
app.py ADDED
@@ -0,0 +1,496 @@
+ import os
+ import json
+ import pandas as pd
+ import glob
+ import gradio as gr
+
+ # Cache for loaded data
+ data_cache = {}
+
+ # Load data functions with caching
+ def load_jsonl(file_path):
+     """Load a JSONL file into a pandas DataFrame with caching."""
+     if file_path in data_cache:
+         return data_cache[file_path]
+
+     if not os.path.exists(file_path):
+         return pd.DataFrame()
+
+     try:
+         df = pd.read_json(file_path, lines=True)
+         data_cache[file_path] = df
+         return df
+     except Exception as e:
+         print(f"Error loading {file_path}: {e}")
+         return pd.DataFrame()
+
+ def get_available_benchmarks():
+     """Get list of available benchmarks in data directory."""
+     return [dir_name for dir_name in os.listdir("data")
+             if os.path.isdir(os.path.join("data", dir_name))]
+
+ def get_categories(benchmark):
+     """Get list of categories for a given benchmark."""
+     questions = load_jsonl(f"data/{benchmark}/question.jsonl")
+     if questions.empty:
+         return []
+     return sorted(questions['category'].unique().tolist())
+
+ def get_languages(benchmark):
+     """Get list of languages available in the benchmark."""
+     questions = load_jsonl(f"data/{benchmark}/question.jsonl")
+     if questions.empty or 'language' not in questions.columns:
+         return ["English"]  # Default if no language column
+
+     return sorted(questions['language'].unique().tolist())
+
+ def get_judges(benchmark):
+     """Get list of available judges for a benchmark."""
+     judgment_dir = f"data/{benchmark}/model_judgment"
+     if not os.path.exists(judgment_dir):
+         return []
+     return [dir_name for dir_name in os.listdir(judgment_dir)
+             if os.path.isdir(os.path.join(judgment_dir, dir_name))]
+
+ def get_models(benchmark, judge):
+     """Get list of models that have judgments by the specified judge."""
+     if not judge:
+         return []
+
+     judgment_dir = f"data/{benchmark}/model_judgment/{judge}"
+     if not os.path.exists(judgment_dir):
+         return []
+
+     return [os.path.splitext(os.path.basename(file))[0]
+             for file in glob.glob(f"{judgment_dir}/*.jsonl")]
+
+ def get_questions(benchmark, category=None, language=None):
+     """Get questions with category and language filters if provided."""
+     questions = load_jsonl(f"data/{benchmark}/question.jsonl")
+     if questions.empty:
+         return []
+
+     # Apply category filter if provided
+     if category and category != "All":
+         questions = questions[questions['category'] == category]
+
+     # Apply language filter if provided and column exists
+     if language and language != "All" and 'language' in questions.columns:
+         questions = questions[questions['language'] == language]
+
+     # Create list of question previews with their UIDs
+     question_previews = [(row['uid'], row['prompt'][:100] + "..." if len(row['prompt']) > 100 else row['prompt'])
+                          for _, row in questions.iterrows()]
+
+     return question_previews
+
+ def get_model_answer(benchmark, model, uid):
+     """Get a model's answer for a specific question."""
+     model_answers = load_jsonl(f"data/{benchmark}/model_answer/{model}.jsonl")
+     if model_answers.empty:
+         return "No answer found"
+
+     answer = model_answers[model_answers['uid'] == uid]
+     if answer.empty:
+         return "No answer found"
+
+     # Extract the actual answer from the messages
+     try:
+         messages = answer.iloc[0]['messages']
+         if len(messages) < 2:
+             return "No answer found"
+
+         # The assistant's message should be the second one
+         assistant_msg = messages[1]
+         if 'role' in assistant_msg and assistant_msg['role'] == 'assistant':
+             content = assistant_msg['content']
+
+             # Handle different content formats
+             if isinstance(content, dict) and 'answer' in content:
+                 return content['answer']
+             elif isinstance(content, str):
+                 return content
+             else:
+                 return str(content)
+         else:
+             return "Invalid message format"
+     except Exception as e:
+         return f"Error extracting answer: {str(e)}"
+
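+ # Each judgment record holds two "games": the (baseline, model) pair is judged
+ # twice with the answer positions swapped (Game 1: baseline as A, Game 2: model
+ # as A), the usual guard against position bias; the two UI tabs mirror this.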
+ def get_judgment(benchmark, judge, model, uid):
+     """Get judgment for a specific model and question."""
+     judgments = load_jsonl(f"data/{benchmark}/model_judgment/{judge}/{model}.jsonl")
+     if judgments.empty:
+         return None, None
+
+     judgment = judgments[judgments['uid'] == uid]
+     if judgment.empty:
+         return None, None
+
+     games = judgment.iloc[0]['games']
+     if len(games) < 2:
+         return games[0] if games else None, None
+
+     return games[0], games[1]  # First game, second game
+
+ def format_judgment(game):
+     """Format judgment for display."""
+     if not game:
+         return "No judgment available"
+
+     score = game.get('score', 'No score')
+
+     # Try to get judgment text
+     judgment = game.get('judgment', {})
+     if isinstance(judgment, dict) and 'answer' in judgment:
+         judgment_text = judgment['answer']
+     else:
+         judgment_text = str(judgment)
+
+     return f"### Score: {score}\n\n{judgment_text}"
+
+ # Gradio interface functions
+ def update_categories(benchmark):
+     """Update category dropdown based on selected benchmark."""
+     categories = ["All"] + get_categories(benchmark)
+     return gr.Dropdown(choices=categories, value="All")
+
+ def update_languages(benchmark):
+     """Update language dropdown based on selected benchmark."""
+     languages = ["All"] + get_languages(benchmark)
+     default = "English" if "English" in languages else languages[0]
+     return gr.Dropdown(choices=languages, value=default)
+
+ def update_judges(benchmark):
+     """Update judge dropdown based on selected benchmark."""
+     judges = get_judges(benchmark)
+     default = judges[0] if judges else None
+     return gr.Dropdown(choices=judges, value=default)
+
+ def update_models(benchmark, judge):
+     """Update model dropdown based on selected benchmark and judge."""
+     models = get_models(benchmark, judge)
+     default = models[0] if models else None
+     return gr.Dropdown(choices=models, value=default)
+
+ def update_questions(benchmark, category, language):
+     """Update question dropdown based on selected benchmark, category and language."""
+     question_list = get_questions(benchmark, category, language)
+     if not question_list:
+         return gr.Dropdown(choices=[], value=None), {}
+
+     # Map preview text back to UIDs so a dropdown selection can be resolved
+     question_dict = {q[1]: q[0] for q in question_list}
+     question_options = list(question_dict.keys())
+
+     default = question_options[0] if question_options else None
+     return gr.Dropdown(choices=question_options, value=default), question_dict
+
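+ # display_content resolves the selected preview back to its question UID, reads
+ # the baseline model's name from the judgment record, and returns the question
+ # plus both answers and the two per-game judgments.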
+ def display_content(benchmark, category, language, judge, model, question, question_dict):
+     """Display the question, answers, and judgments."""
+     if not question or not question_dict or question not in question_dict:
+         return "No question selected", "No baseline answer", "No model answer", "No judgment", "No judgment"
+
+     uid = question_dict[question]
+
+     # Load the question text
+     questions_df = load_jsonl(f"data/{benchmark}/question.jsonl")
+     question_row = questions_df[questions_df['uid'] == uid]
+     if question_row.empty:
+         return "Question not found", "No baseline answer", "No model answer", "No judgment", "No judgment"
+
+     # Add the heading here so it survives refreshes from the event handlers
+     question_text = "### Question\n\n" + question_row.iloc[0]['prompt']
+
+     # Load judgments and identify baseline model
+     judgments = load_jsonl(f"data/{benchmark}/model_judgment/{judge}/{model}.jsonl")
+     judgment_row = judgments[judgments['uid'] == uid]
+
+     if judgment_row.empty:
+         return question_text, "No baseline answer", "No model answer", "No judgment", "No judgment"
+
+     baseline_model = judgment_row.iloc[0]['baseline']
+
+     # Get answers
+     baseline_answer = get_model_answer(benchmark, baseline_model, uid)
+     model_answer = get_model_answer(benchmark, model, uid)
+
+     # Get judgments
+     game1, game2 = get_judgment(benchmark, judge, model, uid)
+
+     judgment1 = format_judgment(game1)
+     judgment2 = format_judgment(game2)
+
+     return question_text, baseline_answer, model_answer, judgment1, judgment2
+
+ # Initialize app components based on selected benchmark
+ def init_app(benchmark):
+     categories = ["All"] + get_categories(benchmark)
+     default_category = "All"
+
+     languages = ["All"] + get_languages(benchmark)
+     default_language = "English" if "English" in languages else languages[0]
+
+     judges = get_judges(benchmark)
+     default_judge = judges[0] if judges else None
+
+     models = get_models(benchmark, default_judge) if default_judge else []
+     default_model = models[0] if models else None
+
+     question_list = get_questions(benchmark, default_category, default_language)
+     question_dict = {q[1]: q[0] for q in question_list}
+     question_options = list(question_dict.keys())
+     default_question = question_options[0] if question_options else None
+
+     # Get initial display content
+     if default_question and default_model and default_judge:
+         question_text, baseline_ans, model_ans, judgment1, judgment2 = display_content(
+             benchmark, default_category, default_language, default_judge, default_model, default_question, question_dict
+         )
+     else:
+         question_text = "No question available"
+         baseline_ans = "No baseline answer"
+         model_ans = "No model answer"
+         judgment1 = "No judgment"
+         judgment2 = "No judgment"
+
+     return (
+         gr.Dropdown(choices=categories, value=default_category),
+         gr.Dropdown(choices=languages, value=default_language),
+         gr.Dropdown(choices=judges, value=default_judge),
+         gr.Dropdown(choices=models, value=default_model),
+         gr.Dropdown(choices=question_options, value=default_question),
+         question_dict,
+         question_text,
+         baseline_ans, model_ans,
+         judgment1, judgment2
+     )
+
+ # Go to the next question, wrapping around after the last one
+ def next_question(benchmark, category, language, current_question, question_dict):
+     question_list = get_questions(benchmark, category, language)
+     previews = [q[1] for q in question_list]
+
+     if current_question not in previews:
+         return gr.Dropdown(value=previews[0] if previews else None)
+
+     current_idx = previews.index(current_question)
+     next_idx = (current_idx + 1) % len(previews)
+     return gr.Dropdown(value=previews[next_idx])
+
+ # Create Gradio app
+ def create_app():
+     benchmarks = get_available_benchmarks()
+     default_benchmark = "arena-hard-v2.0" if "arena-hard-v2.0" in benchmarks else benchmarks[0]
+
+     # Initialize data for the default benchmark
+     init_data = init_app(default_benchmark)
+
+     with gr.Blocks() as app:
+         gr.Markdown(
+             '''# Arena-Hard-Auto Benchmark Viewer
+
+ Arena-Hard-Auto is an automatic evaluation tool for instruction-tuned LLMs. It has the highest correlation and separability to LMArena (Chatbot Arena) among popular open-ended LLM benchmarks. If you are curious to see how well your model might perform on LMArena before deploying, we recommend trying Arena-Hard-Auto's newest evaluation set, **Arena-Hard-v2.0-Preview**.
+
+ **Repo:** https://github.com/lmarena/arena-hard-auto
+
+ **Paper:** https://arxiv.org/abs/2406.11939
+ '''
+         )
+
+         with gr.Row():
+             with gr.Column():
+                 benchmark_dropdown = gr.Dropdown(
+                     choices=benchmarks,
+                     value=default_benchmark,
+                     label="Benchmark"
+                 )
+
+                 category_dropdown = gr.Dropdown(
+                     choices=init_data[0].choices,
+                     value=init_data[0].value,
+                     label="Category"
+                 )
+
+                 language_dropdown = gr.Dropdown(
+                     choices=init_data[1].choices,
+                     value=init_data[1].value,
+                     label="Language"
+                 )
+
+             with gr.Column():
+                 judge_dropdown = gr.Dropdown(
+                     choices=init_data[2].choices,
+                     value=init_data[2].value,
+                     label="Judge Model"
+                 )
+
+                 model_dropdown = gr.Dropdown(
+                     label="Model to Evaluate",
+                     choices=init_data[3].choices,
+                     value=init_data[3].value,
+                 )
+
+         question_dict = gr.State(init_data[5])
+         question_dropdown = gr.Dropdown(
+             choices=init_data[4].choices,
+             value=init_data[4].value,
+             label="Select Question"
+         )
+
+         # Add a next question button
+         next_button = gr.Button("Next Question")
+
+         # Display the question (the heading is added inside display_content)
+         gr.Markdown("---")
+         question_display = gr.Markdown(value=init_data[6])
+
+         with gr.Tabs():
+             with gr.TabItem("Game 1: Baseline (A) vs Model (B)"):
+                 with gr.Row():
+                     with gr.Column():
+                         gr.Markdown("### Baseline (A)")
+                         baseline_answer1 = gr.Markdown(value=init_data[7])
+                     with gr.Column():
+                         gr.Markdown("### Model (B)")
+                         model_answer1 = gr.Markdown(value=init_data[8])
+                 gr.Markdown("---")
+                 gr.Markdown("### Judgment")
+                 judgment1 = gr.Markdown(value=init_data[9])
+
+             with gr.TabItem("Game 2: Model (A) vs Baseline (B)"):
+                 with gr.Row():
+                     with gr.Column():
+                         gr.Markdown("### Model (A)")
+                         model_answer2 = gr.Markdown(value=init_data[8])
+                     with gr.Column():
+                         gr.Markdown("### Baseline (B)")
+                         baseline_answer2 = gr.Markdown(value=init_data[7])
+                 gr.Markdown("---")
+                 gr.Markdown("### Judgment")
+                 judgment2 = gr.Markdown(value=init_data[10])
+
+         gr.Markdown("---")
+         gr.Markdown("### Citation")
+         gr.Markdown("If you find this tool useful, please cite the following papers:")
+         gr.Markdown(
+             '''```bibtex
+ @article{li2024crowdsourced,
+   title={From Crowdsourced Data to High-Quality Benchmarks: Arena-Hard and BenchBuilder Pipeline},
+   author={Li, Tianle and Chiang, Wei-Lin and Frick, Evan and Dunlap, Lisa and Wu, Tianhao and Zhu, Banghua and Gonzalez, Joseph E and Stoica, Ion},
+   journal={arXiv preprint arXiv:2406.11939},
+   year={2024}
+ }
+ @misc{arenahard2024,
+   title = {From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline},
+   url = {https://lmsys.org/blog/2024-04-19-arena-hard/},
+   author = {Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica},
+   month = {April},
+   year = {2024}
+ }
+ ```''')
+
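+         # Each handler below chains a .then(...) that copies Game 1's rendered
+         # answers into the Game 2 tab, where the same texts appear with the
+         # A/B roles swapped.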
+         # Set up event handlers
+         benchmark_dropdown.change(
+             fn=init_app,
+             inputs=benchmark_dropdown,
+             outputs=[
+                 category_dropdown, language_dropdown, judge_dropdown, model_dropdown,
+                 question_dropdown, question_dict,
+                 question_display,
+                 baseline_answer1, model_answer1,
+                 judgment1, judgment2
+             ]
+         ).then(
+             fn=lambda model, baseline: (model, baseline),
+             inputs=[model_answer1, baseline_answer1],
+             outputs=[model_answer2, baseline_answer2]
+         )
+
+         # Update questions when category changes
+         category_dropdown.change(
+             fn=update_questions,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown],
+             outputs=[question_dropdown, question_dict]
+         ).then(
+             fn=display_content,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
+             outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
+         ).then(
+             fn=lambda model, baseline: (model, baseline),
+             inputs=[model_answer1, baseline_answer1],
+             outputs=[model_answer2, baseline_answer2]
+         )
+
+         # Update questions when language changes
+         language_dropdown.change(
+             fn=update_questions,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown],
+             outputs=[question_dropdown, question_dict]
+         ).then(
+             fn=display_content,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
+             outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
+         ).then(
+             fn=lambda model, baseline: (model, baseline),
+             inputs=[model_answer1, baseline_answer1],
+             outputs=[model_answer2, baseline_answer2]
+         )
+
+         # Update models when judge changes
+         judge_dropdown.change(
+             fn=update_models,
+             inputs=[benchmark_dropdown, judge_dropdown],
+             outputs=model_dropdown
+         ).then(
+             fn=display_content,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
+             outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
+         ).then(
+             fn=lambda model, baseline: (model, baseline),
+             inputs=[model_answer1, baseline_answer1],
+             outputs=[model_answer2, baseline_answer2]
+         )
+
+         # Display content when model changes
+         model_dropdown.change(
+             fn=display_content,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
+             outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
+         ).then(
+             fn=lambda model, baseline: (model, baseline),
+             inputs=[model_answer1, baseline_answer1],
+             outputs=[model_answer2, baseline_answer2]
+         )
+
+         # Display content when question changes
+         question_dropdown.change(
+             fn=display_content,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
+             outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
+         ).then(
+             fn=lambda model, baseline: (model, baseline),
+             inputs=[model_answer1, baseline_answer1],
+             outputs=[model_answer2, baseline_answer2]
+         )
+
+         # Handle next question button
+         next_button.click(
+             fn=next_question,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown, question_dropdown, question_dict],
+             outputs=question_dropdown
+         )
+
+     return app
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--host", type=str, default="0.0.0.0")
+     parser.add_argument("--port", type=int)
+     parser.add_argument("--share", action="store_true")
+     args = parser.parse_args()
+
+     app = create_app()
+     app.launch(server_name=args.host, server_port=args.port, share=args.share)
+
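Based on the argparse flags above, launching the viewer locally with something like `python app.py --port 7860` should work once `gradio` and `pandas` are installed and the `data/` directory is in place; `--share` additionally requests a public Gradio link.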
data/arena-hard-v0.1/model_answer/claude-3-5-sonnet-20240620.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a702fcd2a599b5c0cd7169297fdda5395fcf80b532467d19326d1db7eea4c7c
+ size 1608405
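Each data file in this commit is a Git LFS pointer stub like the one above: a spec-version line, the SHA-256 of the real payload, and its size in bytes (after cloning, `git lfs pull` fetches the payloads). A minimal sketch of parsing such a stub in Python; the helper name is illustrative, not part of this repo:

import pathlib

def parse_lfs_pointer(path):
    """Split a Git LFS pointer stub into its key/value fields."""
    fields = {}
    for line in pathlib.Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value  # e.g. fields['oid'] == 'sha256:5a70...'
    return fields

# Applied to the file above, this would yield:
# {'version': 'https://git-lfs.github.com/spec/v1',
#  'oid': 'sha256:5a702fcd...', 'size': '1608405'}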
data/arena-hard-v0.1/model_answer/claude-3-haiku-20240307.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4fc6d5675555a85c4e6de277d537f5fec01ef8e1a3f62166fc1fadfbf1bd001
+ size 1478348
data/arena-hard-v0.1/model_answer/claude-3-opus-20240229.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef342f26a4527df1a31f38bbd73d44a4a630bd6ae1a48e5f0a77023749febf90
+ size 1553155
data/arena-hard-v0.1/model_answer/claude-3-sonnet-20240229.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4bc5d2860a3019e907a0e333e00bd887be9dc0ac5a4ef3a0ee2e6b1ad8524405
+ size 1580256
data/arena-hard-v0.1/model_answer/gemma-2-27b-it.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba7b4f01eaa5997c50954d940f10f622798296fd042a21cd1d632d478d229289
+ size 1627911
data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f6abc84de4600fde987d9db70f838f4ecdbd27a2f1f4058c12aa322a6f791b2
+ size 1270631
data/arena-hard-v0.1/model_answer/gpt-4-0613.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:055c37c6b230c87413f080c916deba18c6cace3d44b3d456009f1a3d3834a04b
+ size 1116462
data/arena-hard-v0.1/model_answer/gpt-4o-2024-05-13.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e59bc21b9548ca3dc7e777c86f63dddcf198c6e0d77f4188f9d62bf33a33e8c1
+ size 1860759
data/arena-hard-v0.1/model_answer/gpt-4o-mini-2024-07-18.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b60ddc9b001a8e01fe6310fd91fd0935f1381ebb8579b20159aa0d11f608850
+ size 1819622
data/arena-hard-v0.1/model_answer/llama-3.1-70b-instruct.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a06a0dcd713ada3ba1c82605b6c53c488ec979163d3b576925c4a897ea7b1d7b
+ size 1714275
data/arena-hard-v0.1/model_answer/llama-3.1-8b-instruct.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc402fabbe7bffe5419902d015a899a8503738507f6cfec02944213b652e86c7
+ size 2175258
data/arena-hard-v0.1/model_answer/o1-mini-2024-09-12.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d9aaa8d8406794e0bcbfb811b6772bce4c1df8f6783f22411b77a02f95b6d310
+ size 3521179
data/arena-hard-v0.1/model_answer/o1-preview-2024-09-12.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:332965f3faff8043cf893a71e2e821a58b784809b9fdf132f8d651fe82998d43
+ size 3019306
data/arena-hard-v0.1/model_answer/qwen2.5-72b-instruct.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3aa67167beaa119b58f1ba3c0bf0cbb7aed9c78bd02f403d9a0e27f7641060aa
+ size 2096511
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-5-sonnet-20240620.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:738a92cbe0d70372b916bd59df42026162ce63fb145d6a3885a702c97b35b033
+ size 8383851
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-haiku-20240307.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3761e2a9b661194df5e28ff1f9fd04e0c668ae3f870c7ac3c2b824ecb45c1964
+ size 8089726
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-opus-20240229.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba410f24795fd5b7a075f695044b6d03376d4b66e0f0644dc22189ff052afdb0
+ size 8283106
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-sonnet-20240229.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88e02e948e8fb80cf52533884d8c2eed030d9ed82ab876139cafd417a6a1dabb
+ size 8344535
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gemma-2-27b-it.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9007c9ce0933e6cbb8655821196c8069878fa3b56958f7261fcd4465d756a9e8
+ size 8429777
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-4-0613.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b5de0e37567b546784257b898c8eddf3f30f592c255eb96ed73e64e9834c7093
+ size 7307154
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-4o-2024-05-13.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:93e1d86eb954a83196eef8352ec66f9ee64f6c727b875ea213b26a500536ca90
+ size 8906574
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-4o-mini-2024-07-18.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2b5d9eb68337f41b7a6ffce5231cde6b078a4478df557db9a4adc1be67117a5
+ size 8824934
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/llama-3.1-70b-instruct.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:40f05377c49c4dd81655338b1e362b35c727507ed3664c874e5df068c4861d75
+ size 8615331
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/llama-3.1-8b-instruct.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f4100745388b1c340abff7180932bf7832a3155f6fb45e8549772b8f56f2fa4
+ size 9478840
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/o1-mini-2024-09-12.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:090396d929d38fc61c4529e8212fe0066b69c5c84f4b8c6d7f9c13cde047c331
+ size 12263135
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/o1-preview-2024-09-12.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0bbb3a0f09000ca143c0f1115ac6d363074fe1e812b7aba9c077db8ee9310a66
+ size 11234646
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/qwen2.5-72b-instruct.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:530e98f85478f6c1ed0dc096eec5425cd3b5ef1e4f1203c18ea44c8505755233
+ size 9351695
data/arena-hard-v0.1/question.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7ed52dc0ffea3ddc5a44393658d30288e11ce5307da981e0a010170b3c3037ce
+ size 282575