Timmli committed
Commit ad39573 · Parent: 3c1e115

v2.0 release

Files changed (30)
  1. .gitattributes +1 -0
  2. app.py +496 -0
  3. data/arena-hard-v0.1/model_answer/claude-3-5-sonnet-20240620.jsonl +3 -0
  4. data/arena-hard-v0.1/model_answer/claude-3-haiku-20240307.jsonl +3 -0
  5. data/arena-hard-v0.1/model_answer/claude-3-opus-20240229.jsonl +3 -0
  6. data/arena-hard-v0.1/model_answer/claude-3-sonnet-20240229.jsonl +3 -0
  7. data/arena-hard-v0.1/model_answer/gemma-2-27b-it.jsonl +3 -0
  8. data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl +3 -0
  9. data/arena-hard-v0.1/model_answer/gpt-4-0613.jsonl +3 -0
  10. data/arena-hard-v0.1/model_answer/gpt-4o-2024-05-13.jsonl +3 -0
  11. data/arena-hard-v0.1/model_answer/gpt-4o-mini-2024-07-18.jsonl +3 -0
  12. data/arena-hard-v0.1/model_answer/llama-3.1-70b-instruct.jsonl +3 -0
  13. data/arena-hard-v0.1/model_answer/llama-3.1-8b-instruct.jsonl +3 -0
  14. data/arena-hard-v0.1/model_answer/o1-mini-2024-09-12.jsonl +3 -0
  15. data/arena-hard-v0.1/model_answer/o1-preview-2024-09-12.jsonl +3 -0
  16. data/arena-hard-v0.1/model_answer/qwen2.5-72b-instruct.jsonl +3 -0
  17. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-5-sonnet-20240620.jsonl +3 -0
  18. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-haiku-20240307.jsonl +3 -0
  19. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-opus-20240229.jsonl +3 -0
  20. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-sonnet-20240229.jsonl +3 -0
  21. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gemma-2-27b-it.jsonl +3 -0
  22. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-4-0613.jsonl +3 -0
  23. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-4o-2024-05-13.jsonl +3 -0
  24. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-4o-mini-2024-07-18.jsonl +3 -0
  25. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/llama-3.1-70b-instruct.jsonl +3 -0
  26. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/llama-3.1-8b-instruct.jsonl +3 -0
  27. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/o1-mini-2024-09-12.jsonl +3 -0
  28. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/o1-preview-2024-09-12.jsonl +3 -0
  29. data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/qwen2.5-72b-instruct.jsonl +3 -0
  30. data/arena-hard-v0.1/question.jsonl +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.jsonl filter=lfs diff=lfs merge=lfs -text
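The added rule, the kind written by `git lfs track "*.jsonl"`, makes Git LFS store every `.jsonl` file as a small pointer stub rather than the full blob, which is why the data files added below show only three pointer lines each.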
app.py ADDED
@@ -0,0 +1,496 @@
+ import os
+ import json
+ import pandas as pd
+ import glob
+ import gradio as gr
+
+ # Cache for loaded data
+ data_cache = {}
+
+ # Load data functions with caching
+ def load_jsonl(file_path):
+     """Load a JSONL file into a pandas DataFrame with caching."""
+     if file_path in data_cache:
+         return data_cache[file_path]
+
+     if not os.path.exists(file_path):
+         return pd.DataFrame()
+
+     try:
+         df = pd.read_json(file_path, lines=True)
+         data_cache[file_path] = df
+         return df
+     except Exception as e:
+         print(f"Error loading {file_path}: {e}")
+         return pd.DataFrame()
+
+ def get_available_benchmarks():
+     """Get list of available benchmarks in data directory."""
+     return [dir_name for dir_name in os.listdir("data")
+             if os.path.isdir(os.path.join("data", dir_name))]
+
+ def get_categories(benchmark):
+     """Get list of categories for a given benchmark."""
+     questions = load_jsonl(f"data/{benchmark}/question.jsonl")
+     if questions.empty:
+         return []
+     return sorted(questions['category'].unique().tolist())
+
+ def get_languages(benchmark):
+     """Get list of languages available in the benchmark."""
+     questions = load_jsonl(f"data/{benchmark}/question.jsonl")
+     if questions.empty or 'language' not in questions.columns:
+         return ["English"]  # Default if no language column
+
+     return sorted(questions['language'].unique().tolist())
+
+ def get_judges(benchmark):
+     """Get list of available judges for a benchmark."""
+     judgment_dir = f"data/{benchmark}/model_judgment"
+     if not os.path.exists(judgment_dir):
+         return []
+     return [dir_name for dir_name in os.listdir(judgment_dir)
+             if os.path.isdir(os.path.join(judgment_dir, dir_name))]
+
+ def get_models(benchmark, judge):
+     """Get list of models that have judgments by the specified judge."""
+     if not judge:
+         return []
+
+     judgment_dir = f"data/{benchmark}/model_judgment/{judge}"
+     if not os.path.exists(judgment_dir):
+         return []
+
+     return [os.path.splitext(os.path.basename(file))[0]
+             for file in glob.glob(f"{judgment_dir}/*.jsonl")]
+
+ def get_questions(benchmark, category=None, language=None):
+     """Get questions with category and language filters if provided."""
+     questions = load_jsonl(f"data/{benchmark}/question.jsonl")
+     if questions.empty:
+         return []
+
+     # Apply category filter if provided
+     if category and category != "All":
+         questions = questions[questions['category'] == category]
+
+     # Apply language filter if provided and column exists
+     if language and language != "All" and 'language' in questions.columns:
+         questions = questions[questions['language'] == language]
+
+     # Create list of question previews with their UIDs
+     question_previews = [(row['uid'], row['prompt'][:100] + "..." if len(row['prompt']) > 100 else row['prompt'])
+                          for _, row in questions.iterrows()]
+
+     return question_previews
+
+ def get_model_answer(benchmark, model, uid):
+     """Get a model's answer for a specific question."""
+     model_answers = load_jsonl(f"data/{benchmark}/model_answer/{model}.jsonl")
+     if model_answers.empty:
+         return "No answer found"
+
+     answer = model_answers[model_answers['uid'] == uid]
+     if answer.empty:
+         return "No answer found"
+
+     # Extract the actual answer from the messages
+     try:
+         messages = answer.iloc[0]['messages']
+         if len(messages) < 2:
+             return "No answer found"
+
+         # The assistant's message should be the second one
+         assistant_msg = messages[1]
+         if 'role' in assistant_msg and assistant_msg['role'] == 'assistant':
+             content = assistant_msg['content']
+
+             # Handle different content formats
+             if isinstance(content, dict) and 'answer' in content:
+                 return content['answer']
+             elif isinstance(content, str):
+                 return content
+             else:
+                 return str(content)
+         else:
+             return "Invalid message format"
+     except Exception as e:
+         return f"Error extracting answer: {str(e)}"
+
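+ # Each judgment record holds two "games": the (baseline, model) pair is judged
+ # twice with the answer positions swapped (Game 1: baseline as A, Game 2: model
+ # as A), the usual guard against position bias; the two UI tabs mirror this.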
+ def get_judgment(benchmark, judge, model, uid):
+     """Get judgment for a specific model and question."""
+     judgments = load_jsonl(f"data/{benchmark}/model_judgment/{judge}/{model}.jsonl")
+     if judgments.empty:
+         return None, None
+
+     judgment = judgments[judgments['uid'] == uid]
+     if judgment.empty:
+         return None, None
+
+     games = judgment.iloc[0]['games']
+     if len(games) < 2:
+         return games[0] if games else None, None
+
+     return games[0], games[1]  # First game, second game
+
+ def format_judgment(game):
+     """Format judgment for display."""
+     if not game:
+         return "No judgment available"
+
+     score = game.get('score', 'No score')
+
+     # Try to get judgment text
+     judgment = game.get('judgment', {})
+     if isinstance(judgment, dict) and 'answer' in judgment:
+         judgment_text = judgment['answer']
+     else:
+         judgment_text = str(judgment)
+
+     return f"### Score: {score}\n\n{judgment_text}"
+
+ # Gradio interface functions
+ def update_categories(benchmark):
+     """Update category dropdown based on selected benchmark."""
+     categories = ["All"] + get_categories(benchmark)
+     return gr.Dropdown(choices=categories, value="All")
+
+ def update_languages(benchmark):
+     """Update language dropdown based on selected benchmark."""
+     languages = ["All"] + get_languages(benchmark)
+     default = "English" if "English" in languages else languages[0]
+     return gr.Dropdown(choices=languages, value=default)
+
+ def update_judges(benchmark):
+     """Update judge dropdown based on selected benchmark."""
+     judges = get_judges(benchmark)
+     default = judges[0] if judges else None
+     return gr.Dropdown(choices=judges, value=default)
+
+ def update_models(benchmark, judge):
+     """Update model dropdown based on selected benchmark and judge."""
+     models = get_models(benchmark, judge)
+     default = models[0] if models else None
+     return gr.Dropdown(choices=models, value=default)
+
+ def update_questions(benchmark, category, language):
+     """Update question dropdown based on selected benchmark, category and language."""
+     question_list = get_questions(benchmark, category, language)
+     if not question_list:
+         return gr.Dropdown(choices=[], value=None), {}
+
+     # Map preview text back to UIDs so a dropdown selection can be resolved
+     question_dict = {q[1]: q[0] for q in question_list}
+     question_options = list(question_dict.keys())
+
+     default = question_options[0] if question_options else None
+     return gr.Dropdown(choices=question_options, value=default), question_dict
+
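+ # display_content resolves the selected preview back to its question UID, reads
+ # the baseline model's name from the judgment record, and returns the question
+ # plus both answers and the two per-game judgments.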
+ def display_content(benchmark, category, language, judge, model, question, question_dict):
+     """Display the question, answers, and judgments."""
+     if not question or not question_dict or question not in question_dict:
+         return "No question selected", "No baseline answer", "No model answer", "No judgment", "No judgment"
+
+     uid = question_dict[question]
+
+     # Load the question text
+     questions_df = load_jsonl(f"data/{benchmark}/question.jsonl")
+     question_row = questions_df[questions_df['uid'] == uid]
+     if question_row.empty:
+         return "Question not found", "No baseline answer", "No model answer", "No judgment", "No judgment"
+
+     # Add the heading here so it survives refreshes from the event handlers
+     question_text = "### Question\n\n" + question_row.iloc[0]['prompt']
+
+     # Load judgments and identify baseline model
+     judgments = load_jsonl(f"data/{benchmark}/model_judgment/{judge}/{model}.jsonl")
+     judgment_row = judgments[judgments['uid'] == uid]
+
+     if judgment_row.empty:
+         return question_text, "No baseline answer", "No model answer", "No judgment", "No judgment"
+
+     baseline_model = judgment_row.iloc[0]['baseline']
+
+     # Get answers
+     baseline_answer = get_model_answer(benchmark, baseline_model, uid)
+     model_answer = get_model_answer(benchmark, model, uid)
+
+     # Get judgments
+     game1, game2 = get_judgment(benchmark, judge, model, uid)
+
+     judgment1 = format_judgment(game1)
+     judgment2 = format_judgment(game2)
+
+     return question_text, baseline_answer, model_answer, judgment1, judgment2
+
+ # Initialize app components based on selected benchmark
+ def init_app(benchmark):
+     categories = ["All"] + get_categories(benchmark)
+     default_category = "All"
+
+     languages = ["All"] + get_languages(benchmark)
+     default_language = "English" if "English" in languages else languages[0]
+
+     judges = get_judges(benchmark)
+     default_judge = judges[0] if judges else None
+
+     models = get_models(benchmark, default_judge) if default_judge else []
+     default_model = models[0] if models else None
+
+     question_list = get_questions(benchmark, default_category, default_language)
+     question_dict = {q[1]: q[0] for q in question_list}
+     question_options = list(question_dict.keys())
+     default_question = question_options[0] if question_options else None
+
+     # Get initial display content
+     if default_question and default_model and default_judge:
+         question_text, baseline_ans, model_ans, judgment1, judgment2 = display_content(
+             benchmark, default_category, default_language, default_judge, default_model, default_question, question_dict
+         )
+     else:
+         question_text = "No question available"
+         baseline_ans = "No baseline answer"
+         model_ans = "No model answer"
+         judgment1 = "No judgment"
+         judgment2 = "No judgment"
+
+     return (
+         gr.Dropdown(choices=categories, value=default_category),
+         gr.Dropdown(choices=languages, value=default_language),
+         gr.Dropdown(choices=judges, value=default_judge),
+         gr.Dropdown(choices=models, value=default_model),
+         gr.Dropdown(choices=question_options, value=default_question),
+         question_dict,
+         question_text,
+         baseline_ans, model_ans,
+         judgment1, judgment2
+     )
+
+ # Go to the next question, wrapping around after the last one
+ def next_question(benchmark, category, language, current_question, question_dict):
+     question_list = get_questions(benchmark, category, language)
+     previews = [q[1] for q in question_list]
+
+     if current_question not in previews:
+         return gr.Dropdown(value=previews[0] if previews else None)
+
+     current_idx = previews.index(current_question)
+     next_idx = (current_idx + 1) % len(previews)
+     return gr.Dropdown(value=previews[next_idx])
+
+ # Create Gradio app
+ def create_app():
+     benchmarks = get_available_benchmarks()
+     default_benchmark = "arena-hard-v2.0" if "arena-hard-v2.0" in benchmarks else benchmarks[0]
+
+     # Initialize data for the default benchmark
+     init_data = init_app(default_benchmark)
+
+     with gr.Blocks() as app:
+         gr.Markdown(
+             '''# Arena-Hard-Auto Benchmark Viewer
+
+ Arena-Hard-Auto is an automatic evaluation tool for instruction-tuned LLMs. It has the highest correlation and separability to LMArena (Chatbot Arena) among popular open-ended LLM benchmarks. If you are curious to see how well your model might perform on LMArena before deploying, we recommend trying Arena-Hard-Auto's newest evaluation set, **Arena-Hard-v2.0-Preview**.
+
+ **Repo:** https://github.com/lmarena/arena-hard-auto
+
+ **Paper:** https://arxiv.org/abs/2406.11939
+ '''
+         )
+
+         with gr.Row():
+             with gr.Column():
+                 benchmark_dropdown = gr.Dropdown(
+                     choices=benchmarks,
+                     value=default_benchmark,
+                     label="Benchmark"
+                 )
+
+                 category_dropdown = gr.Dropdown(
+                     choices=init_data[0].choices,
+                     value=init_data[0].value,
+                     label="Category"
+                 )
+
+                 language_dropdown = gr.Dropdown(
+                     choices=init_data[1].choices,
+                     value=init_data[1].value,
+                     label="Language"
+                 )
+
+             with gr.Column():
+                 judge_dropdown = gr.Dropdown(
+                     choices=init_data[2].choices,
+                     value=init_data[2].value,
+                     label="Judge Model"
+                 )
+
+                 model_dropdown = gr.Dropdown(
+                     label="Model to Evaluate",
+                     choices=init_data[3].choices,
+                     value=init_data[3].value,
+                 )
+
+         question_dict = gr.State(init_data[5])
+         question_dropdown = gr.Dropdown(
+             choices=init_data[4].choices,
+             value=init_data[4].value,
+             label="Select Question"
+         )
+
+         # Add a next question button
+         next_button = gr.Button("Next Question")
+
+         # Display the question (the heading is added inside display_content)
+         gr.Markdown("---")
+         question_display = gr.Markdown(value=init_data[6])
+
+         with gr.Tabs():
+             with gr.TabItem("Game 1: Baseline (A) vs Model (B)"):
+                 with gr.Row():
+                     with gr.Column():
+                         gr.Markdown("### Baseline (A)")
+                         baseline_answer1 = gr.Markdown(value=init_data[7])
+                     with gr.Column():
+                         gr.Markdown("### Model (B)")
+                         model_answer1 = gr.Markdown(value=init_data[8])
+                 gr.Markdown("---")
+                 gr.Markdown("### Judgment")
+                 judgment1 = gr.Markdown(value=init_data[9])
+
+             with gr.TabItem("Game 2: Model (A) vs Baseline (B)"):
+                 with gr.Row():
+                     with gr.Column():
+                         gr.Markdown("### Model (A)")
+                         model_answer2 = gr.Markdown(value=init_data[8])
+                     with gr.Column():
+                         gr.Markdown("### Baseline (B)")
+                         baseline_answer2 = gr.Markdown(value=init_data[7])
+                 gr.Markdown("---")
+                 gr.Markdown("### Judgment")
+                 judgment2 = gr.Markdown(value=init_data[10])
+
+         gr.Markdown("---")
+         gr.Markdown("### Citation")
+         gr.Markdown("If you find this tool useful, please cite the following papers:")
+         gr.Markdown(
+             '''```bibtex
+ @article{li2024crowdsourced,
+   title={From Crowdsourced Data to High-Quality Benchmarks: Arena-Hard and BenchBuilder Pipeline},
+   author={Li, Tianle and Chiang, Wei-Lin and Frick, Evan and Dunlap, Lisa and Wu, Tianhao and Zhu, Banghua and Gonzalez, Joseph E and Stoica, Ion},
+   journal={arXiv preprint arXiv:2406.11939},
+   year={2024}
+ }
+ @misc{arenahard2024,
+   title = {From Live Data to High-Quality Benchmarks: The Arena-Hard Pipeline},
+   url = {https://lmsys.org/blog/2024-04-19-arena-hard/},
+   author = {Tianle Li*, Wei-Lin Chiang*, Evan Frick, Lisa Dunlap, Banghua Zhu, Joseph E. Gonzalez, Ion Stoica},
+   month = {April},
+   year = {2024}
+ }
+ ```''')
+
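+         # Each handler below chains a .then(...) that copies Game 1's rendered
+         # answers into the Game 2 tab, where the same texts appear with the
+         # A/B roles swapped.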
+         # Set up event handlers
+         benchmark_dropdown.change(
+             fn=init_app,
+             inputs=benchmark_dropdown,
+             outputs=[
+                 category_dropdown, language_dropdown, judge_dropdown, model_dropdown,
+                 question_dropdown, question_dict,
+                 question_display,
+                 baseline_answer1, model_answer1,
+                 judgment1, judgment2
+             ]
+         ).then(
+             fn=lambda model, baseline: (model, baseline),
+             inputs=[model_answer1, baseline_answer1],
+             outputs=[model_answer2, baseline_answer2]
+         )
+
+         # Update questions when category changes
+         category_dropdown.change(
+             fn=update_questions,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown],
+             outputs=[question_dropdown, question_dict]
+         ).then(
+             fn=display_content,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
+             outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
+         ).then(
+             fn=lambda model, baseline: (model, baseline),
+             inputs=[model_answer1, baseline_answer1],
+             outputs=[model_answer2, baseline_answer2]
+         )
+
+         # Update questions when language changes
+         language_dropdown.change(
+             fn=update_questions,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown],
+             outputs=[question_dropdown, question_dict]
+         ).then(
+             fn=display_content,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
+             outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
+         ).then(
+             fn=lambda model, baseline: (model, baseline),
+             inputs=[model_answer1, baseline_answer1],
+             outputs=[model_answer2, baseline_answer2]
+         )
+
+         # Update models when judge changes
+         judge_dropdown.change(
+             fn=update_models,
+             inputs=[benchmark_dropdown, judge_dropdown],
+             outputs=model_dropdown
+         ).then(
+             fn=display_content,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
+             outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
+         ).then(
+             fn=lambda model, baseline: (model, baseline),
+             inputs=[model_answer1, baseline_answer1],
+             outputs=[model_answer2, baseline_answer2]
+         )
+
+         # Display content when model changes
+         model_dropdown.change(
+             fn=display_content,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
+             outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
+         ).then(
+             fn=lambda model, baseline: (model, baseline),
+             inputs=[model_answer1, baseline_answer1],
+             outputs=[model_answer2, baseline_answer2]
+         )
+
+         # Display content when question changes
+         question_dropdown.change(
+             fn=display_content,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown, judge_dropdown, model_dropdown, question_dropdown, question_dict],
+             outputs=[question_display, baseline_answer1, model_answer1, judgment1, judgment2]
+         ).then(
+             fn=lambda model, baseline: (model, baseline),
+             inputs=[model_answer1, baseline_answer1],
+             outputs=[model_answer2, baseline_answer2]
+         )
+
+         # Handle next question button
+         next_button.click(
+             fn=next_question,
+             inputs=[benchmark_dropdown, category_dropdown, language_dropdown, question_dropdown, question_dict],
+             outputs=question_dropdown
+         )
+
+     return app
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--host", type=str, default="0.0.0.0")
+     parser.add_argument("--port", type=int)
+     parser.add_argument("--share", action="store_true")
+     args = parser.parse_args()
+
+     app = create_app()
+     app.launch(server_name=args.host, server_port=args.port, share=args.share)
+
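Based on the argparse flags above, launching the viewer locally with something like `python app.py --port 7860` should work once `gradio` and `pandas` are installed and the `data/` directory is in place; `--share` additionally requests a public Gradio link.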
data/arena-hard-v0.1/model_answer/claude-3-5-sonnet-20240620.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5a702fcd2a599b5c0cd7169297fdda5395fcf80b532467d19326d1db7eea4c7c
+ size 1608405
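Each data file in this commit is a Git LFS pointer stub like the one above: a spec-version line, the SHA-256 of the real payload, and its size in bytes (after cloning, `git lfs pull` fetches the payloads). A minimal sketch of parsing such a stub in Python; the helper name is illustrative, not part of this repo:

import pathlib

def parse_lfs_pointer(path):
    """Split a Git LFS pointer stub into its key/value fields."""
    fields = {}
    for line in pathlib.Path(path).read_text().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value  # e.g. fields['oid'] == 'sha256:5a70...'
    return fields

# Applied to the file above, this would yield:
# {'version': 'https://git-lfs.github.com/spec/v1',
#  'oid': 'sha256:5a702fcd...', 'size': '1608405'}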
data/arena-hard-v0.1/model_answer/claude-3-haiku-20240307.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4fc6d5675555a85c4e6de277d537f5fec01ef8e1a3f62166fc1fadfbf1bd001
+ size 1478348
data/arena-hard-v0.1/model_answer/claude-3-opus-20240229.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef342f26a4527df1a31f38bbd73d44a4a630bd6ae1a48e5f0a77023749febf90
+ size 1553155
data/arena-hard-v0.1/model_answer/claude-3-sonnet-20240229.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4bc5d2860a3019e907a0e333e00bd887be9dc0ac5a4ef3a0ee2e6b1ad8524405
+ size 1580256
data/arena-hard-v0.1/model_answer/gemma-2-27b-it.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba7b4f01eaa5997c50954d940f10f622798296fd042a21cd1d632d478d229289
+ size 1627911
data/arena-hard-v0.1/model_answer/gpt-4-0314.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f6abc84de4600fde987d9db70f838f4ecdbd27a2f1f4058c12aa322a6f791b2
+ size 1270631
data/arena-hard-v0.1/model_answer/gpt-4-0613.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:055c37c6b230c87413f080c916deba18c6cace3d44b3d456009f1a3d3834a04b
+ size 1116462
data/arena-hard-v0.1/model_answer/gpt-4o-2024-05-13.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e59bc21b9548ca3dc7e777c86f63dddcf198c6e0d77f4188f9d62bf33a33e8c1
+ size 1860759
data/arena-hard-v0.1/model_answer/gpt-4o-mini-2024-07-18.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b60ddc9b001a8e01fe6310fd91fd0935f1381ebb8579b20159aa0d11f608850
+ size 1819622
data/arena-hard-v0.1/model_answer/llama-3.1-70b-instruct.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a06a0dcd713ada3ba1c82605b6c53c488ec979163d3b576925c4a897ea7b1d7b
+ size 1714275
data/arena-hard-v0.1/model_answer/llama-3.1-8b-instruct.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc402fabbe7bffe5419902d015a899a8503738507f6cfec02944213b652e86c7
+ size 2175258
data/arena-hard-v0.1/model_answer/o1-mini-2024-09-12.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d9aaa8d8406794e0bcbfb811b6772bce4c1df8f6783f22411b77a02f95b6d310
+ size 3521179
data/arena-hard-v0.1/model_answer/o1-preview-2024-09-12.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:332965f3faff8043cf893a71e2e821a58b784809b9fdf132f8d651fe82998d43
+ size 3019306
data/arena-hard-v0.1/model_answer/qwen2.5-72b-instruct.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3aa67167beaa119b58f1ba3c0bf0cbb7aed9c78bd02f403d9a0e27f7641060aa
+ size 2096511
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-5-sonnet-20240620.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:738a92cbe0d70372b916bd59df42026162ce63fb145d6a3885a702c97b35b033
+ size 8383851
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-haiku-20240307.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3761e2a9b661194df5e28ff1f9fd04e0c668ae3f870c7ac3c2b824ecb45c1964
+ size 8089726
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-opus-20240229.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba410f24795fd5b7a075f695044b6d03376d4b66e0f0644dc22189ff052afdb0
+ size 8283106
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/claude-3-sonnet-20240229.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88e02e948e8fb80cf52533884d8c2eed030d9ed82ab876139cafd417a6a1dabb
+ size 8344535
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gemma-2-27b-it.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9007c9ce0933e6cbb8655821196c8069878fa3b56958f7261fcd4465d756a9e8
+ size 8429777
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-4-0613.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b5de0e37567b546784257b898c8eddf3f30f592c255eb96ed73e64e9834c7093
+ size 7307154
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-4o-2024-05-13.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:93e1d86eb954a83196eef8352ec66f9ee64f6c727b875ea213b26a500536ca90
+ size 8906574
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/gpt-4o-mini-2024-07-18.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2b5d9eb68337f41b7a6ffce5231cde6b078a4478df557db9a4adc1be67117a5
+ size 8824934
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/llama-3.1-70b-instruct.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:40f05377c49c4dd81655338b1e362b35c727507ed3664c874e5df068c4861d75
+ size 8615331
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/llama-3.1-8b-instruct.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4f4100745388b1c340abff7180932bf7832a3155f6fb45e8549772b8f56f2fa4
+ size 9478840
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/o1-mini-2024-09-12.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:090396d929d38fc61c4529e8212fe0066b69c5c84f4b8c6d7f9c13cde047c331
+ size 12263135
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/o1-preview-2024-09-12.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0bbb3a0f09000ca143c0f1115ac6d363074fe1e812b7aba9c077db8ee9310a66
+ size 11234646
data/arena-hard-v0.1/model_judgment/gpt-4-1106-preview/qwen2.5-72b-instruct.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:530e98f85478f6c1ed0dc096eec5425cd3b5ef1e4f1203c18ea44c8505755233
+ size 9351695
data/arena-hard-v0.1/question.jsonl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7ed52dc0ffea3ddc5a44393658d30288e11ce5307da981e0a010170b3c3037ce
+ size 282575