Spaces:
Running
Running
Commit
·
387c612
1
Parent(s):
31677d7
added usaco
Browse files
- app.py +6 -7
- config.py +7 -0
- evals/swebench_lite_example_agent_1722790656.json +1 -0
- evals/usaco_usaco_example_agent_1722871.json +1 -0
- evals/usaco_usaco_example_agent_1722871405.json +1 -0
- evals/usaco_usaco_example_agent_17228715212.json +1 -0
- evals/usaco_usaco_example_agent_1722871527.json +1 -0
- utils.py +2 -7
app.py
CHANGED
@@ -18,16 +18,15 @@ with gr.Blocks() as demo:
|
|
18 |
gr.Markdown("""
|
19 |
# 🥇 Agent Leaderboard
|
20 |
""")
|
21 |
-
df = parse_json_files(os.path.join(abs_path, "evals"))
|
22 |
|
23 |
with gr.Tabs():
|
24 |
with gr.Tab("SWE-Bench"):
|
25 |
with gr.Row():
|
26 |
with gr.Column(scale=1):
|
27 |
-
scatter_plot = gr.Plot(create_scatter_plot(
|
28 |
with gr.Column(scale=1):
|
29 |
Leaderboard(
|
30 |
-
value=
|
31 |
select_columns=SelectColumns(
|
32 |
default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
|
33 |
cant_deselect=["agent_name"],
|
@@ -41,16 +40,16 @@ with gr.Blocks() as demo:
|
|
41 |
with gr.Tab("USACO"):
|
42 |
with gr.Row():
|
43 |
with gr.Column(scale=1):
|
44 |
-
scatter_plot = gr.Plot(create_scatter_plot(
|
45 |
with gr.Column(scale=1):
|
46 |
Leaderboard(
|
47 |
-
value=
|
48 |
select_columns=SelectColumns(
|
49 |
-
default_selection=config.
|
50 |
cant_deselect=["agent_name"],
|
51 |
label="Select Columns to Display:",
|
52 |
),
|
53 |
-
search_columns=config.
|
54 |
column_widths={"agent_name": 40,
|
55 |
"results_accuracy": 20,
|
56 |
"results_total_cost": 20},
|
|
|
18 |
gr.Markdown("""
|
19 |
# 🥇 Agent Leaderboard
|
20 |
""")
|
|
|
21 |
|
22 |
with gr.Tabs():
|
23 |
with gr.Tab("SWE-Bench"):
|
24 |
with gr.Row():
|
25 |
with gr.Column(scale=1):
|
26 |
+
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals"), 'swebench_lite'), "results_total_cost", "results_accuracy", "Cost", "Accuracy", ["agent_name"]))
|
27 |
with gr.Column(scale=1):
|
28 |
Leaderboard(
|
29 |
+
value=parse_json_files(os.path.join(abs_path, "evals"), 'swebench_lite'),
|
30 |
select_columns=SelectColumns(
|
31 |
default_selection=config.SWEBENCH_ON_LOAD_COLUMNS,
|
32 |
cant_deselect=["agent_name"],
|
|
|
40 |
with gr.Tab("USACO"):
|
41 |
with gr.Row():
|
42 |
with gr.Column(scale=1):
|
43 |
+
scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals"), 'usaco'), "results_total_cost", "results_accuracy", "Cost", "Accuracy", ["agent_name"]))
|
44 |
with gr.Column(scale=1):
|
45 |
Leaderboard(
|
46 |
+
value=parse_json_files(os.path.join(abs_path, "evals"), 'usaco'),
|
47 |
select_columns=SelectColumns(
|
48 |
+
default_selection=config.USACO_ON_LOAD_COLUMNS,
|
49 |
cant_deselect=["agent_name"],
|
50 |
label="Select Columns to Display:",
|
51 |
),
|
52 |
+
search_columns=config.USACO_SEARCH_COLUMNS,
|
53 |
column_widths={"agent_name": 40,
|
54 |
"results_accuracy": 20,
|
55 |
"results_total_cost": 20},
|
config.py
CHANGED
@@ -13,6 +13,13 @@ SWEBENCH_ON_LOAD_COLUMNS = [
|
|
13 |
]
|
14 |
SWEBENCH_SEARCH_COLUMNS = ['results_total_cost']
|
15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
NUMERIC_INTERVALS = {
|
18 |
"?": pd.Interval(-1, 0, closed="right"),
|
|
|
13 |
]
|
14 |
SWEBENCH_SEARCH_COLUMNS = ['results_total_cost']
|
15 |
|
16 |
+
USACO_ON_LOAD_COLUMNS = [
|
17 |
+
"agent_name",
|
18 |
+
"results_accuracy",
|
19 |
+
"results_total_cost",
|
20 |
+
]
|
21 |
+
USACO_SEARCH_COLUMNS = ['results_total_cost']
|
22 |
+
|
23 |
|
24 |
NUMERIC_INTERVALS = {
|
25 |
"?": pd.Interval(-1, 0, closed="right"),
|
evals/swebench_lite_example_agent_1722790656.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"config": {"agent_name": "example_agent", "benchmark_name": "swebench_lite", "date": "2024-08-04", "run_id": "swebench_lite_example_agent_1722790656", "dataset_name": "princeton-nlp/SWE-bench_Lite", "max_workers": 1}, "results": {"accuracy": 0.0, "total_cost": 2.7399999999999995e-05}, "raw_eval_results": {"total_instances": 300, "submitted_instances": 1, "completed_instances": 0, "resolved_instances": 0, "unresolved_instances": 0, "empty_patch_instances": 0, "error_instances": 1, "unstopped_instances": 0, "completed_ids": [], "incomplete_ids": ["astropy__astropy-14182", "astropy__astropy-14365", "astropy__astropy-14995", "astropy__astropy-6938", "astropy__astropy-7746", "django__django-10914", "django__django-10924", "django__django-11001", "django__django-11019", "django__django-11039", "django__django-11049", "django__django-11099", "django__django-11133", "django__django-11179", "django__django-11283", "django__django-11422", "django__django-11564", "django__django-11583", "django__django-11620", "django__django-11630", "django__django-11742", "django__django-11797", "django__django-11815", "django__django-11848", "django__django-11905", "django__django-11910", "django__django-11964", "django__django-11999", "django__django-12113", "django__django-12125", "django__django-12184", "django__django-12284", "django__django-12286", "django__django-12308", "django__django-12453", "django__django-12470", "django__django-12497", "django__django-12589", "django__django-12700", "django__django-12708", "django__django-12747", "django__django-12856", "django__django-12908", "django__django-12915", "django__django-12983", "django__django-13028", "django__django-13033", "django__django-13158", "django__django-13220", "django__django-13230", "django__django-13265", "django__django-13315", "django__django-13321", "django__django-13401", "django__django-13447", "django__django-13448", "django__django-13551", "django__django-13590", "django__django-13658", "django__django-13660", 
"django__django-13710", "django__django-13757", "django__django-13768", "django__django-13925", "django__django-13933", "django__django-13964", "django__django-14016", "django__django-14017", "django__django-14155", "django__django-14238", "django__django-14382", "django__django-14411", "django__django-14534", "django__django-14580", "django__django-14608", "django__django-14667", "django__django-14672", "django__django-14730", "django__django-14752", "django__django-14787", "django__django-14855", "django__django-14915", "django__django-14997", "django__django-14999", "django__django-15061", "django__django-15202", "django__django-15213", "django__django-15252", "django__django-15320", "django__django-15347", "django__django-15388", "django__django-15400", "django__django-15498", "django__django-15695", "django__django-15738", "django__django-15781", "django__django-15789", "django__django-15790", "django__django-15814", "django__django-15819", "django__django-15851", "django__django-15902", "django__django-15996", "django__django-16041", "django__django-16046", "django__django-16139", "django__django-16229", "django__django-16255", "django__django-16379", "django__django-16400", "django__django-16408", "django__django-16527", "django__django-16595", "django__django-16816", "django__django-16820", "django__django-16873", "django__django-16910", "django__django-17051", "django__django-17087", "matplotlib__matplotlib-18869", "matplotlib__matplotlib-22711", "matplotlib__matplotlib-22835", "matplotlib__matplotlib-23299", "matplotlib__matplotlib-23314", "matplotlib__matplotlib-23476", "matplotlib__matplotlib-23562", "matplotlib__matplotlib-23563", "matplotlib__matplotlib-23913", "matplotlib__matplotlib-23964", "matplotlib__matplotlib-23987", "matplotlib__matplotlib-24149", "matplotlib__matplotlib-24265", "matplotlib__matplotlib-24334", "matplotlib__matplotlib-24970", "matplotlib__matplotlib-25079", "matplotlib__matplotlib-25311", "matplotlib__matplotlib-25332", 
"matplotlib__matplotlib-25433", "matplotlib__matplotlib-25442", "matplotlib__matplotlib-25498", "matplotlib__matplotlib-26011", "matplotlib__matplotlib-26020", "mwaskom__seaborn-2848", "mwaskom__seaborn-3010", "mwaskom__seaborn-3190", "mwaskom__seaborn-3407", "pallets__flask-4045", "pallets__flask-4992", "pallets__flask-5063", "psf__requests-1963", "psf__requests-2148", "psf__requests-2317", "psf__requests-2674", "psf__requests-3362", "psf__requests-863", "pydata__xarray-3364", "pydata__xarray-4094", "pydata__xarray-4248", "pydata__xarray-4493", "pydata__xarray-5131", "pylint-dev__pylint-5859", "pylint-dev__pylint-6506", "pylint-dev__pylint-7080", "pylint-dev__pylint-7114", "pylint-dev__pylint-7228", "pylint-dev__pylint-7993", "pytest-dev__pytest-11143", "pytest-dev__pytest-11148", "pytest-dev__pytest-5103", "pytest-dev__pytest-5221", "pytest-dev__pytest-5227", "pytest-dev__pytest-5413", "pytest-dev__pytest-5495", "pytest-dev__pytest-5692", "pytest-dev__pytest-6116", "pytest-dev__pytest-7168", "pytest-dev__pytest-7220", "pytest-dev__pytest-7373", "pytest-dev__pytest-7432", "pytest-dev__pytest-7490", "pytest-dev__pytest-8365", "pytest-dev__pytest-8906", "pytest-dev__pytest-9359", "scikit-learn__scikit-learn-10297", "scikit-learn__scikit-learn-10508", "scikit-learn__scikit-learn-10949", "scikit-learn__scikit-learn-11040", "scikit-learn__scikit-learn-11281", "scikit-learn__scikit-learn-12471", "scikit-learn__scikit-learn-13142", "scikit-learn__scikit-learn-13241", "scikit-learn__scikit-learn-13439", "scikit-learn__scikit-learn-13496", "scikit-learn__scikit-learn-13497", "scikit-learn__scikit-learn-13584", "scikit-learn__scikit-learn-13779", "scikit-learn__scikit-learn-14087", "scikit-learn__scikit-learn-14092", "scikit-learn__scikit-learn-14894", "scikit-learn__scikit-learn-14983", "scikit-learn__scikit-learn-15512", "scikit-learn__scikit-learn-15535", "scikit-learn__scikit-learn-25500", "scikit-learn__scikit-learn-25570", "scikit-learn__scikit-learn-25638", 
"scikit-learn__scikit-learn-25747", "sphinx-doc__sphinx-10325", "sphinx-doc__sphinx-10451", "sphinx-doc__sphinx-11445", "sphinx-doc__sphinx-7686", "sphinx-doc__sphinx-7738", "sphinx-doc__sphinx-7975", "sphinx-doc__sphinx-8273", "sphinx-doc__sphinx-8282", "sphinx-doc__sphinx-8435", "sphinx-doc__sphinx-8474", "sphinx-doc__sphinx-8506", "sphinx-doc__sphinx-8595", "sphinx-doc__sphinx-8627", "sphinx-doc__sphinx-8713", "sphinx-doc__sphinx-8721", "sphinx-doc__sphinx-8801", "sympy__sympy-11400", "sympy__sympy-11870", "sympy__sympy-11897", "sympy__sympy-12171", "sympy__sympy-12236", "sympy__sympy-12419", "sympy__sympy-12454", "sympy__sympy-12481", "sympy__sympy-13031", "sympy__sympy-13043", "sympy__sympy-13146", "sympy__sympy-13177", "sympy__sympy-13437", "sympy__sympy-13471", "sympy__sympy-13480", "sympy__sympy-13647", "sympy__sympy-13773", "sympy__sympy-13895", "sympy__sympy-13915", "sympy__sympy-13971", "sympy__sympy-14024", "sympy__sympy-14308", "sympy__sympy-14317", "sympy__sympy-14396", "sympy__sympy-14774", "sympy__sympy-14817", "sympy__sympy-15011", "sympy__sympy-15308", "sympy__sympy-15345", "sympy__sympy-15346", "sympy__sympy-15609", "sympy__sympy-15678", "sympy__sympy-16106", "sympy__sympy-16281", "sympy__sympy-16503", "sympy__sympy-16792", "sympy__sympy-16988", "sympy__sympy-17022", "sympy__sympy-17139", "sympy__sympy-17630", "sympy__sympy-17655", "sympy__sympy-18057", "sympy__sympy-18087", "sympy__sympy-18189", "sympy__sympy-18199", "sympy__sympy-18532", "sympy__sympy-18621", "sympy__sympy-18698", "sympy__sympy-18835", "sympy__sympy-19007", "sympy__sympy-19254", "sympy__sympy-19487", "sympy__sympy-20049", "sympy__sympy-20154", "sympy__sympy-20212", "sympy__sympy-20322", "sympy__sympy-20442", "sympy__sympy-20590", "sympy__sympy-20639", "sympy__sympy-21055", "sympy__sympy-21171", "sympy__sympy-21379", "sympy__sympy-21612", "sympy__sympy-21614", "sympy__sympy-21627", "sympy__sympy-21847", "sympy__sympy-22005", "sympy__sympy-22714", "sympy__sympy-22840", 
"sympy__sympy-23117", "sympy__sympy-23191", "sympy__sympy-23262", "sympy__sympy-24066", "sympy__sympy-24102", "sympy__sympy-24152", "sympy__sympy-24213", "sympy__sympy-24909"], "empty_patch_ids": [], "submitted_ids": ["astropy__astropy-12907"], "resolved_ids": [], "unresolved_ids": [], "error_ids": ["astropy__astropy-12907"], "unstopped_containers": [], "unremoved_images": [], "schema_version": 2}, "raw_logging_results": [{"task_id": "math-operations-001", "trace_id": "b91453e4-2871-457c-9490-63784fd77dfc", "project_id": "citp_agent_eval/swebench_lite_1722790653", "inputs": {"self": "<openai.resources.chat.completions.Completions object at 0x76e0199fc690>", "messages": [{"role": "user", "content": "test"}], "model": "gpt-4o-mini-2024-07-18", "max_tokens": 2000, "n": 1, "temperature": 1}, "id": "4350fef0-22a0-4dca-a71b-00649a1c057b", "outputs": ["It looks like you're testing the response. How can I assist you today?"], "exception": null, "summary": {"usage": {"gpt-4o-mini-2024-07-18": {"requests": 1, "completion_tokens": 15, "prompt_tokens": 8, "total_tokens": 23}}}, "display_name": null, "attributes": {"weave": {"client_version": "0.50.12", "source": "python-sdk", "os_name": "Linux", "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024", "os_release": "6.8.0-1010-azure", "sys_version": "3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]"}, "task_id": "math-operations-001"}, "_children": [], "_feedback": null}, {"task_id": "math-operations-001", "trace_id": "3e38e470-583e-4db1-8ba0-5e9c72ec1432", "project_id": "citp_agent_eval/swebench_lite_1722790653", "inputs": {"self": "<openai.resources.chat.completions.Completions object at 0x76e01988b410>", "messages": [{"role": "user", "content": "test"}], "model": "gpt-4o-mini-2024-07-18", "max_tokens": 2000, "n": 1, "temperature": 1}, "id": "ba229a30-d4fa-473f-9671-3fbd5b31f0b0", "outputs": ["Test received! 
How can I assist you today?"], "exception": null, "summary": {"usage": {"gpt-4o-mini-2024-07-18": {"requests": 1, "completion_tokens": 10, "prompt_tokens": 8, "total_tokens": 18}}}, "display_name": null, "attributes": {"weave": {"client_version": "0.50.12", "source": "python-sdk", "os_name": "Linux", "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024", "os_release": "6.8.0-1010-azure", "sys_version": "3.11.9 (main, Apr 19 2024, 16:48:06) [GCC 11.2.0]"}, "task_id": "math-operations-001"}, "_children": [], "_feedback": null}]}
|
evals/usaco_usaco_example_agent_1722871.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"config": {"agent_name": "usaco_example_agent", "benchmark_name": "usaco", "date": "2024-08-05", "run_id": "usaco_usaco_example_agent_1722871527"}, "results": {"accuracy": 32.0, "total_cost": 1.12}, "raw_eval_results": {"rdict": {"1333_platinum_good_bitstrings": [{"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}]}, "sdict": {"1333_platinum_good_bitstrings": [{"solution_code": "test", "result": {"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}, "problem_id": "1333_platinum_good_bitstrings"}]}, "rs": [[{"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}]], "ss": [[{"solution_code": "test", "result": {"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}, "problem_id": "1333_platinum_good_bitstrings"}]]}, "raw_logging_results": [{"task_id": "1333_platinum_good_bitstrings", "trace_id": "3aaa346a-30ee-4cb6-9b6d-e59930656d45", "project_id": "citp_agent_eval/usaco_1722871516", "inputs": {"self": "<openai.resources.chat.completions.Completions object at 0x75aea89672e0>", "messages": [{"role": "user", "content": "test"}], "model": "gpt-4o-mini-2024-07-18", "max_tokens": 2000, "n": 1, "temperature": 1}, "id": "9a995abc-5d34-478e-86b2-46ea71a55a96", "outputs": ["Test received! 
How can I assist you today?"], "exception": null, "summary": {"usage": {"gpt-4o-mini-2024-07-18": {"requests": 1, "completion_tokens": 10, "prompt_tokens": 8, "total_tokens": 18}}}, "display_name": null, "attributes": {"weave": {"client_version": "0.50.13", "source": "python-sdk", "os_name": "Linux", "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024", "os_release": "6.8.0-1010-azure", "sys_version": "3.9.19 (main, May 6 2024, 19:43:03) \n[GCC 11.2.0]"}, "task_id": "1333_platinum_good_bitstrings"}, "_children": [], "_feedback": null}]}
|
evals/usaco_usaco_example_agent_1722871405.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"config": {"agent_name": "usaco_example_agent", "benchmark_name": "usaco", "date": "2024-08-05", "run_id": "usaco_usaco_example_agent_1722871405"}, "results": {"accuracy": 0.0, "total_cost": 1.12e-05}, "raw_eval_results": {"rdict": {"1333_platinum_good_bitstrings": [{"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}]}, "sdict": {"1333_platinum_good_bitstrings": [{"solution_code": "test", "result": {"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}, "problem_id": "1333_platinum_good_bitstrings"}]}, "rs": [[{"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}]], "ss": [[{"solution_code": "test", "result": {"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}, "problem_id": "1333_platinum_good_bitstrings"}]]}, "raw_logging_results": [{"task_id": "1333_platinum_good_bitstrings", "trace_id": "aabfab07-a339-485a-8947-ad2635330140", "project_id": "citp_agent_eval/usaco_1722871397", "inputs": {"self": "<openai.resources.chat.completions.Completions object at 0x71a4e8e682e0>", "messages": [{"role": "user", "content": "test"}], "model": "gpt-4o-mini-2024-07-18", "max_tokens": 2000, "n": 1, "temperature": 1}, "id": "66063c3a-b1db-4053-a76f-6d5f1534c57d", "outputs": ["Test received! 
How can I assist you today?"], "exception": null, "summary": {"usage": {"gpt-4o-mini-2024-07-18": {"requests": 1, "completion_tokens": 10, "prompt_tokens": 8, "total_tokens": 18}}}, "display_name": null, "attributes": {"weave": {"client_version": "0.50.13", "source": "python-sdk", "os_name": "Linux", "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024", "os_release": "6.8.0-1010-azure", "sys_version": "3.9.19 (main, May 6 2024, 19:43:03) \n[GCC 11.2.0]"}, "task_id": "1333_platinum_good_bitstrings"}, "_children": [], "_feedback": null}]}
|
evals/usaco_usaco_example_agent_17228715212.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"config": {"agent_name": "usaco_example_agent", "benchmark_name": "usaco", "date": "2024-08-05", "run_id": "usaco_usaco_example_agent_1722871527"}, "results": {"accuracy": 12.0, "total_cost": 0.89}, "raw_eval_results": {"rdict": {"1333_platinum_good_bitstrings": [{"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}]}, "sdict": {"1333_platinum_good_bitstrings": [{"solution_code": "test", "result": {"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}, "problem_id": "1333_platinum_good_bitstrings"}]}, "rs": [[{"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}]], "ss": [[{"solution_code": "test", "result": {"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}, "problem_id": "1333_platinum_good_bitstrings"}]]}, "raw_logging_results": [{"task_id": "1333_platinum_good_bitstrings", "trace_id": "3aaa346a-30ee-4cb6-9b6d-e59930656d45", "project_id": "citp_agent_eval/usaco_1722871516", "inputs": {"self": "<openai.resources.chat.completions.Completions object at 0x75aea89672e0>", "messages": [{"role": "user", "content": "test"}], "model": "gpt-4o-mini-2024-07-18", "max_tokens": 2000, "n": 1, "temperature": 1}, "id": "9a995abc-5d34-478e-86b2-46ea71a55a96", "outputs": ["Test received! 
How can I assist you today?"], "exception": null, "summary": {"usage": {"gpt-4o-mini-2024-07-18": {"requests": 1, "completion_tokens": 10, "prompt_tokens": 8, "total_tokens": 18}}}, "display_name": null, "attributes": {"weave": {"client_version": "0.50.13", "source": "python-sdk", "os_name": "Linux", "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024", "os_release": "6.8.0-1010-azure", "sys_version": "3.9.19 (main, May 6 2024, 19:43:03) \n[GCC 11.2.0]"}, "task_id": "1333_platinum_good_bitstrings"}, "_children": [], "_feedback": null}]}
|
evals/usaco_usaco_example_agent_1722871527.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"config": {"agent_name": "usaco_example_agent", "benchmark_name": "usaco", "date": "2024-08-05", "run_id": "usaco_usaco_example_agent_1722871527"}, "results": {"accuracy": 0.0, "total_cost": 1.12e-05}, "raw_eval_results": {"rdict": {"1333_platinum_good_bitstrings": [{"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}]}, "sdict": {"1333_platinum_good_bitstrings": [{"solution_code": "test", "result": {"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}, "problem_id": "1333_platinum_good_bitstrings"}]}, "rs": [[{"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}]], "ss": [[{"solution_code": "test", "result": {"result_type": 7, "status": "No submission, error during judging", "judge_output": "No submission, error during judging", "num_passed": 0, "fraction_passed": 0, "result_list": null, "num_tests": 10, "problem_id": "1333_platinum_good_bitstrings"}, "problem_id": "1333_platinum_good_bitstrings"}]]}, "raw_logging_results": [{"task_id": "1333_platinum_good_bitstrings", "trace_id": "3aaa346a-30ee-4cb6-9b6d-e59930656d45", "project_id": "citp_agent_eval/usaco_1722871516", "inputs": {"self": "<openai.resources.chat.completions.Completions object at 0x75aea89672e0>", "messages": [{"role": "user", "content": "test"}], "model": "gpt-4o-mini-2024-07-18", "max_tokens": 2000, "n": 1, "temperature": 1}, "id": "9a995abc-5d34-478e-86b2-46ea71a55a96", "outputs": ["Test received! 
How can I assist you today?"], "exception": null, "summary": {"usage": {"gpt-4o-mini-2024-07-18": {"requests": 1, "completion_tokens": 10, "prompt_tokens": 8, "total_tokens": 18}}}, "display_name": null, "attributes": {"weave": {"client_version": "0.50.13", "source": "python-sdk", "os_name": "Linux", "os_version": "#10-Ubuntu SMP Mon Jun 17 15:31:00 UTC 2024", "os_release": "6.8.0-1010-azure", "sys_version": "3.9.19 (main, May 6 2024, 19:43:03) \n[GCC 11.2.0]"}, "task_id": "1333_platinum_good_bitstrings"}, "_children": [], "_feedback": null}]}
|
utils.py
CHANGED
@@ -7,7 +7,7 @@ import plotly.graph_objects as go
|
|
7 |
|
8 |
|
9 |
|
10 |
-
def parse_json_files(folder_path):
|
11 |
# Convert folder path to Path object
|
12 |
folder = Path(folder_path)
|
13 |
|
@@ -42,18 +42,13 @@ def parse_json_files(folder_path):
|
|
42 |
# Create DataFrame from the list of dictionaries
|
43 |
df = pd.DataFrame(data_list)
|
44 |
|
45 |
-
return df
|
46 |
|
47 |
|
48 |
def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
|
49 |
-
print(df)
|
50 |
-
|
51 |
agents = [Agent(row.results_total_cost, row.results_accuracy) for row in df.itertuples()]
|
52 |
pareto_frontier = compute_pareto_frontier(agents)
|
53 |
|
54 |
-
|
55 |
-
|
56 |
-
print(pareto_frontier)
|
57 |
|
58 |
fig = px.scatter(df,
|
59 |
x=x,
|
|
|
7 |
|
8 |
|
9 |
|
10 |
+
def parse_json_files(folder_path, benchmark_name):
|
11 |
# Convert folder path to Path object
|
12 |
folder = Path(folder_path)
|
13 |
|
|
|
42 |
# Create DataFrame from the list of dictionaries
|
43 |
df = pd.DataFrame(data_list)
|
44 |
|
45 |
+
return df[df['benchmark_name'] == benchmark_name]
|
46 |
|
47 |
|
48 |
def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
|
|
|
|
|
49 |
agents = [Agent(row.results_total_cost, row.results_accuracy) for row in df.itertuples()]
|
50 |
pareto_frontier = compute_pareto_frontier(agents)
|
51 |
|
|
|
|
|
|
|
52 |
|
53 |
fig = px.scatter(df,
|
54 |
x=x,
|