Commit e59dbdb
Parent(s): 01fb261

Big update with SQL backend

Files changed:
- agent_monitor/failure_report.py  +48 -7
- agent_monitor/monitor.py  +0 -61
- app.py  +88 -47
- config.py  +2 -0
- utils/db.py  +162 -0
- utils/processing.py  +1 -1
- utils/viz.py  +28 -27
agent_monitor/failure_report.py
CHANGED
@@ -7,6 +7,9 @@ from abc import ABC, abstractmethod
 import json
 from typing import Dict, List
 from datetime import datetime
+import backoff
+from openai import APITimeoutError, APIError, RateLimitError
+

 class FailureCategory(BaseModel):
     category_id: int
@@ -36,21 +39,59 @@ class AsyncLLMClient(ABC):
     async def generate_text(self, prompt, system_message=None, response_format=None):
         pass

+# class AsyncOpenAIClient(AsyncLLMClient):
+#     def __init__(self, model="gpt-4o-mini"):
+#         self.model = model
+#         self.client = AsyncOpenAI()
+
+#     async def generate_text(self, prompt, system_message=None, response_format=None):
+#         messages = [
+#             {"role": "system", "content": system_message or "You are a helpful AI assistant."},
+#             {"role": "user", "content": prompt}
+#         ]
+#         if response_format:
+#             response = await self.client.beta.chat.completions.parse(model=self.model, messages=messages, response_format=response_format)
+#         else:
+#             response = await self.client.chat.completions.create(model=self.model, messages=messages)
+#         return response.choices[0].message.content
+
+
 class AsyncOpenAIClient(AsyncLLMClient):
-    def __init__(self, model="gpt-4o-mini"):
+    def __init__(self, model="gpt-4o-mini", max_tries=5, max_time=300):
         self.model = model
         self.client = AsyncOpenAI()
+        self.max_tries = max_tries
+        self.max_time = max_time
+
+    @backoff.on_exception(
+        backoff.expo,
+        (APITimeoutError, APIError, RateLimitError),
+        max_tries=10,
+        max_time=300
+    )
+    async def _make_request(self, messages, response_format=None):
+        if response_format:
+            return await self.client.beta.chat.completions.parse(
+                model=self.model,
+                messages=messages,
+                response_format=response_format
+            )
+        else:
+            return await self.client.chat.completions.create(
+                model=self.model,
+                messages=messages
+            )

     async def generate_text(self, prompt, system_message=None, response_format=None):
         messages = [
             {"role": "system", "content": system_message or "You are a helpful AI assistant."},
             {"role": "user", "content": prompt}
         ]
-        if response_format:
-            response = await self.client.beta.chat.completions.parse(model=self.model, messages=messages, response_format=response_format)
-        else:
-            response = await self.client.chat.completions.create(model=self.model, messages=messages)
-        return response.choices[0].message.content
+        try:
+            response = await self._make_request(messages, response_format)
+            return response.choices[0].message.content
+        except Exception as e:
+            raise Exception(f"Failed after {self.max_tries} attempts or {self.max_time} seconds: {str(e)}")

 def get_weave_calls(client):
     calls = client.calls()
@@ -106,7 +147,7 @@ async def summarize_task(task_id, calls, llm_client):
         Step {i}:
         Input: {call['inputs']}
         Output: {call['outputs']}
-        Timestamp: {datetime.fromtimestamp(
+        Timestamp: {datetime.fromtimestamp(call['created_timestamp'])}
         """

         prompt = f"""
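Note on the change above: the raw OpenAI call now lives in _make_request, wrapped with backoff.on_exception so that APITimeoutError, APIError and RateLimitError are retried with exponential backoff, while generate_text only assembles the messages and surfaces a single error once the retry budget is exhausted. (The decorator hardcodes max_tries=10 and max_time=300; the max_tries/max_time stored on the instance are only used in the error message.) Below is a minimal, self-contained sketch of the same retry pattern, not part of the commit; the function name ask and the prompt are illustrative.

import asyncio

import backoff
from openai import AsyncOpenAI, APIError, APITimeoutError, RateLimitError

client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment

@backoff.on_exception(
    backoff.expo,                                 # exponential backoff between attempts
    (APITimeoutError, APIError, RateLimitError),  # retry only transient API errors
    max_tries=10,                                 # give up after 10 attempts...
    max_time=300                                  # ...or after 300 seconds, whichever comes first
)
async def ask(prompt: str) -> str:
    # Each failed attempt re-enters here; a non-listed exception propagates immediately.
    response = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

if __name__ == "__main__":
    print(asyncio.run(ask("Say hello in one word.")))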
agent_monitor/monitor.py
CHANGED
@@ -1,9 +1,6 @@
 import asyncio
-from openai import AsyncOpenAI
 from collections import defaultdict
-import weave
 from pydantic import BaseModel
-from abc import ABC, abstractmethod
 import json

 class StepAnalysis(BaseModel):
@@ -20,50 +17,6 @@ class TaskSummary(BaseModel):
     overall_assessment: str


-def get_weave_calls(client):
-    calls = client.calls()
-
-    processed_calls = []
-    for call in calls:
-        ChatCompletion = weave.ref(call.output).get()
-        choices = [choice.message.content for choice in ChatCompletion.choices]
-        output = {
-            'weave_task_id': call.attributes['weave_task_id'],
-            'trace_id': call.trace_id,
-            'project_id': call.project_id,
-            'created_timestamp': ChatCompletion.created,
-            'inputs': dict(call.inputs),
-            'id': call.id,
-            'outputs': {'choices' : choices},
-            'exception': call.exception,
-            'summary': call.summary,
-            'display_name': call.display_name,
-            'attributes': dict(call.attributes),
-            "_children": call._children,
-            '_feedback': call._feedback,
-        }
-        processed_calls.append(output)
-    return processed_calls
-
-class AsyncLLMClient(ABC):
-    @abstractmethod
-    async def generate_text(self, prompt, system_message=None, response_format=None):
-        pass
-
-class AsyncOpenAIClient(AsyncLLMClient):
-    def __init__(self, model="gpt-4o-mini"):
-        self.model = model
-        self.client = AsyncOpenAI()
-
-    async def generate_text(self, prompt, system_message=None, response_format=None):
-        messages = [
-            {"role": "system", "content": system_message or "You are a helpful AI assistant."},
-            {"role": "user", "content": prompt}
-        ]
-        response = await self.client.beta.chat.completions.parse(model=self.model, messages=messages, response_format=response_format)
-
-        return response.choices[0].message.content
-
 async def analyze_agent_steps(processed_calls, llm_client, llm_eval=False):
     task_calls = defaultdict(list)
     for call in processed_calls:
@@ -185,17 +138,3 @@ async def summarize_task(steps, llm_client):
     system_message = "You are an expert AI performance analyst, skilled in evaluating and summarizing AI agent task execution. You are specialized in providing analyses to support AI researchers to develop AI agents."
     analysis = await llm_client.generate_text(prompt, system_message, response_format=TaskSummary)
     return json.loads(analysis)
-
-# async def main():
-#     client = weave.init("citp_agent_eval/usaco_1723148990")
-#     processed_calls = get_weave_calls(client)
-
-#     weave.finish()
-#     openai_client = AsyncOpenAIClient(model="gpt-4o-mini")
-#     task_analyses_openai = await analyze_agent_steps(processed_calls, openai_client)
-
-#     with open("task_analyses.json", "w") as f:
-#         json.dump(task_analyses_openai, f, indent=4)
-
-# if __name__ == "__main__":
-#     asyncio.run(main())
app.py
CHANGED
@@ -6,7 +6,6 @@ from pathlib import Path
 import pandas as pd
 import os
 import json
-from utils.data import parse_json_files
 from utils.viz import create_scatter_plot, create_flow_chart, create_bar_chart
 from utils.processing import check_and_process_uploads
 from huggingface_hub import snapshot_download
@@ -18,6 +17,10 @@ import markdown
 import asyncio
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
 import weave
+from utils.db import TracePreprocessor
+
+# Initialize the TracePreprocessor
+preprocessor = TracePreprocessor()


 from datetime import datetime
@@ -40,47 +43,14 @@ def download_latest_results():
     print("Download complete.")


-# Global variable to store preprocessed data
-preprocessed_traces = {}
-failure_reports = {}
-def preprocess_traces():
-    global preprocessed_traces
-    global failure_reports
-    processed_dir = Path("evals_live")
-    for file in processed_dir.glob('*.json'):
-        with open(file, 'r') as f:
-            data = json.load(f)
-        agent_name = data['config']['agent_name']
-        benchmark_name = data['config']['benchmark_name']
-        if benchmark_name not in preprocessed_traces:
-            preprocessed_traces[benchmark_name] = {}
-        if benchmark_name not in failure_reports:
-            failure_reports[benchmark_name] = {}
-
-        try:
-            assert type(data['raw_logging_results']) == dict, f"Invalid format for raw_logging_results: {type(data['raw_logging_results'])}"
-            preprocessed_traces[benchmark_name][agent_name] = data['raw_logging_results']
-        except AssertionError as e:
-            preprocessed_traces[benchmark_name][agent_name] = None
-        except Exception as e:
-            print(f"Error preprocessing {file}: {e}")
-            preprocessed_traces[benchmark_name][agent_name] = None
-
-        try:
-            assert type(data['failure_report']) == dict, f"Invalid format for failure_report: {type(data['failure_report'])}"
-            failure_reports[benchmark_name][agent_name] = data['failure_report']
-        except AssertionError as e:
-            failure_reports[benchmark_name][agent_name] = None
-        except Exception as e:
-            print(f"Error preprocessing {file}: {e}")
-            failure_reports[benchmark_name][agent_name] = None
-
-
 def get_analyzed_traces(agent_name, benchmark_name):
-    return
+    return preprocessor.get_analyzed_traces(agent_name, benchmark_name)

 def get_failure_report(agent_name, benchmark_name):
-    return
+    return preprocessor.get_failure_report(agent_name, benchmark_name)
+
+def parse_json_files(folder_path, benchmark_name):
+    return preprocessor.get_parsed_results(benchmark_name)

 def update_agent_dropdown(benchmark_name, metric):
     df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
@@ -115,14 +85,18 @@ def update_task_details(benchmark_name, agent_name, task_id):
         return f"No analysis available for task: {task_id}", None, ""

     analysis = analyzed_traces[task_id]
+
     summary = analysis.get('task_analysis', {})

     overview = f"## Task Overview\n\n{summary.get('overview', 'No overview available.')}\n\n"
-    overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n"
-    overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n"
-    overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n"
+    # overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n"
+    # overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n"
+    # overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n"

-
+    if summary.get('overview', 'No overview available.') != "Not available":
+        flow_chart = create_flow_chart(analysis['steps'])
+    else:
+        flow_chart = None

     return overview, flow_chart, ""

@@ -259,6 +233,7 @@ with gr.Blocks() as demo:
                 cant_deselect=["Agent Name"],
                 label="Select Columns to Display:",
             ),
+            hide_columns=config.USACO_HIDE_COLUMNS,
            search_columns=config.USACO_SEARCH_COLUMNS,
             column_widths={"Agent Name": 40,
                            "Accuracy": 20,
@@ -266,6 +241,28 @@ with gr.Blocks() as demo:
     )
     with gr.Row():
         scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
+
+    gr.Markdown("# Failure Report")
+    with gr.Row():
+        with gr.Column(scale=1):
+            failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
+    with gr.Row():
+        with gr.Column(scale=1):
+            failure_categories_overview = gr.Markdown()
+
+        with gr.Column(scale=1):
+            failure_categories_chart = gr.Plot()
+
+    # Initialize the failure report agent dropdown with all agents
+    demo.load(update_agent_dropdown,
+              inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+              outputs=[failure_report_agent_dropdown])
+
+    # Update failure report when agent is selected
+    failure_report_agent_dropdown.change(update_failure_report,
+                                         inputs=[failure_report_agent_dropdown, gr.Textbox(value="usaco", visible=False)],
+                                         outputs=[failure_categories_overview, failure_categories_chart])
+
     gr.Markdown("# Agent Monitor")
     with gr.Row():
         with gr.Column(scale=1):
@@ -306,7 +303,7 @@ with gr.Blocks() as demo:
            return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
        task_ids = list(analyzed_traces.keys())
        steps = analyzed_traces[task_ids[0]]['steps']
-        return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0),
+        return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "usaco")[task_ids[0]]['steps'][0], 0)

     def update_raw_step_dropdown(agent_name, task_id):
         analyzed_traces = get_analyzed_traces(agent_name, "usaco")
@@ -385,6 +382,28 @@ with gr.Blocks() as demo:
                                          inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)],
                                          outputs=[failure_categories_overview, failure_categories_chart])

+    gr.Markdown("# Agent Monitor")
+    with gr.Row():
+        with gr.Column(scale=1):
+            agent_dropdown = gr.Dropdown(label="Select Agent")
+        with gr.Column(scale=1):
+            task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
+    with gr.Row():
+        task_overview = gr.Markdown()
+    with gr.Row():
+        flow_chart = gr.Plot(label="Task Flow")
+
+    # Initialize the agent dropdown with the best agent
+    demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
+    demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+    agent_dropdown.change(update_task_analysis,
+                          inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown],
+                          outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+    task_dropdown.change(update_task_details,
+                         inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
+                         outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
+
     gr.Markdown("# Raw Predictions")
     with gr.Row():
         with gr.Column(scale=1):
@@ -483,6 +502,28 @@ with gr.Blocks() as demo:
                                          inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
                                          outputs=[failure_categories_overview, failure_categories_chart])

+    gr.Markdown("# Agent Monitor")
+    with gr.Row():
+        with gr.Column(scale=1):
+            agent_dropdown = gr.Dropdown(label="Select Agent")
+        with gr.Column(scale=1):
+            task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
+    with gr.Row():
+        task_overview = gr.Markdown()
+    with gr.Row():
+        flow_chart = gr.Plot(label="Task Flow")
+
+    # Initialize the agent dropdown with the best agent
+    demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
+    demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+    agent_dropdown.change(update_task_analysis,
+                          inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown],
+                          outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+    task_dropdown.change(update_task_details,
+                         inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
+                         outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
+
     gr.Markdown("# Raw Predictions")
     with gr.Row():
         with gr.Column(scale=1):
@@ -550,13 +591,13 @@ with gr.Blocks() as demo:

 async def main():
     # Preprocess traces
-    preprocess_traces()
+    preprocessor.preprocess_traces('evals_live')

     # # Download the results from the Hugging Face Hub
     await asyncio.to_thread(download_latest_results)

     # Check for new uploads and process them
-    await check_and_process_uploads()
+    # await check_and_process_uploads()

     scheduler = AsyncIOScheduler()
     scheduler.add_job(restart_space, "interval", hours=1)
@@ -567,5 +608,5 @@ async def main():
     await demo.launch()

 if __name__ == "__main__":
-    weave.init(f'
+    weave.init(f'leaderboard_{datetime.now().strftime("%Y%m%d%H%M%S")}')
     asyncio.run(main())
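The new Failure Report and Agent Monitor blocks above reuse one callback per benchmark by passing an invisible gr.Textbox(value=..., visible=False) as an extra input, so the handler receives the benchmark name as a fixed value. A minimal sketch of that wiring pattern, not part of the commit; the agent names and the stub callback (standing in for the real update_failure_report) are placeholders.

import gradio as gr

def update_failure_report(agent_name, benchmark_name):
    # In app.py this queries the preprocessed traces; here we just echo the inputs.
    return f"Failure report for {agent_name} on {benchmark_name}"

with gr.Blocks() as demo:
    agent_dropdown = gr.Dropdown(choices=["agent-a", "agent-b"], label="Select Agent")
    report = gr.Markdown()

    # The invisible Textbox acts as a constant input, so the same callback can be
    # wired once per benchmark tab ("usaco", "swebench_lite", "swebench_verified", ...).
    agent_dropdown.change(
        update_failure_report,
        inputs=[agent_dropdown, gr.Textbox(value="usaco", visible=False)],
        outputs=[report],
    )

if __name__ == "__main__":
    demo.launch()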
config.py
CHANGED
@@ -12,6 +12,7 @@ SWEBENCH_ON_LOAD_COLUMNS = [
     "Total Cost",
 ]
 SWEBENCH_SEARCH_COLUMNS = ['Total Cost']
+SWEBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name"]

 USACO_ON_LOAD_COLUMNS = [
     "Agent Name",
@@ -19,6 +20,7 @@ USACO_ON_LOAD_COLUMNS = [
     "Total Cost",
 ]
 USACO_SEARCH_COLUMNS = ['Total Cost']
+USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name"]


 NUMERIC_INTERVALS = {
utils/db.py
ADDED
@@ -0,0 +1,162 @@
+import json
+from pathlib import Path
+import sqlite3
+import pickle
+from functools import lru_cache
+import threading
+import pandas as pd
+
+class TracePreprocessor:
+    def __init__(self, db_path='preprocessed_traces.db'):
+        self.db_path = db_path
+        self.local = threading.local()
+        self.create_tables()
+
+    def get_conn(self):
+        if not hasattr(self.local, 'conn'):
+            self.local.conn = sqlite3.connect(self.db_path)
+        return self.local.conn
+
+    def create_tables(self):
+        with self.get_conn() as conn:
+            conn.execute('''
+                CREATE TABLE IF NOT EXISTS preprocessed_traces (
+                    benchmark_name TEXT,
+                    agent_name TEXT,
+                    raw_logging_results BLOB,
+                    PRIMARY KEY (benchmark_name, agent_name)
+                )
+            ''')
+            conn.execute('''
+                CREATE TABLE IF NOT EXISTS failure_reports (
+                    benchmark_name TEXT,
+                    agent_name TEXT,
+                    failure_report BLOB,
+                    PRIMARY KEY (benchmark_name, agent_name)
+                )
+            ''')
+            conn.execute('''
+                CREATE TABLE IF NOT EXISTS parsed_results (
+                    benchmark_name TEXT,
+                    agent_name TEXT,
+                    date TEXT,
+                    total_cost REAL,
+                    accuracy REAL,
+                    precision REAL,
+                    recall REAL,
+                    f1_score REAL,
+                    auc REAL,
+                    PRIMARY KEY (benchmark_name, agent_name)
+                )
+            ''')
+
+    def preprocess_traces(self, processed_dir="evals_live"):
+        processed_dir = Path(processed_dir)
+        for file in processed_dir.glob('*.json'):
+            with open(file, 'r') as f:
+                data = json.load(f)
+            agent_name = data['config']['agent_name']
+            benchmark_name = data['config']['benchmark_name']
+
+            try:
+                raw_logging_results = pickle.dumps(data['raw_logging_results'])
+                with self.get_conn() as conn:
+                    conn.execute('''
+                        INSERT OR REPLACE INTO preprocessed_traces
+                        (benchmark_name, agent_name, raw_logging_results)
+                        VALUES (?, ?, ?)
+                    ''', (benchmark_name, agent_name, raw_logging_results))
+            except Exception as e:
+                print(f"Error preprocessing raw_logging_results in {file}: {e}")
+
+            try:
+                failure_report = pickle.dumps(data['failure_report'])
+                with self.get_conn() as conn:
+                    conn.execute('''
+                        INSERT OR REPLACE INTO failure_reports
+                        (benchmark_name, agent_name, failure_report)
+                        VALUES (?, ?, ?)
+                    ''', (benchmark_name, agent_name, failure_report))
+            except Exception as e:
+                print(f"Error preprocessing failure_report in {file}: {e}")
+
+            try:
+                config = data['config']
+                results = data['results']
+                with self.get_conn() as conn:
+                    conn.execute('''
+                        INSERT OR REPLACE INTO parsed_results
+                        (benchmark_name, agent_name, date, total_cost, accuracy, precision, recall, f1_score, auc)
+                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+                    ''', (
+                        benchmark_name,
+                        agent_name,
+                        config['date'],
+                        results.get('total_cost'),
+                        results.get('accuracy'),
+                        results.get('precision'),
+                        results.get('recall'),
+                        results.get('f1_score'),
+                        results.get('auc')
+                    ))
+            except Exception as e:
+                print(f"Error preprocessing parsed results in {file}: {e}")
+
+    @lru_cache(maxsize=100)
+    def get_analyzed_traces(self, agent_name, benchmark_name):
+        with self.get_conn() as conn:
+            cursor = conn.cursor()
+            cursor.execute('''
+                SELECT raw_logging_results FROM preprocessed_traces
+                WHERE benchmark_name = ? AND agent_name = ?
+            ''', (benchmark_name, agent_name))
+            result = cursor.fetchone()
+            if result:
+                return pickle.loads(result[0])
+            return None
+
+    @lru_cache(maxsize=100)
+    def get_failure_report(self, agent_name, benchmark_name):
+        with self.get_conn() as conn:
+            cursor = conn.cursor()
+            cursor.execute('''
+                SELECT failure_report FROM failure_reports
+                WHERE benchmark_name = ? AND agent_name = ?
+            ''', (benchmark_name, agent_name))
+            result = cursor.fetchone()
+            if result:
+                return pickle.loads(result[0])
+            return None
+
+    def get_parsed_results(self, benchmark_name):
+        with self.get_conn() as conn:
+            query = '''
+                SELECT * FROM parsed_results
+                WHERE benchmark_name = ?
+                ORDER BY accuracy DESC
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
+
+            # Round float columns to 3 decimal places
+            float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc']
+            for column in float_columns:
+                if column in df.columns:
+                    df[column] = df[column].round(3)
+
+            # Rename columns
+            df = df.rename(columns={
+                'agent_name': 'Agent Name',
+                'date': 'Date',
+                'total_cost': 'Total Cost',
+                'accuracy': 'Accuracy',
+                'precision': 'Precision',
+                'recall': 'Recall',
+                'f1_score': 'F1 Score',
+                'auc': 'AUC',
+            })
+
+            return df
+
+if __name__ == '__main__':
+    preprocessor = TracePreprocessor()
+    preprocessor.preprocess_traces()
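utils/db.py is the SQL backend that replaces the old in-memory dicts in app.py: preprocess_traces() ingests every evals_live/*.json once, pickling the heavy raw_logging_results and failure_report payloads into SQLite BLOBs, and the getters read them back on demand. Connections are kept per thread via threading.local() (a sqlite3 connection cannot be shared across threads, and the Gradio callbacks are not guaranteed to run on the thread that created it), while lru_cache memoizes repeated lookups for the same agent/benchmark pair. A small usage sketch, not part of the commit; the agent name is a hypothetical placeholder.

from utils.db import TracePreprocessor

# One-off ingestion: read the JSON files once and store them in preprocessed_traces.db.
pre = TracePreprocessor()
pre.preprocess_traces('evals_live')

# Later reads hit the database (and the in-process lru_cache), not the JSON files.
df = pre.get_parsed_results('usaco')                       # leaderboard-ready DataFrame
traces = pre.get_analyzed_traces('my_agent', 'usaco')      # analyzed trace data, or None
report = pre.get_failure_report('my_agent', 'usaco')       # failure report, or None

print(df[['Agent Name', 'Accuracy', 'Total Cost']].head())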
utils/processing.py
CHANGED
@@ -123,7 +123,7 @@ async def process_upload(input_path, output_path):
     openai_client = AsyncOpenAIClient(model="gpt-4o-mini")

     try:
-        processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=
+        processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=True)
         failure_report = await analyze_agent_performance(data['raw_logging_results'], data['results']['failed_tasks'], openai_client)
         data['raw_logging_results'] = processed_calls
         data['failure_report'] = failure_report
utils/viz.py
CHANGED
@@ -174,15 +174,15 @@ def create_flow_chart(steps):
         description = analysis.get('description', 'No description available.')
         assessment = analysis.get('assessment', 'No assessment available.')
         success = analysis.get('success', True) # Assuming True if not specified
-        action_type = analysis.get('action_type', 'other') # Default to 'other' if not specified
+        # action_type = analysis.get('action_type', 'other') # Default to 'other' if not specified
         step_headline = analysis.get('headline', '')

         # Set node color and shape based on attributes
         node_colors.append(color_map[success])
-        node_shapes.append(shape_map.get(action_type, 'circle'))
+        # node_shapes.append(shape_map.get(action_type, 'circle'))

         # Wrap text to improve readability
-        wrapped_description = '<br>'.join(textwrap.wrap(description, width=90, max_lines=
+        wrapped_description = '<br>'.join(textwrap.wrap(description, width=90, max_lines=20))
         wrapped_assessment = '<br>'.join(textwrap.wrap(assessment, width=90, max_lines=10))
         wrapped_outline = textwrap.shorten(step_headline, width=30, placeholder='')
         wrapped_outline = '' if wrapped_outline == '' else f": {wrapped_outline}"
@@ -194,10 +194,10 @@ def create_flow_chart(steps):
         hover_info = f"<b>Step {i+1}{wrapped_outline}</b><br><br>" \
                      f"<b>Description:</b><br>" \
                      f"{wrapped_description}<br><br>" \
-                     f"<b>Assessment:</b><br>" \
-                     f"{wrapped_assessment}<br><br>" \
-                     f"<b>Successful:</b> {'Yes' if success else 'No'}<br>" \
-                     f"<b>Action Type:</b> {action_type.capitalize()}"
+                     # f"<b>Assessment:</b><br>" \
+                     # f"{wrapped_assessment}<br><br>" \
+                     # f"<b>Successful:</b> {'Yes' if success else 'No'}<br>" \
+                     # f"<b>Action Type:</b> {action_type.capitalize()}"
         hover_text.append(hover_info)

         if i > 0:
@@ -214,10 +214,11 @@ def create_flow_chart(steps):
         hoverinfo='text',
         hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
         marker=dict(
-            color=node_colors,
+            # color=node_colors,
+            color='#1b9e77',
             size=30,
             line_width=2,
-            symbol=node_shapes
+            # symbol=node_shapes
         ))

     edge_trace = go.Scatter(
@@ -230,25 +231,25 @@ def create_flow_chart(steps):
     # Create legend traces
     legend_traces = []

-    # Color legend
-    for success, color in color_map.items():
-        legend_traces.append(go.Scatter(
-            x=[None], y=[None],
-            mode='markers',
-            marker=dict(size=10, color=color),
-            showlegend=True,
-            name=f"{'Success' if success else 'Issue'}"
-        ))
+    # # Color legend
+    # for success, color in color_map.items():
+    #     legend_traces.append(go.Scatter(
+    #         x=[None], y=[None],
+    #         mode='markers',
+    #         marker=dict(size=10, color=color),
+    #         showlegend=True,
+    #         name=f"{'Success' if success else 'Issue'}"
+    #     ))

-    # Shape legend
-    for action, shape in shape_map.items():
-        legend_traces.append(go.Scatter(
-            x=[None], y=[None],
-            mode='markers',
-            marker=dict(size=10, symbol=shape, color='gray'),
-            showlegend=True,
-            name=f"{action.capitalize()}"
-        ))
+    # # Shape legend
+    # for action, shape in shape_map.items():
+    #     legend_traces.append(go.Scatter(
+    #         x=[None], y=[None],
+    #         mode='markers',
+    #         marker=dict(size=10, symbol=shape, color='gray'),
+    #         showlegend=True,
+    #         name=f"{action.capitalize()}"
+    #     ))

     # Combine all traces
     all_traces = [edge_trace, node_trace] + legend_traces