benediktstroebl committed on
Commit e59dbdb · 1 Parent(s): 01fb261

Big update with SQL backend
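
This commit swaps the in-memory trace dictionaries in `app.py` for a SQLite-backed `TracePreprocessor` (new `utils/db.py`), adds retry/backoff to the OpenAI client in `agent_monitor/failure_report.py`, and wires Failure Report and Agent Monitor sections into the SWE-bench tabs. A minimal usage sketch of the new backend follows; the agent name below is a placeholder, not a value from this commit:

```python
# Sketch of the SQL-backed preprocessing flow introduced in this commit.
# Assumes an evals_live/ directory of result JSON files containing the
# 'config', 'results', 'raw_logging_results', and 'failure_report' keys
# that utils/db.py expects.
from utils.db import TracePreprocessor

preprocessor = TracePreprocessor()            # opens/creates preprocessed_traces.db
preprocessor.preprocess_traces("evals_live")  # loads the JSON files into SQLite tables

# Cached lookups replace the old module-level dictionaries in app.py:
traces = preprocessor.get_analyzed_traces("example_agent", "usaco")  # placeholder agent name
report = preprocessor.get_failure_report("example_agent", "usaco")
results_df = preprocessor.get_parsed_results("usaco")  # pandas DataFrame, sorted by accuracy
```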

agent_monitor/failure_report.py CHANGED
@@ -7,6 +7,9 @@ from abc import ABC, abstractmethod
 import json
 from typing import Dict, List
 from datetime import datetime
+import backoff
+from openai import APITimeoutError, APIError, RateLimitError
+
 
 class FailureCategory(BaseModel):
     category_id: int
@@ -36,21 +39,59 @@ class AsyncLLMClient(ABC):
     async def generate_text(self, prompt, system_message=None, response_format=None):
         pass
 
+# class AsyncOpenAIClient(AsyncLLMClient):
+#     def __init__(self, model="gpt-4o-mini"):
+#         self.model = model
+#         self.client = AsyncOpenAI()
+
+#     async def generate_text(self, prompt, system_message=None, response_format=None):
+#         messages = [
+#             {"role": "system", "content": system_message or "You are a helpful AI assistant."},
+#             {"role": "user", "content": prompt}
+#         ]
+#         if response_format:
+#             response = await self.client.beta.chat.completions.parse(model=self.model, messages=messages, response_format=response_format)
+#         else:
+#             response = await self.client.chat.completions.create(model=self.model, messages=messages)
+#         return response.choices[0].message.content
+
+
 class AsyncOpenAIClient(AsyncLLMClient):
-    def __init__(self, model="gpt-4o-mini"):
+    def __init__(self, model="gpt-4o-mini", max_tries=5, max_time=300):
         self.model = model
         self.client = AsyncOpenAI()
+        self.max_tries = max_tries
+        self.max_time = max_time
+
+    @backoff.on_exception(
+        backoff.expo,
+        (APITimeoutError, APIError, RateLimitError),
+        max_tries=10,
+        max_time=300
+    )
+    async def _make_request(self, messages, response_format=None):
+        if response_format:
+            return await self.client.beta.chat.completions.parse(
+                model=self.model,
+                messages=messages,
+                response_format=response_format
+            )
+        else:
+            return await self.client.chat.completions.create(
+                model=self.model,
+                messages=messages
+            )
 
     async def generate_text(self, prompt, system_message=None, response_format=None):
         messages = [
             {"role": "system", "content": system_message or "You are a helpful AI assistant."},
             {"role": "user", "content": prompt}
         ]
-        if response_format:
-            response = await self.client.beta.chat.completions.parse(model=self.model, messages=messages, response_format=response_format)
-        else:
-            response = await self.client.chat.completions.create(model=self.model, messages=messages)
-        return response.choices[0].message.content
+        try:
+            response = await self._make_request(messages, response_format)
+            return response.choices[0].message.content
+        except Exception as e:
+            raise Exception(f"Failed after {self.max_tries} attempts or {self.max_time} seconds: {str(e)}")
 
 def get_weave_calls(client):
     calls = client.calls()
@@ -106,7 +147,7 @@ async def summarize_task(task_id, calls, llm_client):
         Step {i}:
         Input: {call['inputs']}
         Output: {call['outputs']}
-        Timestamp: {datetime.fromtimestamp(call_data['created_timestamp'])}
+        Timestamp: {datetime.fromtimestamp(call['created_timestamp'])}
         """
 
     prompt = f"""
agent_monitor/monitor.py CHANGED
@@ -1,9 +1,6 @@
 import asyncio
-from openai import AsyncOpenAI
 from collections import defaultdict
-import weave
 from pydantic import BaseModel
-from abc import ABC, abstractmethod
 import json
 
 class StepAnalysis(BaseModel):
@@ -20,50 +17,6 @@ class TaskSummary(BaseModel):
     overall_assessment: str
 
 
-def get_weave_calls(client):
-    calls = client.calls()
-
-    processed_calls = []
-    for call in calls:
-        ChatCompletion = weave.ref(call.output).get()
-        choices = [choice.message.content for choice in ChatCompletion.choices]
-        output = {
-            'weave_task_id': call.attributes['weave_task_id'],
-            'trace_id': call.trace_id,
-            'project_id': call.project_id,
-            'created_timestamp': ChatCompletion.created,
-            'inputs': dict(call.inputs),
-            'id': call.id,
-            'outputs': {'choices' : choices},
-            'exception': call.exception,
-            'summary': call.summary,
-            'display_name': call.display_name,
-            'attributes': dict(call.attributes),
-            "_children": call._children,
-            '_feedback': call._feedback,
-        }
-        processed_calls.append(output)
-    return processed_calls
-
-class AsyncLLMClient(ABC):
-    @abstractmethod
-    async def generate_text(self, prompt, system_message=None, response_format=None):
-        pass
-
-class AsyncOpenAIClient(AsyncLLMClient):
-    def __init__(self, model="gpt-4o-mini"):
-        self.model = model
-        self.client = AsyncOpenAI()
-
-    async def generate_text(self, prompt, system_message=None, response_format=None):
-        messages = [
-            {"role": "system", "content": system_message or "You are a helpful AI assistant."},
-            {"role": "user", "content": prompt}
-        ]
-        response = await self.client.beta.chat.completions.parse(model=self.model, messages=messages, response_format=response_format)
-
-        return response.choices[0].message.content
-
 async def analyze_agent_steps(processed_calls, llm_client, llm_eval=False):
     task_calls = defaultdict(list)
     for call in processed_calls:
@@ -185,17 +138,3 @@ async def summarize_task(steps, llm_client):
     system_message = "You are an expert AI performance analyst, skilled in evaluating and summarizing AI agent task execution. You are specialized in providing analyses to support AI researchers to develop AI agents."
     analysis = await llm_client.generate_text(prompt, system_message, response_format=TaskSummary)
     return json.loads(analysis)
-
-# async def main():
-#     client = weave.init("citp_agent_eval/usaco_1723148990")
-#     processed_calls = get_weave_calls(client)
-
-#     weave.finish()
-#     openai_client = AsyncOpenAIClient(model="gpt-4o-mini")
-#     task_analyses_openai = await analyze_agent_steps(processed_calls, openai_client)
-
-#     with open("task_analyses.json", "w") as f:
-#         json.dump(task_analyses_openai, f, indent=4)
-
-# if __name__ == "__main__":
-#     asyncio.run(main())
app.py CHANGED
@@ -6,7 +6,6 @@ from pathlib import Path
 import pandas as pd
 import os
 import json
-from utils.data import parse_json_files
 from utils.viz import create_scatter_plot, create_flow_chart, create_bar_chart
 from utils.processing import check_and_process_uploads
 from huggingface_hub import snapshot_download
@@ -18,6 +17,10 @@ import markdown
 import asyncio
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
 import weave
+from utils.db import TracePreprocessor
+
+# Initialize the TracePreprocessor
+preprocessor = TracePreprocessor()
 
 
 from datetime import datetime
@@ -40,47 +43,14 @@ def download_latest_results():
     print("Download complete.")
 
 
-# Global variable to store preprocessed data
-preprocessed_traces = {}
-failure_reports = {}
-def preprocess_traces():
-    global preprocessed_traces
-    global failure_reports
-    processed_dir = Path("evals_live")
-    for file in processed_dir.glob('*.json'):
-        with open(file, 'r') as f:
-            data = json.load(f)
-            agent_name = data['config']['agent_name']
-            benchmark_name = data['config']['benchmark_name']
-            if benchmark_name not in preprocessed_traces:
-                preprocessed_traces[benchmark_name] = {}
-            if benchmark_name not in failure_reports:
-                failure_reports[benchmark_name] = {}
-
-            try:
-                assert type(data['raw_logging_results']) == dict, f"Invalid format for raw_logging_results: {type(data['raw_logging_results'])}"
-                preprocessed_traces[benchmark_name][agent_name] = data['raw_logging_results']
-            except AssertionError as e:
-                preprocessed_traces[benchmark_name][agent_name] = None
-            except Exception as e:
-                print(f"Error preprocessing {file}: {e}")
-                preprocessed_traces[benchmark_name][agent_name] = None
-
-            try:
-                assert type(data['failure_report']) == dict, f"Invalid format for failure_report: {type(data['failure_report'])}"
-                failure_reports[benchmark_name][agent_name] = data['failure_report']
-            except AssertionError as e:
-                failure_reports[benchmark_name][agent_name] = None
-            except Exception as e:
-                print(f"Error preprocessing {file}: {e}")
-                failure_reports[benchmark_name][agent_name] = None
-
-
 def get_analyzed_traces(agent_name, benchmark_name):
-    return preprocessed_traces.get(benchmark_name, {}).get(agent_name)
+    return preprocessor.get_analyzed_traces(agent_name, benchmark_name)
 
 def get_failure_report(agent_name, benchmark_name):
-    return failure_reports.get(benchmark_name, {}).get(agent_name)
+    return preprocessor.get_failure_report(agent_name, benchmark_name)
+
+def parse_json_files(folder_path, benchmark_name):
+    return preprocessor.get_parsed_results(benchmark_name)
 
 def update_agent_dropdown(benchmark_name, metric):
     df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
@@ -115,14 +85,18 @@ def update_task_details(benchmark_name, agent_name, task_id):
         return f"No analysis available for task: {task_id}", None, ""
 
     analysis = analyzed_traces[task_id]
+
     summary = analysis.get('task_analysis', {})
 
     overview = f"## Task Overview\n\n{summary.get('overview', 'No overview available.')}\n\n"
-    overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n"
-    overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n"
-    overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n"
+    # overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n"
+    # overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n"
+    # overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n"
 
-    flow_chart = create_flow_chart(analysis['steps'])
+    if summary.get('overview', 'No overview available.') != "Not available":
+        flow_chart = create_flow_chart(analysis['steps'])
+    else:
+        flow_chart = None
 
     return overview, flow_chart, ""
 
@@ -259,6 +233,7 @@ with gr.Blocks() as demo:
                    cant_deselect=["Agent Name"],
                    label="Select Columns to Display:",
                ),
+               hide_columns=config.USACO_HIDE_COLUMNS,
               search_columns=config.USACO_SEARCH_COLUMNS,
               column_widths={"Agent Name": 40,
                              "Accuracy": 20,
@@ -266,6 +241,28 @@
           )
        with gr.Row():
            scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
+
+        gr.Markdown("# Failure Report")
+        with gr.Row():
+            with gr.Column(scale=1):
+                failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
+        with gr.Row():
+            with gr.Column(scale=1):
+                failure_categories_overview = gr.Markdown()
+
+            with gr.Column(scale=1):
+                failure_categories_chart = gr.Plot()
+
+        # Initialize the failure report agent dropdown with all agents
+        demo.load(update_agent_dropdown,
+                  inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                  outputs=[failure_report_agent_dropdown])
+
+        # Update failure report when agent is selected
+        failure_report_agent_dropdown.change(update_failure_report,
+                                             inputs=[failure_report_agent_dropdown, gr.Textbox(value="usaco", visible=False)],
+                                             outputs=[failure_categories_overview, failure_categories_chart])
+
        gr.Markdown("# Agent Monitor")
        with gr.Row():
            with gr.Column(scale=1):
@@ -306,7 +303,7 @@
            return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
        task_ids = list(analyzed_traces.keys())
        steps = analyzed_traces[task_ids[0]]['steps']
-       return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)
+       return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "usaco")[task_ids[0]]['steps'][0], 0)
 
    def update_raw_step_dropdown(agent_name, task_id):
        analyzed_traces = get_analyzed_traces(agent_name, "usaco")
@@ -385,6 +382,28 @@
                                             inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)],
                                             outputs=[failure_categories_overview, failure_categories_chart])
 
+        gr.Markdown("# Agent Monitor")
+        with gr.Row():
+            with gr.Column(scale=1):
+                agent_dropdown = gr.Dropdown(label="Select Agent")
+            with gr.Column(scale=1):
+                task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
+        with gr.Row():
+            task_overview = gr.Markdown()
+        with gr.Row():
+            flow_chart = gr.Plot(label="Task Flow")
+
+        # Initialize the agent dropdown with the best agent
+        demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
+        demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+        agent_dropdown.change(update_task_analysis,
+                              inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown],
+                              outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+        task_dropdown.change(update_task_details,
+                             inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
+                             outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
+
        gr.Markdown("# Raw Predictions")
        with gr.Row():
            with gr.Column(scale=1):
@@ -483,6 +502,28 @@
                                             inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
                                             outputs=[failure_categories_overview, failure_categories_chart])
 
+        gr.Markdown("# Agent Monitor")
+        with gr.Row():
+            with gr.Column(scale=1):
+                agent_dropdown = gr.Dropdown(label="Select Agent")
+            with gr.Column(scale=1):
+                task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
+        with gr.Row():
+            task_overview = gr.Markdown()
+        with gr.Row():
+            flow_chart = gr.Plot(label="Task Flow")
+
+        # Initialize the agent dropdown with the best agent
+        demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
+        demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+        agent_dropdown.change(update_task_analysis,
+                              inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown],
+                              outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+        task_dropdown.change(update_task_details,
+                             inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
+                             outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
+
        gr.Markdown("# Raw Predictions")
        with gr.Row():
            with gr.Column(scale=1):
@@ -550,13 +591,13 @@
 
 async def main():
     # Preprocess traces
-    preprocess_traces()
+    preprocessor.preprocess_traces('evals_live')
 
     # # Download the results from the Hugging Face Hub
     await asyncio.to_thread(download_latest_results)
 
     # Check for new uploads and process them
-    await check_and_process_uploads()
+    # await check_and_process_uploads()
 
     scheduler = AsyncIOScheduler()
     scheduler.add_job(restart_space, "interval", hours=1)
@@ -567,5 +608,5 @@ async def main():
     await demo.launch()
 
 if __name__ == "__main__":
-    weave.init(f'leaderboard_testing_{datetime.now().strftime("%Y%m%d%H%M%S")}')
+    weave.init(f'leaderboard_{datetime.now().strftime("%Y%m%d%H%M%S")}')
     asyncio.run(main())
config.py CHANGED
@@ -12,6 +12,7 @@ SWEBENCH_ON_LOAD_COLUMNS = [
     "Total Cost",
 ]
 SWEBENCH_SEARCH_COLUMNS = ['Total Cost']
+SWEBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name"]
 
 USACO_ON_LOAD_COLUMNS = [
     "Agent Name",
@@ -19,6 +20,7 @@ USACO_ON_LOAD_COLUMNS = [
     "Total Cost",
 ]
 USACO_SEARCH_COLUMNS = ['Total Cost']
+USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name"]
 
 
 NUMERIC_INTERVALS = {
utils/db.py ADDED
@@ -0,0 +1,162 @@
+import json
+from pathlib import Path
+import sqlite3
+import pickle
+from functools import lru_cache
+import threading
+import pandas as pd
+
+class TracePreprocessor:
+    def __init__(self, db_path='preprocessed_traces.db'):
+        self.db_path = db_path
+        self.local = threading.local()
+        self.create_tables()
+
+    def get_conn(self):
+        if not hasattr(self.local, 'conn'):
+            self.local.conn = sqlite3.connect(self.db_path)
+        return self.local.conn
+
+    def create_tables(self):
+        with self.get_conn() as conn:
+            conn.execute('''
+                CREATE TABLE IF NOT EXISTS preprocessed_traces (
+                    benchmark_name TEXT,
+                    agent_name TEXT,
+                    raw_logging_results BLOB,
+                    PRIMARY KEY (benchmark_name, agent_name)
+                )
+            ''')
+            conn.execute('''
+                CREATE TABLE IF NOT EXISTS failure_reports (
+                    benchmark_name TEXT,
+                    agent_name TEXT,
+                    failure_report BLOB,
+                    PRIMARY KEY (benchmark_name, agent_name)
+                )
+            ''')
+            conn.execute('''
+                CREATE TABLE IF NOT EXISTS parsed_results (
+                    benchmark_name TEXT,
+                    agent_name TEXT,
+                    date TEXT,
+                    total_cost REAL,
+                    accuracy REAL,
+                    precision REAL,
+                    recall REAL,
+                    f1_score REAL,
+                    auc REAL,
+                    PRIMARY KEY (benchmark_name, agent_name)
+                )
+            ''')
+
+    def preprocess_traces(self, processed_dir="evals_live"):
+        processed_dir = Path(processed_dir)
+        for file in processed_dir.glob('*.json'):
+            with open(file, 'r') as f:
+                data = json.load(f)
+                agent_name = data['config']['agent_name']
+                benchmark_name = data['config']['benchmark_name']
+
+            try:
+                raw_logging_results = pickle.dumps(data['raw_logging_results'])
+                with self.get_conn() as conn:
+                    conn.execute('''
+                        INSERT OR REPLACE INTO preprocessed_traces
+                        (benchmark_name, agent_name, raw_logging_results)
+                        VALUES (?, ?, ?)
+                    ''', (benchmark_name, agent_name, raw_logging_results))
+            except Exception as e:
+                print(f"Error preprocessing raw_logging_results in {file}: {e}")
+
+            try:
+                failure_report = pickle.dumps(data['failure_report'])
+                with self.get_conn() as conn:
+                    conn.execute('''
+                        INSERT OR REPLACE INTO failure_reports
+                        (benchmark_name, agent_name, failure_report)
+                        VALUES (?, ?, ?)
+                    ''', (benchmark_name, agent_name, failure_report))
+            except Exception as e:
+                print(f"Error preprocessing failure_report in {file}: {e}")
+
+            try:
+                config = data['config']
+                results = data['results']
+                with self.get_conn() as conn:
+                    conn.execute('''
+                        INSERT OR REPLACE INTO parsed_results
+                        (benchmark_name, agent_name, date, total_cost, accuracy, precision, recall, f1_score, auc)
+                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+                    ''', (
+                        benchmark_name,
+                        agent_name,
+                        config['date'],
+                        results.get('total_cost'),
+                        results.get('accuracy'),
+                        results.get('precision'),
+                        results.get('recall'),
+                        results.get('f1_score'),
+                        results.get('auc')
+                    ))
+            except Exception as e:
+                print(f"Error preprocessing parsed results in {file}: {e}")
+
+    @lru_cache(maxsize=100)
+    def get_analyzed_traces(self, agent_name, benchmark_name):
+        with self.get_conn() as conn:
+            cursor = conn.cursor()
+            cursor.execute('''
+                SELECT raw_logging_results FROM preprocessed_traces
+                WHERE benchmark_name = ? AND agent_name = ?
+            ''', (benchmark_name, agent_name))
+            result = cursor.fetchone()
+            if result:
+                return pickle.loads(result[0])
+        return None
+
+    @lru_cache(maxsize=100)
+    def get_failure_report(self, agent_name, benchmark_name):
+        with self.get_conn() as conn:
+            cursor = conn.cursor()
+            cursor.execute('''
+                SELECT failure_report FROM failure_reports
+                WHERE benchmark_name = ? AND agent_name = ?
+            ''', (benchmark_name, agent_name))
+            result = cursor.fetchone()
+            if result:
+                return pickle.loads(result[0])
+        return None
+
+    def get_parsed_results(self, benchmark_name):
+        with self.get_conn() as conn:
+            query = '''
+                SELECT * FROM parsed_results
+                WHERE benchmark_name = ?
+                ORDER BY accuracy DESC
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
+
+            # Round float columns to 3 decimal places
+            float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc']
+            for column in float_columns:
+                if column in df.columns:
+                    df[column] = df[column].round(3)
+
+            # Rename columns
+            df = df.rename(columns={
+                'agent_name': 'Agent Name',
+                'date': 'Date',
+                'total_cost': 'Total Cost',
+                'accuracy': 'Accuracy',
+                'precision': 'Precision',
+                'recall': 'Recall',
+                'f1_score': 'F1 Score',
+                'auc': 'AUC',
+            })
+
+        return df
+
+if __name__ == '__main__':
+    preprocessor = TracePreprocessor()
+    preprocessor.preprocess_traces()
utils/processing.py CHANGED
@@ -123,7 +123,7 @@ async def process_upload(input_path, output_path):
     openai_client = AsyncOpenAIClient(model="gpt-4o-mini")
 
     try:
-        processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=False)
+        processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=True)
         failure_report = await analyze_agent_performance(data['raw_logging_results'], data['results']['failed_tasks'], openai_client)
         data['raw_logging_results'] = processed_calls
         data['failure_report'] = failure_report
utils/viz.py CHANGED
@@ -174,15 +174,15 @@ def create_flow_chart(steps):
         description = analysis.get('description', 'No description available.')
         assessment = analysis.get('assessment', 'No assessment available.')
         success = analysis.get('success', True)  # Assuming True if not specified
-        action_type = analysis.get('action_type', 'other')  # Default to 'other' if not specified
+        # action_type = analysis.get('action_type', 'other')  # Default to 'other' if not specified
         step_headline = analysis.get('headline', '')
 
         # Set node color and shape based on attributes
         node_colors.append(color_map[success])
-        node_shapes.append(shape_map.get(action_type, 'circle'))
+        # node_shapes.append(shape_map.get(action_type, 'circle'))
 
         # Wrap text to improve readability
-        wrapped_description = '<br>'.join(textwrap.wrap(description, width=90, max_lines=10))
+        wrapped_description = '<br>'.join(textwrap.wrap(description, width=90, max_lines=20))
         wrapped_assessment = '<br>'.join(textwrap.wrap(assessment, width=90, max_lines=10))
         wrapped_outline = textwrap.shorten(step_headline, width=30, placeholder='')
         wrapped_outline = '' if wrapped_outline == '' else f": {wrapped_outline}"
@@ -194,10 +194,10 @@
         hover_info = f"<b>Step {i+1}{wrapped_outline}</b><br><br>" \
                      f"<b>Description:</b><br>" \
                      f"{wrapped_description}<br><br>" \
-                     f"<b>Assessment:</b><br>" \
-                     f"{wrapped_assessment}<br><br>" \
-                     f"<b>Successful:</b> {'Yes' if success else 'No'}<br>" \
-                     f"<b>Action Type:</b> {action_type.capitalize()}"
+                     # f"<b>Assessment:</b><br>" \
+                     # f"{wrapped_assessment}<br><br>" \
+                     # f"<b>Successful:</b> {'Yes' if success else 'No'}<br>" \
+                     # f"<b>Action Type:</b> {action_type.capitalize()}"
         hover_text.append(hover_info)
 
         if i > 0:
@@ -214,10 +214,11 @@
         hoverinfo='text',
         hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
         marker=dict(
-            color=node_colors,
+            # color=node_colors,
+            color='#1b9e77',
             size=30,
             line_width=2,
-            symbol=node_shapes
+            # symbol=node_shapes
         ))
 
     edge_trace = go.Scatter(
@@ -230,25 +231,25 @@
     # Create legend traces
     legend_traces = []
 
-    # Color legend
-    for success, color in color_map.items():
-        legend_traces.append(go.Scatter(
-            x=[None], y=[None],
-            mode='markers',
-            marker=dict(size=10, color=color),
-            showlegend=True,
-            name=f"{'Success' if success else 'Issue'}"
-        ))
+    # # Color legend
+    # for success, color in color_map.items():
+    #     legend_traces.append(go.Scatter(
+    #         x=[None], y=[None],
+    #         mode='markers',
+    #         marker=dict(size=10, color=color),
+    #         showlegend=True,
+    #         name=f"{'Success' if success else 'Issue'}"
+    #     ))
 
-    # Shape legend
-    for action, shape in shape_map.items():
-        legend_traces.append(go.Scatter(
-            x=[None], y=[None],
-            mode='markers',
-            marker=dict(size=10, symbol=shape, color='gray'),
-            showlegend=True,
-            name=f"{action.capitalize()}"
-        ))
+    # # Shape legend
+    # for action, shape in shape_map.items():
+    #     legend_traces.append(go.Scatter(
+    #         x=[None], y=[None],
+    #         mode='markers',
+    #         marker=dict(size=10, symbol=shape, color='gray'),
+    #         showlegend=True,
+    #         name=f"{action.capitalize()}"
+    #     ))
 
     # Combine all traces
     all_traces = [edge_trace, node_trace] + legend_traces