Commit e59dbdb
Parent(s): 01fb261

Big update with SQL backend

Files changed:
- agent_monitor/failure_report.py  +48 -7
- agent_monitor/monitor.py  +0 -61
- app.py  +88 -47
- config.py  +2 -0
- utils/db.py  +162 -0
- utils/processing.py  +1 -1
- utils/viz.py  +28 -27
agent_monitor/failure_report.py
CHANGED
@@ -7,6 +7,9 @@ from abc import ABC, abstractmethod
 import json
 from typing import Dict, List
 from datetime import datetime
+import backoff
+from openai import APITimeoutError, APIError, RateLimitError
+

 class FailureCategory(BaseModel):
     category_id: int
@@ -36,21 +39,59 @@ class AsyncLLMClient(ABC):
     async def generate_text(self, prompt, system_message=None, response_format=None):
         pass

+# class AsyncOpenAIClient(AsyncLLMClient):
+#     def __init__(self, model="gpt-4o-mini"):
+#         self.model = model
+#         self.client = AsyncOpenAI()
+
+#     async def generate_text(self, prompt, system_message=None, response_format=None):
+#         messages = [
+#             {"role": "system", "content": system_message or "You are a helpful AI assistant."},
+#             {"role": "user", "content": prompt}
+#         ]
+#         if response_format:
+#             response = await self.client.beta.chat.completions.parse(model=self.model, messages=messages, response_format=response_format)
+#         else:
+#             response = await self.client.chat.completions.create(model=self.model, messages=messages)
+#         return response.choices[0].message.content
+
+
 class AsyncOpenAIClient(AsyncLLMClient):
-    def __init__(self, model="gpt-4o-mini"):
+    def __init__(self, model="gpt-4o-mini", max_tries=5, max_time=300):
         self.model = model
         self.client = AsyncOpenAI()
+        self.max_tries = max_tries
+        self.max_time = max_time
+
+    @backoff.on_exception(
+        backoff.expo,
+        (APITimeoutError, APIError, RateLimitError),
+        max_tries=10,
+        max_time=300
+    )
+    async def _make_request(self, messages, response_format=None):
+        if response_format:
+            return await self.client.beta.chat.completions.parse(
+                model=self.model,
+                messages=messages,
+                response_format=response_format
+            )
+        else:
+            return await self.client.chat.completions.create(
+                model=self.model,
+                messages=messages
+            )

     async def generate_text(self, prompt, system_message=None, response_format=None):
         messages = [
             {"role": "system", "content": system_message or "You are a helpful AI assistant."},
             {"role": "user", "content": prompt}
         ]
-        if response_format:
-            response = await self.client.beta.chat.completions.parse(model=self.model, messages=messages, response_format=response_format)
-        else:
-            response = await self.client.chat.completions.create(model=self.model, messages=messages)
-        return response.choices[0].message.content
+        try:
+            response = await self._make_request(messages, response_format)
+            return response.choices[0].message.content
+        except Exception as e:
+            raise Exception(f"Failed after {self.max_tries} attempts or {self.max_time} seconds: {str(e)}")

 def get_weave_calls(client):
     calls = client.calls()
@@ -106,7 +147,7 @@ async def summarize_task(task_id, calls, llm_client):
         Step {i}:
         Input: {call['inputs']}
         Output: {call['outputs']}
-        Timestamp: {datetime.fromtimestamp(
+        Timestamp: {datetime.fromtimestamp(call['created_timestamp'])}
         """

         prompt = f"""
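Note on the change above: the raw OpenAI call now lives in _make_request, wrapped with backoff.on_exception so that APITimeoutError, APIError and RateLimitError are retried with exponential backoff, while generate_text only assembles the messages and surfaces a single error once the retry budget is exhausted. (The decorator hardcodes max_tries=10 and max_time=300; the max_tries/max_time stored on the instance are only used in the error message.) Below is a minimal, self-contained sketch of the same retry pattern, not part of the commit; the function name ask and the prompt are illustrative.

import asyncio

import backoff
from openai import AsyncOpenAI, APIError, APITimeoutError, RateLimitError

client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment

@backoff.on_exception(
    backoff.expo,                                 # exponential backoff between attempts
    (APITimeoutError, APIError, RateLimitError),  # retry only transient API errors
    max_tries=10,                                 # give up after 10 attempts...
    max_time=300                                  # ...or after 300 seconds, whichever comes first
)
async def ask(prompt: str) -> str:
    # Each failed attempt re-enters here; a non-listed exception propagates immediately.
    response = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

if __name__ == "__main__":
    print(asyncio.run(ask("Say hello in one word.")))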
agent_monitor/monitor.py
CHANGED
@@ -1,9 +1,6 @@
 import asyncio
-from openai import AsyncOpenAI
 from collections import defaultdict
-import weave
 from pydantic import BaseModel
-from abc import ABC, abstractmethod
 import json

 class StepAnalysis(BaseModel):
@@ -20,50 +17,6 @@ class TaskSummary(BaseModel):
     overall_assessment: str


-def get_weave_calls(client):
-    calls = client.calls()
-
-    processed_calls = []
-    for call in calls:
-        ChatCompletion = weave.ref(call.output).get()
-        choices = [choice.message.content for choice in ChatCompletion.choices]
-        output = {
-            'weave_task_id': call.attributes['weave_task_id'],
-            'trace_id': call.trace_id,
-            'project_id': call.project_id,
-            'created_timestamp': ChatCompletion.created,
-            'inputs': dict(call.inputs),
-            'id': call.id,
-            'outputs': {'choices' : choices},
-            'exception': call.exception,
-            'summary': call.summary,
-            'display_name': call.display_name,
-            'attributes': dict(call.attributes),
-            "_children": call._children,
-            '_feedback': call._feedback,
-        }
-        processed_calls.append(output)
-    return processed_calls
-
-class AsyncLLMClient(ABC):
-    @abstractmethod
-    async def generate_text(self, prompt, system_message=None, response_format=None):
-        pass
-
-class AsyncOpenAIClient(AsyncLLMClient):
-    def __init__(self, model="gpt-4o-mini"):
-        self.model = model
-        self.client = AsyncOpenAI()
-
-    async def generate_text(self, prompt, system_message=None, response_format=None):
-        messages = [
-            {"role": "system", "content": system_message or "You are a helpful AI assistant."},
-            {"role": "user", "content": prompt}
-        ]
-        response = await self.client.beta.chat.completions.parse(model=self.model, messages=messages, response_format=response_format)
-
-        return response.choices[0].message.content
-
 async def analyze_agent_steps(processed_calls, llm_client, llm_eval=False):
     task_calls = defaultdict(list)
     for call in processed_calls:
@@ -185,17 +138,3 @@ async def summarize_task(steps, llm_client):
     system_message = "You are an expert AI performance analyst, skilled in evaluating and summarizing AI agent task execution. You are specialized in providing analyses to support AI researchers to develop AI agents."
     analysis = await llm_client.generate_text(prompt, system_message, response_format=TaskSummary)
     return json.loads(analysis)
-
-# async def main():
-#     client = weave.init("citp_agent_eval/usaco_1723148990")
-#     processed_calls = get_weave_calls(client)
-
-#     weave.finish()
-#     openai_client = AsyncOpenAIClient(model="gpt-4o-mini")
-#     task_analyses_openai = await analyze_agent_steps(processed_calls, openai_client)
-
-#     with open("task_analyses.json", "w") as f:
-#         json.dump(task_analyses_openai, f, indent=4)
-
-# if __name__ == "__main__":
-#     asyncio.run(main())
app.py
CHANGED
@@ -6,7 +6,6 @@ from pathlib import Path
 import pandas as pd
 import os
 import json
-from utils.data import parse_json_files
 from utils.viz import create_scatter_plot, create_flow_chart, create_bar_chart
 from utils.processing import check_and_process_uploads
 from huggingface_hub import snapshot_download
@@ -18,6 +17,10 @@ import markdown
 import asyncio
 from apscheduler.schedulers.asyncio import AsyncIOScheduler
 import weave
+from utils.db import TracePreprocessor
+
+# Initialize the TracePreprocessor
+preprocessor = TracePreprocessor()


 from datetime import datetime
@@ -40,47 +43,14 @@ def download_latest_results():
     print("Download complete.")


-# Global variable to store preprocessed data
-preprocessed_traces = {}
-failure_reports = {}
-def preprocess_traces():
-    global preprocessed_traces
-    global failure_reports
-    processed_dir = Path("evals_live")
-    for file in processed_dir.glob('*.json'):
-        with open(file, 'r') as f:
-            data = json.load(f)
-        agent_name = data['config']['agent_name']
-        benchmark_name = data['config']['benchmark_name']
-        if benchmark_name not in preprocessed_traces:
-            preprocessed_traces[benchmark_name] = {}
-        if benchmark_name not in failure_reports:
-            failure_reports[benchmark_name] = {}
-
-        try:
-            assert type(data['raw_logging_results']) == dict, f"Invalid format for raw_logging_results: {type(data['raw_logging_results'])}"
-            preprocessed_traces[benchmark_name][agent_name] = data['raw_logging_results']
-        except AssertionError as e:
-            preprocessed_traces[benchmark_name][agent_name] = None
-        except Exception as e:
-            print(f"Error preprocessing {file}: {e}")
-            preprocessed_traces[benchmark_name][agent_name] = None
-
-        try:
-            assert type(data['failure_report']) == dict, f"Invalid format for failure_report: {type(data['failure_report'])}"
-            failure_reports[benchmark_name][agent_name] = data['failure_report']
-        except AssertionError as e:
-            failure_reports[benchmark_name][agent_name] = None
-        except Exception as e:
-            print(f"Error preprocessing {file}: {e}")
-            failure_reports[benchmark_name][agent_name] = None
-
-
 def get_analyzed_traces(agent_name, benchmark_name):
-    return
+    return preprocessor.get_analyzed_traces(agent_name, benchmark_name)

 def get_failure_report(agent_name, benchmark_name):
-    return
+    return preprocessor.get_failure_report(agent_name, benchmark_name)
+
+def parse_json_files(folder_path, benchmark_name):
+    return preprocessor.get_parsed_results(benchmark_name)

 def update_agent_dropdown(benchmark_name, metric):
     df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
@@ -115,14 +85,18 @@ def update_task_details(benchmark_name, agent_name, task_id):
         return f"No analysis available for task: {task_id}", None, ""

     analysis = analyzed_traces[task_id]
+
     summary = analysis.get('task_analysis', {})

     overview = f"## Task Overview\n\n{summary.get('overview', 'No overview available.')}\n\n"
-    overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n"
-    overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n"
-    overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n"
+    # overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n"
+    # overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n"
+    # overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n"

-
+    if summary.get('overview', 'No overview available.') != "Not available":
+        flow_chart = create_flow_chart(analysis['steps'])
+    else:
+        flow_chart = None

     return overview, flow_chart, ""

@@ -259,6 +233,7 @@ with gr.Blocks() as demo:
                 cant_deselect=["Agent Name"],
                 label="Select Columns to Display:",
             ),
+            hide_columns=config.USACO_HIDE_COLUMNS,
            search_columns=config.USACO_SEARCH_COLUMNS,
             column_widths={"Agent Name": 40,
                            "Accuracy": 20,
@@ -266,6 +241,28 @@ with gr.Blocks() as demo:
     )
     with gr.Row():
         scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
+
+    gr.Markdown("# Failure Report")
+    with gr.Row():
+        with gr.Column(scale=1):
+            failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
+    with gr.Row():
+        with gr.Column(scale=1):
+            failure_categories_overview = gr.Markdown()
+
+        with gr.Column(scale=1):
+            failure_categories_chart = gr.Plot()
+
+    # Initialize the failure report agent dropdown with all agents
+    demo.load(update_agent_dropdown,
+              inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+              outputs=[failure_report_agent_dropdown])
+
+    # Update failure report when agent is selected
+    failure_report_agent_dropdown.change(update_failure_report,
+                                         inputs=[failure_report_agent_dropdown, gr.Textbox(value="usaco", visible=False)],
+                                         outputs=[failure_categories_overview, failure_categories_chart])
+
     gr.Markdown("# Agent Monitor")
     with gr.Row():
         with gr.Column(scale=1):
@@ -306,7 +303,7 @@ with gr.Blocks() as demo:
            return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
        task_ids = list(analyzed_traces.keys())
        steps = analyzed_traces[task_ids[0]]['steps']
-        return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0),
+        return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "usaco")[task_ids[0]]['steps'][0], 0)

     def update_raw_step_dropdown(agent_name, task_id):
         analyzed_traces = get_analyzed_traces(agent_name, "usaco")
@@ -385,6 +382,28 @@ with gr.Blocks() as demo:
                                          inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)],
                                          outputs=[failure_categories_overview, failure_categories_chart])

+    gr.Markdown("# Agent Monitor")
+    with gr.Row():
+        with gr.Column(scale=1):
+            agent_dropdown = gr.Dropdown(label="Select Agent")
+        with gr.Column(scale=1):
+            task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
+    with gr.Row():
+        task_overview = gr.Markdown()
+    with gr.Row():
+        flow_chart = gr.Plot(label="Task Flow")
+
+    # Initialize the agent dropdown with the best agent
+    demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
+    demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+    agent_dropdown.change(update_task_analysis,
+                          inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown],
+                          outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+    task_dropdown.change(update_task_details,
+                         inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
+                         outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
+
     gr.Markdown("# Raw Predictions")
     with gr.Row():
         with gr.Column(scale=1):
@@ -483,6 +502,28 @@ with gr.Blocks() as demo:
                                          inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
                                          outputs=[failure_categories_overview, failure_categories_chart])

+    gr.Markdown("# Agent Monitor")
+    with gr.Row():
+        with gr.Column(scale=1):
+            agent_dropdown = gr.Dropdown(label="Select Agent")
+        with gr.Column(scale=1):
+            task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
+    with gr.Row():
+        task_overview = gr.Markdown()
+    with gr.Row():
+        flow_chart = gr.Plot(label="Task Flow")
+
+    # Initialize the agent dropdown with the best agent
+    demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
+    demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+    agent_dropdown.change(update_task_analysis,
+                          inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown],
+                          outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+    task_dropdown.change(update_task_details,
+                         inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
+                         outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
+
     gr.Markdown("# Raw Predictions")
     with gr.Row():
         with gr.Column(scale=1):
@@ -550,13 +591,13 @@ with gr.Blocks() as demo:

 async def main():
     # Preprocess traces
-    preprocess_traces()
+    preprocessor.preprocess_traces('evals_live')

     # # Download the results from the Hugging Face Hub
     await asyncio.to_thread(download_latest_results)

     # Check for new uploads and process them
-    await check_and_process_uploads()
+    # await check_and_process_uploads()

     scheduler = AsyncIOScheduler()
     scheduler.add_job(restart_space, "interval", hours=1)
@@ -567,5 +608,5 @@ async def main():
     await demo.launch()

 if __name__ == "__main__":
-    weave.init(f'
+    weave.init(f'leaderboard_{datetime.now().strftime("%Y%m%d%H%M%S")}')
     asyncio.run(main())
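The new Failure Report and Agent Monitor blocks above reuse one callback per benchmark by passing an invisible gr.Textbox(value=..., visible=False) as an extra input, so the handler receives the benchmark name as a fixed value. A minimal sketch of that wiring pattern, not part of the commit; the agent names and the stub callback (standing in for the real update_failure_report) are placeholders.

import gradio as gr

def update_failure_report(agent_name, benchmark_name):
    # In app.py this queries the preprocessed traces; here we just echo the inputs.
    return f"Failure report for {agent_name} on {benchmark_name}"

with gr.Blocks() as demo:
    agent_dropdown = gr.Dropdown(choices=["agent-a", "agent-b"], label="Select Agent")
    report = gr.Markdown()

    # The invisible Textbox acts as a constant input, so the same callback can be
    # wired once per benchmark tab ("usaco", "swebench_lite", "swebench_verified", ...).
    agent_dropdown.change(
        update_failure_report,
        inputs=[agent_dropdown, gr.Textbox(value="usaco", visible=False)],
        outputs=[report],
    )

if __name__ == "__main__":
    demo.launch()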
config.py
CHANGED
@@ -12,6 +12,7 @@ SWEBENCH_ON_LOAD_COLUMNS = [
     "Total Cost",
 ]
 SWEBENCH_SEARCH_COLUMNS = ['Total Cost']
+SWEBENCH_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name"]

 USACO_ON_LOAD_COLUMNS = [
     "Agent Name",
@@ -19,6 +20,7 @@ USACO_ON_LOAD_COLUMNS = [
     "Total Cost",
 ]
 USACO_SEARCH_COLUMNS = ['Total Cost']
+USACO_HIDE_COLUMNS = ["F1 Score", "AUC", "Precision", "Recall", "benchmark_name"]


 NUMERIC_INTERVALS = {
utils/db.py
ADDED
@@ -0,0 +1,162 @@
+import json
+from pathlib import Path
+import sqlite3
+import pickle
+from functools import lru_cache
+import threading
+import pandas as pd
+
+class TracePreprocessor:
+    def __init__(self, db_path='preprocessed_traces.db'):
+        self.db_path = db_path
+        self.local = threading.local()
+        self.create_tables()
+
+    def get_conn(self):
+        if not hasattr(self.local, 'conn'):
+            self.local.conn = sqlite3.connect(self.db_path)
+        return self.local.conn
+
+    def create_tables(self):
+        with self.get_conn() as conn:
+            conn.execute('''
+                CREATE TABLE IF NOT EXISTS preprocessed_traces (
+                    benchmark_name TEXT,
+                    agent_name TEXT,
+                    raw_logging_results BLOB,
+                    PRIMARY KEY (benchmark_name, agent_name)
+                )
+            ''')
+            conn.execute('''
+                CREATE TABLE IF NOT EXISTS failure_reports (
+                    benchmark_name TEXT,
+                    agent_name TEXT,
+                    failure_report BLOB,
+                    PRIMARY KEY (benchmark_name, agent_name)
+                )
+            ''')
+            conn.execute('''
+                CREATE TABLE IF NOT EXISTS parsed_results (
+                    benchmark_name TEXT,
+                    agent_name TEXT,
+                    date TEXT,
+                    total_cost REAL,
+                    accuracy REAL,
+                    precision REAL,
+                    recall REAL,
+                    f1_score REAL,
+                    auc REAL,
+                    PRIMARY KEY (benchmark_name, agent_name)
+                )
+            ''')
+
+    def preprocess_traces(self, processed_dir="evals_live"):
+        processed_dir = Path(processed_dir)
+        for file in processed_dir.glob('*.json'):
+            with open(file, 'r') as f:
+                data = json.load(f)
+            agent_name = data['config']['agent_name']
+            benchmark_name = data['config']['benchmark_name']
+
+            try:
+                raw_logging_results = pickle.dumps(data['raw_logging_results'])
+                with self.get_conn() as conn:
+                    conn.execute('''
+                        INSERT OR REPLACE INTO preprocessed_traces
+                        (benchmark_name, agent_name, raw_logging_results)
+                        VALUES (?, ?, ?)
+                    ''', (benchmark_name, agent_name, raw_logging_results))
+            except Exception as e:
+                print(f"Error preprocessing raw_logging_results in {file}: {e}")
+
+            try:
+                failure_report = pickle.dumps(data['failure_report'])
+                with self.get_conn() as conn:
+                    conn.execute('''
+                        INSERT OR REPLACE INTO failure_reports
+                        (benchmark_name, agent_name, failure_report)
+                        VALUES (?, ?, ?)
+                    ''', (benchmark_name, agent_name, failure_report))
+            except Exception as e:
+                print(f"Error preprocessing failure_report in {file}: {e}")
+
+            try:
+                config = data['config']
+                results = data['results']
+                with self.get_conn() as conn:
+                    conn.execute('''
+                        INSERT OR REPLACE INTO parsed_results
+                        (benchmark_name, agent_name, date, total_cost, accuracy, precision, recall, f1_score, auc)
+                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+                    ''', (
+                        benchmark_name,
+                        agent_name,
+                        config['date'],
+                        results.get('total_cost'),
+                        results.get('accuracy'),
+                        results.get('precision'),
+                        results.get('recall'),
+                        results.get('f1_score'),
+                        results.get('auc')
+                    ))
+            except Exception as e:
+                print(f"Error preprocessing parsed results in {file}: {e}")
+
+    @lru_cache(maxsize=100)
+    def get_analyzed_traces(self, agent_name, benchmark_name):
+        with self.get_conn() as conn:
+            cursor = conn.cursor()
+            cursor.execute('''
+                SELECT raw_logging_results FROM preprocessed_traces
+                WHERE benchmark_name = ? AND agent_name = ?
+            ''', (benchmark_name, agent_name))
+            result = cursor.fetchone()
+            if result:
+                return pickle.loads(result[0])
+            return None
+
+    @lru_cache(maxsize=100)
+    def get_failure_report(self, agent_name, benchmark_name):
+        with self.get_conn() as conn:
+            cursor = conn.cursor()
+            cursor.execute('''
+                SELECT failure_report FROM failure_reports
+                WHERE benchmark_name = ? AND agent_name = ?
+            ''', (benchmark_name, agent_name))
+            result = cursor.fetchone()
+            if result:
+                return pickle.loads(result[0])
+            return None
+
+    def get_parsed_results(self, benchmark_name):
+        with self.get_conn() as conn:
+            query = '''
+                SELECT * FROM parsed_results
+                WHERE benchmark_name = ?
+                ORDER BY accuracy DESC
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
+
+            # Round float columns to 3 decimal places
+            float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc']
+            for column in float_columns:
+                if column in df.columns:
+                    df[column] = df[column].round(3)
+
+            # Rename columns
+            df = df.rename(columns={
+                'agent_name': 'Agent Name',
+                'date': 'Date',
+                'total_cost': 'Total Cost',
+                'accuracy': 'Accuracy',
+                'precision': 'Precision',
+                'recall': 'Recall',
+                'f1_score': 'F1 Score',
+                'auc': 'AUC',
+            })
+
+            return df
+
+if __name__ == '__main__':
+    preprocessor = TracePreprocessor()
+    preprocessor.preprocess_traces()
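utils/db.py is the SQL backend that replaces the old in-memory dicts in app.py: preprocess_traces() ingests every evals_live/*.json once, pickling the heavy raw_logging_results and failure_report payloads into SQLite BLOBs, and the getters read them back on demand. Connections are kept per thread via threading.local() (a sqlite3 connection cannot be shared across threads, and the Gradio callbacks are not guaranteed to run on the thread that created it), while lru_cache memoizes repeated lookups for the same agent/benchmark pair. A small usage sketch, not part of the commit; the agent name is a hypothetical placeholder.

from utils.db import TracePreprocessor

# One-off ingestion: read the JSON files once and store them in preprocessed_traces.db.
pre = TracePreprocessor()
pre.preprocess_traces('evals_live')

# Later reads hit the database (and the in-process lru_cache), not the JSON files.
df = pre.get_parsed_results('usaco')                       # leaderboard-ready DataFrame
traces = pre.get_analyzed_traces('my_agent', 'usaco')      # analyzed trace data, or None
report = pre.get_failure_report('my_agent', 'usaco')       # failure report, or None

print(df[['Agent Name', 'Accuracy', 'Total Cost']].head())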
utils/processing.py
CHANGED
@@ -123,7 +123,7 @@ async def process_upload(input_path, output_path):
     openai_client = AsyncOpenAIClient(model="gpt-4o-mini")

     try:
-        processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=
+        processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=True)
         failure_report = await analyze_agent_performance(data['raw_logging_results'], data['results']['failed_tasks'], openai_client)
         data['raw_logging_results'] = processed_calls
         data['failure_report'] = failure_report
utils/viz.py
CHANGED
@@ -174,15 +174,15 @@ def create_flow_chart(steps):
         description = analysis.get('description', 'No description available.')
         assessment = analysis.get('assessment', 'No assessment available.')
         success = analysis.get('success', True) # Assuming True if not specified
-        action_type = analysis.get('action_type', 'other') # Default to 'other' if not specified
+        # action_type = analysis.get('action_type', 'other') # Default to 'other' if not specified
         step_headline = analysis.get('headline', '')

         # Set node color and shape based on attributes
         node_colors.append(color_map[success])
-        node_shapes.append(shape_map.get(action_type, 'circle'))
+        # node_shapes.append(shape_map.get(action_type, 'circle'))

         # Wrap text to improve readability
-        wrapped_description = '<br>'.join(textwrap.wrap(description, width=90, max_lines=
+        wrapped_description = '<br>'.join(textwrap.wrap(description, width=90, max_lines=20))
         wrapped_assessment = '<br>'.join(textwrap.wrap(assessment, width=90, max_lines=10))
         wrapped_outline = textwrap.shorten(step_headline, width=30, placeholder='')
         wrapped_outline = '' if wrapped_outline == '' else f": {wrapped_outline}"
@@ -194,10 +194,10 @@ def create_flow_chart(steps):
         hover_info = f"<b>Step {i+1}{wrapped_outline}</b><br><br>" \
                      f"<b>Description:</b><br>" \
                      f"{wrapped_description}<br><br>" \
-                     f"<b>Assessment:</b><br>" \
-                     f"{wrapped_assessment}<br><br>" \
-                     f"<b>Successful:</b> {'Yes' if success else 'No'}<br>" \
-                     f"<b>Action Type:</b> {action_type.capitalize()}"
+                     # f"<b>Assessment:</b><br>" \
+                     # f"{wrapped_assessment}<br><br>" \
+                     # f"<b>Successful:</b> {'Yes' if success else 'No'}<br>" \
+                     # f"<b>Action Type:</b> {action_type.capitalize()}"
         hover_text.append(hover_info)

         if i > 0:
@@ -214,10 +214,11 @@ def create_flow_chart(steps):
         hoverinfo='text',
         hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
         marker=dict(
-            color=node_colors,
+            # color=node_colors,
+            color='#1b9e77',
             size=30,
             line_width=2,
-            symbol=node_shapes
+            # symbol=node_shapes
         ))

     edge_trace = go.Scatter(
@@ -230,25 +231,25 @@ def create_flow_chart(steps):
     # Create legend traces
     legend_traces = []

-    # Color legend
-    for success, color in color_map.items():
-        legend_traces.append(go.Scatter(
-            x=[None], y=[None],
-            mode='markers',
-            marker=dict(size=10, color=color),
-            showlegend=True,
-            name=f"{'Success' if success else 'Issue'}"
-        ))
+    # # Color legend
+    # for success, color in color_map.items():
+    #     legend_traces.append(go.Scatter(
+    #         x=[None], y=[None],
+    #         mode='markers',
+    #         marker=dict(size=10, color=color),
+    #         showlegend=True,
+    #         name=f"{'Success' if success else 'Issue'}"
+    #     ))

-    # Shape legend
-    for action, shape in shape_map.items():
-        legend_traces.append(go.Scatter(
-            x=[None], y=[None],
-            mode='markers',
-            marker=dict(size=10, symbol=shape, color='gray'),
-            showlegend=True,
-            name=f"{action.capitalize()}"
-        ))
+    # # Shape legend
+    # for action, shape in shape_map.items():
+    #     legend_traces.append(go.Scatter(
+    #         x=[None], y=[None],
+    #         mode='markers',
+    #         marker=dict(size=10, symbol=shape, color='gray'),
+    #         showlegend=True,
+    #         name=f"{action.capitalize()}"
+    #     ))

     # Combine all traces
     all_traces = [edge_trace, node_trace] + legend_traces