Upload 3 files
- utils/db.py +132 -33
- utils/processing.py +14 -13
- utils/viz.py +393 -47
utils/db.py CHANGED

```diff
@@ -6,7 +6,9 @@ from functools import lru_cache
 import threading
 import pandas as pd
 import ast
+from scipy import stats
 import yaml
+import numpy as np
 
 class TracePreprocessor:
     def __init__(self, db_path='preprocessed_traces.db'):
@@ -24,16 +26,20 @@ class TracePreprocessor:
                 CREATE TABLE IF NOT EXISTS preprocessed_traces (
                     benchmark_name TEXT,
                     agent_name TEXT,
+                    date TEXT,
+                    run_id TEXT,
                     raw_logging_results BLOB,
-                    PRIMARY KEY (benchmark_name, agent_name)
+                    PRIMARY KEY (benchmark_name, agent_name, run_id)
                 )
             ''')
             conn.execute('''
                 CREATE TABLE IF NOT EXISTS failure_reports (
                     benchmark_name TEXT,
                     agent_name TEXT,
+                    date TEXT,
+                    run_id TEXT,
                     failure_report BLOB,
-                    PRIMARY KEY (benchmark_name, agent_name)
+                    PRIMARY KEY (benchmark_name, agent_name, run_id)
                 )
             ''')
             conn.execute('''
@@ -41,6 +47,7 @@ class TracePreprocessor:
                     benchmark_name TEXT,
                     agent_name TEXT,
                     date TEXT,
+                    run_id TEXT,
                     successful_tasks TEXT,
                     failed_tasks TEXT,
                     total_cost REAL,
@@ -58,7 +65,7 @@ class TracePreprocessor:
                     amp_parkinsons_disease_progression_prediction_score REAL,
                     cifar10_score REAL,
                     imdb_score REAL,
-                    PRIMARY KEY (benchmark_name, agent_name)
+                    PRIMARY KEY (benchmark_name, agent_name, run_id)
                 )
             ''')
 
@@ -70,15 +77,17 @@ class TracePreprocessor:
                 data = json.load(f)
             agent_name = data['config']['agent_name']
             benchmark_name = data['config']['benchmark_name']
+            date = data['config']['date']
+            config = data['config']
 
             try:
                 raw_logging_results = pickle.dumps(data['raw_logging_results'])
                 with self.get_conn() as conn:
                     conn.execute('''
                         INSERT OR REPLACE INTO preprocessed_traces
-                        (benchmark_name, agent_name, raw_logging_results)
-                        VALUES (?, ?, ?)
-                    ''', (benchmark_name, agent_name, raw_logging_results))
+                        (benchmark_name, agent_name, date, run_id, raw_logging_results)
+                        VALUES (?, ?, ?, ?, ?)
+                    ''', (benchmark_name, agent_name, date, config['run_id'], raw_logging_results))
             except Exception as e:
                 print(f"Error preprocessing raw_logging_results in {file}: {e}")
 
@@ -86,10 +95,10 @@ class TracePreprocessor:
                 failure_report = pickle.dumps(data['failure_report'])
                 with self.get_conn() as conn:
                     conn.execute('''
-                        INSERT
-                        (benchmark_name, agent_name, failure_report)
-                        VALUES (?, ?, ?)
-                    ''', (benchmark_name, agent_name, failure_report))
+                        INSERT INTO failure_reports
+                        (benchmark_name, agent_name, date, run_id, failure_report)
+                        VALUES (?, ?, ?, ? ,?)
+                    ''', (benchmark_name, agent_name, date, config['run_id'], failure_report))
             except Exception as e:
                 print(f"Error preprocessing failure_report in {file}: {e}")
 
@@ -98,13 +107,14 @@ class TracePreprocessor:
                 results = data['results']
                 with self.get_conn() as conn:
                     conn.execute('''
-                        INSERT
-                        (benchmark_name, agent_name, date, successful_tasks, failed_tasks, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score)
-                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                        INSERT INTO parsed_results
+                        (benchmark_name, agent_name, date, run_id, successful_tasks, failed_tasks, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score)
+                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                     ''', (
                         benchmark_name,
                         agent_name,
                         config['date'],
+                        config['run_id'],
                         str(results.get('successful_tasks')),
                         str(results.get('failed_tasks')),
                         results.get('total_cost'),
@@ -129,30 +139,62 @@ class TracePreprocessor:
     @lru_cache(maxsize=100)
     def get_analyzed_traces(self, agent_name, benchmark_name):
         with self.get_conn() as conn:
-            SELECT raw_logging_results FROM preprocessed_traces
+            query = '''
+            SELECT agent_name, raw_logging_results, date FROM preprocessed_traces
             WHERE benchmark_name = ? AND agent_name = ?
-            '''
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))
+
+            # check for each row if raw_logging_results is not None with pickle.loads because it is stored as a byte string
+            df = df[df['raw_logging_results'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]
+
+            if len(df) == 0:
+                return None
+
+            # select latest run
+            df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()
+
+            return pickle.loads(df['raw_logging_results'][0])
 
     @lru_cache(maxsize=100)
     def get_failure_report(self, agent_name, benchmark_name):
         with self.get_conn() as conn:
-            SELECT failure_report FROM failure_reports
+            query = '''
+            SELECT agent_name, date, failure_report FROM failure_reports
             WHERE benchmark_name = ? AND agent_name = ?
-            '''
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))
+
+            # Select only rows for which failure report is not None and None is a string
+            df = df[df['failure_report'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]
+
+            if len(df) == 0:
+                return None
+
+            # if there are multiple failure reports, take the latest one
+            df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()
+
+            return pickle.loads(df['failure_report'][0])
+
+    def _calculate_ci(self, data, confidence=0.95):
+        data = data[np.isfinite(data)]
+
+        if len(data) < 2:
+            return '', '', ''  # No CI for less than 2 samples
+        n = len(data)
+
+        mean = np.mean(data)
+        sem = stats.sem(data)
+
+        ci = stats.t.interval(confidence, n-1, loc=mean, scale=sem)
+        return mean, ci[0], ci[1]
 
-    def get_parsed_results(self, benchmark_name):
+    def get_parsed_results(self, benchmark_name, aggregate=True):
         with self.get_conn() as conn:
             query = '''
             SELECT * FROM parsed_results
@@ -167,13 +209,65 @@ class TracePreprocessor:
             # Add 'Verified' column
             df['Verified'] = df.apply(lambda row: '✓' if (benchmark_name, row['agent_name']) in verified_agents else '', axis=1)
 
+            # Add column for how many times an agent_name appears in the DataFrame
+            df['Runs'] = df.groupby('agent_name')['agent_name'].transform('count')
+
+            # Compute the 95% confidence interval for accuracy and cost for agents that have been run more than once
+            df['acc_ci'] = None
+            df['cost_ci'] = None
+
+            for agent_name in df['agent_name'].unique():
+                agent_df = df[df['agent_name'] == agent_name]
+
+                if len(agent_df) > 1:
+                    accuracy_mean, accuracy_lower, accuracy_upper = self._calculate_ci(agent_df['accuracy'])
+                    cost_mean, cost_lower, cost_upper = self._calculate_ci(agent_df['total_cost'])
+
+                    # format the confidence interval with +/- sign
+                    accuracy_ci = f"± {abs(accuracy_mean - accuracy_lower):.3f}"
+                    cost_ci = f"± {abs(cost_mean - cost_lower):.3f}"
+
+                    df.loc[df['agent_name'] == agent_name, 'acc_ci'] = accuracy_ci
+                    df.loc[df['agent_name'] == agent_name, 'cost_ci'] = cost_ci
+
+            df = df.drop(columns=['successful_tasks', 'failed_tasks', 'run_id'], axis=1)
+
+            if aggregate:
+                # For agents that have been run more than once, compute the average accuracy and cost and use that as the value in the DataFrame
+                df = df.groupby('agent_name').agg({
+                    'date': 'first',
+                    'total_cost': 'mean',
+                    'accuracy': 'mean',
+                    'precision': 'mean',
+                    'recall': 'mean',
+                    'f1_score': 'mean',
+                    'auc': 'mean',
+                    'overall_score': 'mean',
+                    'vectorization_score': 'mean',
+                    'fathomnet_score': 'mean',
+                    'feedback_score': 'mean',
+                    'house_price_score': 'mean',
+                    'spaceship_titanic_score': 'mean',
+                    'amp_parkinsons_disease_progression_prediction_score': 'mean',
+                    'cifar10_score': 'mean',
+                    'imdb_score': 'mean',
+                    'Verified': 'first',
+                    'Runs': 'first',
+                    'acc_ci': 'first',
+                    'cost_ci': 'first'
+                }).reset_index()
+
             # Round float columns to 3 decimal places
             float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc', 'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score', 'house-price_score', 'spaceship-titanic_score', 'amp-parkinsons-disease-progression-prediction_score', 'cifar10_score', 'imdb_score']
             for column in float_columns:
                 if column in df.columns:
                     df[column] = df[column].round(3)
 
+            # sort by accuracy
+            df = df.sort_values('accuracy', ascending=False)
 
             # Rename columns
             df = df.rename(columns={
@@ -193,7 +287,9 @@ class TracePreprocessor:
                 'spaceship_titanic_score': 'Spaceship Titanic Score',
                 'amp_parkinsons_disease_progression_prediction_score': 'AMP Parkinsons Disease Progression Prediction Score',
                 'cifar10_score': 'CIFAR10 Score',
-                'imdb_score': 'IMDB Score'
+                'imdb_score': 'IMDB Score',
+                'acc_ci': 'Accuracy CI',
+                'cost_ci': 'Total Cost CI'
             })
 
             return df
@@ -201,11 +297,14 @@ class TracePreprocessor:
     def get_task_success_data(self, benchmark_name):
         with self.get_conn() as conn:
             query = '''
-            SELECT agent_name, successful_tasks, failed_tasks
+            SELECT agent_name, accuracy, successful_tasks, failed_tasks
             FROM parsed_results
             WHERE benchmark_name = ?
             '''
            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
+
+           # for agent_names that have been run more than once, take the run with the highest accuracy
+           df = df.sort_values('accuracy', ascending=False).groupby('agent_name').first().reset_index()
 
        # Get all unique task IDs
        task_ids = set()
```
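For reference, the interval arithmetic behind the new `_calculate_ci` helper can be checked in isolation. A minimal sketch, assuming only numpy and scipy as imported in the diff above; the `values` array is made-up example data standing in for one agent's per-run accuracies:

```python
import numpy as np
from scipy import stats

values = np.array([0.62, 0.58, 0.65, 0.60])  # hypothetical accuracies from four runs

n = len(values)
mean = np.mean(values)
sem = stats.sem(values)  # sample standard error: std(ddof=1) / sqrt(n)

# 95% CI from the t distribution with n-1 degrees of freedom,
# exactly what stats.t.interval(confidence, n-1, loc=mean, scale=sem) returns
lower, upper = stats.t.interval(0.95, n - 1, loc=mean, scale=sem)

# The half-width is what the leaderboard shows as the "± x.xxx" acc_ci / cost_ci string
print(f"{mean:.3f} ± {abs(mean - lower):.3f}")
```

Because the interval is symmetric around the mean, formatting only `mean - lower` (as `get_parsed_results` does) loses no information.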
utils/processing.py CHANGED

```diff
@@ -10,6 +10,7 @@ import aiosmtplib
 from agent_monitor.monitor import analyze_agent_steps
 from agent_monitor.failure_report import analyze_agent_performance, AsyncOpenAIClient
 import traceback
+from tqdm import tqdm
 
 async def check_and_process_uploads():
     upload_dir = "evals_upload"
@@ -58,12 +59,14 @@ async def check_and_process_uploads():
 
     print(f"Processing {len(unprocessed_uploads)} new uploads.")
     tasks = []
-    for upload in unprocessed_uploads:
+    for upload in tqdm(unprocessed_uploads):
         upload_path = os.path.join(upload_dir, upload)
         processed_path = os.path.join(processed_dir, upload)
-        tasks.append(process_single_upload(upload_path, processed_path))
+        # tasks.append(process_single_upload(upload_path, processed_path)) # for async processing
+        await process_single_upload(upload_path, processed_path)
+
+    # await asyncio.gather(*tasks) # for async processing
 
 
 async def process_single_upload(upload_path, processed_path):
@@ -77,12 +80,9 @@ async def process_single_upload(upload_path, processed_path):
         # Move the file to processed directory
         # await asyncio.to_thread(shutil.move, upload_path, processed_path)
 
-        # Send email notification
-        # await send_email_notification(upload_path.name, check_result, "Processing successful")
     else:
         print(f"Upload check failed for {upload_path}: {check_result['message']}")
-
-        # await send_email_notification(upload_path.name, check_result, "Upload check failed")
+
 
 
 async def check_upload_structure(file_path):
@@ -123,11 +123,11 @@ async def process_upload(input_path, output_path):
     openai_client = AsyncOpenAIClient(model="gpt-4o-mini")
 
     try:
-        processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=
-        data['raw_logging_results']
+        processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=True)
+        failure_report = await analyze_agent_performance(data['raw_logging_results'], data['results']['failed_tasks'], openai_client)
 
+        data['raw_logging_results'] = processed_calls
+        data['failure_report'] = failure_report
     except Exception as e:
         traceback.print_exc()
         print(f"Error in processing: {str(e)}")
@@ -163,4 +163,5 @@ async def send_email_notification(filename, check_result, status):
         use_tls=True,
         username=sender_email,
         password=password
-    )
+    )
+
```
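The loop change above trades concurrency for a visible progress bar: uploads are now awaited one at a time inside a `tqdm` loop, with the `asyncio.gather` variant kept as comments. A minimal self-contained sketch of the two scheduling styles, where the hypothetical `work` coroutine stands in for `process_single_upload`:

```python
import asyncio
from tqdm import tqdm

async def work(item: str) -> None:
    # placeholder for I/O-bound processing of one upload
    await asyncio.sleep(0.1)

async def sequential(items):
    # One item at a time, as in the new code: simpler error isolation,
    # and tqdm can report per-item progress.
    for item in tqdm(items):
        await work(item)

async def concurrent(items):
    # All items at once, as in the commented-out gather variant:
    # faster for independent I/O, but no per-item progress bar.
    await asyncio.gather(*(work(item) for item in items))

items = [f"upload_{i}.json" for i in range(3)]
asyncio.run(sequential(items))
asyncio.run(concurrent(items))
```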
utils/viz.py CHANGED

```diff
@@ -3,8 +3,29 @@ import plotly.express as px
 from utils.pareto import Agent, compute_pareto_frontier
 import plotly.graph_objects as go
 import textwrap
+import numpy as np
+import pandas as pd
+from scipy.stats import chi2
+from scipy import stats
+
+
+def create_leaderboard(df, ci_metrics = None):
+    # cast dtypes to string
+    df = df.astype(str)
+
+    # for each metric join metric and metric CI columns
+    if ci_metrics:
+        for metric in ci_metrics:
+            CI_metric = metric + ' CI'
+            # for rows in the df for which CI metric is not None, join the metric and CI columns by looping through the CI metrics columns
+            for i, row in df.iterrows():
+                if str(row[CI_metric]) != 'None':
+                    df.at[i, metric] = str(row[metric]) + " (" + str(row[CI_metric]) + ")"
+
+    return df
 
 def create_task_success_heatmap(df, benchmark_name):
+
     # Calculate agent accuracy
     agent_accuracy = df.groupby('Agent Name')['Success'].mean().sort_values(ascending=False)
 
@@ -17,16 +38,30 @@ def create_task_success_heatmap(df, benchmark_name):
     # Sort the pivot table
     pivot_df = pivot_df.reindex(index=agent_accuracy.index, columns=task_success_rate.index)
 
+    # Calculate tasks solved across all agents
+    tasks_solved = (pivot_df.sum(axis=0) > 0).astype(int)
+    # Total number of tasks (columns)
+    total_tasks = len(pivot_df.columns)
+
+    # Add the new row to the pivot table
+    tasks_solved_df = pd.DataFrame(tasks_solved).T
+    tasks_solved_df.index = [f'<b>Tasks Solved: {tasks_solved.sum()}/{total_tasks} (All Agents)</b>']
+    # print number of tasks solved
+    pivot_df = pd.concat([pivot_df, tasks_solved_df])
+
     num_agents = len(pivot_df.index)
     row_height = 30  # Fixed height for each row in pixels
     total_height = num_agents * row_height
 
+    # Create a custom colorscale
+    colorscale=[[0, 'white'], [1, '#3498db']]
+
     # Create the heatmap
     fig = go.Figure(data=go.Heatmap(
         z=pivot_df.values,
         y=pivot_df.index,
         x=pivot_df.columns,
-        colorscale=
+        colorscale=colorscale,
         showscale=False,
         hovertemplate='<b>Agent:</b> %{y}<br>' +
                       '<b>Task:</b> %{x}<br>' +
@@ -36,18 +71,17 @@ def create_task_success_heatmap(df, benchmark_name):
     # Update the layout
     fig.update_layout(
         xaxis_title='Task ID',
-        height=total_height,
-        # width=1150,
+        height=total_height + 50,  # Add extra space for the new row
         yaxis=dict(
             autorange='reversed',
-            showticklabels=True,
+            showticklabels=True,
             showline=True,
             linecolor='black',
             showgrid=False
         ),
         xaxis=dict(
             side='top',
-            showticklabels=False,
+            showticklabels=False,
             showline=True,
             linecolor='black',
             showgrid=False
@@ -136,65 +170,173 @@ def create_bar_chart(categories, values, x_label, y_label, title):
     return fig
 
 def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
-    agents = [Agent(row['Total Cost'], row['Accuracy']) for i, row in df.iterrows()]
+    # agents = [Agent(row['Total Cost'], row['Accuracy']) for i, row in df.iterrows()]
+    # instead of creating one Agent object for each row, we can create one Agent object for each unique agent and use the mean of the cost and accuracy values
+    unique_agents = df['Agent Name'].unique()
+    agents = [Agent(df[df['Agent Name'] == agent]['Total Cost'].mean(), df[df['Agent Name'] == agent]['Accuracy'].mean()) for agent in unique_agents]
+
     pareto_frontier = compute_pareto_frontier(agents)
 
-    fig =
-        x=x,
-        y=y,
-        custom_data=hover_data)
-    fig.update_traces(
-        hovertemplate="<br>".join([
-            "<b>Agent</b>: %{customdata[0]}",
-            "<b>Total Cost</b>: $%{x:.1f}",
-            "<b>Accuracy</b>: %{y:.1%}",
-        ])
-    )
-
-    fig.update_traces(marker=dict(size=10, color='#3498db'),
-        hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),)
-
+    fig = go.Figure()
 
     # Sort the Pareto frontier points by x-coordinate
     pareto_points = sorted([(agent.total_cost, agent.accuracy) for agent in pareto_frontier], key=lambda x: x[0])
-
     # Add the Pareto frontier line
     fig.add_trace(go.Scatter(
         x=[point[0] for point in pareto_points],
         y=[point[1] for point in pareto_points],
         mode='lines',
         name='Pareto Frontier',
+        hoverinfo=None,
         line=dict(color='black', width=1, dash='dash')
     ))
 
+    # Plot scatter points and error bars for each agent
+    unique_agents = df[hover_data[0]].unique()
+    for agent in unique_agents:
+        agent_data = df[df[hover_data[0]] == agent]
+
+        x_value = [np.mean(agent_data[x].values)]
+        y_value = [np.mean(agent_data[y].values)]
+
+        if len(agent_data) > 1:
+            # Calculate 95% confidence intervals
+            ci_x = stats.t.interval(0.95, len(agent_data[x])-1, loc=np.mean(agent_data[x]), scale=stats.sem(agent_data[x]))
+            ci_y = stats.t.interval(0.95, len(agent_data[y])-1, loc=np.mean(agent_data[y]), scale=stats.sem(agent_data[y]))
+
+            # # Add error bars for x (cost)
+            # fig.add_trace(go.Scatter(
+            #     x=x_value,
+            #     y=y_value,
+            #     error_x=dict(
+            #         type='data',
+            #         symmetric=False,
+            #         array=[ci_x[1] - x_value],
+            #         arrayminus=[x_value - ci_x[0]],
+            #         color='red',
+            #     ),
+            #     mode='markers',
+            #     marker=dict(color='rgba(0,0,0,0)'),
+            #     showlegend=False,
+            #     hoverinfo='none'
+            # ))
+
+            # # Add error bars for y (accuracy)
+            # fig.add_trace(go.Scatter(
+            #     x=x_value,
+            #     y=y_value,
+            #     error_y=dict(
+            #         type='data',
+            #         symmetric=False,
+            #         array=[ci_y[1] - y_value],
+            #         arrayminus=[y_value - ci_y[0]],
+            #         color='green',
+            #     ),
+            #     mode='markers',
+            #     marker=dict(color='rgba(0,0,0,0)'),
+            #     showlegend=False,
+            #     hoverinfo='none'
+            # ))
+
+        # Add error bars for x (cost minmax)
+        fig.add_trace(go.Scatter(
+            x=x_value,
+            y=y_value,
+            error_x=dict(
+                type='data',
+                symmetric=False,
+                array=[np.max(agent_data[x]) - x_value],
+                arrayminus=[x_value - np.min(agent_data[x])],
+                color='#fec44f',
+            ),
+            mode='markers',
+            marker=dict(color='rgba(0,0,0,0)', opacity=0),
+            showlegend=False,
+            hoverinfo=None
+        ))
+
+        # Add error bars for y (accuracy minmax)
+        fig.add_trace(go.Scatter(
+            x=x_value,
+            y=y_value,
+            error_y=dict(
+                type='data',
+                symmetric=False,
+                array=[np.max(agent_data[y]) - y_value],
+                arrayminus=[y_value - np.min(agent_data[y])],
+                color='#bdbdbd',
+            ),
+            mode='markers',
+            marker=dict(color='rgba(0,0,0,0)', opacity=0),
+            showlegend=False,
+            hoverinfo=None
+        ))
+
+        # Add scatter points for this agent
+        fig.add_trace(go.Scatter(
+            x=x_value,
+            y=y_value,
+            mode='markers',
+            marker=dict(size=10, color='#3498db'),
+            customdata=agent_data[hover_data],
+            showlegend=False,
+            hovertemplate="<br>".join([
+                "<b>Agent</b>: %{customdata[0]}",
+                "<b>Total Cost</b>: $%{x:.1f}",
+                "<b>Accuracy</b>: %{y:.1%}<extra></extra>",
+            ]),
+            hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
+        ))
+
+    # Add legend entries for error bars
+    # fig.add_trace(go.Scatter(
+    #     x=[None], y=[None], mode='markers',
+    #     marker=dict(color='red', size=10),
+    #     name='Cost CI (95%)'
+    # ))
+    # fig.add_trace(go.Scatter(
+    #     x=[None], y=[None], mode='markers',
+    #     marker=dict(color='green', size=10),
+    #     name='Accuracy CI (95%)'
+    # ))
+
+    # Add legend entries for error bars
+    fig.add_trace(go.Scatter(
+        x=[None], y=[None], mode='markers',
+        marker=dict(color='#fec44f', size=10),
+        name='Cost CI (Min-Max)'
+    ))
+    fig.add_trace(go.Scatter(
+        x=[None], y=[None], mode='markers',
+        marker=dict(color='#bdbdbd', size=10),
+        name='Accuracy CI (Min-Max)'
+    ))
 
     fig.update_layout(
-        x=0.98,
-        bgcolor="rgba(255, 255, 255, 0.5)" # semi-transparent white background
+        height = 600,
+        xaxis_title = x_label,
+        yaxis_title = y_label,
+        xaxis = dict(
+            showline = True,
+            linecolor = 'black',
+            showgrid = False),
+        yaxis = dict(
+            showline = True,
+            showgrid = False,
+            linecolor = 'black'),
+        plot_bgcolor = 'white',
+        legend=dict(
+            yanchor="bottom",
+            y=0.01,
+            xanchor="right",
+            x=0.98,
+            bgcolor="rgba(255, 255, 255, 0.5)" # semi-transparent white background
         ),
+        modebar=dict(
             activecolor='#1f77b4', # Color of active tool
-            orientation='h', #
+            orientation='h', # Horizontal orientation
             bgcolor='rgba(255,255,255,0.8)', # Slightly transparent white background
             color='#777', # Color of inactive tools
             add = ['pan2d'],
@@ -211,9 +353,213 @@ def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str =
             'select2d',
             'select']
         ),
+        dragmode='pan'
     )
+
+    fig.update_yaxes(rangemode="tozero")
+    fig.update_xaxes(rangemode="tozero")
+
     return fig
+# def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
+#     agents = [Agent(row['Total Cost'], row['Accuracy']) for i, row in df.iterrows()]
+#     pareto_frontier = compute_pareto_frontier(agents)
+
+#     fig = go.Figure()
+
+#     # Function to generate points for error ellipse
+#     def error_ellipse(x_center, y_center, x_radius, y_radius, angle, n=50):
+#         t = np.linspace(0, 2*np.pi, n)
+#         x = x_radius * np.cos(t)
+#         y = y_radius * np.sin(t)
+#         rotation = np.array([[np.cos(angle), -np.sin(angle)],
+#                              [np.sin(angle), np.cos(angle)]])
+#         xy = np.dot(rotation, np.array([x, y]))
+#         return x_center + xy[0], y_center + xy[1]
+
+#     # Create a color map for agents
+#     unique_agents = df['Agent Name'].unique()
+#     colors = px.colors.qualitative.Plotly
+#     color_map = {agent: colors[i % len(colors)] for i, agent in enumerate(unique_agents)}
+
+#     # Add scatter points and error ellipses for each agent
+#     for agent in unique_agents:
+#         agent_data = df[df['Agent Name'] == agent]
+
+#         # Add scatter points
+#         fig.add_trace(go.Scatter(
+#             x=agent_data[x],
+#             y=agent_data[y],
+#             mode='markers',
+#             name=agent,
+#             marker=dict(size=10, color=color_map[agent]),
+#             customdata=agent_data[hover_data] if hover_data else None,
+#             hovertemplate="<br>".join([
+#                 f"<b>Agent</b>: {agent}",
+#                 f"<b>{x}</b>: ${{x:.1f}}",
+#                 f"<b>{y}</b>: {{y:.1%}}",
+#             ] + ([f"<b>{col}</b>: {{customdata[{i}]}}" for i, col in enumerate(hover_data)] if hover_data else []))
+#         ))
+
+#         # Calculate mean and standard deviation for x and y
+#         x_mean = agent_data[x].mean()
+#         y_mean = agent_data[y].mean()
+#         x_std = agent_data[x].std()
+#         y_std = agent_data[y].std()
+
+#         # Calculate correlation coefficient
+#         corr = agent_data[x].corr(agent_data[y])
+
+#         # Add error ellipses (1 and 2 standard deviations)
+#         for n_std, opacity in [(1, 0.5), (2, 0.5)]:
+#             chi2_val = chi2.ppf(0.68 if n_std == 1 else 0.95, 2)
+#             x_radius = np.sqrt(chi2_val) * x_std
+#             y_radius = np.sqrt(chi2_val) * y_std
+#             angle = np.arctan2(y_std * corr, x_std)
+
+#             ellipse_x, ellipse_y = error_ellipse(x_mean, y_mean, x_radius, y_radius, angle)
+
+#             fig.add_shape(type="path",
+#                 path=f"M {ellipse_x[0]}, {ellipse_y[0]} " +
+#                      " ".join([f"L{x},{y}" for x, y in zip(ellipse_x[1:], ellipse_y[1:])]) +
+#                      " Z",
+#                 line_color=color_map[agent],
+#                 line_width=2,
+#                 opacity=opacity,
+#                 layer="below")
+
+#     # Sort the Pareto frontier points by x-coordinate
+#     pareto_points = sorted([(agent.total_cost, agent.accuracy) for agent in pareto_frontier], key=lambda x: x[0])
+
+#     # Add the Pareto frontier line
+#     fig.add_trace(go.Scatter(
+#         x=[point[0] for point in pareto_points],
+#         y=[point[1] for point in pareto_points],
+#         mode='lines',
+#         name='Pareto Frontier',
+#         line=dict(color='black', width=1, dash='dash')
+#     ))
+
+#     fig.update_layout(
+#         height = 600,
+#         xaxis_title = x_label,
+#         yaxis_title = y_label,
+#         xaxis = dict(
+#             showline = True,
+#             linecolor = 'black',
+#             showgrid = False),
+#         yaxis = dict(
+#             showline = True,
+#             showgrid = False,
+#             linecolor = 'black'),
+#         plot_bgcolor = 'white',
+#         legend=dict(
+#             yanchor="bottom",
+#             y=0.01,
+#             xanchor="right",
+#             x=0.98,
+#             bgcolor="rgba(255, 255, 255, 0.5)"
+#         ),
+#         modebar=dict(
+#             activecolor='#1f77b4',
+#             orientation='h',
+#             bgcolor='rgba(255,255,255,0.8)',
+#             color='#777',
+#             add = ['pan2d'],
+#             remove = [
+#                 'zoom2d', 'zoomIn2d', 'zoomOut2d', 'resetScale2d',
+#                 'hoverClosestCartesian', 'hoverCompareCartesian',
+#                 'toggleSpikelines', 'lasso2d', 'lasso',
+#                 'select2d', 'select'
+#             ]
+#         ),
+#         dragmode='pan'
+#     )
+
+#     fig.update_yaxes(rangemode="tozero")
+#     fig.update_xaxes(rangemode="tozero")
+
+#     return fig
+
+# def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
+#     agents = [Agent(row['Total Cost'], row['Accuracy']) for i, row in df.iterrows()]
+#     pareto_frontier = compute_pareto_frontier(agents)
+
+#     fig = px.scatter(df,
+#                      x=x,
+#                      y=y,
+#                      custom_data=hover_data)
+#     fig.update_traces(
+#         hovertemplate="<br>".join([
+#             "<b>Agent</b>: %{customdata[0]}",
+#             "<b>Total Cost</b>: $%{x:.1f}",
+#             "<b>Accuracy</b>: %{y:.1%}",
+#         ])
+#     )
+
+#     fig.update_traces(marker=dict(size=10, color='#3498db'),
+#                       hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),)
+
+
+#     # Sort the Pareto frontier points by x-coordinate
+#     pareto_points = sorted([(agent.total_cost, agent.accuracy) for agent in pareto_frontier], key=lambda x: x[0])
+
+#     # Add the Pareto frontier line
+#     fig.add_trace(go.Scatter(
+#         x=[point[0] for point in pareto_points],
+#         y=[point[1] for point in pareto_points],
+#         mode='lines',
+#         name='Pareto Frontier',
+#         line=dict(color='black', width=1, dash='dash')
+#     ))
+
+#     fig.update_layout(
+#         # width = 1150,
+#         height = 600,
+#         xaxis_title = x_label,
+#         yaxis_title = y_label,
+#         xaxis = dict(
+#             showline = True,
+#             linecolor = 'black',
+#             showgrid = False),
+#         yaxis = dict(
+#             showline = True,
+#             showgrid = False,
+#             linecolor = 'black'),
+#         plot_bgcolor = 'white',
+#         # Legend positioning
+#         legend=dict(
+#             yanchor="bottom",
+#             y=0.01,
+#             xanchor="right",
+#             x=0.98,
+#             bgcolor="rgba(255, 255, 255, 0.5)" # semi-transparent white background
+#         ),
+#         modebar=dict(
+#             activecolor='#1f77b4', # Color of active tool
+#             orientation='h', # Vertical orientation
+#             bgcolor='rgba(255,255,255,0.8)', # Slightly transparent white background
+#             color='#777', # Color of inactive tools
+#             add = ['pan2d'],
+#             remove = [
+#                 'zoom2d',
+#                 'zoomIn2d',
+#                 'zoomOut2d',
+#                 'resetScale2d',
+#                 'hoverClosestCartesian',
+#                 'hoverCompareCartesian',
+#                 'toggleSpikelines',
+#                 'lasso2d',
+#                 'lasso',
+#                 'select2d',
+#                 'select']
+#         ),
+#         dragmode='pan'
+#     )
+
+#     fig.update_yaxes(rangemode="tozero")
+#     fig.update_xaxes(rangemode="tozero")
+
+#     return fig
 
 
 import plotly.graph_objects as go
```
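The min-max "error bars" in the new `create_scatter_plot` use a common Plotly pattern: a fully transparent marker trace contributes only asymmetric `error_x`/`error_y` whiskers, and a separate visible trace draws the mean point on top. A standalone sketch of that technique with made-up numbers (not the app's code):

```python
import numpy as np
import plotly.graph_objects as go

accuracy = np.array([0.52, 0.61, 0.57])  # hypothetical accuracies from three runs
cost = np.array([3.2, 4.1, 3.6])         # hypothetical total costs from the same runs

x_value, y_value = [cost.mean()], [accuracy.mean()]

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=x_value, y=y_value,
    error_y=dict(
        type='data', symmetric=False,
        array=[accuracy.max() - y_value[0]],       # whisker up to the best run
        arrayminus=[y_value[0] - accuracy.min()],  # whisker down to the worst run
        color='#bdbdbd',
    ),
    mode='markers',
    marker=dict(color='rgba(0,0,0,0)', opacity=0),  # hide the marker, keep the bars
    showlegend=False,
))
# visible mean point drawn as its own trace, so the bars never tint the marker
fig.add_trace(go.Scatter(x=x_value, y=y_value, mode='markers',
                         marker=dict(size=10, color='#3498db'), showlegend=False))
fig.show()
```

Splitting bars and point into two traces also lets the commit attach the hover template only to the visible point, so the whisker traces stay silent.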