benediktstroebl committed
Commit de321fc · verified · 1 parent: a60497d

Upload 3 files

Files changed (3):
  1. utils/db.py +132 -33
  2. utils/processing.py +14 -13
  3. utils/viz.py +393 -47
utils/db.py CHANGED
@@ -6,7 +6,9 @@ from functools import lru_cache
 import threading
 import pandas as pd
 import ast
+from scipy import stats
 import yaml
+import numpy as np
 
 class TracePreprocessor:
     def __init__(self, db_path='preprocessed_traces.db'):
@@ -24,16 +26,20 @@
                 CREATE TABLE IF NOT EXISTS preprocessed_traces (
                     benchmark_name TEXT,
                     agent_name TEXT,
+                    date TEXT,
+                    run_id TEXT,
                     raw_logging_results BLOB,
-                    PRIMARY KEY (benchmark_name, agent_name)
+                    PRIMARY KEY (benchmark_name, agent_name, run_id)
                 )
             ''')
             conn.execute('''
                 CREATE TABLE IF NOT EXISTS failure_reports (
                     benchmark_name TEXT,
                     agent_name TEXT,
+                    date TEXT,
+                    run_id TEXT,
                     failure_report BLOB,
-                    PRIMARY KEY (benchmark_name, agent_name)
+                    PRIMARY KEY (benchmark_name, agent_name, run_id)
                 )
             ''')
             conn.execute('''
@@ -41,6 +47,7 @@
                     benchmark_name TEXT,
                     agent_name TEXT,
                     date TEXT,
+                    run_id TEXT,
                     successful_tasks TEXT,
                     failed_tasks TEXT,
                     total_cost REAL,
@@ -58,7 +65,7 @@
                     amp_parkinsons_disease_progression_prediction_score REAL,
                     cifar10_score REAL,
                     imdb_score REAL,
-                    PRIMARY KEY (benchmark_name, agent_name)
+                    PRIMARY KEY (benchmark_name, agent_name, run_id)
                 )
             ''')
 
@@ -70,15 +77,17 @@
                 data = json.load(f)
             agent_name = data['config']['agent_name']
             benchmark_name = data['config']['benchmark_name']
+            date = data['config']['date']
+            config = data['config']
 
             try:
                 raw_logging_results = pickle.dumps(data['raw_logging_results'])
                 with self.get_conn() as conn:
                     conn.execute('''
                         INSERT OR REPLACE INTO preprocessed_traces
-                        (benchmark_name, agent_name, raw_logging_results)
-                        VALUES (?, ?, ?)
-                    ''', (benchmark_name, agent_name, raw_logging_results))
+                        (benchmark_name, agent_name, date, run_id, raw_logging_results)
+                        VALUES (?, ?, ?, ?, ?)
+                    ''', (benchmark_name, agent_name, date, config['run_id'], raw_logging_results))
             except Exception as e:
                 print(f"Error preprocessing raw_logging_results in {file}: {e}")
 
@@ -86,10 +95,10 @@
                 failure_report = pickle.dumps(data['failure_report'])
                 with self.get_conn() as conn:
                     conn.execute('''
-                        INSERT OR REPLACE INTO failure_reports
-                        (benchmark_name, agent_name, failure_report)
-                        VALUES (?, ?, ?)
-                    ''', (benchmark_name, agent_name, failure_report))
+                        INSERT INTO failure_reports
+                        (benchmark_name, agent_name, date, run_id, failure_report)
+                        VALUES (?, ?, ?, ?, ?)
+                    ''', (benchmark_name, agent_name, date, config['run_id'], failure_report))
             except Exception as e:
                 print(f"Error preprocessing failure_report in {file}: {e}")
 
@@ -98,13 +107,14 @@
                 results = data['results']
                 with self.get_conn() as conn:
                     conn.execute('''
-                        INSERT OR REPLACE INTO parsed_results
-                        (benchmark_name, agent_name, date, successful_tasks, failed_tasks, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score)
-                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                        INSERT INTO parsed_results
+                        (benchmark_name, agent_name, date, run_id, successful_tasks, failed_tasks, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score)
+                        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                     ''', (
                         benchmark_name,
                         agent_name,
                         config['date'],
+                        config['run_id'],
                         str(results.get('successful_tasks')),
                         str(results.get('failed_tasks')),
                         results.get('total_cost'),
@@ -129,30 +139,62 @@
     @lru_cache(maxsize=100)
     def get_analyzed_traces(self, agent_name, benchmark_name):
         with self.get_conn() as conn:
-            cursor = conn.cursor()
-            cursor.execute('''
-                SELECT raw_logging_results FROM preprocessed_traces
+            query = '''
+                SELECT agent_name, raw_logging_results, date FROM preprocessed_traces
                 WHERE benchmark_name = ? AND agent_name = ?
-            ''', (benchmark_name, agent_name))
-            result = cursor.fetchone()
-            if result:
-                return pickle.loads(result[0])
-            return None
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))
+
+            # Drop rows whose raw_logging_results unpickle to None (the blob is a byte string, so check via pickle.loads)
+            df = df[df['raw_logging_results'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]
+
+            if len(df) == 0:
+                return None
+
+            # Select the latest run
+            df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()
+
+            return pickle.loads(df['raw_logging_results'][0])
+
 
     @lru_cache(maxsize=100)
     def get_failure_report(self, agent_name, benchmark_name):
         with self.get_conn() as conn:
-            cursor = conn.cursor()
-            cursor.execute('''
-                SELECT failure_report FROM failure_reports
+            query = '''
+                SELECT agent_name, date, failure_report FROM failure_reports
                 WHERE benchmark_name = ? AND agent_name = ?
-            ''', (benchmark_name, agent_name))
-            result = cursor.fetchone()
-            if result:
-                return pickle.loads(result[0])
-            return None
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name, agent_name))
+
+            # Keep only rows whose failure report is not None (a missing report may be stored as the string 'None')
+            df = df[df['failure_report'].apply(lambda x: pickle.loads(x) is not None and x != 'None')]
+
+            if len(df) == 0:
+                return None
+
+            # If there are multiple failure reports, take the latest one
+            df = df.sort_values('date', ascending=False).groupby('agent_name').first().reset_index()
+
+            return pickle.loads(df['failure_report'][0])
+
+    def _calculate_ci(self, data, confidence=0.95):
+        data = data[np.isfinite(data)]
+
+        if len(data) < 2:
+            return '', '', ''  # No CI for fewer than 2 samples
+        n = len(data)
+
+        mean = np.mean(data)
+        sem = stats.sem(data)
+
+        ci = stats.t.interval(confidence, n-1, loc=mean, scale=sem)
+        return mean, ci[0], ci[1]
 
-    def get_parsed_results(self, benchmark_name):
+    def get_parsed_results(self, benchmark_name, aggregate=True):
         with self.get_conn() as conn:
             query = '''
                 SELECT * FROM parsed_results
@@ -167,13 +209,65 @@
             # Add 'Verified' column
             df['Verified'] = df.apply(lambda row: '✓' if (benchmark_name, row['agent_name']) in verified_agents else '', axis=1)
 
+            # Add a column counting how many times an agent_name appears in the DataFrame
+            df['Runs'] = df.groupby('agent_name')['agent_name'].transform('count')
+
+            # Compute the 95% confidence interval for accuracy and cost for agents that have been run more than once
+            df['acc_ci'] = None
+            df['cost_ci'] = None
+
+            for agent_name in df['agent_name'].unique():
+                agent_df = df[df['agent_name'] == agent_name]
+
+                if len(agent_df) > 1:
+                    accuracy_mean, accuracy_lower, accuracy_upper = self._calculate_ci(agent_df['accuracy'])
+                    cost_mean, cost_lower, cost_upper = self._calculate_ci(agent_df['total_cost'])
+
+                    # Format the confidence interval with a +/- sign
+                    accuracy_ci = f"± {abs(accuracy_mean - accuracy_lower):.3f}"
+                    cost_ci = f"± {abs(cost_mean - cost_lower):.3f}"
+
+                    df.loc[df['agent_name'] == agent_name, 'acc_ci'] = accuracy_ci
+                    df.loc[df['agent_name'] == agent_name, 'cost_ci'] = cost_ci
+
+            df = df.drop(columns=['successful_tasks', 'failed_tasks', 'run_id'], axis=1)
+
+            if aggregate:
+                # For agents that have been run more than once, use the average accuracy and cost as the value in the DataFrame
+                df = df.groupby('agent_name').agg({
+                    'date': 'first',
+                    'total_cost': 'mean',
+                    'accuracy': 'mean',
+                    'precision': 'mean',
+                    'recall': 'mean',
+                    'f1_score': 'mean',
+                    'auc': 'mean',
+                    'overall_score': 'mean',
+                    'vectorization_score': 'mean',
+                    'fathomnet_score': 'mean',
+                    'feedback_score': 'mean',
+                    'house_price_score': 'mean',
+                    'spaceship_titanic_score': 'mean',
+                    'amp_parkinsons_disease_progression_prediction_score': 'mean',
+                    'cifar10_score': 'mean',
+                    'imdb_score': 'mean',
+                    'Verified': 'first',
+                    'Runs': 'first',
+                    'acc_ci': 'first',
+                    'cost_ci': 'first'
+                }).reset_index()
+
             # Round float columns to 3 decimal places
             float_columns = ['total_cost', 'accuracy', 'precision', 'recall', 'f1_score', 'auc', 'overall_score', 'vectorization_score', 'fathomnet_score', 'feedback_score', 'house-price_score', 'spaceship-titanic_score', 'amp-parkinsons-disease-progression-prediction_score', 'cifar10_score', 'imdb_score']
             for column in float_columns:
                 if column in df.columns:
                     df[column] = df[column].round(3)
 
-            df = df.drop(columns=['successful_tasks', 'failed_tasks'], axis=1)
+            # Sort by accuracy
+            df = df.sort_values('accuracy', ascending=False)
 
             # Rename columns
             df = df.rename(columns={
@@ -193,7 +287,9 @@
                 'spaceship_titanic_score': 'Spaceship Titanic Score',
                 'amp_parkinsons_disease_progression_prediction_score': 'AMP Parkinsons Disease Progression Prediction Score',
                 'cifar10_score': 'CIFAR10 Score',
-                'imdb_score': 'IMDB Score'
+                'imdb_score': 'IMDB Score',
+                'acc_ci': 'Accuracy CI',
+                'cost_ci': 'Total Cost CI'
             })
 
             return df
@@ -201,11 +297,14 @@
     def get_task_success_data(self, benchmark_name):
         with self.get_conn() as conn:
             query = '''
-                SELECT agent_name, successful_tasks, failed_tasks
+                SELECT agent_name, accuracy, successful_tasks, failed_tasks
                 FROM parsed_results
                 WHERE benchmark_name = ?
            '''
            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
+
+            # For agent_names that have been run more than once, take the run with the highest accuracy
+            df = df.sort_values('accuracy', ascending=False).groupby('agent_name').first().reset_index()
 
             # Get all unique task IDs
             task_ids = set()
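
Note on the new CI columns: _calculate_ci computes a two-sided Student-t confidence interval over an agent's per-run metrics, and get_parsed_results stores one side of it as a "± 0.xxx" string. A minimal standalone sketch of the same computation (the per-run accuracies below are hypothetical, not from this commit):

    import numpy as np
    from scipy import stats

    accuracies = np.array([0.52, 0.55, 0.49])  # hypothetical per-run accuracies for one agent
    n = len(accuracies)
    mean = np.mean(accuracies)
    sem = stats.sem(accuracies)                # standard error of the mean
    lower, upper = stats.t.interval(0.95, n - 1, loc=mean, scale=sem)
    print(f"{mean:.3f} ± {abs(mean - lower):.3f}")  # same "± x.xxx" format as acc_ci/cost_ci

Because the t-interval is symmetric around the mean, abs(mean - lower) equals abs(upper - mean), which is why the diff only formats one side of the interval.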
utils/processing.py CHANGED
@@ -10,6 +10,7 @@ import aiosmtplib
 from agent_monitor.monitor import analyze_agent_steps
 from agent_monitor.failure_report import analyze_agent_performance, AsyncOpenAIClient
 import traceback
+from tqdm import tqdm
 
 async def check_and_process_uploads():
     upload_dir = "evals_upload"
@@ -58,12 +59,14 @@
 
     print(f"Processing {len(unprocessed_uploads)} new uploads.")
     tasks = []
-    for upload in unprocessed_uploads:
+    for upload in tqdm(unprocessed_uploads):
         upload_path = os.path.join(upload_dir, upload)
         processed_path = os.path.join(processed_dir, upload)
-        tasks.append(process_single_upload(upload_path, processed_path))
-
-    await asyncio.gather(*tasks)
+        # tasks.append(process_single_upload(upload_path, processed_path)) # for async processing
+        await process_single_upload(upload_path, processed_path)
+
+
+    # await asyncio.gather(*tasks) # for async processing
 
 
 async def process_single_upload(upload_path, processed_path):
@@ -77,12 +80,9 @@
         # Move the file to processed directory
         # await asyncio.to_thread(shutil.move, upload_path, processed_path)
 
-        # Send email notification
-        # await send_email_notification(upload_path.name, check_result, "Processing successful")
     else:
         print(f"Upload check failed for {upload_path}: {check_result['message']}")
-        # Send email notification about the failed check
-        # await send_email_notification(upload_path.name, check_result, "Upload check failed")
+
 
 
 async def check_upload_structure(file_path):
@@ -123,11 +123,11 @@
     openai_client = AsyncOpenAIClient(model="gpt-4o-mini")
 
     try:
-        processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=False)
-        data['raw_logging_results'] = processed_calls
+        processed_calls = await analyze_agent_steps(data['raw_logging_results'], openai_client, llm_eval=True)
+        failure_report = await analyze_agent_performance(data['raw_logging_results'], data['results']['failed_tasks'], openai_client)
 
-        # failure_report = await analyze_agent_performance(data['raw_logging_results'], data['results']['failed_tasks'], openai_client)
-        # data['failure_report'] = None
+        data['raw_logging_results'] = processed_calls
+        data['failure_report'] = failure_report
     except Exception as e:
         traceback.print_exc()
        print(f"Error in processing: {str(e)}")
@@ -163,4 +163,5 @@
         use_tls=True,
         username=sender_email,
         password=password
-    )
+    )
+
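
The upload loop now awaits each file in turn behind a tqdm progress bar instead of fanning out with asyncio.gather; the commented-out lines keep the concurrent path available. A minimal sketch of the trade-off between the two patterns (the process coroutine is a hypothetical stand-in for process_single_upload):

    import asyncio
    from tqdm import tqdm

    async def process(item):
        # hypothetical stand-in for process_single_upload
        await asyncio.sleep(0.1)

    async def sequential(items):
        # one at a time: per-file progress, no burst of concurrent LLM calls
        for item in tqdm(items):
            await process(item)

    async def concurrent(items):
        # all at once: better wall-clock time, but unbounded concurrency
        await asyncio.gather(*(process(i) for i in items))

    asyncio.run(sequential(range(5)))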
utils/viz.py CHANGED
@@ -3,8 +3,29 @@ import plotly.express as px
 from utils.pareto import Agent, compute_pareto_frontier
 import plotly.graph_objects as go
 import textwrap
+import numpy as np
+import pandas as pd
+from scipy.stats import chi2
+from scipy import stats
+
+
+def create_leaderboard(df, ci_metrics=None):
+    # Cast dtypes to string
+    df = df.astype(str)
+
+    # For each metric, join the metric and metric CI columns
+    if ci_metrics:
+        for metric in ci_metrics:
+            CI_metric = metric + ' CI'
+            # Where the CI cell is not 'None', append the CI to the metric value
+            for i, row in df.iterrows():
+                if str(row[CI_metric]) != 'None':
+                    df.at[i, metric] = str(row[metric]) + " (" + str(row[CI_metric]) + ")"
+
+    return df
 
 def create_task_success_heatmap(df, benchmark_name):
+
     # Calculate agent accuracy
     agent_accuracy = df.groupby('Agent Name')['Success'].mean().sort_values(ascending=False)
 
@@ -17,16 +38,30 @@
     # Sort the pivot table
     pivot_df = pivot_df.reindex(index=agent_accuracy.index, columns=task_success_rate.index)
 
+    # Calculate tasks solved across all agents
+    tasks_solved = (pivot_df.sum(axis=0) > 0).astype(int)
+    # Total number of tasks (columns)
+    total_tasks = len(pivot_df.columns)
+
+    # Add the new row to the pivot table
+    tasks_solved_df = pd.DataFrame(tasks_solved).T
+    tasks_solved_df.index = [f'<b>Tasks Solved: {tasks_solved.sum()}/{total_tasks} (All Agents)</b>']
+    pivot_df = pd.concat([pivot_df, tasks_solved_df])
+
     num_agents = len(pivot_df.index)
     row_height = 30 # Fixed height for each row in pixels
     total_height = num_agents * row_height
 
+    # Create a custom colorscale
+    colorscale = [[0, 'white'], [1, '#3498db']]
+
     # Create the heatmap
     fig = go.Figure(data=go.Heatmap(
         z=pivot_df.values,
         y=pivot_df.index,
         x=pivot_df.columns,
-        colorscale=[[0, 'white'], [1, '#3498db']], # White for failed, green for success
+        colorscale=colorscale,
         showscale=False,
         hovertemplate='<b>Agent:</b> %{y}<br>' +
                       '<b>Task:</b> %{x}<br>' +
@@ -36,18 +71,17 @@
     # Update the layout
     fig.update_layout(
         xaxis_title='Task ID',
-        height=total_height,
-        # width=1150,
+        height=total_height + 50, # Add extra space for the new row
         yaxis=dict(
             autorange='reversed',
-            showticklabels=True, # Show y-axis tick labels (agent names)
+            showticklabels=True,
             showline=True,
             linecolor='black',
             showgrid=False
         ),
         xaxis=dict(
             side='top',
-            showticklabels=False, # Hide x-axis tick labels (task IDs)
+            showticklabels=False,
             showline=True,
             linecolor='black',
             showgrid=False
@@ -136,65 +170,173 @@ def create_bar_chart(categories, values, x_label, y_label, title):
     return fig
 
 def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
-    agents = [Agent(row['Total Cost'], row['Accuracy']) for i, row in df.iterrows()]
+    # agents = [Agent(row['Total Cost'], row['Accuracy']) for i, row in df.iterrows()]
+    # Instead of one Agent object per row, create one per unique agent, using the mean of its cost and accuracy values
+    unique_agents = df['Agent Name'].unique()
+    agents = [Agent(df[df['Agent Name'] == agent]['Total Cost'].mean(), df[df['Agent Name'] == agent]['Accuracy'].mean()) for agent in unique_agents]
+
     pareto_frontier = compute_pareto_frontier(agents)
 
-    fig = px.scatter(df,
-                     x=x,
-                     y=y,
-                     custom_data=hover_data)
-    fig.update_traces(
-        hovertemplate="<br>".join([
-            "<b>Agent</b>: %{customdata[0]}",
-            "<b>Total Cost</b>: $%{x:.1f}",
-            "<b>Accuracy</b>: %{y:.1%}",
-        ])
-    )
-
-    fig.update_traces(marker=dict(size=10, color='#3498db'),
-                      hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),)
-
+    fig = go.Figure()
 
     # Sort the Pareto frontier points by x-coordinate
     pareto_points = sorted([(agent.total_cost, agent.accuracy) for agent in pareto_frontier], key=lambda x: x[0])
-
     # Add the Pareto frontier line
     fig.add_trace(go.Scatter(
         x=[point[0] for point in pareto_points],
         y=[point[1] for point in pareto_points],
         mode='lines',
         name='Pareto Frontier',
+        hoverinfo=None,
         line=dict(color='black', width=1, dash='dash')
     ))
 
-    fig.update_yaxes(rangemode="tozero")
-    fig.update_xaxes(rangemode="tozero")
+    # Plot scatter points and error bars for each agent
+    unique_agents = df[hover_data[0]].unique()
+    for agent in unique_agents:
+        agent_data = df[df[hover_data[0]] == agent]
+
+        x_value = [np.mean(agent_data[x].values)]
+        y_value = [np.mean(agent_data[y].values)]
+
+        if len(agent_data) > 1:
+            # Calculate 95% confidence intervals
+            ci_x = stats.t.interval(0.95, len(agent_data[x])-1, loc=np.mean(agent_data[x]), scale=stats.sem(agent_data[x]))
+            ci_y = stats.t.interval(0.95, len(agent_data[y])-1, loc=np.mean(agent_data[y]), scale=stats.sem(agent_data[y]))
+
+            # # Add error bars for x (cost)
+            # fig.add_trace(go.Scatter(
+            #     x=x_value,
+            #     y=y_value,
+            #     error_x=dict(
+            #         type='data',
+            #         symmetric=False,
+            #         array=[ci_x[1] - x_value],
+            #         arrayminus=[x_value - ci_x[0]],
+            #         color='red',
+            #     ),
+            #     mode='markers',
+            #     marker=dict(color='rgba(0,0,0,0)'),
+            #     showlegend=False,
+            #     hoverinfo='none'
+            # ))
+
+            # # Add error bars for y (accuracy)
+            # fig.add_trace(go.Scatter(
+            #     x=x_value,
+            #     y=y_value,
+            #     error_y=dict(
+            #         type='data',
+            #         symmetric=False,
+            #         array=[ci_y[1] - y_value],
+            #         arrayminus=[y_value - ci_y[0]],
+            #         color='green',
+            #     ),
+            #     mode='markers',
+            #     marker=dict(color='rgba(0,0,0,0)'),
+            #     showlegend=False,
+            #     hoverinfo='none'
+            # ))
+
+            # Add error bars for x (cost min-max)
+            fig.add_trace(go.Scatter(
+                x=x_value,
+                y=y_value,
+                error_x=dict(
+                    type='data',
+                    symmetric=False,
+                    array=[np.max(agent_data[x]) - x_value],
+                    arrayminus=[x_value - np.min(agent_data[x])],
+                    color='#fec44f',
+                ),
+                mode='markers',
+                marker=dict(color='rgba(0,0,0,0)', opacity=0),
+                showlegend=False,
+                hoverinfo=None
+            ))
+
+            # Add error bars for y (accuracy min-max)
+            fig.add_trace(go.Scatter(
+                x=x_value,
+                y=y_value,
+                error_y=dict(
+                    type='data',
+                    symmetric=False,
+                    array=[np.max(agent_data[y]) - y_value],
+                    arrayminus=[y_value - np.min(agent_data[y])],
+                    color='#bdbdbd',
+                ),
+                mode='markers',
+                marker=dict(color='rgba(0,0,0,0)', opacity=0),
+                showlegend=False,
+                hoverinfo=None
+            ))
+
+        # Add scatter points for this agent
+        fig.add_trace(go.Scatter(
+            x=x_value,
+            y=y_value,
+            mode='markers',
+            marker=dict(size=10, color='#3498db'),
+            customdata=agent_data[hover_data],
+            showlegend=False,
+            hovertemplate="<br>".join([
+                "<b>Agent</b>: %{customdata[0]}",
+                "<b>Total Cost</b>: $%{x:.1f}",
+                "<b>Accuracy</b>: %{y:.1%}<extra></extra>",
+            ]),
+            hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),
+        ))
+
+    # Legend entries for the 95% CI error bars (currently disabled)
+    # fig.add_trace(go.Scatter(
+    #     x=[None], y=[None], mode='markers',
+    #     marker=dict(color='red', size=10),
+    #     name='Cost CI (95%)'
+    # ))
+    # fig.add_trace(go.Scatter(
+    #     x=[None], y=[None], mode='markers',
+    #     marker=dict(color='green', size=10),
+    #     name='Accuracy CI (95%)'
+    # ))
+
+    # Add legend entries for error bars
+    fig.add_trace(go.Scatter(
+        x=[None], y=[None], mode='markers',
+        marker=dict(color='#fec44f', size=10),
+        name='Cost CI (Min-Max)'
+    ))
+    fig.add_trace(go.Scatter(
+        x=[None], y=[None], mode='markers',
+        marker=dict(color='#bdbdbd', size=10),
+        name='Accuracy CI (Min-Max)'
+    ))
 
     fig.update_layout(
-        # width = 1150,
-        height = 600,
-        xaxis_title = x_label,
-        yaxis_title = y_label,
-        xaxis = dict(
-            showline = True,
-            linecolor = 'black',
-            showgrid = False),
-        yaxis = dict(
-            showline = True,
-            showgrid = False,
-            linecolor = 'black'),
-        plot_bgcolor = 'white',
-        # Legend positioning
-        legend=dict(
-            yanchor="bottom",
-            y=0.01,
-            xanchor="right",
-            x=0.98,
-            bgcolor="rgba(255, 255, 255, 0.5)" # semi-transparent white background
+        height = 600,
+        xaxis_title = x_label,
+        yaxis_title = y_label,
+        xaxis = dict(
+            showline = True,
+            linecolor = 'black',
+            showgrid = False),
+        yaxis = dict(
+            showline = True,
+            showgrid = False,
+            linecolor = 'black'),
+        plot_bgcolor = 'white',
+        legend=dict(
+            yanchor="bottom",
+            y=0.01,
+            xanchor="right",
+            x=0.98,
+            bgcolor="rgba(255, 255, 255, 0.5)" # semi-transparent white background
         ),
-        modebar=dict(
+        modebar=dict(
             activecolor='#1f77b4', # Color of active tool
-            orientation='h', # Vertical orientation
+            orientation='h', # Horizontal orientation
             bgcolor='rgba(255,255,255,0.8)', # Slightly transparent white background
             color='#777', # Color of inactive tools
             add = ['pan2d'],
@@ -211,9 +353,213 @@
             'select2d',
             'select']
         ),
-        dragmode='pan'
+        dragmode='pan'
     )
+
+    fig.update_yaxes(rangemode="tozero")
+    fig.update_xaxes(rangemode="tozero")
+
     return fig
+# def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
+#     agents = [Agent(row['Total Cost'], row['Accuracy']) for i, row in df.iterrows()]
+#     pareto_frontier = compute_pareto_frontier(agents)
+
+#     fig = go.Figure()
+
+#     # Function to generate points for error ellipse
+#     def error_ellipse(x_center, y_center, x_radius, y_radius, angle, n=50):
+#         t = np.linspace(0, 2*np.pi, n)
+#         x = x_radius * np.cos(t)
+#         y = y_radius * np.sin(t)
+#         rotation = np.array([[np.cos(angle), -np.sin(angle)],
+#                              [np.sin(angle), np.cos(angle)]])
+#         xy = np.dot(rotation, np.array([x, y]))
+#         return x_center + xy[0], y_center + xy[1]
+
+#     # Create a color map for agents
+#     unique_agents = df['Agent Name'].unique()
+#     colors = px.colors.qualitative.Plotly
+#     color_map = {agent: colors[i % len(colors)] for i, agent in enumerate(unique_agents)}
+
+#     # Add scatter points and error ellipses for each agent
+#     for agent in unique_agents:
+#         agent_data = df[df['Agent Name'] == agent]
+
+#         # Add scatter points
+#         fig.add_trace(go.Scatter(
+#             x=agent_data[x],
+#             y=agent_data[y],
+#             mode='markers',
+#             name=agent,
+#             marker=dict(size=10, color=color_map[agent]),
+#             customdata=agent_data[hover_data] if hover_data else None,
+#             hovertemplate="<br>".join([
+#                 f"<b>Agent</b>: {agent}",
+#                 f"<b>{x}</b>: ${{x:.1f}}",
+#                 f"<b>{y}</b>: {{y:.1%}}",
+#             ] + ([f"<b>{col}</b>: {{customdata[{i}]}}" for i, col in enumerate(hover_data)] if hover_data else []))
+#         ))
+
+#         # Calculate mean and standard deviation for x and y
+#         x_mean = agent_data[x].mean()
+#         y_mean = agent_data[y].mean()
+#         x_std = agent_data[x].std()
+#         y_std = agent_data[y].std()
+
+#         # Calculate correlation coefficient
+#         corr = agent_data[x].corr(agent_data[y])
+
+#         # Add error ellipses (1 and 2 standard deviations)
+#         for n_std, opacity in [(1, 0.5), (2, 0.5)]:
+#             chi2_val = chi2.ppf(0.68 if n_std == 1 else 0.95, 2)
+#             x_radius = np.sqrt(chi2_val) * x_std
+#             y_radius = np.sqrt(chi2_val) * y_std
+#             angle = np.arctan2(y_std * corr, x_std)
+
+#             ellipse_x, ellipse_y = error_ellipse(x_mean, y_mean, x_radius, y_radius, angle)
+
+#             fig.add_shape(type="path",
+#                 path=f"M {ellipse_x[0]}, {ellipse_y[0]} " +
+#                      " ".join([f"L{x},{y}" for x, y in zip(ellipse_x[1:], ellipse_y[1:])]) +
+#                      " Z",
+#                 line_color=color_map[agent],
+#                 line_width=2,
+#                 opacity=opacity,
+#                 layer="below")
+
+#     # Sort the Pareto frontier points by x-coordinate
+#     pareto_points = sorted([(agent.total_cost, agent.accuracy) for agent in pareto_frontier], key=lambda x: x[0])
+
+#     # Add the Pareto frontier line
+#     fig.add_trace(go.Scatter(
+#         x=[point[0] for point in pareto_points],
+#         y=[point[1] for point in pareto_points],
+#         mode='lines',
+#         name='Pareto Frontier',
+#         line=dict(color='black', width=1, dash='dash')
+#     ))
+
+#     fig.update_layout(
+#         height = 600,
+#         xaxis_title = x_label,
+#         yaxis_title = y_label,
+#         xaxis = dict(
+#             showline = True,
+#             linecolor = 'black',
+#             showgrid = False),
+#         yaxis = dict(
+#             showline = True,
+#             showgrid = False,
+#             linecolor = 'black'),
+#         plot_bgcolor = 'white',
+#         legend=dict(
+#             yanchor="bottom",
+#             y=0.01,
+#             xanchor="right",
+#             x=0.98,
+#             bgcolor="rgba(255, 255, 255, 0.5)"
+#         ),
+#         modebar=dict(
+#             activecolor='#1f77b4',
+#             orientation='h',
+#             bgcolor='rgba(255,255,255,0.8)',
+#             color='#777',
+#             add = ['pan2d'],
+#             remove = [
+#                 'zoom2d', 'zoomIn2d', 'zoomOut2d', 'resetScale2d',
+#                 'hoverClosestCartesian', 'hoverCompareCartesian',
+#                 'toggleSpikelines', 'lasso2d', 'lasso',
+#                 'select2d', 'select'
+#             ]
+#         ),
+#         dragmode='pan'
+#     )
+
+#     fig.update_yaxes(rangemode="tozero")
+#     fig.update_xaxes(rangemode="tozero")
+
+#     return fig
+
+# def create_scatter_plot(df, x: str, y: str, x_label: str = None, y_label: str = None, hover_data: list = None):
+#     agents = [Agent(row['Total Cost'], row['Accuracy']) for i, row in df.iterrows()]
+#     pareto_frontier = compute_pareto_frontier(agents)
+
+#     fig = px.scatter(df,
+#                      x=x,
+#                      y=y,
+#                      custom_data=hover_data)
+#     fig.update_traces(
+#         hovertemplate="<br>".join([
+#             "<b>Agent</b>: %{customdata[0]}",
+#             "<b>Total Cost</b>: $%{x:.1f}",
+#             "<b>Accuracy</b>: %{y:.1%}",
+#         ])
+#     )
+
+#     fig.update_traces(marker=dict(size=10, color='#3498db'),
+#                       hoverlabel=dict(bgcolor="white", font_size=12, font_family="Arial"),)
+
+#     # Sort the Pareto frontier points by x-coordinate
+#     pareto_points = sorted([(agent.total_cost, agent.accuracy) for agent in pareto_frontier], key=lambda x: x[0])
+
+#     # Add the Pareto frontier line
+#     fig.add_trace(go.Scatter(
+#         x=[point[0] for point in pareto_points],
+#         y=[point[1] for point in pareto_points],
+#         mode='lines',
+#         name='Pareto Frontier',
+#         line=dict(color='black', width=1, dash='dash')
+#     ))
+
+#     fig.update_layout(
+#         # width = 1150,
+#         height = 600,
+#         xaxis_title = x_label,
+#         yaxis_title = y_label,
+#         xaxis = dict(
+#             showline = True,
+#             linecolor = 'black',
+#             showgrid = False),
+#         yaxis = dict(
+#             showline = True,
+#             showgrid = False,
+#             linecolor = 'black'),
+#         plot_bgcolor = 'white',
+#         # Legend positioning
+#         legend=dict(
+#             yanchor="bottom",
+#             y=0.01,
+#             xanchor="right",
+#             x=0.98,
+#             bgcolor="rgba(255, 255, 255, 0.5)" # semi-transparent white background
+#         ),
+#         modebar=dict(
+#             activecolor='#1f77b4', # Color of active tool
+#             orientation='h', # Vertical orientation
+#             bgcolor='rgba(255,255,255,0.8)', # Slightly transparent white background
+#             color='#777', # Color of inactive tools
+#             add = ['pan2d'],
+#             remove = [
+#                 'zoom2d',
+#                 'zoomIn2d',
+#                 'zoomOut2d',
+#                 'resetScale2d',
+#                 'hoverClosestCartesian',
+#                 'hoverCompareCartesian',
+#                 'toggleSpikelines',
+#                 'lasso2d',
+#                 'lasso',
+#                 'select2d',
+#                 'select']
+#         ),
+#         dragmode='pan'
+#     )
+
+#     fig.update_yaxes(rangemode="tozero")
+#     fig.update_xaxes(rangemode="tozero")
+
+#     return fig
 
 
 import plotly.graph_objects as go
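
Usage sketch for the new create_leaderboard helper, assuming a frame shaped like the renamed output of TracePreprocessor.get_parsed_results (the two rows below are hypothetical, not data from this commit):

    import pandas as pd
    from utils.viz import create_leaderboard

    df = pd.DataFrame({
        'Agent Name': ['agent-a', 'agent-b'],  # hypothetical agents
        'Accuracy': [0.52, 0.47],
        'Accuracy CI': ['± 0.031', None],      # None where an agent has only a single run
        'Total Cost': [1.234, 0.456],
        'Total Cost CI': ['± 0.120', None],
    })

    out = create_leaderboard(df, ci_metrics=['Accuracy', 'Total Cost'])
    print(out.loc[0, 'Accuracy'])  # "0.52 (± 0.031)" -- CI folded into the metric cell
    print(out.loc[1, 'Accuracy'])  # "0.47" -- unchanged when the CI cell is 'None'

Because the helper first casts the whole frame to strings, missing CI cells become the string 'None', which is exactly what the `!= 'None'` guard checks for.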