benediktstroebl committed on
Commit
47280b7
·
1 Parent(s): fd35772

added task heatmaps

Browse files
Files changed (3) hide show
  1. app.py +42 -9
  2. utils/db.py +48 -2
  3. utils/viz.py +59 -0
app.py CHANGED
@@ -6,7 +6,7 @@ from pathlib import Path
6
  import pandas as pd
7
  import os
8
  import json
9
- from utils.viz import create_scatter_plot, create_flow_chart, create_bar_chart
10
  from utils.processing import check_and_process_uploads
11
  from huggingface_hub import snapshot_download
12
  from apscheduler.schedulers.background import BackgroundScheduler
@@ -19,10 +19,8 @@ from apscheduler.schedulers.asyncio import AsyncIOScheduler
19
  import weave
20
  from utils.db import TracePreprocessor
21
 
22
- # Initialize the TracePreprocessor
23
  preprocessor = TracePreprocessor()
24
 
25
-
26
  from datetime import datetime
27
 
28
  abs_path = Path(__file__).parent
@@ -234,7 +232,7 @@ with gr.Blocks() as demo:
234
  label="Select Columns to Display:",
235
  ),
236
  hide_columns=config.USACO_HIDE_COLUMNS,
237
- search_columns=config.USACO_SEARCH_COLUMNS,
238
  column_widths={"Agent Name": 40,
239
  "Accuracy": 20,
240
  "Total Cost": 20},
@@ -242,6 +240,17 @@ with gr.Blocks() as demo:
242
  with gr.Row():
243
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
244
 
 
 
 
 
 
 
 
 
 
 
 
245
  gr.Markdown("# Failure Report")
246
  with gr.Row():
247
  with gr.Column(scale=1):
@@ -355,7 +364,7 @@ with gr.Blocks() as demo:
355
  label="Select Columns to Display:",
356
  ),
357
  hide_columns=config.SWEBENCH_HIDE_COLUMNS,
358
- search_columns=config.SWEBENCH_SEARCH_COLUMNS,
359
  column_widths={"Agent Name": 40,
360
  "Accuracy": 20,
361
  "Total Cost": 20},
@@ -363,6 +372,17 @@ with gr.Blocks() as demo:
363
  with gr.Row():
364
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
365
 
 
 
 
 
 
 
 
 
 
 
 
366
  gr.Markdown("# Failure Report")
367
  with gr.Row():
368
  with gr.Column(scale=1):
@@ -474,7 +494,7 @@ with gr.Blocks() as demo:
474
  cant_deselect=["Agent Name"],
475
  label="Select Columns to Display:",
476
  ),
477
- search_columns=config.SWEBENCH_SEARCH_COLUMNS,
478
  hide_columns=config.SWEBENCH_HIDE_COLUMNS,
479
  column_widths={"Agent Name": 40,
480
  "Accuracy": 20,
@@ -483,6 +503,17 @@ with gr.Blocks() as demo:
483
  with gr.Row():
484
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
485
 
 
 
 
 
 
 
 
 
 
 
 
486
  gr.Markdown("# Failure Report")
487
  with gr.Row():
488
  with gr.Column(scale=1):
@@ -594,7 +625,7 @@ with gr.Blocks() as demo:
594
  cant_deselect=["Agent Name"],
595
  label="Select Columns to Display:",
596
  ),
597
- search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
598
  hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS,
599
  column_widths={"Agent Name": 40,
600
  "Overall Score": 20,
@@ -714,12 +745,14 @@ with gr.Blocks() as demo:
714
 
715
  async def main():
716
  # Preprocess traces
 
717
  # preprocessor.preprocess_traces('evals_live')
 
718
 
719
- # # # Download the results from the Hugging Face Hub
720
  # await asyncio.to_thread(download_latest_results)
721
 
722
- # # Check for new uploads and process them
723
  # await check_and_process_uploads()
724
 
725
  scheduler = AsyncIOScheduler()
 
6
  import pandas as pd
7
  import os
8
  import json
9
+ from utils.viz import create_scatter_plot, create_flow_chart, create_bar_chart, create_task_success_heatmap
10
  from utils.processing import check_and_process_uploads
11
  from huggingface_hub import snapshot_download
12
  from apscheduler.schedulers.background import BackgroundScheduler
 
19
  import weave
20
  from utils.db import TracePreprocessor
21
 
 
22
  preprocessor = TracePreprocessor()
23
 
 
24
  from datetime import datetime
25
 
26
  abs_path = Path(__file__).parent
 
232
  label="Select Columns to Display:",
233
  ),
234
  hide_columns=config.USACO_HIDE_COLUMNS,
235
+ # search_columns=config.USACO_SEARCH_COLUMNS,
236
  column_widths={"Agent Name": 40,
237
  "Accuracy": 20,
238
  "Total Cost": 20},
 
240
  with gr.Row():
241
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
242
 
243
+ gr.Markdown("# Task Success Heatmap")
244
+ with gr.Row():
245
+ task_success_heatmap = gr.Plot()
246
+ demo.load(
247
+ lambda: create_task_success_heatmap(
248
+ preprocessor.get_task_success_data('usaco'),
249
+ 'USACO'
250
+ ),
251
+ outputs=[task_success_heatmap]
252
+ )
253
+
254
  gr.Markdown("# Failure Report")
255
  with gr.Row():
256
  with gr.Column(scale=1):
 
364
  label="Select Columns to Display:",
365
  ),
366
  hide_columns=config.SWEBENCH_HIDE_COLUMNS,
367
+ # search_columns=config.SWEBENCH_SEARCH_COLUMNS,
368
  column_widths={"Agent Name": 40,
369
  "Accuracy": 20,
370
  "Total Cost": 20},
 
372
  with gr.Row():
373
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
374
 
375
+ gr.Markdown("# Task Success Heatmap")
376
+ with gr.Row():
377
+ task_success_heatmap = gr.Plot()
378
+ demo.load(
379
+ lambda: create_task_success_heatmap(
380
+ preprocessor.get_task_success_data('swebench_verified'),
381
+ 'SWEBench Verified'
382
+ ),
383
+ outputs=[task_success_heatmap]
384
+ )
385
+
386
  gr.Markdown("# Failure Report")
387
  with gr.Row():
388
  with gr.Column(scale=1):
 
494
  cant_deselect=["Agent Name"],
495
  label="Select Columns to Display:",
496
  ),
497
+ # search_columns=config.SWEBENCH_SEARCH_COLUMNS,
498
  hide_columns=config.SWEBENCH_HIDE_COLUMNS,
499
  column_widths={"Agent Name": 40,
500
  "Accuracy": 20,
 
503
  with gr.Row():
504
  scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
505
 
506
+ gr.Markdown("# Task Success Heatmap")
507
+ with gr.Row():
508
+ task_success_heatmap = gr.Plot()
509
+ demo.load(
510
+ lambda: create_task_success_heatmap(
511
+ preprocessor.get_task_success_data('swebench_lite'),
512
+ 'SWEBench Lite'
513
+ ),
514
+ outputs=[task_success_heatmap]
515
+ )
516
+
517
  gr.Markdown("# Failure Report")
518
  with gr.Row():
519
  with gr.Column(scale=1):
 
625
  cant_deselect=["Agent Name"],
626
  label="Select Columns to Display:",
627
  ),
628
+ # search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
629
  hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS,
630
  column_widths={"Agent Name": 40,
631
  "Overall Score": 20,
 
745
 
746
  async def main():
747
  # Preprocess traces
748
+ # preprocessor = TracePreprocessor()
749
  # preprocessor.preprocess_traces('evals_live')
750
+ # preprocessor = TracePreprocessor()
751
 
752
+ # Download the results from the Hugging Face Hub
753
  # await asyncio.to_thread(download_latest_results)
754
 
755
+ # Check for new uploads and process them
756
  # await check_and_process_uploads()
757
 
758
  scheduler = AsyncIOScheduler()
utils/db.py CHANGED
@@ -5,6 +5,7 @@ import pickle
5
  from functools import lru_cache
6
  import threading
7
  import pandas as pd
 
8
 
9
  class TracePreprocessor:
10
  def __init__(self, db_path='preprocessed_traces.db'):
@@ -40,6 +41,8 @@ class TracePreprocessor:
40
  benchmark_name TEXT,
41
  agent_name TEXT,
42
  date TEXT,
 
 
43
  total_cost REAL,
44
  accuracy REAL,
45
  precision REAL,
@@ -95,12 +98,14 @@ class TracePreprocessor:
95
  with self.get_conn() as conn:
96
  conn.execute('''
97
  INSERT OR REPLACE INTO parsed_results
98
- (benchmark_name, agent_name, date, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score)
99
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
100
  ''', (
101
  benchmark_name,
102
  agent_name,
103
  config['date'],
 
 
104
  results.get('total_cost'),
105
  results.get('accuracy'),
106
  results.get('precision'),
@@ -161,6 +166,8 @@ class TracePreprocessor:
161
  if column in df.columns:
162
  df[column] = df[column].round(3)
163
 
 
 
164
  # Rename columns
165
  df = df.rename(columns={
166
  'agent_name': 'Agent Name',
@@ -183,6 +190,45 @@ class TracePreprocessor:
183
  })
184
 
185
  return df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
 
187
  if __name__ == '__main__':
188
  preprocessor = TracePreprocessor()
 
5
  from functools import lru_cache
6
  import threading
7
  import pandas as pd
8
+ import ast
9
 
10
  class TracePreprocessor:
11
  def __init__(self, db_path='preprocessed_traces.db'):
 
41
  benchmark_name TEXT,
42
  agent_name TEXT,
43
  date TEXT,
44
+ successful_tasks TEXT,
45
+ failed_tasks TEXT,
46
  total_cost REAL,
47
  accuracy REAL,
48
  precision REAL,
 
98
  with self.get_conn() as conn:
99
  conn.execute('''
100
  INSERT OR REPLACE INTO parsed_results
101
+ (benchmark_name, agent_name, date, successful_tasks, failed_tasks, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score)
102
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
103
  ''', (
104
  benchmark_name,
105
  agent_name,
106
  config['date'],
107
+ str(results.get('successful_tasks')),
108
+ str(results.get('failed_tasks')),
109
  results.get('total_cost'),
110
  results.get('accuracy'),
111
  results.get('precision'),
 
166
  if column in df.columns:
167
  df[column] = df[column].round(3)
168
 
169
+ df = df.drop(columns=['successful_tasks', 'failed_tasks'], axis=1)
170
+
171
  # Rename columns
172
  df = df.rename(columns={
173
  'agent_name': 'Agent Name',
 
190
  })
191
 
192
  return df
193
+
194
+ def get_task_success_data(self, benchmark_name):
195
+ with self.get_conn() as conn:
196
+ query = '''
197
+ SELECT agent_name, successful_tasks, failed_tasks
198
+ FROM parsed_results
199
+ WHERE benchmark_name = ?
200
+ '''
201
+ df = pd.read_sql_query(query, conn, params=(benchmark_name,))
202
+
203
+ # Get all unique task IDs
204
+ task_ids = set()
205
+ for tasks in df['successful_tasks']:
206
+ if ast.literal_eval(tasks) is not None:
207
+ task_ids.update(ast.literal_eval(tasks))
208
+ for tasks in df['failed_tasks']:
209
+ if ast.literal_eval(tasks) is not None:
210
+ task_ids.update(ast.literal_eval(tasks))
211
+
212
+ # Create a DataFrame with agent_name, task_ids, and success columns
213
+ data_list = []
214
+ for _, row in df.iterrows():
215
+ agent_name = row['agent_name']
216
+ for task_id in task_ids:
217
+ success = 1 if task_id in row['successful_tasks'] else 0
218
+ data_list.append({
219
+ 'agent_name': agent_name,
220
+ 'task_id': task_id,
221
+ 'success': success
222
+ })
223
+ df = pd.DataFrame(data_list)
224
+
225
+ df = df.rename(columns={
226
+ 'agent_name': 'Agent Name',
227
+ 'task_id': 'Task ID',
228
+ 'success': 'Success'
229
+ })
230
+
231
+ return df
232
 
233
  if __name__ == '__main__':
234
  preprocessor = TracePreprocessor()
utils/viz.py CHANGED
@@ -4,6 +4,65 @@ from utils.pareto import Agent, compute_pareto_frontier
4
  import plotly.graph_objects as go
5
  import textwrap
6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  def create_bar_chart(categories, values, x_label, y_label, title):
8
  # Sort categories and values based on values in descending order
9
  sorted_data = sorted(zip(categories, values), key=lambda x: x[1], reverse=True)
 
4
  import plotly.graph_objects as go
5
  import textwrap
6
 
7
+ def create_task_success_heatmap(df, benchmark_name):
8
+ # Pivot the dataframe to create a matrix of agents vs tasks
9
+ pivot_df = df.pivot(index='Agent Name', columns='Task ID', values='Success')
10
+
11
+ # Create the heatmap
12
+ fig = go.Figure(data=go.Heatmap(
13
+ z=pivot_df.values,
14
+ y=pivot_df.index,
15
+ x=pivot_df.columns,
16
+ colorscale=[[0, 'white'], [1, '#1b9e77']], # White for failed, green for success
17
+ showscale=False,
18
+ hovertemplate='<b>Agent:</b> %{y}<br>' +
19
+ '<b>Task:</b> %{x}<br>' +
20
+ '<b>Status:</b> %{z}<extra></extra>'
21
+ ))
22
+
23
+ # Update the layout
24
+ fig.update_layout(
25
+ xaxis_title='Task ID',
26
+ height=600,
27
+ width=1300,
28
+ yaxis=dict(
29
+ autorange='reversed',
30
+ showticklabels=True, # Show y-axis tick labels (agent names)
31
+ showline=True,
32
+ linecolor='black',
33
+ showgrid=False
34
+ ),
35
+ xaxis=dict(
36
+ side='top',
37
+ showticklabels=False, # Hide x-axis tick labels (task IDs)
38
+ showline=True,
39
+ linecolor='black',
40
+ showgrid=False
41
+ ),
42
+ plot_bgcolor='white',
43
+ paper_bgcolor='white',
44
+ hoverlabel=dict(
45
+ bgcolor="white",
46
+ font_size=12,
47
+ font_family="Arial"
48
+ ),
49
+ modebar=dict(
50
+ activecolor='#1f77b4',
51
+ orientation='h',
52
+ bgcolor='rgba(255,255,255,0.8)',
53
+ color='#777',
54
+ add=['pan2d'],
55
+ remove=[
56
+ 'zoom2d', 'zoomIn2d', 'zoomOut2d', 'resetScale2d',
57
+ 'hoverClosestCartesian', 'hoverCompareCartesian',
58
+ 'toggleSpikelines', 'lasso2d', 'lasso', 'select2d', 'select'
59
+ ]
60
+ ),
61
+ dragmode='pan'
62
+ )
63
+
64
+ return fig
65
+
66
  def create_bar_chart(categories, values, x_label, y_label, title):
67
  # Sort categories and values based on values in descending order
68
  sorted_data = sorted(zip(categories, values), key=lambda x: x[1], reverse=True)