Commit 47280b7: added task heatmaps
Parent: fd35772

Files changed:
- app.py (+42 -9)
- utils/db.py (+48 -2)
- utils/viz.py (+59 -0)
app.py CHANGED

@@ -6,7 +6,7 @@ from pathlib import Path
 import pandas as pd
 import os
 import json
-from utils.viz import create_scatter_plot, create_flow_chart, create_bar_chart
+from utils.viz import create_scatter_plot, create_flow_chart, create_bar_chart, create_task_success_heatmap
 from utils.processing import check_and_process_uploads
 from huggingface_hub import snapshot_download
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -19,10 +19,8 @@ from apscheduler.schedulers.asyncio import AsyncIOScheduler
 import weave
 from utils.db import TracePreprocessor
 
-# Initialize the TracePreprocessor
 preprocessor = TracePreprocessor()
 
-
 from datetime import datetime
 
 abs_path = Path(__file__).parent
@@ -234,7 +232,7 @@ with gr.Blocks() as demo:
                     label="Select Columns to Display:",
                 ),
                 hide_columns=config.USACO_HIDE_COLUMNS,
-                search_columns=config.USACO_SEARCH_COLUMNS,
+                # search_columns=config.USACO_SEARCH_COLUMNS,
                 column_widths={"Agent Name": 40,
                                "Accuracy": 20,
                                "Total Cost": 20},
@@ -242,6 +240,17 @@ with gr.Blocks() as demo:
         with gr.Row():
             scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
 
+        gr.Markdown("# Task Success Heatmap")
+        with gr.Row():
+            task_success_heatmap = gr.Plot()
+            demo.load(
+                lambda: create_task_success_heatmap(
+                    preprocessor.get_task_success_data('usaco'),
+                    'USACO'
+                ),
+                outputs=[task_success_heatmap]
+            )
+
         gr.Markdown("# Failure Report")
         with gr.Row():
             with gr.Column(scale=1):
@@ -355,7 +364,7 @@ with gr.Blocks() as demo:
                     label="Select Columns to Display:",
                 ),
                 hide_columns=config.SWEBENCH_HIDE_COLUMNS,
-                search_columns=config.SWEBENCH_SEARCH_COLUMNS,
+                # search_columns=config.SWEBENCH_SEARCH_COLUMNS,
                 column_widths={"Agent Name": 40,
                                "Accuracy": 20,
                                "Total Cost": 20},
@@ -363,6 +372,17 @@ with gr.Blocks() as demo:
         with gr.Row():
             scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
 
+        gr.Markdown("# Task Success Heatmap")
+        with gr.Row():
+            task_success_heatmap = gr.Plot()
+            demo.load(
+                lambda: create_task_success_heatmap(
+                    preprocessor.get_task_success_data('swebench_verified'),
+                    'SWEBench Verified'
+                ),
+                outputs=[task_success_heatmap]
+            )
+
         gr.Markdown("# Failure Report")
         with gr.Row():
             with gr.Column(scale=1):
@@ -474,7 +494,7 @@ with gr.Blocks() as demo:
                     cant_deselect=["Agent Name"],
                     label="Select Columns to Display:",
                 ),
-                search_columns=config.SWEBENCH_SEARCH_COLUMNS,
+                # search_columns=config.SWEBENCH_SEARCH_COLUMNS,
                 hide_columns=config.SWEBENCH_HIDE_COLUMNS,
                 column_widths={"Agent Name": 40,
                                "Accuracy": 20,
@@ -483,6 +503,17 @@ with gr.Blocks() as demo:
         with gr.Row():
             scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
 
+        gr.Markdown("# Task Success Heatmap")
+        with gr.Row():
+            task_success_heatmap = gr.Plot()
+            demo.load(
+                lambda: create_task_success_heatmap(
+                    preprocessor.get_task_success_data('swebench_lite'),
+                    'SWEBench Lite'
+                ),
+                outputs=[task_success_heatmap]
+            )
+
         gr.Markdown("# Failure Report")
         with gr.Row():
             with gr.Column(scale=1):
@@ -594,7 +625,7 @@ with gr.Blocks() as demo:
                     cant_deselect=["Agent Name"],
                     label="Select Columns to Display:",
                 ),
-                search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
+                # search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
                 hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS,
                 column_widths={"Agent Name": 40,
                                "Overall Score": 20,
@@ -714,12 +745,14 @@ with gr.Blocks() as demo:
 
 async def main():
     # Preprocess traces
+    # preprocessor = TracePreprocessor()
     # preprocessor.preprocess_traces('evals_live')
+    # preprocessor = TracePreprocessor()
 
-    #
+    # Download the results from the Hugging Face Hub
     # await asyncio.to_thread(download_latest_results)
 
-    #
+    # Check for new uploads and process them
     # await check_and_process_uploads()
 
     scheduler = AsyncIOScheduler()
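Each new heatmap block in app.py follows the same Gradio pattern: an empty gr.Plot placeholder plus a demo.load callback, so the figure is built from the preprocessed database when a visitor opens the page rather than once at startup. A minimal self-contained sketch of that pattern (the stand-in figure and names below are illustrative, not taken from the commit):

import gradio as gr
import plotly.graph_objects as go

def build_heatmap():
    # Stand-in for create_task_success_heatmap(preprocessor.get_task_success_data(...), ...)
    return go.Figure(go.Heatmap(z=[[0, 1], [1, 1]], x=['t1', 't2'], y=['agent A', 'agent B']))

with gr.Blocks() as demo:
    heatmap = gr.Plot()                           # rendered empty at first
    demo.load(build_heatmap, outputs=[heatmap])   # filled when a client loads the page

if __name__ == '__main__':
    demo.launch()

Deferring the query to demo.load keeps app startup fast and re-reads the database on every page view, at the cost of one query per visitor.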
utils/db.py CHANGED

@@ -5,6 +5,7 @@ import pickle
 from functools import lru_cache
 import threading
 import pandas as pd
+import ast
 
 class TracePreprocessor:
     def __init__(self, db_path='preprocessed_traces.db'):
@@ -40,6 +41,8 @@ class TracePreprocessor:
                 benchmark_name TEXT,
                 agent_name TEXT,
                 date TEXT,
+                successful_tasks TEXT,
+                failed_tasks TEXT,
                 total_cost REAL,
                 accuracy REAL,
                 precision REAL,
@@ -95,12 +98,14 @@ class TracePreprocessor:
         with self.get_conn() as conn:
             conn.execute('''
                 INSERT OR REPLACE INTO parsed_results
-                (benchmark_name, agent_name, date, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score)
-                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+                (benchmark_name, agent_name, date, successful_tasks, failed_tasks, total_cost, accuracy, precision, recall, f1_score, auc, overall_score, vectorization_score, fathomnet_score, feedback_score, house_price_score, spaceship_titanic_score, amp_parkinsons_disease_progression_prediction_score, cifar10_score, imdb_score)
+                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             ''', (
                 benchmark_name,
                 agent_name,
                 config['date'],
+                str(results.get('successful_tasks')),
+                str(results.get('failed_tasks')),
                 results.get('total_cost'),
                 results.get('accuracy'),
                 results.get('precision'),
@@ -161,6 +166,8 @@ class TracePreprocessor:
             if column in df.columns:
                 df[column] = df[column].round(3)
 
+        df = df.drop(columns=['successful_tasks', 'failed_tasks'], axis=1)
+
        # Rename columns
         df = df.rename(columns={
             'agent_name': 'Agent Name',
@@ -183,6 +190,45 @@ class TracePreprocessor:
         })
 
         return df
+
+    def get_task_success_data(self, benchmark_name):
+        with self.get_conn() as conn:
+            query = '''
+                SELECT agent_name, successful_tasks, failed_tasks
+                FROM parsed_results
+                WHERE benchmark_name = ?
+            '''
+            df = pd.read_sql_query(query, conn, params=(benchmark_name,))
+
+        # Get all unique task IDs
+        task_ids = set()
+        for tasks in df['successful_tasks']:
+            if ast.literal_eval(tasks) is not None:
+                task_ids.update(ast.literal_eval(tasks))
+        for tasks in df['failed_tasks']:
+            if ast.literal_eval(tasks) is not None:
+                task_ids.update(ast.literal_eval(tasks))
+
+        # Create a DataFrame with agent_name, task_ids, and success columns
+        data_list = []
+        for _, row in df.iterrows():
+            agent_name = row['agent_name']
+            for task_id in task_ids:
+                success = 1 if task_id in row['successful_tasks'] else 0
+                data_list.append({
+                    'agent_name': agent_name,
+                    'task_id': task_id,
+                    'success': success
+                })
+        df = pd.DataFrame(data_list)
+
+        df = df.rename(columns={
+            'agent_name': 'Agent Name',
+            'task_id': 'Task ID',
+            'success': 'Success'
+        })
+
+        return df
 
 if __name__ == '__main__':
     preprocessor = TracePreprocessor()
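The new successful_tasks and failed_tasks TEXT columns hold Python lists serialized with str(), which get_task_success_data parses back with ast.literal_eval. A small round-trip sketch (values illustrative); one caveat worth noting is that the success check above (task_id in row['successful_tasks']) runs against the raw stored string, so it is a substring test rather than exact list membership:

import ast

successful = ['task_1', 'task_12']
stored = str(successful)              # "['task_1', 'task_12']"
recovered = ast.literal_eval(stored)  # ['task_1', 'task_12']
assert recovered == successful

# Substring vs. membership: 'task_1' occurs inside 'task_12', so the raw
# string "contains" 'task_1' even when only task_12 actually succeeded.
assert 'task_1' in str(['task_12'])                         # substring hit on the TEXT value
assert 'task_1' not in ast.literal_eval(str(['task_12']))   # exact list membership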
utils/viz.py CHANGED

@@ -4,6 +4,65 @@ from utils.pareto import Agent, compute_pareto_frontier
 import plotly.graph_objects as go
 import textwrap
 
+def create_task_success_heatmap(df, benchmark_name):
+    # Pivot the dataframe to create a matrix of agents vs tasks
+    pivot_df = df.pivot(index='Agent Name', columns='Task ID', values='Success')
+
+    # Create the heatmap
+    fig = go.Figure(data=go.Heatmap(
+        z=pivot_df.values,
+        y=pivot_df.index,
+        x=pivot_df.columns,
+        colorscale=[[0, 'white'], [1, '#1b9e77']],  # White for failed, green for success
+        showscale=False,
+        hovertemplate='<b>Agent:</b> %{y}<br>' +
+                      '<b>Task:</b> %{x}<br>' +
+                      '<b>Status:</b> %{z}<extra></extra>'
+    ))
+
+    # Update the layout
+    fig.update_layout(
+        xaxis_title='Task ID',
+        height=600,
+        width=1300,
+        yaxis=dict(
+            autorange='reversed',
+            showticklabels=True,  # Show y-axis tick labels (agent names)
+            showline=True,
+            linecolor='black',
+            showgrid=False
+        ),
+        xaxis=dict(
+            side='top',
+            showticklabels=False,  # Hide x-axis tick labels (task IDs)
+            showline=True,
+            linecolor='black',
+            showgrid=False
+        ),
+        plot_bgcolor='white',
+        paper_bgcolor='white',
+        hoverlabel=dict(
+            bgcolor="white",
+            font_size=12,
+            font_family="Arial"
+        ),
+        modebar=dict(
+            activecolor='#1f77b4',
+            orientation='h',
+            bgcolor='rgba(255,255,255,0.8)',
+            color='#777',
+            add=['pan2d'],
+            remove=[
+                'zoom2d', 'zoomIn2d', 'zoomOut2d', 'resetScale2d',
+                'hoverClosestCartesian', 'hoverCompareCartesian',
+                'toggleSpikelines', 'lasso2d', 'lasso', 'select2d', 'select'
+            ]
+        ),
+        dragmode='pan'
+    )
+
+    return fig
+
 def create_bar_chart(categories, values, x_label, y_label, title):
     # Sort categories and values based on values in descending order
     sorted_data = sorted(zip(categories, values), key=lambda x: x[1], reverse=True)
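A hypothetical call to the new create_task_success_heatmap, assuming a small long-format DataFrame with the three columns the function pivots on; in the app, the real input comes from TracePreprocessor.get_task_success_data:

import pandas as pd
from utils.viz import create_task_success_heatmap

# Toy long-format data: one row per (agent, task) pair with a 0/1 outcome
df = pd.DataFrame({
    'Agent Name': ['agent A', 'agent A', 'agent B', 'agent B'],
    'Task ID':    ['t1', 't2', 't1', 't2'],
    'Success':    [1, 0, 1, 1],
})

fig = create_task_success_heatmap(df, 'USACO')  # benchmark_name is accepted but unused in the layout above
fig.show()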