import gradio as gr
from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter
import config
from envs import RESULTS_REPO_ID, REPO_ID, API, HF_TOKEN
from pathlib import Path
import pandas as pd
import os
import json
from utils.viz import create_scatter_plot, create_flow_chart, create_bar_chart, create_task_success_heatmap
from utils.processing import check_and_process_uploads
from huggingface_hub import snapshot_download
from datetime import datetime
import re
import markdown
import asyncio
from apscheduler.schedulers.asyncio import AsyncIOScheduler
import weave
from utils.db import TracePreprocessor
from gradio.themes.soft import Soft

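# Shared preprocessor backing every tab below: it serves parsed leaderboard
# results, analyzed traces, and failure reports to the UI callbacks.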
preprocessor = TracePreprocessor()

abs_path = Path(__file__).parent

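# Restart the Space; scheduled hourly in main() so the app reloads fresh data.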
def restart_space():
    API.restart_space(repo_id=REPO_ID, token=HF_TOKEN)

# Download the latest results dataset from the Hugging Face Hub into evals_upload/
def download_latest_results():
    print("Downloading latest results...")
    snapshot_download(
        RESULTS_REPO_ID,
        local_dir="evals_upload",
        repo_type='dataset',
        tqdm_class=None,
        etag_timeout=30,
        max_workers=4,
    )
    print("Download complete.")


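# Thin wrappers around the preprocessor so the Gradio callbacks stay short and
# the data source can be swapped in one place. Note that parse_json_files keeps
# its folder_path parameter for call-site compatibility but reads everything
# from the preprocessor.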
def get_analyzed_traces(agent_name, benchmark_name):
    return preprocessor.get_analyzed_traces(agent_name, benchmark_name)

def get_failure_report(agent_name, benchmark_name):
    return preprocessor.get_failure_report(agent_name, benchmark_name)

def parse_json_files(folder_path, benchmark_name):
    return preprocessor.get_parsed_results(benchmark_name)

def update_agent_dropdown(benchmark_name, metric):
    df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
    agents = df['Agent Name'].tolist()
    best_agent = get_best_agent(benchmark_name, metric)
    return gr.Dropdown(choices=agents, value=best_agent, label="Select Agent")

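# The agent with the highest value of `metric` becomes the default dropdown selection.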
def get_best_agent(benchmark_name, metric):
    df = parse_json_files(os.path.join(abs_path, "evals_live"), benchmark_name)
    return df.loc[df[metric].idxmax()]['Agent Name']

def update_task_analysis(benchmark_name, agent_name):
    if not agent_name:
        return "Please select an agent.", None, None, ""
    
    analyzed_traces = get_analyzed_traces(agent_name, benchmark_name)
    if not analyzed_traces:
        return f"No analysis available for agent: {agent_name}", None, None, ""
    
    task_ids = list(analyzed_traces.keys())

    overview, flow_chart, _ = update_task_details(benchmark_name, agent_name, task_ids[0])
    
    return overview, flow_chart, gr.Dropdown(choices=task_ids, value=task_ids[0], label="Select Task"), ""

def update_task_details(benchmark_name, agent_name, task_id):
    if not task_id:
        return "Please select a task.", None, ""
    
    analyzed_traces = get_analyzed_traces(agent_name, benchmark_name)
    if not analyzed_traces or task_id not in analyzed_traces:
        return f"No analysis available for task: {task_id}", None, ""
    
    analysis = analyzed_traces[task_id]

    summary = analysis.get('task_analysis', {})
    
    overview = f"### Task Overview\n\n{summary.get('overview', 'No overview available.')}\n\n"
    # overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n"
    # overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n"
    # overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n"
    
    # Skip the flow chart when the upstream analysis marked the overview as "Not available".
    if summary.get('overview', 'No overview available.') != "Not available":
        flow_chart = create_flow_chart(analysis['steps'])
    else:
        flow_chart = None
    
    return overview, flow_chart, ""


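# Render one step of an agent run as an HTML card: call metadata, pretty-printed
# JSON for inputs/outputs/usage, and the LLM-generated step analysis.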
def format_call_info(step, step_index):
    call_data = step['call_data']
    analysis = step['analysis']

    def format_json(obj):
        # if isinstance(obj, dict) and 'choices' in obj:
        #     # Special handling for message content
        #     formatted_content = format_message_content(obj['choices'][0])
        #     return f'<div class="message-content">{formatted_content}</div>'
        # else:
        json_str = json.dumps(obj, indent=2)
        json_str = json_str.replace(' ', '&nbsp;')
        json_str = json_str.replace('\n', '<br>')
        return f'<div class="json-wrapper">{json_str}</div>'

    # Currently not used but we can enable it to format message content
    def format_message_content(content):
        # Convert Markdown to HTML
        html_content = markdown.markdown(content)
        
        # Replace ``` code blocks with styled pre blocks
        html_content = re.sub(r'```python\n(.*?)```', lambda m: f'<pre class="code-block">{m.group(1)}</pre>', html_content, flags=re.DOTALL)
        
        return html_content

    formatted_info = f"""
    <style>
        .json-wrapper {{
            white-space: pre-wrap;
            word-wrap: break-word;
            font-family: monospace;
            max-height: 300px;
            overflow-y: auto;
            background-color: #f5f5f5;
            padding: 10px;
            border-radius: 5px;
        }}
        .message-content {{
            white-space: normal;
            word-wrap: break-word;
            font-family: Arial, sans-serif;
            max-height: 500px;
            overflow-y: auto;
            background-color: #ffffff;
            padding: 10px;
            border-radius: 5px;
            border: 1px solid #e0e0e0;
        }}
        .code-block {{
            background-color: #f0f0f0;
            padding: 10px;
            border-radius: 5px;
            font-family: monospace;
            white-space: pre-wrap;
            word-wrap: break-word;
        }}
    </style>

    <h3>Step {step_index + 1}: {analysis.get('headline', '')}</h3>

    <h4>Call Metadata</h4>
    <ul>
        <li><strong>Weave Task ID:</strong> {call_data['weave_task_id']}</li>
        <li><strong>Trace ID:</strong> {call_data['trace_id']}</li>
        <li><strong>Project ID:</strong> {call_data['project_id']}</li>
        <li><strong>Created Timestamp:</strong> {datetime.fromtimestamp(call_data['created_timestamp'])}</li>
        <li><strong>Model:</strong> {call_data['inputs']['model']}</li>
    </ul>

    <h4>Inputs</h4>
    {format_json(call_data['inputs'])}

    <h4>Outputs</h4>
    {format_json(call_data['outputs'])}

    <h4>Usage</h4>
    {format_json(call_data['summary'])}

    <h4>Analysis</h4>
    <ul>
        <li><strong>Description:</strong> {analysis['description']}</li>
        <li><strong>Assessment:</strong> {analysis['assessment']}</li>
        <li><strong>Success:</strong> {analysis['success']}</li>
        <li><strong>Action Type:</strong> {analysis['action_type']}</li>
    </ul>
    """
    return formatted_info


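# Build the failure-report view for an agent: a Markdown overview of the
# LLM-derived failure categories plus a bar chart of tasks per category.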
def update_failure_report(agent_name, benchmark_name):
    failure_report = get_failure_report(agent_name, benchmark_name)
    if not failure_report:
        return "No failure report available for this agent.", None

    # Create overview of failure categories
    categories_overview = "### Failure Categories:\n\n"
    for category in failure_report['failure_categories']:
        categories_overview += f"#### {category['category_name']}\n"
        categories_overview += f"{category['description']}\n\n"

    # Count tasks affected by each category
    category_counts = {}
    for task, classification in failure_report['task_classifications'].items():
        category_id = classification['category_id']
        category_counts[category_id] = category_counts.get(category_id, 0) + 1

    # Prepare data for bar chart
    categories = [cat['category_name'] for cat in failure_report['failure_categories']]
    counts = [category_counts.get(str(i+1), 0) for i in range(len(categories))]

    # Create bar chart
    chart = create_bar_chart(categories, counts, "Failure Categories", "Number of Affected Tasks", "Failure Categories Distribution")

    return categories_overview, chart

from gradio.themes.utils import colors, fonts, sizes
from typing import Iterable
class MyTheme(Soft):
    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.blue,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font
        | str
        | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Lato"),
            "ui-sans-serif",
            "sans-serif",
        ),
        font_mono: fonts.Font
        | str
        | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"),
            "ui-monospace",
            "monospace",
        ),
    ):
        super().__init__(
            primary_hue=primary_hue,
            text_size=text_size,
            font=font,
            font_mono=font_mono,
        )

my_theme = MyTheme()

with gr.Blocks(theme=my_theme, css='css.css') as demo:
    # gr.Markdown((Path(__file__).parent / "header.md").read_text(), elem_classes=["text-large"])
    gr.HTML("""
            <style>
    .hal-header {
        color: #ecf0f1;
        padding: 40px 20px;
        text-align: center;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }
    .hal-title {
        font-size: 2.5em;
        font-weight: 700;
        margin: 0;
        letter-spacing: 2px;
        text-transform: uppercase;
    }
    .hal-subtitle {
        font-size: 1.2em;
        font-weight: 300;
        margin-top: 15px;
        margin-left: auto;
        margin-right: auto;
        line-height: 1.6;
        text-align: center;
    }
    .hal-highlight {
        color: #3498db;
        font-weight: 600;
    }
</style>

<header class="hal-header">
    <h1 class="hal-title">Holistic Agent Leaderboard (HAL)</h1>
    <p class="hal-subtitle">
        A centralized, standardized, and cost-aware leaderboard for evaluating agents.
    </p>
</header>""")
    gr.HTML("""
<style>
    .feature-row {
        display: flex;
        justify-content: space-between;
        margin-top: 20px;
        margin-bottom: 20px;
    }
    .feature-column {
        flex: 1;
        padding: 25px;
        background-color: #ffffff;
        border-radius: 10px;
        margin: 0 15px;
        text-align: left;
        box-shadow: 0 6px 12px rgba(0, 0, 0, 0.1);
        display: flex;
        flex-direction: column;
        align-items: flex-start;
        border-top: 5px solid #3498db;
        transition: transform 0.3s ease, box-shadow 0.3s ease;
    }
    .feature-column:hover {
        transform: translateY(-5px);
        box-shadow: 0 5px 10px rgba(0, 0, 0, 0.15);
    }
    .feature-keyword {
        font-size: 1.2em;
        font-weight: bold;
        color: #1b9e77;
        margin-bottom: 10px;
        text-transform: uppercase;
        letter-spacing: 1px;
    }
    .feature-content {
        flex-grow: 1;
    }
    .feature-description {
        font-size: 0.95em;
        line-height: 1.6;
        color: #333;
    }
</style>

<div class="feature-row">
    <div class="feature-column">
        <div class="feature-keyword">Centralized</div>
        <div class="feature-content">
            <p class="feature-description">Evaluations across agent benchmarks are all recorded to a single leaderboard that evaluates every listed agent in the same way.</p>
        </div>
    </div>
    <div class="feature-column">
        <div class="feature-keyword">Third-party</div>
        <div class="feature-content">
            <p class="feature-description">Agent developers clearly have competing objectives in reporting accuracy: they want to achieve state-of-the-art performance.</p>
        </div>
    </div>
    <div class="feature-column">
        <div class="feature-keyword">Cost-controlled</div>
        <div class="feature-content">
            <p class="feature-description">For downstream users, understanding the cost of running agents is a significant need for adoption. For agent developers, cost-controlled evaluations help develop accurate baselines.</p>
        </div>
    </div>
</div>
<style>
    .section-heading {
        font-size: 1.8em;
        font-weight: bold;
        color: #2c3e50;
        margin-top: 40px;
        margin-bottom: 20px;
        text-align: left;
    }
    .user-types-container {
        display: grid;
        grid-template-columns: repeat(2, 1fr);
        gap: 20px;
        margin-top: 20px;
    }
    .user-type {
        background-color: #ffffff;
        border-radius: 10px;
        padding: 25px;
        box-shadow: 0 6px 12px rgba(0, 0, 0, 0.1);
        transition: transform 0.3s ease, box-shadow 0.3s ease;
        border-left: 5px solid #3498db;
    }
    .user-type:hover {
        transform: translateY(-5px);
        box-shadow: 0 5px 10px rgba(0, 0, 0, 0.15);
    }
    .user-type-title {
        font-size: 1.2em;
        font-weight: bold;
        color: #3498db;
        margin-bottom: 10px;
    }
    .user-type-description {
        font-size: 0.95em;
        line-height: 1.6;
        color: #333;
    }
</style>
<br/>
<h2 class="section-heading">Who is it for?</h2>
<p>We see HAL being useful for four types of users:</p>

<div class="user-types-container">
    <div class="user-type">
        <h3 class="user-type-title">Downstream Users & Procurers</h3>
        <p class="user-type-description">Customers looking to deploy agents can get visibility into existing benchmarks, know developers building useful agents, and identify the state of the art for both cost and accuracy for their tasks of interest.</p>
    </div>
    <div class="user-type">
        <h3 class="user-type-title">Agent Benchmark Developers</h3>
        <p class="user-type-description">Reporting results on a centralized leaderboard could allow improved visibility into agent benchmarks that measure real-world utility.</p>
    </div>
    <div class="user-type">
        <h3 class="user-type-title">Agent Developers</h3>
        <p class="user-type-description">HAL allows for easy reproduction of past agents, clear comparison with past baselines, and a straightforward way to compete on a leaderboard.</p>
    </div>
    <div class="user-type">
        <h3 class="user-type-title">Safety Researchers</h3>
        <p class="user-type-description">Understanding agent capabilities on real-world safety threats and their associated costs is crucial. For example, Cybench evaluations could provide insights into agent performance and affordability for potential adversaries.</p>
    </div>
</div>
<br/>
""")
    
    with gr.Tabs():
        with gr.Tab("USACO"):
            gr.Markdown("""The USA Computing Olympiad (USACO) is a computer programming competition for pre-college students. This benchmark evaluates the performance of AI agents on a set of 307 USACO tasks. The agents are evaluated based on the number of tasks correctly solved.""")
            with gr.Row():
                with gr.Column(scale=2):
                    Leaderboard(
                        value=parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'),
                        select_columns=SelectColumns(
                            default_selection=config.USACO_ON_LOAD_COLUMNS + ["Verified"],
                            cant_deselect=["Agent Name"],
                            label="Select Columns to Display:",
                        ),
                        hide_columns=config.USACO_HIDE_COLUMNS,
                        search_columns=config.USACO_SEARCH_COLUMNS,
                    )
            with gr.Row():
                gr.Markdown("### Accuracy vs. Cost for USACO agents")
            with gr.Row():
                scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'usaco'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
            
            gr.Markdown("")
            gr.Markdown("")
            gr.Markdown("## Task success heatmap")
            gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least")
            with gr.Row():
                task_success_heatmap = gr.Plot()
            demo.load(
                lambda: create_task_success_heatmap(
                    preprocessor.get_task_success_data('usaco'),
                    'USACO'
                ),
                outputs=[task_success_heatmap],
            )
            gr.Markdown("")
            gr.Markdown("")
            gr.Markdown("## Failure report for each agent")
            gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
            with gr.Row():
                with gr.Column(scale=1):
                    failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
            with gr.Row():
                with gr.Column(scale=1):
                    failure_categories_overview = gr.Markdown()
            
                with gr.Column(scale=1):
                    failure_categories_chart = gr.Plot()

            # Initialize the failure report agent dropdown with all agents
            demo.load(update_agent_dropdown, 
                    inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)], 
                    outputs=[failure_report_agent_dropdown])
            
            # Update failure report when agent is selected
            failure_report_agent_dropdown.change(update_failure_report,
                                                inputs=[failure_report_agent_dropdown, gr.Textbox(value="usaco", visible=False)],
                                                outputs=[failure_categories_overview, failure_categories_chart])
            
            gr.Markdown("")
            gr.Markdown("")
            gr.Markdown("## Agent monitor")
            with gr.Row():
                with gr.Column(scale=1):
                    agent_dropdown = gr.Dropdown(label="Select Agent")
                with gr.Column(scale=1):
                    task_dropdown = gr.Dropdown(label="Select USACO Task")
            with gr.Row():
                task_overview = gr.Markdown()
            with gr.Row():
                flow_chart = gr.Plot(label="Task Flow")

            # Initialize the agent dropdown with the best agent
            demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
            demo.load(update_task_analysis, inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])

            agent_dropdown.change(update_task_analysis, 
                                  inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown],
                                  outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
            task_dropdown.change(update_task_details,
                                 inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown, task_dropdown],
                                 outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
            
            gr.Markdown("")
            gr.Markdown("")
            gr.Markdown("## Raw predictions")
            with gr.Row():
                with gr.Column(scale=1):
                    raw_agent_dropdown = gr.Dropdown(label="Select Agent")
                with gr.Column(scale=1):
                    raw_task_dropdown = gr.Dropdown(label="Select Task")
                with gr.Column(scale=1):
                    raw_step_dropdown = gr.Dropdown(label="Select Step")
            
            with gr.Row():
                raw_call_details = gr.HTML()
            
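            # Raw-prediction helpers for this tab. Each benchmark tab below
            # redefines the same three functions with the benchmark name
            # hard-coded; a shared factory (e.g. a hypothetical
            # make_raw_handlers("usaco")) would remove the duplication.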
            def update_raw_task_dropdown(agent_name):
                analyzed_traces = get_analyzed_traces(agent_name, "usaco")
                if not analyzed_traces:
                    return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
                task_ids = list(analyzed_traces.keys())
                steps = analyzed_traces[task_ids[0]]['steps']
                return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)

            def update_raw_step_dropdown(agent_name, task_id):
                analyzed_traces = get_analyzed_traces(agent_name, "usaco")
                if not analyzed_traces or task_id not in analyzed_traces:
                    return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
                steps = analyzed_traces[task_id]['steps']
                return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)

            def update_raw_call_details(agent_name, task_id, step_index):
                analyzed_traces = get_analyzed_traces(agent_name, "usaco")
                if not analyzed_traces or task_id not in analyzed_traces:
                    return "No data available for this selection."
                steps = analyzed_traces[task_id]['steps']
                if step_index is None:
                    return "Invalid step selection."
                step = steps[step_index]
                return format_call_info(step, step_index)

            # Initialize the raw agent dropdown with all agents
            demo.load(update_agent_dropdown, 
                    inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)], 
                    outputs=[raw_agent_dropdown])
            demo.load(update_raw_task_dropdown,
                    inputs=[raw_agent_dropdown],
                    outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
            demo.load(update_raw_call_details,
                    inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
                    outputs=[raw_call_details])

            raw_agent_dropdown.change(update_raw_task_dropdown, 
                                    inputs=[raw_agent_dropdown], 
                                    outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
            raw_task_dropdown.change(update_raw_step_dropdown, 
                                    inputs=[raw_agent_dropdown, raw_task_dropdown], 
                                    outputs=[raw_step_dropdown, raw_call_details])
            raw_step_dropdown.change(update_raw_call_details, 
                                    inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown], 
                                    outputs=[raw_call_details])
        
        
        with gr.Tab("SWE-Bench Verified"):
            gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Verified is a human-validated subset of 500 problems reviewed by software engineers. The  We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
            with gr.Row():
                with gr.Column(scale=2):
                    Leaderboard(
                        value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'),
                        select_columns=SelectColumns(
                            default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
                            cant_deselect=["Agent Name"],
                            label="Select Columns to Display:",
                        ),
                        hide_columns=config.SWEBENCH_HIDE_COLUMNS,
                        search_columns=config.SWEBENCH_SEARCH_COLUMNS
                    )
            with gr.Row():
                scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
            
            gr.Markdown("")
            gr.Markdown("")
            gr.Markdown("## Task success heatmap")
            with gr.Row():
                task_success_heatmap = gr.Plot()
            demo.load(
                lambda: create_task_success_heatmap(
                    preprocessor.get_task_success_data('swebench_verified'),
                    'SWEBench Verified'
                ),
                outputs=[task_success_heatmap],
            )
            
            gr.Markdown("")
            gr.Markdown("")
            gr.Markdown("## Failure report for each agent")
            with gr.Row():
                with gr.Column(scale=1):
                    failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
            with gr.Row():
                with gr.Column(scale=1):
                    failure_categories_overview = gr.Markdown()
            
                with gr.Column(scale=1):
                    failure_categories_chart = gr.Plot()

            # Initialize the failure report agent dropdown with all agents
            demo.load(update_agent_dropdown, 
                    inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], 
                    outputs=[failure_report_agent_dropdown])
            
            # Update failure report when agent is selected
            failure_report_agent_dropdown.change(update_failure_report,
                                                inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
                                                outputs=[failure_categories_overview, failure_categories_chart])
            
            gr.Markdown("")
            gr.Markdown("")
            gr.Markdown("## Agent monitor")
            with gr.Row():
                with gr.Column(scale=1):
                    agent_dropdown = gr.Dropdown(label="Select Agent")
                with gr.Column(scale=1):
                    task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
            with gr.Row():
                task_overview = gr.Markdown()
            with gr.Row():
                flow_chart = gr.Plot(label="Task Flow")

            # Initialize the agent dropdown with the best agent
            demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
            demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])

            agent_dropdown.change(update_task_analysis, 
                                  inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown],
                                  outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
            task_dropdown.change(update_task_details,
                                 inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
                                 outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
            gr.Markdown("")
            gr.Markdown("")
            gr.Markdown("## Raw predictions")
            with gr.Row():
                with gr.Column(scale=1):
                    raw_agent_dropdown = gr.Dropdown(label="Select Agent")
                with gr.Column(scale=1):
                    raw_task_dropdown = gr.Dropdown(label="Select Task")
                with gr.Column(scale=1):
                    raw_step_dropdown = gr.Dropdown(label="Select Step")
            
            with gr.Row():
                raw_call_details = gr.HTML()
            
            def update_raw_task_dropdown(agent_name):
                analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
                if not analyzed_traces:
                    return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
                task_ids = list(analyzed_traces.keys())
                steps = analyzed_traces[task_ids[0]]['steps']
                return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)

            def update_raw_step_dropdown(agent_name, task_id):
                analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
                if not analyzed_traces or task_id not in analyzed_traces:
                    return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
                steps = analyzed_traces[task_id]['steps']
                return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)

            def update_raw_call_details(agent_name, task_id, step_index):
                analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
                if not analyzed_traces or task_id not in analyzed_traces:
                    return "No data available for this selection."
                steps = analyzed_traces[task_id]['steps']
                if step_index is None:
                    return "Invalid step selection."
                step = steps[step_index]
                return format_call_info(step, step_index)

            # Initialize the raw agent dropdown with all agents
            demo.load(update_agent_dropdown, 
                    inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], 
                    outputs=[raw_agent_dropdown])
            demo.load(update_raw_task_dropdown,
                    inputs=[raw_agent_dropdown],
                    outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
            demo.load(update_raw_call_details,
                    inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
                    outputs=[raw_call_details])

            raw_agent_dropdown.change(update_raw_task_dropdown, 
                                    inputs=[raw_agent_dropdown], 
                                    outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
            raw_task_dropdown.change(update_raw_step_dropdown, 
                                    inputs=[raw_agent_dropdown, raw_task_dropdown], 
                                    outputs=[raw_step_dropdown, raw_call_details])
            raw_step_dropdown.change(update_raw_call_details, 
                                    inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown], 
                                    outputs=[raw_call_details])
            
        with gr.Tab("SWE-Bench Lite"):
            gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Lite is a subset of 300 tasks of the original SWE-bench. We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
            with gr.Row():
                with gr.Column(scale=2):
                    Leaderboard(
                        value=parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'),
                        select_columns=SelectColumns(
                            default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
                            cant_deselect=["Agent Name"],
                            label="Select Columns to Display:",
                        ),
                        search_columns=config.SWEBENCH_SEARCH_COLUMNS,
                        hide_columns=config.SWEBENCH_HIDE_COLUMNS
                    )
            with gr.Row():
                scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))

            gr.Markdown("")
            gr.Markdown("")
            gr.Markdown("## Task success heatmap")
            with gr.Row():
                task_success_heatmap = gr.Plot()
            demo.load(
                lambda: create_task_success_heatmap(
                    preprocessor.get_task_success_data('swebench_lite'),
                    'SWEBench Lite'
                ),
                outputs=[task_success_heatmap],
            )
            
            gr.Markdown("")
            gr.Markdown("")
            gr.Markdown("## Failure report for each agent")
            with gr.Row():
                with gr.Column(scale=1):
                    failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
            with gr.Row():
                with gr.Column(scale=1):
                    failure_categories_overview = gr.Markdown()
            
                with gr.Column(scale=1):
                    failure_categories_chart = gr.Plot()

            # Initialize the failure report agent dropdown with all agents
            demo.load(update_agent_dropdown, 
                    inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)], 
                    outputs=[failure_report_agent_dropdown])
            
            # Update failure report when agent is selected
            failure_report_agent_dropdown.change(update_failure_report,
                                                inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)],
                                                outputs=[failure_categories_overview, failure_categories_chart])
            
            gr.Markdown("")
            gr.Markdown("")
            gr.Markdown("## Agent monitor")
            with gr.Row():
                with gr.Column(scale=1):
                    agent_dropdown = gr.Dropdown(label="Select Agent")
                with gr.Column(scale=1):
                    task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
            with gr.Row():
                task_overview = gr.Markdown()
            with gr.Row():
                flow_chart = gr.Plot(label="Task Flow")

            # Initialize the agent dropdown with the best agent
            demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
            demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])

            agent_dropdown.change(update_task_analysis, 
                                  inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown],
                                  outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
            task_dropdown.change(update_task_details,
                                 inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
                                 outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])

            gr.Markdown("")
            gr.Markdown("")
            gr.Markdown("## Raw predictions")
            with gr.Row():
                with gr.Column(scale=1):
                    raw_agent_dropdown = gr.Dropdown(label="Select Agent")
                with gr.Column(scale=1):
                    raw_task_dropdown = gr.Dropdown(label="Select Task")
                with gr.Column(scale=1):
                    raw_step_dropdown = gr.Dropdown(label="Select Step")
            
            with gr.Row():
                raw_call_details = gr.HTML()
            
            def update_raw_task_dropdown(agent_name):
                analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
                if not analyzed_traces:
                    return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
                task_ids = list(analyzed_traces.keys())
                steps = analyzed_traces[task_ids[0]]['steps']
                return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)

            def update_raw_step_dropdown(agent_name, task_id):
                analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
                if not analyzed_traces or task_id not in analyzed_traces:
                    return gr.Dropdown(choices=[], label="Select Step"), "No data available."
                steps = analyzed_traces[task_id]['steps']
                return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)

            def update_raw_call_details(agent_name, task_id, step_index):
                analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
                if not analyzed_traces or task_id not in analyzed_traces:
                    return "No data available for this selection."
                steps = analyzed_traces[task_id]['steps']
                if step_index is None:
                    return "Invalid step selection."
                step = steps[step_index]
                return format_call_info(step, step_index)

            # Initialize the raw agent dropdown with all agents
            demo.load(update_agent_dropdown, 
                    inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)], 
                    outputs=[raw_agent_dropdown])
            demo.load(update_raw_task_dropdown,
                    inputs=[raw_agent_dropdown],
                    outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
            demo.load(update_raw_call_details,
                    inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
                    outputs=[raw_call_details])

            raw_agent_dropdown.change(update_raw_task_dropdown, 
                                    inputs=[raw_agent_dropdown], 
                                    outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
            raw_task_dropdown.change(update_raw_step_dropdown, 
                                    inputs=[raw_agent_dropdown, raw_task_dropdown], 
                                    outputs=[raw_step_dropdown, raw_call_details])
            raw_step_dropdown.change(update_raw_call_details, 
                                    inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown], 
                                    outputs=[raw_call_details])
            
        with gr.Tab("MLAgentBench"):
            gr.Markdown("""MLAgentBench is a suite of end-to-end Machine Learning (ML) experimentation tasks, where the agent aims to take a given dataset and a machine learning task description and autonomously develop or improve an ML model. We are currently actively developing this platform and this benchmark is not fully implemented yet. In particular, we only include one agent and a subset of tasks for this benchmark.""")
            with gr.Row():
                with gr.Column(scale=2):
                    Leaderboard(
                        value=parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'),
                        select_columns=SelectColumns(
                            default_selection=config.MLAGENTBENCH_ON_LOAD_COLUMNS + ["Verified"],
                            cant_deselect=["Agent Name"],
                            label="Select Columns to Display:",
                        ),
                        search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
                        hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS,
                    )
            with gr.Row():
                scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench'), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"]))

            gr.Markdown("")
            gr.Markdown("")
            gr.Markdown("## Failure report for each agent")
            with gr.Row():
                with gr.Column(scale=1):
                    failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
            with gr.Row():
                with gr.Column(scale=1):
                    failure_categories_overview = gr.Markdown()
            
                with gr.Column(scale=1):
                    failure_categories_chart = gr.Plot()

            # Initialize the failure report agent dropdown with all agents
            demo.load(update_agent_dropdown, 
                    inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)], 
                    outputs=[failure_report_agent_dropdown])
            
            # Update failure report when agent is selected
            failure_report_agent_dropdown.change(update_failure_report,
                                                inputs=[failure_report_agent_dropdown, gr.Textbox(value="mlagentbench", visible=False)],
                                                outputs=[failure_categories_overview, failure_categories_chart])
            
            gr.Markdown("")
            gr.Markdown("")
            gr.Markdown("## Agent monitor")
            with gr.Row():
                with gr.Column(scale=1):
                    agent_dropdown = gr.Dropdown(label="Select Agent")
                with gr.Column(scale=1):
                    task_dropdown = gr.Dropdown(label="Select SWE-Bench Task")
            with gr.Row():
                task_overview = gr.Markdown()
            with gr.Row():
                flow_chart = gr.Plot(label="Task Flow")

            # Initialize the agent dropdown with the best agent
            demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)], outputs=[agent_dropdown])
            demo.load(update_task_analysis, inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])

            agent_dropdown.change(update_task_analysis, 
                                  inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown],
                                  outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
            task_dropdown.change(update_task_details,
                                 inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown],
                                 outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])

            gr.Markdown("")
            gr.Markdown("")
            gr.Markdown("## Raw predictions")
            with gr.Row():
                with gr.Column(scale=1):
                    raw_agent_dropdown = gr.Dropdown(label="Select Agent")
                with gr.Column(scale=1):
                    raw_task_dropdown = gr.Dropdown(label="Select Task")
                with gr.Column(scale=1):
                    raw_step_dropdown = gr.Dropdown(label="Select Step")
            
            with gr.Row():
                raw_call_details = gr.HTML()
            
            def update_raw_task_dropdown(agent_name):
                analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
                if not analyzed_traces:
                    return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
                task_ids = list(analyzed_traces.keys())
                steps = analyzed_traces[task_ids[0]]['steps']
                return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), update_raw_call_details(agent_name, task_ids[0], 0)

            def update_raw_step_dropdown(agent_name, task_id):
                analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
                if not analyzed_traces or task_id not in analyzed_traces:
                    return gr.Dropdown(choices=[], label="Select Step"), "No data available."
                steps = analyzed_traces[task_id]['steps']
                return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)

            def update_raw_call_details(agent_name, task_id, step_index):
                analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
                if not analyzed_traces or task_id not in analyzed_traces:
                    return "No data available for this selection."
                steps = analyzed_traces[task_id]['steps']
                if step_index is None:
                    return "Invalid step selection."
                step = steps[step_index]
                return format_call_info(step, step_index)

            # Initialize the raw agent dropdown with all agents
            demo.load(update_agent_dropdown, 
                    inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)], 
                    outputs=[raw_agent_dropdown])
            demo.load(update_raw_task_dropdown,
                    inputs=[raw_agent_dropdown],
                    outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
            demo.load(update_raw_call_details,
                    inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
                    outputs=[raw_call_details])

            raw_agent_dropdown.change(update_raw_task_dropdown, 
                                    inputs=[raw_agent_dropdown], 
                                    outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
            raw_task_dropdown.change(update_raw_step_dropdown, 
                                    inputs=[raw_agent_dropdown, raw_task_dropdown], 
                                    outputs=[raw_step_dropdown, raw_call_details])
            raw_step_dropdown.change(update_raw_call_details, 
                                    inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown], 
                                    outputs=[raw_call_details])
            
        
        with gr.Tab("About"):
            gr.Markdown((Path(__file__).parent / "about.md").read_text())





async def main():
    # Preprocess traces
    # preprocessor = TracePreprocessor()
    # preprocessor.preprocess_traces('evals_live')
    # preprocessor = TracePreprocessor()
    
    # Download the results from the Hugging Face Hub
    # await asyncio.to_thread(download_latest_results)

    # Check for new uploads and process them
    # await check_and_process_uploads()
    
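    # Hourly maintenance: restart the Space and re-download the latest results.
    # Upload processing is currently disabled (see the commented-out job below).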
    scheduler = AsyncIOScheduler()
    scheduler.add_job(restart_space, "interval", hours=1)
    scheduler.add_job(download_latest_results, "interval", hours=1)
    # scheduler.add_job(check_and_process_uploads, "interval", hours=1)
    scheduler.start()
    
    await demo.launch()

if __name__ == "__main__":
    weave.init(f'leaderboard_{datetime.now().strftime("%Y%m%d%H%M%S")}')
    asyncio.run(main())