Upload app.py
app.py CHANGED
@@ -87,7 +87,7 @@ def update_task_details(benchmark_name, agent_name, task_id):

    summary = analysis.get('task_analysis', {})

-   overview = f"###
+   overview = f"### Summary\n\n{summary.get('overview', 'No overview available.')}\n\n"
    # overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n"
    # overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n"
    # overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n"
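The new line 90 turns the per-task analysis dictionary into a small Markdown block. As a minimal sketch with made-up data (the real `analysis` object is produced by HAL's trace analysis, not hard-coded like this), the change behaves as follows:

```python
# Hypothetical input; the real structure comes from the app's log analysis.
analysis = {
    "task_analysis": {
        "overview": "The agent parsed the problem but timed out on the largest test case.",
    }
}

summary = analysis.get("task_analysis", {})
overview = f"### Summary\n\n{summary.get('overview', 'No overview available.')}\n\n"
print(overview)
# ### Summary
#
# The agent parsed the problem but timed out on the largest test case.
```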
@@ -283,7 +283,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
        <header class="hal-header">
            <h1 class="hal-title">Holistic Agent Leaderboard (HAL)</h1>
            <p class="hal-subtitle">
-               A
+               A standardized, cost-aware, and third-party leaderboard for evaluating agents.
            </p>
        </header>""")
    gr.HTML("""
@@ -332,21 +332,21 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:

        <div class="feature-row">
            <div class="feature-column">
-               <div class="feature-keyword">
+               <div class="feature-keyword">Standardized</div>
                <div class="feature-content">
                    <p class="feature-description">Evaluations across agent benchmarks are all recorded to a single leaderboard that evaluates every listed agent in the same way.</p>
                </div>
            </div>
            <div class="feature-column">
-               <div class="feature-keyword">
+               <div class="feature-keyword">Cost-controlled</div>
                <div class="feature-content">
-                   <p class="feature-description">
+                   <p class="feature-description">For downstream users, understanding the cost of running agents is a significant need for adoption. For agent developers, cost-controlled evaluations help develop accurate baselines.</p>
                </div>
            </div>
            <div class="feature-column">
-               <div class="feature-keyword">
+               <div class="feature-keyword">Third-party</div>
                <div class="feature-content">
-                   <p class="feature-description">
+                   <p class="feature-description">Agent developers clearly have competing objectives in reporting accuracy: they want to achieve state-of-the-art performance.</p>
                </div>
            </div>
        </div>
@@ -388,8 +388,34 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
        line-height: 1.6;
        color: #333;
    }
+   .user-type-links a {
+       display: inline-block;
+       padding: 5px 12px;
+       background-color: #f0f4f8;
+       color: #2c3e50 !important; /* Force the color change */
+       text-decoration: none !important; /* Force remove underline */
+       border-radius: 15px;
+       font-size: 0.85em;
+       transition: all 0.3s ease;
+       border: 1px solid #e1e8ed;
+   }
+   .user-type-links a:hover {
+       background-color: #3498db;
+       color: white !important; /* Force the color change on hover */
+       transform: translateY(-2px);
+       box-shadow: 0 2px 5px rgba(52, 152, 219, 0.2);
+       text-decoration: none !important; /* Ensure no underline on hover */
+   }
+   .user-type-links a:visited {
+       color: #2c3e50 !important; /* Ensure visited links have the same color */
+   }
+   .user-type-links a::before {
+       content: "→";
+       margin-right: 5px;
+       font-size: 1.1em;
+   }
    </style>
-
+
    <h2 class="section-heading">Who is it for?</h2>
    <p>We see HAL being useful for four types of users:</p>

@@ -397,21 +423,38 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
        <div class="user-type">
            <h3 class="user-type-title">Downstream Users & Procurers</h3>
            <p class="user-type-description">Customers looking to deploy agents can get visibility into existing benchmarks, know developers building useful agents, and identify the state of the art for both cost and accuracy for their tasks of interest.</p>
+           <div class="user-type-links">
+               <a href="#leaderboards">Leaderboards</a>
+           </div>
        </div>
        <div class="user-type">
            <h3 class="user-type-title">Agent Benchmark Developers</h3>
            <p class="user-type-description">Reporting results on a centralized leaderboard could allow improved visibility into agent benchmarks that measure real-world utility.</p>
+           <div class="user-type-links">
+               <a href="#benchmark-submission">Add a Benchmark</a>
+           </div>
        </div>
        <div class="user-type">
            <h3 class="user-type-title">Agent Developers</h3>
            <p class="user-type-description">HAL allows for easy reproduction of past agents, clear comparison with past baselines, and a straightforward way to compete on a leaderboard.</p>
+           <div class="user-type-links">
+               <a href="#agent-submission">Submit an Agent</a>
+               <a href="#leaderboards">Leaderboards</a>
+               <a href="#reproduction-guide">Reproduction Guide</a>
+           </div>
        </div>
        <div class="user-type">
            <h3 class="user-type-title">Safety Researchers</h3>
            <p class="user-type-description">Understanding agent capabilities on real-world safety threats and their associated costs is crucial. For example, Cybench evaluations could provide insights into agent performance and affordability for potential adversaries.</p>
+           <div class="user-type-links">
+               <a href="#cybench-results">Cybench Leaderboard (coming soon)</a>
+               <a href="#agent-monitor">Agent Monitor</a>
+           </div>
        </div>
    </div>
-
+   </br>
+   <h2 class="section-heading" id="leaderboards">Leaderboards</h2>
+   <p>Select a benchmark to see the agent leaderboard. Verified results have been run by the HAL team:</p>
    """)

    with gr.Tabs():
@@ -447,113 +490,139 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                ),
                outputs=[task_success_heatmap]
            )
-           gr.Markdown("")
-           gr.Markdown("")
-           gr.Markdown("## Failure report for each agent")
-           gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
-           with gr.Row():
-               with gr.Column(scale=1):
-                   failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
-           with gr.Row():
-               with gr.Column(scale=1):
-                   failure_categories_overview = gr.Markdown()
-
-               with gr.Column(scale=1):
-                   failure_categories_chart = gr.Plot()
-
-           # Initialize the failure report agent dropdown with all agents
-           demo.load(update_agent_dropdown,
-                     inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)],
-                     outputs=[failure_report_agent_dropdown])
-
-           # Update failure report when agent is selected
-           failure_report_agent_dropdown.change(update_failure_report,
-                     inputs=[failure_report_agent_dropdown, gr.Textbox(value="usaco", visible=False)],
-                     outputs=[failure_categories_overview, failure_categories_chart])
-
-           gr.Markdown("")
-           gr.Markdown("")
-           gr.Markdown("## Agent monitor")
-           with gr.Row():
-               with gr.Column(scale=1):
-                   agent_dropdown = gr.Dropdown(label="Select Agent")
-               with gr.Column(scale=1):
-                   task_dropdown = gr.Dropdown(label="Select USACO Task")
-           with gr.Row():
-               task_overview = gr.Markdown()
-           with gr.Row():
-               flow_chart = gr.Plot(label="Task Flow")

-           [old lines 487-496: the previous agent-monitor wiring, not rendered in the diff view]
+           gr.HTML("""
+           <style>
+               .grouped-section {
+                   border: 2px solid #dee2e6; /* Color matching unactivated tabs */
+                   border-radius: 10px;
+                   padding: 30px;
+                   margin-top: 40px;
+                   margin-bottom: 40px;
+                   position: relative;
+               }
+
+               .grouped-section-title {
+                   font-size: 1.7em;
+                   font-weight: bold;
+                   color: #2c3e50;
+                   margin-bottom: 20px;
+                   padding-bottom: 10px;
+                   border-bottom: 2px solid #dee2e6;
+               }
+           </style>
+           """)
+           with gr.Group(elem_classes=["grouped-section"]):
+               gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor")
+
+               gr.HTML('<div style="height: 10px;"></div>')
+               gr.Markdown("## Failure report for each agent")
+               gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
+               gr.HTML('<div style="height: 10px;"></div>')
+               with gr.Row():
+                   with gr.Column(scale=1):
+                       failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
+               gr.HTML('<div style="height: 10px;"></div>')
+               with gr.Row():
+                   with gr.Column(scale=1):
+                       failure_categories_overview = gr.Markdown()
+
+                   with gr.Column(scale=1):
+                       failure_categories_chart = gr.Plot()
+
+               # Initialize the failure report agent dropdown with all agents
+               demo.load(update_agent_dropdown,
+                         inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                         outputs=[failure_report_agent_dropdown])
+
+               # Update failure report when agent is selected
+               failure_report_agent_dropdown.change(update_failure_report,
+                         inputs=[failure_report_agent_dropdown, gr.Textbox(value="usaco", visible=False)],
+                         outputs=[failure_categories_overview, failure_categories_chart])
+
+               gr.HTML('<div style="height: 30px;"></div>')
+               gr.Markdown("## Task overview")
+               gr.HTML('<div style="height: 10px;"></div>')
+               with gr.Row():
+                   with gr.Column(scale=1):
+                       agent_dropdown = gr.Dropdown(label="Select Agent")
+                   with gr.Column(scale=1):
+                       task_dropdown = gr.Dropdown(label="Select USACO Task")
+               gr.HTML('<div style="height: 10px;"></div>')
+               with gr.Row():
+                   task_overview = gr.Markdown()
+               with gr.Row():
+                   flow_chart = gr.Plot(label="Task Flow")
+
+               # Initialize the agent dropdown with the best agent
+               demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
+               demo.load(update_task_analysis, inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+               agent_dropdown.change(update_task_analysis,
+                         inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown],
+                         outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+               task_dropdown.change(update_task_details,
+                         inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown, task_dropdown],
+                         outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])

-           gr.Markdown("")
-           gr.Markdown("")
            gr.Markdown("## Raw predictions")
-           [old lines 501-556: the previous raw-predictions block, not rendered in the diff view]
+           gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.')
+           with gr.Accordion("Expand to inspect raw predictions of agents...", open=False):
+               with gr.Row():
+                   with gr.Column(scale=1):
+                       raw_agent_dropdown = gr.Dropdown(label="Select Agent")
+                   with gr.Column(scale=1):
+                       raw_task_dropdown = gr.Dropdown(label="Select Task")
+                   with gr.Column(scale=1):
+                       raw_step_dropdown = gr.Dropdown(label="Select Step")
+               with gr.Row():
+                   raw_call_details = gr.HTML()
+
+               def update_raw_task_dropdown(agent_name):
+                   analyzed_traces = get_analyzed_traces(agent_name, "usaco")
+                   if not analyzed_traces:
+                       return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
+                   task_ids = list(analyzed_traces.keys())
+                   steps = analyzed_traces[task_ids[0]]['steps']
+                   return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "usaco")[task_ids[0]]['steps'][0], 0)
+
+               def update_raw_step_dropdown(agent_name, task_id):
+                   analyzed_traces = get_analyzed_traces(agent_name, "usaco")
+                   if not analyzed_traces or task_id not in analyzed_traces:
+                       return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
+                   steps = analyzed_traces[task_id]['steps']
+                   return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
+
+               def update_raw_call_details(agent_name, task_id, step_index):
+                   analyzed_traces = get_analyzed_traces(agent_name, "usaco")
+                   if not analyzed_traces or task_id not in analyzed_traces:
+                       return "No data available for this selection."
+                   steps = analyzed_traces[task_id]['steps']
+                   if step_index is None:
+                       return "Invalid step selection."
+                   step = steps[step_index]
+                   return format_call_info(step, step_index)
+
+               # Initialize the raw agent dropdown with all agents
+               demo.load(update_agent_dropdown,
+                         inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                         outputs=[raw_agent_dropdown])
+               demo.load(update_raw_task_dropdown,
+                         inputs=[raw_agent_dropdown],
+                         outputs=[raw_task_dropdown, raw_step_dropdown])
+               demo.load(update_raw_call_details,
+                         inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                         outputs=[raw_call_details])
+
+               raw_agent_dropdown.change(update_raw_task_dropdown,
+                         inputs=[raw_agent_dropdown],
+                         outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
+               raw_task_dropdown.change(update_raw_step_dropdown,
+                         inputs=[raw_agent_dropdown, raw_task_dropdown],
+                         outputs=[raw_step_dropdown, raw_call_details])
+               raw_step_dropdown.change(update_raw_call_details,
+                         inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                         outputs=[raw_call_details])


        with gr.Tab("SWE-Bench Verified"):
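This hunk leans on one Gradio pattern throughout: `demo.load` initializes the dropdowns when the page opens, and each `.change` handler returns updated components so the next dropdown in the chain repopulates. Below is a minimal, self-contained sketch of that cascading pattern; the agent and task names, `TRACES`, `list_tasks`, and `describe` are made up for illustration, whereas the real app wires `get_analyzed_traces` and `format_call_info` instead.

```python
import gradio as gr

# Hypothetical data standing in for HAL's analyzed agent traces.
TRACES = {
    "agent-a": ["task-1", "task-2"],
    "agent-b": ["task-3"],
}

def list_tasks(agent_name):
    # Repopulate the task dropdown whenever the agent selection changes.
    tasks = TRACES.get(agent_name, [])
    return gr.Dropdown(choices=tasks, value=tasks[0] if tasks else None)

def describe(agent_name, task_id):
    return f"Raw predictions for **{agent_name}** on `{task_id}` would render here."

with gr.Blocks() as demo:
    agent = gr.Dropdown(choices=list(TRACES), value="agent-a", label="Select Agent")
    task = gr.Dropdown(label="Select Task")
    details = gr.Markdown()

    # Initialize on page load, then cascade: agent -> task -> details.
    demo.load(list_tasks, inputs=[agent], outputs=[task])
    agent.change(list_tasks, inputs=[agent], outputs=[task])
    task.change(describe, inputs=[agent, task], outputs=[details])

if __name__ == "__main__":
    demo.launch()
```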
@@ -632,8 +701,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
            task_dropdown.change(update_task_details,
                      inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
                      outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
-
-           gr.Markdown("")
+
            gr.Markdown("## Raw predictions")
            with gr.Row():
                with gr.Column(scale=1):
@@ -769,8 +837,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                      inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
                      outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])

-
-           gr.Markdown("")
+
            gr.Markdown("## Raw predictions")
            with gr.Row():
                with gr.Column(scale=1):
@@ -893,8 +960,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                      inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown],
                      outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])

-
-           gr.Markdown("")
+
            gr.Markdown("## Raw predictions")
            with gr.Row():
                with gr.Column(scale=1):
@@ -958,6 +1024,14 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
    gr.Markdown((Path(__file__).parent / "about.md").read_text())


+   gr.HTML("""<h2 class="section-heading" id="agent-submission">How to add an agent?</h2>
+              <p>Below we provide a guide on how to add an agent to the leaderboard:</p>""")
+   gr.HTML("""<h2 class="section-heading" id="benchmark-submission">How to add a benchmark?</h2>
+              <p>Below we provide a guide on how to add a benchmark to the leaderboard:</p>""")
+   gr.HTML("""<h2 class="section-heading" id="reproduction-guide">How can I run evaluations?</h2>
+              <p>Below we provide a guide on how to reproduce evaluations:</p>""")
+
+


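These headings carry the `id` values that the `.user-type-links` anchors added earlier (`#agent-submission`, `#benchmark-submission`, `#reproduction-guide`, `#leaderboards`) point at, so the "Who is it for?" cards can jump straight to the matching section of the page. A stripped-down sketch of that in-page anchor pattern, using placeholder text rather than the app's real sections:

```python
import gradio as gr

# Hypothetical two-block page: a link rendered in one gr.HTML block jumps to a
# heading with a matching id rendered in another block further down the page.
with gr.Blocks() as demo:
    gr.HTML('<div class="user-type-links"><a href="#agent-submission">Submit an Agent</a></div>')
    gr.Markdown("...leaderboard tabs and the agent monitor would sit here...")
    gr.HTML('<h2 class="section-heading" id="agent-submission">How to add an agent?</h2>')

if __name__ == "__main__":
    demo.launch()
```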