benediktstroebl committed
Commit 596cb18 · verified · 1 Parent(s): 6dcfa1f

Upload app.py

Files changed (1): app.py (+193, -119)
app.py CHANGED
@@ -87,7 +87,7 @@ def update_task_details(benchmark_name, agent_name, task_id):
 
     summary = analysis.get('task_analysis', {})
 
-    overview = f"### Task Overview\n\n{summary.get('overview', 'No overview available.')}\n\n"
+    overview = f"### Summary\n\n{summary.get('overview', 'No overview available.')}\n\n"
     # overview += f"### Successes\n{summary.get('key_successes', 'No successes listed.')}\n\n"
     # overview += f"### Challenges\n{summary.get('main_challenges', 'No challenges listed.')}\n\n"
     # overview += f"### Overall Assessment\n{summary.get('overall_assessment', 'No assessment available.')}\n\n"
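Note: this hunk only renames the generated heading from "Task Overview" to "Summary"; the surrounding pattern builds a Markdown string from a nested dict, using .get fallbacks so the page still renders when a field is missing. A minimal sketch of that pattern, with a hypothetical analysis payload:

    # Minimal sketch of the overview-building pattern above.
    # The `analysis` payload is a hypothetical example, not real HAL data.
    analysis = {"task_analysis": {"overview": "The agent passed 3 of 10 test cases."}}

    summary = analysis.get('task_analysis', {})

    # .get() with a default keeps the UI rendering even when a key is absent.
    overview = f"### Summary\n\n{summary.get('overview', 'No overview available.')}\n\n"
    print(overview)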
@@ -283,7 +283,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
     <header class="hal-header">
         <h1 class="hal-title">Holistic Agent Leaderboard (HAL)</h1>
         <p class="hal-subtitle">
-            A centralized, standardized, and cost-aware leaderboard for evaluating agents.
+            A standardized, cost-aware, and third-party leaderboard for evaluating agents.
         </p>
     </header>""")
     gr.HTML("""
@@ -332,21 +332,21 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
 
     <div class="feature-row">
         <div class="feature-column">
-            <div class="feature-keyword">Centralized</div>
+            <div class="feature-keyword">Standardized</div>
             <div class="feature-content">
                 <p class="feature-description">Evaluations across agent benchmarks are all recorded to a single leaderboard that evaluates every listed agent in the same way.</p>
             </div>
         </div>
         <div class="feature-column">
-            <div class="feature-keyword">Third-party</div>
+            <div class="feature-keyword">Cost-controlled</div>
             <div class="feature-content">
-                <p class="feature-description">Agent developers clearly have competing objectives in reporting accuracy: they want to achieve state-of-the-art performance.</p>
+                <p class="feature-description">For downstream users, understanding the cost of running agents is a significant need for adoption. For agent developers, cost-controlled evaluations help develop accurate baselines.</p>
             </div>
         </div>
         <div class="feature-column">
-            <div class="feature-keyword">Cost-controlled</div>
+            <div class="feature-keyword">Third-party</div>
             <div class="feature-content">
-                <p class="feature-description">For downstream users, understanding the cost of running agents is a significant need for adoption. For agent developers, cost-controlled evaluations help develop accurate baselines.</p>
+                <p class="feature-description">Agent developers clearly have competing objectives in reporting accuracy: they want to achieve state-of-the-art performance.</p>
             </div>
         </div>
     </div>
@@ -388,8 +388,34 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
         line-height: 1.6;
         color: #333;
     }
+    .user-type-links a {
+        display: inline-block;
+        padding: 5px 12px;
+        background-color: #f0f4f8;
+        color: #2c3e50 !important; /* Force the color change */
+        text-decoration: none !important; /* Force remove underline */
+        border-radius: 15px;
+        font-size: 0.85em;
+        transition: all 0.3s ease;
+        border: 1px solid #e1e8ed;
+    }
+    .user-type-links a:hover {
+        background-color: #3498db;
+        color: white !important; /* Force the color change on hover */
+        transform: translateY(-2px);
+        box-shadow: 0 2px 5px rgba(52, 152, 219, 0.2);
+        text-decoration: none !important; /* Ensure no underline on hover */
+    }
+    .user-type-links a:visited {
+        color: #2c3e50 !important; /* Ensure visited links have the same color */
+    }
+    .user-type-links a::before {
+        content: "→";
+        margin-right: 5px;
+        font-size: 1.1em;
+    }
     </style>
-    <br/>
+
     <h2 class="section-heading">Who is it for?</h2>
     <p>We see HAL being useful for four types of users:</p>
 
@@ -397,21 +423,38 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
     <div class="user-type">
         <h3 class="user-type-title">Downstream Users & Procurers</h3>
         <p class="user-type-description">Customers looking to deploy agents can get visibility into existing benchmarks, know developers building useful agents, and identify the state of the art for both cost and accuracy for their tasks of interest.</p>
+        <div class="user-type-links">
+            <a href="#leaderboards">Leaderboards</a>
+        </div>
     </div>
     <div class="user-type">
         <h3 class="user-type-title">Agent Benchmark Developers</h3>
         <p class="user-type-description">Reporting results on a centralized leaderboard could allow improved visibility into agent benchmarks that measure real-world utility.</p>
+        <div class="user-type-links">
+            <a href="#benchmark-submission">Add a Benchmark</a>
+        </div>
     </div>
     <div class="user-type">
         <h3 class="user-type-title">Agent Developers</h3>
         <p class="user-type-description">HAL allows for easy reproduction of past agents, clear comparison with past baselines, and a straightforward way to compete on a leaderboard.</p>
+        <div class="user-type-links">
+            <a href="#agent-submission">Submit an Agent</a>
+            <a href="#leaderboards">Leaderboards</a>
+            <a href="#reproduction-guide">Reproduction Guide</a>
+        </div>
     </div>
     <div class="user-type">
         <h3 class="user-type-title">Safety Researchers</h3>
         <p class="user-type-description">Understanding agent capabilities on real-world safety threats and their associated costs is crucial. For example, Cybench evaluations could provide insights into agent performance and affordability for potential adversaries.</p>
+        <div class="user-type-links">
+            <a href="#cybench-results">Cybench Leaderboard (coming soon)</a>
+            <a href="#agent-monitor">Agent Monitor</a>
+        </div>
     </div>
     </div>
-    <br/>
+    </br>
+    <h2 class="section-heading" id="leaderboards">Leaderboards</h2>
+    <p>Select a benchmark to see the agent leaderboard. Verified results have been run by the HAL team:</p>
     """)
 
     with gr.Tabs():
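Note: the href="#..." links added in this hunk resolve against matching id="..." attributes that the commit introduces elsewhere (#leaderboards, #agent-submission, #benchmark-submission, #reproduction-guide, plus #agent-monitor via elem_id; the #cybench-results target is marked "coming soon"). A minimal sketch of this same-page anchor pattern in a Gradio app, assuming only gradio is installed:

    import gradio as gr

    # Minimal sketch of the same-page anchor pattern used in this commit:
    # a link targeting "#leaderboards" scrolls to the element with that id.
    with gr.Blocks() as demo:
        gr.HTML('<a href="#leaderboards">Leaderboards</a>')
        gr.HTML('<h2 id="leaderboards">Leaderboards</h2>')
        # elem_id gives a Gradio component an HTML id, so it can be a target too.
        gr.Markdown("# Agent monitor", elem_id="agent-monitor")

    if __name__ == "__main__":
        demo.launch()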
@@ -447,113 +490,139 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                 ),
                 outputs=[task_success_heatmap]
             )
-            gr.Markdown("")
-            gr.Markdown("")
-            gr.Markdown("## Failure report for each agent")
-            gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
-            with gr.Row():
-                with gr.Column(scale=1):
-                    failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
-            with gr.Row():
-                with gr.Column(scale=1):
-                    failure_categories_overview = gr.Markdown()
-
-                with gr.Column(scale=1):
-                    failure_categories_chart = gr.Plot()
-
-            # Initialize the failure report agent dropdown with all agents
-            demo.load(update_agent_dropdown,
-                      inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)],
-                      outputs=[failure_report_agent_dropdown])
-
-            # Update failure report when agent is selected
-            failure_report_agent_dropdown.change(update_failure_report,
-                                                 inputs=[failure_report_agent_dropdown, gr.Textbox(value="usaco", visible=False)],
-                                                 outputs=[failure_categories_overview, failure_categories_chart])
-
-            gr.Markdown("")
-            gr.Markdown("")
-            gr.Markdown("## Agent monitor")
-            with gr.Row():
-                with gr.Column(scale=1):
-                    agent_dropdown = gr.Dropdown(label="Select Agent")
-                with gr.Column(scale=1):
-                    task_dropdown = gr.Dropdown(label="Select USACO Task")
-            with gr.Row():
-                task_overview = gr.Markdown()
-            with gr.Row():
-                flow_chart = gr.Plot(label="Task Flow")
-
-            # Initialize the agent dropdown with the best agent
-            demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
-            demo.load(update_task_analysis, inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
-
-            agent_dropdown.change(update_task_analysis,
-                                  inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown],
-                                  outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
-            task_dropdown.change(update_task_details,
-                                 inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown, task_dropdown],
-                                 outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
+            gr.HTML("""
+            <style>
+            .grouped-section {
+                border: 2px solid #dee2e6; /* Color matching unactivated tabs */
+                border-radius: 10px;
+                padding: 30px;
+                margin-top: 40px;
+                margin-bottom: 40px;
+                position: relative;
+            }
+
+            .grouped-section-title {
+                font-size: 1.7em;
+                font-weight: bold;
+                color: #2c3e50;
+                margin-bottom: 20px;
+                padding-bottom: 10px;
+                border-bottom: 2px solid #dee2e6;
+            }
+            </style>
+            """)
+            with gr.Group(elem_classes=["grouped-section"]):
+                gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor")
+
+                gr.HTML('<div style="height: 10px;"></div>')
+                gr.Markdown("## Failure report for each agent")
+                gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
+                gr.HTML('<div style="height: 10px;"></div>')
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
+                gr.HTML('<div style="height: 10px;"></div>')
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        failure_categories_overview = gr.Markdown()
+
+                    with gr.Column(scale=1):
+                        failure_categories_chart = gr.Plot()
+
+                # Initialize the failure report agent dropdown with all agents
+                demo.load(update_agent_dropdown,
+                          inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                          outputs=[failure_report_agent_dropdown])
+
+                # Update failure report when agent is selected
+                failure_report_agent_dropdown.change(update_failure_report,
+                                                     inputs=[failure_report_agent_dropdown, gr.Textbox(value="usaco", visible=False)],
+                                                     outputs=[failure_categories_overview, failure_categories_chart])
+
+                gr.HTML('<div style="height: 30px;"></div>')
+                gr.Markdown("## Task overview")
+                gr.HTML('<div style="height: 10px;"></div>')
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        agent_dropdown = gr.Dropdown(label="Select Agent")
+                    with gr.Column(scale=1):
+                        task_dropdown = gr.Dropdown(label="Select USACO Task")
+                gr.HTML('<div style="height: 10px;"></div>')
+                with gr.Row():
+                    task_overview = gr.Markdown()
+                with gr.Row():
+                    flow_chart = gr.Plot(label="Task Flow")
+
+                # Initialize the agent dropdown with the best agent
+                demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
+                demo.load(update_task_analysis, inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+
+                agent_dropdown.change(update_task_analysis,
+                                      inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown],
+                                      outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
+                task_dropdown.change(update_task_details,
+                                     inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown, task_dropdown],
+                                     outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
 
-            gr.Markdown("")
-            gr.Markdown("")
             gr.Markdown("## Raw predictions")
-            with gr.Row():
-                with gr.Column(scale=1):
-                    raw_agent_dropdown = gr.Dropdown(label="Select Agent")
-                with gr.Column(scale=1):
-                    raw_task_dropdown = gr.Dropdown(label="Select Task")
-                with gr.Column(scale=1):
-                    raw_step_dropdown = gr.Dropdown(label="Select Step")
-
-            with gr.Row():
-                raw_call_details = gr.HTML()
-
-            def update_raw_task_dropdown(agent_name):
-                analyzed_traces = get_analyzed_traces(agent_name, "usaco")
-                if not analyzed_traces:
-                    return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
-                task_ids = list(analyzed_traces.keys())
-                steps = analyzed_traces[task_ids[0]]['steps']
-                return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "usaco")[task_ids[0]]['steps'][0], 0)
-
-            def update_raw_step_dropdown(agent_name, task_id):
-                analyzed_traces = get_analyzed_traces(agent_name, "usaco")
-                if not analyzed_traces or task_id not in analyzed_traces:
-                    return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
-                steps = analyzed_traces[task_id]['steps']
-                return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
-
-            def update_raw_call_details(agent_name, task_id, step_index):
-                analyzed_traces = get_analyzed_traces(agent_name, "usaco")
-                if not analyzed_traces or task_id not in analyzed_traces:
-                    return "No data available for this selection."
-                steps = analyzed_traces[task_id]['steps']
-                if step_index is None:
-                    return "Invalid step selection."
-                step = steps[step_index]
-                return format_call_info(step, step_index)
-
-            # Initialize the raw agent dropdown with all agents
-            demo.load(update_agent_dropdown,
-                      inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)],
-                      outputs=[raw_agent_dropdown])
-            demo.load(update_raw_task_dropdown,
-                      inputs=[raw_agent_dropdown],
-                      outputs=[raw_task_dropdown, raw_step_dropdown])
-            demo.load(update_raw_call_details,
-                      inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
-                      outputs=[raw_call_details])
-
-            raw_agent_dropdown.change(update_raw_task_dropdown,
-                                      inputs=[raw_agent_dropdown],
-                                      outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
-            raw_task_dropdown.change(update_raw_step_dropdown,
-                                     inputs=[raw_agent_dropdown, raw_task_dropdown],
-                                     outputs=[raw_step_dropdown, raw_call_details])
-            raw_step_dropdown.change(update_raw_call_details,
-                                     inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
-                                     outputs=[raw_call_details])
+            gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.')
+            with gr.Accordion("Expand to inspect raw predictions of agents...", open=False):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        raw_agent_dropdown = gr.Dropdown(label="Select Agent")
+                    with gr.Column(scale=1):
+                        raw_task_dropdown = gr.Dropdown(label="Select Task")
+                    with gr.Column(scale=1):
+                        raw_step_dropdown = gr.Dropdown(label="Select Step")
+                with gr.Row():
+                    raw_call_details = gr.HTML()
+
+                def update_raw_task_dropdown(agent_name):
+                    analyzed_traces = get_analyzed_traces(agent_name, "usaco")
+                    if not analyzed_traces:
+                        return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
+                    task_ids = list(analyzed_traces.keys())
+                    steps = analyzed_traces[task_ids[0]]['steps']
+                    return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "usaco")[task_ids[0]]['steps'][0], 0)
+
+                def update_raw_step_dropdown(agent_name, task_id):
+                    analyzed_traces = get_analyzed_traces(agent_name, "usaco")
+                    if not analyzed_traces or task_id not in analyzed_traces:
+                        return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
+                    steps = analyzed_traces[task_id]['steps']
+                    return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
+
+                def update_raw_call_details(agent_name, task_id, step_index):
+                    analyzed_traces = get_analyzed_traces(agent_name, "usaco")
+                    if not analyzed_traces or task_id not in analyzed_traces:
+                        return "No data available for this selection."
+                    steps = analyzed_traces[task_id]['steps']
+                    if step_index is None:
+                        return "Invalid step selection."
+                    step = steps[step_index]
+                    return format_call_info(step, step_index)
+
+                # Initialize the raw agent dropdown with all agents
+                demo.load(update_agent_dropdown,
+                          inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)],
+                          outputs=[raw_agent_dropdown])
+                demo.load(update_raw_task_dropdown,
+                          inputs=[raw_agent_dropdown],
+                          outputs=[raw_task_dropdown, raw_step_dropdown])
+                demo.load(update_raw_call_details,
+                          inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                          outputs=[raw_call_details])
+
+                raw_agent_dropdown.change(update_raw_task_dropdown,
+                                          inputs=[raw_agent_dropdown],
+                                          outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
+                raw_task_dropdown.change(update_raw_step_dropdown,
+                                         inputs=[raw_agent_dropdown, raw_task_dropdown],
+                                         outputs=[raw_step_dropdown, raw_call_details])
+                raw_step_dropdown.change(update_raw_call_details,
+                                         inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
+                                         outputs=[raw_call_details])
 
 
     with gr.Tab("SWE-Bench Verified"):
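Note: the rewiring above follows Gradio's event-chaining idiom: demo.load(...) seeds the dropdowns when the page loads, and each .change handler returns fresh component instances that replace the dropdowns downstream of it, while gr.Accordion(open=False) keeps the heavy raw-prediction view collapsed by default. A minimal self-contained sketch of that idiom (the agent/task data is invented for illustration, not real HAL data):

    import gradio as gr

    # Hypothetical data standing in for agents -> tasks.
    DATA = {"agent_a": ["task_1", "task_2"], "agent_b": ["task_3"]}

    def init_agents():
        agents = list(DATA)
        return gr.Dropdown(choices=agents, value=agents[0])

    def update_tasks(agent):
        # Returning a new Dropdown replaces the child's choices, mirroring
        # how raw_agent_dropdown.change refreshes raw_task_dropdown above.
        tasks = DATA.get(agent, [])
        return gr.Dropdown(choices=tasks, value=tasks[0] if tasks else None)

    with gr.Blocks() as demo:
        with gr.Accordion("Expand to inspect raw predictions...", open=False):
            agent_dd = gr.Dropdown(label="Select Agent")
            task_dd = gr.Dropdown(label="Select Task")

        demo.load(init_agents, outputs=[agent_dd])  # seed on page load
        demo.load(update_tasks, inputs=[agent_dd], outputs=[task_dd])
        agent_dd.change(update_tasks, inputs=[agent_dd], outputs=[task_dd])

    if __name__ == "__main__":
        demo.launch()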
@@ -632,8 +701,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
             task_dropdown.change(update_task_details,
                                  inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
                                  outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
-            gr.Markdown("")
-            gr.Markdown("")
+
             gr.Markdown("## Raw predictions")
             with gr.Row():
                 with gr.Column(scale=1):
@@ -769,8 +837,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                                  inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
                                  outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
 
-            gr.Markdown("")
-            gr.Markdown("")
+
             gr.Markdown("## Raw predictions")
             with gr.Row():
                 with gr.Column(scale=1):
@@ -893,8 +960,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                                  inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown],
                                  outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
 
-            gr.Markdown("")
-            gr.Markdown("")
+
             gr.Markdown("## Raw predictions")
             with gr.Row():
                 with gr.Column(scale=1):
@@ -958,6 +1024,14 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
     gr.Markdown((Path(__file__).parent / "about.md").read_text())
 
 
+    gr.HTML("""<h2 class="section-heading" id="agent-submission">How to add an agent?</h2>
+               <p>Below we provide a guide on how to add an agent to the leaderboard:</p>""")
+    gr.HTML("""<h2 class="section-heading" id="benchmark-submission">How to add a benchmark?</h2>
+               <p>Below we provide a guide on how to add a benchmark to the leaderboard:</p>""")
+    gr.HTML("""<h2 class="section-heading" id="reproduction-guide">How can I run evaluations?</h2>
+               <p>Below we provide a guide on how to reproduce evaluations:</p>""")
+
+
 
 