Zachary Siegel committed on
Commit 2faf3bd · 1 Parent(s): de4df51

core bench outline

Files changed (1)
  1. app.py +48 -757
app.py CHANGED
@@ -281,9 +281,9 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
281
  </style>
282
 
283
  <header class="hal-header">
284
- <h1 class="hal-title">Holistic Agent Leaderboard (HAL)</h1>
285
  <p class="hal-subtitle">
286
- A standardized, cost-aware, and third-party leaderboard for evaluating agents.
287
  </p>
288
  </header>""")
289
  gr.HTML("""
@@ -300,11 +300,11 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
300
  background-color: #ffffff;
301
  border-radius: 10px;
302
  margin: 0 15px;
303
- text-align: left;
304
  box-shadow: 0 6px 12px rgba(0, 0, 0, 0.1);
305
  display: flex;
306
  flex-direction: column;
307
- align-items: flex-start;
308
  border-top: 5px solid #3498db;
309
  transition: transform 0.3s ease, box-shadow 0.3s ease;
310
  }
@@ -316,7 +316,7 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
316
  font-size: 1.2em;
317
  font-weight: bold;
318
  color: #1b9e77;
319
- margin-bottom: 10px;
320
  text-transform: uppercase;
321
  letter-spacing: 1px;
322
  }
@@ -328,128 +328,52 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
328
  line-height: 1.6;
329
  color: #333;
330
  }
331
  </style>
332
 
333
  <div class="feature-row">
334
  <div class="feature-column">
335
- <div class="feature-keyword">Standardized</div>
336
  <div class="feature-content">
337
- <p class="feature-description">Evaluations across agent benchmarks are all recorded to a single leaderboard that evaluates every listed agent in the same way.</p>
 
 
338
  </div>
339
  </div>
340
  <div class="feature-column">
341
- <div class="feature-keyword">Cost-controlled</div>
342
  <div class="feature-content">
343
- <p class="feature-description">For downstream users, understanding the cost of running agents is a significant need for adoption. For agent developers, cost-controlled evaluations help develop accurate baselines.</p>
 
 
344
  </div>
345
  </div>
346
  <div class="feature-column">
347
- <div class="feature-keyword">Third-party</div>
348
  <div class="feature-content">
349
- <p class="feature-description">Agent developers clearly have competing objectives in reporting accuracy: they want to achieve state-of-the-art performance.</p>
350
- </div>
351
- </div>
352
- </div>
353
- <style>
354
- .section-heading {
355
- font-size: 1.8em;
356
- font-weight: bold;
357
- color: #2c3e50;
358
- margin-top: 40px;
359
- margin-bottom: 20px;
360
- text-align: left;
361
- }
362
- .user-types-container {
363
- display: grid;
364
- grid-template-columns: repeat(2, 1fr);
365
- gap: 20px;
366
- margin-top: 20px;
367
- }
368
- .user-type {
369
- background-color: #ffffff;
370
- border-radius: 10px;
371
- padding: 25px;
372
- box-shadow: 0 6px 12px rgba(0, 0, 0, 0.1);
373
- transition: transform 0.3s ease, box-shadow 0.3s ease;
374
- border-left: 5px solid #3498db;
375
- }
376
- .user-type:hover {
377
- transform: translateY(-5px);
378
- box-shadow: 0 5px 10px rgba(0, 0, 0, 0.15);
379
- }
380
- .user-type-title {
381
- font-size: 1.2em;
382
- font-weight: bold;
383
- color: #3498db;
384
- margin-bottom: 10px;
385
- }
386
- .user-type-description {
387
- font-size: 0.95em;
388
- line-height: 1.6;
389
- color: #333;
390
- }
391
- .user-type-links a {
392
- display: inline-block;
393
- padding: 5px 12px;
394
- margin-bottom: 5px;
395
- background-color: #f0f4f8;
396
- color: #2c3e50 !important; /* Force the color change */
397
- text-decoration: none !important; /* Force remove underline */
398
- border-radius: 15px;
399
- font-size: 0.85em;
400
- transition: all 0.3s ease;
401
- border: 1px solid #e1e8ed;
402
- }
403
- .user-type-links a:hover {
404
- background-color: #3498db;
405
- color: white !important; /* Force the color change on hover */
406
- transform: translateY(-2px);
407
- box-shadow: 0 2px 5px rgba(52, 152, 219, 0.2);
408
- text-decoration: none !important; /* Ensure no underline on hover */
409
- }
410
- .user-type-links a:visited {
411
- color: #2c3e50 !important; /* Ensure visited links have the same color */
412
- }
413
- .user-type-links a::before {
414
- content: "→";
415
- margin-right: 5px;
416
- font-size: 1.1em;
417
- }
418
- </style>
419
-
420
- <h2 class="section-heading">Who is it for?</h2>
421
- <p>We see HAL being useful for four types of users:</p>
422
-
423
- <div class="user-types-container">
424
- <div class="user-type">
425
- <h3 class="user-type-title">Downstream Users & Procurers</h3>
426
- <p class="user-type-description">Customers looking to deploy agents can get visibility into existing benchmarks, know developers building useful agents, and identify the state of the art for both cost and accuracy for their tasks of interest.</p>
427
- <div class="user-type-links">
428
- <a href="#leaderboards">Leaderboards</a>
429
- </div>
430
- </div>
431
- <div class="user-type">
432
- <h3 class="user-type-title">Agent Benchmark Developers</h3>
433
- <p class="user-type-description">Reporting results on a centralized leaderboard could allow improved visibility into agent benchmarks that measure real-world utility.</p>
434
- <div class="user-type-links">
435
- <a href="#benchmark-submission">Add a Benchmark</a>
436
- </div>
437
- </div>
438
- <div class="user-type">
439
- <h3 class="user-type-title">Agent Developers</h3>
440
- <p class="user-type-description">HAL allows for easy reproduction of past agents, clear comparison with past baselines, and a straightforward way to compete on a leaderboard.</p>
441
- <div class="user-type-links">
442
- <a href="#agent-submission">Submit an Agent</a>
443
- <a href="#leaderboards">Leaderboards</a>
444
- <a href="#reproduction-guide">Reproduction Guide</a>
445
- </div>
446
- </div>
447
- <div class="user-type">
448
- <h3 class="user-type-title">Safety Researchers</h3>
449
- <p class="user-type-description">Understanding agent capabilities on real-world safety threats and their associated costs is crucial. For example, Cybench evaluations could provide insights into agent performance and affordability for potential adversaries.</p>
450
- <div class="user-type-links">
451
- <a href="#cybench-results">Cybench Leaderboard (coming soon)</a>
452
- <a href="#agent-monitor">Agent Monitor</a>
453
  </div>
454
  </div>
455
  </div>
@@ -459,8 +383,16 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
459
  """)
460
 
461
  with gr.Tabs() as tabs:
462
- with gr.Tab("USACO"):
463
- gr.Markdown("""The USA Computing Olympiad (USACO) is a computer programming competition for pre-college students. This benchmark evaluates the performance of AI agents on a set of 307 USACO tasks. The agents are evaluated based on the number of tasks correctly solved.""")
464
  with gr.Row():
465
  with gr.Column(scale=2):
466
  Leaderboard(
@@ -492,643 +424,6 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
492
  outputs=[task_success_heatmap]
493
  )
494
 
495
- gr.HTML("""
496
- <style>
497
- .grouped-section {
498
- border: 2px solid #dee2e6; /* Color matching unactivated tabs */
499
- border-radius: 10px;
500
- padding: 30px;
501
- margin-top: 40px;
502
- margin-bottom: 40px;
503
- position: relative;
504
- }
505
-
506
- .grouped-section-title {
507
- font-size: 1.7em;
508
- font-weight: bold;
509
- color: #2c3e50;
510
- margin-bottom: 20px;
511
- padding-bottom: 10px;
512
- border-bottom: 2px solid #dee2e6;
513
- }
514
- </style>
515
- """)
516
- with gr.Group(elem_classes=["grouped-section"]):
517
- gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor")
518
- gr.Markdown('The agent monitor provides an overview of the recurring errors an agent makes as well as a summary of the steps the agent takes to solve a task. It currently consists of two main components:')
519
-
520
- gr.HTML('<div style="height: 10px;"></div>')
521
- gr.Markdown("## Failure report for each agent")
522
- gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
523
- gr.HTML('<div style="height: 10px;"></div>')
524
- with gr.Row():
525
- with gr.Column(scale=1):
526
- failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
527
- gr.HTML('<div style="height: 10px;"></div>')
528
- with gr.Row():
529
- with gr.Column(scale=1):
530
- failure_categories_overview = gr.Markdown()
531
-
532
- with gr.Column(scale=1):
533
- failure_categories_chart = gr.Plot()
534
-
535
- # Initialize the failure report agent dropdown with all agents
536
- demo.load(update_agent_dropdown,
537
- inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)],
538
- outputs=[failure_report_agent_dropdown])
539
-
540
- # Update failure report when agent is selected
541
- failure_report_agent_dropdown.change(update_failure_report,
542
- inputs=[failure_report_agent_dropdown, gr.Textbox(value="usaco", visible=False)],
543
- outputs=[failure_categories_overview, failure_categories_chart])
544
-
545
- gr.HTML('<div style="height: 30px;"></div>')
546
- gr.Markdown("## Task overview")
547
- gr.HTML('<div style="height: 10px;"></div>')
548
- with gr.Row():
549
- with gr.Column(scale=1):
550
- agent_dropdown = gr.Dropdown(label="Select Agent")
551
- with gr.Column(scale=1):
552
- task_dropdown = gr.Dropdown(label="Select USACO Task")
553
- gr.HTML('<div style="height: 10px;"></div>')
554
- with gr.Row():
555
- task_overview = gr.Markdown()
556
- with gr.Row():
557
- flow_chart = gr.Plot(label="Task Flow")
558
-
559
- # Initialize the agent dropdown with the best agent
560
- demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
561
- demo.load(update_task_analysis, inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
562
-
563
- agent_dropdown.change(update_task_analysis,
564
- inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown],
565
- outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
566
- task_dropdown.change(update_task_details,
567
- inputs=[gr.Textbox(value="usaco", visible=False), agent_dropdown, task_dropdown],
568
- outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
569
-
570
- gr.Markdown("## Raw predictions")
571
- gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.')
572
- with gr.Accordion("Expand to inspect raw predictions of agents...", open=False):
573
- with gr.Row():
574
- with gr.Column(scale=1):
575
- raw_agent_dropdown = gr.Dropdown(label="Select Agent")
576
- with gr.Column(scale=1):
577
- raw_task_dropdown = gr.Dropdown(label="Select Task")
578
- with gr.Column(scale=1):
579
- raw_step_dropdown = gr.Dropdown(label="Select Step")
580
- with gr.Row():
581
- raw_call_details = gr.HTML()
582
-
583
- def update_raw_task_dropdown(agent_name):
584
- analyzed_traces = get_analyzed_traces(agent_name, "usaco")
585
- if not analyzed_traces:
586
- return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
587
- task_ids = list(analyzed_traces.keys())
588
- steps = analyzed_traces[task_ids[0]]['steps']
589
- return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "usaco")[task_ids[0]]['steps'][0], 0)
590
-
591
- def update_raw_step_dropdown(agent_name, task_id):
592
- analyzed_traces = get_analyzed_traces(agent_name, "usaco")
593
- if not analyzed_traces or task_id not in analyzed_traces:
594
- return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
595
- steps = analyzed_traces[task_id]['steps']
596
- return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
597
-
598
- def update_raw_call_details(agent_name, task_id, step_index):
599
- analyzed_traces = get_analyzed_traces(agent_name, "usaco")
600
- if not analyzed_traces or task_id not in analyzed_traces:
601
- return "No data available for this selection."
602
- steps = analyzed_traces[task_id]['steps']
603
- if step_index is None:
604
- return "Invalid step selection."
605
- step = steps[step_index]
606
- return format_call_info(step, step_index)
607
-
608
- # Initialize the raw agent dropdown with all agents
609
- demo.load(update_agent_dropdown,
610
- inputs=[gr.Textbox(value="usaco", visible=False), gr.Textbox(value="Accuracy", visible=False)],
611
- outputs=[raw_agent_dropdown])
612
- demo.load(update_raw_task_dropdown,
613
- inputs=[raw_agent_dropdown],
614
- outputs=[raw_task_dropdown, raw_step_dropdown])
615
- demo.load(update_raw_call_details,
616
- inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
617
- outputs=[raw_call_details])
618
-
619
- raw_agent_dropdown.change(update_raw_task_dropdown,
620
- inputs=[raw_agent_dropdown],
621
- outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
622
- raw_task_dropdown.change(update_raw_step_dropdown,
623
- inputs=[raw_agent_dropdown, raw_task_dropdown],
624
- outputs=[raw_step_dropdown, raw_call_details])
625
- raw_step_dropdown.change(update_raw_call_details,
626
- inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
627
- outputs=[raw_call_details])
628
-
629
- with gr.Tab("SWE-bench Verified"):
630
- gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Verified is a human-validated subset of 500 problems reviewed by software engineers. The We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
631
- with gr.Row():
632
- with gr.Column(scale=2):
633
- Leaderboard(
634
- value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified'), ci_metrics=["Accuracy", "Total Cost"]),
635
- select_columns=SelectColumns(
636
- default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
637
- cant_deselect=["Agent Name"],
638
- label="Select Columns to Display:",
639
- ),
640
- hide_columns=config.SWEBENCH_HIDE_COLUMNS,
641
- search_columns=config.SWEBENCH_SEARCH_COLUMNS,
642
- )
643
- gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
644
- with gr.Row():
645
- gr.Markdown("### Accuracy vs. Cost for SWE-bench agents")
646
- with gr.Row():
647
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_verified', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
648
-
649
- gr.HTML('<div style="height: 30px;"></div>')
650
- gr.Markdown("## Task success heatmap")
651
- gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in SWE-bench are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
652
- with gr.Row():
653
- task_success_heatmap = gr.Plot()
654
- demo.load(
655
- lambda: create_task_success_heatmap(
656
- preprocessor.get_task_success_data('swebench_verified'),
657
- 'SWE-bench Verified'
658
- ),
659
- outputs=[task_success_heatmap]
660
- )
661
-
662
- gr.HTML("""
663
- <style>
664
- .grouped-section {
665
- border: 2px solid #dee2e6; /* Color matching unactivated tabs */
666
- border-radius: 10px;
667
- padding: 30px;
668
- margin-top: 40px;
669
- margin-bottom: 40px;
670
- position: relative;
671
- }
672
-
673
- .grouped-section-title {
674
- font-size: 1.7em;
675
- font-weight: bold;
676
- color: #2c3e50;
677
- margin-bottom: 20px;
678
- padding-bottom: 10px;
679
- border-bottom: 2px solid #dee2e6;
680
- }
681
- </style>
682
- """)
683
- with gr.Group(elem_classes=["grouped-section"]):
684
- gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor")
685
-
686
- gr.HTML('<div style="height: 10px;"></div>')
687
- gr.Markdown("## Failure report for each agent")
688
- gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
689
- gr.HTML('<div style="height: 10px;"></div>')
690
- with gr.Row():
691
- with gr.Column(scale=1):
692
- failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
693
- gr.HTML('<div style="height: 10px;"></div>')
694
- with gr.Row():
695
- with gr.Column(scale=1):
696
- failure_categories_overview = gr.Markdown()
697
-
698
- with gr.Column(scale=1):
699
- failure_categories_chart = gr.Plot()
700
-
701
- # Initialize the failure report agent dropdown with all agents
702
- demo.load(update_agent_dropdown,
703
- inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
704
- outputs=[failure_report_agent_dropdown])
705
-
706
- # Update failure report when agent is selected
707
- failure_report_agent_dropdown.change(update_failure_report,
708
- inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_verified", visible=False)],
709
- outputs=[failure_categories_overview, failure_categories_chart])
710
-
711
- gr.HTML('<div style="height: 30px;"></div>')
712
- gr.Markdown("## Task overview")
713
- gr.HTML('<div style="height: 10px;"></div>')
714
- with gr.Row():
715
- with gr.Column(scale=1):
716
- agent_dropdown = gr.Dropdown(label="Select Agent")
717
- with gr.Column(scale=1):
718
- task_dropdown = gr.Dropdown(label="Select SWE-bench Verified Task")
719
- gr.HTML('<div style="height: 10px;"></div>')
720
- with gr.Row():
721
- task_overview = gr.Markdown()
722
- with gr.Row():
723
- flow_chart = gr.Plot(label="Task Flow")
724
-
725
- # Initialize the agent dropdown with the best agent
726
- demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
727
- demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
728
-
729
- agent_dropdown.change(update_task_analysis,
730
- inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown],
731
- outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
732
- task_dropdown.change(update_task_details,
733
- inputs=[gr.Textbox(value="swebench_verified", visible=False), agent_dropdown, task_dropdown],
734
- outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
735
-
736
- gr.Markdown("## Raw predictions")
737
- gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.')
738
- with gr.Accordion("Expand to inspect raw predictions of agents...", open=False):
739
- with gr.Row():
740
- with gr.Column(scale=1):
741
- raw_agent_dropdown = gr.Dropdown(label="Select Agent")
742
- with gr.Column(scale=1):
743
- raw_task_dropdown = gr.Dropdown(label="Select Task")
744
- with gr.Column(scale=1):
745
- raw_step_dropdown = gr.Dropdown(label="Select Step")
746
- with gr.Row():
747
- raw_call_details = gr.HTML()
748
-
749
- def update_raw_task_dropdown(agent_name):
750
- analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
751
- if not analyzed_traces:
752
- return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
753
- task_ids = list(analyzed_traces.keys())
754
- steps = analyzed_traces[task_ids[0]]['steps']
755
- return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "swebench_verified")[task_ids[0]]['steps'][0], 0)
756
-
757
- def update_raw_step_dropdown(agent_name, task_id):
758
- analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
759
- if not analyzed_traces or task_id not in analyzed_traces:
760
- return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
761
- steps = analyzed_traces[task_id]['steps']
762
- return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
763
-
764
- def update_raw_call_details(agent_name, task_id, step_index):
765
- analyzed_traces = get_analyzed_traces(agent_name, "swebench_verified")
766
- if not analyzed_traces or task_id not in analyzed_traces:
767
- return "No data available for this selection."
768
- steps = analyzed_traces[task_id]['steps']
769
- if step_index is None:
770
- return "Invalid step selection."
771
- step = steps[step_index]
772
- return format_call_info(step, step_index)
773
-
774
- # Initialize the raw agent dropdown with all agents
775
- demo.load(update_agent_dropdown,
776
- inputs=[gr.Textbox(value="swebench_verified", visible=False), gr.Textbox(value="Accuracy", visible=False)],
777
- outputs=[raw_agent_dropdown])
778
- demo.load(update_raw_task_dropdown,
779
- inputs=[raw_agent_dropdown],
780
- outputs=[raw_task_dropdown, raw_step_dropdown])
781
- demo.load(update_raw_call_details,
782
- inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
783
- outputs=[raw_call_details])
784
-
785
- raw_agent_dropdown.change(update_raw_task_dropdown,
786
- inputs=[raw_agent_dropdown],
787
- outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
788
- raw_task_dropdown.change(update_raw_step_dropdown,
789
- inputs=[raw_agent_dropdown, raw_task_dropdown],
790
- outputs=[raw_step_dropdown, raw_call_details])
791
- raw_step_dropdown.change(update_raw_call_details,
792
- inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
793
- outputs=[raw_call_details])
794
-
795
-
796
- with gr.Tab("SWE-bench Lite"):
797
- gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Lite is a subset of 300 tasks of the original SWE-bench. We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
798
- with gr.Row():
799
- with gr.Column(scale=2):
800
- Leaderboard(
801
- value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite'), ci_metrics=["Accuracy", "Total Cost"]),
802
- select_columns=SelectColumns(
803
- default_selection=config.SWEBENCH_ON_LOAD_COLUMNS + ["Verified"],
804
- cant_deselect=["Agent Name"],
805
- label="Select Columns to Display:",
806
- ),
807
- hide_columns=config.SWEBENCH_HIDE_COLUMNS,
808
- search_columns=config.SWEBENCH_SEARCH_COLUMNS,
809
- )
810
- gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
811
- with gr.Row():
812
- gr.Markdown("### Accuracy vs. Cost for SWE-bench agents")
813
- with gr.Row():
814
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'swebench_lite', aggregate=False), "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"]))
815
-
816
- gr.HTML('<div style="height: 30px;"></div>')
817
- gr.Markdown("## Task success heatmap")
818
- gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in SWE-bench are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
819
- with gr.Row():
820
- task_success_heatmap = gr.Plot()
821
- demo.load(
822
- lambda: create_task_success_heatmap(
823
- preprocessor.get_task_success_data('swebench_lite'),
824
- 'SWE-bench Lite'
825
- ),
826
- outputs=[task_success_heatmap]
827
- )
828
-
829
- gr.HTML("""
830
- <style>
831
- .grouped-section {
832
- border: 2px solid #dee2e6; /* Color matching unactivated tabs */
833
- border-radius: 10px;
834
- padding: 30px;
835
- margin-top: 40px;
836
- margin-bottom: 40px;
837
- position: relative;
838
- }
839
-
840
- .grouped-section-title {
841
- font-size: 1.7em;
842
- font-weight: bold;
843
- color: #2c3e50;
844
- margin-bottom: 20px;
845
- padding-bottom: 10px;
846
- border-bottom: 2px solid #dee2e6;
847
- }
848
- </style>
849
- """)
850
- with gr.Group(elem_classes=["grouped-section"]):
851
- gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor")
852
-
853
- gr.HTML('<div style="height: 10px;"></div>')
854
- gr.Markdown("## Failure report for each agent")
855
- gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
856
- gr.HTML('<div style="height: 10px;"></div>')
857
- with gr.Row():
858
- with gr.Column(scale=1):
859
- failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
860
- gr.HTML('<div style="height: 10px;"></div>')
861
- with gr.Row():
862
- with gr.Column(scale=1):
863
- failure_categories_overview = gr.Markdown()
864
-
865
- with gr.Column(scale=1):
866
- failure_categories_chart = gr.Plot()
867
-
868
- # Initialize the failure report agent dropdown with all agents
869
- demo.load(update_agent_dropdown,
870
- inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
871
- outputs=[failure_report_agent_dropdown])
872
-
873
- # Update failure report when agent is selected
874
- failure_report_agent_dropdown.change(update_failure_report,
875
- inputs=[failure_report_agent_dropdown, gr.Textbox(value="swebench_lite", visible=False)],
876
- outputs=[failure_categories_overview, failure_categories_chart])
877
-
878
- gr.HTML('<div style="height: 30px;"></div>')
879
- gr.Markdown("## Task overview")
880
- gr.HTML('<div style="height: 10px;"></div>')
881
- with gr.Row():
882
- with gr.Column(scale=1):
883
- agent_dropdown = gr.Dropdown(label="Select Agent")
884
- with gr.Column(scale=1):
885
- task_dropdown = gr.Dropdown(label="Select SWE-bench Lite Task")
886
- gr.HTML('<div style="height: 10px;"></div>')
887
- with gr.Row():
888
- task_overview = gr.Markdown()
889
- with gr.Row():
890
- flow_chart = gr.Plot(label="Task Flow")
891
-
892
- # Initialize the agent dropdown with the best agent
893
- demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)], outputs=[agent_dropdown])
894
- demo.load(update_task_analysis, inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
895
-
896
- agent_dropdown.change(update_task_analysis,
897
- inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown],
898
- outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
899
- task_dropdown.change(update_task_details,
900
- inputs=[gr.Textbox(value="swebench_lite", visible=False), agent_dropdown, task_dropdown],
901
- outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
902
-
903
- gr.Markdown("## Raw predictions")
904
- gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.')
905
- with gr.Accordion("Expand to inspect raw predictions of agents...", open=False):
906
- with gr.Row():
907
- with gr.Column(scale=1):
908
- raw_agent_dropdown = gr.Dropdown(label="Select Agent")
909
- with gr.Column(scale=1):
910
- raw_task_dropdown = gr.Dropdown(label="Select Task")
911
- with gr.Column(scale=1):
912
- raw_step_dropdown = gr.Dropdown(label="Select Step")
913
- with gr.Row():
914
- raw_call_details = gr.HTML()
915
-
916
- def update_raw_task_dropdown(agent_name):
917
- analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
918
- if not analyzed_traces:
919
- return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
920
- task_ids = list(analyzed_traces.keys())
921
- steps = analyzed_traces[task_ids[0]]['steps']
922
- return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "swebench_lite")[task_ids[0]]['steps'][0], 0)
923
-
924
- def update_raw_step_dropdown(agent_name, task_id):
925
- analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
926
- if not analyzed_traces or task_id not in analyzed_traces:
927
- return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
928
- steps = analyzed_traces[task_id]['steps']
929
- return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
930
-
931
- def update_raw_call_details(agent_name, task_id, step_index):
932
- analyzed_traces = get_analyzed_traces(agent_name, "swebench_lite")
933
- if not analyzed_traces or task_id not in analyzed_traces:
934
- return "No data available for this selection."
935
- steps = analyzed_traces[task_id]['steps']
936
- if step_index is None:
937
- return "Invalid step selection."
938
- step = steps[step_index]
939
- return format_call_info(step, step_index)
940
-
941
- # Initialize the raw agent dropdown with all agents
942
- demo.load(update_agent_dropdown,
943
- inputs=[gr.Textbox(value="swebench_lite", visible=False), gr.Textbox(value="Accuracy", visible=False)],
944
- outputs=[raw_agent_dropdown])
945
- demo.load(update_raw_task_dropdown,
946
- inputs=[raw_agent_dropdown],
947
- outputs=[raw_task_dropdown, raw_step_dropdown])
948
- demo.load(update_raw_call_details,
949
- inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
950
- outputs=[raw_call_details])
951
-
952
- raw_agent_dropdown.change(update_raw_task_dropdown,
953
- inputs=[raw_agent_dropdown],
954
- outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
955
- raw_task_dropdown.change(update_raw_step_dropdown,
956
- inputs=[raw_agent_dropdown, raw_task_dropdown],
957
- outputs=[raw_step_dropdown, raw_call_details])
958
- raw_step_dropdown.change(update_raw_call_details,
959
- inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
960
- outputs=[raw_call_details])
961
-
962
-
963
-
964
- with gr.Tab("MLAgentBench"):
965
- gr.Markdown("""MLAgentBench is a suite of end-to-end Machine Learning (ML) experimentation tasks, where the agent aims to take a given dataset and a machine learning task description and autonomously develop or improve an ML model. We are currently actively developing this platform and this benchmark is not fully implemented yet. In particular, we only include one agent and a subset of tasks for this benchmark.""")
966
- with gr.Row():
967
- with gr.Column(scale=2):
968
- Leaderboard(
969
- value=create_leaderboard(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench')),
970
- select_columns=SelectColumns(
971
- default_selection=config.MLAGENTBENCH_ON_LOAD_COLUMNS + ["Verified"],
972
- cant_deselect=["Agent Name"],
973
- label="Select Columns to Display:",
974
- ),
975
- hide_columns=config.MLAGENTBENCH_HIDE_COLUMNS,
976
- search_columns=config.MLAGENTBENCH_SEARCH_COLUMNS,
977
- )
978
- gr.Markdown("""*Error ranges span from the lowest to highest observed values in repeated runs.*""", elem_classes=["text-right"])
979
- with gr.Row():
980
- gr.Markdown("### Accuracy vs. Cost for MLAgentBench agents")
981
- with gr.Row():
982
- scatter_plot = gr.Plot(create_scatter_plot(parse_json_files(os.path.join(abs_path, "evals_live"), 'mlagentbench', aggregate=False), "Total Cost", "Overall Score", "Total Cost (in USD)", "Overall Score", ["Agent Name"]))
983
-
984
- # gr.HTML('<div style="height: 30px;"></div>')
985
- # gr.Markdown("## Task success heatmap")
986
- # gr.Markdown("The task success heatmap shows which agent can solve which tasks. Agents are sorted by total accuracy (higher is better); tasks in USACO are sorted by decreasing order of difficulty (tasks on the left are solved by the most agents; tasks on the right are solved by the least. For agents that have been run more than once, the run with the highest score is shown.")
987
- # with gr.Row():
988
- # task_success_heatmap = gr.Plot()
989
- # demo.load(
990
- # lambda: create_task_success_heatmap(
991
- # preprocessor.get_task_success_data('usaco'),
992
- # 'USACO'
993
- # ),
994
- # outputs=[task_success_heatmap]
995
- # )
996
-
997
- gr.HTML("""
998
- <style>
999
- .grouped-section {
1000
- border: 2px solid #dee2e6; /* Color matching unactivated tabs */
1001
- border-radius: 10px;
1002
- padding: 30px;
1003
- margin-top: 40px;
1004
- margin-bottom: 40px;
1005
- position: relative;
1006
- }
1007
-
1008
- .grouped-section-title {
1009
- font-size: 1.7em;
1010
- font-weight: bold;
1011
- color: #2c3e50;
1012
- margin-bottom: 20px;
1013
- padding-bottom: 10px;
1014
- border-bottom: 2px solid #dee2e6;
1015
- }
1016
- </style>
1017
- """)
1018
- with gr.Group(elem_classes=["grouped-section"]):
1019
- gr.Markdown("# Agent monitor", elem_classes=["grouped-section-title"], elem_id="agent-monitor")
1020
-
1021
- # gr.HTML('<div style="height: 10px;"></div>')
1022
- # gr.Markdown("## Failure report for each agent")
1023
- # gr.Markdown('Select an agent to see why the agent fails to solve tasks correctly. Note that these descriptions (and the failure categories) are generated by LLM-based evaluations of the agent logs and may contain inaccuracies.')
1024
- # gr.HTML('<div style="height: 10px;"></div>')
1025
- # with gr.Row():
1026
- # with gr.Column(scale=1):
1027
- # failure_report_agent_dropdown = gr.Dropdown(label="Select Agent for Failure Report")
1028
- # gr.HTML('<div style="height: 10px;"></div>')
1029
- # with gr.Row():
1030
- # with gr.Column(scale=1):
1031
- # failure_categories_overview = gr.Markdown()
1032
-
1033
- # with gr.Column(scale=1):
1034
- # failure_categories_chart = gr.Plot()
1035
-
1036
- # # Initialize the failure report agent dropdown with all agents
1037
- # demo.load(update_agent_dropdown,
1038
- # inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
1039
- # outputs=[failure_report_agent_dropdown])
1040
-
1041
- # # Update failure report when agent is selected
1042
- # failure_report_agent_dropdown.change(update_failure_report,
1043
- # inputs=[failure_report_agent_dropdown, gr.Textbox(value="mlagentbench", visible=False)],
1044
- # outputs=[failure_categories_overview, failure_categories_chart])
1045
-
1046
- gr.HTML('<div style="height: 30px;"></div>')
1047
- gr.Markdown("## Task overview")
1048
- gr.HTML('<div style="height: 10px;"></div>')
1049
- with gr.Row():
1050
- with gr.Column(scale=1):
1051
- agent_dropdown = gr.Dropdown(label="Select Agent")
1052
- with gr.Column(scale=1):
1053
- task_dropdown = gr.Dropdown(label="Select MLAgentBench Task")
1054
- gr.HTML('<div style="height: 10px;"></div>')
1055
- with gr.Row():
1056
- task_overview = gr.Markdown()
1057
- with gr.Row():
1058
- flow_chart = gr.Plot(label="Task Flow")
1059
-
1060
- # Initialize the agent dropdown with the best agent
1061
- demo.load(update_agent_dropdown, inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)], outputs=[agent_dropdown])
1062
- demo.load(update_task_analysis, inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown], outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
1063
-
1064
- agent_dropdown.change(update_task_analysis,
1065
- inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown],
1066
- outputs=[task_overview, flow_chart, task_dropdown, gr.Textbox(visible=False)])
1067
- task_dropdown.change(update_task_details,
1068
- inputs=[gr.Textbox(value="mlagentbench", visible=False), agent_dropdown, task_dropdown],
1069
- outputs=[task_overview, flow_chart, gr.Textbox(visible=False)])
1070
-
1071
- gr.Markdown("## Raw predictions")
1072
- gr.Markdown('Select an agent to see the raw predictions made by the agent for each task. We also provide information on token usage for each call.')
1073
- with gr.Accordion("Expand to inspect raw predictions of agents...", open=False):
1074
- with gr.Row():
1075
- with gr.Column(scale=1):
1076
- raw_agent_dropdown = gr.Dropdown(label="Select Agent")
1077
- with gr.Column(scale=1):
1078
- raw_task_dropdown = gr.Dropdown(label="Select Task")
1079
- with gr.Column(scale=1):
1080
- raw_step_dropdown = gr.Dropdown(label="Select Step")
1081
- with gr.Row():
1082
- raw_call_details = gr.HTML()
1083
-
1084
- def update_raw_task_dropdown(agent_name):
1085
- analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
1086
- if not analyzed_traces:
1087
- return gr.Dropdown(choices=[], label="Select Task"), gr.Dropdown(choices=[], label="Select Step"), f"No raw predictions data available for agent: {agent_name}."
1088
- task_ids = list(analyzed_traces.keys())
1089
- steps = analyzed_traces[task_ids[0]]['steps']
1090
- return gr.Dropdown(choices=task_ids, label="Select Task", value=task_ids[0]), gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(get_analyzed_traces(agent_name, "mlagentbench")[task_ids[0]]['steps'][0], 0)
1091
-
1092
- def update_raw_step_dropdown(agent_name, task_id):
1093
- analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
1094
- if not analyzed_traces or task_id not in analyzed_traces:
1095
- return gr.Dropdown(choices=[], label="Select Step", value="No data available.")
1096
- steps = analyzed_traces[task_id]['steps']
1097
- return gr.Dropdown(choices=[(f"Step {i+1}", i) for i in range(len(steps))], label="Select Step", value=0), format_call_info(steps[0], 0)
1098
-
1099
- def update_raw_call_details(agent_name, task_id, step_index):
1100
- analyzed_traces = get_analyzed_traces(agent_name, "mlagentbench")
1101
- if not analyzed_traces or task_id not in analyzed_traces:
1102
- return "No data available for this selection."
1103
- steps = analyzed_traces[task_id]['steps']
1104
- if step_index is None:
1105
- return "Invalid step selection."
1106
- step = steps[step_index]
1107
- return format_call_info(step, step_index)
1108
-
1109
- # Initialize the raw agent dropdown with all agents
1110
- demo.load(update_agent_dropdown,
1111
- inputs=[gr.Textbox(value="mlagentbench", visible=False), gr.Textbox(value="Overall Score", visible=False)],
1112
- outputs=[raw_agent_dropdown])
1113
- demo.load(update_raw_task_dropdown,
1114
- inputs=[raw_agent_dropdown],
1115
- outputs=[raw_task_dropdown, raw_step_dropdown])
1116
- demo.load(update_raw_call_details,
1117
- inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
1118
- outputs=[raw_call_details])
1119
-
1120
- raw_agent_dropdown.change(update_raw_task_dropdown,
1121
- inputs=[raw_agent_dropdown],
1122
- outputs=[raw_task_dropdown, raw_step_dropdown, raw_call_details])
1123
- raw_task_dropdown.change(update_raw_step_dropdown,
1124
- inputs=[raw_agent_dropdown, raw_task_dropdown],
1125
- outputs=[raw_step_dropdown, raw_call_details])
1126
- raw_step_dropdown.change(update_raw_call_details,
1127
- inputs=[raw_agent_dropdown, raw_task_dropdown, raw_step_dropdown],
1128
- outputs=[raw_call_details])
1129
-
1130
- with gr.Tab("About"):
1131
- gr.Markdown((Path(__file__).parent / "about.md").read_text())
1132
 
1133
  # Will trigger autoscaling of plots when tabs are switched
1134
  tabs.select(fn=None, inputs=None, outputs=None, js="""
@@ -1145,10 +440,6 @@ with gr.Blocks(theme=my_theme, css='css.css', title="HAL: Holistic Agent Leaderb
1145
  gr.HTML("""<h2 class="section-heading" id="reproduction-guide">How can I run evaluations?</h2>""")
1146
  gr.Markdown("""Coming soon...""")
1147
 
1148
-
1149
-
1150
-
1151
-
1152
  async def main():
1153
  # Preprocess traces
1154
  # preprocessor = TracePreprocessor()
 
281
  </style>
282
 
283
  <header class="hal-header">
284
+ <h1 class="hal-title">CORE-Bench Leaderboard</h1>
285
  <p class="hal-subtitle">
286
+ A leaderboard evaluating agent performance on CORE-Bench.
287
  </p>
288
  </header>""")
289
  gr.HTML("""
 
300
  background-color: #ffffff;
301
  border-radius: 10px;
302
  margin: 0 15px;
303
+ text-align: center;
304
  box-shadow: 0 6px 12px rgba(0, 0, 0, 0.1);
305
  display: flex;
306
  flex-direction: column;
307
+ align-items: center;
308
  border-top: 5px solid #3498db;
309
  transition: transform 0.3s ease, box-shadow 0.3s ease;
310
  }
 
316
  font-size: 1.2em;
317
  font-weight: bold;
318
  color: #1b9e77;
319
+ margin-bottom: 15px;
320
  text-transform: uppercase;
321
  letter-spacing: 1px;
322
  }
 
328
  line-height: 1.6;
329
  color: #333;
330
  }
331
+ .button {
332
+ margin: 15px 0;
333
+ padding: 10px 20px;
334
+ font-size: 1em;
335
+ font-weight: bold;
336
+ color: #fff;
337
+ background-color: #3498db;
338
+ border: none;
339
+ border-radius: 5px;
340
+ text-decoration: none;
341
+ display: inline-flex;
342
+ align-items: center;
343
+ transition: background-color 0.3s ease;
344
+ }
345
+ .button:hover {
346
+ background-color: #2980b9;
347
+ }
348
+ .button img {
349
+ margin-right: 8px;
350
+ height: 20px;
351
+ }
352
  </style>
353
 
354
  <div class="feature-row">
355
  <div class="feature-column">
356
+ <div class="feature-keyword">Paper</div>
357
  <div class="feature-content">
358
+ <a href="https://arxiv.org/abs/2409.11363" class="button">
359
+ <img src="https://example.com/favicon-paper.png" alt="Paper Icon"> View Paper
360
+ </a>
361
  </div>
362
  </div>
363
  <div class="feature-column">
364
+ <div class="feature-keyword">Github</div>
365
  <div class="feature-content">
366
+ <a href="https://github.com/siegelz/core-bench" class="button">
367
+ <img src="https://example.com/favicon-github.png" alt="Github Icon"> View Github
368
+ </a>
369
  </div>
370
  </div>
371
  <div class="feature-column">
372
+ <div class="feature-keyword">Dataset</div>
373
  <div class="feature-content">
374
+ <a href="https://huggingface.co/datasets/siegelz/core-bench" class="button">
375
+ <img src="https://example.com/favicon-dataset.png" alt="Dataset Icon"> View Dataset
376
+ </a>
377
  </div>
378
  </div>
379
  </div>
 
383
  """)
384
 
385
  with gr.Tabs() as tabs:
386
+ with gr.Tab("CORE-Bench"):
387
+ gr.Markdown("""
388
+ CORE-Bench evaluates the ability of agents to computationally reproduce the results of published scientific papers. Agents are given the codebase of a paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. The benchmark has tasks at three difficulty levels:
389
+
390
+ <b>CORE-Bench-Easy</b>: The agent is given the output of the code and must answer questions about the output without running any code. To answer questions, agents must navigate through the terminal output as well as files and figures generated by the code.
391
+
392
+ <b>CORE-Bench-Medium</b>: The agent is given a Dockerfile and instructions on how to use the Dockerfile to fully reproduce the paper. This level mainly evaluates agents' ability to use and interact with the terminal. The agent must then answer questions about the output of the code, as in the previous level.
393
+
394
+ <b>CORE-Bench-Hard</b>: The agent is given the codebase of the paper and must install all libraries and dependencies, run the code, and read through the output and figures to answer questions about the paper. This level is most akin to fully reproducing a paper and is the most realistic and challenging level.
395
+ """)
396
  with gr.Row():
397
  with gr.Column(scale=2):
398
  Leaderboard(
 
424
  outputs=[task_success_heatmap]
425
  )
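For orientation, here is a minimal sketch of how the CORE-Bench tab could wire up its leaderboard and cost/accuracy scatter plot, following the same pattern the other benchmark tabs in this file use (parse_json_files, create_leaderboard, create_scatter_plot). The benchmark key 'corebench' and the config.COREBENCH_* column settings are assumptions for illustration, not values confirmed by this commit.

# Sketch (not part of this commit): plausible CORE-Bench leaderboard wiring,
# mirroring the USACO / SWE-bench tabs. The 'corebench' key and the
# config.COREBENCH_* names are assumptions.
with gr.Row():
    with gr.Column(scale=2):
        Leaderboard(
            value=create_leaderboard(
                parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench'),
                ci_metrics=["Accuracy", "Total Cost"],
            ),
            select_columns=SelectColumns(
                default_selection=config.COREBENCH_ON_LOAD_COLUMNS,  # assumed config name
                cant_deselect=["Agent Name"],
                label="Select Columns to Display:",
            ),
            hide_columns=config.COREBENCH_HIDE_COLUMNS,      # assumed config name
            search_columns=config.COREBENCH_SEARCH_COLUMNS,  # assumed config name
        )
with gr.Row():
    gr.Markdown("### Accuracy vs. Cost for CORE-Bench agents")
with gr.Row():
    scatter_plot = gr.Plot(create_scatter_plot(
        parse_json_files(os.path.join(abs_path, "evals_live"), 'corebench', aggregate=False),
        "Total Cost", "Accuracy", "Total Cost (in USD)", "Accuracy", ["Agent Name"],
    ))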
426
427
 
428
  # Will trigger autoscaling of plots when tabs are switched
429
  tabs.select(fn=None, inputs=None, outputs=None, js="""
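The js callback body is truncated in this excerpt. A common pattern for this kind of handler (a sketch, not necessarily the exact string in app.py) is to dispatch a window resize event so Plotly figures in the newly visible tab recompute their size:

# Sketch only: the actual js string is not shown in this diff excerpt.
tabs.select(
    fn=None,
    inputs=None,
    outputs=None,
    js="""
    () => {
        // Nudge Plotly to redraw the plots in the tab that just became visible.
        window.dispatchEvent(new Event('resize'));
    }
    """,
)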
 
440
  gr.HTML("""<h2 class="section-heading" id="reproduction-guide">How can I run evaluations?</h2>""")
441
  gr.Markdown("""Coming soon...""")
442
 
443
  async def main():
444
  # Preprocess traces
445
  # preprocessor = TracePreprocessor()