benediktstroebl committed
Commit 4e68e9f · Parent: e24146f

added one line descriptions to each benchmark with acknowledgements and modified headline

Files changed (1): app.py (+3, -0)
app.py CHANGED
@@ -520,6 +520,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
 
 
     with gr.Tab("SWE-Bench Verified"):
+        gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Verified is a human-validated subset of 500 problems reviewed by software engineers. We are currently actively developing this platform, and this benchmark is not fully implemented yet.""")
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
@@ -655,6 +656,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                     outputs=[raw_call_details])
 
     with gr.Tab("SWE-Bench Lite"):
+        gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Lite is a subset of 300 tasks from the original SWE-bench. We are currently actively developing this platform, and this benchmark is not fully implemented yet.""")
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
@@ -791,6 +793,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                     outputs=[raw_call_details])
 
     with gr.Tab("MLAgentBench"):
+        gr.Markdown("""MLAgentBench is a suite of end-to-end machine learning (ML) experimentation tasks, where the agent takes a given dataset and a machine learning task description and autonomously develops or improves an ML model. We are currently actively developing this platform, and this benchmark is not fully implemented yet; in particular, we only include one agent and a subset of tasks for this benchmark.""")
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
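
The pattern is the same in all three hunks: the one-line gr.Markdown description sits just inside each gr.Tab, above the gr.Row that holds the leaderboard. Below is a minimal, self-contained sketch of that layout, not the app's actual code: gr.Dataframe stands in for the Leaderboard component used in app.py, and the results table is a made-up placeholder.

import gradio as gr
import pandas as pd

# Placeholder results table; app.py loads real benchmark results from its own data.
df = pd.DataFrame({"Agent": ["example-agent"], "Accuracy": [0.42], "Total Cost": [1.23]})

with gr.Blocks() as demo:
    with gr.Tab("SWE-Bench Verified"):
        # One-line benchmark description placed just inside the tab, above the leaderboard row.
        gr.Markdown("SWE-bench Verified is a human-validated subset of 500 GitHub-issue tasks.")
        with gr.Row():
            with gr.Column(scale=2):
                # gr.Dataframe stands in here for the app's Leaderboard component.
                gr.Dataframe(value=df)

if __name__ == "__main__":
    demo.launch()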