Commit 4e68e9f
Parent(s): e24146f
added one-line descriptions to each benchmark with acknowledgements and modified headline
app.py
CHANGED
@@ -520,6 +520,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
 
 
     with gr.Tab("SWE-Bench Verified"):
+        gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Verified is a human-validated subset of 500 problems reviewed by software engineers. We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
@@ -655,6 +656,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                 outputs=[raw_call_details])
 
     with gr.Tab("SWE-Bench Lite"):
+        gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Lite is a subset of 300 tasks of the original SWE-bench. We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
@@ -791,6 +793,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                 outputs=[raw_call_details])
 
     with gr.Tab("MLAgentBench"):
+        gr.Markdown("""MLAgentBench is a suite of end-to-end Machine Learning (ML) experimentation tasks, where the agent aims to take a given dataset and a machine learning task description and autonomously develop or improve an ML model. We are currently actively developing this platform and this benchmark is not fully implemented yet. In particular, we only include one agent and a subset of tasks for this benchmark.""")
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
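For reference, a minimal self-contained sketch of the pattern this commit applies: each benchmark gr.Tab gets a short gr.Markdown description placed above its leaderboard layout. This is not the Space's actual app.py; the real Leaderboard component, theme, CSS, and blurb text are only partially visible in the diff, so a gr.Dataframe with made-up headers stands in for the Leaderboard here and the blurbs are abbreviated.

# Sketch only: illustrates the gr.Markdown-per-tab pattern from this commit,
# not the Space's real app.py.
import gradio as gr

# Abbreviated stand-ins for the descriptions added in the diff above.
BENCHMARK_BLURBS = {
    "SWE-Bench Verified": "Human-validated subset of 500 SWE-bench problems.",
    "SWE-Bench Lite": "Subset of 300 tasks of the original SWE-bench.",
    "MLAgentBench": "End-to-end ML experimentation tasks for autonomous agents.",
}

with gr.Blocks() as demo:
    for name, blurb in BENCHMARK_BLURBS.items():
        with gr.Tab(name):
            gr.Markdown(blurb)  # the one-line description this commit adds per tab
            with gr.Row():
                with gr.Column(scale=2):
                    # Placeholder for the Space's Leaderboard component (hypothetical columns)
                    gr.Dataframe(headers=["agent", "accuracy"], value=[["baseline", 0.0]])

if __name__ == "__main__":
    demo.launch()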