Commit 4e68e9f
Parent(s): e24146f
added one-line descriptions to each benchmark with acknowledgements and modified headline
app.py
CHANGED
@@ -520,6 +520,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
 
 
     with gr.Tab("SWE-Bench Verified"):
+        gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Verified is a human-validated subset of 500 problems reviewed by software engineers. We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
@@ -655,6 +656,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                 outputs=[raw_call_details])
 
     with gr.Tab("SWE-Bench Lite"):
+        gr.Markdown("""SWE-bench is a dataset that tests systems' ability to solve GitHub issues automatically. Lite is a subset of 300 tasks of the original SWE-bench. We are currently actively developing this platform and this benchmark is not fully implemented yet.""")
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
@@ -791,6 +793,7 @@ with gr.Blocks(theme=my_theme, css='css.css') as demo:
                 outputs=[raw_call_details])
 
     with gr.Tab("MLAgentBench"):
+        gr.Markdown("""MLAgentBench is a suite of end-to-end Machine Learning (ML) experimentation tasks, where the agent aims to take a given dataset and a machine learning task description and autonomously develop or improve an ML model. We are currently actively developing this platform and this benchmark is not fully implemented yet. In particular, we only include one agent and a subset of tasks for this benchmark.""")
         with gr.Row():
             with gr.Column(scale=2):
                 Leaderboard(
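For reference, a minimal self-contained sketch of the pattern this commit applies: each benchmark gr.Tab gets a short gr.Markdown description placed above its leaderboard layout. This is not the Space's actual app.py; the real Leaderboard component, theme, CSS, and blurb text are only partially visible in the diff, so a gr.Dataframe with made-up headers stands in for the Leaderboard here and the blurbs are abbreviated.

# Sketch only: illustrates the gr.Markdown-per-tab pattern from this commit,
# not the Space's real app.py.
import gradio as gr

# Abbreviated stand-ins for the descriptions added in the diff above.
BENCHMARK_BLURBS = {
    "SWE-Bench Verified": "Human-validated subset of 500 SWE-bench problems.",
    "SWE-Bench Lite": "Subset of 300 tasks of the original SWE-bench.",
    "MLAgentBench": "End-to-end ML experimentation tasks for autonomous agents.",
}

with gr.Blocks() as demo:
    for name, blurb in BENCHMARK_BLURBS.items():
        with gr.Tab(name):
            gr.Markdown(blurb)  # the one-line description this commit adds per tab
            with gr.Row():
                with gr.Column(scale=2):
                    # Placeholder for the Space's Leaderboard component (hypothetical columns)
                    gr.Dataframe(headers=["agent", "accuracy"], value=[["baseline", 0.0]])

if __name__ == "__main__":
    demo.launch()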