MLRC_Bench / src /data /metrics /absolute_improvement_to_baseline.json
Armeddinosaur's picture
Updating metrics
678bdbb
raw
history blame
2.3 kB
{
"perception_temporal_action_loc": {
"MLAB (claude-3-5-sonnet-v2)": 2.2,
"Top Human in Competition": 284.6,
"MLAB (gemini-exp-1206)": -1.3,
"MLAB (o3-mini)": 0.9,
"MLAB (gpt-4o)": 0.9,
"MLAB (llama3-1-405b-instruct)": 1.5,
"CoI-Agent (o1) + MLAB (gpt-4o)": 1.0,
"Human Idea + MLAB (gpt-4o)": 1.5
},
"llm-merging": {
"CoI-Agent (o1) + MLAB (gpt-4o)": -0.7,
"Top Human in Competition": 68.2,
"MLAB (claude-3-5-sonnet-v2)": 3.4,
"MLAB (gemini-exp-1206)": 3.4,
"MLAB (o3-mini)": -0.7,
"MLAB (gpt-4o)": 1.4,
"MLAB (llama3-1-405b-instruct)": -0.7,
"Human Idea + MLAB (gpt-4o)": -0.7
},
"product-recommendation": {
"MLAB (claude-3-5-sonnet-v2)": 12.3,
"Top Human in Competition": 412.6,
"MLAB (gemini-exp-1206)": 0.6,
"MLAB (o3-mini)": 0.6,
"MLAB (gpt-4o)": 2.6,
"MLAB (llama3-1-405b-instruct)": -0.0,
"Human Idea + MLAB (gpt-4o)": 8.9,
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.6
},
"weather_forcast": {
"CoI-Agent (o1) + MLAB (gpt-4o)": 83.6,
"Top Human in Competition": 212.0,
"Human Idea + MLAB (gpt-4o)": 26.1,
"MLAB (claude-3-5-sonnet-v2)": 31.0,
"MLAB (gemini-exp-1206)": 91.4,
"MLAB (o3-mini)": 53.3,
"MLAB (gpt-4o)": 100.8,
"MLAB (llama3-1-405b-instruct)": 66.7
},
"meta-learning": {
"MLAB (claude-3-5-sonnet-v2)": -14.9,
"Top Human in Competition": 304.5,
"MLAB (gemini-exp-1206)": -3.2,
"MLAB (o3-mini)": -14.9,
"MLAB (gpt-4o)": -14.9,
"MLAB (llama3-1-405b-instruct)": -14.9,
"Human Idea + MLAB (gpt-4o)": -14.9,
"CoI-Agent (o1) + MLAB (gpt-4o)": -14.9
},
"machine_unlearning": {
"Human Idea + MLAB (gpt-4o)": 4.2,
"Top Human in Competition": 61.9,
"CoI-Agent (o1) + MLAB (gpt-4o)": 7.3,
"MLAB (claude-3-5-sonnet-v2)": -58.6,
"MLAB (gemini-exp-1206)": 3.5,
"MLAB (o3-mini)": 2.2,
"MLAB (gpt-4o)": -11.1,
"MLAB (llama3-1-405b-instruct)": 3.8
},
"backdoor-trigger-recovery": {
"MLAB (gpt-4o)": 74.0,
"Top Human in Competition": 621.3,
"CoI-Agent (o1) + MLAB (gpt-4o)": 24.9,
"MLAB (claude-3-5-sonnet-v2)": 247.9,
"MLAB (gemini-exp-1206)": 80.4,
"MLAB (o3-mini)": 38.8,
"MLAB (llama3-1-405b-instruct)": 71.7,
"Human Idea + MLAB (gpt-4o)": 54.5
}
}