lm1-misc-oscar/1b121b21b/evaluation/lm1-1b1-21b-oscar-results_lm-eval_global_step39672_2022-12-02-14-19-24.json
{
  "results": {
    "copa": {
      "acc": 0.7,
      "acc_stderr": 0.046056618647183814
    },
    "piqa": {
      "acc": 0.6675734494015234,
      "acc_stderr": 0.01099114155744559,
      "acc_norm": 0.6697497279651795,
      "acc_norm_stderr": 0.010972947133006297
    },
    "rte": {
      "acc": 0.5631768953068592,
      "acc_stderr": 0.029855247390314938
    },
    "winogrande": {
      "acc": 0.5011838989739542,
      "acc_stderr": 0.014052446290529009
    },
    "hendrycksTest-abstract_algebra": {
      "acc": 0.22,
      "acc_stderr": 0.0416333199893227,
      "acc_norm": 0.22,
      "acc_norm_stderr": 0.0416333199893227
    },
    "hendrycksTest-anatomy": {
      "acc": 0.17037037037037037,
      "acc_stderr": 0.03247781185995593,
      "acc_norm": 0.17777777777777778,
      "acc_norm_stderr": 0.0330278985990172
    },
    "hendrycksTest-astronomy": {
      "acc": 0.21052631578947367,
      "acc_stderr": 0.03317672787533156,
      "acc_norm": 0.3026315789473684,
      "acc_norm_stderr": 0.037385206761196686
    },
    "hendrycksTest-business_ethics": {
      "acc": 0.32,
      "acc_stderr": 0.046882617226215034,
      "acc_norm": 0.31,
      "acc_norm_stderr": 0.04648231987117316
    },
    "hendrycksTest-clinical_knowledge": {
      "acc": 0.24528301886792453,
      "acc_stderr": 0.026480357179895674,
      "acc_norm": 0.3132075471698113,
      "acc_norm_stderr": 0.028544793319055326
    },
    "hendrycksTest-college_biology": {
      "acc": 0.2361111111111111,
      "acc_stderr": 0.03551446610810826,
      "acc_norm": 0.2013888888888889,
      "acc_norm_stderr": 0.033536474697138406
    },
    "hendrycksTest-college_chemistry": {
      "acc": 0.27,
      "acc_stderr": 0.044619604333847394,
      "acc_norm": 0.32,
      "acc_norm_stderr": 0.046882617226215034
    },
    "hendrycksTest-college_computer_science": {
      "acc": 0.29,
      "acc_stderr": 0.04560480215720684,
      "acc_norm": 0.3,
      "acc_norm_stderr": 0.046056618647183814
    },
    "hendrycksTest-college_mathematics": {
      "acc": 0.23,
      "acc_stderr": 0.04229525846816505,
      "acc_norm": 0.29,
      "acc_norm_stderr": 0.04560480215720684
    },
    "hendrycksTest-college_medicine": {
      "acc": 0.24855491329479767,
      "acc_stderr": 0.03295304696818318,
      "acc_norm": 0.2543352601156069,
      "acc_norm_stderr": 0.0332055644308557
    },
    "hendrycksTest-college_physics": {
      "acc": 0.2549019607843137,
      "acc_stderr": 0.043364327079931785,
      "acc_norm": 0.24509803921568626,
      "acc_norm_stderr": 0.04280105837364395
    },
    "hendrycksTest-computer_security": {
      "acc": 0.2,
      "acc_stderr": 0.04020151261036844,
      "acc_norm": 0.29,
      "acc_norm_stderr": 0.04560480215720684
    },
    "hendrycksTest-conceptual_physics": {
      "acc": 0.28936170212765955,
      "acc_stderr": 0.02964400657700962,
      "acc_norm": 0.225531914893617,
      "acc_norm_stderr": 0.027321078417387536
    },
    "hendrycksTest-econometrics": {
      "acc": 0.32456140350877194,
      "acc_stderr": 0.04404556157374768,
      "acc_norm": 0.2631578947368421,
      "acc_norm_stderr": 0.04142439719489362
    },
    "hendrycksTest-electrical_engineering": {
      "acc": 0.27586206896551724,
      "acc_stderr": 0.03724563619774632,
      "acc_norm": 0.2896551724137931,
      "acc_norm_stderr": 0.03780019230438014
    },
    "hendrycksTest-elementary_mathematics": {
      "acc": 0.20105820105820105,
      "acc_stderr": 0.02064181078237016,
      "acc_norm": 0.2275132275132275,
      "acc_norm_stderr": 0.021591269407823778
    },
    "hendrycksTest-formal_logic": {
      "acc": 0.29365079365079366,
      "acc_stderr": 0.04073524322147127,
      "acc_norm": 0.23809523809523808,
      "acc_norm_stderr": 0.03809523809523811
    },
    "hendrycksTest-global_facts": {
      "acc": 0.24,
      "acc_stderr": 0.042923469599092816,
      "acc_norm": 0.26,
      "acc_norm_stderr": 0.044084400227680794
    },
    "hendrycksTest-high_school_biology": {
      "acc": 0.2129032258064516,
      "acc_stderr": 0.02328766512726854,
      "acc_norm": 0.26129032258064516,
      "acc_norm_stderr": 0.024993053397764812
    },
    "hendrycksTest-high_school_chemistry": {
      "acc": 0.1921182266009852,
      "acc_stderr": 0.027719315709614775,
      "acc_norm": 0.23645320197044334,
      "acc_norm_stderr": 0.029896114291733552
    },
    "hendrycksTest-high_school_computer_science": {
      "acc": 0.25,
      "acc_stderr": 0.04351941398892446,
      "acc_norm": 0.29,
      "acc_norm_stderr": 0.04560480215720684
    },
    "hendrycksTest-high_school_european_history": {
      "acc": 0.24848484848484848,
      "acc_stderr": 0.03374402644139404,
      "acc_norm": 0.2909090909090909,
      "acc_norm_stderr": 0.03546563019624336
    },
    "hendrycksTest-high_school_geography": {
      "acc": 0.23737373737373738,
      "acc_stderr": 0.030313710538198906,
      "acc_norm": 0.3181818181818182,
      "acc_norm_stderr": 0.033184773338453315
    },
    "hendrycksTest-high_school_government_and_politics": {
      "acc": 0.26424870466321243,
      "acc_stderr": 0.03182155050916648,
      "acc_norm": 0.27979274611398963,
      "acc_norm_stderr": 0.03239637046735702
    },
    "hendrycksTest-high_school_macroeconomics": {
      "acc": 0.2512820512820513,
      "acc_stderr": 0.021992016662370543,
      "acc_norm": 0.2641025641025641,
      "acc_norm_stderr": 0.022352193737453268
    },
    "hendrycksTest-high_school_mathematics": {
      "acc": 0.2037037037037037,
      "acc_stderr": 0.02455617221914128,
      "acc_norm": 0.26296296296296295,
      "acc_norm_stderr": 0.026842057873833713
    },
    "hendrycksTest-high_school_microeconomics": {
      "acc": 0.25630252100840334,
      "acc_stderr": 0.02835962087053395,
      "acc_norm": 0.31512605042016806,
      "acc_norm_stderr": 0.030176808288974337
    },
    "hendrycksTest-high_school_physics": {
      "acc": 0.1986754966887417,
      "acc_stderr": 0.03257847384436776,
      "acc_norm": 0.23178807947019867,
      "acc_norm_stderr": 0.03445406271987054
    },
    "hendrycksTest-high_school_psychology": {
      "acc": 0.20917431192660552,
      "acc_stderr": 0.017437937173343233,
      "acc_norm": 0.22568807339449543,
      "acc_norm_stderr": 0.017923087667803057
    },
    "hendrycksTest-high_school_statistics": {
      "acc": 0.20833333333333334,
      "acc_stderr": 0.027696910713093936,
      "acc_norm": 0.30092592592592593,
      "acc_norm_stderr": 0.031280390843298804
    },
    "hendrycksTest-high_school_us_history": {
      "acc": 0.27450980392156865,
      "acc_stderr": 0.031321798030832904,
      "acc_norm": 0.28431372549019607,
      "acc_norm_stderr": 0.03166009679399812
    },
    "hendrycksTest-high_school_world_history": {
      "acc": 0.22784810126582278,
      "acc_stderr": 0.027303484599069422,
      "acc_norm": 0.28270042194092826,
      "acc_norm_stderr": 0.02931281415395593
    },
    "hendrycksTest-human_aging": {
      "acc": 0.29596412556053814,
      "acc_stderr": 0.030636591348699796,
      "acc_norm": 0.26905829596412556,
      "acc_norm_stderr": 0.029763779406874972
    },
    "hendrycksTest-human_sexuality": {
      "acc": 0.32061068702290074,
      "acc_stderr": 0.04093329229834278,
      "acc_norm": 0.31297709923664124,
      "acc_norm_stderr": 0.04066962905677697
    },
    "hendrycksTest-international_law": {
      "acc": 0.2231404958677686,
      "acc_stderr": 0.03800754475228733,
      "acc_norm": 0.4049586776859504,
      "acc_norm_stderr": 0.04481137755942469
    },
    "hendrycksTest-jurisprudence": {
      "acc": 0.24074074074074073,
      "acc_stderr": 0.0413311944024384,
      "acc_norm": 0.37962962962962965,
      "acc_norm_stderr": 0.04691521224077742
    },
    "hendrycksTest-logical_fallacies": {
      "acc": 0.1656441717791411,
      "acc_stderr": 0.029208296231259104,
      "acc_norm": 0.27607361963190186,
      "acc_norm_stderr": 0.0351238528370505
    },
    "hendrycksTest-machine_learning": {
      "acc": 0.3125,
      "acc_stderr": 0.043994650575715215,
      "acc_norm": 0.25892857142857145,
      "acc_norm_stderr": 0.04157751539865629
    },
    "hendrycksTest-management": {
      "acc": 0.2524271844660194,
      "acc_stderr": 0.04301250399690878,
      "acc_norm": 0.2912621359223301,
      "acc_norm_stderr": 0.044986763205729224
    },
    "hendrycksTest-marketing": {
      "acc": 0.29914529914529914,
      "acc_stderr": 0.02999695185834948,
      "acc_norm": 0.32051282051282054,
      "acc_norm_stderr": 0.030572811310299604
    },
    "hendrycksTest-medical_genetics": {
      "acc": 0.28,
      "acc_stderr": 0.045126085985421255,
      "acc_norm": 0.38,
      "acc_norm_stderr": 0.04878317312145632
    },
    "hendrycksTest-miscellaneous": {
      "acc": 0.2567049808429119,
      "acc_stderr": 0.015620480263064535,
      "acc_norm": 0.24904214559386972,
      "acc_norm_stderr": 0.015464676163395962
    },
    "hendrycksTest-moral_disputes": {
      "acc": 0.28901734104046245,
      "acc_stderr": 0.02440517393578324,
      "acc_norm": 0.30057803468208094,
      "acc_norm_stderr": 0.0246853168672578
    },
    "hendrycksTest-moral_scenarios": {
      "acc": 0.2346368715083799,
      "acc_stderr": 0.014173044098303658,
      "acc_norm": 0.27262569832402234,
      "acc_norm_stderr": 0.01489339173524959
    },
    "hendrycksTest-nutrition": {
      "acc": 0.25163398692810457,
      "acc_stderr": 0.024848018263875202,
      "acc_norm": 0.3366013071895425,
      "acc_norm_stderr": 0.027057974624494385
    },
    "hendrycksTest-philosophy": {
      "acc": 0.1832797427652733,
      "acc_stderr": 0.021974198848265812,
      "acc_norm": 0.2990353697749196,
      "acc_norm_stderr": 0.02600330111788513
    },
    "hendrycksTest-prehistory": {
      "acc": 0.2654320987654321,
      "acc_stderr": 0.02456922360046085,
      "acc_norm": 0.22839506172839505,
      "acc_norm_stderr": 0.023358211840626267
    },
    "hendrycksTest-professional_accounting": {
      "acc": 0.2127659574468085,
      "acc_stderr": 0.024414612974307706,
      "acc_norm": 0.2695035460992908,
      "acc_norm_stderr": 0.026469036818590638
    },
    "hendrycksTest-professional_law": {
      "acc": 0.2522816166883963,
      "acc_stderr": 0.011092789056875238,
      "acc_norm": 0.30182529335071706,
      "acc_norm_stderr": 0.011724350518105893
    },
    "hendrycksTest-professional_medicine": {
      "acc": 0.21323529411764705,
      "acc_stderr": 0.024880971512294257,
      "acc_norm": 0.2426470588235294,
      "acc_norm_stderr": 0.026040662474201275
    },
    "hendrycksTest-professional_psychology": {
      "acc": 0.2565359477124183,
      "acc_stderr": 0.01766784161237899,
      "acc_norm": 0.2565359477124183,
      "acc_norm_stderr": 0.017667841612378977
    },
    "hendrycksTest-public_relations": {
      "acc": 0.2818181818181818,
      "acc_stderr": 0.04309118709946458,
      "acc_norm": 0.2,
      "acc_norm_stderr": 0.03831305140884603
    },
    "hendrycksTest-security_studies": {
      "acc": 0.2938775510204082,
      "acc_stderr": 0.02916273841024977,
      "acc_norm": 0.2653061224489796,
      "acc_norm_stderr": 0.028263889943784603
    },
    "hendrycksTest-sociology": {
      "acc": 0.2537313432835821,
      "acc_stderr": 0.03076944496729602,
      "acc_norm": 0.2537313432835821,
      "acc_norm_stderr": 0.03076944496729602
    },
    "hendrycksTest-us_foreign_policy": {
      "acc": 0.3,
      "acc_stderr": 0.046056618647183814,
      "acc_norm": 0.35,
      "acc_norm_stderr": 0.0479372485441102
    },
    "hendrycksTest-virology": {
      "acc": 0.28313253012048195,
      "acc_stderr": 0.03507295431370518,
      "acc_norm": 0.2469879518072289,
      "acc_norm_stderr": 0.03357351982064536
    },
    "hendrycksTest-world_religions": {
      "acc": 0.3216374269005848,
      "acc_stderr": 0.03582529442573122,
      "acc_norm": 0.32748538011695905,
      "acc_norm_stderr": 0.035993357714560276
    }
  },
  "versions": {
    "copa": 0,
    "piqa": 0,
    "rte": 0,
    "winogrande": 0,
    "hendrycksTest-abstract_algebra": 0,
    "hendrycksTest-anatomy": 0,
    "hendrycksTest-astronomy": 0,
    "hendrycksTest-business_ethics": 0,
    "hendrycksTest-clinical_knowledge": 0,
    "hendrycksTest-college_biology": 0,
    "hendrycksTest-college_chemistry": 0,
    "hendrycksTest-college_computer_science": 0,
    "hendrycksTest-college_mathematics": 0,
    "hendrycksTest-college_medicine": 0,
    "hendrycksTest-college_physics": 0,
    "hendrycksTest-computer_security": 0,
    "hendrycksTest-conceptual_physics": 0,
    "hendrycksTest-econometrics": 0,
    "hendrycksTest-electrical_engineering": 0,
    "hendrycksTest-elementary_mathematics": 0,
    "hendrycksTest-formal_logic": 0,
    "hendrycksTest-global_facts": 0,
    "hendrycksTest-high_school_biology": 0,
    "hendrycksTest-high_school_chemistry": 0,
    "hendrycksTest-high_school_computer_science": 0,
    "hendrycksTest-high_school_european_history": 0,
    "hendrycksTest-high_school_geography": 0,
    "hendrycksTest-high_school_government_and_politics": 0,
    "hendrycksTest-high_school_macroeconomics": 0,
    "hendrycksTest-high_school_mathematics": 0,
    "hendrycksTest-high_school_microeconomics": 0,
    "hendrycksTest-high_school_physics": 0,
    "hendrycksTest-high_school_psychology": 0,
    "hendrycksTest-high_school_statistics": 0,
    "hendrycksTest-high_school_us_history": 0,
    "hendrycksTest-high_school_world_history": 0,
    "hendrycksTest-human_aging": 0,
    "hendrycksTest-human_sexuality": 0,
    "hendrycksTest-international_law": 0,
    "hendrycksTest-jurisprudence": 0,
    "hendrycksTest-logical_fallacies": 0,
    "hendrycksTest-machine_learning": 0,
    "hendrycksTest-management": 0,
    "hendrycksTest-marketing": 0,
    "hendrycksTest-medical_genetics": 0,
    "hendrycksTest-miscellaneous": 0,
    "hendrycksTest-moral_disputes": 0,
    "hendrycksTest-moral_scenarios": 0,
    "hendrycksTest-nutrition": 0,
    "hendrycksTest-philosophy": 0,
    "hendrycksTest-prehistory": 0,
    "hendrycksTest-professional_accounting": 0,
    "hendrycksTest-professional_law": 0,
    "hendrycksTest-professional_medicine": 0,
    "hendrycksTest-professional_psychology": 0,
    "hendrycksTest-public_relations": 0,
    "hendrycksTest-security_studies": 0,
    "hendrycksTest-sociology": 0,
    "hendrycksTest-us_foreign_policy": 0,
    "hendrycksTest-virology": 0,
    "hendrycksTest-world_religions": 0
  }
}
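The file above follows the standard lm-eval-harness results layout: a "results" map from task name to metrics (acc, acc_stderr, and optionally acc_norm, acc_norm_stderr) plus a "versions" map of task versions. The sketch below is a minimal example of summarizing such a file; the local filename and the unweighted macro-average over the "hendrycksTest-*" subtasks are assumptions for illustration, not part of the original evaluation pipeline.

```python
import json
from statistics import mean

# Hypothetical local path; adjust to wherever this results JSON is stored.
PATH = "lm1-1b1-21b-oscar-results_lm-eval_global_step39672_2022-12-02-14-19-24.json"

with open(PATH) as f:
    data = json.load(f)

results = data["results"]

# Print per-task accuracy (every task in this file reports "acc").
for task, metrics in sorted(results.items()):
    print(f"{task:55s} acc={metrics['acc']:.3f}")

# Unweighted macro-average over the MMLU ("hendrycksTest-*") subtasks.
mmlu_accs = [m["acc"] for t, m in results.items() if t.startswith("hendrycksTest-")]
print(f"\nMMLU macro-average acc over {len(mmlu_accs)} subtasks: {mean(mmlu_accs):.4f}")
```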