lm1-misc-oscar/1b121b21b/evaluation/lm1-1b1-21b-oscar-results_lm-eval_global_step39672_2022-12-02-14-19-24.json
{
"results": {
"copa": {
"acc": 0.7,
"acc_stderr": 0.046056618647183814
},
"piqa": {
"acc": 0.6675734494015234,
"acc_stderr": 0.01099114155744559,
"acc_norm": 0.6697497279651795,
"acc_norm_stderr": 0.010972947133006297
},
"rte": {
"acc": 0.5631768953068592,
"acc_stderr": 0.029855247390314938
},
"winogrande": {
"acc": 0.5011838989739542,
"acc_stderr": 0.014052446290529009
},
"hendrycksTest-abstract_algebra": {
"acc": 0.22,
"acc_stderr": 0.0416333199893227,
"acc_norm": 0.22,
"acc_norm_stderr": 0.0416333199893227
},
"hendrycksTest-anatomy": {
"acc": 0.17037037037037037,
"acc_stderr": 0.03247781185995593,
"acc_norm": 0.17777777777777778,
"acc_norm_stderr": 0.0330278985990172
},
"hendrycksTest-astronomy": {
"acc": 0.21052631578947367,
"acc_stderr": 0.03317672787533156,
"acc_norm": 0.3026315789473684,
"acc_norm_stderr": 0.037385206761196686
},
"hendrycksTest-business_ethics": {
"acc": 0.32,
"acc_stderr": 0.046882617226215034,
"acc_norm": 0.31,
"acc_norm_stderr": 0.04648231987117316
},
"hendrycksTest-clinical_knowledge": {
"acc": 0.24528301886792453,
"acc_stderr": 0.026480357179895674,
"acc_norm": 0.3132075471698113,
"acc_norm_stderr": 0.028544793319055326
},
"hendrycksTest-college_biology": {
"acc": 0.2361111111111111,
"acc_stderr": 0.03551446610810826,
"acc_norm": 0.2013888888888889,
"acc_norm_stderr": 0.033536474697138406
},
"hendrycksTest-college_chemistry": {
"acc": 0.27,
"acc_stderr": 0.044619604333847394,
"acc_norm": 0.32,
"acc_norm_stderr": 0.046882617226215034
},
"hendrycksTest-college_computer_science": {
"acc": 0.29,
"acc_stderr": 0.04560480215720684,
"acc_norm": 0.3,
"acc_norm_stderr": 0.046056618647183814
},
"hendrycksTest-college_mathematics": {
"acc": 0.23,
"acc_stderr": 0.04229525846816505,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720684
},
"hendrycksTest-college_medicine": {
"acc": 0.24855491329479767,
"acc_stderr": 0.03295304696818318,
"acc_norm": 0.2543352601156069,
"acc_norm_stderr": 0.0332055644308557
},
"hendrycksTest-college_physics": {
"acc": 0.2549019607843137,
"acc_stderr": 0.043364327079931785,
"acc_norm": 0.24509803921568626,
"acc_norm_stderr": 0.04280105837364395
},
"hendrycksTest-computer_security": {
"acc": 0.2,
"acc_stderr": 0.04020151261036844,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720684
},
"hendrycksTest-conceptual_physics": {
"acc": 0.28936170212765955,
"acc_stderr": 0.02964400657700962,
"acc_norm": 0.225531914893617,
"acc_norm_stderr": 0.027321078417387536
},
"hendrycksTest-econometrics": {
"acc": 0.32456140350877194,
"acc_stderr": 0.04404556157374768,
"acc_norm": 0.2631578947368421,
"acc_norm_stderr": 0.04142439719489362
},
"hendrycksTest-electrical_engineering": {
"acc": 0.27586206896551724,
"acc_stderr": 0.03724563619774632,
"acc_norm": 0.2896551724137931,
"acc_norm_stderr": 0.03780019230438014
},
"hendrycksTest-elementary_mathematics": {
"acc": 0.20105820105820105,
"acc_stderr": 0.02064181078237016,
"acc_norm": 0.2275132275132275,
"acc_norm_stderr": 0.021591269407823778
},
"hendrycksTest-formal_logic": {
"acc": 0.29365079365079366,
"acc_stderr": 0.04073524322147127,
"acc_norm": 0.23809523809523808,
"acc_norm_stderr": 0.03809523809523811
},
"hendrycksTest-global_facts": {
"acc": 0.24,
"acc_stderr": 0.042923469599092816,
"acc_norm": 0.26,
"acc_norm_stderr": 0.044084400227680794
},
"hendrycksTest-high_school_biology": {
"acc": 0.2129032258064516,
"acc_stderr": 0.02328766512726854,
"acc_norm": 0.26129032258064516,
"acc_norm_stderr": 0.024993053397764812
},
"hendrycksTest-high_school_chemistry": {
"acc": 0.1921182266009852,
"acc_stderr": 0.027719315709614775,
"acc_norm": 0.23645320197044334,
"acc_norm_stderr": 0.029896114291733552
},
"hendrycksTest-high_school_computer_science": {
"acc": 0.25,
"acc_stderr": 0.04351941398892446,
"acc_norm": 0.29,
"acc_norm_stderr": 0.04560480215720684
},
"hendrycksTest-high_school_european_history": {
"acc": 0.24848484848484848,
"acc_stderr": 0.03374402644139404,
"acc_norm": 0.2909090909090909,
"acc_norm_stderr": 0.03546563019624336
},
"hendrycksTest-high_school_geography": {
"acc": 0.23737373737373738,
"acc_stderr": 0.030313710538198906,
"acc_norm": 0.3181818181818182,
"acc_norm_stderr": 0.033184773338453315
},
"hendrycksTest-high_school_government_and_politics": {
"acc": 0.26424870466321243,
"acc_stderr": 0.03182155050916648,
"acc_norm": 0.27979274611398963,
"acc_norm_stderr": 0.03239637046735702
},
"hendrycksTest-high_school_macroeconomics": {
"acc": 0.2512820512820513,
"acc_stderr": 0.021992016662370543,
"acc_norm": 0.2641025641025641,
"acc_norm_stderr": 0.022352193737453268
},
"hendrycksTest-high_school_mathematics": {
"acc": 0.2037037037037037,
"acc_stderr": 0.02455617221914128,
"acc_norm": 0.26296296296296295,
"acc_norm_stderr": 0.026842057873833713
},
"hendrycksTest-high_school_microeconomics": {
"acc": 0.25630252100840334,
"acc_stderr": 0.02835962087053395,
"acc_norm": 0.31512605042016806,
"acc_norm_stderr": 0.030176808288974337
},
"hendrycksTest-high_school_physics": {
"acc": 0.1986754966887417,
"acc_stderr": 0.03257847384436776,
"acc_norm": 0.23178807947019867,
"acc_norm_stderr": 0.03445406271987054
},
"hendrycksTest-high_school_psychology": {
"acc": 0.20917431192660552,
"acc_stderr": 0.017437937173343233,
"acc_norm": 0.22568807339449543,
"acc_norm_stderr": 0.017923087667803057
},
"hendrycksTest-high_school_statistics": {
"acc": 0.20833333333333334,
"acc_stderr": 0.027696910713093936,
"acc_norm": 0.30092592592592593,
"acc_norm_stderr": 0.031280390843298804
},
"hendrycksTest-high_school_us_history": {
"acc": 0.27450980392156865,
"acc_stderr": 0.031321798030832904,
"acc_norm": 0.28431372549019607,
"acc_norm_stderr": 0.03166009679399812
},
"hendrycksTest-high_school_world_history": {
"acc": 0.22784810126582278,
"acc_stderr": 0.027303484599069422,
"acc_norm": 0.28270042194092826,
"acc_norm_stderr": 0.02931281415395593
},
"hendrycksTest-human_aging": {
"acc": 0.29596412556053814,
"acc_stderr": 0.030636591348699796,
"acc_norm": 0.26905829596412556,
"acc_norm_stderr": 0.029763779406874972
},
"hendrycksTest-human_sexuality": {
"acc": 0.32061068702290074,
"acc_stderr": 0.04093329229834278,
"acc_norm": 0.31297709923664124,
"acc_norm_stderr": 0.04066962905677697
},
"hendrycksTest-international_law": {
"acc": 0.2231404958677686,
"acc_stderr": 0.03800754475228733,
"acc_norm": 0.4049586776859504,
"acc_norm_stderr": 0.04481137755942469
},
"hendrycksTest-jurisprudence": {
"acc": 0.24074074074074073,
"acc_stderr": 0.0413311944024384,
"acc_norm": 0.37962962962962965,
"acc_norm_stderr": 0.04691521224077742
},
"hendrycksTest-logical_fallacies": {
"acc": 0.1656441717791411,
"acc_stderr": 0.029208296231259104,
"acc_norm": 0.27607361963190186,
"acc_norm_stderr": 0.0351238528370505
},
"hendrycksTest-machine_learning": {
"acc": 0.3125,
"acc_stderr": 0.043994650575715215,
"acc_norm": 0.25892857142857145,
"acc_norm_stderr": 0.04157751539865629
},
"hendrycksTest-management": {
"acc": 0.2524271844660194,
"acc_stderr": 0.04301250399690878,
"acc_norm": 0.2912621359223301,
"acc_norm_stderr": 0.044986763205729224
},
"hendrycksTest-marketing": {
"acc": 0.29914529914529914,
"acc_stderr": 0.02999695185834948,
"acc_norm": 0.32051282051282054,
"acc_norm_stderr": 0.030572811310299604
},
"hendrycksTest-medical_genetics": {
"acc": 0.28,
"acc_stderr": 0.045126085985421255,
"acc_norm": 0.38,
"acc_norm_stderr": 0.04878317312145632
},
"hendrycksTest-miscellaneous": {
"acc": 0.2567049808429119,
"acc_stderr": 0.015620480263064535,
"acc_norm": 0.24904214559386972,
"acc_norm_stderr": 0.015464676163395962
},
"hendrycksTest-moral_disputes": {
"acc": 0.28901734104046245,
"acc_stderr": 0.02440517393578324,
"acc_norm": 0.30057803468208094,
"acc_norm_stderr": 0.0246853168672578
},
"hendrycksTest-moral_scenarios": {
"acc": 0.2346368715083799,
"acc_stderr": 0.014173044098303658,
"acc_norm": 0.27262569832402234,
"acc_norm_stderr": 0.01489339173524959
},
"hendrycksTest-nutrition": {
"acc": 0.25163398692810457,
"acc_stderr": 0.024848018263875202,
"acc_norm": 0.3366013071895425,
"acc_norm_stderr": 0.027057974624494385
},
"hendrycksTest-philosophy": {
"acc": 0.1832797427652733,
"acc_stderr": 0.021974198848265812,
"acc_norm": 0.2990353697749196,
"acc_norm_stderr": 0.02600330111788513
},
"hendrycksTest-prehistory": {
"acc": 0.2654320987654321,
"acc_stderr": 0.02456922360046085,
"acc_norm": 0.22839506172839505,
"acc_norm_stderr": 0.023358211840626267
},
"hendrycksTest-professional_accounting": {
"acc": 0.2127659574468085,
"acc_stderr": 0.024414612974307706,
"acc_norm": 0.2695035460992908,
"acc_norm_stderr": 0.026469036818590638
},
"hendrycksTest-professional_law": {
"acc": 0.2522816166883963,
"acc_stderr": 0.011092789056875238,
"acc_norm": 0.30182529335071706,
"acc_norm_stderr": 0.011724350518105893
},
"hendrycksTest-professional_medicine": {
"acc": 0.21323529411764705,
"acc_stderr": 0.024880971512294257,
"acc_norm": 0.2426470588235294,
"acc_norm_stderr": 0.026040662474201275
},
"hendrycksTest-professional_psychology": {
"acc": 0.2565359477124183,
"acc_stderr": 0.01766784161237899,
"acc_norm": 0.2565359477124183,
"acc_norm_stderr": 0.017667841612378977
},
"hendrycksTest-public_relations": {
"acc": 0.2818181818181818,
"acc_stderr": 0.04309118709946458,
"acc_norm": 0.2,
"acc_norm_stderr": 0.03831305140884603
},
"hendrycksTest-security_studies": {
"acc": 0.2938775510204082,
"acc_stderr": 0.02916273841024977,
"acc_norm": 0.2653061224489796,
"acc_norm_stderr": 0.028263889943784603
},
"hendrycksTest-sociology": {
"acc": 0.2537313432835821,
"acc_stderr": 0.03076944496729602,
"acc_norm": 0.2537313432835821,
"acc_norm_stderr": 0.03076944496729602
},
"hendrycksTest-us_foreign_policy": {
"acc": 0.3,
"acc_stderr": 0.046056618647183814,
"acc_norm": 0.35,
"acc_norm_stderr": 0.0479372485441102
},
"hendrycksTest-virology": {
"acc": 0.28313253012048195,
"acc_stderr": 0.03507295431370518,
"acc_norm": 0.2469879518072289,
"acc_norm_stderr": 0.03357351982064536
},
"hendrycksTest-world_religions": {
"acc": 0.3216374269005848,
"acc_stderr": 0.03582529442573122,
"acc_norm": 0.32748538011695905,
"acc_norm_stderr": 0.035993357714560276
}
},
"versions": {
"copa": 0,
"piqa": 0,
"rte": 0,
"winogrande": 0,
"hendrycksTest-abstract_algebra": 0,
"hendrycksTest-anatomy": 0,
"hendrycksTest-astronomy": 0,
"hendrycksTest-business_ethics": 0,
"hendrycksTest-clinical_knowledge": 0,
"hendrycksTest-college_biology": 0,
"hendrycksTest-college_chemistry": 0,
"hendrycksTest-college_computer_science": 0,
"hendrycksTest-college_mathematics": 0,
"hendrycksTest-college_medicine": 0,
"hendrycksTest-college_physics": 0,
"hendrycksTest-computer_security": 0,
"hendrycksTest-conceptual_physics": 0,
"hendrycksTest-econometrics": 0,
"hendrycksTest-electrical_engineering": 0,
"hendrycksTest-elementary_mathematics": 0,
"hendrycksTest-formal_logic": 0,
"hendrycksTest-global_facts": 0,
"hendrycksTest-high_school_biology": 0,
"hendrycksTest-high_school_chemistry": 0,
"hendrycksTest-high_school_computer_science": 0,
"hendrycksTest-high_school_european_history": 0,
"hendrycksTest-high_school_geography": 0,
"hendrycksTest-high_school_government_and_politics": 0,
"hendrycksTest-high_school_macroeconomics": 0,
"hendrycksTest-high_school_mathematics": 0,
"hendrycksTest-high_school_microeconomics": 0,
"hendrycksTest-high_school_physics": 0,
"hendrycksTest-high_school_psychology": 0,
"hendrycksTest-high_school_statistics": 0,
"hendrycksTest-high_school_us_history": 0,
"hendrycksTest-high_school_world_history": 0,
"hendrycksTest-human_aging": 0,
"hendrycksTest-human_sexuality": 0,
"hendrycksTest-international_law": 0,
"hendrycksTest-jurisprudence": 0,
"hendrycksTest-logical_fallacies": 0,
"hendrycksTest-machine_learning": 0,
"hendrycksTest-management": 0,
"hendrycksTest-marketing": 0,
"hendrycksTest-medical_genetics": 0,
"hendrycksTest-miscellaneous": 0,
"hendrycksTest-moral_disputes": 0,
"hendrycksTest-moral_scenarios": 0,
"hendrycksTest-nutrition": 0,
"hendrycksTest-philosophy": 0,
"hendrycksTest-prehistory": 0,
"hendrycksTest-professional_accounting": 0,
"hendrycksTest-professional_law": 0,
"hendrycksTest-professional_medicine": 0,
"hendrycksTest-professional_psychology": 0,
"hendrycksTest-public_relations": 0,
"hendrycksTest-security_studies": 0,
"hendrycksTest-sociology": 0,
"hendrycksTest-us_foreign_policy": 0,
"hendrycksTest-virology": 0,
"hendrycksTest-world_religions": 0
}
}