lm1-misc-oscar / 1b121b21b /evaluation /lm1-1b1-21b-oscar-results_lm-eval_global_step39672_2022-12-02-14-19-24.csv
Muennighoff's picture
Add
8ec84fa
task,metric,value,err,version
copa,acc,0.7,0.046056618647183814,0
hendrycksTest-abstract_algebra,acc,0.22,0.0416333199893227,0
hendrycksTest-abstract_algebra,acc_norm,0.22,0.0416333199893227,0
hendrycksTest-anatomy,acc,0.17037037037037037,0.03247781185995593,0
hendrycksTest-anatomy,acc_norm,0.17777777777777778,0.0330278985990172,0
hendrycksTest-astronomy,acc,0.21052631578947367,0.03317672787533156,0
hendrycksTest-astronomy,acc_norm,0.3026315789473684,0.037385206761196686,0
hendrycksTest-business_ethics,acc,0.32,0.046882617226215034,0
hendrycksTest-business_ethics,acc_norm,0.31,0.04648231987117316,0
hendrycksTest-clinical_knowledge,acc,0.24528301886792453,0.026480357179895674,0
hendrycksTest-clinical_knowledge,acc_norm,0.3132075471698113,0.028544793319055326,0
hendrycksTest-college_biology,acc,0.2361111111111111,0.03551446610810826,0
hendrycksTest-college_biology,acc_norm,0.2013888888888889,0.033536474697138406,0
hendrycksTest-college_chemistry,acc,0.27,0.044619604333847394,0
hendrycksTest-college_chemistry,acc_norm,0.32,0.046882617226215034,0
hendrycksTest-college_computer_science,acc,0.29,0.04560480215720684,0
hendrycksTest-college_computer_science,acc_norm,0.3,0.046056618647183814,0
hendrycksTest-college_mathematics,acc,0.23,0.04229525846816505,0
hendrycksTest-college_mathematics,acc_norm,0.29,0.04560480215720684,0
hendrycksTest-college_medicine,acc,0.24855491329479767,0.03295304696818318,0
hendrycksTest-college_medicine,acc_norm,0.2543352601156069,0.0332055644308557,0
hendrycksTest-college_physics,acc,0.2549019607843137,0.043364327079931785,0
hendrycksTest-college_physics,acc_norm,0.24509803921568626,0.04280105837364395,0
hendrycksTest-computer_security,acc,0.2,0.04020151261036844,0
hendrycksTest-computer_security,acc_norm,0.29,0.04560480215720684,0
hendrycksTest-conceptual_physics,acc,0.28936170212765955,0.02964400657700962,0
hendrycksTest-conceptual_physics,acc_norm,0.225531914893617,0.027321078417387536,0
hendrycksTest-econometrics,acc,0.32456140350877194,0.04404556157374768,0
hendrycksTest-econometrics,acc_norm,0.2631578947368421,0.04142439719489362,0
hendrycksTest-electrical_engineering,acc,0.27586206896551724,0.03724563619774632,0
hendrycksTest-electrical_engineering,acc_norm,0.2896551724137931,0.03780019230438014,0
hendrycksTest-elementary_mathematics,acc,0.20105820105820105,0.02064181078237016,0
hendrycksTest-elementary_mathematics,acc_norm,0.2275132275132275,0.021591269407823778,0
hendrycksTest-formal_logic,acc,0.29365079365079366,0.04073524322147127,0
hendrycksTest-formal_logic,acc_norm,0.23809523809523808,0.03809523809523811,0
hendrycksTest-global_facts,acc,0.24,0.042923469599092816,0
hendrycksTest-global_facts,acc_norm,0.26,0.044084400227680794,0
hendrycksTest-high_school_biology,acc,0.2129032258064516,0.02328766512726854,0
hendrycksTest-high_school_biology,acc_norm,0.26129032258064516,0.024993053397764812,0
hendrycksTest-high_school_chemistry,acc,0.1921182266009852,0.027719315709614775,0
hendrycksTest-high_school_chemistry,acc_norm,0.23645320197044334,0.029896114291733552,0
hendrycksTest-high_school_computer_science,acc,0.25,0.04351941398892446,0
hendrycksTest-high_school_computer_science,acc_norm,0.29,0.04560480215720684,0
hendrycksTest-high_school_european_history,acc,0.24848484848484848,0.03374402644139404,0
hendrycksTest-high_school_european_history,acc_norm,0.2909090909090909,0.03546563019624336,0
hendrycksTest-high_school_geography,acc,0.23737373737373738,0.030313710538198906,0
hendrycksTest-high_school_geography,acc_norm,0.3181818181818182,0.033184773338453315,0
hendrycksTest-high_school_government_and_politics,acc,0.26424870466321243,0.03182155050916648,0
hendrycksTest-high_school_government_and_politics,acc_norm,0.27979274611398963,0.03239637046735702,0
hendrycksTest-high_school_macroeconomics,acc,0.2512820512820513,0.021992016662370543,0
hendrycksTest-high_school_macroeconomics,acc_norm,0.2641025641025641,0.022352193737453268,0
hendrycksTest-high_school_mathematics,acc,0.2037037037037037,0.02455617221914128,0
hendrycksTest-high_school_mathematics,acc_norm,0.26296296296296295,0.026842057873833713,0
hendrycksTest-high_school_microeconomics,acc,0.25630252100840334,0.02835962087053395,0
hendrycksTest-high_school_microeconomics,acc_norm,0.31512605042016806,0.030176808288974337,0
hendrycksTest-high_school_physics,acc,0.1986754966887417,0.03257847384436776,0
hendrycksTest-high_school_physics,acc_norm,0.23178807947019867,0.03445406271987054,0
hendrycksTest-high_school_psychology,acc,0.20917431192660552,0.017437937173343233,0
hendrycksTest-high_school_psychology,acc_norm,0.22568807339449543,0.017923087667803057,0
hendrycksTest-high_school_statistics,acc,0.20833333333333334,0.027696910713093936,0
hendrycksTest-high_school_statistics,acc_norm,0.30092592592592593,0.031280390843298804,0
hendrycksTest-high_school_us_history,acc,0.27450980392156865,0.031321798030832904,0
hendrycksTest-high_school_us_history,acc_norm,0.28431372549019607,0.03166009679399812,0
hendrycksTest-high_school_world_history,acc,0.22784810126582278,0.027303484599069422,0
hendrycksTest-high_school_world_history,acc_norm,0.28270042194092826,0.02931281415395593,0
hendrycksTest-human_aging,acc,0.29596412556053814,0.030636591348699796,0
hendrycksTest-human_aging,acc_norm,0.26905829596412556,0.029763779406874972,0
hendrycksTest-human_sexuality,acc,0.32061068702290074,0.04093329229834278,0
hendrycksTest-human_sexuality,acc_norm,0.31297709923664124,0.04066962905677697,0
hendrycksTest-international_law,acc,0.2231404958677686,0.03800754475228733,0
hendrycksTest-international_law,acc_norm,0.4049586776859504,0.04481137755942469,0
hendrycksTest-jurisprudence,acc,0.24074074074074073,0.0413311944024384,0
hendrycksTest-jurisprudence,acc_norm,0.37962962962962965,0.04691521224077742,0
hendrycksTest-logical_fallacies,acc,0.1656441717791411,0.029208296231259104,0
hendrycksTest-logical_fallacies,acc_norm,0.27607361963190186,0.0351238528370505,0
hendrycksTest-machine_learning,acc,0.3125,0.043994650575715215,0
hendrycksTest-machine_learning,acc_norm,0.25892857142857145,0.04157751539865629,0
hendrycksTest-management,acc,0.2524271844660194,0.04301250399690878,0
hendrycksTest-management,acc_norm,0.2912621359223301,0.044986763205729224,0
hendrycksTest-marketing,acc,0.29914529914529914,0.02999695185834948,0
hendrycksTest-marketing,acc_norm,0.32051282051282054,0.030572811310299604,0
hendrycksTest-medical_genetics,acc,0.28,0.045126085985421255,0
hendrycksTest-medical_genetics,acc_norm,0.38,0.04878317312145632,0
hendrycksTest-miscellaneous,acc,0.2567049808429119,0.015620480263064535,0
hendrycksTest-miscellaneous,acc_norm,0.24904214559386972,0.015464676163395962,0
hendrycksTest-moral_disputes,acc,0.28901734104046245,0.02440517393578324,0
hendrycksTest-moral_disputes,acc_norm,0.30057803468208094,0.0246853168672578,0
hendrycksTest-moral_scenarios,acc,0.2346368715083799,0.014173044098303658,0
hendrycksTest-moral_scenarios,acc_norm,0.27262569832402234,0.01489339173524959,0
hendrycksTest-nutrition,acc,0.25163398692810457,0.024848018263875202,0
hendrycksTest-nutrition,acc_norm,0.3366013071895425,0.027057974624494385,0
hendrycksTest-philosophy,acc,0.1832797427652733,0.021974198848265812,0
hendrycksTest-philosophy,acc_norm,0.2990353697749196,0.02600330111788513,0
hendrycksTest-prehistory,acc,0.2654320987654321,0.02456922360046085,0
hendrycksTest-prehistory,acc_norm,0.22839506172839505,0.023358211840626267,0
hendrycksTest-professional_accounting,acc,0.2127659574468085,0.024414612974307706,0
hendrycksTest-professional_accounting,acc_norm,0.2695035460992908,0.026469036818590638,0
hendrycksTest-professional_law,acc,0.2522816166883963,0.011092789056875238,0
hendrycksTest-professional_law,acc_norm,0.30182529335071706,0.011724350518105893,0
hendrycksTest-professional_medicine,acc,0.21323529411764705,0.024880971512294257,0
hendrycksTest-professional_medicine,acc_norm,0.2426470588235294,0.026040662474201275,0
hendrycksTest-professional_psychology,acc,0.2565359477124183,0.01766784161237899,0
hendrycksTest-professional_psychology,acc_norm,0.2565359477124183,0.017667841612378977,0
hendrycksTest-public_relations,acc,0.2818181818181818,0.04309118709946458,0
hendrycksTest-public_relations,acc_norm,0.2,0.03831305140884603,0
hendrycksTest-security_studies,acc,0.2938775510204082,0.02916273841024977,0
hendrycksTest-security_studies,acc_norm,0.2653061224489796,0.028263889943784603,0
hendrycksTest-sociology,acc,0.2537313432835821,0.03076944496729602,0
hendrycksTest-sociology,acc_norm,0.2537313432835821,0.03076944496729602,0
hendrycksTest-us_foreign_policy,acc,0.3,0.046056618647183814,0
hendrycksTest-us_foreign_policy,acc_norm,0.35,0.0479372485441102,0
hendrycksTest-virology,acc,0.28313253012048195,0.03507295431370518,0
hendrycksTest-virology,acc_norm,0.2469879518072289,0.03357351982064536,0
hendrycksTest-world_religions,acc,0.3216374269005848,0.03582529442573122,0
hendrycksTest-world_religions,acc_norm,0.32748538011695905,0.035993357714560276,0
piqa,acc,0.6675734494015234,0.01099114155744559,0
piqa,acc_norm,0.6697497279651795,0.010972947133006297,0
rte,acc,0.5631768953068592,0.029855247390314938,0
winogrande,acc,0.5011838989739542,0.014052446290529009,0