{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016, "grad_norm": 0.43310171365737915, "learning_rate": 3.1746031746031746e-06, "loss": 0.2682, "mean_token_accuracy": 0.9273857176303864, "step": 1 }, { "epoch": 0.008, "grad_norm": 0.8490898609161377, "learning_rate": 1.5873015873015872e-05, "loss": 0.3388, "mean_token_accuracy": 0.9119045659899712, "step": 5 }, { "epoch": 0.016, "grad_norm": 0.34499526023864746, "learning_rate": 3.1746031746031745e-05, "loss": 0.3481, "mean_token_accuracy": 0.9102882027626038, "step": 10 }, { "epoch": 0.024, "grad_norm": 0.285628080368042, "learning_rate": 4.761904761904762e-05, "loss": 0.2976, "mean_token_accuracy": 0.9188290774822235, "step": 15 }, { "epoch": 0.032, "grad_norm": 0.16985873878002167, "learning_rate": 6.349206349206349e-05, "loss": 0.257, "mean_token_accuracy": 0.923795086145401, "step": 20 }, { "epoch": 0.04, "grad_norm": 0.16013243794441223, "learning_rate": 7.936507936507937e-05, "loss": 0.2077, "mean_token_accuracy": 0.9377835214138031, "step": 25 }, { "epoch": 0.048, "grad_norm": 0.17801596224308014, "learning_rate": 9.523809523809524e-05, "loss": 0.2022, "mean_token_accuracy": 0.9342319130897522, "step": 30 }, { "epoch": 0.056, "grad_norm": 0.12985391914844513, "learning_rate": 0.00011111111111111112, "loss": 0.1644, "mean_token_accuracy": 0.9495729327201843, "step": 35 }, { "epoch": 0.064, "grad_norm": 0.16224823892116547, "learning_rate": 0.00012698412698412698, "loss": 0.1243, "mean_token_accuracy": 0.9592924535274505, "step": 40 }, { "epoch": 0.072, "grad_norm": 0.12580786645412445, "learning_rate": 0.00014285714285714287, "loss": 0.101, "mean_token_accuracy": 0.9631378591060639, "step": 45 }, { "epoch": 0.08, "grad_norm": 0.12046530097723007, "learning_rate": 0.00015873015873015873, "loss": 0.0813, "mean_token_accuracy": 0.9683941245079041, "step": 50 }, { "epoch": 0.088, "grad_norm": 0.12417536973953247, "learning_rate": 0.00017460317460317462, "loss": 0.0586, "mean_token_accuracy": 0.9767989754676819, "step": 55 }, { "epoch": 0.096, "grad_norm": 0.13074961304664612, "learning_rate": 0.00019047619047619048, "loss": 0.0637, "mean_token_accuracy": 0.9737584054470062, "step": 60 }, { "epoch": 0.104, "grad_norm": 0.12429118901491165, "learning_rate": 0.00019999375039475277, "loss": 0.0537, "mean_token_accuracy": 0.9769091963768005, "step": 65 }, { "epoch": 0.112, "grad_norm": 0.11161642521619797, "learning_rate": 0.0001999234513064475, "loss": 0.0486, "mean_token_accuracy": 0.9791503012180328, "step": 70 }, { "epoch": 0.12, "grad_norm": 0.09909889847040176, "learning_rate": 0.00019977509622105233, "loss": 0.044, "mean_token_accuracy": 0.981050831079483, "step": 75 }, { "epoch": 0.128, "grad_norm": 0.07484742999076843, "learning_rate": 0.0001995488010273198, "loss": 0.0431, "mean_token_accuracy": 0.9817534804344177, "step": 80 }, { "epoch": 0.136, "grad_norm": 0.08371058106422424, "learning_rate": 0.00019924474249753655, "loss": 0.0441, "mean_token_accuracy": 0.9811403095722199, "step": 85 }, { "epoch": 0.144, "grad_norm": 0.08213909715414047, "learning_rate": 0.00019886315814943647, "loss": 0.0385, "mean_token_accuracy": 0.9830973207950592, "step": 90 }, { "epoch": 0.152, "grad_norm": 0.07752417027950287, "learning_rate": 0.0001984043460606618, "loss": 0.0442, "mean_token_accuracy": 0.9807018160820007, "step": 95 }, { "epoch": 0.16, "grad_norm": 0.07620657980442047, "learning_rate": 0.0001978686646359173, "loss": 0.0392, "mean_token_accuracy": 0.982789009809494, "step": 100 }, { "epoch": 0.168, "grad_norm": 0.083968386054039, "learning_rate": 0.0001972565323269996, "loss": 0.0373, "mean_token_accuracy": 0.9836896479129791, "step": 105 }, { "epoch": 0.176, "grad_norm": 0.051836077123880386, "learning_rate": 0.00019656842730592046, "loss": 0.0407, "mean_token_accuracy": 0.9821825683116913, "step": 110 }, { "epoch": 0.184, "grad_norm": 0.07549381256103516, "learning_rate": 0.0001958048870913786, "loss": 0.038, "mean_token_accuracy": 0.9829850435256958, "step": 115 }, { "epoch": 0.192, "grad_norm": 0.05420655012130737, "learning_rate": 0.0001949665081288729, "loss": 0.0347, "mean_token_accuracy": 0.9851674497127533, "step": 120 }, { "epoch": 0.2, "grad_norm": 0.06140986829996109, "learning_rate": 0.00019405394532478424, "loss": 0.033, "mean_token_accuracy": 0.9853869795799255, "step": 125 }, { "epoch": 0.208, "grad_norm": 0.06693119555711746, "learning_rate": 0.00019306791153479006, "loss": 0.0404, "mean_token_accuracy": 0.9823243021965027, "step": 130 }, { "epoch": 0.216, "grad_norm": 0.08230168372392654, "learning_rate": 0.00019200917700701176, "loss": 0.0369, "mean_token_accuracy": 0.9838479280471801, "step": 135 }, { "epoch": 0.224, "grad_norm": 0.06229570508003235, "learning_rate": 0.0001908785687803289, "loss": 0.0373, "mean_token_accuracy": 0.9834259986877442, "step": 140 }, { "epoch": 0.232, "grad_norm": 0.06680244952440262, "learning_rate": 0.00018967697003833157, "loss": 0.0383, "mean_token_accuracy": 0.9826522827148437, "step": 145 }, { "epoch": 0.24, "grad_norm": 0.061888325959444046, "learning_rate": 0.0001884053194194142, "loss": 0.0329, "mean_token_accuracy": 0.9850789308547974, "step": 150 }, { "epoch": 0.248, "grad_norm": 0.06167807802557945, "learning_rate": 0.00018706461028355104, "loss": 0.0349, "mean_token_accuracy": 0.9847366988658905, "step": 155 }, { "epoch": 0.256, "grad_norm": 0.051647648215293884, "learning_rate": 0.00018565588993632487, "loss": 0.0435, "mean_token_accuracy": 0.9802697420120239, "step": 160 }, { "epoch": 0.264, "grad_norm": 0.06429916620254517, "learning_rate": 0.0001841802588108161, "loss": 0.036, "mean_token_accuracy": 0.983528858423233, "step": 165 }, { "epoch": 0.272, "grad_norm": 0.07308773696422577, "learning_rate": 0.00018263886960799062, "loss": 0.0365, "mean_token_accuracy": 0.983112233877182, "step": 170 }, { "epoch": 0.28, "grad_norm": 0.05631113797426224, "learning_rate": 0.00018103292639625837, "loss": 0.0363, "mean_token_accuracy": 0.9835257828235626, "step": 175 }, { "epoch": 0.288, "grad_norm": 0.07624773681163788, "learning_rate": 0.0001793636836709057, "loss": 0.0353, "mean_token_accuracy": 0.9840705871582032, "step": 180 }, { "epoch": 0.296, "grad_norm": 0.05333884805440903, "learning_rate": 0.0001776324453741365, "loss": 0.0325, "mean_token_accuracy": 0.985572201013565, "step": 185 }, { "epoch": 0.304, "grad_norm": 0.05988354608416557, "learning_rate": 0.00017584056387648727, "loss": 0.0382, "mean_token_accuracy": 0.982496690750122, "step": 190 }, { "epoch": 0.312, "grad_norm": 0.05714619532227516, "learning_rate": 0.0001739894389204122, "loss": 0.0354, "mean_token_accuracy": 0.9839600920677185, "step": 195 }, { "epoch": 0.32, "grad_norm": 0.05059760436415672, "learning_rate": 0.00017208051652686335, "loss": 0.0349, "mean_token_accuracy": 0.984496396780014, "step": 200 }, { "epoch": 0.328, "grad_norm": 0.07071304321289062, "learning_rate": 0.00017011528786571969, "loss": 0.0305, "mean_token_accuracy": 0.9868167281150818, "step": 205 }, { "epoch": 0.336, "grad_norm": 0.05378766357898712, "learning_rate": 0.00016809528809094807, "loss": 0.0353, "mean_token_accuracy": 0.9841848373413086, "step": 210 }, { "epoch": 0.344, "grad_norm": 0.06888769567012787, "learning_rate": 0.0001660220951414055, "loss": 0.0331, "mean_token_accuracy": 0.9852972328662872, "step": 215 }, { "epoch": 0.352, "grad_norm": 0.06159176304936409, "learning_rate": 0.00016389732850821966, "loss": 0.0328, "mean_token_accuracy": 0.9849840342998505, "step": 220 }, { "epoch": 0.36, "grad_norm": 0.05890626460313797, "learning_rate": 0.0001617226479697105, "loss": 0.0318, "mean_token_accuracy": 0.9859302580356598, "step": 225 }, { "epoch": 0.368, "grad_norm": 0.06155586987733841, "learning_rate": 0.00015949975229484134, "loss": 0.0367, "mean_token_accuracy": 0.9838493525981903, "step": 230 }, { "epoch": 0.376, "grad_norm": 0.050045762211084366, "learning_rate": 0.00015723037791621193, "loss": 0.0325, "mean_token_accuracy": 0.9856610596179962, "step": 235 }, { "epoch": 0.384, "grad_norm": 0.05855004861950874, "learning_rate": 0.00015491629757363032, "loss": 0.0312, "mean_token_accuracy": 0.9855502784252167, "step": 240 }, { "epoch": 0.392, "grad_norm": 0.08005663007497787, "learning_rate": 0.00015255931892932333, "loss": 0.031, "mean_token_accuracy": 0.9869074642658233, "step": 245 }, { "epoch": 0.4, "grad_norm": 0.061815045773983, "learning_rate": 0.0001501612831558664, "loss": 0.0366, "mean_token_accuracy": 0.983307272195816, "step": 250 }, { "epoch": 0.408, "grad_norm": 0.0681883841753006, "learning_rate": 0.00014772406349793744, "loss": 0.0317, "mean_token_accuracy": 0.9858060896396637, "step": 255 }, { "epoch": 0.416, "grad_norm": 0.056142307817935944, "learning_rate": 0.0001452495638090167, "loss": 0.0304, "mean_token_accuracy": 0.9861166894435882, "step": 260 }, { "epoch": 0.424, "grad_norm": 0.0800497755408287, "learning_rate": 0.00014273971706417647, "loss": 0.0351, "mean_token_accuracy": 0.9837919533252716, "step": 265 }, { "epoch": 0.432, "grad_norm": 0.05837633088231087, "learning_rate": 0.00014019648385012244, "loss": 0.0304, "mean_token_accuracy": 0.9853955626487731, "step": 270 }, { "epoch": 0.44, "grad_norm": 0.058906685560941696, "learning_rate": 0.00013762185083366556, "loss": 0.0321, "mean_token_accuracy": 0.9853337109088898, "step": 275 }, { "epoch": 0.448, "grad_norm": 0.06374027580022812, "learning_rate": 0.00013501782920982184, "loss": 0.0379, "mean_token_accuracy": 0.9825663805007935, "step": 280 }, { "epoch": 0.456, "grad_norm": 0.062413912266492844, "learning_rate": 0.00013238645313075104, "loss": 0.0362, "mean_token_accuracy": 0.9839500784873962, "step": 285 }, { "epoch": 0.464, "grad_norm": 0.04723189026117325, "learning_rate": 0.00012972977811676287, "loss": 0.0316, "mean_token_accuracy": 0.9853797852993011, "step": 290 }, { "epoch": 0.472, "grad_norm": 0.05583701282739639, "learning_rate": 0.00012704987945063068, "loss": 0.0324, "mean_token_accuracy": 0.9851679742336273, "step": 295 }, { "epoch": 0.48, "grad_norm": 0.053289640694856644, "learning_rate": 0.00012434885055646823, "loss": 0.0304, "mean_token_accuracy": 0.9868038833141327, "step": 300 }, { "epoch": 0.488, "grad_norm": 0.0547807514667511, "learning_rate": 0.00012162880136443447, "loss": 0.0382, "mean_token_accuracy": 0.9832392990589142, "step": 305 }, { "epoch": 0.496, "grad_norm": 0.062117431312799454, "learning_rate": 0.00011889185666254506, "loss": 0.0347, "mean_token_accuracy": 0.9848644256591796, "step": 310 }, { "epoch": 0.504, "grad_norm": 0.05534973368048668, "learning_rate": 0.00011614015443687722, "loss": 0.032, "mean_token_accuracy": 0.9855094432830811, "step": 315 }, { "epoch": 0.512, "grad_norm": 0.04713955521583557, "learning_rate": 0.0001133758442014651, "loss": 0.0296, "mean_token_accuracy": 0.9867917716503143, "step": 320 }, { "epoch": 0.52, "grad_norm": 0.05600622668862343, "learning_rate": 0.00011060108531918971, "loss": 0.0342, "mean_token_accuracy": 0.9841290414333344, "step": 325 }, { "epoch": 0.528, "grad_norm": 0.04774011671543121, "learning_rate": 0.0001078180453149754, "loss": 0.031, "mean_token_accuracy": 0.9855308055877685, "step": 330 }, { "epoch": 0.536, "grad_norm": 0.05677397921681404, "learning_rate": 0.00010502889818261075, "loss": 0.0354, "mean_token_accuracy": 0.9833913028240204, "step": 335 }, { "epoch": 0.544, "grad_norm": 0.08369103819131851, "learning_rate": 0.00010223582268651586, "loss": 0.0344, "mean_token_accuracy": 0.98387970328331, "step": 340 }, { "epoch": 0.552, "grad_norm": 0.07704319059848785, "learning_rate": 9.94410006597835e-05, "loss": 0.0343, "mean_token_accuracy": 0.9841209590435028, "step": 345 }, { "epoch": 0.56, "grad_norm": 0.0598573163151741, "learning_rate": 9.66466152998226e-05, "loss": 0.0279, "mean_token_accuracy": 0.9878278017044068, "step": 350 }, { "epoch": 0.568, "grad_norm": 0.05838743597269058, "learning_rate": 9.385484946293637e-05, "loss": 0.0354, "mean_token_accuracy": 0.9834566116333008, "step": 355 }, { "epoch": 0.576, "grad_norm": 0.0644030049443245, "learning_rate": 9.106788395916678e-05, "loss": 0.0329, "mean_token_accuracy": 0.985417640209198, "step": 360 }, { "epoch": 0.584, "grad_norm": 0.05025125667452812, "learning_rate": 8.828789584873754e-05, "loss": 0.034, "mean_token_accuracy": 0.984911048412323, "step": 365 }, { "epoch": 0.592, "grad_norm": 0.04645683616399765, "learning_rate": 8.551705674142617e-05, "loss": 0.0345, "mean_token_accuracy": 0.9842629969120026, "step": 370 }, { "epoch": 0.6, "grad_norm": 0.06257304549217224, "learning_rate": 8.275753110019367e-05, "loss": 0.0307, "mean_token_accuracy": 0.9859090149402618, "step": 375 }, { "epoch": 0.608, "grad_norm": 0.08189195394515991, "learning_rate": 8.001147455039735e-05, "loss": 0.0293, "mean_token_accuracy": 0.9876027524471283, "step": 380 }, { "epoch": 0.616, "grad_norm": 0.05993218719959259, "learning_rate": 7.728103219590681e-05, "loss": 0.0289, "mean_token_accuracy": 0.9873842716217041, "step": 385 }, { "epoch": 0.624, "grad_norm": 0.04940414056181908, "learning_rate": 7.456833694343906e-05, "loss": 0.0314, "mean_token_accuracy": 0.9858180701732635, "step": 390 }, { "epoch": 0.632, "grad_norm": 0.06051292642951012, "learning_rate": 7.18755078364214e-05, "loss": 0.035, "mean_token_accuracy": 0.9843159735202789, "step": 395 }, { "epoch": 0.64, "grad_norm": 0.07288350909948349, "learning_rate": 6.920464839968405e-05, "loss": 0.0289, "mean_token_accuracy": 0.9867165565490723, "step": 400 }, { "epoch": 0.648, "grad_norm": 0.05235522985458374, "learning_rate": 6.65578449962749e-05, "loss": 0.0342, "mean_token_accuracy": 0.9844050407409668, "step": 405 }, { "epoch": 0.656, "grad_norm": 0.04962315410375595, "learning_rate": 6.393716519768047e-05, "loss": 0.0318, "mean_token_accuracy": 0.9854838192462921, "step": 410 }, { "epoch": 0.664, "grad_norm": 0.05337170138955116, "learning_rate": 6.134465616872598e-05, "loss": 0.029, "mean_token_accuracy": 0.9865143656730652, "step": 415 }, { "epoch": 0.672, "grad_norm": 0.061498258262872696, "learning_rate": 5.878234306841637e-05, "loss": 0.0304, "mean_token_accuracy": 0.9859014391899109, "step": 420 }, { "epoch": 0.68, "grad_norm": 0.05795282498002052, "learning_rate": 5.62522274679673e-05, "loss": 0.0311, "mean_token_accuracy": 0.9858847856521606, "step": 425 }, { "epoch": 0.688, "grad_norm": 0.05434182286262512, "learning_rate": 5.375628578726181e-05, "loss": 0.0262, "mean_token_accuracy": 0.9887398540973663, "step": 430 }, { "epoch": 0.696, "grad_norm": 0.0455637201666832, "learning_rate": 5.1296467750954314e-05, "loss": 0.0315, "mean_token_accuracy": 0.9861598789691925, "step": 435 }, { "epoch": 0.704, "grad_norm": 0.05739019066095352, "learning_rate": 4.8874694865427676e-05, "loss": 0.0304, "mean_token_accuracy": 0.9865131139755249, "step": 440 }, { "epoch": 0.712, "grad_norm": 0.07262022793292999, "learning_rate": 4.649285891779327e-05, "loss": 0.0277, "mean_token_accuracy": 0.9875944793224335, "step": 445 }, { "epoch": 0.72, "grad_norm": 0.06861643493175507, "learning_rate": 4.415282049810644e-05, "loss": 0.0288, "mean_token_accuracy": 0.9869144856929779, "step": 450 }, { "epoch": 0.728, "grad_norm": 0.057887740433216095, "learning_rate": 4.1856407545951834e-05, "loss": 0.0332, "mean_token_accuracy": 0.9847104012966156, "step": 455 }, { "epoch": 0.736, "grad_norm": 0.07549154758453369, "learning_rate": 3.9605413922533874e-05, "loss": 0.0294, "mean_token_accuracy": 0.9868144273757935, "step": 460 }, { "epoch": 0.744, "grad_norm": 0.058476611971855164, "learning_rate": 3.740159800938784e-05, "loss": 0.0246, "mean_token_accuracy": 0.9886395156383514, "step": 465 }, { "epoch": 0.752, "grad_norm": 0.05933303013443947, "learning_rate": 3.5246681334806175e-05, "loss": 0.0263, "mean_token_accuracy": 0.9887538492679596, "step": 470 }, { "epoch": 0.76, "grad_norm": 0.061903417110443115, "learning_rate": 3.3142347229053015e-05, "loss": 0.0333, "mean_token_accuracy": 0.9845255970954895, "step": 475 }, { "epoch": 0.768, "grad_norm": 0.06767533719539642, "learning_rate": 3.109023950941736e-05, "loss": 0.0289, "mean_token_accuracy": 0.9869068741798401, "step": 480 }, { "epoch": 0.776, "grad_norm": 0.05679545924067497, "learning_rate": 2.909196119613218e-05, "loss": 0.0313, "mean_token_accuracy": 0.9862040936946869, "step": 485 }, { "epoch": 0.784, "grad_norm": 0.04378621280193329, "learning_rate": 2.7149073260162416e-05, "loss": 0.0311, "mean_token_accuracy": 0.9860045194625855, "step": 490 }, { "epoch": 0.792, "grad_norm": 0.04507381096482277, "learning_rate": 2.5263093403840142e-05, "loss": 0.0324, "mean_token_accuracy": 0.985187429189682, "step": 495 }, { "epoch": 0.8, "grad_norm": 0.04896757751703262, "learning_rate": 2.3435494875299314e-05, "loss": 0.0285, "mean_token_accuracy": 0.9870898187160492, "step": 500 }, { "epoch": 0.808, "grad_norm": 0.06324876099824905, "learning_rate": 2.166770531763633e-05, "loss": 0.027, "mean_token_accuracy": 0.9879481792449951, "step": 505 }, { "epoch": 0.816, "grad_norm": 0.04500701278448105, "learning_rate": 1.9961105653695266e-05, "loss": 0.0328, "mean_token_accuracy": 0.9849318027496338, "step": 510 }, { "epoch": 0.824, "grad_norm": 0.05476031452417374, "learning_rate": 1.8317029007349085e-05, "loss": 0.0304, "mean_token_accuracy": 0.9857797741889953, "step": 515 }, { "epoch": 0.832, "grad_norm": 0.04693017527461052, "learning_rate": 1.6736759662119183e-05, "loss": 0.0278, "mean_token_accuracy": 0.9873194813728332, "step": 520 }, { "epoch": 0.84, "grad_norm": 0.07823780179023743, "learning_rate": 1.5221532057947419e-05, "loss": 0.0308, "mean_token_accuracy": 0.9862527191638947, "step": 525 }, { "epoch": 0.848, "grad_norm": 0.04295206442475319, "learning_rate": 1.3772529826903269e-05, "loss": 0.0258, "mean_token_accuracy": 0.9883043467998505, "step": 530 }, { "epoch": 0.856, "grad_norm": 0.046063318848609924, "learning_rate": 1.23908848685804e-05, "loss": 0.0284, "mean_token_accuracy": 0.9875360190868377, "step": 535 }, { "epoch": 0.864, "grad_norm": 0.0660807192325592, "learning_rate": 1.1077676465904208e-05, "loss": 0.0312, "mean_token_accuracy": 0.986429226398468, "step": 540 }, { "epoch": 0.872, "grad_norm": 0.059512872248888016, "learning_rate": 9.833930442041506e-06, "loss": 0.03, "mean_token_accuracy": 0.9860706746578216, "step": 545 }, { "epoch": 0.88, "grad_norm": 0.05064374580979347, "learning_rate": 8.660618359070604e-06, "loss": 0.0308, "mean_token_accuracy": 0.9860022485256195, "step": 550 }, { "epoch": 0.888, "grad_norm": 0.052208684384822845, "learning_rate": 7.558656759037797e-06, "loss": 0.0268, "mean_token_accuracy": 0.9876573204994201, "step": 555 }, { "epoch": 0.896, "grad_norm": 0.052323583513498306, "learning_rate": 6.528906447993288e-06, "loss": 0.0315, "mean_token_accuracy": 0.9857221782207489, "step": 560 }, { "epoch": 0.904, "grad_norm": 0.045587047934532166, "learning_rate": 5.572171823565797e-06, "loss": 0.0334, "mean_token_accuracy": 0.9845053553581238, "step": 565 }, { "epoch": 0.912, "grad_norm": 0.0517142117023468, "learning_rate": 4.689200246600867e-06, "loss": 0.0327, "mean_token_accuracy": 0.9847142338752747, "step": 570 }, { "epoch": 0.92, "grad_norm": 0.04421110823750496, "learning_rate": 3.880681457354118e-06, "loss": 0.028, "mean_token_accuracy": 0.9875537037849427, "step": 575 }, { "epoch": 0.928, "grad_norm": 0.040859125554561615, "learning_rate": 3.1472470366950334e-06, "loss": 0.0274, "mean_token_accuracy": 0.9872512996196747, "step": 580 }, { "epoch": 0.936, "grad_norm": 0.048171717673540115, "learning_rate": 2.4894699127426367e-06, "loss": 0.0287, "mean_token_accuracy": 0.9876733124256134, "step": 585 }, { "epoch": 0.944, "grad_norm": 0.04280434921383858, "learning_rate": 1.907863913318153e-06, "loss": 0.0283, "mean_token_accuracy": 0.987252289056778, "step": 590 }, { "epoch": 0.952, "grad_norm": 0.054707061499357224, "learning_rate": 1.4028833645643113e-06, "loss": 0.0324, "mean_token_accuracy": 0.9851353108882904, "step": 595 }, { "epoch": 0.96, "grad_norm": 0.056745924055576324, "learning_rate": 9.749227360448143e-07, "loss": 0.0279, "mean_token_accuracy": 0.9884349465370178, "step": 600 }, { "epoch": 0.968, "grad_norm": 0.04751133173704147, "learning_rate": 6.243163326014267e-07, "loss": 0.03, "mean_token_accuracy": 0.9863233804702759, "step": 605 }, { "epoch": 0.976, "grad_norm": 0.061580199748277664, "learning_rate": 3.5133803320896994e-07, "loss": 0.028, "mean_token_accuracy": 0.9869140684604645, "step": 610 }, { "epoch": 0.984, "grad_norm": 0.04928790032863617, "learning_rate": 1.562010770326916e-07, "loss": 0.036, "mean_token_accuracy": 0.9834135055541993, "step": 615 }, { "epoch": 0.992, "grad_norm": 0.057830002158880234, "learning_rate": 3.905789685471062e-08, "loss": 0.0306, "mean_token_accuracy": 0.9861779153347016, "step": 620 }, { "epoch": 1.0, "grad_norm": 0.0504627525806427, "learning_rate": 0.0, "loss": 0.0272, "mean_token_accuracy": 0.9880285739898682, "step": 625 }, { "epoch": 1.0, "step": 625, "total_flos": 8.527809495642931e+16, "train_loss": 0.04780370211601257, "train_runtime": 1247.5714, "train_samples_per_second": 4.008, "train_steps_per_second": 0.501 } ], "logging_steps": 5, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.527809495642931e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }