|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 625, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0016, |
|
"grad_norm": 0.43310171365737915, |
|
"learning_rate": 3.1746031746031746e-06, |
|
"loss": 0.2682, |
|
"mean_token_accuracy": 0.9273857176303864, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 0.8490898609161377, |
|
"learning_rate": 1.5873015873015872e-05, |
|
"loss": 0.3388, |
|
"mean_token_accuracy": 0.9119045659899712, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 0.34499526023864746, |
|
"learning_rate": 3.1746031746031745e-05, |
|
"loss": 0.3481, |
|
"mean_token_accuracy": 0.9102882027626038, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.024, |
|
"grad_norm": 0.285628080368042, |
|
"learning_rate": 4.761904761904762e-05, |
|
"loss": 0.2976, |
|
"mean_token_accuracy": 0.9188290774822235, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 0.16985873878002167, |
|
"learning_rate": 6.349206349206349e-05, |
|
"loss": 0.257, |
|
"mean_token_accuracy": 0.923795086145401, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.16013243794441223, |
|
"learning_rate": 7.936507936507937e-05, |
|
"loss": 0.2077, |
|
"mean_token_accuracy": 0.9377835214138031, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 0.17801596224308014, |
|
"learning_rate": 9.523809523809524e-05, |
|
"loss": 0.2022, |
|
"mean_token_accuracy": 0.9342319130897522, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.056, |
|
"grad_norm": 0.12985391914844513, |
|
"learning_rate": 0.00011111111111111112, |
|
"loss": 0.1644, |
|
"mean_token_accuracy": 0.9495729327201843, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 0.16224823892116547, |
|
"learning_rate": 0.00012698412698412698, |
|
"loss": 0.1243, |
|
"mean_token_accuracy": 0.9592924535274505, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.072, |
|
"grad_norm": 0.12580786645412445, |
|
"learning_rate": 0.00014285714285714287, |
|
"loss": 0.101, |
|
"mean_token_accuracy": 0.9631378591060639, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.12046530097723007, |
|
"learning_rate": 0.00015873015873015873, |
|
"loss": 0.0813, |
|
"mean_token_accuracy": 0.9683941245079041, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.088, |
|
"grad_norm": 0.12417536973953247, |
|
"learning_rate": 0.00017460317460317462, |
|
"loss": 0.0586, |
|
"mean_token_accuracy": 0.9767989754676819, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 0.13074961304664612, |
|
"learning_rate": 0.00019047619047619048, |
|
"loss": 0.0637, |
|
"mean_token_accuracy": 0.9737584054470062, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.104, |
|
"grad_norm": 0.12429118901491165, |
|
"learning_rate": 0.00019999375039475277, |
|
"loss": 0.0537, |
|
"mean_token_accuracy": 0.9769091963768005, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 0.11161642521619797, |
|
"learning_rate": 0.0001999234513064475, |
|
"loss": 0.0486, |
|
"mean_token_accuracy": 0.9791503012180328, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.09909889847040176, |
|
"learning_rate": 0.00019977509622105233, |
|
"loss": 0.044, |
|
"mean_token_accuracy": 0.981050831079483, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 0.07484742999076843, |
|
"learning_rate": 0.0001995488010273198, |
|
"loss": 0.0431, |
|
"mean_token_accuracy": 0.9817534804344177, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.136, |
|
"grad_norm": 0.08371058106422424, |
|
"learning_rate": 0.00019924474249753655, |
|
"loss": 0.0441, |
|
"mean_token_accuracy": 0.9811403095722199, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 0.08213909715414047, |
|
"learning_rate": 0.00019886315814943647, |
|
"loss": 0.0385, |
|
"mean_token_accuracy": 0.9830973207950592, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.152, |
|
"grad_norm": 0.07752417027950287, |
|
"learning_rate": 0.0001984043460606618, |
|
"loss": 0.0442, |
|
"mean_token_accuracy": 0.9807018160820007, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.07620657980442047, |
|
"learning_rate": 0.0001978686646359173, |
|
"loss": 0.0392, |
|
"mean_token_accuracy": 0.982789009809494, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.168, |
|
"grad_norm": 0.083968386054039, |
|
"learning_rate": 0.0001972565323269996, |
|
"loss": 0.0373, |
|
"mean_token_accuracy": 0.9836896479129791, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 0.051836077123880386, |
|
"learning_rate": 0.00019656842730592046, |
|
"loss": 0.0407, |
|
"mean_token_accuracy": 0.9821825683116913, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.184, |
|
"grad_norm": 0.07549381256103516, |
|
"learning_rate": 0.0001958048870913786, |
|
"loss": 0.038, |
|
"mean_token_accuracy": 0.9829850435256958, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.05420655012130737, |
|
"learning_rate": 0.0001949665081288729, |
|
"loss": 0.0347, |
|
"mean_token_accuracy": 0.9851674497127533, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.06140986829996109, |
|
"learning_rate": 0.00019405394532478424, |
|
"loss": 0.033, |
|
"mean_token_accuracy": 0.9853869795799255, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 0.06693119555711746, |
|
"learning_rate": 0.00019306791153479006, |
|
"loss": 0.0404, |
|
"mean_token_accuracy": 0.9823243021965027, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.216, |
|
"grad_norm": 0.08230168372392654, |
|
"learning_rate": 0.00019200917700701176, |
|
"loss": 0.0369, |
|
"mean_token_accuracy": 0.9838479280471801, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 0.06229570508003235, |
|
"learning_rate": 0.0001908785687803289, |
|
"loss": 0.0373, |
|
"mean_token_accuracy": 0.9834259986877442, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.232, |
|
"grad_norm": 0.06680244952440262, |
|
"learning_rate": 0.00018967697003833157, |
|
"loss": 0.0383, |
|
"mean_token_accuracy": 0.9826522827148437, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.061888325959444046, |
|
"learning_rate": 0.0001884053194194142, |
|
"loss": 0.0329, |
|
"mean_token_accuracy": 0.9850789308547974, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.248, |
|
"grad_norm": 0.06167807802557945, |
|
"learning_rate": 0.00018706461028355104, |
|
"loss": 0.0349, |
|
"mean_token_accuracy": 0.9847366988658905, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.051647648215293884, |
|
"learning_rate": 0.00018565588993632487, |
|
"loss": 0.0435, |
|
"mean_token_accuracy": 0.9802697420120239, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.264, |
|
"grad_norm": 0.06429916620254517, |
|
"learning_rate": 0.0001841802588108161, |
|
"loss": 0.036, |
|
"mean_token_accuracy": 0.983528858423233, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 0.07308773696422577, |
|
"learning_rate": 0.00018263886960799062, |
|
"loss": 0.0365, |
|
"mean_token_accuracy": 0.983112233877182, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.05631113797426224, |
|
"learning_rate": 0.00018103292639625837, |
|
"loss": 0.0363, |
|
"mean_token_accuracy": 0.9835257828235626, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 0.07624773681163788, |
|
"learning_rate": 0.0001793636836709057, |
|
"loss": 0.0353, |
|
"mean_token_accuracy": 0.9840705871582032, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.296, |
|
"grad_norm": 0.05333884805440903, |
|
"learning_rate": 0.0001776324453741365, |
|
"loss": 0.0325, |
|
"mean_token_accuracy": 0.985572201013565, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 0.05988354608416557, |
|
"learning_rate": 0.00017584056387648727, |
|
"loss": 0.0382, |
|
"mean_token_accuracy": 0.982496690750122, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.312, |
|
"grad_norm": 0.05714619532227516, |
|
"learning_rate": 0.0001739894389204122, |
|
"loss": 0.0354, |
|
"mean_token_accuracy": 0.9839600920677185, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.05059760436415672, |
|
"learning_rate": 0.00017208051652686335, |
|
"loss": 0.0349, |
|
"mean_token_accuracy": 0.984496396780014, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.328, |
|
"grad_norm": 0.07071304321289062, |
|
"learning_rate": 0.00017011528786571969, |
|
"loss": 0.0305, |
|
"mean_token_accuracy": 0.9868167281150818, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.05378766357898712, |
|
"learning_rate": 0.00016809528809094807, |
|
"loss": 0.0353, |
|
"mean_token_accuracy": 0.9841848373413086, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.344, |
|
"grad_norm": 0.06888769567012787, |
|
"learning_rate": 0.0001660220951414055, |
|
"loss": 0.0331, |
|
"mean_token_accuracy": 0.9852972328662872, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 0.06159176304936409, |
|
"learning_rate": 0.00016389732850821966, |
|
"loss": 0.0328, |
|
"mean_token_accuracy": 0.9849840342998505, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.05890626460313797, |
|
"learning_rate": 0.0001617226479697105, |
|
"loss": 0.0318, |
|
"mean_token_accuracy": 0.9859302580356598, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 0.06155586987733841, |
|
"learning_rate": 0.00015949975229484134, |
|
"loss": 0.0367, |
|
"mean_token_accuracy": 0.9838493525981903, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.376, |
|
"grad_norm": 0.050045762211084366, |
|
"learning_rate": 0.00015723037791621193, |
|
"loss": 0.0325, |
|
"mean_token_accuracy": 0.9856610596179962, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.05855004861950874, |
|
"learning_rate": 0.00015491629757363032, |
|
"loss": 0.0312, |
|
"mean_token_accuracy": 0.9855502784252167, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.392, |
|
"grad_norm": 0.08005663007497787, |
|
"learning_rate": 0.00015255931892932333, |
|
"loss": 0.031, |
|
"mean_token_accuracy": 0.9869074642658233, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.061815045773983, |
|
"learning_rate": 0.0001501612831558664, |
|
"loss": 0.0366, |
|
"mean_token_accuracy": 0.983307272195816, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.408, |
|
"grad_norm": 0.0681883841753006, |
|
"learning_rate": 0.00014772406349793744, |
|
"loss": 0.0317, |
|
"mean_token_accuracy": 0.9858060896396637, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.056142307817935944, |
|
"learning_rate": 0.0001452495638090167, |
|
"loss": 0.0304, |
|
"mean_token_accuracy": 0.9861166894435882, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.424, |
|
"grad_norm": 0.0800497755408287, |
|
"learning_rate": 0.00014273971706417647, |
|
"loss": 0.0351, |
|
"mean_token_accuracy": 0.9837919533252716, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 0.05837633088231087, |
|
"learning_rate": 0.00014019648385012244, |
|
"loss": 0.0304, |
|
"mean_token_accuracy": 0.9853955626487731, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.058906685560941696, |
|
"learning_rate": 0.00013762185083366556, |
|
"loss": 0.0321, |
|
"mean_token_accuracy": 0.9853337109088898, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 0.06374027580022812, |
|
"learning_rate": 0.00013501782920982184, |
|
"loss": 0.0379, |
|
"mean_token_accuracy": 0.9825663805007935, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.456, |
|
"grad_norm": 0.062413912266492844, |
|
"learning_rate": 0.00013238645313075104, |
|
"loss": 0.0362, |
|
"mean_token_accuracy": 0.9839500784873962, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.04723189026117325, |
|
"learning_rate": 0.00012972977811676287, |
|
"loss": 0.0316, |
|
"mean_token_accuracy": 0.9853797852993011, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.472, |
|
"grad_norm": 0.05583701282739639, |
|
"learning_rate": 0.00012704987945063068, |
|
"loss": 0.0324, |
|
"mean_token_accuracy": 0.9851679742336273, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.053289640694856644, |
|
"learning_rate": 0.00012434885055646823, |
|
"loss": 0.0304, |
|
"mean_token_accuracy": 0.9868038833141327, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.488, |
|
"grad_norm": 0.0547807514667511, |
|
"learning_rate": 0.00012162880136443447, |
|
"loss": 0.0382, |
|
"mean_token_accuracy": 0.9832392990589142, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 0.062117431312799454, |
|
"learning_rate": 0.00011889185666254506, |
|
"loss": 0.0347, |
|
"mean_token_accuracy": 0.9848644256591796, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.504, |
|
"grad_norm": 0.05534973368048668, |
|
"learning_rate": 0.00011614015443687722, |
|
"loss": 0.032, |
|
"mean_token_accuracy": 0.9855094432830811, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 0.04713955521583557, |
|
"learning_rate": 0.0001133758442014651, |
|
"loss": 0.0296, |
|
"mean_token_accuracy": 0.9867917716503143, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.05600622668862343, |
|
"learning_rate": 0.00011060108531918971, |
|
"loss": 0.0342, |
|
"mean_token_accuracy": 0.9841290414333344, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 0.04774011671543121, |
|
"learning_rate": 0.0001078180453149754, |
|
"loss": 0.031, |
|
"mean_token_accuracy": 0.9855308055877685, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.536, |
|
"grad_norm": 0.05677397921681404, |
|
"learning_rate": 0.00010502889818261075, |
|
"loss": 0.0354, |
|
"mean_token_accuracy": 0.9833913028240204, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 0.08369103819131851, |
|
"learning_rate": 0.00010223582268651586, |
|
"loss": 0.0344, |
|
"mean_token_accuracy": 0.98387970328331, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.552, |
|
"grad_norm": 0.07704319059848785, |
|
"learning_rate": 9.94410006597835e-05, |
|
"loss": 0.0343, |
|
"mean_token_accuracy": 0.9841209590435028, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.0598573163151741, |
|
"learning_rate": 9.66466152998226e-05, |
|
"loss": 0.0279, |
|
"mean_token_accuracy": 0.9878278017044068, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.568, |
|
"grad_norm": 0.05838743597269058, |
|
"learning_rate": 9.385484946293637e-05, |
|
"loss": 0.0354, |
|
"mean_token_accuracy": 0.9834566116333008, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.0644030049443245, |
|
"learning_rate": 9.106788395916678e-05, |
|
"loss": 0.0329, |
|
"mean_token_accuracy": 0.985417640209198, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.584, |
|
"grad_norm": 0.05025125667452812, |
|
"learning_rate": 8.828789584873754e-05, |
|
"loss": 0.034, |
|
"mean_token_accuracy": 0.984911048412323, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 0.04645683616399765, |
|
"learning_rate": 8.551705674142617e-05, |
|
"loss": 0.0345, |
|
"mean_token_accuracy": 0.9842629969120026, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.06257304549217224, |
|
"learning_rate": 8.275753110019367e-05, |
|
"loss": 0.0307, |
|
"mean_token_accuracy": 0.9859090149402618, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.08189195394515991, |
|
"learning_rate": 8.001147455039735e-05, |
|
"loss": 0.0293, |
|
"mean_token_accuracy": 0.9876027524471283, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.616, |
|
"grad_norm": 0.05993218719959259, |
|
"learning_rate": 7.728103219590681e-05, |
|
"loss": 0.0289, |
|
"mean_token_accuracy": 0.9873842716217041, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 0.04940414056181908, |
|
"learning_rate": 7.456833694343906e-05, |
|
"loss": 0.0314, |
|
"mean_token_accuracy": 0.9858180701732635, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.632, |
|
"grad_norm": 0.06051292642951012, |
|
"learning_rate": 7.18755078364214e-05, |
|
"loss": 0.035, |
|
"mean_token_accuracy": 0.9843159735202789, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.07288350909948349, |
|
"learning_rate": 6.920464839968405e-05, |
|
"loss": 0.0289, |
|
"mean_token_accuracy": 0.9867165565490723, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.648, |
|
"grad_norm": 0.05235522985458374, |
|
"learning_rate": 6.65578449962749e-05, |
|
"loss": 0.0342, |
|
"mean_token_accuracy": 0.9844050407409668, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 0.04962315410375595, |
|
"learning_rate": 6.393716519768047e-05, |
|
"loss": 0.0318, |
|
"mean_token_accuracy": 0.9854838192462921, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.664, |
|
"grad_norm": 0.05337170138955116, |
|
"learning_rate": 6.134465616872598e-05, |
|
"loss": 0.029, |
|
"mean_token_accuracy": 0.9865143656730652, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.061498258262872696, |
|
"learning_rate": 5.878234306841637e-05, |
|
"loss": 0.0304, |
|
"mean_token_accuracy": 0.9859014391899109, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.05795282498002052, |
|
"learning_rate": 5.62522274679673e-05, |
|
"loss": 0.0311, |
|
"mean_token_accuracy": 0.9858847856521606, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 0.05434182286262512, |
|
"learning_rate": 5.375628578726181e-05, |
|
"loss": 0.0262, |
|
"mean_token_accuracy": 0.9887398540973663, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.696, |
|
"grad_norm": 0.0455637201666832, |
|
"learning_rate": 5.1296467750954314e-05, |
|
"loss": 0.0315, |
|
"mean_token_accuracy": 0.9861598789691925, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.05739019066095352, |
|
"learning_rate": 4.8874694865427676e-05, |
|
"loss": 0.0304, |
|
"mean_token_accuracy": 0.9865131139755249, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.712, |
|
"grad_norm": 0.07262022793292999, |
|
"learning_rate": 4.649285891779327e-05, |
|
"loss": 0.0277, |
|
"mean_token_accuracy": 0.9875944793224335, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.06861643493175507, |
|
"learning_rate": 4.415282049810644e-05, |
|
"loss": 0.0288, |
|
"mean_token_accuracy": 0.9869144856929779, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.728, |
|
"grad_norm": 0.057887740433216095, |
|
"learning_rate": 4.1856407545951834e-05, |
|
"loss": 0.0332, |
|
"mean_token_accuracy": 0.9847104012966156, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 0.07549154758453369, |
|
"learning_rate": 3.9605413922533874e-05, |
|
"loss": 0.0294, |
|
"mean_token_accuracy": 0.9868144273757935, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.744, |
|
"grad_norm": 0.058476611971855164, |
|
"learning_rate": 3.740159800938784e-05, |
|
"loss": 0.0246, |
|
"mean_token_accuracy": 0.9886395156383514, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 0.05933303013443947, |
|
"learning_rate": 3.5246681334806175e-05, |
|
"loss": 0.0263, |
|
"mean_token_accuracy": 0.9887538492679596, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.061903417110443115, |
|
"learning_rate": 3.3142347229053015e-05, |
|
"loss": 0.0333, |
|
"mean_token_accuracy": 0.9845255970954895, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.06767533719539642, |
|
"learning_rate": 3.109023950941736e-05, |
|
"loss": 0.0289, |
|
"mean_token_accuracy": 0.9869068741798401, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.776, |
|
"grad_norm": 0.05679545924067497, |
|
"learning_rate": 2.909196119613218e-05, |
|
"loss": 0.0313, |
|
"mean_token_accuracy": 0.9862040936946869, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 0.04378621280193329, |
|
"learning_rate": 2.7149073260162416e-05, |
|
"loss": 0.0311, |
|
"mean_token_accuracy": 0.9860045194625855, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.792, |
|
"grad_norm": 0.04507381096482277, |
|
"learning_rate": 2.5263093403840142e-05, |
|
"loss": 0.0324, |
|
"mean_token_accuracy": 0.985187429189682, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.04896757751703262, |
|
"learning_rate": 2.3435494875299314e-05, |
|
"loss": 0.0285, |
|
"mean_token_accuracy": 0.9870898187160492, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.808, |
|
"grad_norm": 0.06324876099824905, |
|
"learning_rate": 2.166770531763633e-05, |
|
"loss": 0.027, |
|
"mean_token_accuracy": 0.9879481792449951, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 0.04500701278448105, |
|
"learning_rate": 1.9961105653695266e-05, |
|
"loss": 0.0328, |
|
"mean_token_accuracy": 0.9849318027496338, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.824, |
|
"grad_norm": 0.05476031452417374, |
|
"learning_rate": 1.8317029007349085e-05, |
|
"loss": 0.0304, |
|
"mean_token_accuracy": 0.9857797741889953, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.04693017527461052, |
|
"learning_rate": 1.6736759662119183e-05, |
|
"loss": 0.0278, |
|
"mean_token_accuracy": 0.9873194813728332, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.07823780179023743, |
|
"learning_rate": 1.5221532057947419e-05, |
|
"loss": 0.0308, |
|
"mean_token_accuracy": 0.9862527191638947, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 0.04295206442475319, |
|
"learning_rate": 1.3772529826903269e-05, |
|
"loss": 0.0258, |
|
"mean_token_accuracy": 0.9883043467998505, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.856, |
|
"grad_norm": 0.046063318848609924, |
|
"learning_rate": 1.23908848685804e-05, |
|
"loss": 0.0284, |
|
"mean_token_accuracy": 0.9875360190868377, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 0.0660807192325592, |
|
"learning_rate": 1.1077676465904208e-05, |
|
"loss": 0.0312, |
|
"mean_token_accuracy": 0.986429226398468, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.872, |
|
"grad_norm": 0.059512872248888016, |
|
"learning_rate": 9.833930442041506e-06, |
|
"loss": 0.03, |
|
"mean_token_accuracy": 0.9860706746578216, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.05064374580979347, |
|
"learning_rate": 8.660618359070604e-06, |
|
"loss": 0.0308, |
|
"mean_token_accuracy": 0.9860022485256195, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.888, |
|
"grad_norm": 0.052208684384822845, |
|
"learning_rate": 7.558656759037797e-06, |
|
"loss": 0.0268, |
|
"mean_token_accuracy": 0.9876573204994201, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 0.052323583513498306, |
|
"learning_rate": 6.528906447993288e-06, |
|
"loss": 0.0315, |
|
"mean_token_accuracy": 0.9857221782207489, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.904, |
|
"grad_norm": 0.045587047934532166, |
|
"learning_rate": 5.572171823565797e-06, |
|
"loss": 0.0334, |
|
"mean_token_accuracy": 0.9845053553581238, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 0.0517142117023468, |
|
"learning_rate": 4.689200246600867e-06, |
|
"loss": 0.0327, |
|
"mean_token_accuracy": 0.9847142338752747, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.04421110823750496, |
|
"learning_rate": 3.880681457354118e-06, |
|
"loss": 0.028, |
|
"mean_token_accuracy": 0.9875537037849427, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.040859125554561615, |
|
"learning_rate": 3.1472470366950334e-06, |
|
"loss": 0.0274, |
|
"mean_token_accuracy": 0.9872512996196747, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.936, |
|
"grad_norm": 0.048171717673540115, |
|
"learning_rate": 2.4894699127426367e-06, |
|
"loss": 0.0287, |
|
"mean_token_accuracy": 0.9876733124256134, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 0.04280434921383858, |
|
"learning_rate": 1.907863913318153e-06, |
|
"loss": 0.0283, |
|
"mean_token_accuracy": 0.987252289056778, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.952, |
|
"grad_norm": 0.054707061499357224, |
|
"learning_rate": 1.4028833645643113e-06, |
|
"loss": 0.0324, |
|
"mean_token_accuracy": 0.9851353108882904, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.056745924055576324, |
|
"learning_rate": 9.749227360448143e-07, |
|
"loss": 0.0279, |
|
"mean_token_accuracy": 0.9884349465370178, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.968, |
|
"grad_norm": 0.04751133173704147, |
|
"learning_rate": 6.243163326014267e-07, |
|
"loss": 0.03, |
|
"mean_token_accuracy": 0.9863233804702759, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 0.061580199748277664, |
|
"learning_rate": 3.5133803320896994e-07, |
|
"loss": 0.028, |
|
"mean_token_accuracy": 0.9869140684604645, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.984, |
|
"grad_norm": 0.04928790032863617, |
|
"learning_rate": 1.562010770326916e-07, |
|
"loss": 0.036, |
|
"mean_token_accuracy": 0.9834135055541993, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 0.057830002158880234, |
|
"learning_rate": 3.905789685471062e-08, |
|
"loss": 0.0306, |
|
"mean_token_accuracy": 0.9861779153347016, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.0504627525806427, |
|
"learning_rate": 0.0, |
|
"loss": 0.0272, |
|
"mean_token_accuracy": 0.9880285739898682, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 625, |
|
"total_flos": 8.527809495642931e+16, |
|
"train_loss": 0.04780370211601257, |
|
"train_runtime": 1247.5714, |
|
"train_samples_per_second": 4.008, |
|
"train_steps_per_second": 0.501 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 625, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.527809495642931e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|