|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 671, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0014903129657228018, |
|
"grad_norm": 0.46681726349989877, |
|
"learning_rate": 4.411764705882353e-08, |
|
"loss": 1.7639, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.007451564828614009, |
|
"grad_norm": 0.41873451593648303, |
|
"learning_rate": 2.2058823529411765e-07, |
|
"loss": 1.7304, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.014903129657228018, |
|
"grad_norm": 0.4344144099098602, |
|
"learning_rate": 4.411764705882353e-07, |
|
"loss": 1.7792, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.022354694485842028, |
|
"grad_norm": 0.465770292151279, |
|
"learning_rate": 6.61764705882353e-07, |
|
"loss": 1.7689, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.029806259314456036, |
|
"grad_norm": 0.43483029835423426, |
|
"learning_rate": 8.823529411764706e-07, |
|
"loss": 1.7205, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.037257824143070044, |
|
"grad_norm": 0.44740719809064367, |
|
"learning_rate": 1.1029411764705884e-06, |
|
"loss": 1.7651, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.044709388971684055, |
|
"grad_norm": 0.43250694443997345, |
|
"learning_rate": 1.323529411764706e-06, |
|
"loss": 1.7926, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05216095380029806, |
|
"grad_norm": 0.43541824597905016, |
|
"learning_rate": 1.5441176470588234e-06, |
|
"loss": 1.7368, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.05961251862891207, |
|
"grad_norm": 0.44779413861684675, |
|
"learning_rate": 1.7647058823529412e-06, |
|
"loss": 1.7887, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.06706408345752608, |
|
"grad_norm": 0.46078766735704463, |
|
"learning_rate": 1.9852941176470586e-06, |
|
"loss": 1.6828, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.07451564828614009, |
|
"grad_norm": 0.43875939336780606, |
|
"learning_rate": 2.2058823529411767e-06, |
|
"loss": 1.731, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.08196721311475409, |
|
"grad_norm": 0.4229850780249163, |
|
"learning_rate": 2.4264705882352943e-06, |
|
"loss": 1.7005, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.08941877794336811, |
|
"grad_norm": 0.4045297221644775, |
|
"learning_rate": 2.647058823529412e-06, |
|
"loss": 1.7108, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.09687034277198212, |
|
"grad_norm": 0.4373623437488123, |
|
"learning_rate": 2.8676470588235296e-06, |
|
"loss": 1.7132, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.10432190760059612, |
|
"grad_norm": 0.37968682487624034, |
|
"learning_rate": 2.999918570372821e-06, |
|
"loss": 1.6389, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.11177347242921014, |
|
"grad_norm": 0.3852394407171853, |
|
"learning_rate": 2.9990025885979037e-06, |
|
"loss": 1.6099, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.11922503725782414, |
|
"grad_norm": 0.3557069482806777, |
|
"learning_rate": 2.997069461623824e-06, |
|
"loss": 1.6954, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.12667660208643816, |
|
"grad_norm": 0.3079289274590612, |
|
"learning_rate": 2.9941205011700118e-06, |
|
"loss": 1.6172, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.13412816691505217, |
|
"grad_norm": 0.29857210375623594, |
|
"learning_rate": 2.990157708247667e-06, |
|
"loss": 1.595, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.14157973174366617, |
|
"grad_norm": 0.2842045412377899, |
|
"learning_rate": 2.9851837718019762e-06, |
|
"loss": 1.6739, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.14903129657228018, |
|
"grad_norm": 0.24687715976280528, |
|
"learning_rate": 2.9792020668875367e-06, |
|
"loss": 1.5807, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.15648286140089418, |
|
"grad_norm": 0.2369983753260976, |
|
"learning_rate": 2.9722166523782167e-06, |
|
"loss": 1.5851, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.16393442622950818, |
|
"grad_norm": 0.20261276348197832, |
|
"learning_rate": 2.964232268213018e-06, |
|
"loss": 1.5384, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.17138599105812222, |
|
"grad_norm": 0.20698300200844078, |
|
"learning_rate": 2.955254332179797e-06, |
|
"loss": 1.5694, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.17883755588673622, |
|
"grad_norm": 0.2043135439690471, |
|
"learning_rate": 2.9452889362390366e-06, |
|
"loss": 1.6602, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.18628912071535023, |
|
"grad_norm": 0.17624295165768436, |
|
"learning_rate": 2.9343428423901614e-06, |
|
"loss": 1.5703, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.19374068554396423, |
|
"grad_norm": 0.15837032215765645, |
|
"learning_rate": 2.9224234780831905e-06, |
|
"loss": 1.5531, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.20119225037257824, |
|
"grad_norm": 0.1811372542204385, |
|
"learning_rate": 2.9095389311788626e-06, |
|
"loss": 1.577, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.20864381520119224, |
|
"grad_norm": 0.15759029786619647, |
|
"learning_rate": 2.8956979444606303e-06, |
|
"loss": 1.5101, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.21609538002980627, |
|
"grad_norm": 0.1520856154563207, |
|
"learning_rate": 2.8809099097022624e-06, |
|
"loss": 1.5547, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.22354694485842028, |
|
"grad_norm": 0.14966084871837593, |
|
"learning_rate": 2.8651848612950768e-06, |
|
"loss": 1.5801, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.23099850968703428, |
|
"grad_norm": 0.1378402679865988, |
|
"learning_rate": 2.848533469439122e-06, |
|
"loss": 1.4604, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.23845007451564829, |
|
"grad_norm": 0.1464647323237256, |
|
"learning_rate": 2.8309670329029358e-06, |
|
"loss": 1.4894, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2459016393442623, |
|
"grad_norm": 0.1437051133777023, |
|
"learning_rate": 2.8124974713567872e-06, |
|
"loss": 1.5269, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.2533532041728763, |
|
"grad_norm": 0.14789288282077315, |
|
"learning_rate": 2.79313731728461e-06, |
|
"loss": 1.5537, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2608047690014903, |
|
"grad_norm": 0.14308275673434606, |
|
"learning_rate": 2.772899707480108e-06, |
|
"loss": 1.4818, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.26825633383010433, |
|
"grad_norm": 0.13899835934264645, |
|
"learning_rate": 2.7517983741328146e-06, |
|
"loss": 1.4718, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.2757078986587183, |
|
"grad_norm": 0.13342232666402679, |
|
"learning_rate": 2.729847635510137e-06, |
|
"loss": 1.5693, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.28315946348733234, |
|
"grad_norm": 0.13133306157733085, |
|
"learning_rate": 2.70706238624173e-06, |
|
"loss": 1.5144, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2906110283159464, |
|
"grad_norm": 0.12346136386902845, |
|
"learning_rate": 2.6834580872127733e-06, |
|
"loss": 1.4706, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.29806259314456035, |
|
"grad_norm": 0.12962940757781438, |
|
"learning_rate": 2.6590507550730175e-06, |
|
"loss": 1.4595, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3055141579731744, |
|
"grad_norm": 0.1256519676373563, |
|
"learning_rate": 2.6338569513687182e-06, |
|
"loss": 1.4993, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.31296572280178836, |
|
"grad_norm": 0.12374610253004059, |
|
"learning_rate": 2.6078937713048357e-06, |
|
"loss": 1.459, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3204172876304024, |
|
"grad_norm": 0.13618026502855957, |
|
"learning_rate": 2.581178832145114e-06, |
|
"loss": 1.5273, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.32786885245901637, |
|
"grad_norm": 0.12673229090948254, |
|
"learning_rate": 2.553730261257924e-06, |
|
"loss": 1.4442, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3353204172876304, |
|
"grad_norm": 0.11648035880727567, |
|
"learning_rate": 2.525566683815973e-06, |
|
"loss": 1.4502, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.34277198211624443, |
|
"grad_norm": 0.12644477441341948, |
|
"learning_rate": 2.496707210158233e-06, |
|
"loss": 1.5173, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.3502235469448584, |
|
"grad_norm": 0.11586720234125189, |
|
"learning_rate": 2.4671714228226542e-06, |
|
"loss": 1.46, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.35767511177347244, |
|
"grad_norm": 0.11883054977090766, |
|
"learning_rate": 2.4369793632584796e-06, |
|
"loss": 1.4847, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.3651266766020864, |
|
"grad_norm": 0.13665955324099738, |
|
"learning_rate": 2.4061515182271535e-06, |
|
"loss": 1.5317, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.37257824143070045, |
|
"grad_norm": 0.11799056969645393, |
|
"learning_rate": 2.3747088059010745e-06, |
|
"loss": 1.4061, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.38002980625931443, |
|
"grad_norm": 0.12197294071650851, |
|
"learning_rate": 2.342672561669611e-06, |
|
"loss": 1.5242, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.38748137108792846, |
|
"grad_norm": 0.12328952083324084, |
|
"learning_rate": 2.3100645236620133e-06, |
|
"loss": 1.4472, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.3949329359165425, |
|
"grad_norm": 0.13371579203837833, |
|
"learning_rate": 2.276906817997054e-06, |
|
"loss": 1.4578, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.40238450074515647, |
|
"grad_norm": 0.11797510100880551, |
|
"learning_rate": 2.2432219437693897e-06, |
|
"loss": 1.4541, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.4098360655737705, |
|
"grad_norm": 0.11652500829878425, |
|
"learning_rate": 2.209032757782848e-06, |
|
"loss": 1.428, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.4172876304023845, |
|
"grad_norm": 0.11784740588346589, |
|
"learning_rate": 2.174362459040989e-06, |
|
"loss": 1.4765, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4247391952309985, |
|
"grad_norm": 0.13214434156180432, |
|
"learning_rate": 2.139234573005468e-06, |
|
"loss": 1.4823, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.43219076005961254, |
|
"grad_norm": 0.12325194724942345, |
|
"learning_rate": 2.1036729356328806e-06, |
|
"loss": 1.4587, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.4396423248882265, |
|
"grad_norm": 0.11758350672051872, |
|
"learning_rate": 2.06770167720092e-06, |
|
"loss": 1.427, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.44709388971684055, |
|
"grad_norm": 0.11607090772093101, |
|
"learning_rate": 2.0313452059348308e-06, |
|
"loss": 1.4912, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 0.11742494412844874, |
|
"learning_rate": 1.99462819144525e-06, |
|
"loss": 1.4829, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.46199701937406856, |
|
"grad_norm": 0.12666171818837882, |
|
"learning_rate": 1.957575547988697e-06, |
|
"loss": 1.4497, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.46944858420268254, |
|
"grad_norm": 0.125963894413993, |
|
"learning_rate": 1.9202124175620545e-06, |
|
"loss": 1.4255, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.47690014903129657, |
|
"grad_norm": 0.11914613772013677, |
|
"learning_rate": 1.8825641528425148e-06, |
|
"loss": 1.4425, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.4843517138599106, |
|
"grad_norm": 0.12972400981235876, |
|
"learning_rate": 1.8446562999845715e-06, |
|
"loss": 1.4829, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.4918032786885246, |
|
"grad_norm": 0.12709044180361684, |
|
"learning_rate": 1.8065145812857305e-06, |
|
"loss": 1.4178, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.4992548435171386, |
|
"grad_norm": 0.1202237982692701, |
|
"learning_rate": 1.7681648777326943e-06, |
|
"loss": 1.4876, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.5067064083457526, |
|
"grad_norm": 0.12048604340467019, |
|
"learning_rate": 1.7296332114398704e-06, |
|
"loss": 1.4076, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5141579731743666, |
|
"grad_norm": 0.1337777649679242, |
|
"learning_rate": 1.6909457279921186e-06, |
|
"loss": 1.4171, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.5216095380029806, |
|
"grad_norm": 0.12802906461759175, |
|
"learning_rate": 1.6521286787037178e-06, |
|
"loss": 1.4569, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5290611028315947, |
|
"grad_norm": 0.11319710800264886, |
|
"learning_rate": 1.613208402805586e-06, |
|
"loss": 1.4541, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.5365126676602087, |
|
"grad_norm": 0.117227595325786, |
|
"learning_rate": 1.5742113095728515e-06, |
|
"loss": 1.4812, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5439642324888226, |
|
"grad_norm": 0.12215641426475456, |
|
"learning_rate": 1.535163860404891e-06, |
|
"loss": 1.4964, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.5514157973174366, |
|
"grad_norm": 0.13083431124739964, |
|
"learning_rate": 1.4960925508699984e-06, |
|
"loss": 1.4217, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.5588673621460507, |
|
"grad_norm": 0.1409576348850734, |
|
"learning_rate": 1.4570238927268746e-06, |
|
"loss": 1.4805, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.5663189269746647, |
|
"grad_norm": 0.11615933961706335, |
|
"learning_rate": 1.4179843959351213e-06, |
|
"loss": 1.5039, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.5737704918032787, |
|
"grad_norm": 0.12739604211462074, |
|
"learning_rate": 1.3790005506669643e-06, |
|
"loss": 1.4761, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.5812220566318927, |
|
"grad_norm": 0.1164562209744072, |
|
"learning_rate": 1.340098809332401e-06, |
|
"loss": 1.4594, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.5886736214605067, |
|
"grad_norm": 0.1264012532912417, |
|
"learning_rate": 1.3013055686299683e-06, |
|
"loss": 1.4098, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.5961251862891207, |
|
"grad_norm": 0.1165323108273604, |
|
"learning_rate": 1.2626471516353158e-06, |
|
"loss": 1.4998, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6035767511177347, |
|
"grad_norm": 0.12051657476409453, |
|
"learning_rate": 1.22414978993974e-06, |
|
"loss": 1.3897, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.6110283159463488, |
|
"grad_norm": 0.1326292969168002, |
|
"learning_rate": 1.1858396058507837e-06, |
|
"loss": 1.543, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6184798807749627, |
|
"grad_norm": 0.12455825193543946, |
|
"learning_rate": 1.1477425946670016e-06, |
|
"loss": 1.4503, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.6259314456035767, |
|
"grad_norm": 0.12370626954874395, |
|
"learning_rate": 1.1098846070389027e-06, |
|
"loss": 1.4326, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6333830104321908, |
|
"grad_norm": 0.12765662301993327, |
|
"learning_rate": 1.0722913314280395e-06, |
|
"loss": 1.4533, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.6408345752608048, |
|
"grad_norm": 0.11436035436770554, |
|
"learning_rate": 1.0349882766761573e-06, |
|
"loss": 1.3693, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.6482861400894188, |
|
"grad_norm": 0.11842787699252887, |
|
"learning_rate": 9.980007546962206e-07, |
|
"loss": 1.4324, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.6557377049180327, |
|
"grad_norm": 0.11630059586095602, |
|
"learning_rate": 9.613538632970634e-07, |
|
"loss": 1.4374, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.6631892697466468, |
|
"grad_norm": 0.11529623096240363, |
|
"learning_rate": 9.250724691533223e-07, |
|
"loss": 1.4225, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.6706408345752608, |
|
"grad_norm": 0.11523744193256093, |
|
"learning_rate": 8.891811909322058e-07, |
|
"loss": 1.4843, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6780923994038748, |
|
"grad_norm": 0.13507595363725217, |
|
"learning_rate": 8.537043825885445e-07, |
|
"loss": 1.4891, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.6855439642324889, |
|
"grad_norm": 0.11541281167932681, |
|
"learning_rate": 8.186661168394658e-07, |
|
"loss": 1.4318, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.6929955290611028, |
|
"grad_norm": 0.12007185600365232, |
|
"learning_rate": 7.840901688299e-07, |
|
"loss": 1.4914, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.7004470938897168, |
|
"grad_norm": 0.10337470443027447, |
|
"learning_rate": 7.500000000000003e-07, |
|
"loss": 1.3677, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7078986587183308, |
|
"grad_norm": 0.1181215216758366, |
|
"learning_rate": 7.16418742165435e-07, |
|
"loss": 1.4494, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.7153502235469449, |
|
"grad_norm": 0.11631524814806833, |
|
"learning_rate": 6.83369181821336e-07, |
|
"loss": 1.4408, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7228017883755589, |
|
"grad_norm": 0.11883945369899837, |
|
"learning_rate": 6.508737446805704e-07, |
|
"loss": 1.4921, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.7302533532041728, |
|
"grad_norm": 0.12334235443369139, |
|
"learning_rate": 6.189544804568165e-07, |
|
"loss": 1.4098, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.7377049180327869, |
|
"grad_norm": 0.11865941207884222, |
|
"learning_rate": 5.876330479027766e-07, |
|
"loss": 1.436, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.7451564828614009, |
|
"grad_norm": 0.11288437598350734, |
|
"learning_rate": 5.56930700113673e-07, |
|
"loss": 1.349, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7526080476900149, |
|
"grad_norm": 0.12263443449722483, |
|
"learning_rate": 5.268682701060012e-07, |
|
"loss": 1.3842, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.7600596125186289, |
|
"grad_norm": 0.13248150696752248, |
|
"learning_rate": 4.974661566813315e-07, |
|
"loss": 1.4917, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.767511177347243, |
|
"grad_norm": 0.1235746701113258, |
|
"learning_rate": 4.6874431058474127e-07, |
|
"loss": 1.3985, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.7749627421758569, |
|
"grad_norm": 0.12435997349736658, |
|
"learning_rate": 4.4072222096727663e-07, |
|
"loss": 1.3998, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.7824143070044709, |
|
"grad_norm": 0.12146095464339451, |
|
"learning_rate": 4.1341890216162934e-07, |
|
"loss": 1.4523, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.789865871833085, |
|
"grad_norm": 0.11678985797029438, |
|
"learning_rate": 3.868528807799988e-07, |
|
"loss": 1.4505, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.797317436661699, |
|
"grad_norm": 0.1271289808154804, |
|
"learning_rate": 3.610421831428953e-07, |
|
"loss": 1.4415, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.8047690014903129, |
|
"grad_norm": 0.12203618843739349, |
|
"learning_rate": 3.36004323047419e-07, |
|
"loss": 1.4415, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.812220566318927, |
|
"grad_norm": 0.12970737402513247, |
|
"learning_rate": 3.11756289883306e-07, |
|
"loss": 1.4382, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.819672131147541, |
|
"grad_norm": 0.1290193224680504, |
|
"learning_rate": 2.883145371048133e-07, |
|
"loss": 1.4756, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.827123695976155, |
|
"grad_norm": 0.11595098181221819, |
|
"learning_rate": 2.656949710662591e-07, |
|
"loss": 1.4355, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.834575260804769, |
|
"grad_norm": 0.11852958967235772, |
|
"learning_rate": 2.4391294022879947e-07, |
|
"loss": 1.4338, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.842026825633383, |
|
"grad_norm": 0.12831441033578755, |
|
"learning_rate": 2.2298322474575838e-07, |
|
"loss": 1.4586, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.849478390461997, |
|
"grad_norm": 0.1226839578398384, |
|
"learning_rate": 2.0292002643358892e-07, |
|
"loss": 1.394, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.856929955290611, |
|
"grad_norm": 0.1321673633340756, |
|
"learning_rate": 1.8373695913525317e-07, |
|
"loss": 1.4389, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.8643815201192251, |
|
"grad_norm": 0.13356949132529727, |
|
"learning_rate": 1.6544703948258172e-07, |
|
"loss": 1.4139, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.8718330849478391, |
|
"grad_norm": 0.12214303306764548, |
|
"learning_rate": 1.4806267806386093e-07, |
|
"loss": 1.4682, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.879284649776453, |
|
"grad_norm": 0.1127695145084169, |
|
"learning_rate": 1.3159567100265506e-07, |
|
"loss": 1.3907, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.886736214605067, |
|
"grad_norm": 0.11667918336134676, |
|
"learning_rate": 1.1605719195356806e-07, |
|
"loss": 1.4541, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.8941877794336811, |
|
"grad_norm": 0.12072677981772588, |
|
"learning_rate": 1.0145778452038629e-07, |
|
"loss": 1.3466, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9016393442622951, |
|
"grad_norm": 0.1127152867709536, |
|
"learning_rate": 8.780735510173316e-08, |
|
"loss": 1.4243, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 0.12054633218635875, |
|
"learning_rate": 7.51151661691048e-08, |
|
"loss": 1.4678, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.9165424739195231, |
|
"grad_norm": 0.1296306157204084, |
|
"learning_rate": 6.338982998183856e-08, |
|
"loss": 1.4972, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.9239940387481371, |
|
"grad_norm": 0.12472731735613568, |
|
"learning_rate": 5.263930274328044e-08, |
|
"loss": 1.4058, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9314456035767511, |
|
"grad_norm": 0.10862305460842242, |
|
"learning_rate": 4.287087920212035e-08, |
|
"loss": 1.3878, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.9388971684053651, |
|
"grad_norm": 0.11400949590495348, |
|
"learning_rate": 3.4091187702554485e-08, |
|
"loss": 1.4539, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.9463487332339792, |
|
"grad_norm": 0.12503890566941747, |
|
"learning_rate": 2.630618568663584e-08, |
|
"loss": 1.4149, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.9538002980625931, |
|
"grad_norm": 0.11857672598071958, |
|
"learning_rate": 1.9521155651863854e-08, |
|
"loss": 1.4774, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.9612518628912071, |
|
"grad_norm": 0.12362094771502449, |
|
"learning_rate": 1.3740701566756276e-08, |
|
"loss": 1.4155, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.9687034277198212, |
|
"grad_norm": 0.12708375923965276, |
|
"learning_rate": 8.968745746835983e-09, |
|
"loss": 1.3642, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.9761549925484352, |
|
"grad_norm": 0.1196512207904661, |
|
"learning_rate": 5.208526193150764e-09, |
|
"loss": 1.4681, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.9836065573770492, |
|
"grad_norm": 0.12024283259629832, |
|
"learning_rate": 2.462594395134854e-09, |
|
"loss": 1.4664, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.9910581222056631, |
|
"grad_norm": 0.11574911429092992, |
|
"learning_rate": 7.328135993011631e-10, |
|
"loss": 1.4406, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.9985096870342772, |
|
"grad_norm": 0.11828232321325595, |
|
"learning_rate": 2.035754493812103e-11, |
|
"loss": 1.4711, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.4359323978424072, |
|
"eval_runtime": 564.1313, |
|
"eval_samples_per_second": 4.23, |
|
"eval_steps_per_second": 0.133, |
|
"step": 671 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 671, |
|
"total_flos": 2239826553733120.0, |
|
"train_loss": 1.4986209796544514, |
|
"train_runtime": 17014.7656, |
|
"train_samples_per_second": 1.262, |
|
"train_steps_per_second": 0.039 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 671, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2239826553733120.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|