|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.993979933110368, |
|
"eval_steps": 100, |
|
"global_step": 1119, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.013377926421404682, |
|
"grad_norm": 31.946590376675037, |
|
"learning_rate": 4.4642857142857147e-07, |
|
"loss": 1.2958, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.026755852842809364, |
|
"grad_norm": 6.55244388496526, |
|
"learning_rate": 8.928571428571429e-07, |
|
"loss": 1.2627, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04013377926421405, |
|
"grad_norm": 7.5989567440091585, |
|
"learning_rate": 1.3392857142857143e-06, |
|
"loss": 1.2874, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.05351170568561873, |
|
"grad_norm": 2.7362855062254225, |
|
"learning_rate": 1.7857142857142859e-06, |
|
"loss": 1.2337, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06688963210702341, |
|
"grad_norm": 2.1889352739395216, |
|
"learning_rate": 2.2321428571428573e-06, |
|
"loss": 1.2754, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0802675585284281, |
|
"grad_norm": 1.8597549946305532, |
|
"learning_rate": 2.6785714285714285e-06, |
|
"loss": 1.238, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09364548494983277, |
|
"grad_norm": 2.018444805401378, |
|
"learning_rate": 3.125e-06, |
|
"loss": 1.2122, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.10702341137123746, |
|
"grad_norm": 1.937412231850121, |
|
"learning_rate": 3.5714285714285718e-06, |
|
"loss": 1.1821, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.12040133779264214, |
|
"grad_norm": 2.2790127360584083, |
|
"learning_rate": 4.017857142857143e-06, |
|
"loss": 1.1503, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.13377926421404682, |
|
"grad_norm": 1.6229723458044985, |
|
"learning_rate": 4.464285714285715e-06, |
|
"loss": 1.1426, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.14715719063545152, |
|
"grad_norm": 1.9924287113006085, |
|
"learning_rate": 4.910714285714286e-06, |
|
"loss": 1.1012, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1605351170568562, |
|
"grad_norm": 1.832154946956557, |
|
"learning_rate": 5.357142857142857e-06, |
|
"loss": 1.1423, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.17391304347826086, |
|
"grad_norm": 2.2835904131577256, |
|
"learning_rate": 5.8035714285714295e-06, |
|
"loss": 1.1462, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.18729096989966554, |
|
"grad_norm": 1.7067435078913915, |
|
"learning_rate": 6.25e-06, |
|
"loss": 1.105, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.20066889632107024, |
|
"grad_norm": 1.8499351081387354, |
|
"learning_rate": 6.696428571428571e-06, |
|
"loss": 1.0921, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2140468227424749, |
|
"grad_norm": 2.3542380670481124, |
|
"learning_rate": 7.1428571428571436e-06, |
|
"loss": 1.109, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.22742474916387959, |
|
"grad_norm": 2.55387815202157, |
|
"learning_rate": 7.589285714285714e-06, |
|
"loss": 1.0625, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2408026755852843, |
|
"grad_norm": 1.392008189059501, |
|
"learning_rate": 8.035714285714286e-06, |
|
"loss": 1.0792, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.25418060200668896, |
|
"grad_norm": 1.4336889975083718, |
|
"learning_rate": 8.482142857142858e-06, |
|
"loss": 1.062, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.26755852842809363, |
|
"grad_norm": 1.414284998073309, |
|
"learning_rate": 8.92857142857143e-06, |
|
"loss": 1.042, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.26755852842809363, |
|
"eval_loss": 1.0254441499710083, |
|
"eval_runtime": 150.1041, |
|
"eval_samples_per_second": 35.395, |
|
"eval_steps_per_second": 1.113, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2809364548494983, |
|
"grad_norm": 1.517891168040724, |
|
"learning_rate": 9.375000000000001e-06, |
|
"loss": 1.0349, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.29431438127090304, |
|
"grad_norm": 1.6103336966412949, |
|
"learning_rate": 9.821428571428573e-06, |
|
"loss": 1.0631, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3076923076923077, |
|
"grad_norm": 1.3400760919486352, |
|
"learning_rate": 9.99978101208322e-06, |
|
"loss": 1.0234, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.3210702341137124, |
|
"grad_norm": 3.9922688605654546, |
|
"learning_rate": 9.99844282205785e-06, |
|
"loss": 1.0557, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.33444816053511706, |
|
"grad_norm": 1.4489282467883469, |
|
"learning_rate": 9.995888427170226e-06, |
|
"loss": 1.0582, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.34782608695652173, |
|
"grad_norm": 1.3370551994177822, |
|
"learning_rate": 9.992118448947408e-06, |
|
"loss": 1.0259, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3612040133779264, |
|
"grad_norm": 1.4251445984876532, |
|
"learning_rate": 9.987133804688247e-06, |
|
"loss": 1.0689, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.3745819397993311, |
|
"grad_norm": 1.2711652329134817, |
|
"learning_rate": 9.98093570724018e-06, |
|
"loss": 1.0185, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3879598662207358, |
|
"grad_norm": 1.144552453590232, |
|
"learning_rate": 9.973525664704137e-06, |
|
"loss": 1.0098, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.4013377926421405, |
|
"grad_norm": 1.2055734801642233, |
|
"learning_rate": 9.964905480067585e-06, |
|
"loss": 1.0266, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.41471571906354515, |
|
"grad_norm": 1.2157728777821908, |
|
"learning_rate": 9.955077250765833e-06, |
|
"loss": 1.0324, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.4280936454849498, |
|
"grad_norm": 1.2067546777875673, |
|
"learning_rate": 9.944043368171692e-06, |
|
"loss": 1.0481, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4414715719063545, |
|
"grad_norm": 2.6261182971339134, |
|
"learning_rate": 9.931806517013612e-06, |
|
"loss": 1.0721, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.45484949832775917, |
|
"grad_norm": 1.237982640721041, |
|
"learning_rate": 9.91836967472245e-06, |
|
"loss": 1.0181, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4682274247491639, |
|
"grad_norm": 1.188674540139363, |
|
"learning_rate": 9.903736110707001e-06, |
|
"loss": 1.0196, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.4816053511705686, |
|
"grad_norm": 1.1470412076788694, |
|
"learning_rate": 9.8879093855585e-06, |
|
"loss": 1.0617, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.49498327759197325, |
|
"grad_norm": 1.1834785845562295, |
|
"learning_rate": 9.870893350184274e-06, |
|
"loss": 1.0196, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.5083612040133779, |
|
"grad_norm": 1.3318612284290117, |
|
"learning_rate": 9.852692144870746e-06, |
|
"loss": 1.0577, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.5217391304347826, |
|
"grad_norm": 1.1976993059713574, |
|
"learning_rate": 9.833310198276037e-06, |
|
"loss": 1.036, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.5351170568561873, |
|
"grad_norm": 1.3579484538265107, |
|
"learning_rate": 9.812752226352405e-06, |
|
"loss": 0.9872, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5351170568561873, |
|
"eval_loss": 1.0063743591308594, |
|
"eval_runtime": 149.7108, |
|
"eval_samples_per_second": 35.488, |
|
"eval_steps_per_second": 1.115, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5484949832775919, |
|
"grad_norm": 1.2190388188014913, |
|
"learning_rate": 9.791023231198757e-06, |
|
"loss": 1.0283, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5618729096989966, |
|
"grad_norm": 1.022418475593247, |
|
"learning_rate": 9.768128499843579e-06, |
|
"loss": 1.0278, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5752508361204013, |
|
"grad_norm": 1.2003316921360592, |
|
"learning_rate": 9.744073602958493e-06, |
|
"loss": 1.0498, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5886287625418061, |
|
"grad_norm": 1.116688252576712, |
|
"learning_rate": 9.718864393502828e-06, |
|
"loss": 1.0187, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.6020066889632107, |
|
"grad_norm": 1.2888011234637655, |
|
"learning_rate": 9.692507005299499e-06, |
|
"loss": 1.0398, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.6153846153846154, |
|
"grad_norm": 1.1735950690203159, |
|
"learning_rate": 9.665007851542541e-06, |
|
"loss": 1.042, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.6287625418060201, |
|
"grad_norm": 1.0955355204061985, |
|
"learning_rate": 9.636373623236672e-06, |
|
"loss": 0.9689, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.6421404682274248, |
|
"grad_norm": 1.217898633941405, |
|
"learning_rate": 9.60661128756926e-06, |
|
"loss": 0.9335, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6555183946488294, |
|
"grad_norm": 1.2872565874857864, |
|
"learning_rate": 9.575728086215093e-06, |
|
"loss": 1.0172, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.6688963210702341, |
|
"grad_norm": 1.1483180612149577, |
|
"learning_rate": 9.543731533574349e-06, |
|
"loss": 0.9927, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6822742474916388, |
|
"grad_norm": 1.271466676847694, |
|
"learning_rate": 9.510629414944229e-06, |
|
"loss": 1.0104, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.6956521739130435, |
|
"grad_norm": 1.3591750189647636, |
|
"learning_rate": 9.47642978462465e-06, |
|
"loss": 1.0316, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.7090301003344481, |
|
"grad_norm": 1.253695733744314, |
|
"learning_rate": 9.441140963958515e-06, |
|
"loss": 1.0672, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.7224080267558528, |
|
"grad_norm": 1.2962413499952996, |
|
"learning_rate": 9.404771539306978e-06, |
|
"loss": 1.0328, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.7357859531772575, |
|
"grad_norm": 1.058056902161627, |
|
"learning_rate": 9.367330359960239e-06, |
|
"loss": 0.9912, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.7491638795986622, |
|
"grad_norm": 1.3613124209671206, |
|
"learning_rate": 9.328826535984374e-06, |
|
"loss": 1.014, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7625418060200669, |
|
"grad_norm": 0.9916292842180835, |
|
"learning_rate": 9.289269436004692e-06, |
|
"loss": 0.9737, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.7759197324414716, |
|
"grad_norm": 1.199932466102474, |
|
"learning_rate": 9.248668684926199e-06, |
|
"loss": 0.9971, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7892976588628763, |
|
"grad_norm": 1.2020222255226058, |
|
"learning_rate": 9.207034161591689e-06, |
|
"loss": 1.0176, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.802675585284281, |
|
"grad_norm": 0.9386029912207539, |
|
"learning_rate": 9.16437599637807e-06, |
|
"loss": 1.008, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.802675585284281, |
|
"eval_loss": 0.993434488773346, |
|
"eval_runtime": 150.5269, |
|
"eval_samples_per_second": 35.296, |
|
"eval_steps_per_second": 1.109, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8160535117056856, |
|
"grad_norm": 1.0269697222310779, |
|
"learning_rate": 9.120704568731455e-06, |
|
"loss": 1.0055, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.8294314381270903, |
|
"grad_norm": 1.1921850873424837, |
|
"learning_rate": 9.07603050464169e-06, |
|
"loss": 0.9808, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.842809364548495, |
|
"grad_norm": 1.4150961151993176, |
|
"learning_rate": 9.030364674056853e-06, |
|
"loss": 1.0094, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.8561872909698997, |
|
"grad_norm": 1.1824752842844202, |
|
"learning_rate": 8.983718188238428e-06, |
|
"loss": 0.9978, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 0.9474487837144705, |
|
"learning_rate": 8.936102397057737e-06, |
|
"loss": 0.9654, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.882943143812709, |
|
"grad_norm": 1.3116449529301724, |
|
"learning_rate": 8.88752888623434e-06, |
|
"loss": 0.9949, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8963210702341137, |
|
"grad_norm": 1.2357926812300235, |
|
"learning_rate": 8.838009474517022e-06, |
|
"loss": 1.0466, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.9096989966555183, |
|
"grad_norm": 1.21136665171992, |
|
"learning_rate": 8.787556210808101e-06, |
|
"loss": 1.0075, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.9230769230769231, |
|
"grad_norm": 1.1046034738865587, |
|
"learning_rate": 8.736181371231728e-06, |
|
"loss": 1.0351, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.9364548494983278, |
|
"grad_norm": 1.0160775318780402, |
|
"learning_rate": 8.683897456146897e-06, |
|
"loss": 1.0235, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.9498327759197325, |
|
"grad_norm": 1.1967451568920842, |
|
"learning_rate": 8.630717187105902e-06, |
|
"loss": 0.9868, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.9632107023411371, |
|
"grad_norm": 1.0723467089474863, |
|
"learning_rate": 8.576653503758964e-06, |
|
"loss": 1.0067, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9765886287625418, |
|
"grad_norm": 1.246858371531942, |
|
"learning_rate": 8.52171956070581e-06, |
|
"loss": 0.9743, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.9899665551839465, |
|
"grad_norm": 1.0275971223118703, |
|
"learning_rate": 8.465928724294923e-06, |
|
"loss": 0.96, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.0033444816053512, |
|
"grad_norm": 0.9382551654479478, |
|
"learning_rate": 8.409294569371293e-06, |
|
"loss": 0.9897, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.0167224080267558, |
|
"grad_norm": 1.162719556520577, |
|
"learning_rate": 8.351830875973436e-06, |
|
"loss": 0.7912, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.0301003344481605, |
|
"grad_norm": 1.2039432672032517, |
|
"learning_rate": 8.293551625980468e-06, |
|
"loss": 0.7677, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.0434782608695652, |
|
"grad_norm": 1.0218121146847778, |
|
"learning_rate": 8.234470999710086e-06, |
|
"loss": 0.7457, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.0568561872909699, |
|
"grad_norm": 1.2407699947992628, |
|
"learning_rate": 8.174603372468259e-06, |
|
"loss": 0.7489, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.0702341137123745, |
|
"grad_norm": 1.3082306318602033, |
|
"learning_rate": 8.113963311051474e-06, |
|
"loss": 0.7473, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0702341137123745, |
|
"eval_loss": 1.0105931758880615, |
|
"eval_runtime": 149.7077, |
|
"eval_samples_per_second": 35.489, |
|
"eval_steps_per_second": 1.116, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0836120401337792, |
|
"grad_norm": 1.056246079397668, |
|
"learning_rate": 8.052565570202394e-06, |
|
"loss": 0.78, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.0969899665551839, |
|
"grad_norm": 1.0671946804584624, |
|
"learning_rate": 7.990425089019774e-06, |
|
"loss": 0.7339, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.1103678929765886, |
|
"grad_norm": 1.288275985516129, |
|
"learning_rate": 7.927556987323534e-06, |
|
"loss": 0.775, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.1237458193979932, |
|
"grad_norm": 1.1249043744252742, |
|
"learning_rate": 7.86397656197586e-06, |
|
"loss": 0.7532, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.137123745819398, |
|
"grad_norm": 1.0933149663805157, |
|
"learning_rate": 7.799699283159199e-06, |
|
"loss": 0.792, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.1505016722408028, |
|
"grad_norm": 0.997956490849396, |
|
"learning_rate": 7.734740790612137e-06, |
|
"loss": 0.7748, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.1638795986622075, |
|
"grad_norm": 0.8427898442243963, |
|
"learning_rate": 7.669116889823955e-06, |
|
"loss": 0.7524, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.1772575250836121, |
|
"grad_norm": 1.181226726720212, |
|
"learning_rate": 7.602843548188915e-06, |
|
"loss": 0.7835, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.1906354515050168, |
|
"grad_norm": 1.2669045285704714, |
|
"learning_rate": 7.5359368911211115e-06, |
|
"loss": 0.7696, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.2040133779264215, |
|
"grad_norm": 0.9050504465278061, |
|
"learning_rate": 7.468413198130891e-06, |
|
"loss": 0.7652, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.2173913043478262, |
|
"grad_norm": 1.179211905798687, |
|
"learning_rate": 7.400288898863779e-06, |
|
"loss": 0.7223, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.2307692307692308, |
|
"grad_norm": 1.0441122444048954, |
|
"learning_rate": 7.3315805691028615e-06, |
|
"loss": 0.7756, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.2441471571906355, |
|
"grad_norm": 1.3279168455135049, |
|
"learning_rate": 7.262304926735633e-06, |
|
"loss": 0.8294, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.2575250836120402, |
|
"grad_norm": 0.959241708442744, |
|
"learning_rate": 7.192478827686242e-06, |
|
"loss": 0.7392, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.2709030100334449, |
|
"grad_norm": 1.1289301016174298, |
|
"learning_rate": 7.122119261814175e-06, |
|
"loss": 0.7595, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.2842809364548495, |
|
"grad_norm": 1.087208477220754, |
|
"learning_rate": 7.0512433487803245e-06, |
|
"loss": 0.7432, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.2976588628762542, |
|
"grad_norm": 1.0789795463274745, |
|
"learning_rate": 6.979868333881499e-06, |
|
"loss": 0.7664, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.3110367892976589, |
|
"grad_norm": 1.3380304539387544, |
|
"learning_rate": 6.908011583854353e-06, |
|
"loss": 0.7236, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.3244147157190636, |
|
"grad_norm": 1.0055939295359189, |
|
"learning_rate": 6.835690582649762e-06, |
|
"loss": 0.762, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.3377926421404682, |
|
"grad_norm": 1.5847854902361571, |
|
"learning_rate": 6.762922927178696e-06, |
|
"loss": 0.7788, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.3377926421404682, |
|
"eval_loss": 1.0046015977859497, |
|
"eval_runtime": 149.6682, |
|
"eval_samples_per_second": 35.499, |
|
"eval_steps_per_second": 1.116, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.351170568561873, |
|
"grad_norm": 1.1518010042974471, |
|
"learning_rate": 6.689726323030582e-06, |
|
"loss": 0.7847, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.3645484949832776, |
|
"grad_norm": 1.0172541085747402, |
|
"learning_rate": 6.6161185801652495e-06, |
|
"loss": 0.7676, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.3779264214046822, |
|
"grad_norm": 0.9656720368153764, |
|
"learning_rate": 6.5421176085794645e-06, |
|
"loss": 0.7217, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.391304347826087, |
|
"grad_norm": 0.9939195607529424, |
|
"learning_rate": 6.467741413949124e-06, |
|
"loss": 0.7707, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.4046822742474916, |
|
"grad_norm": 1.0058019218908654, |
|
"learning_rate": 6.39300809324818e-06, |
|
"loss": 0.7755, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.4180602006688963, |
|
"grad_norm": 1.030206779995328, |
|
"learning_rate": 6.3179358303453386e-06, |
|
"loss": 0.7635, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.431438127090301, |
|
"grad_norm": 1.023059488522412, |
|
"learning_rate": 6.242542891579619e-06, |
|
"loss": 0.7505, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.4448160535117056, |
|
"grad_norm": 1.021326179356742, |
|
"learning_rate": 6.1668476213158525e-06, |
|
"loss": 0.7029, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.4581939799331103, |
|
"grad_norm": 0.9174722380364886, |
|
"learning_rate": 6.090868437481185e-06, |
|
"loss": 0.7637, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.471571906354515, |
|
"grad_norm": 0.9740471001361392, |
|
"learning_rate": 6.0146238270836895e-06, |
|
"loss": 0.8025, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.4849498327759196, |
|
"grad_norm": 0.9534933933758712, |
|
"learning_rate": 5.938132341714173e-06, |
|
"loss": 0.7436, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.4983277591973243, |
|
"grad_norm": 1.091666803054282, |
|
"learning_rate": 5.861412593032247e-06, |
|
"loss": 0.7566, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.511705685618729, |
|
"grad_norm": 1.136903874541745, |
|
"learning_rate": 5.7844832482378245e-06, |
|
"loss": 0.7137, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.5250836120401337, |
|
"grad_norm": 1.0944264729530682, |
|
"learning_rate": 5.7073630255290515e-06, |
|
"loss": 0.7716, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 1.1213292333748575, |
|
"learning_rate": 5.630070689547875e-06, |
|
"loss": 0.7631, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.551839464882943, |
|
"grad_norm": 1.0512016732048837, |
|
"learning_rate": 5.552625046814283e-06, |
|
"loss": 0.7737, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.5652173913043477, |
|
"grad_norm": 0.9851716844060948, |
|
"learning_rate": 5.475044941150361e-06, |
|
"loss": 0.7399, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.5785953177257523, |
|
"grad_norm": 1.0580186713020379, |
|
"learning_rate": 5.397349249095279e-06, |
|
"loss": 0.7521, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.5919732441471572, |
|
"grad_norm": 1.0741202044360585, |
|
"learning_rate": 5.319556875312313e-06, |
|
"loss": 0.7779, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.605351170568562, |
|
"grad_norm": 1.2559630227035101, |
|
"learning_rate": 5.241686747989023e-06, |
|
"loss": 0.7246, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.605351170568562, |
|
"eval_loss": 1.00019371509552, |
|
"eval_runtime": 150.3942, |
|
"eval_samples_per_second": 35.327, |
|
"eval_steps_per_second": 1.11, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.6187290969899666, |
|
"grad_norm": 0.9665058472664697, |
|
"learning_rate": 5.163757814231708e-06, |
|
"loss": 0.7553, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.6321070234113713, |
|
"grad_norm": 1.0298616708170112, |
|
"learning_rate": 5.085789035455256e-06, |
|
"loss": 0.7372, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.645484949832776, |
|
"grad_norm": 1.1440770710719401, |
|
"learning_rate": 5.007799382769516e-06, |
|
"loss": 0.7422, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 1.6588628762541806, |
|
"grad_norm": 1.0233808186087545, |
|
"learning_rate": 4.929807832363308e-06, |
|
"loss": 0.7515, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.6722408026755853, |
|
"grad_norm": 1.178015294752791, |
|
"learning_rate": 4.8518333608872015e-06, |
|
"loss": 0.758, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.68561872909699, |
|
"grad_norm": 1.0131920406972836, |
|
"learning_rate": 4.773894940836174e-06, |
|
"loss": 0.749, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.6989966555183946, |
|
"grad_norm": 1.1643352193905245, |
|
"learning_rate": 4.69601153593329e-06, |
|
"loss": 0.7784, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 1.7123745819397993, |
|
"grad_norm": 1.2241373501480923, |
|
"learning_rate": 4.618202096515505e-06, |
|
"loss": 0.7553, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.725752508361204, |
|
"grad_norm": 1.0198134391959597, |
|
"learning_rate": 4.540485554922729e-06, |
|
"loss": 0.7655, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 1.7391304347826086, |
|
"grad_norm": 1.0507444811075115, |
|
"learning_rate": 4.462880820891284e-06, |
|
"loss": 0.7506, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.7525083612040135, |
|
"grad_norm": 1.0689187605834707, |
|
"learning_rate": 4.385406776952833e-06, |
|
"loss": 0.7681, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 1.7658862876254182, |
|
"grad_norm": 1.023967031835118, |
|
"learning_rate": 4.308082273839953e-06, |
|
"loss": 0.7859, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.779264214046823, |
|
"grad_norm": 2.7354167382822037, |
|
"learning_rate": 4.230926125899432e-06, |
|
"loss": 0.7677, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 1.7926421404682276, |
|
"grad_norm": 1.2533171109574939, |
|
"learning_rate": 4.153957106514428e-06, |
|
"loss": 0.7224, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.8060200668896322, |
|
"grad_norm": 1.1666059608649366, |
|
"learning_rate": 4.0771939435365795e-06, |
|
"loss": 0.7286, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.819397993311037, |
|
"grad_norm": 1.0827920835385718, |
|
"learning_rate": 4.000655314729222e-06, |
|
"loss": 0.7939, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.8327759197324416, |
|
"grad_norm": 1.05022651820567, |
|
"learning_rate": 3.924359843222758e-06, |
|
"loss": 0.7843, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 1.8461538461538463, |
|
"grad_norm": 1.093474005640841, |
|
"learning_rate": 3.848326092983356e-06, |
|
"loss": 0.732, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.859531772575251, |
|
"grad_norm": 1.0137348368542876, |
|
"learning_rate": 3.7725725642960047e-06, |
|
"loss": 0.7317, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 1.8729096989966556, |
|
"grad_norm": 1.0331644505317576, |
|
"learning_rate": 3.6971176892631048e-06, |
|
"loss": 0.7525, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.8729096989966556, |
|
"eval_loss": 0.9970734119415283, |
|
"eval_runtime": 149.7659, |
|
"eval_samples_per_second": 35.475, |
|
"eval_steps_per_second": 1.115, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.8862876254180603, |
|
"grad_norm": 1.0320209327878382, |
|
"learning_rate": 3.6219798273196147e-06, |
|
"loss": 0.7621, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 1.899665551839465, |
|
"grad_norm": 1.3023198090948238, |
|
"learning_rate": 3.547177260765904e-06, |
|
"loss": 0.7473, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.9130434782608696, |
|
"grad_norm": 1.0442173127014935, |
|
"learning_rate": 3.47272819031935e-06, |
|
"loss": 0.7279, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 1.9264214046822743, |
|
"grad_norm": 1.1980268226658213, |
|
"learning_rate": 3.398650730685813e-06, |
|
"loss": 0.7681, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.939799331103679, |
|
"grad_norm": 1.0904891997215738, |
|
"learning_rate": 3.3249629061520126e-06, |
|
"loss": 0.7551, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 1.9531772575250836, |
|
"grad_norm": 0.9747796170841941, |
|
"learning_rate": 3.2516826461999287e-06, |
|
"loss": 0.7567, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.9665551839464883, |
|
"grad_norm": 0.9766080320866084, |
|
"learning_rate": 3.1788277811442436e-06, |
|
"loss": 0.7621, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 1.979933110367893, |
|
"grad_norm": 0.9932850189281304, |
|
"learning_rate": 3.1064160377939457e-06, |
|
"loss": 0.7707, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.9933110367892977, |
|
"grad_norm": 1.4307674678624753, |
|
"learning_rate": 3.0344650351390797e-06, |
|
"loss": 0.7902, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 2.0066889632107023, |
|
"grad_norm": 1.0374602352029385, |
|
"learning_rate": 2.962992280063771e-06, |
|
"loss": 0.6413, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.020066889632107, |
|
"grad_norm": 1.0251714475763039, |
|
"learning_rate": 2.8920151630864906e-06, |
|
"loss": 0.5334, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 2.0334448160535117, |
|
"grad_norm": 1.0474079542961143, |
|
"learning_rate": 2.821550954128667e-06, |
|
"loss": 0.5353, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.0468227424749164, |
|
"grad_norm": 1.0781379123388604, |
|
"learning_rate": 2.7516167983126053e-06, |
|
"loss": 0.5371, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 2.060200668896321, |
|
"grad_norm": 0.9379537943202282, |
|
"learning_rate": 2.6822297117898144e-06, |
|
"loss": 0.535, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.0735785953177257, |
|
"grad_norm": 1.343473723166369, |
|
"learning_rate": 2.61340657760067e-06, |
|
"loss": 0.5595, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 2.0869565217391304, |
|
"grad_norm": 0.8716267946589176, |
|
"learning_rate": 2.5451641415665097e-06, |
|
"loss": 0.5129, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.100334448160535, |
|
"grad_norm": 1.083594982127184, |
|
"learning_rate": 2.4775190082150747e-06, |
|
"loss": 0.5391, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 2.1137123745819397, |
|
"grad_norm": 0.9685617426495453, |
|
"learning_rate": 2.4104876367403602e-06, |
|
"loss": 0.5462, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.1270903010033444, |
|
"grad_norm": 1.1798547706895686, |
|
"learning_rate": 2.344086336997819e-06, |
|
"loss": 0.5001, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 2.140468227424749, |
|
"grad_norm": 0.9742855432807195, |
|
"learning_rate": 2.278331265535898e-06, |
|
"loss": 0.529, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.140468227424749, |
|
"eval_loss": 1.0470257997512817, |
|
"eval_runtime": 149.7094, |
|
"eval_samples_per_second": 35.489, |
|
"eval_steps_per_second": 1.115, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.1538461538461537, |
|
"grad_norm": 0.976201736501433, |
|
"learning_rate": 2.2132384216648783e-06, |
|
"loss": 0.5773, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 2.1672240802675584, |
|
"grad_norm": 0.9423763960170988, |
|
"learning_rate": 2.148823643563973e-06, |
|
"loss": 0.5209, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.180602006688963, |
|
"grad_norm": 0.950162661211813, |
|
"learning_rate": 2.0851026044276405e-06, |
|
"loss": 0.5338, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 2.1939799331103678, |
|
"grad_norm": 1.004119449417975, |
|
"learning_rate": 2.022090808652024e-06, |
|
"loss": 0.5502, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.2073578595317724, |
|
"grad_norm": 1.147565589764388, |
|
"learning_rate": 1.9598035880624832e-06, |
|
"loss": 0.5352, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.220735785953177, |
|
"grad_norm": 0.9031033821480767, |
|
"learning_rate": 1.8982560981830911e-06, |
|
"loss": 0.5474, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.234113712374582, |
|
"grad_norm": 1.0009392979799976, |
|
"learning_rate": 1.8374633145490566e-06, |
|
"loss": 0.4991, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 2.2474916387959865, |
|
"grad_norm": 0.9582618373764454, |
|
"learning_rate": 1.7774400290629095e-06, |
|
"loss": 0.532, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.260869565217391, |
|
"grad_norm": 1.0009695871449187, |
|
"learning_rate": 1.7182008463953986e-06, |
|
"loss": 0.5582, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 2.274247491638796, |
|
"grad_norm": 0.9316164669727046, |
|
"learning_rate": 1.6597601804319186e-06, |
|
"loss": 0.508, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.2876254180602005, |
|
"grad_norm": 1.1628160696804308, |
|
"learning_rate": 1.602132250765378e-06, |
|
"loss": 0.5539, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 2.3010033444816056, |
|
"grad_norm": 0.9951585058908428, |
|
"learning_rate": 1.5453310792363275e-06, |
|
"loss": 0.545, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.3143812709030103, |
|
"grad_norm": 0.9609133074764106, |
|
"learning_rate": 1.489370486521219e-06, |
|
"loss": 0.5266, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 2.327759197324415, |
|
"grad_norm": 0.9987936738623786, |
|
"learning_rate": 1.4342640887695935e-06, |
|
"loss": 0.5175, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.3411371237458196, |
|
"grad_norm": 0.945306998480509, |
|
"learning_rate": 1.380025294291057e-06, |
|
"loss": 0.5132, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 2.3545150501672243, |
|
"grad_norm": 1.0723405116543199, |
|
"learning_rate": 1.3266673002927993e-06, |
|
"loss": 0.5381, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 2.367892976588629, |
|
"grad_norm": 1.0166089482172302, |
|
"learning_rate": 1.2742030896685075e-06, |
|
"loss": 0.5151, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 2.3812709030100336, |
|
"grad_norm": 1.110135707052159, |
|
"learning_rate": 1.2226454278393956e-06, |
|
"loss": 0.5314, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 2.3946488294314383, |
|
"grad_norm": 0.9125170499901653, |
|
"learning_rate": 1.1720068596481765e-06, |
|
"loss": 0.5083, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 2.408026755852843, |
|
"grad_norm": 0.8662645128241546, |
|
"learning_rate": 1.1222997063066855e-06, |
|
"loss": 0.5365, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.408026755852843, |
|
"eval_loss": 1.0517374277114868, |
|
"eval_runtime": 149.9889, |
|
"eval_samples_per_second": 35.423, |
|
"eval_steps_per_second": 1.113, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.4214046822742477, |
|
"grad_norm": 1.030613777960971, |
|
"learning_rate": 1.0735360623979201e-06, |
|
"loss": 0.5308, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 2.4347826086956523, |
|
"grad_norm": 1.0048676206192078, |
|
"learning_rate": 1.0257277929332332e-06, |
|
"loss": 0.5204, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 2.448160535117057, |
|
"grad_norm": 0.9609476114160428, |
|
"learning_rate": 9.788865304653727e-07, |
|
"loss": 0.5391, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 2.4615384615384617, |
|
"grad_norm": 0.9640556560352546, |
|
"learning_rate": 9.330236722580832e-07, |
|
"loss": 0.5106, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.4749163879598663, |
|
"grad_norm": 0.9899691141087767, |
|
"learning_rate": 8.881503775129696e-07, |
|
"loss": 0.5413, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 2.488294314381271, |
|
"grad_norm": 1.0571419947794767, |
|
"learning_rate": 8.442775646542656e-07, |
|
"loss": 0.5388, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.5016722408026757, |
|
"grad_norm": 0.9680901349582856, |
|
"learning_rate": 8.014159086722101e-07, |
|
"loss": 0.5242, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 2.5150501672240804, |
|
"grad_norm": 1.0704814253502815, |
|
"learning_rate": 7.595758385256325e-07, |
|
"loss": 0.5477, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.528428093645485, |
|
"grad_norm": 0.9880390697232635, |
|
"learning_rate": 7.187675346044165e-07, |
|
"loss": 0.5078, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 2.5418060200668897, |
|
"grad_norm": 0.9815512843065104, |
|
"learning_rate": 6.790009262524455e-07, |
|
"loss": 0.5414, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.5551839464882944, |
|
"grad_norm": 0.978897636667664, |
|
"learning_rate": 6.40285689351619e-07, |
|
"loss": 0.5165, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 2.568561872909699, |
|
"grad_norm": 1.0210817819839888, |
|
"learning_rate": 6.026312439675553e-07, |
|
"loss": 0.5129, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.5819397993311037, |
|
"grad_norm": 1.0090720244525628, |
|
"learning_rate": 5.66046752057523e-07, |
|
"loss": 0.503, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 2.5953177257525084, |
|
"grad_norm": 1.1825668438151073, |
|
"learning_rate": 5.305411152411943e-07, |
|
"loss": 0.5241, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.608695652173913, |
|
"grad_norm": 1.1589618219400188, |
|
"learning_rate": 4.961229726347233e-07, |
|
"loss": 0.5176, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 2.6220735785953178, |
|
"grad_norm": 1.1916794312531742, |
|
"learning_rate": 4.628006987487088e-07, |
|
"loss": 0.4961, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.6354515050167224, |
|
"grad_norm": 0.9842745180406584, |
|
"learning_rate": 4.3058240145053223e-07, |
|
"loss": 0.5436, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 2.648829431438127, |
|
"grad_norm": 0.9987938748419682, |
|
"learning_rate": 3.994759199915821e-07, |
|
"loss": 0.5004, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.6622073578595318, |
|
"grad_norm": 1.1090522864890544, |
|
"learning_rate": 3.694888230998289e-07, |
|
"loss": 0.5275, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 2.6755852842809364, |
|
"grad_norm": 0.9438148481041575, |
|
"learning_rate": 3.406284071382288e-07, |
|
"loss": 0.5256, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.6755852842809364, |
|
"eval_loss": 1.0514318943023682, |
|
"eval_runtime": 149.7744, |
|
"eval_samples_per_second": 35.473, |
|
"eval_steps_per_second": 1.115, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.688963210702341, |
|
"grad_norm": 1.0051570371927316, |
|
"learning_rate": 3.1290169432939556e-07, |
|
"loss": 0.5022, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 2.702341137123746, |
|
"grad_norm": 1.0655831780347378, |
|
"learning_rate": 2.863154310469768e-07, |
|
"loss": 0.5397, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.7157190635451505, |
|
"grad_norm": 0.9818283546277681, |
|
"learning_rate": 2.6087608617414715e-07, |
|
"loss": 0.5359, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 2.729096989966555, |
|
"grad_norm": 1.0318807241047394, |
|
"learning_rate": 2.365898495296226e-07, |
|
"loss": 0.5013, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.74247491638796, |
|
"grad_norm": 1.0460771356529457, |
|
"learning_rate": 2.1346263036157112e-07, |
|
"loss": 0.5468, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 2.7558528428093645, |
|
"grad_norm": 1.0837948922809177, |
|
"learning_rate": 1.9150005590979682e-07, |
|
"loss": 0.52, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.769230769230769, |
|
"grad_norm": 1.0353066956181127, |
|
"learning_rate": 1.707074700365341e-07, |
|
"loss": 0.4959, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 2.782608695652174, |
|
"grad_norm": 0.9986043986688646, |
|
"learning_rate": 1.5108993192620268e-07, |
|
"loss": 0.5014, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.7959866220735785, |
|
"grad_norm": 0.9948563482546479, |
|
"learning_rate": 1.326522148544157e-07, |
|
"loss": 0.5043, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 2.809364548494983, |
|
"grad_norm": 1.1025325621449409, |
|
"learning_rate": 1.153988050265692e-07, |
|
"loss": 0.5492, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.822742474916388, |
|
"grad_norm": 0.9369588676729006, |
|
"learning_rate": 9.933390048626645e-08, |
|
"loss": 0.5247, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 2.8361204013377925, |
|
"grad_norm": 0.9364006205818028, |
|
"learning_rate": 8.446141009386955e-08, |
|
"loss": 0.52, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.849498327759197, |
|
"grad_norm": 1.1077929391413845, |
|
"learning_rate": 7.078495257540341e-08, |
|
"loss": 0.5146, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 2.862876254180602, |
|
"grad_norm": 0.990932574232909, |
|
"learning_rate": 5.830785564206298e-08, |
|
"loss": 0.5674, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.8762541806020065, |
|
"grad_norm": 1.0941649208541484, |
|
"learning_rate": 4.7033155180522535e-08, |
|
"loss": 0.544, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 2.8896321070234112, |
|
"grad_norm": 0.8230619670119865, |
|
"learning_rate": 3.696359451425491e-08, |
|
"loss": 0.5008, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.903010033444816, |
|
"grad_norm": 0.9924164565453516, |
|
"learning_rate": 2.8101623736035422e-08, |
|
"loss": 0.5273, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 2.9163879598662206, |
|
"grad_norm": 1.000233223489882, |
|
"learning_rate": 2.0449399111791003e-08, |
|
"loss": 0.5461, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.9297658862876252, |
|
"grad_norm": 1.048907180597167, |
|
"learning_rate": 1.400878255594762e-08, |
|
"loss": 0.5118, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 2.94314381270903, |
|
"grad_norm": 1.0344098154390242, |
|
"learning_rate": 8.781341178393244e-09, |
|
"loss": 0.518, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.94314381270903, |
|
"eval_loss": 1.0516117811203003, |
|
"eval_runtime": 151.5953, |
|
"eval_samples_per_second": 35.047, |
|
"eval_steps_per_second": 1.102, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.9565217391304346, |
|
"grad_norm": 0.9510584400959354, |
|
"learning_rate": 4.76834690317396e-09, |
|
"loss": 0.5003, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 2.9698996655518393, |
|
"grad_norm": 0.93864893822128, |
|
"learning_rate": 1.9707761590148642e-09, |
|
"loss": 0.5345, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.983277591973244, |
|
"grad_norm": 0.9885959787002001, |
|
"learning_rate": 3.8930964173733434e-10, |
|
"loss": 0.5564, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 2.993979933110368, |
|
"step": 1119, |
|
"total_flos": 594922701717504.0, |
|
"train_loss": 0.7812928989383128, |
|
"train_runtime": 15592.0429, |
|
"train_samples_per_second": 9.2, |
|
"train_steps_per_second": 0.072 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1119, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 594922701717504.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|