diff --git "a/checkpoint-1168/trainer_state.json" "b/checkpoint-1168/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1168/trainer_state.json" @@ -0,0 +1,8289 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997860047078965, + "eval_steps": 117, + "global_step": 1168, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008559811684142949, + "grad_norm": 4.09375, + "learning_rate": 2.5e-06, + "loss": 1.4683, + "step": 1 + }, + { + "epoch": 0.0008559811684142949, + "eval_loss": 1.6825672388076782, + "eval_runtime": 22.7989, + "eval_samples_per_second": 17.106, + "eval_steps_per_second": 17.106, + "step": 1 + }, + { + "epoch": 0.0017119623368285898, + "grad_norm": 2.90625, + "learning_rate": 5e-06, + "loss": 1.6305, + "step": 2 + }, + { + "epoch": 0.0025679435052428845, + "grad_norm": 3.0, + "learning_rate": 7.5e-06, + "loss": 1.6191, + "step": 3 + }, + { + "epoch": 0.0034239246736571796, + "grad_norm": 2.78125, + "learning_rate": 1e-05, + "loss": 1.6011, + "step": 4 + }, + { + "epoch": 0.004279905842071475, + "grad_norm": 2.671875, + "learning_rate": 1.25e-05, + "loss": 1.6021, + "step": 5 + }, + { + "epoch": 0.005135887010485769, + "grad_norm": 2.71875, + "learning_rate": 1.5e-05, + "loss": 1.4842, + "step": 6 + }, + { + "epoch": 0.005991868178900064, + "grad_norm": 2.3125, + "learning_rate": 1.75e-05, + "loss": 1.718, + "step": 7 + }, + { + "epoch": 0.006847849347314359, + "grad_norm": 2.328125, + "learning_rate": 2e-05, + "loss": 1.621, + "step": 8 + }, + { + "epoch": 0.007703830515728654, + "grad_norm": 2.203125, + "learning_rate": 2.25e-05, + "loss": 1.648, + "step": 9 + }, + { + "epoch": 0.00855981168414295, + "grad_norm": 2.078125, + "learning_rate": 2.5e-05, + "loss": 1.5684, + "step": 10 + }, + { + "epoch": 0.009415792852557245, + "grad_norm": 2.15625, + "learning_rate": 2.7500000000000004e-05, + "loss": 1.6588, + "step": 11 + }, + { + "epoch": 0.010271774020971538, + "grad_norm": 2.15625, + "learning_rate": 3e-05, + "loss": 1.5649, + "step": 12 + }, + { + "epoch": 0.011127755189385833, + "grad_norm": 2.21875, + "learning_rate": 3.2500000000000004e-05, + "loss": 1.5527, + "step": 13 + }, + { + "epoch": 0.011983736357800128, + "grad_norm": 2.0625, + "learning_rate": 3.5e-05, + "loss": 1.5464, + "step": 14 + }, + { + "epoch": 0.012839717526214423, + "grad_norm": 2.125, + "learning_rate": 3.7500000000000003e-05, + "loss": 1.7606, + "step": 15 + }, + { + "epoch": 0.013695698694628718, + "grad_norm": 2.21875, + "learning_rate": 4e-05, + "loss": 1.5089, + "step": 16 + }, + { + "epoch": 0.014551679863043013, + "grad_norm": 2.109375, + "learning_rate": 4.25e-05, + "loss": 1.5609, + "step": 17 + }, + { + "epoch": 0.015407661031457309, + "grad_norm": 2.265625, + "learning_rate": 4.5e-05, + "loss": 1.688, + "step": 18 + }, + { + "epoch": 0.016263642199871604, + "grad_norm": 2.140625, + "learning_rate": 4.75e-05, + "loss": 1.4745, + "step": 19 + }, + { + "epoch": 0.0171196233682859, + "grad_norm": 2.234375, + "learning_rate": 5e-05, + "loss": 1.6253, + "step": 20 + }, + { + "epoch": 0.017975604536700194, + "grad_norm": 2.34375, + "learning_rate": 4.999990638925292e-05, + "loss": 1.5921, + "step": 21 + }, + { + "epoch": 0.01883158570511449, + "grad_norm": 2.109375, + "learning_rate": 4.999962555771271e-05, + "loss": 1.443, + "step": 22 + }, + { + "epoch": 0.01968756687352878, + "grad_norm": 2.328125, + "learning_rate": 4.999915750748249e-05, + "loss": 1.609, + "step": 23 + }, + { + "epoch": 0.020543548041943076, + "grad_norm": 2.234375, + "learning_rate": 4.999850224206741e-05, + "loss": 1.6203, + "step": 24 + }, + { + "epoch": 0.02139952921035737, + "grad_norm": 2.1875, + "learning_rate": 4.999765976637467e-05, + "loss": 1.4801, + "step": 25 + }, + { + "epoch": 0.022255510378771666, + "grad_norm": 2.265625, + "learning_rate": 4.999663008671344e-05, + "loss": 1.6311, + "step": 26 + }, + { + "epoch": 0.02311149154718596, + "grad_norm": 2.0625, + "learning_rate": 4.9995413210794864e-05, + "loss": 1.586, + "step": 27 + }, + { + "epoch": 0.023967472715600256, + "grad_norm": 2.03125, + "learning_rate": 4.999400914773193e-05, + "loss": 1.5281, + "step": 28 + }, + { + "epoch": 0.02482345388401455, + "grad_norm": 2.0, + "learning_rate": 4.99924179080395e-05, + "loss": 1.589, + "step": 29 + }, + { + "epoch": 0.025679435052428846, + "grad_norm": 2.09375, + "learning_rate": 4.999063950363413e-05, + "loss": 1.6053, + "step": 30 + }, + { + "epoch": 0.02653541622084314, + "grad_norm": 2.046875, + "learning_rate": 4.998867394783404e-05, + "loss": 1.4413, + "step": 31 + }, + { + "epoch": 0.027391397389257437, + "grad_norm": 1.953125, + "learning_rate": 4.9986521255359004e-05, + "loss": 1.5999, + "step": 32 + }, + { + "epoch": 0.028247378557671732, + "grad_norm": 2.03125, + "learning_rate": 4.998418144233023e-05, + "loss": 1.6345, + "step": 33 + }, + { + "epoch": 0.029103359726086027, + "grad_norm": 2.3125, + "learning_rate": 4.998165452627025e-05, + "loss": 1.665, + "step": 34 + }, + { + "epoch": 0.029959340894500322, + "grad_norm": 2.0, + "learning_rate": 4.997894052610279e-05, + "loss": 1.5723, + "step": 35 + }, + { + "epoch": 0.030815322062914617, + "grad_norm": 2.0, + "learning_rate": 4.997603946215262e-05, + "loss": 1.4505, + "step": 36 + }, + { + "epoch": 0.03167130323132891, + "grad_norm": 2.015625, + "learning_rate": 4.997295135614539e-05, + "loss": 1.5724, + "step": 37 + }, + { + "epoch": 0.03252728439974321, + "grad_norm": 2.140625, + "learning_rate": 4.9969676231207494e-05, + "loss": 1.6605, + "step": 38 + }, + { + "epoch": 0.0333832655681575, + "grad_norm": 1.984375, + "learning_rate": 4.996621411186589e-05, + "loss": 1.5345, + "step": 39 + }, + { + "epoch": 0.0342392467365718, + "grad_norm": 1.9453125, + "learning_rate": 4.99625650240479e-05, + "loss": 1.6665, + "step": 40 + }, + { + "epoch": 0.03509522790498609, + "grad_norm": 2.0625, + "learning_rate": 4.995872899508103e-05, + "loss": 1.5707, + "step": 41 + }, + { + "epoch": 0.03595120907340039, + "grad_norm": 2.140625, + "learning_rate": 4.995470605369277e-05, + "loss": 1.652, + "step": 42 + }, + { + "epoch": 0.03680719024181468, + "grad_norm": 1.9453125, + "learning_rate": 4.995049623001036e-05, + "loss": 1.3974, + "step": 43 + }, + { + "epoch": 0.03766317141022898, + "grad_norm": 1.9296875, + "learning_rate": 4.9946099555560565e-05, + "loss": 1.613, + "step": 44 + }, + { + "epoch": 0.03851915257864327, + "grad_norm": 1.9921875, + "learning_rate": 4.994151606326949e-05, + "loss": 1.5067, + "step": 45 + }, + { + "epoch": 0.03937513374705756, + "grad_norm": 1.953125, + "learning_rate": 4.993674578746225e-05, + "loss": 1.5115, + "step": 46 + }, + { + "epoch": 0.040231114915471856, + "grad_norm": 2.234375, + "learning_rate": 4.993178876386278e-05, + "loss": 1.6309, + "step": 47 + }, + { + "epoch": 0.04108709608388615, + "grad_norm": 2.046875, + "learning_rate": 4.992664502959351e-05, + "loss": 1.6605, + "step": 48 + }, + { + "epoch": 0.04194307725230045, + "grad_norm": 2.0625, + "learning_rate": 4.9921314623175174e-05, + "loss": 1.5052, + "step": 49 + }, + { + "epoch": 0.04279905842071474, + "grad_norm": 2.0625, + "learning_rate": 4.991579758452644e-05, + "loss": 1.5388, + "step": 50 + }, + { + "epoch": 0.04365503958912904, + "grad_norm": 1.9140625, + "learning_rate": 4.99100939549636e-05, + "loss": 1.6761, + "step": 51 + }, + { + "epoch": 0.04451102075754333, + "grad_norm": 2.109375, + "learning_rate": 4.990420377720038e-05, + "loss": 1.6295, + "step": 52 + }, + { + "epoch": 0.04536700192595763, + "grad_norm": 1.953125, + "learning_rate": 4.9898127095347466e-05, + "loss": 1.5579, + "step": 53 + }, + { + "epoch": 0.04622298309437192, + "grad_norm": 1.8984375, + "learning_rate": 4.989186395491229e-05, + "loss": 1.4967, + "step": 54 + }, + { + "epoch": 0.04707896426278622, + "grad_norm": 1.921875, + "learning_rate": 4.9885414402798624e-05, + "loss": 1.4205, + "step": 55 + }, + { + "epoch": 0.04793494543120051, + "grad_norm": 2.03125, + "learning_rate": 4.987877848730627e-05, + "loss": 1.6522, + "step": 56 + }, + { + "epoch": 0.04879092659961481, + "grad_norm": 1.90625, + "learning_rate": 4.987195625813066e-05, + "loss": 1.5241, + "step": 57 + }, + { + "epoch": 0.0496469077680291, + "grad_norm": 2.09375, + "learning_rate": 4.9864947766362505e-05, + "loss": 1.548, + "step": 58 + }, + { + "epoch": 0.0505028889364434, + "grad_norm": 1.8984375, + "learning_rate": 4.985775306448743e-05, + "loss": 1.5058, + "step": 59 + }, + { + "epoch": 0.05135887010485769, + "grad_norm": 1.9765625, + "learning_rate": 4.985037220638555e-05, + "loss": 1.6028, + "step": 60 + }, + { + "epoch": 0.05221485127327199, + "grad_norm": 1.984375, + "learning_rate": 4.984280524733107e-05, + "loss": 1.5308, + "step": 61 + }, + { + "epoch": 0.05307083244168628, + "grad_norm": 2.0, + "learning_rate": 4.9835052243991874e-05, + "loss": 1.5042, + "step": 62 + }, + { + "epoch": 0.05392681361010058, + "grad_norm": 2.15625, + "learning_rate": 4.982711325442914e-05, + "loss": 1.5008, + "step": 63 + }, + { + "epoch": 0.05478279477851487, + "grad_norm": 2.109375, + "learning_rate": 4.981898833809683e-05, + "loss": 1.6986, + "step": 64 + }, + { + "epoch": 0.05563877594692917, + "grad_norm": 1.890625, + "learning_rate": 4.9810677555841314e-05, + "loss": 1.651, + "step": 65 + }, + { + "epoch": 0.056494757115343464, + "grad_norm": 1.9921875, + "learning_rate": 4.980218096990087e-05, + "loss": 1.5315, + "step": 66 + }, + { + "epoch": 0.05735073828375776, + "grad_norm": 1.84375, + "learning_rate": 4.9793498643905236e-05, + "loss": 1.5917, + "step": 67 + }, + { + "epoch": 0.058206719452172054, + "grad_norm": 1.9140625, + "learning_rate": 4.978463064287513e-05, + "loss": 1.5897, + "step": 68 + }, + { + "epoch": 0.05906270062058635, + "grad_norm": 2.453125, + "learning_rate": 4.977557703322178e-05, + "loss": 1.5924, + "step": 69 + }, + { + "epoch": 0.059918681789000644, + "grad_norm": 2.0625, + "learning_rate": 4.97663378827464e-05, + "loss": 1.5634, + "step": 70 + }, + { + "epoch": 0.06077466295741494, + "grad_norm": 2.171875, + "learning_rate": 4.9756913260639675e-05, + "loss": 1.5397, + "step": 71 + }, + { + "epoch": 0.061630644125829234, + "grad_norm": 1.90625, + "learning_rate": 4.974730323748129e-05, + "loss": 1.6735, + "step": 72 + }, + { + "epoch": 0.06248662529424353, + "grad_norm": 1.921875, + "learning_rate": 4.9737507885239366e-05, + "loss": 1.4538, + "step": 73 + }, + { + "epoch": 0.06334260646265782, + "grad_norm": 1.90625, + "learning_rate": 4.9727527277269915e-05, + "loss": 1.4092, + "step": 74 + }, + { + "epoch": 0.06419858763107211, + "grad_norm": 1.890625, + "learning_rate": 4.97173614883163e-05, + "loss": 1.5579, + "step": 75 + }, + { + "epoch": 0.06505456879948641, + "grad_norm": 1.8515625, + "learning_rate": 4.970701059450872e-05, + "loss": 1.6395, + "step": 76 + }, + { + "epoch": 0.0659105499679007, + "grad_norm": 1.8125, + "learning_rate": 4.9696474673363536e-05, + "loss": 1.4457, + "step": 77 + }, + { + "epoch": 0.066766531136315, + "grad_norm": 2.109375, + "learning_rate": 4.96857538037828e-05, + "loss": 1.5416, + "step": 78 + }, + { + "epoch": 0.0676225123047293, + "grad_norm": 1.921875, + "learning_rate": 4.9674848066053586e-05, + "loss": 1.4792, + "step": 79 + }, + { + "epoch": 0.0684784934731436, + "grad_norm": 1.9296875, + "learning_rate": 4.966375754184746e-05, + "loss": 1.467, + "step": 80 + }, + { + "epoch": 0.06933447464155788, + "grad_norm": 2.03125, + "learning_rate": 4.965248231421977e-05, + "loss": 1.6674, + "step": 81 + }, + { + "epoch": 0.07019045580997219, + "grad_norm": 2.015625, + "learning_rate": 4.964102246760914e-05, + "loss": 1.473, + "step": 82 + }, + { + "epoch": 0.07104643697838647, + "grad_norm": 2.03125, + "learning_rate": 4.962937808783675e-05, + "loss": 1.61, + "step": 83 + }, + { + "epoch": 0.07190241814680078, + "grad_norm": 1.875, + "learning_rate": 4.9617549262105724e-05, + "loss": 1.5847, + "step": 84 + }, + { + "epoch": 0.07275839931521506, + "grad_norm": 2.15625, + "learning_rate": 4.9605536079000476e-05, + "loss": 1.7443, + "step": 85 + }, + { + "epoch": 0.07361438048362937, + "grad_norm": 1.8359375, + "learning_rate": 4.9593338628486055e-05, + "loss": 1.5063, + "step": 86 + }, + { + "epoch": 0.07447036165204365, + "grad_norm": 1.9765625, + "learning_rate": 4.9580957001907445e-05, + "loss": 1.6636, + "step": 87 + }, + { + "epoch": 0.07532634282045796, + "grad_norm": 1.9921875, + "learning_rate": 4.9568391291988927e-05, + "loss": 1.6315, + "step": 88 + }, + { + "epoch": 0.07618232398887224, + "grad_norm": 1.8828125, + "learning_rate": 4.9555641592833334e-05, + "loss": 1.5544, + "step": 89 + }, + { + "epoch": 0.07703830515728655, + "grad_norm": 1.8203125, + "learning_rate": 4.954270799992138e-05, + "loss": 1.4513, + "step": 90 + }, + { + "epoch": 0.07789428632570083, + "grad_norm": 2.25, + "learning_rate": 4.9529590610110914e-05, + "loss": 1.5529, + "step": 91 + }, + { + "epoch": 0.07875026749411512, + "grad_norm": 1.796875, + "learning_rate": 4.9516289521636244e-05, + "loss": 1.3935, + "step": 92 + }, + { + "epoch": 0.07960624866252942, + "grad_norm": 1.9140625, + "learning_rate": 4.9502804834107354e-05, + "loss": 1.5309, + "step": 93 + }, + { + "epoch": 0.08046222983094371, + "grad_norm": 1.953125, + "learning_rate": 4.948913664850917e-05, + "loss": 1.5814, + "step": 94 + }, + { + "epoch": 0.08131821099935801, + "grad_norm": 1.8671875, + "learning_rate": 4.947528506720082e-05, + "loss": 1.5933, + "step": 95 + }, + { + "epoch": 0.0821741921677723, + "grad_norm": 1.8046875, + "learning_rate": 4.946125019391486e-05, + "loss": 1.4894, + "step": 96 + }, + { + "epoch": 0.0830301733361866, + "grad_norm": 1.828125, + "learning_rate": 4.944703213375648e-05, + "loss": 1.5702, + "step": 97 + }, + { + "epoch": 0.0838861545046009, + "grad_norm": 1.8828125, + "learning_rate": 4.943263099320275e-05, + "loss": 1.6595, + "step": 98 + }, + { + "epoch": 0.0847421356730152, + "grad_norm": 1.9375, + "learning_rate": 4.941804688010178e-05, + "loss": 1.5197, + "step": 99 + }, + { + "epoch": 0.08559811684142948, + "grad_norm": 2.015625, + "learning_rate": 4.940327990367195e-05, + "loss": 1.6567, + "step": 100 + }, + { + "epoch": 0.08645409800984379, + "grad_norm": 2.125, + "learning_rate": 4.938833017450108e-05, + "loss": 1.6511, + "step": 101 + }, + { + "epoch": 0.08731007917825807, + "grad_norm": 1.90625, + "learning_rate": 4.937319780454559e-05, + "loss": 1.6058, + "step": 102 + }, + { + "epoch": 0.08816606034667238, + "grad_norm": 1.984375, + "learning_rate": 4.9357882907129685e-05, + "loss": 1.5673, + "step": 103 + }, + { + "epoch": 0.08902204151508666, + "grad_norm": 1.96875, + "learning_rate": 4.934238559694448e-05, + "loss": 1.5504, + "step": 104 + }, + { + "epoch": 0.08987802268350097, + "grad_norm": 2.015625, + "learning_rate": 4.932670599004715e-05, + "loss": 1.5693, + "step": 105 + }, + { + "epoch": 0.09073400385191525, + "grad_norm": 2.484375, + "learning_rate": 4.9310844203860084e-05, + "loss": 1.5945, + "step": 106 + }, + { + "epoch": 0.09158998502032956, + "grad_norm": 1.96875, + "learning_rate": 4.929480035716997e-05, + "loss": 1.6466, + "step": 107 + }, + { + "epoch": 0.09244596618874384, + "grad_norm": 1.921875, + "learning_rate": 4.927857457012692e-05, + "loss": 1.6873, + "step": 108 + }, + { + "epoch": 0.09330194735715815, + "grad_norm": 1.9375, + "learning_rate": 4.9262166964243596e-05, + "loss": 1.7084, + "step": 109 + }, + { + "epoch": 0.09415792852557243, + "grad_norm": 1.7421875, + "learning_rate": 4.924557766239423e-05, + "loss": 1.4966, + "step": 110 + }, + { + "epoch": 0.09501390969398674, + "grad_norm": 1.8984375, + "learning_rate": 4.92288067888138e-05, + "loss": 1.5114, + "step": 111 + }, + { + "epoch": 0.09586989086240102, + "grad_norm": 2.046875, + "learning_rate": 4.921185446909702e-05, + "loss": 1.5532, + "step": 112 + }, + { + "epoch": 0.09672587203081533, + "grad_norm": 1.9609375, + "learning_rate": 4.919472083019743e-05, + "loss": 1.6787, + "step": 113 + }, + { + "epoch": 0.09758185319922962, + "grad_norm": 1.8359375, + "learning_rate": 4.917740600042645e-05, + "loss": 1.4609, + "step": 114 + }, + { + "epoch": 0.09843783436764392, + "grad_norm": 1.8203125, + "learning_rate": 4.915991010945241e-05, + "loss": 1.4925, + "step": 115 + }, + { + "epoch": 0.0992938155360582, + "grad_norm": 1.84375, + "learning_rate": 4.914223328829959e-05, + "loss": 1.5845, + "step": 116 + }, + { + "epoch": 0.10014979670447251, + "grad_norm": 2.09375, + "learning_rate": 4.912437566934723e-05, + "loss": 1.7777, + "step": 117 + }, + { + "epoch": 0.10014979670447251, + "eval_loss": 1.627388596534729, + "eval_runtime": 21.3696, + "eval_samples_per_second": 18.25, + "eval_steps_per_second": 18.25, + "step": 117 + }, + { + "epoch": 0.1010057778728868, + "grad_norm": 1.75, + "learning_rate": 4.9106337386328524e-05, + "loss": 1.6118, + "step": 118 + }, + { + "epoch": 0.1018617590413011, + "grad_norm": 1.859375, + "learning_rate": 4.908811857432965e-05, + "loss": 1.5514, + "step": 119 + }, + { + "epoch": 0.10271774020971539, + "grad_norm": 1.7734375, + "learning_rate": 4.9069719369788734e-05, + "loss": 1.5689, + "step": 120 + }, + { + "epoch": 0.10357372137812969, + "grad_norm": 1.90625, + "learning_rate": 4.905113991049484e-05, + "loss": 1.564, + "step": 121 + }, + { + "epoch": 0.10442970254654398, + "grad_norm": 1.96875, + "learning_rate": 4.903238033558692e-05, + "loss": 1.6917, + "step": 122 + }, + { + "epoch": 0.10528568371495826, + "grad_norm": 1.8046875, + "learning_rate": 4.901344078555282e-05, + "loss": 1.4474, + "step": 123 + }, + { + "epoch": 0.10614166488337257, + "grad_norm": 1.9921875, + "learning_rate": 4.899432140222816e-05, + "loss": 1.6063, + "step": 124 + }, + { + "epoch": 0.10699764605178685, + "grad_norm": 1.6328125, + "learning_rate": 4.8975022328795325e-05, + "loss": 1.5834, + "step": 125 + }, + { + "epoch": 0.10785362722020116, + "grad_norm": 1.921875, + "learning_rate": 4.895554370978238e-05, + "loss": 1.6613, + "step": 126 + }, + { + "epoch": 0.10870960838861544, + "grad_norm": 1.9140625, + "learning_rate": 4.893588569106195e-05, + "loss": 1.6858, + "step": 127 + }, + { + "epoch": 0.10956558955702975, + "grad_norm": 1.9609375, + "learning_rate": 4.89160484198502e-05, + "loss": 1.6384, + "step": 128 + }, + { + "epoch": 0.11042157072544403, + "grad_norm": 1.8671875, + "learning_rate": 4.8896032044705655e-05, + "loss": 1.5923, + "step": 129 + }, + { + "epoch": 0.11127755189385834, + "grad_norm": 2.0625, + "learning_rate": 4.887583671552816e-05, + "loss": 1.5658, + "step": 130 + }, + { + "epoch": 0.11213353306227263, + "grad_norm": 1.7109375, + "learning_rate": 4.885546258355769e-05, + "loss": 1.4684, + "step": 131 + }, + { + "epoch": 0.11298951423068693, + "grad_norm": 1.8203125, + "learning_rate": 4.8834909801373264e-05, + "loss": 1.5512, + "step": 132 + }, + { + "epoch": 0.11384549539910122, + "grad_norm": 1.7734375, + "learning_rate": 4.881417852289179e-05, + "loss": 1.5687, + "step": 133 + }, + { + "epoch": 0.11470147656751552, + "grad_norm": 2.109375, + "learning_rate": 4.8793268903366905e-05, + "loss": 1.6813, + "step": 134 + }, + { + "epoch": 0.1155574577359298, + "grad_norm": 1.75, + "learning_rate": 4.877218109938781e-05, + "loss": 1.4457, + "step": 135 + }, + { + "epoch": 0.11641343890434411, + "grad_norm": 1.828125, + "learning_rate": 4.875091526887813e-05, + "loss": 1.6283, + "step": 136 + }, + { + "epoch": 0.1172694200727584, + "grad_norm": 1.8203125, + "learning_rate": 4.872947157109467e-05, + "loss": 1.5411, + "step": 137 + }, + { + "epoch": 0.1181254012411727, + "grad_norm": 1.796875, + "learning_rate": 4.8707850166626266e-05, + "loss": 1.5107, + "step": 138 + }, + { + "epoch": 0.11898138240958699, + "grad_norm": 1.78125, + "learning_rate": 4.8686051217392606e-05, + "loss": 1.404, + "step": 139 + }, + { + "epoch": 0.11983736357800129, + "grad_norm": 2.15625, + "learning_rate": 4.866407488664296e-05, + "loss": 1.5754, + "step": 140 + }, + { + "epoch": 0.12069334474641558, + "grad_norm": 1.8984375, + "learning_rate": 4.864192133895498e-05, + "loss": 1.5735, + "step": 141 + }, + { + "epoch": 0.12154932591482988, + "grad_norm": 2.03125, + "learning_rate": 4.861959074023348e-05, + "loss": 1.5884, + "step": 142 + }, + { + "epoch": 0.12240530708324417, + "grad_norm": 1.9375, + "learning_rate": 4.8597083257709194e-05, + "loss": 1.5551, + "step": 143 + }, + { + "epoch": 0.12326128825165847, + "grad_norm": 1.8359375, + "learning_rate": 4.857439905993748e-05, + "loss": 1.4693, + "step": 144 + }, + { + "epoch": 0.12411726942007276, + "grad_norm": 1.859375, + "learning_rate": 4.855153831679713e-05, + "loss": 1.6085, + "step": 145 + }, + { + "epoch": 0.12497325058848706, + "grad_norm": 1.8203125, + "learning_rate": 4.852850119948904e-05, + "loss": 1.4771, + "step": 146 + }, + { + "epoch": 0.12582923175690136, + "grad_norm": 1.7578125, + "learning_rate": 4.850528788053495e-05, + "loss": 1.4144, + "step": 147 + }, + { + "epoch": 0.12668521292531565, + "grad_norm": 1.8203125, + "learning_rate": 4.848189853377615e-05, + "loss": 1.3908, + "step": 148 + }, + { + "epoch": 0.12754119409372994, + "grad_norm": 2.0, + "learning_rate": 4.8458333334372185e-05, + "loss": 1.6438, + "step": 149 + }, + { + "epoch": 0.12839717526214423, + "grad_norm": 1.7421875, + "learning_rate": 4.843459245879951e-05, + "loss": 1.5459, + "step": 150 + }, + { + "epoch": 0.12925315643055854, + "grad_norm": 1.78125, + "learning_rate": 4.841067608485024e-05, + "loss": 1.4941, + "step": 151 + }, + { + "epoch": 0.13010913759897283, + "grad_norm": 1.6953125, + "learning_rate": 4.8386584391630716e-05, + "loss": 1.3663, + "step": 152 + }, + { + "epoch": 0.13096511876738712, + "grad_norm": 1.7734375, + "learning_rate": 4.8362317559560274e-05, + "loss": 1.4986, + "step": 153 + }, + { + "epoch": 0.1318210999358014, + "grad_norm": 1.65625, + "learning_rate": 4.833787577036981e-05, + "loss": 1.4611, + "step": 154 + }, + { + "epoch": 0.1326770811042157, + "grad_norm": 1.9375, + "learning_rate": 4.831325920710045e-05, + "loss": 1.6472, + "step": 155 + }, + { + "epoch": 0.13353306227263, + "grad_norm": 1.7265625, + "learning_rate": 4.8288468054102186e-05, + "loss": 1.5, + "step": 156 + }, + { + "epoch": 0.1343890434410443, + "grad_norm": 1.8125, + "learning_rate": 4.8263502497032484e-05, + "loss": 1.4545, + "step": 157 + }, + { + "epoch": 0.1352450246094586, + "grad_norm": 1.703125, + "learning_rate": 4.823836272285491e-05, + "loss": 1.5297, + "step": 158 + }, + { + "epoch": 0.13610100577787287, + "grad_norm": 2.078125, + "learning_rate": 4.82130489198377e-05, + "loss": 1.5259, + "step": 159 + }, + { + "epoch": 0.1369569869462872, + "grad_norm": 1.8984375, + "learning_rate": 4.8187561277552374e-05, + "loss": 1.554, + "step": 160 + }, + { + "epoch": 0.13781296811470148, + "grad_norm": 1.8984375, + "learning_rate": 4.816189998687231e-05, + "loss": 1.6408, + "step": 161 + }, + { + "epoch": 0.13866894928311577, + "grad_norm": 1.96875, + "learning_rate": 4.813606523997132e-05, + "loss": 1.5234, + "step": 162 + }, + { + "epoch": 0.13952493045153005, + "grad_norm": 1.7890625, + "learning_rate": 4.811005723032219e-05, + "loss": 1.4525, + "step": 163 + }, + { + "epoch": 0.14038091161994437, + "grad_norm": 1.7578125, + "learning_rate": 4.808387615269528e-05, + "loss": 1.5951, + "step": 164 + }, + { + "epoch": 0.14123689278835866, + "grad_norm": 1.7421875, + "learning_rate": 4.805752220315699e-05, + "loss": 1.3059, + "step": 165 + }, + { + "epoch": 0.14209287395677295, + "grad_norm": 1.9140625, + "learning_rate": 4.8030995579068356e-05, + "loss": 1.5359, + "step": 166 + }, + { + "epoch": 0.14294885512518724, + "grad_norm": 1.734375, + "learning_rate": 4.800429647908354e-05, + "loss": 1.5332, + "step": 167 + }, + { + "epoch": 0.14380483629360155, + "grad_norm": 1.796875, + "learning_rate": 4.797742510314838e-05, + "loss": 1.5602, + "step": 168 + }, + { + "epoch": 0.14466081746201584, + "grad_norm": 1.7734375, + "learning_rate": 4.7950381652498816e-05, + "loss": 1.5634, + "step": 169 + }, + { + "epoch": 0.14551679863043013, + "grad_norm": 1.734375, + "learning_rate": 4.7923166329659466e-05, + "loss": 1.5805, + "step": 170 + }, + { + "epoch": 0.14637277979884442, + "grad_norm": 1.875, + "learning_rate": 4.7895779338442076e-05, + "loss": 1.5187, + "step": 171 + }, + { + "epoch": 0.14722876096725873, + "grad_norm": 1.8515625, + "learning_rate": 4.786822088394397e-05, + "loss": 1.664, + "step": 172 + }, + { + "epoch": 0.14808474213567302, + "grad_norm": 1.9765625, + "learning_rate": 4.784049117254656e-05, + "loss": 1.6186, + "step": 173 + }, + { + "epoch": 0.1489407233040873, + "grad_norm": 1.65625, + "learning_rate": 4.781259041191375e-05, + "loss": 1.4065, + "step": 174 + }, + { + "epoch": 0.1497967044725016, + "grad_norm": 1.8828125, + "learning_rate": 4.778451881099044e-05, + "loss": 1.652, + "step": 175 + }, + { + "epoch": 0.1506526856409159, + "grad_norm": 1.734375, + "learning_rate": 4.775627658000091e-05, + "loss": 1.4527, + "step": 176 + }, + { + "epoch": 0.1515086668093302, + "grad_norm": 1.8828125, + "learning_rate": 4.772786393044726e-05, + "loss": 1.4748, + "step": 177 + }, + { + "epoch": 0.1523646479777445, + "grad_norm": 1.9609375, + "learning_rate": 4.7699281075107835e-05, + "loss": 1.6003, + "step": 178 + }, + { + "epoch": 0.15322062914615878, + "grad_norm": 1.8359375, + "learning_rate": 4.767052822803565e-05, + "loss": 1.6305, + "step": 179 + }, + { + "epoch": 0.1540766103145731, + "grad_norm": 1.765625, + "learning_rate": 4.764160560455673e-05, + "loss": 1.3937, + "step": 180 + }, + { + "epoch": 0.15493259148298738, + "grad_norm": 1.8125, + "learning_rate": 4.7612513421268544e-05, + "loss": 1.4548, + "step": 181 + }, + { + "epoch": 0.15578857265140167, + "grad_norm": 1.8828125, + "learning_rate": 4.7583251896038386e-05, + "loss": 1.4323, + "step": 182 + }, + { + "epoch": 0.15664455381981596, + "grad_norm": 1.7890625, + "learning_rate": 4.7553821248001695e-05, + "loss": 1.4816, + "step": 183 + }, + { + "epoch": 0.15750053498823025, + "grad_norm": 1.75, + "learning_rate": 4.752422169756048e-05, + "loss": 1.4022, + "step": 184 + }, + { + "epoch": 0.15835651615664456, + "grad_norm": 1.7890625, + "learning_rate": 4.749445346638163e-05, + "loss": 1.5193, + "step": 185 + }, + { + "epoch": 0.15921249732505885, + "grad_norm": 1.9921875, + "learning_rate": 4.7464516777395234e-05, + "loss": 1.589, + "step": 186 + }, + { + "epoch": 0.16006847849347314, + "grad_norm": 1.703125, + "learning_rate": 4.743441185479297e-05, + "loss": 1.4739, + "step": 187 + }, + { + "epoch": 0.16092445966188743, + "grad_norm": 1.765625, + "learning_rate": 4.740413892402639e-05, + "loss": 1.4312, + "step": 188 + }, + { + "epoch": 0.16178044083030174, + "grad_norm": 1.984375, + "learning_rate": 4.7373698211805215e-05, + "loss": 1.677, + "step": 189 + }, + { + "epoch": 0.16263642199871603, + "grad_norm": 1.6953125, + "learning_rate": 4.7343089946095674e-05, + "loss": 1.6992, + "step": 190 + }, + { + "epoch": 0.16349240316713032, + "grad_norm": 1.8125, + "learning_rate": 4.7312314356118776e-05, + "loss": 1.5619, + "step": 191 + }, + { + "epoch": 0.1643483843355446, + "grad_norm": 1.671875, + "learning_rate": 4.7281371672348595e-05, + "loss": 1.6068, + "step": 192 + }, + { + "epoch": 0.16520436550395892, + "grad_norm": 1.78125, + "learning_rate": 4.725026212651056e-05, + "loss": 1.6795, + "step": 193 + }, + { + "epoch": 0.1660603466723732, + "grad_norm": 1.671875, + "learning_rate": 4.7218985951579685e-05, + "loss": 1.6281, + "step": 194 + }, + { + "epoch": 0.1669163278407875, + "grad_norm": 1.6171875, + "learning_rate": 4.7187543381778864e-05, + "loss": 1.4485, + "step": 195 + }, + { + "epoch": 0.1677723090092018, + "grad_norm": 1.875, + "learning_rate": 4.715593465257709e-05, + "loss": 1.5356, + "step": 196 + }, + { + "epoch": 0.1686282901776161, + "grad_norm": 2.0, + "learning_rate": 4.712416000068771e-05, + "loss": 1.6105, + "step": 197 + }, + { + "epoch": 0.1694842713460304, + "grad_norm": 2.171875, + "learning_rate": 4.7092219664066636e-05, + "loss": 1.7753, + "step": 198 + }, + { + "epoch": 0.17034025251444468, + "grad_norm": 1.8671875, + "learning_rate": 4.706011388191057e-05, + "loss": 1.6989, + "step": 199 + }, + { + "epoch": 0.17119623368285897, + "grad_norm": 1.625, + "learning_rate": 4.7027842894655205e-05, + "loss": 1.5058, + "step": 200 + }, + { + "epoch": 0.17205221485127328, + "grad_norm": 1.78125, + "learning_rate": 4.699540694397343e-05, + "loss": 1.6399, + "step": 201 + }, + { + "epoch": 0.17290819601968757, + "grad_norm": 1.828125, + "learning_rate": 4.6962806272773564e-05, + "loss": 1.491, + "step": 202 + }, + { + "epoch": 0.17376417718810186, + "grad_norm": 1.8984375, + "learning_rate": 4.693004112519743e-05, + "loss": 1.5155, + "step": 203 + }, + { + "epoch": 0.17462015835651615, + "grad_norm": 1.7421875, + "learning_rate": 4.689711174661864e-05, + "loss": 1.4796, + "step": 204 + }, + { + "epoch": 0.17547613952493046, + "grad_norm": 2.125, + "learning_rate": 4.686401838364068e-05, + "loss": 1.5699, + "step": 205 + }, + { + "epoch": 0.17633212069334475, + "grad_norm": 1.765625, + "learning_rate": 4.683076128409512e-05, + "loss": 1.5628, + "step": 206 + }, + { + "epoch": 0.17718810186175904, + "grad_norm": 1.703125, + "learning_rate": 4.6797340697039705e-05, + "loss": 1.5281, + "step": 207 + }, + { + "epoch": 0.17804408303017333, + "grad_norm": 1.6875, + "learning_rate": 4.6763756872756525e-05, + "loss": 1.5223, + "step": 208 + }, + { + "epoch": 0.17890006419858764, + "grad_norm": 1.6953125, + "learning_rate": 4.6730010062750134e-05, + "loss": 1.5561, + "step": 209 + }, + { + "epoch": 0.17975604536700193, + "grad_norm": 1.78125, + "learning_rate": 4.669610051974566e-05, + "loss": 1.3003, + "step": 210 + }, + { + "epoch": 0.18061202653541622, + "grad_norm": 1.859375, + "learning_rate": 4.6662028497686905e-05, + "loss": 1.5831, + "step": 211 + }, + { + "epoch": 0.1814680077038305, + "grad_norm": 1.9765625, + "learning_rate": 4.662779425173448e-05, + "loss": 1.4068, + "step": 212 + }, + { + "epoch": 0.18232398887224482, + "grad_norm": 1.7890625, + "learning_rate": 4.659339803826384e-05, + "loss": 1.2956, + "step": 213 + }, + { + "epoch": 0.1831799700406591, + "grad_norm": 1.734375, + "learning_rate": 4.655884011486341e-05, + "loss": 1.4742, + "step": 214 + }, + { + "epoch": 0.1840359512090734, + "grad_norm": 1.734375, + "learning_rate": 4.652412074033263e-05, + "loss": 1.4319, + "step": 215 + }, + { + "epoch": 0.1848919323774877, + "grad_norm": 1.765625, + "learning_rate": 4.648924017468003e-05, + "loss": 1.4521, + "step": 216 + }, + { + "epoch": 0.18574791354590198, + "grad_norm": 1.84375, + "learning_rate": 4.645419867912128e-05, + "loss": 1.5488, + "step": 217 + }, + { + "epoch": 0.1866038947143163, + "grad_norm": 1.8359375, + "learning_rate": 4.6418996516077205e-05, + "loss": 1.6545, + "step": 218 + }, + { + "epoch": 0.18745987588273058, + "grad_norm": 1.75, + "learning_rate": 4.6383633949171884e-05, + "loss": 1.5419, + "step": 219 + }, + { + "epoch": 0.18831585705114487, + "grad_norm": 1.640625, + "learning_rate": 4.634811124323062e-05, + "loss": 1.4832, + "step": 220 + }, + { + "epoch": 0.18917183821955916, + "grad_norm": 1.734375, + "learning_rate": 4.6312428664277976e-05, + "loss": 1.6318, + "step": 221 + }, + { + "epoch": 0.19002781938797347, + "grad_norm": 1.796875, + "learning_rate": 4.627658647953579e-05, + "loss": 1.4994, + "step": 222 + }, + { + "epoch": 0.19088380055638776, + "grad_norm": 1.8828125, + "learning_rate": 4.624058495742114e-05, + "loss": 1.5991, + "step": 223 + }, + { + "epoch": 0.19173978172480205, + "grad_norm": 1.6484375, + "learning_rate": 4.620442436754438e-05, + "loss": 1.4461, + "step": 224 + }, + { + "epoch": 0.19259576289321634, + "grad_norm": 1.71875, + "learning_rate": 4.6168104980707107e-05, + "loss": 1.5396, + "step": 225 + }, + { + "epoch": 0.19345174406163065, + "grad_norm": 1.71875, + "learning_rate": 4.613162706890011e-05, + "loss": 1.4974, + "step": 226 + }, + { + "epoch": 0.19430772523004494, + "grad_norm": 1.9140625, + "learning_rate": 4.609499090530136e-05, + "loss": 1.6796, + "step": 227 + }, + { + "epoch": 0.19516370639845923, + "grad_norm": 1.6953125, + "learning_rate": 4.605819676427393e-05, + "loss": 1.4685, + "step": 228 + }, + { + "epoch": 0.19601968756687352, + "grad_norm": 1.8671875, + "learning_rate": 4.602124492136401e-05, + "loss": 1.5252, + "step": 229 + }, + { + "epoch": 0.19687566873528783, + "grad_norm": 1.765625, + "learning_rate": 4.598413565329875e-05, + "loss": 1.5882, + "step": 230 + }, + { + "epoch": 0.19773164990370212, + "grad_norm": 1.8203125, + "learning_rate": 4.594686923798426e-05, + "loss": 1.5452, + "step": 231 + }, + { + "epoch": 0.1985876310721164, + "grad_norm": 2.03125, + "learning_rate": 4.5909445954503506e-05, + "loss": 1.5358, + "step": 232 + }, + { + "epoch": 0.1994436122405307, + "grad_norm": 1.8203125, + "learning_rate": 4.5871866083114204e-05, + "loss": 1.6252, + "step": 233 + }, + { + "epoch": 0.20029959340894501, + "grad_norm": 1.8203125, + "learning_rate": 4.5834129905246725e-05, + "loss": 1.4701, + "step": 234 + }, + { + "epoch": 0.20029959340894501, + "eval_loss": 1.6031174659729004, + "eval_runtime": 21.3555, + "eval_samples_per_second": 18.262, + "eval_steps_per_second": 18.262, + "step": 234 + }, + { + "epoch": 0.2011555745773593, + "grad_norm": 1.734375, + "learning_rate": 4.5796237703502044e-05, + "loss": 1.6016, + "step": 235 + }, + { + "epoch": 0.2020115557457736, + "grad_norm": 1.578125, + "learning_rate": 4.5758189761649514e-05, + "loss": 1.5205, + "step": 236 + }, + { + "epoch": 0.20286753691418788, + "grad_norm": 1.7109375, + "learning_rate": 4.5719986364624866e-05, + "loss": 1.4364, + "step": 237 + }, + { + "epoch": 0.2037235180826022, + "grad_norm": 2.171875, + "learning_rate": 4.5681627798527965e-05, + "loss": 1.254, + "step": 238 + }, + { + "epoch": 0.20457949925101648, + "grad_norm": 1.9296875, + "learning_rate": 4.564311435062074e-05, + "loss": 1.5015, + "step": 239 + }, + { + "epoch": 0.20543548041943077, + "grad_norm": 1.828125, + "learning_rate": 4.5604446309324986e-05, + "loss": 1.3402, + "step": 240 + }, + { + "epoch": 0.20629146158784506, + "grad_norm": 1.71875, + "learning_rate": 4.5565623964220266e-05, + "loss": 1.436, + "step": 241 + }, + { + "epoch": 0.20714744275625938, + "grad_norm": 1.7734375, + "learning_rate": 4.5526647606041666e-05, + "loss": 1.6074, + "step": 242 + }, + { + "epoch": 0.20800342392467366, + "grad_norm": 1.921875, + "learning_rate": 4.548751752667767e-05, + "loss": 1.5374, + "step": 243 + }, + { + "epoch": 0.20885940509308795, + "grad_norm": 1.828125, + "learning_rate": 4.5448234019167945e-05, + "loss": 1.4411, + "step": 244 + }, + { + "epoch": 0.20971538626150224, + "grad_norm": 1.6640625, + "learning_rate": 4.5408797377701176e-05, + "loss": 1.4943, + "step": 245 + }, + { + "epoch": 0.21057136742991653, + "grad_norm": 1.7890625, + "learning_rate": 4.5369207897612854e-05, + "loss": 1.567, + "step": 246 + }, + { + "epoch": 0.21142734859833084, + "grad_norm": 1.75, + "learning_rate": 4.532946587538302e-05, + "loss": 1.587, + "step": 247 + }, + { + "epoch": 0.21228332976674513, + "grad_norm": 1.6484375, + "learning_rate": 4.5289571608634116e-05, + "loss": 1.4585, + "step": 248 + }, + { + "epoch": 0.21313931093515942, + "grad_norm": 1.6484375, + "learning_rate": 4.524952539612872e-05, + "loss": 1.5406, + "step": 249 + }, + { + "epoch": 0.2139952921035737, + "grad_norm": 1.8671875, + "learning_rate": 4.5209327537767295e-05, + "loss": 1.4958, + "step": 250 + }, + { + "epoch": 0.21485127327198802, + "grad_norm": 2.015625, + "learning_rate": 4.5168978334585956e-05, + "loss": 1.6202, + "step": 251 + }, + { + "epoch": 0.2157072544404023, + "grad_norm": 1.7578125, + "learning_rate": 4.512847808875424e-05, + "loss": 1.5408, + "step": 252 + }, + { + "epoch": 0.2165632356088166, + "grad_norm": 1.953125, + "learning_rate": 4.5087827103572796e-05, + "loss": 1.6394, + "step": 253 + }, + { + "epoch": 0.2174192167772309, + "grad_norm": 1.6484375, + "learning_rate": 4.504702568347117e-05, + "loss": 1.3343, + "step": 254 + }, + { + "epoch": 0.2182751979456452, + "grad_norm": 1.7265625, + "learning_rate": 4.500607413400546e-05, + "loss": 1.5471, + "step": 255 + }, + { + "epoch": 0.2191311791140595, + "grad_norm": 1.7734375, + "learning_rate": 4.4964972761856084e-05, + "loss": 1.4912, + "step": 256 + }, + { + "epoch": 0.21998716028247378, + "grad_norm": 1.5703125, + "learning_rate": 4.492372187482545e-05, + "loss": 1.2951, + "step": 257 + }, + { + "epoch": 0.22084314145088807, + "grad_norm": 1.6484375, + "learning_rate": 4.488232178183567e-05, + "loss": 1.4651, + "step": 258 + }, + { + "epoch": 0.22169912261930239, + "grad_norm": 1.7109375, + "learning_rate": 4.484077279292622e-05, + "loss": 1.3435, + "step": 259 + }, + { + "epoch": 0.22255510378771667, + "grad_norm": 1.828125, + "learning_rate": 4.479907521925168e-05, + "loss": 1.5813, + "step": 260 + }, + { + "epoch": 0.22341108495613096, + "grad_norm": 1.875, + "learning_rate": 4.4757229373079306e-05, + "loss": 1.3951, + "step": 261 + }, + { + "epoch": 0.22426706612454525, + "grad_norm": 1.7890625, + "learning_rate": 4.471523556778679e-05, + "loss": 1.4809, + "step": 262 + }, + { + "epoch": 0.22512304729295957, + "grad_norm": 1.5390625, + "learning_rate": 4.467309411785984e-05, + "loss": 1.4175, + "step": 263 + }, + { + "epoch": 0.22597902846137385, + "grad_norm": 1.8828125, + "learning_rate": 4.4630805338889866e-05, + "loss": 1.587, + "step": 264 + }, + { + "epoch": 0.22683500962978814, + "grad_norm": 1.71875, + "learning_rate": 4.458836954757161e-05, + "loss": 1.3758, + "step": 265 + }, + { + "epoch": 0.22769099079820243, + "grad_norm": 1.859375, + "learning_rate": 4.454578706170075e-05, + "loss": 1.4746, + "step": 266 + }, + { + "epoch": 0.22854697196661675, + "grad_norm": 1.703125, + "learning_rate": 4.450305820017156e-05, + "loss": 1.5459, + "step": 267 + }, + { + "epoch": 0.22940295313503103, + "grad_norm": 1.671875, + "learning_rate": 4.446018328297449e-05, + "loss": 1.361, + "step": 268 + }, + { + "epoch": 0.23025893430344532, + "grad_norm": 1.875, + "learning_rate": 4.441716263119379e-05, + "loss": 1.6767, + "step": 269 + }, + { + "epoch": 0.2311149154718596, + "grad_norm": 1.78125, + "learning_rate": 4.437399656700507e-05, + "loss": 1.4742, + "step": 270 + }, + { + "epoch": 0.23197089664027393, + "grad_norm": 1.6875, + "learning_rate": 4.433068541367295e-05, + "loss": 1.5136, + "step": 271 + }, + { + "epoch": 0.23282687780868822, + "grad_norm": 1.65625, + "learning_rate": 4.428722949554857e-05, + "loss": 1.4492, + "step": 272 + }, + { + "epoch": 0.2336828589771025, + "grad_norm": 1.6484375, + "learning_rate": 4.424362913806722e-05, + "loss": 1.4585, + "step": 273 + }, + { + "epoch": 0.2345388401455168, + "grad_norm": 1.578125, + "learning_rate": 4.419988466774586e-05, + "loss": 1.3074, + "step": 274 + }, + { + "epoch": 0.23539482131393108, + "grad_norm": 1.5390625, + "learning_rate": 4.415599641218068e-05, + "loss": 1.2787, + "step": 275 + }, + { + "epoch": 0.2362508024823454, + "grad_norm": 1.515625, + "learning_rate": 4.4111964700044686e-05, + "loss": 1.489, + "step": 276 + }, + { + "epoch": 0.23710678365075968, + "grad_norm": 1.578125, + "learning_rate": 4.4067789861085185e-05, + "loss": 1.4373, + "step": 277 + }, + { + "epoch": 0.23796276481917397, + "grad_norm": 1.703125, + "learning_rate": 4.402347222612137e-05, + "loss": 1.4773, + "step": 278 + }, + { + "epoch": 0.23881874598758826, + "grad_norm": 2.0, + "learning_rate": 4.397901212704176e-05, + "loss": 1.4799, + "step": 279 + }, + { + "epoch": 0.23967472715600258, + "grad_norm": 1.7421875, + "learning_rate": 4.393440989680184e-05, + "loss": 1.4964, + "step": 280 + }, + { + "epoch": 0.24053070832441686, + "grad_norm": 1.5859375, + "learning_rate": 4.3889665869421436e-05, + "loss": 1.3405, + "step": 281 + }, + { + "epoch": 0.24138668949283115, + "grad_norm": 1.5703125, + "learning_rate": 4.3844780379982296e-05, + "loss": 1.4144, + "step": 282 + }, + { + "epoch": 0.24224267066124544, + "grad_norm": 1.8359375, + "learning_rate": 4.3799753764625564e-05, + "loss": 1.4202, + "step": 283 + }, + { + "epoch": 0.24309865182965976, + "grad_norm": 1.7578125, + "learning_rate": 4.375458636054924e-05, + "loss": 1.6295, + "step": 284 + }, + { + "epoch": 0.24395463299807404, + "grad_norm": 1.84375, + "learning_rate": 4.370927850600569e-05, + "loss": 1.5213, + "step": 285 + }, + { + "epoch": 0.24481061416648833, + "grad_norm": 1.8125, + "learning_rate": 4.366383054029906e-05, + "loss": 1.4423, + "step": 286 + }, + { + "epoch": 0.24566659533490262, + "grad_norm": 1.7265625, + "learning_rate": 4.3618242803782825e-05, + "loss": 1.6341, + "step": 287 + }, + { + "epoch": 0.24652257650331694, + "grad_norm": 1.7890625, + "learning_rate": 4.357251563785713e-05, + "loss": 1.5936, + "step": 288 + }, + { + "epoch": 0.24737855767173123, + "grad_norm": 1.640625, + "learning_rate": 4.352664938496631e-05, + "loss": 1.5026, + "step": 289 + }, + { + "epoch": 0.2482345388401455, + "grad_norm": 1.765625, + "learning_rate": 4.348064438859629e-05, + "loss": 1.6062, + "step": 290 + }, + { + "epoch": 0.2490905200085598, + "grad_norm": 1.734375, + "learning_rate": 4.3434500993272066e-05, + "loss": 1.5012, + "step": 291 + }, + { + "epoch": 0.24994650117697412, + "grad_norm": 1.6328125, + "learning_rate": 4.338821954455503e-05, + "loss": 1.5942, + "step": 292 + }, + { + "epoch": 0.2508024823453884, + "grad_norm": 1.578125, + "learning_rate": 4.334180038904046e-05, + "loss": 1.5837, + "step": 293 + }, + { + "epoch": 0.2516584635138027, + "grad_norm": 1.9921875, + "learning_rate": 4.3295243874354926e-05, + "loss": 1.6746, + "step": 294 + }, + { + "epoch": 0.252514444682217, + "grad_norm": 1.7265625, + "learning_rate": 4.3248550349153616e-05, + "loss": 1.467, + "step": 295 + }, + { + "epoch": 0.2533704258506313, + "grad_norm": 1.671875, + "learning_rate": 4.3201720163117795e-05, + "loss": 1.497, + "step": 296 + }, + { + "epoch": 0.2542264070190456, + "grad_norm": 1.5390625, + "learning_rate": 4.315475366695217e-05, + "loss": 1.2926, + "step": 297 + }, + { + "epoch": 0.2550823881874599, + "grad_norm": 1.859375, + "learning_rate": 4.3107651212382236e-05, + "loss": 1.6157, + "step": 298 + }, + { + "epoch": 0.25593836935587416, + "grad_norm": 1.8359375, + "learning_rate": 4.306041315215167e-05, + "loss": 1.538, + "step": 299 + }, + { + "epoch": 0.25679435052428845, + "grad_norm": 1.6796875, + "learning_rate": 4.301303984001967e-05, + "loss": 1.4402, + "step": 300 + }, + { + "epoch": 0.25765033169270274, + "grad_norm": 1.859375, + "learning_rate": 4.296553163075836e-05, + "loss": 1.6127, + "step": 301 + }, + { + "epoch": 0.2585063128611171, + "grad_norm": 1.7890625, + "learning_rate": 4.291788888015002e-05, + "loss": 1.5769, + "step": 302 + }, + { + "epoch": 0.25936229402953137, + "grad_norm": 1.953125, + "learning_rate": 4.287011194498456e-05, + "loss": 1.2251, + "step": 303 + }, + { + "epoch": 0.26021827519794566, + "grad_norm": 1.5078125, + "learning_rate": 4.282220118305672e-05, + "loss": 1.4914, + "step": 304 + }, + { + "epoch": 0.26107425636635995, + "grad_norm": 1.640625, + "learning_rate": 4.277415695316349e-05, + "loss": 1.5531, + "step": 305 + }, + { + "epoch": 0.26193023753477424, + "grad_norm": 1.640625, + "learning_rate": 4.272597961510137e-05, + "loss": 1.3468, + "step": 306 + }, + { + "epoch": 0.2627862187031885, + "grad_norm": 1.7265625, + "learning_rate": 4.267766952966369e-05, + "loss": 1.5566, + "step": 307 + }, + { + "epoch": 0.2636421998716028, + "grad_norm": 1.7734375, + "learning_rate": 4.2629227058637904e-05, + "loss": 1.4052, + "step": 308 + }, + { + "epoch": 0.2644981810400171, + "grad_norm": 1.8046875, + "learning_rate": 4.258065256480288e-05, + "loss": 1.4669, + "step": 309 + }, + { + "epoch": 0.2653541622084314, + "grad_norm": 1.578125, + "learning_rate": 4.253194641192621e-05, + "loss": 1.3902, + "step": 310 + }, + { + "epoch": 0.26621014337684573, + "grad_norm": 1.8515625, + "learning_rate": 4.24831089647614e-05, + "loss": 1.4913, + "step": 311 + }, + { + "epoch": 0.26706612454526, + "grad_norm": 1.765625, + "learning_rate": 4.243414058904528e-05, + "loss": 1.4332, + "step": 312 + }, + { + "epoch": 0.2679221057136743, + "grad_norm": 1.53125, + "learning_rate": 4.238504165149515e-05, + "loss": 1.3904, + "step": 313 + }, + { + "epoch": 0.2687780868820886, + "grad_norm": 1.90625, + "learning_rate": 4.233581251980604e-05, + "loss": 1.5778, + "step": 314 + }, + { + "epoch": 0.2696340680505029, + "grad_norm": 1.765625, + "learning_rate": 4.2286453562648046e-05, + "loss": 1.6316, + "step": 315 + }, + { + "epoch": 0.2704900492189172, + "grad_norm": 1.8125, + "learning_rate": 4.223696514966346e-05, + "loss": 1.5792, + "step": 316 + }, + { + "epoch": 0.27134603038733146, + "grad_norm": 1.578125, + "learning_rate": 4.2187347651464055e-05, + "loss": 1.4227, + "step": 317 + }, + { + "epoch": 0.27220201155574575, + "grad_norm": 1.703125, + "learning_rate": 4.213760143962834e-05, + "loss": 1.3087, + "step": 318 + }, + { + "epoch": 0.2730579927241601, + "grad_norm": 1.734375, + "learning_rate": 4.20877268866987e-05, + "loss": 1.5096, + "step": 319 + }, + { + "epoch": 0.2739139738925744, + "grad_norm": 1.640625, + "learning_rate": 4.203772436617868e-05, + "loss": 1.3995, + "step": 320 + }, + { + "epoch": 0.27476995506098867, + "grad_norm": 1.875, + "learning_rate": 4.198759425253014e-05, + "loss": 1.6112, + "step": 321 + }, + { + "epoch": 0.27562593622940296, + "grad_norm": 1.6796875, + "learning_rate": 4.1937336921170476e-05, + "loss": 1.6356, + "step": 322 + }, + { + "epoch": 0.27648191739781725, + "grad_norm": 1.5078125, + "learning_rate": 4.188695274846979e-05, + "loss": 1.3759, + "step": 323 + }, + { + "epoch": 0.27733789856623153, + "grad_norm": 1.7734375, + "learning_rate": 4.183644211174809e-05, + "loss": 1.4551, + "step": 324 + }, + { + "epoch": 0.2781938797346458, + "grad_norm": 1.5859375, + "learning_rate": 4.1785805389272445e-05, + "loss": 1.4036, + "step": 325 + }, + { + "epoch": 0.2790498609030601, + "grad_norm": 1.6484375, + "learning_rate": 4.173504296025417e-05, + "loss": 1.3411, + "step": 326 + }, + { + "epoch": 0.27990584207147445, + "grad_norm": 1.609375, + "learning_rate": 4.1684155204845974e-05, + "loss": 1.5365, + "step": 327 + }, + { + "epoch": 0.28076182323988874, + "grad_norm": 2.109375, + "learning_rate": 4.163314250413913e-05, + "loss": 1.6192, + "step": 328 + }, + { + "epoch": 0.28161780440830303, + "grad_norm": 1.8125, + "learning_rate": 4.15820052401606e-05, + "loss": 1.6317, + "step": 329 + }, + { + "epoch": 0.2824737855767173, + "grad_norm": 1.7578125, + "learning_rate": 4.153074379587018e-05, + "loss": 1.5873, + "step": 330 + }, + { + "epoch": 0.2833297667451316, + "grad_norm": 1.578125, + "learning_rate": 4.147935855515763e-05, + "loss": 1.4148, + "step": 331 + }, + { + "epoch": 0.2841857479135459, + "grad_norm": 1.8671875, + "learning_rate": 4.142784990283982e-05, + "loss": 1.5794, + "step": 332 + }, + { + "epoch": 0.2850417290819602, + "grad_norm": 1.5625, + "learning_rate": 4.1376218224657825e-05, + "loss": 1.4822, + "step": 333 + }, + { + "epoch": 0.28589771025037447, + "grad_norm": 1.984375, + "learning_rate": 4.132446390727404e-05, + "loss": 1.4558, + "step": 334 + }, + { + "epoch": 0.2867536914187888, + "grad_norm": 1.8359375, + "learning_rate": 4.127258733826929e-05, + "loss": 1.61, + "step": 335 + }, + { + "epoch": 0.2876096725872031, + "grad_norm": 1.8203125, + "learning_rate": 4.122058890613991e-05, + "loss": 1.3766, + "step": 336 + }, + { + "epoch": 0.2884656537556174, + "grad_norm": 1.6015625, + "learning_rate": 4.1168469000294895e-05, + "loss": 1.4012, + "step": 337 + }, + { + "epoch": 0.2893216349240317, + "grad_norm": 1.8046875, + "learning_rate": 4.11162280110529e-05, + "loss": 1.5115, + "step": 338 + }, + { + "epoch": 0.29017761609244597, + "grad_norm": 1.609375, + "learning_rate": 4.106386632963936e-05, + "loss": 1.5486, + "step": 339 + }, + { + "epoch": 0.29103359726086026, + "grad_norm": 1.6328125, + "learning_rate": 4.101138434818357e-05, + "loss": 1.4817, + "step": 340 + }, + { + "epoch": 0.29188957842927454, + "grad_norm": 1.6796875, + "learning_rate": 4.095878245971573e-05, + "loss": 1.5482, + "step": 341 + }, + { + "epoch": 0.29274555959768883, + "grad_norm": 1.5859375, + "learning_rate": 4.0906061058163995e-05, + "loss": 1.48, + "step": 342 + }, + { + "epoch": 0.2936015407661031, + "grad_norm": 1.8671875, + "learning_rate": 4.085322053835157e-05, + "loss": 1.4816, + "step": 343 + }, + { + "epoch": 0.29445752193451746, + "grad_norm": 1.6484375, + "learning_rate": 4.080026129599368e-05, + "loss": 1.4987, + "step": 344 + }, + { + "epoch": 0.29531350310293175, + "grad_norm": 1.625, + "learning_rate": 4.0747183727694674e-05, + "loss": 1.6119, + "step": 345 + }, + { + "epoch": 0.29616948427134604, + "grad_norm": 1.65625, + "learning_rate": 4.0693988230945004e-05, + "loss": 1.5121, + "step": 346 + }, + { + "epoch": 0.29702546543976033, + "grad_norm": 1.8359375, + "learning_rate": 4.064067520411831e-05, + "loss": 1.5578, + "step": 347 + }, + { + "epoch": 0.2978814466081746, + "grad_norm": 2.078125, + "learning_rate": 4.058724504646834e-05, + "loss": 1.4593, + "step": 348 + }, + { + "epoch": 0.2987374277765889, + "grad_norm": 4.53125, + "learning_rate": 4.0533698158126085e-05, + "loss": 1.3536, + "step": 349 + }, + { + "epoch": 0.2995934089450032, + "grad_norm": 1.6953125, + "learning_rate": 4.048003494009666e-05, + "loss": 1.4781, + "step": 350 + }, + { + "epoch": 0.3004493901134175, + "grad_norm": 1.625, + "learning_rate": 4.042625579425639e-05, + "loss": 1.6591, + "step": 351 + }, + { + "epoch": 0.3004493901134175, + "eval_loss": 1.5815147161483765, + "eval_runtime": 21.3462, + "eval_samples_per_second": 18.27, + "eval_steps_per_second": 18.27, + "step": 351 + }, + { + "epoch": 0.3013053712818318, + "grad_norm": 1.5390625, + "learning_rate": 4.0372361123349756e-05, + "loss": 1.3439, + "step": 352 + }, + { + "epoch": 0.3021613524502461, + "grad_norm": 1.7890625, + "learning_rate": 4.031835133098639e-05, + "loss": 1.5028, + "step": 353 + }, + { + "epoch": 0.3030173336186604, + "grad_norm": 1.8515625, + "learning_rate": 4.026422682163804e-05, + "loss": 1.5099, + "step": 354 + }, + { + "epoch": 0.3038733147870747, + "grad_norm": 1.671875, + "learning_rate": 4.020998800063559e-05, + "loss": 1.4798, + "step": 355 + }, + { + "epoch": 0.304729295955489, + "grad_norm": 1.8125, + "learning_rate": 4.015563527416595e-05, + "loss": 1.4064, + "step": 356 + }, + { + "epoch": 0.30558527712390327, + "grad_norm": 1.9375, + "learning_rate": 4.010116904926907e-05, + "loss": 1.5338, + "step": 357 + }, + { + "epoch": 0.30644125829231755, + "grad_norm": 1.8046875, + "learning_rate": 4.0046589733834875e-05, + "loss": 1.5153, + "step": 358 + }, + { + "epoch": 0.30729723946073184, + "grad_norm": 1.8828125, + "learning_rate": 3.9991897736600184e-05, + "loss": 1.4596, + "step": 359 + }, + { + "epoch": 0.3081532206291462, + "grad_norm": 1.625, + "learning_rate": 3.9937093467145726e-05, + "loss": 1.5873, + "step": 360 + }, + { + "epoch": 0.3090092017975605, + "grad_norm": 1.65625, + "learning_rate": 3.988217733589296e-05, + "loss": 1.5941, + "step": 361 + }, + { + "epoch": 0.30986518296597476, + "grad_norm": 1.8203125, + "learning_rate": 3.982714975410111e-05, + "loss": 1.4578, + "step": 362 + }, + { + "epoch": 0.31072116413438905, + "grad_norm": 1.5234375, + "learning_rate": 3.977201113386402e-05, + "loss": 1.4387, + "step": 363 + }, + { + "epoch": 0.31157714530280334, + "grad_norm": 1.546875, + "learning_rate": 3.971676188810707e-05, + "loss": 1.496, + "step": 364 + }, + { + "epoch": 0.3124331264712176, + "grad_norm": 1.609375, + "learning_rate": 3.966140243058413e-05, + "loss": 1.3948, + "step": 365 + }, + { + "epoch": 0.3132891076396319, + "grad_norm": 1.7578125, + "learning_rate": 3.96059331758744e-05, + "loss": 1.5101, + "step": 366 + }, + { + "epoch": 0.3141450888080462, + "grad_norm": 1.8125, + "learning_rate": 3.955035453937935e-05, + "loss": 1.5071, + "step": 367 + }, + { + "epoch": 0.3150010699764605, + "grad_norm": 1.6171875, + "learning_rate": 3.949466693731962e-05, + "loss": 1.4645, + "step": 368 + }, + { + "epoch": 0.31585705114487483, + "grad_norm": 1.703125, + "learning_rate": 3.9438870786731815e-05, + "loss": 1.522, + "step": 369 + }, + { + "epoch": 0.3167130323132891, + "grad_norm": 1.5859375, + "learning_rate": 3.938296650546552e-05, + "loss": 1.4065, + "step": 370 + }, + { + "epoch": 0.3175690134817034, + "grad_norm": 1.6875, + "learning_rate": 3.9326954512180026e-05, + "loss": 1.4124, + "step": 371 + }, + { + "epoch": 0.3184249946501177, + "grad_norm": 1.703125, + "learning_rate": 3.927083522634132e-05, + "loss": 1.4137, + "step": 372 + }, + { + "epoch": 0.319280975818532, + "grad_norm": 1.828125, + "learning_rate": 3.9214609068218834e-05, + "loss": 1.482, + "step": 373 + }, + { + "epoch": 0.3201369569869463, + "grad_norm": 2.03125, + "learning_rate": 3.915827645888241e-05, + "loss": 1.3655, + "step": 374 + }, + { + "epoch": 0.32099293815536056, + "grad_norm": 1.6875, + "learning_rate": 3.910183782019905e-05, + "loss": 1.3776, + "step": 375 + }, + { + "epoch": 0.32184891932377485, + "grad_norm": 1.7421875, + "learning_rate": 3.9045293574829814e-05, + "loss": 1.5067, + "step": 376 + }, + { + "epoch": 0.3227049004921892, + "grad_norm": 1.8125, + "learning_rate": 3.8988644146226606e-05, + "loss": 1.4391, + "step": 377 + }, + { + "epoch": 0.3235608816606035, + "grad_norm": 1.8671875, + "learning_rate": 3.8931889958629066e-05, + "loss": 1.4054, + "step": 378 + }, + { + "epoch": 0.32441686282901777, + "grad_norm": 1.734375, + "learning_rate": 3.887503143706134e-05, + "loss": 1.721, + "step": 379 + }, + { + "epoch": 0.32527284399743206, + "grad_norm": 1.7578125, + "learning_rate": 3.881806900732893e-05, + "loss": 1.5304, + "step": 380 + }, + { + "epoch": 0.32612882516584635, + "grad_norm": 1.671875, + "learning_rate": 3.8761003096015466e-05, + "loss": 1.4313, + "step": 381 + }, + { + "epoch": 0.32698480633426064, + "grad_norm": 1.828125, + "learning_rate": 3.870383413047959e-05, + "loss": 1.4311, + "step": 382 + }, + { + "epoch": 0.3278407875026749, + "grad_norm": 1.828125, + "learning_rate": 3.864656253885163e-05, + "loss": 1.5491, + "step": 383 + }, + { + "epoch": 0.3286967686710892, + "grad_norm": 1.765625, + "learning_rate": 3.858918875003053e-05, + "loss": 1.5921, + "step": 384 + }, + { + "epoch": 0.32955274983950356, + "grad_norm": 1.8203125, + "learning_rate": 3.853171319368054e-05, + "loss": 1.3189, + "step": 385 + }, + { + "epoch": 0.33040873100791784, + "grad_norm": 1.578125, + "learning_rate": 3.847413630022804e-05, + "loss": 1.5709, + "step": 386 + }, + { + "epoch": 0.33126471217633213, + "grad_norm": 1.7578125, + "learning_rate": 3.841645850085831e-05, + "loss": 1.5226, + "step": 387 + }, + { + "epoch": 0.3321206933447464, + "grad_norm": 1.6171875, + "learning_rate": 3.835868022751231e-05, + "loss": 1.6103, + "step": 388 + }, + { + "epoch": 0.3329766745131607, + "grad_norm": 1.71875, + "learning_rate": 3.830080191288342e-05, + "loss": 1.4644, + "step": 389 + }, + { + "epoch": 0.333832655681575, + "grad_norm": 1.6015625, + "learning_rate": 3.8242823990414214e-05, + "loss": 1.5841, + "step": 390 + }, + { + "epoch": 0.3346886368499893, + "grad_norm": 1.7421875, + "learning_rate": 3.818474689429323e-05, + "loss": 1.4086, + "step": 391 + }, + { + "epoch": 0.3355446180184036, + "grad_norm": 1.8046875, + "learning_rate": 3.812657105945171e-05, + "loss": 1.4696, + "step": 392 + }, + { + "epoch": 0.3364005991868179, + "grad_norm": 1.5703125, + "learning_rate": 3.806829692156031e-05, + "loss": 1.3922, + "step": 393 + }, + { + "epoch": 0.3372565803552322, + "grad_norm": 1.6796875, + "learning_rate": 3.8009924917025864e-05, + "loss": 1.4289, + "step": 394 + }, + { + "epoch": 0.3381125615236465, + "grad_norm": 1.8046875, + "learning_rate": 3.795145548298815e-05, + "loss": 1.435, + "step": 395 + }, + { + "epoch": 0.3389685426920608, + "grad_norm": 1.625, + "learning_rate": 3.789288905731655e-05, + "loss": 1.4943, + "step": 396 + }, + { + "epoch": 0.33982452386047507, + "grad_norm": 1.75, + "learning_rate": 3.783422607860681e-05, + "loss": 1.5017, + "step": 397 + }, + { + "epoch": 0.34068050502888936, + "grad_norm": 4.375, + "learning_rate": 3.777546698617776e-05, + "loss": 1.5723, + "step": 398 + }, + { + "epoch": 0.34153648619730365, + "grad_norm": 1.609375, + "learning_rate": 3.7716612220068006e-05, + "loss": 1.4734, + "step": 399 + }, + { + "epoch": 0.34239246736571793, + "grad_norm": 1.6796875, + "learning_rate": 3.765766222103262e-05, + "loss": 1.5986, + "step": 400 + }, + { + "epoch": 0.3432484485341322, + "grad_norm": 1.7421875, + "learning_rate": 3.7598617430539884e-05, + "loss": 1.4154, + "step": 401 + }, + { + "epoch": 0.34410442970254657, + "grad_norm": 1.8046875, + "learning_rate": 3.753947829076797e-05, + "loss": 1.6668, + "step": 402 + }, + { + "epoch": 0.34496041087096085, + "grad_norm": 1.5703125, + "learning_rate": 3.7480245244601584e-05, + "loss": 1.4141, + "step": 403 + }, + { + "epoch": 0.34581639203937514, + "grad_norm": 1.484375, + "learning_rate": 3.742091873562871e-05, + "loss": 1.3079, + "step": 404 + }, + { + "epoch": 0.34667237320778943, + "grad_norm": 1.4609375, + "learning_rate": 3.7361499208137254e-05, + "loss": 1.5055, + "step": 405 + }, + { + "epoch": 0.3475283543762037, + "grad_norm": 1.6875, + "learning_rate": 3.730198710711173e-05, + "loss": 1.457, + "step": 406 + }, + { + "epoch": 0.348384335544618, + "grad_norm": 1.75, + "learning_rate": 3.724238287822991e-05, + "loss": 1.4187, + "step": 407 + }, + { + "epoch": 0.3492403167130323, + "grad_norm": 1.640625, + "learning_rate": 3.71826869678595e-05, + "loss": 1.4398, + "step": 408 + }, + { + "epoch": 0.3500962978814466, + "grad_norm": 1.8125, + "learning_rate": 3.7122899823054814e-05, + "loss": 1.4736, + "step": 409 + }, + { + "epoch": 0.3509522790498609, + "grad_norm": 2.03125, + "learning_rate": 3.706302189155338e-05, + "loss": 1.4837, + "step": 410 + }, + { + "epoch": 0.3518082602182752, + "grad_norm": 1.6171875, + "learning_rate": 3.7003053621772656e-05, + "loss": 1.4027, + "step": 411 + }, + { + "epoch": 0.3526642413866895, + "grad_norm": 1.796875, + "learning_rate": 3.694299546280657e-05, + "loss": 1.6534, + "step": 412 + }, + { + "epoch": 0.3535202225551038, + "grad_norm": 1.703125, + "learning_rate": 3.688284786442229e-05, + "loss": 1.5668, + "step": 413 + }, + { + "epoch": 0.3543762037235181, + "grad_norm": 1.65625, + "learning_rate": 3.682261127705671e-05, + "loss": 1.3467, + "step": 414 + }, + { + "epoch": 0.35523218489193237, + "grad_norm": 1.828125, + "learning_rate": 3.676228615181321e-05, + "loss": 1.4635, + "step": 415 + }, + { + "epoch": 0.35608816606034666, + "grad_norm": 1.5625, + "learning_rate": 3.6701872940458186e-05, + "loss": 1.3886, + "step": 416 + }, + { + "epoch": 0.35694414722876094, + "grad_norm": 1.609375, + "learning_rate": 3.66413720954177e-05, + "loss": 1.5161, + "step": 417 + }, + { + "epoch": 0.3578001283971753, + "grad_norm": 1.625, + "learning_rate": 3.6580784069774105e-05, + "loss": 1.5301, + "step": 418 + }, + { + "epoch": 0.3586561095655896, + "grad_norm": 1.7421875, + "learning_rate": 3.652010931726262e-05, + "loss": 1.3991, + "step": 419 + }, + { + "epoch": 0.35951209073400386, + "grad_norm": 1.65625, + "learning_rate": 3.645934829226797e-05, + "loss": 1.4226, + "step": 420 + }, + { + "epoch": 0.36036807190241815, + "grad_norm": 1.6796875, + "learning_rate": 3.6398501449820936e-05, + "loss": 1.5157, + "step": 421 + }, + { + "epoch": 0.36122405307083244, + "grad_norm": 1.6328125, + "learning_rate": 3.6337569245595005e-05, + "loss": 1.5619, + "step": 422 + }, + { + "epoch": 0.36208003423924673, + "grad_norm": 1.6015625, + "learning_rate": 3.62765521359029e-05, + "loss": 1.4751, + "step": 423 + }, + { + "epoch": 0.362936015407661, + "grad_norm": 3.21875, + "learning_rate": 3.6215450577693196e-05, + "loss": 1.4708, + "step": 424 + }, + { + "epoch": 0.3637919965760753, + "grad_norm": 1.671875, + "learning_rate": 3.615426502854689e-05, + "loss": 1.4924, + "step": 425 + }, + { + "epoch": 0.36464797774448965, + "grad_norm": 1.6015625, + "learning_rate": 3.6092995946673994e-05, + "loss": 1.5001, + "step": 426 + }, + { + "epoch": 0.36550395891290394, + "grad_norm": 1.484375, + "learning_rate": 3.603164379091006e-05, + "loss": 1.3498, + "step": 427 + }, + { + "epoch": 0.3663599400813182, + "grad_norm": 1.5703125, + "learning_rate": 3.597020902071278e-05, + "loss": 1.378, + "step": 428 + }, + { + "epoch": 0.3672159212497325, + "grad_norm": 2.015625, + "learning_rate": 3.590869209615854e-05, + "loss": 1.7722, + "step": 429 + }, + { + "epoch": 0.3680719024181468, + "grad_norm": 1.7421875, + "learning_rate": 3.5847093477938956e-05, + "loss": 1.5215, + "step": 430 + }, + { + "epoch": 0.3689278835865611, + "grad_norm": 1.6875, + "learning_rate": 3.578541362735744e-05, + "loss": 1.5693, + "step": 431 + }, + { + "epoch": 0.3697838647549754, + "grad_norm": 1.5703125, + "learning_rate": 3.572365300632574e-05, + "loss": 1.5959, + "step": 432 + }, + { + "epoch": 0.37063984592338967, + "grad_norm": 1.7734375, + "learning_rate": 3.56618120773605e-05, + "loss": 1.6924, + "step": 433 + }, + { + "epoch": 0.37149582709180395, + "grad_norm": 1.703125, + "learning_rate": 3.5599891303579746e-05, + "loss": 1.6631, + "step": 434 + }, + { + "epoch": 0.3723518082602183, + "grad_norm": 1.828125, + "learning_rate": 3.553789114869947e-05, + "loss": 1.4271, + "step": 435 + }, + { + "epoch": 0.3732077894286326, + "grad_norm": 1.5546875, + "learning_rate": 3.547581207703017e-05, + "loss": 1.4559, + "step": 436 + }, + { + "epoch": 0.3740637705970469, + "grad_norm": 1.6875, + "learning_rate": 3.541365455347327e-05, + "loss": 1.3832, + "step": 437 + }, + { + "epoch": 0.37491975176546116, + "grad_norm": 1.8203125, + "learning_rate": 3.535141904351776e-05, + "loss": 1.5994, + "step": 438 + }, + { + "epoch": 0.37577573293387545, + "grad_norm": 1.5546875, + "learning_rate": 3.528910601323666e-05, + "loss": 1.4947, + "step": 439 + }, + { + "epoch": 0.37663171410228974, + "grad_norm": 1.8671875, + "learning_rate": 3.5226715929283506e-05, + "loss": 1.3042, + "step": 440 + }, + { + "epoch": 0.377487695270704, + "grad_norm": 1.671875, + "learning_rate": 3.516424925888887e-05, + "loss": 1.4926, + "step": 441 + }, + { + "epoch": 0.3783436764391183, + "grad_norm": 1.5859375, + "learning_rate": 3.510170646985691e-05, + "loss": 1.4419, + "step": 442 + }, + { + "epoch": 0.37919965760753266, + "grad_norm": 1.5625, + "learning_rate": 3.50390880305618e-05, + "loss": 1.4541, + "step": 443 + }, + { + "epoch": 0.38005563877594695, + "grad_norm": 1.703125, + "learning_rate": 3.497639440994424e-05, + "loss": 1.5821, + "step": 444 + }, + { + "epoch": 0.38091161994436123, + "grad_norm": 1.625, + "learning_rate": 3.491362607750796e-05, + "loss": 1.4526, + "step": 445 + }, + { + "epoch": 0.3817676011127755, + "grad_norm": 1.546875, + "learning_rate": 3.485078350331622e-05, + "loss": 1.5525, + "step": 446 + }, + { + "epoch": 0.3826235822811898, + "grad_norm": 1.5703125, + "learning_rate": 3.478786715798823e-05, + "loss": 1.3649, + "step": 447 + }, + { + "epoch": 0.3834795634496041, + "grad_norm": 1.8125, + "learning_rate": 3.4724877512695674e-05, + "loss": 1.6517, + "step": 448 + }, + { + "epoch": 0.3843355446180184, + "grad_norm": 1.6953125, + "learning_rate": 3.466181503915918e-05, + "loss": 1.441, + "step": 449 + }, + { + "epoch": 0.3851915257864327, + "grad_norm": 1.84375, + "learning_rate": 3.459868020964478e-05, + "loss": 1.6027, + "step": 450 + }, + { + "epoch": 0.386047506954847, + "grad_norm": 1.453125, + "learning_rate": 3.453547349696033e-05, + "loss": 1.3575, + "step": 451 + }, + { + "epoch": 0.3869034881232613, + "grad_norm": 1.53125, + "learning_rate": 3.447219537445207e-05, + "loss": 1.4457, + "step": 452 + }, + { + "epoch": 0.3877594692916756, + "grad_norm": 1.6171875, + "learning_rate": 3.4408846316000956e-05, + "loss": 1.4387, + "step": 453 + }, + { + "epoch": 0.3886154504600899, + "grad_norm": 1.84375, + "learning_rate": 3.434542679601922e-05, + "loss": 1.5498, + "step": 454 + }, + { + "epoch": 0.38947143162850417, + "grad_norm": 1.78125, + "learning_rate": 3.428193728944675e-05, + "loss": 1.3684, + "step": 455 + }, + { + "epoch": 0.39032741279691846, + "grad_norm": 1.546875, + "learning_rate": 3.421837827174757e-05, + "loss": 1.5111, + "step": 456 + }, + { + "epoch": 0.39118339396533275, + "grad_norm": 1.703125, + "learning_rate": 3.415475021890622e-05, + "loss": 1.5642, + "step": 457 + }, + { + "epoch": 0.39203937513374704, + "grad_norm": 1.640625, + "learning_rate": 3.4091053607424295e-05, + "loss": 1.4413, + "step": 458 + }, + { + "epoch": 0.3928953563021613, + "grad_norm": 1.5546875, + "learning_rate": 3.402728891431677e-05, + "loss": 1.3544, + "step": 459 + }, + { + "epoch": 0.39375133747057567, + "grad_norm": 1.6953125, + "learning_rate": 3.396345661710849e-05, + "loss": 1.4379, + "step": 460 + }, + { + "epoch": 0.39460731863898996, + "grad_norm": 1.84375, + "learning_rate": 3.389955719383058e-05, + "loss": 1.7564, + "step": 461 + }, + { + "epoch": 0.39546329980740424, + "grad_norm": 1.53125, + "learning_rate": 3.3835591123016865e-05, + "loss": 1.5366, + "step": 462 + }, + { + "epoch": 0.39631928097581853, + "grad_norm": 1.6015625, + "learning_rate": 3.3771558883700284e-05, + "loss": 1.7521, + "step": 463 + }, + { + "epoch": 0.3971752621442328, + "grad_norm": 1.4375, + "learning_rate": 3.370746095540928e-05, + "loss": 1.4594, + "step": 464 + }, + { + "epoch": 0.3980312433126471, + "grad_norm": 1.5859375, + "learning_rate": 3.364329781816426e-05, + "loss": 1.4018, + "step": 465 + }, + { + "epoch": 0.3988872244810614, + "grad_norm": 1.7421875, + "learning_rate": 3.357906995247396e-05, + "loss": 1.5263, + "step": 466 + }, + { + "epoch": 0.3997432056494757, + "grad_norm": 1.7265625, + "learning_rate": 3.3514777839331856e-05, + "loss": 1.5457, + "step": 467 + }, + { + "epoch": 0.40059918681789003, + "grad_norm": 1.59375, + "learning_rate": 3.3450421960212566e-05, + "loss": 1.664, + "step": 468 + }, + { + "epoch": 0.40059918681789003, + "eval_loss": 1.5587416887283325, + "eval_runtime": 21.3401, + "eval_samples_per_second": 18.275, + "eval_steps_per_second": 18.275, + "step": 468 + }, + { + "epoch": 0.4014551679863043, + "grad_norm": 1.7578125, + "learning_rate": 3.338600279706826e-05, + "loss": 1.5381, + "step": 469 + }, + { + "epoch": 0.4023111491547186, + "grad_norm": 2.421875, + "learning_rate": 3.3321520832325e-05, + "loss": 1.4321, + "step": 470 + }, + { + "epoch": 0.4031671303231329, + "grad_norm": 1.6953125, + "learning_rate": 3.3256976548879184e-05, + "loss": 1.4431, + "step": 471 + }, + { + "epoch": 0.4040231114915472, + "grad_norm": 1.75, + "learning_rate": 3.319237043009389e-05, + "loss": 1.3993, + "step": 472 + }, + { + "epoch": 0.40487909265996147, + "grad_norm": 1.5234375, + "learning_rate": 3.3127702959795296e-05, + "loss": 1.3284, + "step": 473 + }, + { + "epoch": 0.40573507382837576, + "grad_norm": 1.6875, + "learning_rate": 3.306297462226901e-05, + "loss": 1.3601, + "step": 474 + }, + { + "epoch": 0.40659105499679005, + "grad_norm": 1.5546875, + "learning_rate": 3.299818590225647e-05, + "loss": 1.4164, + "step": 475 + }, + { + "epoch": 0.4074470361652044, + "grad_norm": 1.7109375, + "learning_rate": 3.2933337284951336e-05, + "loss": 1.4316, + "step": 476 + }, + { + "epoch": 0.4083030173336187, + "grad_norm": 1.5546875, + "learning_rate": 3.286842925599579e-05, + "loss": 1.5327, + "step": 477 + }, + { + "epoch": 0.40915899850203297, + "grad_norm": 1.6640625, + "learning_rate": 3.2803462301476964e-05, + "loss": 1.3832, + "step": 478 + }, + { + "epoch": 0.41001497967044725, + "grad_norm": 1.46875, + "learning_rate": 3.273843690792326e-05, + "loss": 1.2295, + "step": 479 + }, + { + "epoch": 0.41087096083886154, + "grad_norm": 1.8046875, + "learning_rate": 3.267335356230075e-05, + "loss": 1.4291, + "step": 480 + }, + { + "epoch": 0.41172694200727583, + "grad_norm": 1.640625, + "learning_rate": 3.260821275200947e-05, + "loss": 1.7269, + "step": 481 + }, + { + "epoch": 0.4125829231756901, + "grad_norm": 1.5234375, + "learning_rate": 3.2543014964879816e-05, + "loss": 1.3251, + "step": 482 + }, + { + "epoch": 0.4134389043441044, + "grad_norm": 1.828125, + "learning_rate": 3.247776068916887e-05, + "loss": 1.6163, + "step": 483 + }, + { + "epoch": 0.41429488551251875, + "grad_norm": 1.5546875, + "learning_rate": 3.241245041355675e-05, + "loss": 1.3584, + "step": 484 + }, + { + "epoch": 0.41515086668093304, + "grad_norm": 1.703125, + "learning_rate": 3.234708462714297e-05, + "loss": 1.4595, + "step": 485 + }, + { + "epoch": 0.4160068478493473, + "grad_norm": 2.078125, + "learning_rate": 3.228166381944272e-05, + "loss": 1.7641, + "step": 486 + }, + { + "epoch": 0.4168628290177616, + "grad_norm": 1.640625, + "learning_rate": 3.2216188480383256e-05, + "loss": 1.4908, + "step": 487 + }, + { + "epoch": 0.4177188101861759, + "grad_norm": 1.9140625, + "learning_rate": 3.215065910030021e-05, + "loss": 1.6466, + "step": 488 + }, + { + "epoch": 0.4185747913545902, + "grad_norm": 1.71875, + "learning_rate": 3.208507616993393e-05, + "loss": 1.4535, + "step": 489 + }, + { + "epoch": 0.4194307725230045, + "grad_norm": 1.5390625, + "learning_rate": 3.201944018042577e-05, + "loss": 1.4366, + "step": 490 + }, + { + "epoch": 0.42028675369141877, + "grad_norm": 1.6796875, + "learning_rate": 3.1953751623314475e-05, + "loss": 1.3296, + "step": 491 + }, + { + "epoch": 0.42114273485983306, + "grad_norm": 1.546875, + "learning_rate": 3.1888010990532415e-05, + "loss": 1.4605, + "step": 492 + }, + { + "epoch": 0.4219987160282474, + "grad_norm": 1.6484375, + "learning_rate": 3.182221877440198e-05, + "loss": 1.3257, + "step": 493 + }, + { + "epoch": 0.4228546971966617, + "grad_norm": 1.5703125, + "learning_rate": 3.175637546763183e-05, + "loss": 1.4084, + "step": 494 + }, + { + "epoch": 0.423710678365076, + "grad_norm": 1.5, + "learning_rate": 3.169048156331329e-05, + "loss": 1.5077, + "step": 495 + }, + { + "epoch": 0.42456665953349026, + "grad_norm": 1.96875, + "learning_rate": 3.162453755491655e-05, + "loss": 1.2778, + "step": 496 + }, + { + "epoch": 0.42542264070190455, + "grad_norm": 2.0, + "learning_rate": 3.1558543936287035e-05, + "loss": 1.3954, + "step": 497 + }, + { + "epoch": 0.42627862187031884, + "grad_norm": 1.671875, + "learning_rate": 3.149250120164171e-05, + "loss": 1.4434, + "step": 498 + }, + { + "epoch": 0.42713460303873313, + "grad_norm": 1.515625, + "learning_rate": 3.142640984556536e-05, + "loss": 1.5035, + "step": 499 + }, + { + "epoch": 0.4279905842071474, + "grad_norm": 1.7265625, + "learning_rate": 3.136027036300687e-05, + "loss": 1.5234, + "step": 500 + }, + { + "epoch": 0.42884656537556176, + "grad_norm": 1.5859375, + "learning_rate": 3.1294083249275545e-05, + "loss": 1.3764, + "step": 501 + }, + { + "epoch": 0.42970254654397605, + "grad_norm": 1.578125, + "learning_rate": 3.122784900003742e-05, + "loss": 1.4066, + "step": 502 + }, + { + "epoch": 0.43055852771239034, + "grad_norm": 1.828125, + "learning_rate": 3.116156811131148e-05, + "loss": 1.6143, + "step": 503 + }, + { + "epoch": 0.4314145088808046, + "grad_norm": 1.5703125, + "learning_rate": 3.109524107946602e-05, + "loss": 1.3665, + "step": 504 + }, + { + "epoch": 0.4322704900492189, + "grad_norm": 1.65625, + "learning_rate": 3.102886840121486e-05, + "loss": 1.3919, + "step": 505 + }, + { + "epoch": 0.4331264712176332, + "grad_norm": 1.8125, + "learning_rate": 3.0962450573613704e-05, + "loss": 1.6993, + "step": 506 + }, + { + "epoch": 0.4339824523860475, + "grad_norm": 1.65625, + "learning_rate": 3.089598809405633e-05, + "loss": 1.3292, + "step": 507 + }, + { + "epoch": 0.4348384335544618, + "grad_norm": 1.515625, + "learning_rate": 3.0829481460270936e-05, + "loss": 1.3597, + "step": 508 + }, + { + "epoch": 0.4356944147228761, + "grad_norm": 1.453125, + "learning_rate": 3.0762931170316385e-05, + "loss": 1.3326, + "step": 509 + }, + { + "epoch": 0.4365503958912904, + "grad_norm": 1.59375, + "learning_rate": 3.0696337722578444e-05, + "loss": 1.4273, + "step": 510 + }, + { + "epoch": 0.4374063770597047, + "grad_norm": 1.4375, + "learning_rate": 3.062970161576612e-05, + "loss": 1.4425, + "step": 511 + }, + { + "epoch": 0.438262358228119, + "grad_norm": 1.6015625, + "learning_rate": 3.056302334890786e-05, + "loss": 1.5967, + "step": 512 + }, + { + "epoch": 0.4391183393965333, + "grad_norm": 1.6484375, + "learning_rate": 3.0496303421347872e-05, + "loss": 1.5083, + "step": 513 + }, + { + "epoch": 0.43997432056494756, + "grad_norm": 1.5, + "learning_rate": 3.0429542332742323e-05, + "loss": 1.3709, + "step": 514 + }, + { + "epoch": 0.44083030173336185, + "grad_norm": 1.6328125, + "learning_rate": 3.036274058305565e-05, + "loss": 1.4481, + "step": 515 + }, + { + "epoch": 0.44168628290177614, + "grad_norm": 1.5625, + "learning_rate": 3.029589867255678e-05, + "loss": 1.4541, + "step": 516 + }, + { + "epoch": 0.4425422640701905, + "grad_norm": 1.625, + "learning_rate": 3.022901710181542e-05, + "loss": 1.6127, + "step": 517 + }, + { + "epoch": 0.44339824523860477, + "grad_norm": 1.640625, + "learning_rate": 3.0162096371698267e-05, + "loss": 1.2699, + "step": 518 + }, + { + "epoch": 0.44425422640701906, + "grad_norm": 1.5859375, + "learning_rate": 3.0095136983365286e-05, + "loss": 1.4119, + "step": 519 + }, + { + "epoch": 0.44511020757543335, + "grad_norm": 1.5078125, + "learning_rate": 3.0028139438265944e-05, + "loss": 1.4058, + "step": 520 + }, + { + "epoch": 0.44596618874384764, + "grad_norm": 1.703125, + "learning_rate": 2.9961104238135457e-05, + "loss": 1.6121, + "step": 521 + }, + { + "epoch": 0.4468221699122619, + "grad_norm": 1.6484375, + "learning_rate": 2.989403188499105e-05, + "loss": 1.5662, + "step": 522 + }, + { + "epoch": 0.4476781510806762, + "grad_norm": 1.453125, + "learning_rate": 2.9826922881128162e-05, + "loss": 1.5012, + "step": 523 + }, + { + "epoch": 0.4485341322490905, + "grad_norm": 1.515625, + "learning_rate": 2.975977772911671e-05, + "loss": 1.4917, + "step": 524 + }, + { + "epoch": 0.4493901134175048, + "grad_norm": 1.7109375, + "learning_rate": 2.969259693179733e-05, + "loss": 1.3906, + "step": 525 + }, + { + "epoch": 0.45024609458591913, + "grad_norm": 1.6953125, + "learning_rate": 2.9625380992277584e-05, + "loss": 1.583, + "step": 526 + }, + { + "epoch": 0.4511020757543334, + "grad_norm": 1.6875, + "learning_rate": 2.955813041392822e-05, + "loss": 1.4414, + "step": 527 + }, + { + "epoch": 0.4519580569227477, + "grad_norm": 1.453125, + "learning_rate": 2.949084570037939e-05, + "loss": 1.2735, + "step": 528 + }, + { + "epoch": 0.452814038091162, + "grad_norm": 1.46875, + "learning_rate": 2.9423527355516876e-05, + "loss": 1.3283, + "step": 529 + }, + { + "epoch": 0.4536700192595763, + "grad_norm": 1.671875, + "learning_rate": 2.9356175883478322e-05, + "loss": 1.5274, + "step": 530 + }, + { + "epoch": 0.4545260004279906, + "grad_norm": 1.5234375, + "learning_rate": 2.9288791788649462e-05, + "loss": 1.4455, + "step": 531 + }, + { + "epoch": 0.45538198159640486, + "grad_norm": 1.6796875, + "learning_rate": 2.922137557566032e-05, + "loss": 1.4383, + "step": 532 + }, + { + "epoch": 0.45623796276481915, + "grad_norm": 1.5859375, + "learning_rate": 2.9153927749381483e-05, + "loss": 1.4231, + "step": 533 + }, + { + "epoch": 0.4570939439332335, + "grad_norm": 1.4921875, + "learning_rate": 2.9086448814920242e-05, + "loss": 1.4336, + "step": 534 + }, + { + "epoch": 0.4579499251016478, + "grad_norm": 1.5078125, + "learning_rate": 2.9018939277616886e-05, + "loss": 1.3865, + "step": 535 + }, + { + "epoch": 0.45880590627006207, + "grad_norm": 1.71875, + "learning_rate": 2.8951399643040867e-05, + "loss": 1.3812, + "step": 536 + }, + { + "epoch": 0.45966188743847636, + "grad_norm": 1.4609375, + "learning_rate": 2.888383041698704e-05, + "loss": 1.4111, + "step": 537 + }, + { + "epoch": 0.46051786860689065, + "grad_norm": 1.5234375, + "learning_rate": 2.8816232105471863e-05, + "loss": 1.2808, + "step": 538 + }, + { + "epoch": 0.46137384977530493, + "grad_norm": 1.4765625, + "learning_rate": 2.874860521472962e-05, + "loss": 1.4054, + "step": 539 + }, + { + "epoch": 0.4622298309437192, + "grad_norm": 1.6015625, + "learning_rate": 2.8680950251208595e-05, + "loss": 1.4313, + "step": 540 + }, + { + "epoch": 0.4630858121121335, + "grad_norm": 1.671875, + "learning_rate": 2.8613267721567333e-05, + "loss": 1.3595, + "step": 541 + }, + { + "epoch": 0.46394179328054785, + "grad_norm": 1.6015625, + "learning_rate": 2.8545558132670803e-05, + "loss": 1.4876, + "step": 542 + }, + { + "epoch": 0.46479777444896214, + "grad_norm": 1.7734375, + "learning_rate": 2.847782199158663e-05, + "loss": 1.4332, + "step": 543 + }, + { + "epoch": 0.46565375561737643, + "grad_norm": 1.8046875, + "learning_rate": 2.8410059805581258e-05, + "loss": 1.4712, + "step": 544 + }, + { + "epoch": 0.4665097367857907, + "grad_norm": 1.9375, + "learning_rate": 2.834227208211621e-05, + "loss": 1.4455, + "step": 545 + }, + { + "epoch": 0.467365717954205, + "grad_norm": 1.6015625, + "learning_rate": 2.8274459328844248e-05, + "loss": 1.4987, + "step": 546 + }, + { + "epoch": 0.4682216991226193, + "grad_norm": 1.78125, + "learning_rate": 2.8206622053605553e-05, + "loss": 1.4329, + "step": 547 + }, + { + "epoch": 0.4690776802910336, + "grad_norm": 1.6015625, + "learning_rate": 2.813876076442397e-05, + "loss": 1.3499, + "step": 548 + }, + { + "epoch": 0.46993366145944787, + "grad_norm": 1.671875, + "learning_rate": 2.8070875969503192e-05, + "loss": 1.4936, + "step": 549 + }, + { + "epoch": 0.47078964262786216, + "grad_norm": 1.75, + "learning_rate": 2.8002968177222917e-05, + "loss": 1.4108, + "step": 550 + }, + { + "epoch": 0.4716456237962765, + "grad_norm": 1.4296875, + "learning_rate": 2.793503789613507e-05, + "loss": 1.4677, + "step": 551 + }, + { + "epoch": 0.4725016049646908, + "grad_norm": 1.5625, + "learning_rate": 2.7867085634960016e-05, + "loss": 1.6387, + "step": 552 + }, + { + "epoch": 0.4733575861331051, + "grad_norm": 1.546875, + "learning_rate": 2.7799111902582696e-05, + "loss": 1.4241, + "step": 553 + }, + { + "epoch": 0.47421356730151937, + "grad_norm": 1.6875, + "learning_rate": 2.7731117208048872e-05, + "loss": 1.5287, + "step": 554 + }, + { + "epoch": 0.47506954846993366, + "grad_norm": 1.6484375, + "learning_rate": 2.7663102060561275e-05, + "loss": 1.4029, + "step": 555 + }, + { + "epoch": 0.47592552963834794, + "grad_norm": 1.5234375, + "learning_rate": 2.75950669694758e-05, + "loss": 1.3678, + "step": 556 + }, + { + "epoch": 0.47678151080676223, + "grad_norm": 1.5, + "learning_rate": 2.7527012444297707e-05, + "loss": 1.3775, + "step": 557 + }, + { + "epoch": 0.4776374919751765, + "grad_norm": 1.5625, + "learning_rate": 2.7458938994677786e-05, + "loss": 1.6167, + "step": 558 + }, + { + "epoch": 0.47849347314359086, + "grad_norm": 1.625, + "learning_rate": 2.739084713040856e-05, + "loss": 1.4628, + "step": 559 + }, + { + "epoch": 0.47934945431200515, + "grad_norm": 1.59375, + "learning_rate": 2.7322737361420454e-05, + "loss": 1.5349, + "step": 560 + }, + { + "epoch": 0.48020543548041944, + "grad_norm": 1.6328125, + "learning_rate": 2.725461019777797e-05, + "loss": 1.4614, + "step": 561 + }, + { + "epoch": 0.48106141664883373, + "grad_norm": 1.609375, + "learning_rate": 2.7186466149675887e-05, + "loss": 1.6509, + "step": 562 + }, + { + "epoch": 0.481917397817248, + "grad_norm": 1.8046875, + "learning_rate": 2.7118305727435434e-05, + "loss": 1.4552, + "step": 563 + }, + { + "epoch": 0.4827733789856623, + "grad_norm": 1.765625, + "learning_rate": 2.7050129441500436e-05, + "loss": 1.6248, + "step": 564 + }, + { + "epoch": 0.4836293601540766, + "grad_norm": 1.515625, + "learning_rate": 2.698193780243355e-05, + "loss": 1.4198, + "step": 565 + }, + { + "epoch": 0.4844853413224909, + "grad_norm": 1.9140625, + "learning_rate": 2.69137313209124e-05, + "loss": 1.4712, + "step": 566 + }, + { + "epoch": 0.4853413224909052, + "grad_norm": 1.5703125, + "learning_rate": 2.6845510507725745e-05, + "loss": 1.3251, + "step": 567 + }, + { + "epoch": 0.4861973036593195, + "grad_norm": 1.5390625, + "learning_rate": 2.67772758737697e-05, + "loss": 1.3109, + "step": 568 + }, + { + "epoch": 0.4870532848277338, + "grad_norm": 1.4765625, + "learning_rate": 2.670902793004389e-05, + "loss": 1.4285, + "step": 569 + }, + { + "epoch": 0.4879092659961481, + "grad_norm": 1.5390625, + "learning_rate": 2.664076718764756e-05, + "loss": 1.4363, + "step": 570 + }, + { + "epoch": 0.4887652471645624, + "grad_norm": 1.578125, + "learning_rate": 2.657249415777585e-05, + "loss": 1.2128, + "step": 571 + }, + { + "epoch": 0.48962122833297667, + "grad_norm": 1.546875, + "learning_rate": 2.6504209351715914e-05, + "loss": 1.472, + "step": 572 + }, + { + "epoch": 0.49047720950139095, + "grad_norm": 2.53125, + "learning_rate": 2.643591328084309e-05, + "loss": 1.3816, + "step": 573 + }, + { + "epoch": 0.49133319066980524, + "grad_norm": 1.8984375, + "learning_rate": 2.6367606456617055e-05, + "loss": 1.5654, + "step": 574 + }, + { + "epoch": 0.4921891718382196, + "grad_norm": 1.8046875, + "learning_rate": 2.6299289390578053e-05, + "loss": 1.5554, + "step": 575 + }, + { + "epoch": 0.4930451530066339, + "grad_norm": 1.6875, + "learning_rate": 2.623096259434302e-05, + "loss": 1.5279, + "step": 576 + }, + { + "epoch": 0.49390113417504816, + "grad_norm": 1.84375, + "learning_rate": 2.616262657960173e-05, + "loss": 1.4617, + "step": 577 + }, + { + "epoch": 0.49475711534346245, + "grad_norm": 1.640625, + "learning_rate": 2.6094281858113022e-05, + "loss": 1.4409, + "step": 578 + }, + { + "epoch": 0.49561309651187674, + "grad_norm": 1.5, + "learning_rate": 2.6025928941700945e-05, + "loss": 1.38, + "step": 579 + }, + { + "epoch": 0.496469077680291, + "grad_norm": 1.6328125, + "learning_rate": 2.595756834225089e-05, + "loss": 1.4494, + "step": 580 + }, + { + "epoch": 0.4973250588487053, + "grad_norm": 1.6484375, + "learning_rate": 2.5889200571705795e-05, + "loss": 1.4874, + "step": 581 + }, + { + "epoch": 0.4981810400171196, + "grad_norm": 1.6484375, + "learning_rate": 2.5820826142062323e-05, + "loss": 1.5417, + "step": 582 + }, + { + "epoch": 0.4990370211855339, + "grad_norm": 1.6328125, + "learning_rate": 2.575244556536697e-05, + "loss": 1.4868, + "step": 583 + }, + { + "epoch": 0.49989300235394823, + "grad_norm": 1.6796875, + "learning_rate": 2.5684059353712307e-05, + "loss": 1.3093, + "step": 584 + }, + { + "epoch": 0.5007489835223625, + "grad_norm": 1.921875, + "learning_rate": 2.5615668019233064e-05, + "loss": 1.5308, + "step": 585 + }, + { + "epoch": 0.5007489835223625, + "eval_loss": 1.5403519868850708, + "eval_runtime": 21.3271, + "eval_samples_per_second": 18.287, + "eval_steps_per_second": 18.287, + "step": 585 + }, + { + "epoch": 0.5016049646907768, + "grad_norm": 1.4453125, + "learning_rate": 2.5547272074102374e-05, + "loss": 1.339, + "step": 586 + }, + { + "epoch": 0.5024609458591911, + "grad_norm": 1.5625, + "learning_rate": 2.5478872030527855e-05, + "loss": 1.413, + "step": 587 + }, + { + "epoch": 0.5033169270276054, + "grad_norm": 1.6640625, + "learning_rate": 2.5410468400747854e-05, + "loss": 1.399, + "step": 588 + }, + { + "epoch": 0.5041729081960197, + "grad_norm": 1.546875, + "learning_rate": 2.534206169702757e-05, + "loss": 1.5245, + "step": 589 + }, + { + "epoch": 0.505028889364434, + "grad_norm": 1.7578125, + "learning_rate": 2.5273652431655204e-05, + "loss": 1.418, + "step": 590 + }, + { + "epoch": 0.5058848705328483, + "grad_norm": 1.5078125, + "learning_rate": 2.520524111693814e-05, + "loss": 1.4231, + "step": 591 + }, + { + "epoch": 0.5067408517012626, + "grad_norm": 1.5390625, + "learning_rate": 2.513682826519914e-05, + "loss": 1.3967, + "step": 592 + }, + { + "epoch": 0.5075968328696768, + "grad_norm": 1.453125, + "learning_rate": 2.5068414388772453e-05, + "loss": 1.3799, + "step": 593 + }, + { + "epoch": 0.5084528140380912, + "grad_norm": 1.84375, + "learning_rate": 2.5e-05, + "loss": 1.4701, + "step": 594 + }, + { + "epoch": 0.5093087952065054, + "grad_norm": 1.5078125, + "learning_rate": 2.4931585611227543e-05, + "loss": 1.3946, + "step": 595 + }, + { + "epoch": 0.5101647763749197, + "grad_norm": 1.765625, + "learning_rate": 2.4863171734800865e-05, + "loss": 1.5882, + "step": 596 + }, + { + "epoch": 0.5110207575433341, + "grad_norm": 1.5625, + "learning_rate": 2.479475888306186e-05, + "loss": 1.4909, + "step": 597 + }, + { + "epoch": 0.5118767387117483, + "grad_norm": 1.453125, + "learning_rate": 2.472634756834481e-05, + "loss": 1.2668, + "step": 598 + }, + { + "epoch": 0.5127327198801627, + "grad_norm": 1.5, + "learning_rate": 2.4657938302972437e-05, + "loss": 1.2743, + "step": 599 + }, + { + "epoch": 0.5135887010485769, + "grad_norm": 1.453125, + "learning_rate": 2.458953159925215e-05, + "loss": 1.3327, + "step": 600 + }, + { + "epoch": 0.5144446822169912, + "grad_norm": 1.625, + "learning_rate": 2.4521127969472148e-05, + "loss": 1.5656, + "step": 601 + }, + { + "epoch": 0.5153006633854055, + "grad_norm": 1.546875, + "learning_rate": 2.4452727925897635e-05, + "loss": 1.2883, + "step": 602 + }, + { + "epoch": 0.5161566445538198, + "grad_norm": 1.546875, + "learning_rate": 2.438433198076694e-05, + "loss": 1.4471, + "step": 603 + }, + { + "epoch": 0.5170126257222342, + "grad_norm": 1.5703125, + "learning_rate": 2.4315940646287695e-05, + "loss": 1.3376, + "step": 604 + }, + { + "epoch": 0.5178686068906484, + "grad_norm": 1.6796875, + "learning_rate": 2.424755443463303e-05, + "loss": 1.4541, + "step": 605 + }, + { + "epoch": 0.5187245880590627, + "grad_norm": 1.4453125, + "learning_rate": 2.4179173857937683e-05, + "loss": 1.2946, + "step": 606 + }, + { + "epoch": 0.519580569227477, + "grad_norm": 1.6484375, + "learning_rate": 2.411079942829421e-05, + "loss": 1.4473, + "step": 607 + }, + { + "epoch": 0.5204365503958913, + "grad_norm": 1.7578125, + "learning_rate": 2.4042431657749117e-05, + "loss": 1.5921, + "step": 608 + }, + { + "epoch": 0.5212925315643056, + "grad_norm": 1.59375, + "learning_rate": 2.3974071058299064e-05, + "loss": 1.3892, + "step": 609 + }, + { + "epoch": 0.5221485127327199, + "grad_norm": 1.546875, + "learning_rate": 2.390571814188698e-05, + "loss": 1.4598, + "step": 610 + }, + { + "epoch": 0.5230044939011341, + "grad_norm": 1.625, + "learning_rate": 2.383737342039827e-05, + "loss": 1.4553, + "step": 611 + }, + { + "epoch": 0.5238604750695485, + "grad_norm": 1.8046875, + "learning_rate": 2.3769037405656987e-05, + "loss": 1.5219, + "step": 612 + }, + { + "epoch": 0.5247164562379628, + "grad_norm": 1.4765625, + "learning_rate": 2.3700710609421946e-05, + "loss": 1.255, + "step": 613 + }, + { + "epoch": 0.525572437406377, + "grad_norm": 1.6875, + "learning_rate": 2.3632393543382954e-05, + "loss": 1.4204, + "step": 614 + }, + { + "epoch": 0.5264284185747914, + "grad_norm": 1.546875, + "learning_rate": 2.356408671915692e-05, + "loss": 1.4509, + "step": 615 + }, + { + "epoch": 0.5272843997432056, + "grad_norm": 1.4921875, + "learning_rate": 2.3495790648284092e-05, + "loss": 1.3018, + "step": 616 + }, + { + "epoch": 0.52814038091162, + "grad_norm": 1.546875, + "learning_rate": 2.3427505842224154e-05, + "loss": 1.5016, + "step": 617 + }, + { + "epoch": 0.5289963620800342, + "grad_norm": 1.5234375, + "learning_rate": 2.3359232812352443e-05, + "loss": 1.3029, + "step": 618 + }, + { + "epoch": 0.5298523432484485, + "grad_norm": 1.7421875, + "learning_rate": 2.3290972069956117e-05, + "loss": 1.4533, + "step": 619 + }, + { + "epoch": 0.5307083244168628, + "grad_norm": 1.59375, + "learning_rate": 2.3222724126230294e-05, + "loss": 1.36, + "step": 620 + }, + { + "epoch": 0.5315643055852771, + "grad_norm": 1.875, + "learning_rate": 2.315448949227426e-05, + "loss": 1.5453, + "step": 621 + }, + { + "epoch": 0.5324202867536915, + "grad_norm": 1.6328125, + "learning_rate": 2.3086268679087607e-05, + "loss": 1.3677, + "step": 622 + }, + { + "epoch": 0.5332762679221057, + "grad_norm": 1.7578125, + "learning_rate": 2.3018062197566462e-05, + "loss": 1.5106, + "step": 623 + }, + { + "epoch": 0.53413224909052, + "grad_norm": 1.765625, + "learning_rate": 2.294987055849957e-05, + "loss": 1.6279, + "step": 624 + }, + { + "epoch": 0.5349882302589343, + "grad_norm": 1.5234375, + "learning_rate": 2.288169427256458e-05, + "loss": 1.4241, + "step": 625 + }, + { + "epoch": 0.5358442114273486, + "grad_norm": 1.484375, + "learning_rate": 2.281353385032412e-05, + "loss": 1.4114, + "step": 626 + }, + { + "epoch": 0.5367001925957628, + "grad_norm": 1.5703125, + "learning_rate": 2.2745389802222032e-05, + "loss": 1.4671, + "step": 627 + }, + { + "epoch": 0.5375561737641772, + "grad_norm": 1.6875, + "learning_rate": 2.2677262638579555e-05, + "loss": 1.5669, + "step": 628 + }, + { + "epoch": 0.5384121549325915, + "grad_norm": 1.5703125, + "learning_rate": 2.2609152869591446e-05, + "loss": 1.4634, + "step": 629 + }, + { + "epoch": 0.5392681361010058, + "grad_norm": 1.671875, + "learning_rate": 2.2541061005322227e-05, + "loss": 1.4757, + "step": 630 + }, + { + "epoch": 0.5401241172694201, + "grad_norm": 1.6484375, + "learning_rate": 2.2472987555702302e-05, + "loss": 1.504, + "step": 631 + }, + { + "epoch": 0.5409800984378343, + "grad_norm": 1.6875, + "learning_rate": 2.240493303052421e-05, + "loss": 1.5711, + "step": 632 + }, + { + "epoch": 0.5418360796062487, + "grad_norm": 1.390625, + "learning_rate": 2.2336897939438734e-05, + "loss": 1.3183, + "step": 633 + }, + { + "epoch": 0.5426920607746629, + "grad_norm": 1.5078125, + "learning_rate": 2.2268882791951127e-05, + "loss": 1.4867, + "step": 634 + }, + { + "epoch": 0.5435480419430773, + "grad_norm": 1.390625, + "learning_rate": 2.2200888097417307e-05, + "loss": 1.2882, + "step": 635 + }, + { + "epoch": 0.5444040231114915, + "grad_norm": 1.609375, + "learning_rate": 2.2132914365039993e-05, + "loss": 1.4977, + "step": 636 + }, + { + "epoch": 0.5452600042799058, + "grad_norm": 1.4296875, + "learning_rate": 2.2064962103864937e-05, + "loss": 1.4808, + "step": 637 + }, + { + "epoch": 0.5461159854483202, + "grad_norm": 1.6484375, + "learning_rate": 2.1997031822777093e-05, + "loss": 1.3365, + "step": 638 + }, + { + "epoch": 0.5469719666167344, + "grad_norm": 1.375, + "learning_rate": 2.1929124030496817e-05, + "loss": 1.3079, + "step": 639 + }, + { + "epoch": 0.5478279477851488, + "grad_norm": 1.5546875, + "learning_rate": 2.186123923557603e-05, + "loss": 1.4077, + "step": 640 + }, + { + "epoch": 0.548683928953563, + "grad_norm": 1.578125, + "learning_rate": 2.1793377946394446e-05, + "loss": 1.5337, + "step": 641 + }, + { + "epoch": 0.5495399101219773, + "grad_norm": 1.5859375, + "learning_rate": 2.1725540671155758e-05, + "loss": 1.3779, + "step": 642 + }, + { + "epoch": 0.5503958912903916, + "grad_norm": 1.6484375, + "learning_rate": 2.165772791788379e-05, + "loss": 1.2943, + "step": 643 + }, + { + "epoch": 0.5512518724588059, + "grad_norm": 1.515625, + "learning_rate": 2.1589940194418748e-05, + "loss": 1.4558, + "step": 644 + }, + { + "epoch": 0.5521078536272201, + "grad_norm": 1.4921875, + "learning_rate": 2.1522178008413377e-05, + "loss": 1.3845, + "step": 645 + }, + { + "epoch": 0.5529638347956345, + "grad_norm": 1.6171875, + "learning_rate": 2.1454441867329203e-05, + "loss": 1.4121, + "step": 646 + }, + { + "epoch": 0.5538198159640488, + "grad_norm": 2.625, + "learning_rate": 2.1386732278432676e-05, + "loss": 1.3775, + "step": 647 + }, + { + "epoch": 0.5546757971324631, + "grad_norm": 1.734375, + "learning_rate": 2.1319049748791418e-05, + "loss": 1.3581, + "step": 648 + }, + { + "epoch": 0.5555317783008774, + "grad_norm": 1.8125, + "learning_rate": 2.1251394785270386e-05, + "loss": 1.5385, + "step": 649 + }, + { + "epoch": 0.5563877594692916, + "grad_norm": 1.578125, + "learning_rate": 2.1183767894528136e-05, + "loss": 1.4733, + "step": 650 + }, + { + "epoch": 0.557243740637706, + "grad_norm": 1.4921875, + "learning_rate": 2.1116169583012965e-05, + "loss": 1.3986, + "step": 651 + }, + { + "epoch": 0.5580997218061202, + "grad_norm": 1.4140625, + "learning_rate": 2.1048600356959132e-05, + "loss": 1.3114, + "step": 652 + }, + { + "epoch": 0.5589557029745346, + "grad_norm": 1.75, + "learning_rate": 2.0981060722383127e-05, + "loss": 1.33, + "step": 653 + }, + { + "epoch": 0.5598116841429489, + "grad_norm": 1.546875, + "learning_rate": 2.0913551185079764e-05, + "loss": 1.4388, + "step": 654 + }, + { + "epoch": 0.5606676653113631, + "grad_norm": 1.8046875, + "learning_rate": 2.084607225061853e-05, + "loss": 1.6617, + "step": 655 + }, + { + "epoch": 0.5615236464797775, + "grad_norm": 1.53125, + "learning_rate": 2.077862442433968e-05, + "loss": 1.3882, + "step": 656 + }, + { + "epoch": 0.5623796276481917, + "grad_norm": 1.5546875, + "learning_rate": 2.071120821135054e-05, + "loss": 1.2734, + "step": 657 + }, + { + "epoch": 0.5632356088166061, + "grad_norm": 1.4765625, + "learning_rate": 2.064382411652168e-05, + "loss": 1.4545, + "step": 658 + }, + { + "epoch": 0.5640915899850203, + "grad_norm": 1.6328125, + "learning_rate": 2.057647264448313e-05, + "loss": 1.4795, + "step": 659 + }, + { + "epoch": 0.5649475711534346, + "grad_norm": 1.65625, + "learning_rate": 2.050915429962062e-05, + "loss": 1.5849, + "step": 660 + }, + { + "epoch": 0.5658035523218489, + "grad_norm": 1.6796875, + "learning_rate": 2.0441869586071783e-05, + "loss": 1.5012, + "step": 661 + }, + { + "epoch": 0.5666595334902632, + "grad_norm": 1.6875, + "learning_rate": 2.037461900772242e-05, + "loss": 1.5541, + "step": 662 + }, + { + "epoch": 0.5675155146586776, + "grad_norm": 1.65625, + "learning_rate": 2.0307403068202676e-05, + "loss": 1.4741, + "step": 663 + }, + { + "epoch": 0.5683714958270918, + "grad_norm": 1.546875, + "learning_rate": 2.0240222270883288e-05, + "loss": 1.5354, + "step": 664 + }, + { + "epoch": 0.5692274769955061, + "grad_norm": 1.546875, + "learning_rate": 2.0173077118871844e-05, + "loss": 1.4909, + "step": 665 + }, + { + "epoch": 0.5700834581639204, + "grad_norm": 1.5390625, + "learning_rate": 2.0105968115008954e-05, + "loss": 1.5927, + "step": 666 + }, + { + "epoch": 0.5709394393323347, + "grad_norm": 1.6171875, + "learning_rate": 2.003889576186455e-05, + "loss": 1.4568, + "step": 667 + }, + { + "epoch": 0.5717954205007489, + "grad_norm": 1.8046875, + "learning_rate": 1.997186056173406e-05, + "loss": 1.4905, + "step": 668 + }, + { + "epoch": 0.5726514016691633, + "grad_norm": 1.5390625, + "learning_rate": 1.9904863016634723e-05, + "loss": 1.5317, + "step": 669 + }, + { + "epoch": 0.5735073828375776, + "grad_norm": 1.65625, + "learning_rate": 1.983790362830174e-05, + "loss": 1.4985, + "step": 670 + }, + { + "epoch": 0.5743633640059919, + "grad_norm": 1.421875, + "learning_rate": 1.977098289818459e-05, + "loss": 1.4502, + "step": 671 + }, + { + "epoch": 0.5752193451744062, + "grad_norm": 1.5625, + "learning_rate": 1.970410132744322e-05, + "loss": 1.3937, + "step": 672 + }, + { + "epoch": 0.5760753263428204, + "grad_norm": 1.484375, + "learning_rate": 1.9637259416944352e-05, + "loss": 1.3821, + "step": 673 + }, + { + "epoch": 0.5769313075112348, + "grad_norm": 1.5234375, + "learning_rate": 1.9570457667257686e-05, + "loss": 1.4048, + "step": 674 + }, + { + "epoch": 0.577787288679649, + "grad_norm": 1.5703125, + "learning_rate": 1.950369657865213e-05, + "loss": 1.3841, + "step": 675 + }, + { + "epoch": 0.5786432698480634, + "grad_norm": 1.53125, + "learning_rate": 1.9436976651092144e-05, + "loss": 1.4192, + "step": 676 + }, + { + "epoch": 0.5794992510164776, + "grad_norm": 1.5, + "learning_rate": 1.937029838423389e-05, + "loss": 1.2927, + "step": 677 + }, + { + "epoch": 0.5803552321848919, + "grad_norm": 1.515625, + "learning_rate": 1.9303662277421568e-05, + "loss": 1.5403, + "step": 678 + }, + { + "epoch": 0.5812112133533063, + "grad_norm": 1.390625, + "learning_rate": 1.923706882968362e-05, + "loss": 1.3693, + "step": 679 + }, + { + "epoch": 0.5820671945217205, + "grad_norm": 1.4453125, + "learning_rate": 1.917051853972906e-05, + "loss": 1.3371, + "step": 680 + }, + { + "epoch": 0.5829231756901349, + "grad_norm": 1.5703125, + "learning_rate": 1.910401190594367e-05, + "loss": 1.4528, + "step": 681 + }, + { + "epoch": 0.5837791568585491, + "grad_norm": 1.5703125, + "learning_rate": 1.9037549426386302e-05, + "loss": 1.4057, + "step": 682 + }, + { + "epoch": 0.5846351380269634, + "grad_norm": 1.78125, + "learning_rate": 1.8971131598785148e-05, + "loss": 1.4727, + "step": 683 + }, + { + "epoch": 0.5854911191953777, + "grad_norm": 1.5625, + "learning_rate": 1.8904758920533988e-05, + "loss": 1.4969, + "step": 684 + }, + { + "epoch": 0.586347100363792, + "grad_norm": 1.4140625, + "learning_rate": 1.8838431888688527e-05, + "loss": 1.3984, + "step": 685 + }, + { + "epoch": 0.5872030815322062, + "grad_norm": 1.703125, + "learning_rate": 1.8772150999962587e-05, + "loss": 1.4929, + "step": 686 + }, + { + "epoch": 0.5880590627006206, + "grad_norm": 1.46875, + "learning_rate": 1.870591675072446e-05, + "loss": 1.3202, + "step": 687 + }, + { + "epoch": 0.5889150438690349, + "grad_norm": 1.5, + "learning_rate": 1.863972963699314e-05, + "loss": 1.5529, + "step": 688 + }, + { + "epoch": 0.5897710250374492, + "grad_norm": 1.6640625, + "learning_rate": 1.857359015443465e-05, + "loss": 1.4185, + "step": 689 + }, + { + "epoch": 0.5906270062058635, + "grad_norm": 1.5234375, + "learning_rate": 1.8507498798358297e-05, + "loss": 1.4122, + "step": 690 + }, + { + "epoch": 0.5914829873742777, + "grad_norm": 1.8203125, + "learning_rate": 1.844145606371297e-05, + "loss": 1.3178, + "step": 691 + }, + { + "epoch": 0.5923389685426921, + "grad_norm": 1.640625, + "learning_rate": 1.8375462445083464e-05, + "loss": 1.4875, + "step": 692 + }, + { + "epoch": 0.5931949497111063, + "grad_norm": 1.5390625, + "learning_rate": 1.830951843668672e-05, + "loss": 1.443, + "step": 693 + }, + { + "epoch": 0.5940509308795207, + "grad_norm": 1.625, + "learning_rate": 1.8243624532368174e-05, + "loss": 1.4547, + "step": 694 + }, + { + "epoch": 0.594906912047935, + "grad_norm": 1.453125, + "learning_rate": 1.8177781225598032e-05, + "loss": 1.3457, + "step": 695 + }, + { + "epoch": 0.5957628932163492, + "grad_norm": 1.6171875, + "learning_rate": 1.811198900946759e-05, + "loss": 1.4981, + "step": 696 + }, + { + "epoch": 0.5966188743847636, + "grad_norm": 2.359375, + "learning_rate": 1.804624837668553e-05, + "loss": 1.5599, + "step": 697 + }, + { + "epoch": 0.5974748555531778, + "grad_norm": 1.4921875, + "learning_rate": 1.7980559819574223e-05, + "loss": 1.3979, + "step": 698 + }, + { + "epoch": 0.5983308367215922, + "grad_norm": 1.5625, + "learning_rate": 1.7914923830066074e-05, + "loss": 1.4061, + "step": 699 + }, + { + "epoch": 0.5991868178900064, + "grad_norm": 1.6015625, + "learning_rate": 1.784934089969979e-05, + "loss": 1.5827, + "step": 700 + }, + { + "epoch": 0.6000427990584207, + "grad_norm": 1.7734375, + "learning_rate": 1.7783811519616757e-05, + "loss": 1.4095, + "step": 701 + }, + { + "epoch": 0.600898780226835, + "grad_norm": 1.5, + "learning_rate": 1.7718336180557288e-05, + "loss": 1.3583, + "step": 702 + }, + { + "epoch": 0.600898780226835, + "eval_loss": 1.5267729759216309, + "eval_runtime": 21.3333, + "eval_samples_per_second": 18.281, + "eval_steps_per_second": 18.281, + "step": 702 + }, + { + "epoch": 0.6017547613952493, + "grad_norm": 1.4296875, + "learning_rate": 1.7652915372857035e-05, + "loss": 1.4024, + "step": 703 + }, + { + "epoch": 0.6026107425636636, + "grad_norm": 1.4375, + "learning_rate": 1.7587549586443252e-05, + "loss": 1.349, + "step": 704 + }, + { + "epoch": 0.6034667237320779, + "grad_norm": 1.6484375, + "learning_rate": 1.7522239310831134e-05, + "loss": 1.5471, + "step": 705 + }, + { + "epoch": 0.6043227049004922, + "grad_norm": 1.828125, + "learning_rate": 1.7456985035120193e-05, + "loss": 1.4457, + "step": 706 + }, + { + "epoch": 0.6051786860689065, + "grad_norm": 1.671875, + "learning_rate": 1.7391787247990538e-05, + "loss": 1.2629, + "step": 707 + }, + { + "epoch": 0.6060346672373208, + "grad_norm": 1.5546875, + "learning_rate": 1.732664643769926e-05, + "loss": 1.4819, + "step": 708 + }, + { + "epoch": 0.606890648405735, + "grad_norm": 1.9296875, + "learning_rate": 1.726156309207674e-05, + "loss": 1.4687, + "step": 709 + }, + { + "epoch": 0.6077466295741494, + "grad_norm": 1.5078125, + "learning_rate": 1.7196537698523052e-05, + "loss": 1.4168, + "step": 710 + }, + { + "epoch": 0.6086026107425636, + "grad_norm": 1.7734375, + "learning_rate": 1.7131570744004215e-05, + "loss": 1.4856, + "step": 711 + }, + { + "epoch": 0.609458591910978, + "grad_norm": 1.5078125, + "learning_rate": 1.7066662715048666e-05, + "loss": 1.4287, + "step": 712 + }, + { + "epoch": 0.6103145730793923, + "grad_norm": 1.578125, + "learning_rate": 1.7001814097743528e-05, + "loss": 1.5557, + "step": 713 + }, + { + "epoch": 0.6111705542478065, + "grad_norm": 1.6015625, + "learning_rate": 1.693702537773099e-05, + "loss": 1.4353, + "step": 714 + }, + { + "epoch": 0.6120265354162209, + "grad_norm": 1.6796875, + "learning_rate": 1.687229704020471e-05, + "loss": 1.5126, + "step": 715 + }, + { + "epoch": 0.6128825165846351, + "grad_norm": 1.6171875, + "learning_rate": 1.6807629569906112e-05, + "loss": 1.479, + "step": 716 + }, + { + "epoch": 0.6137384977530495, + "grad_norm": 1.78125, + "learning_rate": 1.6743023451120832e-05, + "loss": 1.4706, + "step": 717 + }, + { + "epoch": 0.6145944789214637, + "grad_norm": 1.609375, + "learning_rate": 1.6678479167675006e-05, + "loss": 1.6114, + "step": 718 + }, + { + "epoch": 0.615450460089878, + "grad_norm": 1.625, + "learning_rate": 1.6613997202931746e-05, + "loss": 1.4916, + "step": 719 + }, + { + "epoch": 0.6163064412582924, + "grad_norm": 1.5703125, + "learning_rate": 1.6549578039787436e-05, + "loss": 1.3918, + "step": 720 + }, + { + "epoch": 0.6171624224267066, + "grad_norm": 1.4296875, + "learning_rate": 1.6485222160668146e-05, + "loss": 1.3791, + "step": 721 + }, + { + "epoch": 0.618018403595121, + "grad_norm": 1.5390625, + "learning_rate": 1.642093004752605e-05, + "loss": 1.5026, + "step": 722 + }, + { + "epoch": 0.6188743847635352, + "grad_norm": 1.640625, + "learning_rate": 1.635670218183575e-05, + "loss": 1.4059, + "step": 723 + }, + { + "epoch": 0.6197303659319495, + "grad_norm": 1.4765625, + "learning_rate": 1.629253904459073e-05, + "loss": 1.4202, + "step": 724 + }, + { + "epoch": 0.6205863471003638, + "grad_norm": 1.5, + "learning_rate": 1.622844111629972e-05, + "loss": 1.4348, + "step": 725 + }, + { + "epoch": 0.6214423282687781, + "grad_norm": 1.4921875, + "learning_rate": 1.616440887698313e-05, + "loss": 1.4223, + "step": 726 + }, + { + "epoch": 0.6222983094371923, + "grad_norm": 1.6015625, + "learning_rate": 1.6100442806169422e-05, + "loss": 1.4637, + "step": 727 + }, + { + "epoch": 0.6231542906056067, + "grad_norm": 1.5390625, + "learning_rate": 1.6036543382891512e-05, + "loss": 1.3871, + "step": 728 + }, + { + "epoch": 0.624010271774021, + "grad_norm": 1.5, + "learning_rate": 1.597271108568324e-05, + "loss": 1.5021, + "step": 729 + }, + { + "epoch": 0.6248662529424353, + "grad_norm": 1.6953125, + "learning_rate": 1.5908946392575714e-05, + "loss": 1.628, + "step": 730 + }, + { + "epoch": 0.6257222341108496, + "grad_norm": 1.546875, + "learning_rate": 1.5845249781093786e-05, + "loss": 1.4596, + "step": 731 + }, + { + "epoch": 0.6265782152792638, + "grad_norm": 1.46875, + "learning_rate": 1.578162172825244e-05, + "loss": 1.1683, + "step": 732 + }, + { + "epoch": 0.6274341964476782, + "grad_norm": 1.7265625, + "learning_rate": 1.5718062710553253e-05, + "loss": 1.3545, + "step": 733 + }, + { + "epoch": 0.6282901776160924, + "grad_norm": 1.5546875, + "learning_rate": 1.5654573203980784e-05, + "loss": 1.3087, + "step": 734 + }, + { + "epoch": 0.6291461587845067, + "grad_norm": 1.515625, + "learning_rate": 1.5591153683999043e-05, + "loss": 1.3387, + "step": 735 + }, + { + "epoch": 0.630002139952921, + "grad_norm": 1.4296875, + "learning_rate": 1.5527804625547938e-05, + "loss": 1.3403, + "step": 736 + }, + { + "epoch": 0.6308581211213353, + "grad_norm": 1.421875, + "learning_rate": 1.5464526503039666e-05, + "loss": 1.4556, + "step": 737 + }, + { + "epoch": 0.6317141022897497, + "grad_norm": 2.109375, + "learning_rate": 1.540131979035523e-05, + "loss": 1.3776, + "step": 738 + }, + { + "epoch": 0.6325700834581639, + "grad_norm": 1.5703125, + "learning_rate": 1.5338184960840824e-05, + "loss": 1.3059, + "step": 739 + }, + { + "epoch": 0.6334260646265782, + "grad_norm": 1.703125, + "learning_rate": 1.5275122487304335e-05, + "loss": 1.5742, + "step": 740 + }, + { + "epoch": 0.6342820457949925, + "grad_norm": 1.6015625, + "learning_rate": 1.5212132842011779e-05, + "loss": 1.4275, + "step": 741 + }, + { + "epoch": 0.6351380269634068, + "grad_norm": 1.5234375, + "learning_rate": 1.5149216496683787e-05, + "loss": 1.489, + "step": 742 + }, + { + "epoch": 0.635994008131821, + "grad_norm": 1.4609375, + "learning_rate": 1.5086373922492048e-05, + "loss": 1.4186, + "step": 743 + }, + { + "epoch": 0.6368499893002354, + "grad_norm": 1.7265625, + "learning_rate": 1.5023605590055767e-05, + "loss": 1.4414, + "step": 744 + }, + { + "epoch": 0.6377059704686497, + "grad_norm": 1.65625, + "learning_rate": 1.4960911969438213e-05, + "loss": 1.3893, + "step": 745 + }, + { + "epoch": 0.638561951637064, + "grad_norm": 1.6640625, + "learning_rate": 1.4898293530143095e-05, + "loss": 1.4831, + "step": 746 + }, + { + "epoch": 0.6394179328054783, + "grad_norm": 1.484375, + "learning_rate": 1.4835750741111138e-05, + "loss": 1.4675, + "step": 747 + }, + { + "epoch": 0.6402739139738926, + "grad_norm": 1.65625, + "learning_rate": 1.4773284070716503e-05, + "loss": 1.4084, + "step": 748 + }, + { + "epoch": 0.6411298951423069, + "grad_norm": 1.484375, + "learning_rate": 1.4710893986763347e-05, + "loss": 1.4119, + "step": 749 + }, + { + "epoch": 0.6419858763107211, + "grad_norm": 1.484375, + "learning_rate": 1.464858095648224e-05, + "loss": 1.464, + "step": 750 + }, + { + "epoch": 0.6428418574791355, + "grad_norm": 1.6796875, + "learning_rate": 1.4586345446526733e-05, + "loss": 1.3932, + "step": 751 + }, + { + "epoch": 0.6436978386475497, + "grad_norm": 1.859375, + "learning_rate": 1.4524187922969839e-05, + "loss": 1.4852, + "step": 752 + }, + { + "epoch": 0.644553819815964, + "grad_norm": 1.6875, + "learning_rate": 1.4462108851300523e-05, + "loss": 1.3278, + "step": 753 + }, + { + "epoch": 0.6454098009843784, + "grad_norm": 1.359375, + "learning_rate": 1.4400108696420264e-05, + "loss": 1.3441, + "step": 754 + }, + { + "epoch": 0.6462657821527926, + "grad_norm": 1.5, + "learning_rate": 1.4338187922639507e-05, + "loss": 1.3425, + "step": 755 + }, + { + "epoch": 0.647121763321207, + "grad_norm": 1.5703125, + "learning_rate": 1.4276346993674266e-05, + "loss": 1.381, + "step": 756 + }, + { + "epoch": 0.6479777444896212, + "grad_norm": 1.6484375, + "learning_rate": 1.4214586372642563e-05, + "loss": 1.4587, + "step": 757 + }, + { + "epoch": 0.6488337256580355, + "grad_norm": 1.53125, + "learning_rate": 1.4152906522061048e-05, + "loss": 1.3396, + "step": 758 + }, + { + "epoch": 0.6496897068264498, + "grad_norm": 1.7578125, + "learning_rate": 1.4091307903841466e-05, + "loss": 1.3532, + "step": 759 + }, + { + "epoch": 0.6505456879948641, + "grad_norm": 1.5, + "learning_rate": 1.4029790979287216e-05, + "loss": 1.3586, + "step": 760 + }, + { + "epoch": 0.6514016691632785, + "grad_norm": 1.5390625, + "learning_rate": 1.3968356209089944e-05, + "loss": 1.4067, + "step": 761 + }, + { + "epoch": 0.6522576503316927, + "grad_norm": 1.5234375, + "learning_rate": 1.3907004053326006e-05, + "loss": 1.4696, + "step": 762 + }, + { + "epoch": 0.653113631500107, + "grad_norm": 1.46875, + "learning_rate": 1.3845734971453114e-05, + "loss": 1.4284, + "step": 763 + }, + { + "epoch": 0.6539696126685213, + "grad_norm": 1.46875, + "learning_rate": 1.3784549422306808e-05, + "loss": 1.4767, + "step": 764 + }, + { + "epoch": 0.6548255938369356, + "grad_norm": 1.7734375, + "learning_rate": 1.3723447864097105e-05, + "loss": 1.579, + "step": 765 + }, + { + "epoch": 0.6556815750053498, + "grad_norm": 1.453125, + "learning_rate": 1.3662430754405004e-05, + "loss": 1.3239, + "step": 766 + }, + { + "epoch": 0.6565375561737642, + "grad_norm": 1.4296875, + "learning_rate": 1.360149855017906e-05, + "loss": 1.1992, + "step": 767 + }, + { + "epoch": 0.6573935373421784, + "grad_norm": 1.5546875, + "learning_rate": 1.3540651707732035e-05, + "loss": 1.354, + "step": 768 + }, + { + "epoch": 0.6582495185105928, + "grad_norm": 1.4765625, + "learning_rate": 1.3479890682737379e-05, + "loss": 1.4431, + "step": 769 + }, + { + "epoch": 0.6591054996790071, + "grad_norm": 1.5703125, + "learning_rate": 1.3419215930225899e-05, + "loss": 1.3813, + "step": 770 + }, + { + "epoch": 0.6599614808474213, + "grad_norm": 1.5390625, + "learning_rate": 1.3358627904582307e-05, + "loss": 1.3639, + "step": 771 + }, + { + "epoch": 0.6608174620158357, + "grad_norm": 1.609375, + "learning_rate": 1.3298127059541828e-05, + "loss": 1.3568, + "step": 772 + }, + { + "epoch": 0.6616734431842499, + "grad_norm": 1.6328125, + "learning_rate": 1.3237713848186797e-05, + "loss": 1.4048, + "step": 773 + }, + { + "epoch": 0.6625294243526643, + "grad_norm": 1.546875, + "learning_rate": 1.317738872294329e-05, + "loss": 1.4503, + "step": 774 + }, + { + "epoch": 0.6633854055210785, + "grad_norm": 1.6640625, + "learning_rate": 1.311715213557772e-05, + "loss": 1.3446, + "step": 775 + }, + { + "epoch": 0.6642413866894928, + "grad_norm": 1.6796875, + "learning_rate": 1.3057004537193423e-05, + "loss": 1.2524, + "step": 776 + }, + { + "epoch": 0.6650973678579071, + "grad_norm": 1.5625, + "learning_rate": 1.2996946378227352e-05, + "loss": 1.5227, + "step": 777 + }, + { + "epoch": 0.6659533490263214, + "grad_norm": 1.515625, + "learning_rate": 1.2936978108446624e-05, + "loss": 1.4289, + "step": 778 + }, + { + "epoch": 0.6668093301947358, + "grad_norm": 1.75, + "learning_rate": 1.28771001769452e-05, + "loss": 1.6197, + "step": 779 + }, + { + "epoch": 0.66766531136315, + "grad_norm": 1.5078125, + "learning_rate": 1.2817313032140505e-05, + "loss": 1.4775, + "step": 780 + }, + { + "epoch": 0.6685212925315643, + "grad_norm": 1.4609375, + "learning_rate": 1.2757617121770093e-05, + "loss": 1.4731, + "step": 781 + }, + { + "epoch": 0.6693772736999786, + "grad_norm": 1.5703125, + "learning_rate": 1.2698012892888272e-05, + "loss": 1.4356, + "step": 782 + }, + { + "epoch": 0.6702332548683929, + "grad_norm": 1.453125, + "learning_rate": 1.263850079186274e-05, + "loss": 1.3329, + "step": 783 + }, + { + "epoch": 0.6710892360368071, + "grad_norm": 1.4453125, + "learning_rate": 1.257908126437129e-05, + "loss": 1.4069, + "step": 784 + }, + { + "epoch": 0.6719452172052215, + "grad_norm": 1.734375, + "learning_rate": 1.2519754755398422e-05, + "loss": 1.501, + "step": 785 + }, + { + "epoch": 0.6728011983736358, + "grad_norm": 1.7265625, + "learning_rate": 1.2460521709232043e-05, + "loss": 1.5482, + "step": 786 + }, + { + "epoch": 0.6736571795420501, + "grad_norm": 1.609375, + "learning_rate": 1.2401382569460119e-05, + "loss": 1.3473, + "step": 787 + }, + { + "epoch": 0.6745131607104644, + "grad_norm": 1.5546875, + "learning_rate": 1.2342337778967384e-05, + "loss": 1.4373, + "step": 788 + }, + { + "epoch": 0.6753691418788786, + "grad_norm": 1.4296875, + "learning_rate": 1.2283387779932005e-05, + "loss": 1.4588, + "step": 789 + }, + { + "epoch": 0.676225123047293, + "grad_norm": 1.6171875, + "learning_rate": 1.2224533013822238e-05, + "loss": 1.2549, + "step": 790 + }, + { + "epoch": 0.6770811042157072, + "grad_norm": 1.546875, + "learning_rate": 1.216577392139319e-05, + "loss": 1.4916, + "step": 791 + }, + { + "epoch": 0.6779370853841216, + "grad_norm": 1.5390625, + "learning_rate": 1.2107110942683459e-05, + "loss": 1.4571, + "step": 792 + }, + { + "epoch": 0.6787930665525358, + "grad_norm": 1.6953125, + "learning_rate": 1.2048544517011862e-05, + "loss": 1.4943, + "step": 793 + }, + { + "epoch": 0.6796490477209501, + "grad_norm": 1.5, + "learning_rate": 1.1990075082974139e-05, + "loss": 1.3433, + "step": 794 + }, + { + "epoch": 0.6805050288893645, + "grad_norm": 1.6484375, + "learning_rate": 1.1931703078439704e-05, + "loss": 1.5043, + "step": 795 + }, + { + "epoch": 0.6813610100577787, + "grad_norm": 1.7890625, + "learning_rate": 1.1873428940548292e-05, + "loss": 1.5344, + "step": 796 + }, + { + "epoch": 0.6822169912261931, + "grad_norm": 1.453125, + "learning_rate": 1.181525310570677e-05, + "loss": 1.4948, + "step": 797 + }, + { + "epoch": 0.6830729723946073, + "grad_norm": 1.3984375, + "learning_rate": 1.1757176009585793e-05, + "loss": 1.4303, + "step": 798 + }, + { + "epoch": 0.6839289535630216, + "grad_norm": 1.6875, + "learning_rate": 1.1699198087116589e-05, + "loss": 1.4565, + "step": 799 + }, + { + "epoch": 0.6847849347314359, + "grad_norm": 1.5078125, + "learning_rate": 1.1641319772487699e-05, + "loss": 1.5477, + "step": 800 + }, + { + "epoch": 0.6856409158998502, + "grad_norm": 1.515625, + "learning_rate": 1.158354149914169e-05, + "loss": 1.4628, + "step": 801 + }, + { + "epoch": 0.6864968970682644, + "grad_norm": 1.578125, + "learning_rate": 1.1525863699771966e-05, + "loss": 1.5269, + "step": 802 + }, + { + "epoch": 0.6873528782366788, + "grad_norm": 1.5078125, + "learning_rate": 1.1468286806319462e-05, + "loss": 1.355, + "step": 803 + }, + { + "epoch": 0.6882088594050931, + "grad_norm": 1.671875, + "learning_rate": 1.1410811249969475e-05, + "loss": 1.531, + "step": 804 + }, + { + "epoch": 0.6890648405735074, + "grad_norm": 1.609375, + "learning_rate": 1.1353437461148377e-05, + "loss": 1.596, + "step": 805 + }, + { + "epoch": 0.6899208217419217, + "grad_norm": 1.515625, + "learning_rate": 1.129616586952042e-05, + "loss": 1.2953, + "step": 806 + }, + { + "epoch": 0.6907768029103359, + "grad_norm": 1.4453125, + "learning_rate": 1.1238996903984537e-05, + "loss": 1.2693, + "step": 807 + }, + { + "epoch": 0.6916327840787503, + "grad_norm": 1.5078125, + "learning_rate": 1.1181930992671078e-05, + "loss": 1.176, + "step": 808 + }, + { + "epoch": 0.6924887652471645, + "grad_norm": 1.640625, + "learning_rate": 1.112496856293867e-05, + "loss": 1.3185, + "step": 809 + }, + { + "epoch": 0.6933447464155789, + "grad_norm": 1.46875, + "learning_rate": 1.1068110041370938e-05, + "loss": 1.4027, + "step": 810 + }, + { + "epoch": 0.6942007275839932, + "grad_norm": 1.453125, + "learning_rate": 1.10113558537734e-05, + "loss": 1.3788, + "step": 811 + }, + { + "epoch": 0.6950567087524074, + "grad_norm": 1.6015625, + "learning_rate": 1.0954706425170197e-05, + "loss": 1.4144, + "step": 812 + }, + { + "epoch": 0.6959126899208218, + "grad_norm": 1.625, + "learning_rate": 1.0898162179800947e-05, + "loss": 1.5627, + "step": 813 + }, + { + "epoch": 0.696768671089236, + "grad_norm": 1.703125, + "learning_rate": 1.0841723541117594e-05, + "loss": 1.5203, + "step": 814 + }, + { + "epoch": 0.6976246522576504, + "grad_norm": 1.5625, + "learning_rate": 1.0785390931781165e-05, + "loss": 1.5606, + "step": 815 + }, + { + "epoch": 0.6984806334260646, + "grad_norm": 1.75, + "learning_rate": 1.0729164773658693e-05, + "loss": 1.399, + "step": 816 + }, + { + "epoch": 0.6993366145944789, + "grad_norm": 1.5546875, + "learning_rate": 1.0673045487819975e-05, + "loss": 1.3372, + "step": 817 + }, + { + "epoch": 0.7001925957628932, + "grad_norm": 1.5078125, + "learning_rate": 1.0617033494534486e-05, + "loss": 1.3698, + "step": 818 + }, + { + "epoch": 0.7010485769313075, + "grad_norm": 1.46875, + "learning_rate": 1.0561129213268187e-05, + "loss": 1.4297, + "step": 819 + }, + { + "epoch": 0.7010485769313075, + "eval_loss": 1.5197569131851196, + "eval_runtime": 21.3185, + "eval_samples_per_second": 18.294, + "eval_steps_per_second": 18.294, + "step": 819 + }, + { + "epoch": 0.7019045580997219, + "grad_norm": 1.5390625, + "learning_rate": 1.0505333062680383e-05, + "loss": 1.4227, + "step": 820 + }, + { + "epoch": 0.7027605392681361, + "grad_norm": 1.5859375, + "learning_rate": 1.0449645460620649e-05, + "loss": 1.3861, + "step": 821 + }, + { + "epoch": 0.7036165204365504, + "grad_norm": 1.578125, + "learning_rate": 1.0394066824125603e-05, + "loss": 1.4062, + "step": 822 + }, + { + "epoch": 0.7044725016049647, + "grad_norm": 1.5546875, + "learning_rate": 1.0338597569415877e-05, + "loss": 1.3354, + "step": 823 + }, + { + "epoch": 0.705328482773379, + "grad_norm": 1.5625, + "learning_rate": 1.028323811189293e-05, + "loss": 1.4555, + "step": 824 + }, + { + "epoch": 0.7061844639417932, + "grad_norm": 1.484375, + "learning_rate": 1.0227988866135996e-05, + "loss": 1.2839, + "step": 825 + }, + { + "epoch": 0.7070404451102076, + "grad_norm": 1.5234375, + "learning_rate": 1.0172850245898893e-05, + "loss": 1.5304, + "step": 826 + }, + { + "epoch": 0.7078964262786218, + "grad_norm": 1.6875, + "learning_rate": 1.0117822664107038e-05, + "loss": 1.6997, + "step": 827 + }, + { + "epoch": 0.7087524074470362, + "grad_norm": 1.5234375, + "learning_rate": 1.0062906532854283e-05, + "loss": 1.3367, + "step": 828 + }, + { + "epoch": 0.7096083886154505, + "grad_norm": 1.5234375, + "learning_rate": 1.000810226339981e-05, + "loss": 1.3577, + "step": 829 + }, + { + "epoch": 0.7104643697838647, + "grad_norm": 1.7265625, + "learning_rate": 9.95341026616513e-06, + "loss": 1.5752, + "step": 830 + }, + { + "epoch": 0.7113203509522791, + "grad_norm": 1.6640625, + "learning_rate": 9.898830950730933e-06, + "loss": 1.5784, + "step": 831 + }, + { + "epoch": 0.7121763321206933, + "grad_norm": 1.546875, + "learning_rate": 9.844364725834057e-06, + "loss": 1.527, + "step": 832 + }, + { + "epoch": 0.7130323132891077, + "grad_norm": 1.5390625, + "learning_rate": 9.790011999364413e-06, + "loss": 1.5338, + "step": 833 + }, + { + "epoch": 0.7138882944575219, + "grad_norm": 1.6015625, + "learning_rate": 9.735773178361964e-06, + "loss": 1.3994, + "step": 834 + }, + { + "epoch": 0.7147442756259362, + "grad_norm": 1.6953125, + "learning_rate": 9.681648669013619e-06, + "loss": 1.4432, + "step": 835 + }, + { + "epoch": 0.7156002567943506, + "grad_norm": 1.40625, + "learning_rate": 9.627638876650243e-06, + "loss": 1.3741, + "step": 836 + }, + { + "epoch": 0.7164562379627648, + "grad_norm": 1.6171875, + "learning_rate": 9.573744205743612e-06, + "loss": 1.3791, + "step": 837 + }, + { + "epoch": 0.7173122191311792, + "grad_norm": 1.53125, + "learning_rate": 9.519965059903349e-06, + "loss": 1.4102, + "step": 838 + }, + { + "epoch": 0.7181682002995934, + "grad_norm": 1.421875, + "learning_rate": 9.46630184187393e-06, + "loss": 1.3081, + "step": 839 + }, + { + "epoch": 0.7190241814680077, + "grad_norm": 1.4765625, + "learning_rate": 9.412754953531663e-06, + "loss": 1.3067, + "step": 840 + }, + { + "epoch": 0.719880162636422, + "grad_norm": 1.5, + "learning_rate": 9.359324795881708e-06, + "loss": 1.3967, + "step": 841 + }, + { + "epoch": 0.7207361438048363, + "grad_norm": 1.4765625, + "learning_rate": 9.306011769054998e-06, + "loss": 1.3527, + "step": 842 + }, + { + "epoch": 0.7215921249732505, + "grad_norm": 1.5234375, + "learning_rate": 9.252816272305329e-06, + "loss": 1.4973, + "step": 843 + }, + { + "epoch": 0.7224481061416649, + "grad_norm": 1.5, + "learning_rate": 9.199738704006321e-06, + "loss": 1.3451, + "step": 844 + }, + { + "epoch": 0.7233040873100792, + "grad_norm": 1.453125, + "learning_rate": 9.146779461648436e-06, + "loss": 1.3985, + "step": 845 + }, + { + "epoch": 0.7241600684784935, + "grad_norm": 1.5625, + "learning_rate": 9.09393894183601e-06, + "loss": 1.5013, + "step": 846 + }, + { + "epoch": 0.7250160496469078, + "grad_norm": 1.6484375, + "learning_rate": 9.041217540284277e-06, + "loss": 1.4524, + "step": 847 + }, + { + "epoch": 0.725872030815322, + "grad_norm": 1.5703125, + "learning_rate": 8.98861565181644e-06, + "loss": 1.4127, + "step": 848 + }, + { + "epoch": 0.7267280119837364, + "grad_norm": 1.6796875, + "learning_rate": 8.936133670360644e-06, + "loss": 1.5011, + "step": 849 + }, + { + "epoch": 0.7275839931521506, + "grad_norm": 1.59375, + "learning_rate": 8.883771988947099e-06, + "loss": 1.4038, + "step": 850 + }, + { + "epoch": 0.728439974320565, + "grad_norm": 1.578125, + "learning_rate": 8.831530999705104e-06, + "loss": 1.4896, + "step": 851 + }, + { + "epoch": 0.7292959554889793, + "grad_norm": 1.6171875, + "learning_rate": 8.77941109386009e-06, + "loss": 1.3577, + "step": 852 + }, + { + "epoch": 0.7301519366573935, + "grad_norm": 1.5, + "learning_rate": 8.727412661730724e-06, + "loss": 1.3243, + "step": 853 + }, + { + "epoch": 0.7310079178258079, + "grad_norm": 1.6484375, + "learning_rate": 8.675536092725966e-06, + "loss": 1.482, + "step": 854 + }, + { + "epoch": 0.7318638989942221, + "grad_norm": 1.84375, + "learning_rate": 8.623781775342183e-06, + "loss": 1.5252, + "step": 855 + }, + { + "epoch": 0.7327198801626364, + "grad_norm": 1.71875, + "learning_rate": 8.572150097160179e-06, + "loss": 1.5078, + "step": 856 + }, + { + "epoch": 0.7335758613310507, + "grad_norm": 1.4296875, + "learning_rate": 8.520641444842373e-06, + "loss": 1.4596, + "step": 857 + }, + { + "epoch": 0.734431842499465, + "grad_norm": 1.6953125, + "learning_rate": 8.469256204129828e-06, + "loss": 1.4019, + "step": 858 + }, + { + "epoch": 0.7352878236678793, + "grad_norm": 1.5078125, + "learning_rate": 8.417994759839401e-06, + "loss": 1.3862, + "step": 859 + }, + { + "epoch": 0.7361438048362936, + "grad_norm": 1.78125, + "learning_rate": 8.36685749586087e-06, + "loss": 1.5438, + "step": 860 + }, + { + "epoch": 0.736999786004708, + "grad_norm": 1.5234375, + "learning_rate": 8.315844795154024e-06, + "loss": 1.1669, + "step": 861 + }, + { + "epoch": 0.7378557671731222, + "grad_norm": 1.5703125, + "learning_rate": 8.264957039745836e-06, + "loss": 1.2759, + "step": 862 + }, + { + "epoch": 0.7387117483415365, + "grad_norm": 1.578125, + "learning_rate": 8.214194610727557e-06, + "loss": 1.3324, + "step": 863 + }, + { + "epoch": 0.7395677295099508, + "grad_norm": 1.640625, + "learning_rate": 8.163557888251917e-06, + "loss": 1.4036, + "step": 864 + }, + { + "epoch": 0.7404237106783651, + "grad_norm": 1.4296875, + "learning_rate": 8.113047251530215e-06, + "loss": 1.4018, + "step": 865 + }, + { + "epoch": 0.7412796918467793, + "grad_norm": 1.703125, + "learning_rate": 8.062663078829525e-06, + "loss": 1.3247, + "step": 866 + }, + { + "epoch": 0.7421356730151937, + "grad_norm": 1.8671875, + "learning_rate": 8.012405747469862e-06, + "loss": 1.4302, + "step": 867 + }, + { + "epoch": 0.7429916541836079, + "grad_norm": 1.453125, + "learning_rate": 7.96227563382132e-06, + "loss": 1.5155, + "step": 868 + }, + { + "epoch": 0.7438476353520223, + "grad_norm": 1.421875, + "learning_rate": 7.912273113301306e-06, + "loss": 1.4633, + "step": 869 + }, + { + "epoch": 0.7447036165204366, + "grad_norm": 1.46875, + "learning_rate": 7.862398560371664e-06, + "loss": 1.3607, + "step": 870 + }, + { + "epoch": 0.7455595976888508, + "grad_norm": 1.484375, + "learning_rate": 7.812652348535948e-06, + "loss": 1.4725, + "step": 871 + }, + { + "epoch": 0.7464155788572652, + "grad_norm": 1.5859375, + "learning_rate": 7.763034850336553e-06, + "loss": 1.4298, + "step": 872 + }, + { + "epoch": 0.7472715600256794, + "grad_norm": 1.390625, + "learning_rate": 7.713546437351965e-06, + "loss": 1.3457, + "step": 873 + }, + { + "epoch": 0.7481275411940937, + "grad_norm": 1.7890625, + "learning_rate": 7.66418748019396e-06, + "loss": 1.265, + "step": 874 + }, + { + "epoch": 0.748983522362508, + "grad_norm": 1.7890625, + "learning_rate": 7.614958348504853e-06, + "loss": 1.5109, + "step": 875 + }, + { + "epoch": 0.7498395035309223, + "grad_norm": 1.5703125, + "learning_rate": 7.565859410954718e-06, + "loss": 1.457, + "step": 876 + }, + { + "epoch": 0.7506954846993367, + "grad_norm": 1.625, + "learning_rate": 7.516891035238596e-06, + "loss": 1.6443, + "step": 877 + }, + { + "epoch": 0.7515514658677509, + "grad_norm": 1.4375, + "learning_rate": 7.468053588073803e-06, + "loss": 1.4027, + "step": 878 + }, + { + "epoch": 0.7524074470361652, + "grad_norm": 1.4765625, + "learning_rate": 7.4193474351971245e-06, + "loss": 1.4607, + "step": 879 + }, + { + "epoch": 0.7532634282045795, + "grad_norm": 1.578125, + "learning_rate": 7.3707729413621055e-06, + "loss": 1.4838, + "step": 880 + }, + { + "epoch": 0.7541194093729938, + "grad_norm": 1.546875, + "learning_rate": 7.3223304703363135e-06, + "loss": 1.309, + "step": 881 + }, + { + "epoch": 0.754975390541408, + "grad_norm": 1.65625, + "learning_rate": 7.274020384898628e-06, + "loss": 1.3888, + "step": 882 + }, + { + "epoch": 0.7558313717098224, + "grad_norm": 1.5625, + "learning_rate": 7.225843046836514e-06, + "loss": 1.649, + "step": 883 + }, + { + "epoch": 0.7566873528782366, + "grad_norm": 1.640625, + "learning_rate": 7.177798816943287e-06, + "loss": 1.4189, + "step": 884 + }, + { + "epoch": 0.757543334046651, + "grad_norm": 1.515625, + "learning_rate": 7.129888055015455e-06, + "loss": 1.4323, + "step": 885 + }, + { + "epoch": 0.7583993152150653, + "grad_norm": 1.4609375, + "learning_rate": 7.0821111198499795e-06, + "loss": 1.4199, + "step": 886 + }, + { + "epoch": 0.7592552963834795, + "grad_norm": 1.84375, + "learning_rate": 7.034468369241651e-06, + "loss": 1.6481, + "step": 887 + }, + { + "epoch": 0.7601112775518939, + "grad_norm": 1.625, + "learning_rate": 6.986960159980327e-06, + "loss": 1.4306, + "step": 888 + }, + { + "epoch": 0.7609672587203081, + "grad_norm": 1.671875, + "learning_rate": 6.939586847848334e-06, + "loss": 1.4569, + "step": 889 + }, + { + "epoch": 0.7618232398887225, + "grad_norm": 1.5703125, + "learning_rate": 6.892348787617769e-06, + "loss": 1.4033, + "step": 890 + }, + { + "epoch": 0.7626792210571367, + "grad_norm": 1.578125, + "learning_rate": 6.845246333047836e-06, + "loss": 1.5268, + "step": 891 + }, + { + "epoch": 0.763535202225551, + "grad_norm": 1.5, + "learning_rate": 6.79827983688221e-06, + "loss": 1.4712, + "step": 892 + }, + { + "epoch": 0.7643911833939653, + "grad_norm": 1.5546875, + "learning_rate": 6.751449650846389e-06, + "loss": 1.3403, + "step": 893 + }, + { + "epoch": 0.7652471645623796, + "grad_norm": 1.59375, + "learning_rate": 6.704756125645081e-06, + "loss": 1.3823, + "step": 894 + }, + { + "epoch": 0.766103145730794, + "grad_norm": 1.515625, + "learning_rate": 6.658199610959537e-06, + "loss": 1.2359, + "step": 895 + }, + { + "epoch": 0.7669591268992082, + "grad_norm": 1.453125, + "learning_rate": 6.611780455444979e-06, + "loss": 1.3427, + "step": 896 + }, + { + "epoch": 0.7678151080676225, + "grad_norm": 1.6640625, + "learning_rate": 6.565499006727938e-06, + "loss": 1.4077, + "step": 897 + }, + { + "epoch": 0.7686710892360368, + "grad_norm": 1.7421875, + "learning_rate": 6.51935561140371e-06, + "loss": 1.5555, + "step": 898 + }, + { + "epoch": 0.7695270704044511, + "grad_norm": 1.6171875, + "learning_rate": 6.4733506150337016e-06, + "loss": 1.3699, + "step": 899 + }, + { + "epoch": 0.7703830515728654, + "grad_norm": 1.625, + "learning_rate": 6.427484362142877e-06, + "loss": 1.3224, + "step": 900 + }, + { + "epoch": 0.7712390327412797, + "grad_norm": 1.7890625, + "learning_rate": 6.381757196217181e-06, + "loss": 1.5472, + "step": 901 + }, + { + "epoch": 0.772095013909694, + "grad_norm": 1.5234375, + "learning_rate": 6.336169459700933e-06, + "loss": 1.5253, + "step": 902 + }, + { + "epoch": 0.7729509950781083, + "grad_norm": 1.3984375, + "learning_rate": 6.290721493994317e-06, + "loss": 1.3984, + "step": 903 + }, + { + "epoch": 0.7738069762465226, + "grad_norm": 1.4921875, + "learning_rate": 6.245413639450757e-06, + "loss": 1.3538, + "step": 904 + }, + { + "epoch": 0.7746629574149368, + "grad_norm": 1.671875, + "learning_rate": 6.200246235374438e-06, + "loss": 1.5044, + "step": 905 + }, + { + "epoch": 0.7755189385833512, + "grad_norm": 1.53125, + "learning_rate": 6.155219620017708e-06, + "loss": 1.4854, + "step": 906 + }, + { + "epoch": 0.7763749197517654, + "grad_norm": 1.671875, + "learning_rate": 6.1103341305785655e-06, + "loss": 1.6012, + "step": 907 + }, + { + "epoch": 0.7772309009201798, + "grad_norm": 1.625, + "learning_rate": 6.065590103198165e-06, + "loss": 1.4091, + "step": 908 + }, + { + "epoch": 0.778086882088594, + "grad_norm": 1.4765625, + "learning_rate": 6.020987872958236e-06, + "loss": 1.4079, + "step": 909 + }, + { + "epoch": 0.7789428632570083, + "grad_norm": 1.6328125, + "learning_rate": 5.97652777387864e-06, + "loss": 1.4951, + "step": 910 + }, + { + "epoch": 0.7797988444254227, + "grad_norm": 1.5078125, + "learning_rate": 5.932210138914821e-06, + "loss": 1.5049, + "step": 911 + }, + { + "epoch": 0.7806548255938369, + "grad_norm": 1.65625, + "learning_rate": 5.888035299955325e-06, + "loss": 1.3488, + "step": 912 + }, + { + "epoch": 0.7815108067622513, + "grad_norm": 1.5625, + "learning_rate": 5.844003587819327e-06, + "loss": 1.5192, + "step": 913 + }, + { + "epoch": 0.7823667879306655, + "grad_norm": 1.6953125, + "learning_rate": 5.800115332254144e-06, + "loss": 1.549, + "step": 914 + }, + { + "epoch": 0.7832227690990798, + "grad_norm": 1.453125, + "learning_rate": 5.75637086193278e-06, + "loss": 1.4354, + "step": 915 + }, + { + "epoch": 0.7840787502674941, + "grad_norm": 1.8046875, + "learning_rate": 5.712770504451426e-06, + "loss": 1.4676, + "step": 916 + }, + { + "epoch": 0.7849347314359084, + "grad_norm": 1.625, + "learning_rate": 5.669314586327054e-06, + "loss": 1.5199, + "step": 917 + }, + { + "epoch": 0.7857907126043226, + "grad_norm": 1.515625, + "learning_rate": 5.626003432994933e-06, + "loss": 1.4853, + "step": 918 + }, + { + "epoch": 0.786646693772737, + "grad_norm": 1.640625, + "learning_rate": 5.582837368806224e-06, + "loss": 1.3789, + "step": 919 + }, + { + "epoch": 0.7875026749411513, + "grad_norm": 1.5390625, + "learning_rate": 5.539816717025515e-06, + "loss": 1.5069, + "step": 920 + }, + { + "epoch": 0.7883586561095656, + "grad_norm": 1.4921875, + "learning_rate": 5.496941799828443e-06, + "loss": 1.364, + "step": 921 + }, + { + "epoch": 0.7892146372779799, + "grad_norm": 1.7890625, + "learning_rate": 5.454212938299255e-06, + "loss": 1.4134, + "step": 922 + }, + { + "epoch": 0.7900706184463941, + "grad_norm": 1.5078125, + "learning_rate": 5.411630452428395e-06, + "loss": 1.4641, + "step": 923 + }, + { + "epoch": 0.7909265996148085, + "grad_norm": 1.7421875, + "learning_rate": 5.369194661110138e-06, + "loss": 1.2542, + "step": 924 + }, + { + "epoch": 0.7917825807832227, + "grad_norm": 1.4921875, + "learning_rate": 5.326905882140168e-06, + "loss": 1.5729, + "step": 925 + }, + { + "epoch": 0.7926385619516371, + "grad_norm": 1.5703125, + "learning_rate": 5.284764432213221e-06, + "loss": 1.5403, + "step": 926 + }, + { + "epoch": 0.7934945431200514, + "grad_norm": 1.5234375, + "learning_rate": 5.242770626920695e-06, + "loss": 1.418, + "step": 927 + }, + { + "epoch": 0.7943505242884656, + "grad_norm": 1.5703125, + "learning_rate": 5.200924780748323e-06, + "loss": 1.4128, + "step": 928 + }, + { + "epoch": 0.79520650545688, + "grad_norm": 1.6796875, + "learning_rate": 5.15922720707378e-06, + "loss": 1.4845, + "step": 929 + }, + { + "epoch": 0.7960624866252942, + "grad_norm": 1.515625, + "learning_rate": 5.117678218164338e-06, + "loss": 1.5405, + "step": 930 + }, + { + "epoch": 0.7969184677937086, + "grad_norm": 1.6875, + "learning_rate": 5.076278125174555e-06, + "loss": 1.361, + "step": 931 + }, + { + "epoch": 0.7977744489621228, + "grad_norm": 1.7265625, + "learning_rate": 5.0350272381439244e-06, + "loss": 1.5649, + "step": 932 + }, + { + "epoch": 0.7986304301305371, + "grad_norm": 1.4609375, + "learning_rate": 4.993925865994548e-06, + "loss": 1.388, + "step": 933 + }, + { + "epoch": 0.7994864112989514, + "grad_norm": 1.6953125, + "learning_rate": 4.952974316528833e-06, + "loss": 1.5386, + "step": 934 + }, + { + "epoch": 0.8003423924673657, + "grad_norm": 1.40625, + "learning_rate": 4.912172896427205e-06, + "loss": 1.2794, + "step": 935 + }, + { + "epoch": 0.8011983736357801, + "grad_norm": 1.703125, + "learning_rate": 4.8715219112457635e-06, + "loss": 1.7561, + "step": 936 + }, + { + "epoch": 0.8011983736357801, + "eval_loss": 1.516785979270935, + "eval_runtime": 21.3507, + "eval_samples_per_second": 18.266, + "eval_steps_per_second": 18.266, + "step": 936 + }, + { + "epoch": 0.8020543548041943, + "grad_norm": 1.7109375, + "learning_rate": 4.8310216654140425e-06, + "loss": 1.5413, + "step": 937 + }, + { + "epoch": 0.8029103359726086, + "grad_norm": 1.3671875, + "learning_rate": 4.790672462232715e-06, + "loss": 1.3485, + "step": 938 + }, + { + "epoch": 0.8037663171410229, + "grad_norm": 1.515625, + "learning_rate": 4.750474603871283e-06, + "loss": 1.3616, + "step": 939 + }, + { + "epoch": 0.8046222983094372, + "grad_norm": 1.609375, + "learning_rate": 4.710428391365887e-06, + "loss": 1.5232, + "step": 940 + }, + { + "epoch": 0.8054782794778514, + "grad_norm": 1.625, + "learning_rate": 4.670534124616982e-06, + "loss": 1.5764, + "step": 941 + }, + { + "epoch": 0.8063342606462658, + "grad_norm": 1.53125, + "learning_rate": 4.630792102387155e-06, + "loss": 1.4513, + "step": 942 + }, + { + "epoch": 0.8071902418146801, + "grad_norm": 1.6953125, + "learning_rate": 4.591202622298824e-06, + "loss": 1.4137, + "step": 943 + }, + { + "epoch": 0.8080462229830944, + "grad_norm": 1.5390625, + "learning_rate": 4.551765980832059e-06, + "loss": 1.3718, + "step": 944 + }, + { + "epoch": 0.8089022041515087, + "grad_norm": 1.671875, + "learning_rate": 4.512482473322341e-06, + "loss": 1.4205, + "step": 945 + }, + { + "epoch": 0.8097581853199229, + "grad_norm": 1.5234375, + "learning_rate": 4.473352393958338e-06, + "loss": 1.5571, + "step": 946 + }, + { + "epoch": 0.8106141664883373, + "grad_norm": 1.4296875, + "learning_rate": 4.4343760357797386e-06, + "loss": 1.2288, + "step": 947 + }, + { + "epoch": 0.8114701476567515, + "grad_norm": 1.46875, + "learning_rate": 4.3955536906750135e-06, + "loss": 1.3646, + "step": 948 + }, + { + "epoch": 0.8123261288251659, + "grad_norm": 1.46875, + "learning_rate": 4.356885649379269e-06, + "loss": 1.4272, + "step": 949 + }, + { + "epoch": 0.8131821099935801, + "grad_norm": 1.640625, + "learning_rate": 4.318372201472037e-06, + "loss": 1.4271, + "step": 950 + }, + { + "epoch": 0.8140380911619944, + "grad_norm": 1.46875, + "learning_rate": 4.280013635375138e-06, + "loss": 1.5182, + "step": 951 + }, + { + "epoch": 0.8148940723304088, + "grad_norm": 1.5390625, + "learning_rate": 4.2418102383504885e-06, + "loss": 1.3662, + "step": 952 + }, + { + "epoch": 0.815750053498823, + "grad_norm": 1.7265625, + "learning_rate": 4.203762296497965e-06, + "loss": 1.5375, + "step": 953 + }, + { + "epoch": 0.8166060346672374, + "grad_norm": 1.53125, + "learning_rate": 4.1658700947532795e-06, + "loss": 1.4522, + "step": 954 + }, + { + "epoch": 0.8174620158356516, + "grad_norm": 1.6015625, + "learning_rate": 4.128133916885804e-06, + "loss": 1.4576, + "step": 955 + }, + { + "epoch": 0.8183179970040659, + "grad_norm": 1.453125, + "learning_rate": 4.0905540454965006e-06, + "loss": 1.3513, + "step": 956 + }, + { + "epoch": 0.8191739781724802, + "grad_norm": 1.65625, + "learning_rate": 4.053130762015736e-06, + "loss": 1.4043, + "step": 957 + }, + { + "epoch": 0.8200299593408945, + "grad_norm": 1.3828125, + "learning_rate": 4.015864346701251e-06, + "loss": 1.3098, + "step": 958 + }, + { + "epoch": 0.8208859405093087, + "grad_norm": 1.6484375, + "learning_rate": 3.978755078635995e-06, + "loss": 1.5399, + "step": 959 + }, + { + "epoch": 0.8217419216777231, + "grad_norm": 1.671875, + "learning_rate": 3.941803235726069e-06, + "loss": 1.6757, + "step": 960 + }, + { + "epoch": 0.8225979028461374, + "grad_norm": 1.6171875, + "learning_rate": 3.90500909469865e-06, + "loss": 1.3494, + "step": 961 + }, + { + "epoch": 0.8234538840145517, + "grad_norm": 1.46875, + "learning_rate": 3.8683729310998926e-06, + "loss": 1.3379, + "step": 962 + }, + { + "epoch": 0.824309865182966, + "grad_norm": 1.625, + "learning_rate": 3.831895019292897e-06, + "loss": 1.5126, + "step": 963 + }, + { + "epoch": 0.8251658463513802, + "grad_norm": 1.734375, + "learning_rate": 3.7955756324556197e-06, + "loss": 1.507, + "step": 964 + }, + { + "epoch": 0.8260218275197946, + "grad_norm": 1.5078125, + "learning_rate": 3.7594150425788675e-06, + "loss": 1.3546, + "step": 965 + }, + { + "epoch": 0.8268778086882088, + "grad_norm": 1.6640625, + "learning_rate": 3.7234135204642195e-06, + "loss": 1.466, + "step": 966 + }, + { + "epoch": 0.8277337898566232, + "grad_norm": 1.6484375, + "learning_rate": 3.687571335722023e-06, + "loss": 1.388, + "step": 967 + }, + { + "epoch": 0.8285897710250375, + "grad_norm": 1.796875, + "learning_rate": 3.651888756769381e-06, + "loss": 1.4069, + "step": 968 + }, + { + "epoch": 0.8294457521934517, + "grad_norm": 1.6953125, + "learning_rate": 3.6163660508281154e-06, + "loss": 1.451, + "step": 969 + }, + { + "epoch": 0.8303017333618661, + "grad_norm": 1.59375, + "learning_rate": 3.5810034839228015e-06, + "loss": 1.4336, + "step": 970 + }, + { + "epoch": 0.8311577145302803, + "grad_norm": 1.765625, + "learning_rate": 3.5458013208787333e-06, + "loss": 1.4418, + "step": 971 + }, + { + "epoch": 0.8320136956986947, + "grad_norm": 1.5546875, + "learning_rate": 3.5107598253199758e-06, + "loss": 1.4126, + "step": 972 + }, + { + "epoch": 0.8328696768671089, + "grad_norm": 1.5078125, + "learning_rate": 3.4758792596673725e-06, + "loss": 1.3229, + "step": 973 + }, + { + "epoch": 0.8337256580355232, + "grad_norm": 1.5703125, + "learning_rate": 3.4411598851365966e-06, + "loss": 1.2822, + "step": 974 + }, + { + "epoch": 0.8345816392039375, + "grad_norm": 1.6328125, + "learning_rate": 3.406601961736164e-06, + "loss": 1.607, + "step": 975 + }, + { + "epoch": 0.8354376203723518, + "grad_norm": 1.4609375, + "learning_rate": 3.372205748265522e-06, + "loss": 1.5054, + "step": 976 + }, + { + "epoch": 0.8362936015407662, + "grad_norm": 1.5625, + "learning_rate": 3.337971502313095e-06, + "loss": 1.4882, + "step": 977 + }, + { + "epoch": 0.8371495827091804, + "grad_norm": 1.625, + "learning_rate": 3.3038994802543467e-06, + "loss": 1.5285, + "step": 978 + }, + { + "epoch": 0.8380055638775947, + "grad_norm": 1.671875, + "learning_rate": 3.2699899372498733e-06, + "loss": 1.4404, + "step": 979 + }, + { + "epoch": 0.838861545046009, + "grad_norm": 1.5625, + "learning_rate": 3.236243127243477e-06, + "loss": 1.5433, + "step": 980 + }, + { + "epoch": 0.8397175262144233, + "grad_norm": 1.5625, + "learning_rate": 3.202659302960301e-06, + "loss": 1.207, + "step": 981 + }, + { + "epoch": 0.8405735073828375, + "grad_norm": 1.40625, + "learning_rate": 3.169238715904882e-06, + "loss": 1.4336, + "step": 982 + }, + { + "epoch": 0.8414294885512519, + "grad_norm": 1.859375, + "learning_rate": 3.135981616359315e-06, + "loss": 1.4036, + "step": 983 + }, + { + "epoch": 0.8422854697196661, + "grad_norm": 1.65625, + "learning_rate": 3.1028882533813643e-06, + "loss": 1.4233, + "step": 984 + }, + { + "epoch": 0.8431414508880805, + "grad_norm": 1.421875, + "learning_rate": 3.0699588748025755e-06, + "loss": 1.3475, + "step": 985 + }, + { + "epoch": 0.8439974320564948, + "grad_norm": 1.7109375, + "learning_rate": 3.037193727226445e-06, + "loss": 1.3735, + "step": 986 + }, + { + "epoch": 0.844853413224909, + "grad_norm": 1.640625, + "learning_rate": 3.0045930560265666e-06, + "loss": 1.49, + "step": 987 + }, + { + "epoch": 0.8457093943933234, + "grad_norm": 1.6171875, + "learning_rate": 2.9721571053448053e-06, + "loss": 1.5413, + "step": 988 + }, + { + "epoch": 0.8465653755617376, + "grad_norm": 1.4453125, + "learning_rate": 2.9398861180894355e-06, + "loss": 1.4234, + "step": 989 + }, + { + "epoch": 0.847421356730152, + "grad_norm": 1.6328125, + "learning_rate": 2.9077803359333607e-06, + "loss": 1.542, + "step": 990 + }, + { + "epoch": 0.8482773378985662, + "grad_norm": 1.4765625, + "learning_rate": 2.8758399993122854e-06, + "loss": 1.4682, + "step": 991 + }, + { + "epoch": 0.8491333190669805, + "grad_norm": 1.5234375, + "learning_rate": 2.8440653474229085e-06, + "loss": 1.3124, + "step": 992 + }, + { + "epoch": 0.8499893002353949, + "grad_norm": 1.703125, + "learning_rate": 2.812456618221143e-06, + "loss": 1.5196, + "step": 993 + }, + { + "epoch": 0.8508452814038091, + "grad_norm": 1.5, + "learning_rate": 2.7810140484203188e-06, + "loss": 1.4316, + "step": 994 + }, + { + "epoch": 0.8517012625722234, + "grad_norm": 1.71875, + "learning_rate": 2.7497378734894497e-06, + "loss": 1.408, + "step": 995 + }, + { + "epoch": 0.8525572437406377, + "grad_norm": 1.5234375, + "learning_rate": 2.718628327651407e-06, + "loss": 1.4881, + "step": 996 + }, + { + "epoch": 0.853413224909052, + "grad_norm": 1.5546875, + "learning_rate": 2.6876856438812296e-06, + "loss": 1.4838, + "step": 997 + }, + { + "epoch": 0.8542692060774663, + "grad_norm": 1.5703125, + "learning_rate": 2.6569100539043325e-06, + "loss": 1.4737, + "step": 998 + }, + { + "epoch": 0.8551251872458806, + "grad_norm": 1.75, + "learning_rate": 2.626301788194785e-06, + "loss": 1.319, + "step": 999 + }, + { + "epoch": 0.8559811684142948, + "grad_norm": 1.3515625, + "learning_rate": 2.595861075973613e-06, + "loss": 1.2919, + "step": 1000 + }, + { + "epoch": 0.8568371495827092, + "grad_norm": 1.6796875, + "learning_rate": 2.5655881452070264e-06, + "loss": 1.5445, + "step": 1001 + }, + { + "epoch": 0.8576931307511235, + "grad_norm": 1.4375, + "learning_rate": 2.5354832226047705e-06, + "loss": 1.2502, + "step": 1002 + }, + { + "epoch": 0.8585491119195378, + "grad_norm": 1.4453125, + "learning_rate": 2.5055465336183774e-06, + "loss": 1.3512, + "step": 1003 + }, + { + "epoch": 0.8594050930879521, + "grad_norm": 1.5625, + "learning_rate": 2.475778302439524e-06, + "loss": 1.4999, + "step": 1004 + }, + { + "epoch": 0.8602610742563663, + "grad_norm": 1.4765625, + "learning_rate": 2.4461787519983127e-06, + "loss": 1.2433, + "step": 1005 + }, + { + "epoch": 0.8611170554247807, + "grad_norm": 1.46875, + "learning_rate": 2.416748103961625e-06, + "loss": 1.4821, + "step": 1006 + }, + { + "epoch": 0.8619730365931949, + "grad_norm": 1.4375, + "learning_rate": 2.3874865787314598e-06, + "loss": 1.4389, + "step": 1007 + }, + { + "epoch": 0.8628290177616093, + "grad_norm": 1.8671875, + "learning_rate": 2.3583943954432725e-06, + "loss": 1.6777, + "step": 1008 + }, + { + "epoch": 0.8636849989300235, + "grad_norm": 1.5859375, + "learning_rate": 2.3294717719643534e-06, + "loss": 1.5674, + "step": 1009 + }, + { + "epoch": 0.8645409800984378, + "grad_norm": 1.6953125, + "learning_rate": 2.300718924892159e-06, + "loss": 1.3248, + "step": 1010 + }, + { + "epoch": 0.8653969612668522, + "grad_norm": 1.6484375, + "learning_rate": 2.2721360695527437e-06, + "loss": 1.3882, + "step": 1011 + }, + { + "epoch": 0.8662529424352664, + "grad_norm": 1.59375, + "learning_rate": 2.243723419999097e-06, + "loss": 1.4108, + "step": 1012 + }, + { + "epoch": 0.8671089236036807, + "grad_norm": 1.4296875, + "learning_rate": 2.2154811890095605e-06, + "loss": 1.2796, + "step": 1013 + }, + { + "epoch": 0.867964904772095, + "grad_norm": 1.578125, + "learning_rate": 2.1874095880862505e-06, + "loss": 1.4911, + "step": 1014 + }, + { + "epoch": 0.8688208859405093, + "grad_norm": 1.5859375, + "learning_rate": 2.1595088274534436e-06, + "loss": 1.4234, + "step": 1015 + }, + { + "epoch": 0.8696768671089236, + "grad_norm": 1.6328125, + "learning_rate": 2.1317791160560318e-06, + "loss": 1.4759, + "step": 1016 + }, + { + "epoch": 0.8705328482773379, + "grad_norm": 1.5625, + "learning_rate": 2.1042206615579237e-06, + "loss": 1.5507, + "step": 1017 + }, + { + "epoch": 0.8713888294457522, + "grad_norm": 1.421875, + "learning_rate": 2.076833670340533e-06, + "loss": 1.3777, + "step": 1018 + }, + { + "epoch": 0.8722448106141665, + "grad_norm": 1.359375, + "learning_rate": 2.0496183475011894e-06, + "loss": 1.4103, + "step": 1019 + }, + { + "epoch": 0.8731007917825808, + "grad_norm": 1.6953125, + "learning_rate": 2.0225748968516284e-06, + "loss": 1.2965, + "step": 1020 + }, + { + "epoch": 0.873956772950995, + "grad_norm": 1.4375, + "learning_rate": 1.995703520916456e-06, + "loss": 1.4232, + "step": 1021 + }, + { + "epoch": 0.8748127541194094, + "grad_norm": 1.390625, + "learning_rate": 1.9690044209316444e-06, + "loss": 1.4387, + "step": 1022 + }, + { + "epoch": 0.8756687352878236, + "grad_norm": 1.828125, + "learning_rate": 1.9424777968430146e-06, + "loss": 1.4927, + "step": 1023 + }, + { + "epoch": 0.876524716456238, + "grad_norm": 1.4921875, + "learning_rate": 1.916123847304721e-06, + "loss": 1.6027, + "step": 1024 + }, + { + "epoch": 0.8773806976246522, + "grad_norm": 1.6796875, + "learning_rate": 1.8899427696778105e-06, + "loss": 1.5225, + "step": 1025 + }, + { + "epoch": 0.8782366787930665, + "grad_norm": 1.5078125, + "learning_rate": 1.8639347600286877e-06, + "loss": 1.4555, + "step": 1026 + }, + { + "epoch": 0.8790926599614809, + "grad_norm": 1.6328125, + "learning_rate": 1.8381000131277e-06, + "loss": 1.4311, + "step": 1027 + }, + { + "epoch": 0.8799486411298951, + "grad_norm": 1.625, + "learning_rate": 1.8124387224476347e-06, + "loss": 1.5006, + "step": 1028 + }, + { + "epoch": 0.8808046222983095, + "grad_norm": 1.5703125, + "learning_rate": 1.7869510801623053e-06, + "loss": 1.4779, + "step": 1029 + }, + { + "epoch": 0.8816606034667237, + "grad_norm": 1.4140625, + "learning_rate": 1.761637277145095e-06, + "loss": 1.2851, + "step": 1030 + }, + { + "epoch": 0.882516584635138, + "grad_norm": 1.5703125, + "learning_rate": 1.7364975029675184e-06, + "loss": 1.4212, + "step": 1031 + }, + { + "epoch": 0.8833725658035523, + "grad_norm": 1.53125, + "learning_rate": 1.7115319458978236e-06, + "loss": 1.5496, + "step": 1032 + }, + { + "epoch": 0.8842285469719666, + "grad_norm": 1.5390625, + "learning_rate": 1.6867407928995577e-06, + "loss": 1.4217, + "step": 1033 + }, + { + "epoch": 0.885084528140381, + "grad_norm": 1.6015625, + "learning_rate": 1.6621242296301964e-06, + "loss": 1.5435, + "step": 1034 + }, + { + "epoch": 0.8859405093087952, + "grad_norm": 1.4921875, + "learning_rate": 1.6376824404397251e-06, + "loss": 1.3545, + "step": 1035 + }, + { + "epoch": 0.8867964904772095, + "grad_norm": 1.609375, + "learning_rate": 1.613415608369284e-06, + "loss": 1.4856, + "step": 1036 + }, + { + "epoch": 0.8876524716456238, + "grad_norm": 1.453125, + "learning_rate": 1.5893239151497652e-06, + "loss": 1.3376, + "step": 1037 + }, + { + "epoch": 0.8885084528140381, + "grad_norm": 1.4765625, + "learning_rate": 1.5654075412004893e-06, + "loss": 1.4068, + "step": 1038 + }, + { + "epoch": 0.8893644339824524, + "grad_norm": 1.6328125, + "learning_rate": 1.5416666656278222e-06, + "loss": 1.3882, + "step": 1039 + }, + { + "epoch": 0.8902204151508667, + "grad_norm": 1.4375, + "learning_rate": 1.5181014662238508e-06, + "loss": 1.2639, + "step": 1040 + }, + { + "epoch": 0.8910763963192809, + "grad_norm": 1.5078125, + "learning_rate": 1.4947121194650527e-06, + "loss": 1.388, + "step": 1041 + }, + { + "epoch": 0.8919323774876953, + "grad_norm": 1.5, + "learning_rate": 1.471498800510962e-06, + "loss": 1.5616, + "step": 1042 + }, + { + "epoch": 0.8927883586561096, + "grad_norm": 1.6953125, + "learning_rate": 1.448461683202873e-06, + "loss": 1.5843, + "step": 1043 + }, + { + "epoch": 0.8936443398245238, + "grad_norm": 1.6328125, + "learning_rate": 1.4256009400625214e-06, + "loss": 1.4752, + "step": 1044 + }, + { + "epoch": 0.8945003209929382, + "grad_norm": 1.453125, + "learning_rate": 1.4029167422908107e-06, + "loss": 1.4538, + "step": 1045 + }, + { + "epoch": 0.8953563021613524, + "grad_norm": 1.515625, + "learning_rate": 1.3804092597665186e-06, + "loss": 1.4397, + "step": 1046 + }, + { + "epoch": 0.8962122833297668, + "grad_norm": 1.5703125, + "learning_rate": 1.3580786610450202e-06, + "loss": 1.3437, + "step": 1047 + }, + { + "epoch": 0.897068264498181, + "grad_norm": 1.3828125, + "learning_rate": 1.335925113357042e-06, + "loss": 1.3665, + "step": 1048 + }, + { + "epoch": 0.8979242456665953, + "grad_norm": 3.375, + "learning_rate": 1.3139487826073937e-06, + "loss": 1.2993, + "step": 1049 + }, + { + "epoch": 0.8987802268350096, + "grad_norm": 1.71875, + "learning_rate": 1.2921498333737375e-06, + "loss": 1.4769, + "step": 1050 + }, + { + "epoch": 0.8996362080034239, + "grad_norm": 1.546875, + "learning_rate": 1.2705284289053403e-06, + "loss": 1.5111, + "step": 1051 + }, + { + "epoch": 0.9004921891718383, + "grad_norm": 1.71875, + "learning_rate": 1.2490847311218773e-06, + "loss": 1.619, + "step": 1052 + }, + { + "epoch": 0.9013481703402525, + "grad_norm": 1.6328125, + "learning_rate": 1.2278189006121904e-06, + "loss": 1.6656, + "step": 1053 + }, + { + "epoch": 0.9013481703402525, + "eval_loss": 1.516126275062561, + "eval_runtime": 21.3286, + "eval_samples_per_second": 18.285, + "eval_steps_per_second": 18.285, + "step": 1053 + }, + { + "epoch": 0.9022041515086668, + "grad_norm": 1.578125, + "learning_rate": 1.2067310966330959e-06, + "loss": 1.0402, + "step": 1054 + }, + { + "epoch": 0.9030601326770811, + "grad_norm": 1.4609375, + "learning_rate": 1.185821477108212e-06, + "loss": 1.231, + "step": 1055 + }, + { + "epoch": 0.9039161138454954, + "grad_norm": 2.0, + "learning_rate": 1.1650901986267365e-06, + "loss": 1.2964, + "step": 1056 + }, + { + "epoch": 0.9047720950139096, + "grad_norm": 1.8359375, + "learning_rate": 1.144537416442315e-06, + "loss": 1.5158, + "step": 1057 + }, + { + "epoch": 0.905628076182324, + "grad_norm": 1.6015625, + "learning_rate": 1.1241632844718465e-06, + "loss": 1.5144, + "step": 1058 + }, + { + "epoch": 0.9064840573507383, + "grad_norm": 1.4375, + "learning_rate": 1.1039679552943493e-06, + "loss": 1.4618, + "step": 1059 + }, + { + "epoch": 0.9073400385191526, + "grad_norm": 1.4765625, + "learning_rate": 1.0839515801498084e-06, + "loss": 1.3926, + "step": 1060 + }, + { + "epoch": 0.9081960196875669, + "grad_norm": 1.609375, + "learning_rate": 1.0641143089380523e-06, + "loss": 1.454, + "step": 1061 + }, + { + "epoch": 0.9090520008559811, + "grad_norm": 1.46875, + "learning_rate": 1.0444562902176296e-06, + "loss": 1.349, + "step": 1062 + }, + { + "epoch": 0.9099079820243955, + "grad_norm": 1.5234375, + "learning_rate": 1.0249776712046744e-06, + "loss": 1.33, + "step": 1063 + }, + { + "epoch": 0.9107639631928097, + "grad_norm": 1.453125, + "learning_rate": 1.0056785977718447e-06, + "loss": 1.3807, + "step": 1064 + }, + { + "epoch": 0.9116199443612241, + "grad_norm": 1.65625, + "learning_rate": 9.865592144471886e-07, + "loss": 1.539, + "step": 1065 + }, + { + "epoch": 0.9124759255296383, + "grad_norm": 1.46875, + "learning_rate": 9.67619664413086e-07, + "loss": 1.4141, + "step": 1066 + }, + { + "epoch": 0.9133319066980526, + "grad_norm": 1.484375, + "learning_rate": 9.488600895051714e-07, + "loss": 1.4709, + "step": 1067 + }, + { + "epoch": 0.914187887866467, + "grad_norm": 1.4921875, + "learning_rate": 9.302806302112693e-07, + "loss": 1.383, + "step": 1068 + }, + { + "epoch": 0.9150438690348812, + "grad_norm": 1.59375, + "learning_rate": 9.118814256703523e-07, + "loss": 1.3668, + "step": 1069 + }, + { + "epoch": 0.9158998502032956, + "grad_norm": 1.4765625, + "learning_rate": 8.936626136714754e-07, + "loss": 1.4376, + "step": 1070 + }, + { + "epoch": 0.9167558313717098, + "grad_norm": 1.6171875, + "learning_rate": 8.756243306527689e-07, + "loss": 1.3291, + "step": 1071 + }, + { + "epoch": 0.9176118125401241, + "grad_norm": 1.4140625, + "learning_rate": 8.577667117004085e-07, + "loss": 1.447, + "step": 1072 + }, + { + "epoch": 0.9184677937085384, + "grad_norm": 1.671875, + "learning_rate": 8.400898905475934e-07, + "loss": 1.4864, + "step": 1073 + }, + { + "epoch": 0.9193237748769527, + "grad_norm": 1.484375, + "learning_rate": 8.225939995735593e-07, + "loss": 1.3948, + "step": 1074 + }, + { + "epoch": 0.920179756045367, + "grad_norm": 1.859375, + "learning_rate": 8.05279169802578e-07, + "loss": 1.5526, + "step": 1075 + }, + { + "epoch": 0.9210357372137813, + "grad_norm": 1.4296875, + "learning_rate": 7.881455309029894e-07, + "loss": 1.2545, + "step": 1076 + }, + { + "epoch": 0.9218917183821956, + "grad_norm": 1.5078125, + "learning_rate": 7.711932111862025e-07, + "loss": 1.4127, + "step": 1077 + }, + { + "epoch": 0.9227476995506099, + "grad_norm": 1.5546875, + "learning_rate": 7.544223376057702e-07, + "loss": 1.3862, + "step": 1078 + }, + { + "epoch": 0.9236036807190242, + "grad_norm": 1.5234375, + "learning_rate": 7.378330357564134e-07, + "loss": 1.3634, + "step": 1079 + }, + { + "epoch": 0.9244596618874384, + "grad_norm": 1.5703125, + "learning_rate": 7.214254298730793e-07, + "loss": 1.4651, + "step": 1080 + }, + { + "epoch": 0.9253156430558528, + "grad_norm": 1.7265625, + "learning_rate": 7.051996428300317e-07, + "loss": 1.5383, + "step": 1081 + }, + { + "epoch": 0.926171624224267, + "grad_norm": 1.484375, + "learning_rate": 6.891557961399175e-07, + "loss": 1.3888, + "step": 1082 + }, + { + "epoch": 0.9270276053926814, + "grad_norm": 1.5390625, + "learning_rate": 6.73294009952849e-07, + "loss": 1.3839, + "step": 1083 + }, + { + "epoch": 0.9278835865610957, + "grad_norm": 1.53125, + "learning_rate": 6.576144030555259e-07, + "loss": 1.3927, + "step": 1084 + }, + { + "epoch": 0.9287395677295099, + "grad_norm": 1.6484375, + "learning_rate": 6.421170928703174e-07, + "loss": 1.4722, + "step": 1085 + }, + { + "epoch": 0.9295955488979243, + "grad_norm": 1.546875, + "learning_rate": 6.268021954544096e-07, + "loss": 1.2901, + "step": 1086 + }, + { + "epoch": 0.9304515300663385, + "grad_norm": 1.546875, + "learning_rate": 6.116698254989256e-07, + "loss": 1.39, + "step": 1087 + }, + { + "epoch": 0.9313075112347529, + "grad_norm": 1.4921875, + "learning_rate": 5.967200963280545e-07, + "loss": 1.2328, + "step": 1088 + }, + { + "epoch": 0.9321634924031671, + "grad_norm": 1.5859375, + "learning_rate": 5.819531198982264e-07, + "loss": 1.3817, + "step": 1089 + }, + { + "epoch": 0.9330194735715814, + "grad_norm": 1.8046875, + "learning_rate": 5.673690067972553e-07, + "loss": 1.521, + "step": 1090 + }, + { + "epoch": 0.9338754547399957, + "grad_norm": 1.53125, + "learning_rate": 5.529678662435228e-07, + "loss": 1.5298, + "step": 1091 + }, + { + "epoch": 0.93473143590841, + "grad_norm": 1.8125, + "learning_rate": 5.387498060851454e-07, + "loss": 1.4792, + "step": 1092 + }, + { + "epoch": 0.9355874170768244, + "grad_norm": 1.4375, + "learning_rate": 5.247149327991835e-07, + "loss": 1.3686, + "step": 1093 + }, + { + "epoch": 0.9364433982452386, + "grad_norm": 1.4921875, + "learning_rate": 5.108633514908367e-07, + "loss": 1.4196, + "step": 1094 + }, + { + "epoch": 0.9372993794136529, + "grad_norm": 1.5625, + "learning_rate": 4.971951658926527e-07, + "loss": 1.3794, + "step": 1095 + }, + { + "epoch": 0.9381553605820672, + "grad_norm": 1.546875, + "learning_rate": 4.83710478363758e-07, + "loss": 1.5475, + "step": 1096 + }, + { + "epoch": 0.9390113417504815, + "grad_norm": 1.6015625, + "learning_rate": 4.704093898890871e-07, + "loss": 1.4144, + "step": 1097 + }, + { + "epoch": 0.9398673229188957, + "grad_norm": 1.5546875, + "learning_rate": 4.5729200007862683e-07, + "loss": 1.3816, + "step": 1098 + }, + { + "epoch": 0.9407233040873101, + "grad_norm": 1.53125, + "learning_rate": 4.4435840716667007e-07, + "loss": 1.4647, + "step": 1099 + }, + { + "epoch": 0.9415792852557243, + "grad_norm": 1.4765625, + "learning_rate": 4.316087080110748e-07, + "loss": 1.2495, + "step": 1100 + }, + { + "epoch": 0.9424352664241387, + "grad_norm": 1.5859375, + "learning_rate": 4.1904299809255867e-07, + "loss": 1.4592, + "step": 1101 + }, + { + "epoch": 0.943291247592553, + "grad_norm": 1.5390625, + "learning_rate": 4.0666137151395277e-07, + "loss": 1.4884, + "step": 1102 + }, + { + "epoch": 0.9441472287609672, + "grad_norm": 1.6015625, + "learning_rate": 3.944639209995299e-07, + "loss": 1.5319, + "step": 1103 + }, + { + "epoch": 0.9450032099293816, + "grad_norm": 1.5, + "learning_rate": 3.824507378942799e-07, + "loss": 1.2856, + "step": 1104 + }, + { + "epoch": 0.9458591910977958, + "grad_norm": 1.40625, + "learning_rate": 3.70621912163252e-07, + "loss": 1.3458, + "step": 1105 + }, + { + "epoch": 0.9467151722662102, + "grad_norm": 1.8671875, + "learning_rate": 3.589775323908612e-07, + "loss": 1.7292, + "step": 1106 + }, + { + "epoch": 0.9475711534346244, + "grad_norm": 1.421875, + "learning_rate": 3.475176857802298e-07, + "loss": 1.3163, + "step": 1107 + }, + { + "epoch": 0.9484271346030387, + "grad_norm": 1.6875, + "learning_rate": 3.3624245815254975e-07, + "loss": 1.5198, + "step": 1108 + }, + { + "epoch": 0.9492831157714531, + "grad_norm": 1.625, + "learning_rate": 3.2515193394641595e-07, + "loss": 1.5222, + "step": 1109 + }, + { + "epoch": 0.9501390969398673, + "grad_norm": 1.84375, + "learning_rate": 3.142461962172105e-07, + "loss": 1.3569, + "step": 1110 + }, + { + "epoch": 0.9509950781082817, + "grad_norm": 1.515625, + "learning_rate": 3.035253266364696e-07, + "loss": 1.4204, + "step": 1111 + }, + { + "epoch": 0.9518510592766959, + "grad_norm": 1.4765625, + "learning_rate": 2.9298940549128964e-07, + "loss": 1.3198, + "step": 1112 + }, + { + "epoch": 0.9527070404451102, + "grad_norm": 1.65625, + "learning_rate": 2.8263851168369714e-07, + "loss": 1.4136, + "step": 1113 + }, + { + "epoch": 0.9535630216135245, + "grad_norm": 1.5, + "learning_rate": 2.724727227300911e-07, + "loss": 1.4979, + "step": 1114 + }, + { + "epoch": 0.9544190027819388, + "grad_norm": 1.46875, + "learning_rate": 2.624921147606374e-07, + "loss": 1.3033, + "step": 1115 + }, + { + "epoch": 0.955274983950353, + "grad_norm": 1.4921875, + "learning_rate": 2.526967625187088e-07, + "loss": 1.429, + "step": 1116 + }, + { + "epoch": 0.9561309651187674, + "grad_norm": 1.59375, + "learning_rate": 2.4308673936032646e-07, + "loss": 1.4569, + "step": 1117 + }, + { + "epoch": 0.9569869462871817, + "grad_norm": 1.3984375, + "learning_rate": 2.3366211725360798e-07, + "loss": 1.3534, + "step": 1118 + }, + { + "epoch": 0.957842927455596, + "grad_norm": 1.5078125, + "learning_rate": 2.244229667782205e-07, + "loss": 1.5615, + "step": 1119 + }, + { + "epoch": 0.9586989086240103, + "grad_norm": 1.40625, + "learning_rate": 2.1536935712486994e-07, + "loss": 1.4168, + "step": 1120 + }, + { + "epoch": 0.9595548897924245, + "grad_norm": 1.484375, + "learning_rate": 2.0650135609477094e-07, + "loss": 1.3854, + "step": 1121 + }, + { + "epoch": 0.9604108709608389, + "grad_norm": 1.4921875, + "learning_rate": 1.9781903009913338e-07, + "loss": 1.3355, + "step": 1122 + }, + { + "epoch": 0.9612668521292531, + "grad_norm": 1.9375, + "learning_rate": 1.893224441586877e-07, + "loss": 1.5445, + "step": 1123 + }, + { + "epoch": 0.9621228332976675, + "grad_norm": 1.7109375, + "learning_rate": 1.8101166190316875e-07, + "loss": 1.4701, + "step": 1124 + }, + { + "epoch": 0.9629788144660818, + "grad_norm": 1.5625, + "learning_rate": 1.7288674557086048e-07, + "loss": 1.4356, + "step": 1125 + }, + { + "epoch": 0.963834795634496, + "grad_norm": 1.75, + "learning_rate": 1.6494775600812417e-07, + "loss": 1.4501, + "step": 1126 + }, + { + "epoch": 0.9646907768029104, + "grad_norm": 1.6484375, + "learning_rate": 1.571947526689349e-07, + "loss": 1.4544, + "step": 1127 + }, + { + "epoch": 0.9655467579713246, + "grad_norm": 1.5625, + "learning_rate": 1.4962779361445412e-07, + "loss": 1.5713, + "step": 1128 + }, + { + "epoch": 0.966402739139739, + "grad_norm": 1.6796875, + "learning_rate": 1.4224693551256885e-07, + "loss": 1.7056, + "step": 1129 + }, + { + "epoch": 0.9672587203081532, + "grad_norm": 1.390625, + "learning_rate": 1.3505223363749487e-07, + "loss": 1.2895, + "step": 1130 + }, + { + "epoch": 0.9681147014765675, + "grad_norm": 1.4921875, + "learning_rate": 1.2804374186934643e-07, + "loss": 1.3881, + "step": 1131 + }, + { + "epoch": 0.9689706826449818, + "grad_norm": 1.53125, + "learning_rate": 1.2122151269373383e-07, + "loss": 1.2761, + "step": 1132 + }, + { + "epoch": 0.9698266638133961, + "grad_norm": 1.5, + "learning_rate": 1.1458559720137762e-07, + "loss": 1.3987, + "step": 1133 + }, + { + "epoch": 0.9706826449818104, + "grad_norm": 1.53125, + "learning_rate": 1.0813604508771169e-07, + "loss": 1.3975, + "step": 1134 + }, + { + "epoch": 0.9715386261502247, + "grad_norm": 1.859375, + "learning_rate": 1.018729046525363e-07, + "loss": 1.3861, + "step": 1135 + }, + { + "epoch": 0.972394607318639, + "grad_norm": 1.4453125, + "learning_rate": 9.579622279962397e-08, + "loss": 1.3842, + "step": 1136 + }, + { + "epoch": 0.9732505884870533, + "grad_norm": 1.546875, + "learning_rate": 8.990604503639477e-08, + "loss": 1.4654, + "step": 1137 + }, + { + "epoch": 0.9741065696554676, + "grad_norm": 1.578125, + "learning_rate": 8.420241547356933e-08, + "loss": 1.4066, + "step": 1138 + }, + { + "epoch": 0.9749625508238818, + "grad_norm": 1.3515625, + "learning_rate": 7.868537682482469e-08, + "loss": 1.3077, + "step": 1139 + }, + { + "epoch": 0.9758185319922962, + "grad_norm": 1.5, + "learning_rate": 7.335497040648898e-08, + "loss": 1.4708, + "step": 1140 + }, + { + "epoch": 0.9766745131607104, + "grad_norm": 1.515625, + "learning_rate": 6.821123613723057e-08, + "loss": 1.6011, + "step": 1141 + }, + { + "epoch": 0.9775304943291248, + "grad_norm": 1.4453125, + "learning_rate": 6.325421253775277e-08, + "loss": 1.2807, + "step": 1142 + }, + { + "epoch": 0.9783864754975391, + "grad_norm": 1.484375, + "learning_rate": 5.848393673051067e-08, + "loss": 1.3443, + "step": 1143 + }, + { + "epoch": 0.9792424566659533, + "grad_norm": 1.4140625, + "learning_rate": 5.390044443943365e-08, + "loss": 1.5044, + "step": 1144 + }, + { + "epoch": 0.9800984378343677, + "grad_norm": 1.8125, + "learning_rate": 4.9503769989647786e-08, + "loss": 1.3441, + "step": 1145 + }, + { + "epoch": 0.9809544190027819, + "grad_norm": 1.546875, + "learning_rate": 4.529394630723438e-08, + "loss": 1.3954, + "step": 1146 + }, + { + "epoch": 0.9818104001711963, + "grad_norm": 1.59375, + "learning_rate": 4.1271004918971847e-08, + "loss": 1.3292, + "step": 1147 + }, + { + "epoch": 0.9826663813396105, + "grad_norm": 1.421875, + "learning_rate": 3.7434975952102546e-08, + "loss": 1.2322, + "step": 1148 + }, + { + "epoch": 0.9835223625080248, + "grad_norm": 1.5, + "learning_rate": 3.378588813411354e-08, + "loss": 1.3188, + "step": 1149 + }, + { + "epoch": 0.9843783436764392, + "grad_norm": 1.421875, + "learning_rate": 3.032376879250898e-08, + "loss": 1.2855, + "step": 1150 + }, + { + "epoch": 0.9852343248448534, + "grad_norm": 1.71875, + "learning_rate": 2.7048643854615806e-08, + "loss": 1.6131, + "step": 1151 + }, + { + "epoch": 0.9860903060132677, + "grad_norm": 1.3359375, + "learning_rate": 2.3960537847383946e-08, + "loss": 1.2274, + "step": 1152 + }, + { + "epoch": 0.986946287181682, + "grad_norm": 1.7421875, + "learning_rate": 2.1059473897208637e-08, + "loss": 1.4714, + "step": 1153 + }, + { + "epoch": 0.9878022683500963, + "grad_norm": 1.46875, + "learning_rate": 1.834547372975004e-08, + "loss": 1.4097, + "step": 1154 + }, + { + "epoch": 0.9886582495185106, + "grad_norm": 1.5859375, + "learning_rate": 1.581855766977225e-08, + "loss": 1.292, + "step": 1155 + }, + { + "epoch": 0.9895142306869249, + "grad_norm": 1.5, + "learning_rate": 1.3478744640998963e-08, + "loss": 1.1981, + "step": 1156 + }, + { + "epoch": 0.9903702118553391, + "grad_norm": 1.6171875, + "learning_rate": 1.1326052165960831e-08, + "loss": 1.5196, + "step": 1157 + }, + { + "epoch": 0.9912261930237535, + "grad_norm": 1.5625, + "learning_rate": 9.360496365870553e-09, + "loss": 1.4829, + "step": 1158 + }, + { + "epoch": 0.9920821741921678, + "grad_norm": 1.6875, + "learning_rate": 7.582091960497973e-09, + "loss": 1.4483, + "step": 1159 + }, + { + "epoch": 0.992938155360582, + "grad_norm": 1.4296875, + "learning_rate": 5.990852268064618e-09, + "loss": 1.5446, + "step": 1160 + }, + { + "epoch": 0.9937941365289964, + "grad_norm": 1.421875, + "learning_rate": 4.586789205140995e-09, + "loss": 1.39, + "step": 1161 + }, + { + "epoch": 0.9946501176974106, + "grad_norm": 1.4296875, + "learning_rate": 3.3699132865605553e-09, + "loss": 1.2557, + "step": 1162 + }, + { + "epoch": 0.995506098865825, + "grad_norm": 1.453125, + "learning_rate": 2.3402336253364187e-09, + "loss": 1.4511, + "step": 1163 + }, + { + "epoch": 0.9963620800342392, + "grad_norm": 1.6328125, + "learning_rate": 1.4977579325919923e-09, + "loss": 1.3592, + "step": 1164 + }, + { + "epoch": 0.9972180612026535, + "grad_norm": 1.6953125, + "learning_rate": 8.424925175137821e-10, + "loss": 1.4615, + "step": 1165 + }, + { + "epoch": 0.9980740423710678, + "grad_norm": 1.6015625, + "learning_rate": 3.744422872875575e-10, + "loss": 1.5993, + "step": 1166 + }, + { + "epoch": 0.9989300235394821, + "grad_norm": 1.4453125, + "learning_rate": 9.361074708169604e-11, + "loss": 1.3935, + "step": 1167 + }, + { + "epoch": 0.9997860047078965, + "grad_norm": 1.6875, + "learning_rate": 0.0, + "loss": 1.271, + "step": 1168 + } + ], + "logging_steps": 1, + "max_steps": 1168, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.472919195037204e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}