{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9975284231339594, "eval_steps": 500, "global_step": 3033, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009886307464162135, "grad_norm": 6.3861361061549555, "learning_rate": 1.6447368421052632e-07, "loss": 0.803, "step": 1 }, { "epoch": 0.001977261492832427, "grad_norm": 6.941698165262999, "learning_rate": 3.2894736842105264e-07, "loss": 0.8588, "step": 2 }, { "epoch": 0.002965892239248641, "grad_norm": 7.651413632102669, "learning_rate": 4.934210526315789e-07, "loss": 0.9041, "step": 3 }, { "epoch": 0.003954522985664854, "grad_norm": 8.093772147745591, "learning_rate": 6.578947368421053e-07, "loss": 0.8877, "step": 4 }, { "epoch": 0.004943153732081068, "grad_norm": 7.361275559597148, "learning_rate": 8.223684210526316e-07, "loss": 0.8836, "step": 5 }, { "epoch": 0.005931784478497282, "grad_norm": 6.961739831068877, "learning_rate": 9.868421052631579e-07, "loss": 0.8726, "step": 6 }, { "epoch": 0.006920415224913495, "grad_norm": 6.404835810219006, "learning_rate": 1.1513157894736842e-06, "loss": 0.8157, "step": 7 }, { "epoch": 0.007909045971329708, "grad_norm": 5.961110110404005, "learning_rate": 1.3157894736842106e-06, "loss": 0.8491, "step": 8 }, { "epoch": 0.008897676717745922, "grad_norm": 4.811693595656182, "learning_rate": 1.480263157894737e-06, "loss": 0.7864, "step": 9 }, { "epoch": 0.009886307464162136, "grad_norm": 4.423634392084826, "learning_rate": 1.6447368421052632e-06, "loss": 0.8381, "step": 10 }, { "epoch": 0.010874938210578349, "grad_norm": 4.2596604096947015, "learning_rate": 1.8092105263157896e-06, "loss": 0.7647, "step": 11 }, { "epoch": 0.011863568956994563, "grad_norm": 2.351335867267863, "learning_rate": 1.9736842105263157e-06, "loss": 0.7901, "step": 12 }, { "epoch": 0.012852199703410776, "grad_norm": 2.078513167320523, "learning_rate": 2.138157894736842e-06, "loss": 0.7279, "step": 13 }, { "epoch": 0.01384083044982699, "grad_norm": 1.805754852143027, "learning_rate": 2.3026315789473684e-06, "loss": 0.7971, "step": 14 }, { "epoch": 0.014829461196243203, "grad_norm": 1.6576167113718026, "learning_rate": 2.4671052631578948e-06, "loss": 0.7465, "step": 15 }, { "epoch": 0.015818091942659415, "grad_norm": 3.313692598065759, "learning_rate": 2.631578947368421e-06, "loss": 0.7637, "step": 16 }, { "epoch": 0.01680672268907563, "grad_norm": 3.4734670901202755, "learning_rate": 2.7960526315789475e-06, "loss": 0.798, "step": 17 }, { "epoch": 0.017795353435491844, "grad_norm": 3.249065236497784, "learning_rate": 2.960526315789474e-06, "loss": 0.7411, "step": 18 }, { "epoch": 0.018783984181908058, "grad_norm": 3.238234468108743, "learning_rate": 3.125e-06, "loss": 0.7613, "step": 19 }, { "epoch": 0.019772614928324272, "grad_norm": 3.028773839548598, "learning_rate": 3.2894736842105265e-06, "loss": 0.7584, "step": 20 }, { "epoch": 0.020761245674740483, "grad_norm": 2.3723601228944537, "learning_rate": 3.4539473684210533e-06, "loss": 0.7082, "step": 21 }, { "epoch": 0.021749876421156698, "grad_norm": 2.086997097224021, "learning_rate": 3.618421052631579e-06, "loss": 0.7174, "step": 22 }, { "epoch": 0.022738507167572912, "grad_norm": 1.7419265206207886, "learning_rate": 3.7828947368421055e-06, "loss": 0.6727, "step": 23 }, { "epoch": 0.023727137913989126, "grad_norm": 1.3637565521795993, "learning_rate": 3.9473684210526315e-06, "loss": 0.6358, "step": 24 }, { "epoch": 0.024715768660405337, "grad_norm": 1.255734685751118, "learning_rate": 4.111842105263159e-06, "loss": 0.6412, "step": 25 }, { "epoch": 0.02570439940682155, "grad_norm": 1.4374358358926842, "learning_rate": 4.276315789473684e-06, "loss": 0.6919, "step": 26 }, { "epoch": 0.026693030153237766, "grad_norm": 1.4612889033144707, "learning_rate": 4.4407894736842105e-06, "loss": 0.7515, "step": 27 }, { "epoch": 0.02768166089965398, "grad_norm": 1.298203037774009, "learning_rate": 4.605263157894737e-06, "loss": 0.6581, "step": 28 }, { "epoch": 0.028670291646070194, "grad_norm": 1.0717402613425981, "learning_rate": 4.769736842105264e-06, "loss": 0.6436, "step": 29 }, { "epoch": 0.029658922392486405, "grad_norm": 0.9456121632098667, "learning_rate": 4.9342105263157895e-06, "loss": 0.665, "step": 30 }, { "epoch": 0.03064755313890262, "grad_norm": 1.1255847201282951, "learning_rate": 5.098684210526316e-06, "loss": 0.6378, "step": 31 }, { "epoch": 0.03163618388531883, "grad_norm": 0.8606520329045211, "learning_rate": 5.263157894736842e-06, "loss": 0.6379, "step": 32 }, { "epoch": 0.03262481463173505, "grad_norm": 0.9932945262172661, "learning_rate": 5.4276315789473686e-06, "loss": 0.7002, "step": 33 }, { "epoch": 0.03361344537815126, "grad_norm": 1.1227835997190032, "learning_rate": 5.592105263157895e-06, "loss": 0.6487, "step": 34 }, { "epoch": 0.03460207612456748, "grad_norm": 0.8067522226788918, "learning_rate": 5.756578947368421e-06, "loss": 0.6423, "step": 35 }, { "epoch": 0.03559070687098369, "grad_norm": 0.7237216681019313, "learning_rate": 5.921052631578948e-06, "loss": 0.6288, "step": 36 }, { "epoch": 0.0365793376173999, "grad_norm": 0.7206726788812405, "learning_rate": 6.085526315789474e-06, "loss": 0.6697, "step": 37 }, { "epoch": 0.037567968363816116, "grad_norm": 0.8548189243671117, "learning_rate": 6.25e-06, "loss": 0.6348, "step": 38 }, { "epoch": 0.03855659911023233, "grad_norm": 0.8001773343421509, "learning_rate": 6.4144736842105275e-06, "loss": 0.6553, "step": 39 }, { "epoch": 0.039545229856648545, "grad_norm": 0.6816647220800844, "learning_rate": 6.578947368421053e-06, "loss": 0.6824, "step": 40 }, { "epoch": 0.040533860603064756, "grad_norm": 0.7327044376745491, "learning_rate": 6.743421052631579e-06, "loss": 0.6248, "step": 41 }, { "epoch": 0.04152249134948097, "grad_norm": 0.5743043472941969, "learning_rate": 6.9078947368421065e-06, "loss": 0.5917, "step": 42 }, { "epoch": 0.042511122095897184, "grad_norm": 0.7188554091395161, "learning_rate": 7.072368421052632e-06, "loss": 0.64, "step": 43 }, { "epoch": 0.043499752842313395, "grad_norm": 0.6313153648741602, "learning_rate": 7.236842105263158e-06, "loss": 0.5965, "step": 44 }, { "epoch": 0.044488383588729606, "grad_norm": 0.5742354922049958, "learning_rate": 7.401315789473684e-06, "loss": 0.6101, "step": 45 }, { "epoch": 0.045477014335145824, "grad_norm": 0.586949541916523, "learning_rate": 7.565789473684211e-06, "loss": 0.6285, "step": 46 }, { "epoch": 0.046465645081562035, "grad_norm": 0.5749425032904734, "learning_rate": 7.730263157894737e-06, "loss": 0.6163, "step": 47 }, { "epoch": 0.04745427582797825, "grad_norm": 0.5574793039592276, "learning_rate": 7.894736842105263e-06, "loss": 0.6394, "step": 48 }, { "epoch": 0.04844290657439446, "grad_norm": 0.5818207824411209, "learning_rate": 8.05921052631579e-06, "loss": 0.5495, "step": 49 }, { "epoch": 0.049431537320810674, "grad_norm": 0.6207220859812312, "learning_rate": 8.223684210526317e-06, "loss": 0.6474, "step": 50 }, { "epoch": 0.05042016806722689, "grad_norm": 0.4979775612364802, "learning_rate": 8.388157894736843e-06, "loss": 0.6217, "step": 51 }, { "epoch": 0.0514087988136431, "grad_norm": 0.6828245132866628, "learning_rate": 8.552631578947368e-06, "loss": 0.5859, "step": 52 }, { "epoch": 0.05239742956005932, "grad_norm": 0.7116059029061585, "learning_rate": 8.717105263157894e-06, "loss": 0.5931, "step": 53 }, { "epoch": 0.05338606030647553, "grad_norm": 0.6116268035220537, "learning_rate": 8.881578947368421e-06, "loss": 0.6295, "step": 54 }, { "epoch": 0.05437469105289174, "grad_norm": 0.6212523753157727, "learning_rate": 9.046052631578948e-06, "loss": 0.6212, "step": 55 }, { "epoch": 0.05536332179930796, "grad_norm": 0.5376801887230652, "learning_rate": 9.210526315789474e-06, "loss": 0.5984, "step": 56 }, { "epoch": 0.05635195254572417, "grad_norm": 0.5900872679378051, "learning_rate": 9.375000000000001e-06, "loss": 0.5972, "step": 57 }, { "epoch": 0.05734058329214039, "grad_norm": 0.5593507790578194, "learning_rate": 9.539473684210528e-06, "loss": 0.6392, "step": 58 }, { "epoch": 0.0583292140385566, "grad_norm": 0.5924025987859581, "learning_rate": 9.703947368421054e-06, "loss": 0.5997, "step": 59 }, { "epoch": 0.05931784478497281, "grad_norm": 0.5712134459633726, "learning_rate": 9.868421052631579e-06, "loss": 0.5877, "step": 60 }, { "epoch": 0.06030647553138903, "grad_norm": 0.5791081386905925, "learning_rate": 1.0032894736842106e-05, "loss": 0.5871, "step": 61 }, { "epoch": 0.06129510627780524, "grad_norm": 0.5426796031513715, "learning_rate": 1.0197368421052632e-05, "loss": 0.6236, "step": 62 }, { "epoch": 0.06228373702422145, "grad_norm": 0.5769498261706899, "learning_rate": 1.0361842105263159e-05, "loss": 0.6192, "step": 63 }, { "epoch": 0.06327236777063766, "grad_norm": 0.5773621773596161, "learning_rate": 1.0526315789473684e-05, "loss": 0.5616, "step": 64 }, { "epoch": 0.06426099851705389, "grad_norm": 0.5036737502986666, "learning_rate": 1.0690789473684212e-05, "loss": 0.5662, "step": 65 }, { "epoch": 0.0652496292634701, "grad_norm": 0.5545212463699442, "learning_rate": 1.0855263157894737e-05, "loss": 0.5803, "step": 66 }, { "epoch": 0.06623826000988631, "grad_norm": 0.5810182394672145, "learning_rate": 1.1019736842105263e-05, "loss": 0.5929, "step": 67 }, { "epoch": 0.06722689075630252, "grad_norm": 0.5704373013285302, "learning_rate": 1.118421052631579e-05, "loss": 0.5427, "step": 68 }, { "epoch": 0.06821552150271873, "grad_norm": 0.5228333327072504, "learning_rate": 1.1348684210526317e-05, "loss": 0.5602, "step": 69 }, { "epoch": 0.06920415224913495, "grad_norm": 0.6795444775218397, "learning_rate": 1.1513157894736843e-05, "loss": 0.5532, "step": 70 }, { "epoch": 0.07019278299555116, "grad_norm": 0.6607803631119596, "learning_rate": 1.167763157894737e-05, "loss": 0.5557, "step": 71 }, { "epoch": 0.07118141374196738, "grad_norm": 0.5438739416963149, "learning_rate": 1.1842105263157895e-05, "loss": 0.6069, "step": 72 }, { "epoch": 0.07217004448838359, "grad_norm": 0.6108642343992123, "learning_rate": 1.200657894736842e-05, "loss": 0.5888, "step": 73 }, { "epoch": 0.0731586752347998, "grad_norm": 0.5758824027713001, "learning_rate": 1.2171052631578948e-05, "loss": 0.533, "step": 74 }, { "epoch": 0.07414730598121602, "grad_norm": 0.5112537049343512, "learning_rate": 1.2335526315789473e-05, "loss": 0.5761, "step": 75 }, { "epoch": 0.07513593672763223, "grad_norm": 0.5638469459168819, "learning_rate": 1.25e-05, "loss": 0.5516, "step": 76 }, { "epoch": 0.07612456747404844, "grad_norm": 0.6446015218726103, "learning_rate": 1.2664473684210526e-05, "loss": 0.5745, "step": 77 }, { "epoch": 0.07711319822046465, "grad_norm": 0.5708559658360864, "learning_rate": 1.2828947368421055e-05, "loss": 0.5354, "step": 78 }, { "epoch": 0.07810182896688087, "grad_norm": 0.6330760743431157, "learning_rate": 1.299342105263158e-05, "loss": 0.5475, "step": 79 }, { "epoch": 0.07909045971329709, "grad_norm": 0.596966882304382, "learning_rate": 1.3157894736842106e-05, "loss": 0.5618, "step": 80 }, { "epoch": 0.0800790904597133, "grad_norm": 0.6358422434802019, "learning_rate": 1.3322368421052633e-05, "loss": 0.5411, "step": 81 }, { "epoch": 0.08106772120612951, "grad_norm": 0.6880953297621528, "learning_rate": 1.3486842105263159e-05, "loss": 0.588, "step": 82 }, { "epoch": 0.08205635195254572, "grad_norm": 0.5668205759482905, "learning_rate": 1.3651315789473684e-05, "loss": 0.5979, "step": 83 }, { "epoch": 0.08304498269896193, "grad_norm": 0.6036021774520155, "learning_rate": 1.3815789473684213e-05, "loss": 0.5211, "step": 84 }, { "epoch": 0.08403361344537816, "grad_norm": 0.666271338197437, "learning_rate": 1.3980263157894739e-05, "loss": 0.5827, "step": 85 }, { "epoch": 0.08502224419179437, "grad_norm": 0.5331769607302415, "learning_rate": 1.4144736842105264e-05, "loss": 0.5183, "step": 86 }, { "epoch": 0.08601087493821058, "grad_norm": 0.5254408624700205, "learning_rate": 1.430921052631579e-05, "loss": 0.533, "step": 87 }, { "epoch": 0.08699950568462679, "grad_norm": 0.5810438536194783, "learning_rate": 1.4473684210526317e-05, "loss": 0.5615, "step": 88 }, { "epoch": 0.087988136431043, "grad_norm": 0.5167959806955196, "learning_rate": 1.4638157894736842e-05, "loss": 0.5759, "step": 89 }, { "epoch": 0.08897676717745921, "grad_norm": 0.5375462048439674, "learning_rate": 1.4802631578947368e-05, "loss": 0.5129, "step": 90 }, { "epoch": 0.08996539792387544, "grad_norm": 0.5088462196018916, "learning_rate": 1.4967105263157897e-05, "loss": 0.5711, "step": 91 }, { "epoch": 0.09095402867029165, "grad_norm": 0.5199092845078265, "learning_rate": 1.5131578947368422e-05, "loss": 0.5598, "step": 92 }, { "epoch": 0.09194265941670786, "grad_norm": 0.5601869734320992, "learning_rate": 1.5296052631578946e-05, "loss": 0.5682, "step": 93 }, { "epoch": 0.09293129016312407, "grad_norm": 0.5843088978646793, "learning_rate": 1.5460526315789475e-05, "loss": 0.6209, "step": 94 }, { "epoch": 0.09391992090954028, "grad_norm": 0.5975025268185784, "learning_rate": 1.5625e-05, "loss": 0.5778, "step": 95 }, { "epoch": 0.0949085516559565, "grad_norm": 0.5377289635116049, "learning_rate": 1.5789473684210526e-05, "loss": 0.5379, "step": 96 }, { "epoch": 0.09589718240237272, "grad_norm": 0.5856273151184899, "learning_rate": 1.5953947368421055e-05, "loss": 0.5232, "step": 97 }, { "epoch": 0.09688581314878893, "grad_norm": 0.5081586007555687, "learning_rate": 1.611842105263158e-05, "loss": 0.5506, "step": 98 }, { "epoch": 0.09787444389520514, "grad_norm": 0.570613104365877, "learning_rate": 1.6282894736842106e-05, "loss": 0.5805, "step": 99 }, { "epoch": 0.09886307464162135, "grad_norm": 0.47710897053691703, "learning_rate": 1.6447368421052635e-05, "loss": 0.5437, "step": 100 }, { "epoch": 0.09985170538803757, "grad_norm": 0.5545261003028369, "learning_rate": 1.661184210526316e-05, "loss": 0.5431, "step": 101 }, { "epoch": 0.10084033613445378, "grad_norm": 0.5379726077923472, "learning_rate": 1.6776315789473686e-05, "loss": 0.5199, "step": 102 }, { "epoch": 0.10182896688087, "grad_norm": 0.5601304341136, "learning_rate": 1.694078947368421e-05, "loss": 0.5602, "step": 103 }, { "epoch": 0.1028175976272862, "grad_norm": 0.6261209818644539, "learning_rate": 1.7105263157894737e-05, "loss": 0.5, "step": 104 }, { "epoch": 0.10380622837370242, "grad_norm": 0.6199090112969365, "learning_rate": 1.7269736842105262e-05, "loss": 0.5442, "step": 105 }, { "epoch": 0.10479485912011864, "grad_norm": 0.5982664351250674, "learning_rate": 1.7434210526315788e-05, "loss": 0.5456, "step": 106 }, { "epoch": 0.10578348986653485, "grad_norm": 0.7136618895840487, "learning_rate": 1.7598684210526316e-05, "loss": 0.4941, "step": 107 }, { "epoch": 0.10677212061295106, "grad_norm": 0.7240621045323994, "learning_rate": 1.7763157894736842e-05, "loss": 0.5567, "step": 108 }, { "epoch": 0.10776075135936727, "grad_norm": 0.5603388365347287, "learning_rate": 1.7927631578947367e-05, "loss": 0.5087, "step": 109 }, { "epoch": 0.10874938210578348, "grad_norm": 0.7320815730720595, "learning_rate": 1.8092105263157896e-05, "loss": 0.5948, "step": 110 }, { "epoch": 0.10973801285219971, "grad_norm": 0.7359518866272572, "learning_rate": 1.8256578947368422e-05, "loss": 0.6193, "step": 111 }, { "epoch": 0.11072664359861592, "grad_norm": 0.508109781003832, "learning_rate": 1.8421052631578947e-05, "loss": 0.5642, "step": 112 }, { "epoch": 0.11171527434503213, "grad_norm": 0.5845409403422831, "learning_rate": 1.8585526315789476e-05, "loss": 0.5428, "step": 113 }, { "epoch": 0.11270390509144834, "grad_norm": 0.5845352421820057, "learning_rate": 1.8750000000000002e-05, "loss": 0.5528, "step": 114 }, { "epoch": 0.11369253583786455, "grad_norm": 0.5421128995276316, "learning_rate": 1.8914473684210527e-05, "loss": 0.5473, "step": 115 }, { "epoch": 0.11468116658428078, "grad_norm": 0.5444821893476489, "learning_rate": 1.9078947368421056e-05, "loss": 0.5875, "step": 116 }, { "epoch": 0.11566979733069699, "grad_norm": 0.6138499966686716, "learning_rate": 1.924342105263158e-05, "loss": 0.5081, "step": 117 }, { "epoch": 0.1166584280771132, "grad_norm": 0.5428088095552697, "learning_rate": 1.9407894736842107e-05, "loss": 0.5418, "step": 118 }, { "epoch": 0.11764705882352941, "grad_norm": 0.5758346011314138, "learning_rate": 1.9572368421052633e-05, "loss": 0.5155, "step": 119 }, { "epoch": 0.11863568956994562, "grad_norm": 0.6449992337566779, "learning_rate": 1.9736842105263158e-05, "loss": 0.5335, "step": 120 }, { "epoch": 0.11962432031636185, "grad_norm": 0.7574114917967006, "learning_rate": 1.9901315789473684e-05, "loss": 0.5584, "step": 121 }, { "epoch": 0.12061295106277806, "grad_norm": 0.691767316226935, "learning_rate": 2.0065789473684213e-05, "loss": 0.5525, "step": 122 }, { "epoch": 0.12160158180919427, "grad_norm": 0.7057062723235376, "learning_rate": 2.0230263157894738e-05, "loss": 0.5134, "step": 123 }, { "epoch": 0.12259021255561048, "grad_norm": 0.7244934673910823, "learning_rate": 2.0394736842105264e-05, "loss": 0.5899, "step": 124 }, { "epoch": 0.12357884330202669, "grad_norm": 0.8171674529470953, "learning_rate": 2.055921052631579e-05, "loss": 0.5418, "step": 125 }, { "epoch": 0.1245674740484429, "grad_norm": 0.7882895049178926, "learning_rate": 2.0723684210526318e-05, "loss": 0.5455, "step": 126 }, { "epoch": 0.12555610479485912, "grad_norm": 0.6938169222913904, "learning_rate": 2.0888157894736843e-05, "loss": 0.5463, "step": 127 }, { "epoch": 0.12654473554127532, "grad_norm": 0.6575991942338371, "learning_rate": 2.105263157894737e-05, "loss": 0.4878, "step": 128 }, { "epoch": 0.12753336628769155, "grad_norm": 0.6446201737019337, "learning_rate": 2.1217105263157898e-05, "loss": 0.5829, "step": 129 }, { "epoch": 0.12852199703410777, "grad_norm": 0.6124278619361072, "learning_rate": 2.1381578947368423e-05, "loss": 0.5061, "step": 130 }, { "epoch": 0.12951062778052397, "grad_norm": 0.731663686256956, "learning_rate": 2.154605263157895e-05, "loss": 0.5316, "step": 131 }, { "epoch": 0.1304992585269402, "grad_norm": 0.6337684896971748, "learning_rate": 2.1710526315789474e-05, "loss": 0.4839, "step": 132 }, { "epoch": 0.1314878892733564, "grad_norm": 0.9526040312695694, "learning_rate": 2.1875e-05, "loss": 0.5559, "step": 133 }, { "epoch": 0.13247652001977261, "grad_norm": 0.6350173060349796, "learning_rate": 2.2039473684210525e-05, "loss": 0.5786, "step": 134 }, { "epoch": 0.13346515076618884, "grad_norm": 1.1469094853176187, "learning_rate": 2.2203947368421054e-05, "loss": 0.4787, "step": 135 }, { "epoch": 0.13445378151260504, "grad_norm": 0.7868665607196328, "learning_rate": 2.236842105263158e-05, "loss": 0.5064, "step": 136 }, { "epoch": 0.13544241225902126, "grad_norm": 0.8386731807265939, "learning_rate": 2.2532894736842105e-05, "loss": 0.4597, "step": 137 }, { "epoch": 0.13643104300543746, "grad_norm": 0.7234113989380704, "learning_rate": 2.2697368421052634e-05, "loss": 0.5696, "step": 138 }, { "epoch": 0.13741967375185368, "grad_norm": 0.8935048887221402, "learning_rate": 2.286184210526316e-05, "loss": 0.548, "step": 139 }, { "epoch": 0.1384083044982699, "grad_norm": 0.6143028883310963, "learning_rate": 2.3026315789473685e-05, "loss": 0.5305, "step": 140 }, { "epoch": 0.1393969352446861, "grad_norm": 0.8980610607195922, "learning_rate": 2.3190789473684214e-05, "loss": 0.525, "step": 141 }, { "epoch": 0.14038556599110233, "grad_norm": 0.8158575114043127, "learning_rate": 2.335526315789474e-05, "loss": 0.5477, "step": 142 }, { "epoch": 0.14137419673751853, "grad_norm": 0.7926771499301681, "learning_rate": 2.3519736842105265e-05, "loss": 0.5547, "step": 143 }, { "epoch": 0.14236282748393475, "grad_norm": 0.8431320231483894, "learning_rate": 2.368421052631579e-05, "loss": 0.5357, "step": 144 }, { "epoch": 0.14335145823035098, "grad_norm": 0.6952719484285427, "learning_rate": 2.3848684210526316e-05, "loss": 0.5433, "step": 145 }, { "epoch": 0.14434008897676717, "grad_norm": 1.037681481572082, "learning_rate": 2.401315789473684e-05, "loss": 0.5517, "step": 146 }, { "epoch": 0.1453287197231834, "grad_norm": 0.7134565589158198, "learning_rate": 2.4177631578947367e-05, "loss": 0.5265, "step": 147 }, { "epoch": 0.1463173504695996, "grad_norm": 0.6643813128128795, "learning_rate": 2.4342105263157896e-05, "loss": 0.5448, "step": 148 }, { "epoch": 0.14730598121601582, "grad_norm": 0.710450802627643, "learning_rate": 2.450657894736842e-05, "loss": 0.5813, "step": 149 }, { "epoch": 0.14829461196243204, "grad_norm": 0.6132537759498533, "learning_rate": 2.4671052631578947e-05, "loss": 0.4877, "step": 150 }, { "epoch": 0.14928324270884824, "grad_norm": 0.7719808257492512, "learning_rate": 2.4835526315789476e-05, "loss": 0.4871, "step": 151 }, { "epoch": 0.15027187345526447, "grad_norm": 0.9245958382624601, "learning_rate": 2.5e-05, "loss": 0.5415, "step": 152 }, { "epoch": 0.15126050420168066, "grad_norm": 0.7204324986891321, "learning_rate": 2.5164473684210527e-05, "loss": 0.5157, "step": 153 }, { "epoch": 0.1522491349480969, "grad_norm": 0.7596078817285238, "learning_rate": 2.5328947368421052e-05, "loss": 0.5283, "step": 154 }, { "epoch": 0.1532377656945131, "grad_norm": 0.7601434203843385, "learning_rate": 2.5493421052631578e-05, "loss": 0.5119, "step": 155 }, { "epoch": 0.1542263964409293, "grad_norm": 0.6569138135595929, "learning_rate": 2.565789473684211e-05, "loss": 0.5238, "step": 156 }, { "epoch": 0.15521502718734553, "grad_norm": 0.7476996562848468, "learning_rate": 2.5822368421052635e-05, "loss": 0.5102, "step": 157 }, { "epoch": 0.15620365793376173, "grad_norm": 0.6543530663049902, "learning_rate": 2.598684210526316e-05, "loss": 0.5249, "step": 158 }, { "epoch": 0.15719228868017796, "grad_norm": 0.8540894148808678, "learning_rate": 2.6151315789473686e-05, "loss": 0.534, "step": 159 }, { "epoch": 0.15818091942659418, "grad_norm": 1.0082269859172448, "learning_rate": 2.6315789473684212e-05, "loss": 0.5158, "step": 160 }, { "epoch": 0.15916955017301038, "grad_norm": 0.7867333510254219, "learning_rate": 2.6480263157894737e-05, "loss": 0.6038, "step": 161 }, { "epoch": 0.1601581809194266, "grad_norm": 0.8792837925504117, "learning_rate": 2.6644736842105266e-05, "loss": 0.5435, "step": 162 }, { "epoch": 0.1611468116658428, "grad_norm": 0.8935435346848154, "learning_rate": 2.6809210526315792e-05, "loss": 0.5646, "step": 163 }, { "epoch": 0.16213544241225902, "grad_norm": 0.6888891568034172, "learning_rate": 2.6973684210526317e-05, "loss": 0.6073, "step": 164 }, { "epoch": 0.16312407315867525, "grad_norm": 1.0823536680053514, "learning_rate": 2.7138157894736843e-05, "loss": 0.5413, "step": 165 }, { "epoch": 0.16411270390509144, "grad_norm": 0.8073982409196414, "learning_rate": 2.730263157894737e-05, "loss": 0.5496, "step": 166 }, { "epoch": 0.16510133465150767, "grad_norm": 0.7559748902027903, "learning_rate": 2.7467105263157894e-05, "loss": 0.5352, "step": 167 }, { "epoch": 0.16608996539792387, "grad_norm": 0.9816003882271781, "learning_rate": 2.7631578947368426e-05, "loss": 0.5446, "step": 168 }, { "epoch": 0.1670785961443401, "grad_norm": 0.8140955484479716, "learning_rate": 2.779605263157895e-05, "loss": 0.5457, "step": 169 }, { "epoch": 0.16806722689075632, "grad_norm": 0.9782243681318998, "learning_rate": 2.7960526315789477e-05, "loss": 0.5564, "step": 170 }, { "epoch": 0.1690558576371725, "grad_norm": 0.7641990461068666, "learning_rate": 2.8125000000000003e-05, "loss": 0.5384, "step": 171 }, { "epoch": 0.17004448838358874, "grad_norm": 0.6629376981580309, "learning_rate": 2.8289473684210528e-05, "loss": 0.5612, "step": 172 }, { "epoch": 0.17103311913000493, "grad_norm": 0.6527492683427962, "learning_rate": 2.8453947368421054e-05, "loss": 0.5748, "step": 173 }, { "epoch": 0.17202174987642116, "grad_norm": 0.6830152325581331, "learning_rate": 2.861842105263158e-05, "loss": 0.5038, "step": 174 }, { "epoch": 0.17301038062283736, "grad_norm": 0.6956426798733234, "learning_rate": 2.8782894736842108e-05, "loss": 0.524, "step": 175 }, { "epoch": 0.17399901136925358, "grad_norm": 0.7765831652885082, "learning_rate": 2.8947368421052634e-05, "loss": 0.5712, "step": 176 }, { "epoch": 0.1749876421156698, "grad_norm": 0.779129637157843, "learning_rate": 2.911184210526316e-05, "loss": 0.5441, "step": 177 }, { "epoch": 0.175976272862086, "grad_norm": 1.7129827827260813, "learning_rate": 2.9276315789473684e-05, "loss": 0.5833, "step": 178 }, { "epoch": 0.17696490360850223, "grad_norm": 0.9008559129865658, "learning_rate": 2.944078947368421e-05, "loss": 0.514, "step": 179 }, { "epoch": 0.17795353435491842, "grad_norm": 1.075711254676957, "learning_rate": 2.9605263157894735e-05, "loss": 0.5308, "step": 180 }, { "epoch": 0.17894216510133465, "grad_norm": 0.7563727839723896, "learning_rate": 2.9769736842105268e-05, "loss": 0.507, "step": 181 }, { "epoch": 0.17993079584775087, "grad_norm": 0.6547968838352691, "learning_rate": 2.9934210526315793e-05, "loss": 0.5268, "step": 182 }, { "epoch": 0.18091942659416707, "grad_norm": 0.9002189408191256, "learning_rate": 3.009868421052632e-05, "loss": 0.5074, "step": 183 }, { "epoch": 0.1819080573405833, "grad_norm": 0.7759071412227819, "learning_rate": 3.0263157894736844e-05, "loss": 0.5693, "step": 184 }, { "epoch": 0.1828966880869995, "grad_norm": 0.8085720353937003, "learning_rate": 3.042763157894737e-05, "loss": 0.5483, "step": 185 }, { "epoch": 0.18388531883341572, "grad_norm": 0.7366701353400955, "learning_rate": 3.059210526315789e-05, "loss": 0.5171, "step": 186 }, { "epoch": 0.18487394957983194, "grad_norm": 1.096916727285644, "learning_rate": 3.075657894736843e-05, "loss": 0.5647, "step": 187 }, { "epoch": 0.18586258032624814, "grad_norm": 0.8006228059688815, "learning_rate": 3.092105263157895e-05, "loss": 0.5549, "step": 188 }, { "epoch": 0.18685121107266436, "grad_norm": 0.7992918867964869, "learning_rate": 3.108552631578948e-05, "loss": 0.5387, "step": 189 }, { "epoch": 0.18783984181908056, "grad_norm": 0.6991559372423174, "learning_rate": 3.125e-05, "loss": 0.5124, "step": 190 }, { "epoch": 0.18882847256549679, "grad_norm": 0.7094657821405944, "learning_rate": 3.141447368421053e-05, "loss": 0.5011, "step": 191 }, { "epoch": 0.189817103311913, "grad_norm": 0.5663527928324437, "learning_rate": 3.157894736842105e-05, "loss": 0.5156, "step": 192 }, { "epoch": 0.1908057340583292, "grad_norm": 0.7960635754142633, "learning_rate": 3.174342105263158e-05, "loss": 0.5629, "step": 193 }, { "epoch": 0.19179436480474543, "grad_norm": 0.6872494293521877, "learning_rate": 3.190789473684211e-05, "loss": 0.5009, "step": 194 }, { "epoch": 0.19278299555116163, "grad_norm": 1.0749456551526493, "learning_rate": 3.207236842105263e-05, "loss": 0.5031, "step": 195 }, { "epoch": 0.19377162629757785, "grad_norm": 0.9218000338068714, "learning_rate": 3.223684210526316e-05, "loss": 0.536, "step": 196 }, { "epoch": 0.19476025704399408, "grad_norm": 0.7088525361129349, "learning_rate": 3.240131578947368e-05, "loss": 0.536, "step": 197 }, { "epoch": 0.19574888779041028, "grad_norm": 0.8354111072607651, "learning_rate": 3.256578947368421e-05, "loss": 0.5704, "step": 198 }, { "epoch": 0.1967375185368265, "grad_norm": 0.8118080299406779, "learning_rate": 3.2730263157894734e-05, "loss": 0.5665, "step": 199 }, { "epoch": 0.1977261492832427, "grad_norm": 0.7115360093462467, "learning_rate": 3.289473684210527e-05, "loss": 0.564, "step": 200 }, { "epoch": 0.19871478002965892, "grad_norm": 0.703906654518423, "learning_rate": 3.305921052631579e-05, "loss": 0.5762, "step": 201 }, { "epoch": 0.19970341077607515, "grad_norm": 0.8674618949644629, "learning_rate": 3.322368421052632e-05, "loss": 0.5142, "step": 202 }, { "epoch": 0.20069204152249134, "grad_norm": 1.0532060090122812, "learning_rate": 3.338815789473684e-05, "loss": 0.5512, "step": 203 }, { "epoch": 0.20168067226890757, "grad_norm": 0.8287743577968085, "learning_rate": 3.355263157894737e-05, "loss": 0.5054, "step": 204 }, { "epoch": 0.20266930301532377, "grad_norm": 1.2982734140712242, "learning_rate": 3.371710526315789e-05, "loss": 0.5463, "step": 205 }, { "epoch": 0.20365793376174, "grad_norm": 4.719556245687763, "learning_rate": 3.388157894736842e-05, "loss": 0.5137, "step": 206 }, { "epoch": 0.20464656450815621, "grad_norm": 1.2565634781954083, "learning_rate": 3.404605263157895e-05, "loss": 0.5922, "step": 207 }, { "epoch": 0.2056351952545724, "grad_norm": 0.932593830205006, "learning_rate": 3.421052631578947e-05, "loss": 0.5403, "step": 208 }, { "epoch": 0.20662382600098864, "grad_norm": 0.8559766130496362, "learning_rate": 3.4375e-05, "loss": 0.5986, "step": 209 }, { "epoch": 0.20761245674740483, "grad_norm": 0.9727438796307076, "learning_rate": 3.4539473684210524e-05, "loss": 0.4969, "step": 210 }, { "epoch": 0.20860108749382106, "grad_norm": 0.6912408854266138, "learning_rate": 3.470394736842105e-05, "loss": 0.5279, "step": 211 }, { "epoch": 0.20958971824023728, "grad_norm": 12.378810560436905, "learning_rate": 3.4868421052631575e-05, "loss": 0.7363, "step": 212 }, { "epoch": 0.21057834898665348, "grad_norm": 2.3817866094260047, "learning_rate": 3.503289473684211e-05, "loss": 0.5753, "step": 213 }, { "epoch": 0.2115669797330697, "grad_norm": 33.09046388374292, "learning_rate": 3.519736842105263e-05, "loss": 1.9877, "step": 214 }, { "epoch": 0.2125556104794859, "grad_norm": 0.9907440255251754, "learning_rate": 3.536184210526316e-05, "loss": 0.5611, "step": 215 }, { "epoch": 0.21354424122590213, "grad_norm": 1.4381978684587284, "learning_rate": 3.5526315789473684e-05, "loss": 0.5362, "step": 216 }, { "epoch": 0.21453287197231835, "grad_norm": 0.7437995510174058, "learning_rate": 3.569078947368421e-05, "loss": 0.576, "step": 217 }, { "epoch": 0.21552150271873455, "grad_norm": 15.979563761390763, "learning_rate": 3.5855263157894735e-05, "loss": 0.8592, "step": 218 }, { "epoch": 0.21651013346515077, "grad_norm": 1.195635147727035, "learning_rate": 3.6019736842105264e-05, "loss": 0.5086, "step": 219 }, { "epoch": 0.21749876421156697, "grad_norm": 0.771230141218996, "learning_rate": 3.618421052631579e-05, "loss": 0.544, "step": 220 }, { "epoch": 0.2184873949579832, "grad_norm": 1.0786703682130654, "learning_rate": 3.6348684210526315e-05, "loss": 0.5319, "step": 221 }, { "epoch": 0.21947602570439942, "grad_norm": 0.8711392559082117, "learning_rate": 3.6513157894736844e-05, "loss": 0.4767, "step": 222 }, { "epoch": 0.22046465645081562, "grad_norm": 8.234648409851951, "learning_rate": 3.6677631578947366e-05, "loss": 0.8922, "step": 223 }, { "epoch": 0.22145328719723184, "grad_norm": 2.46305407649344, "learning_rate": 3.6842105263157895e-05, "loss": 0.5362, "step": 224 }, { "epoch": 0.22244191794364804, "grad_norm": 1.6887007847589737, "learning_rate": 3.7006578947368424e-05, "loss": 0.538, "step": 225 }, { "epoch": 0.22343054869006426, "grad_norm": 0.9777395277490546, "learning_rate": 3.717105263157895e-05, "loss": 0.5502, "step": 226 }, { "epoch": 0.2244191794364805, "grad_norm": 1.8535109001908308, "learning_rate": 3.7335526315789475e-05, "loss": 0.5634, "step": 227 }, { "epoch": 0.22540781018289668, "grad_norm": 1.6644884295493159, "learning_rate": 3.7500000000000003e-05, "loss": 0.5999, "step": 228 }, { "epoch": 0.2263964409293129, "grad_norm": 0.9460957765647642, "learning_rate": 3.7664473684210526e-05, "loss": 0.5882, "step": 229 }, { "epoch": 0.2273850716757291, "grad_norm": 34.84441475241127, "learning_rate": 3.7828947368421054e-05, "loss": 1.0963, "step": 230 }, { "epoch": 0.22837370242214533, "grad_norm": 2.574887154732525, "learning_rate": 3.7993421052631577e-05, "loss": 0.5762, "step": 231 }, { "epoch": 0.22936233316856156, "grad_norm": 2.0486698397989813, "learning_rate": 3.815789473684211e-05, "loss": 0.6326, "step": 232 }, { "epoch": 0.23035096391497775, "grad_norm": 1.909358450539062, "learning_rate": 3.8322368421052634e-05, "loss": 0.546, "step": 233 }, { "epoch": 0.23133959466139398, "grad_norm": 1.851634277516247, "learning_rate": 3.848684210526316e-05, "loss": 0.5275, "step": 234 }, { "epoch": 0.23232822540781017, "grad_norm": 1.06053619433391, "learning_rate": 3.8651315789473685e-05, "loss": 0.5751, "step": 235 }, { "epoch": 0.2333168561542264, "grad_norm": 1.5068103981309784, "learning_rate": 3.8815789473684214e-05, "loss": 0.6234, "step": 236 }, { "epoch": 0.23430548690064262, "grad_norm": 1.1990853605410445, "learning_rate": 3.8980263157894736e-05, "loss": 0.5079, "step": 237 }, { "epoch": 0.23529411764705882, "grad_norm": 1.1961889677900601, "learning_rate": 3.9144736842105265e-05, "loss": 0.5367, "step": 238 }, { "epoch": 0.23628274839347504, "grad_norm": 0.9781805587632191, "learning_rate": 3.9309210526315794e-05, "loss": 0.5411, "step": 239 }, { "epoch": 0.23727137913989124, "grad_norm": 1.0856218596076486, "learning_rate": 3.9473684210526316e-05, "loss": 0.5033, "step": 240 }, { "epoch": 0.23826000988630747, "grad_norm": 1.113434395484073, "learning_rate": 3.9638157894736845e-05, "loss": 0.4864, "step": 241 }, { "epoch": 0.2392486406327237, "grad_norm": 0.8436389700654071, "learning_rate": 3.980263157894737e-05, "loss": 0.5448, "step": 242 }, { "epoch": 0.2402372713791399, "grad_norm": 1.0907915061935083, "learning_rate": 3.9967105263157896e-05, "loss": 0.5197, "step": 243 }, { "epoch": 0.2412259021255561, "grad_norm": 1.1077803027762376, "learning_rate": 4.0131578947368425e-05, "loss": 0.5476, "step": 244 }, { "epoch": 0.2422145328719723, "grad_norm": 0.8397108536192612, "learning_rate": 4.0296052631578954e-05, "loss": 0.5748, "step": 245 }, { "epoch": 0.24320316361838853, "grad_norm": 1.146299346315683, "learning_rate": 4.0460526315789476e-05, "loss": 0.5299, "step": 246 }, { "epoch": 0.24419179436480473, "grad_norm": 1.1009911737977578, "learning_rate": 4.0625000000000005e-05, "loss": 0.4951, "step": 247 }, { "epoch": 0.24518042511122096, "grad_norm": 0.88852031354678, "learning_rate": 4.078947368421053e-05, "loss": 0.5126, "step": 248 }, { "epoch": 0.24616905585763718, "grad_norm": 1.3904218847266294, "learning_rate": 4.0953947368421056e-05, "loss": 0.4914, "step": 249 }, { "epoch": 0.24715768660405338, "grad_norm": 0.6822714291311399, "learning_rate": 4.111842105263158e-05, "loss": 0.5788, "step": 250 }, { "epoch": 0.2481463173504696, "grad_norm": 1.1683664616062095, "learning_rate": 4.128289473684211e-05, "loss": 0.5099, "step": 251 }, { "epoch": 0.2491349480968858, "grad_norm": 5.583396791119196, "learning_rate": 4.1447368421052636e-05, "loss": 0.5279, "step": 252 }, { "epoch": 0.25012357884330205, "grad_norm": 0.7913369920311625, "learning_rate": 4.161184210526316e-05, "loss": 0.5514, "step": 253 }, { "epoch": 0.25111220958971825, "grad_norm": 0.9449278022000636, "learning_rate": 4.177631578947369e-05, "loss": 0.5681, "step": 254 }, { "epoch": 0.25210084033613445, "grad_norm": 0.7705363493705614, "learning_rate": 4.194078947368421e-05, "loss": 0.5576, "step": 255 }, { "epoch": 0.25308947108255064, "grad_norm": 0.749809394162025, "learning_rate": 4.210526315789474e-05, "loss": 0.5191, "step": 256 }, { "epoch": 0.2540781018289669, "grad_norm": 0.5994119987014904, "learning_rate": 4.226973684210527e-05, "loss": 0.4726, "step": 257 }, { "epoch": 0.2550667325753831, "grad_norm": 1.168848128058618, "learning_rate": 4.2434210526315796e-05, "loss": 0.5571, "step": 258 }, { "epoch": 0.2560553633217993, "grad_norm": 0.745121621574835, "learning_rate": 4.259868421052632e-05, "loss": 0.5333, "step": 259 }, { "epoch": 0.25704399406821554, "grad_norm": 3.688652046393065, "learning_rate": 4.2763157894736847e-05, "loss": 0.5422, "step": 260 }, { "epoch": 0.25803262481463174, "grad_norm": 0.9947200908486775, "learning_rate": 4.292763157894737e-05, "loss": 0.6013, "step": 261 }, { "epoch": 0.25902125556104794, "grad_norm": 0.7337639906422971, "learning_rate": 4.30921052631579e-05, "loss": 0.5082, "step": 262 }, { "epoch": 0.2600098863074642, "grad_norm": 0.7497135106995108, "learning_rate": 4.3256578947368426e-05, "loss": 0.5304, "step": 263 }, { "epoch": 0.2609985170538804, "grad_norm": 4.127695771067983, "learning_rate": 4.342105263157895e-05, "loss": 0.578, "step": 264 }, { "epoch": 0.2619871478002966, "grad_norm": 8.728129441440524, "learning_rate": 4.358552631578948e-05, "loss": 0.5132, "step": 265 }, { "epoch": 0.2629757785467128, "grad_norm": 1.5243962045431791, "learning_rate": 4.375e-05, "loss": 0.5552, "step": 266 }, { "epoch": 0.26396440929312903, "grad_norm": 1.0981330564653544, "learning_rate": 4.391447368421053e-05, "loss": 0.6023, "step": 267 }, { "epoch": 0.26495304003954523, "grad_norm": 0.9795877390893064, "learning_rate": 4.407894736842105e-05, "loss": 0.5153, "step": 268 }, { "epoch": 0.2659416707859614, "grad_norm": 3.2027638274352275, "learning_rate": 4.424342105263158e-05, "loss": 0.6384, "step": 269 }, { "epoch": 0.2669303015323777, "grad_norm": 13.430603101394833, "learning_rate": 4.440789473684211e-05, "loss": 0.8855, "step": 270 }, { "epoch": 0.2679189322787939, "grad_norm": 2.2953580097643895, "learning_rate": 4.457236842105264e-05, "loss": 0.574, "step": 271 }, { "epoch": 0.2689075630252101, "grad_norm": 1.3518305907119672, "learning_rate": 4.473684210526316e-05, "loss": 0.5609, "step": 272 }, { "epoch": 0.2698961937716263, "grad_norm": 1.417125996419156, "learning_rate": 4.490131578947369e-05, "loss": 0.511, "step": 273 }, { "epoch": 0.2708848245180425, "grad_norm": 1.4869556938109536, "learning_rate": 4.506578947368421e-05, "loss": 0.6063, "step": 274 }, { "epoch": 0.2718734552644587, "grad_norm": 0.9112448419229069, "learning_rate": 4.523026315789474e-05, "loss": 0.5367, "step": 275 }, { "epoch": 0.2728620860108749, "grad_norm": 1.230214115321251, "learning_rate": 4.539473684210527e-05, "loss": 0.5764, "step": 276 }, { "epoch": 0.27385071675729117, "grad_norm": 1.1421150396053565, "learning_rate": 4.555921052631579e-05, "loss": 0.5216, "step": 277 }, { "epoch": 0.27483934750370737, "grad_norm": 0.7421337991918203, "learning_rate": 4.572368421052632e-05, "loss": 0.528, "step": 278 }, { "epoch": 0.27582797825012356, "grad_norm": 1.1167174092326901, "learning_rate": 4.588815789473684e-05, "loss": 0.5665, "step": 279 }, { "epoch": 0.2768166089965398, "grad_norm": 0.7033222495281078, "learning_rate": 4.605263157894737e-05, "loss": 0.5487, "step": 280 }, { "epoch": 0.277805239742956, "grad_norm": 0.8111486072422073, "learning_rate": 4.621710526315789e-05, "loss": 0.57, "step": 281 }, { "epoch": 0.2787938704893722, "grad_norm": 0.7696527934351552, "learning_rate": 4.638157894736843e-05, "loss": 0.4579, "step": 282 }, { "epoch": 0.2797825012357884, "grad_norm": 0.6397427535364163, "learning_rate": 4.654605263157895e-05, "loss": 0.4968, "step": 283 }, { "epoch": 0.28077113198220466, "grad_norm": 0.8211838837770046, "learning_rate": 4.671052631578948e-05, "loss": 0.5518, "step": 284 }, { "epoch": 0.28175976272862086, "grad_norm": 1.0335505174509207, "learning_rate": 4.6875e-05, "loss": 0.5389, "step": 285 }, { "epoch": 0.28274839347503705, "grad_norm": 1.8567398506207058, "learning_rate": 4.703947368421053e-05, "loss": 0.5618, "step": 286 }, { "epoch": 0.2837370242214533, "grad_norm": 0.8725719022566313, "learning_rate": 4.720394736842105e-05, "loss": 0.5515, "step": 287 }, { "epoch": 0.2847256549678695, "grad_norm": 1.0437587511093662, "learning_rate": 4.736842105263158e-05, "loss": 0.5671, "step": 288 }, { "epoch": 0.2857142857142857, "grad_norm": 1.885204389381782, "learning_rate": 4.753289473684211e-05, "loss": 0.5648, "step": 289 }, { "epoch": 0.28670291646070195, "grad_norm": 1.2222416786479768, "learning_rate": 4.769736842105263e-05, "loss": 0.5196, "step": 290 }, { "epoch": 0.28769154720711815, "grad_norm": 1.043271910882622, "learning_rate": 4.786184210526316e-05, "loss": 0.5123, "step": 291 }, { "epoch": 0.28868017795353434, "grad_norm": 0.8248587509494123, "learning_rate": 4.802631578947368e-05, "loss": 0.5004, "step": 292 }, { "epoch": 0.28966880869995054, "grad_norm": 1.3944281891889212, "learning_rate": 4.819078947368421e-05, "loss": 0.593, "step": 293 }, { "epoch": 0.2906574394463668, "grad_norm": 0.885703782044485, "learning_rate": 4.8355263157894734e-05, "loss": 0.4829, "step": 294 }, { "epoch": 0.291646070192783, "grad_norm": 0.914046418624891, "learning_rate": 4.851973684210527e-05, "loss": 0.568, "step": 295 }, { "epoch": 0.2926347009391992, "grad_norm": 3.447360651311809, "learning_rate": 4.868421052631579e-05, "loss": 0.5374, "step": 296 }, { "epoch": 0.29362333168561544, "grad_norm": 1.871306574394939, "learning_rate": 4.884868421052632e-05, "loss": 0.5456, "step": 297 }, { "epoch": 0.29461196243203164, "grad_norm": 0.9694207843116719, "learning_rate": 4.901315789473684e-05, "loss": 0.4912, "step": 298 }, { "epoch": 0.29560059317844783, "grad_norm": 1.0723981970749665, "learning_rate": 4.917763157894737e-05, "loss": 0.576, "step": 299 }, { "epoch": 0.2965892239248641, "grad_norm": 2.295322189828728, "learning_rate": 4.9342105263157894e-05, "loss": 0.5736, "step": 300 }, { "epoch": 0.2975778546712803, "grad_norm": 2.0574001792315944, "learning_rate": 4.950657894736843e-05, "loss": 0.5758, "step": 301 }, { "epoch": 0.2985664854176965, "grad_norm": 0.863968571817255, "learning_rate": 4.967105263157895e-05, "loss": 0.6248, "step": 302 }, { "epoch": 0.2995551161641127, "grad_norm": 0.7442076699190427, "learning_rate": 4.983552631578948e-05, "loss": 0.5605, "step": 303 }, { "epoch": 0.30054374691052893, "grad_norm": 0.9259205263140676, "learning_rate": 5e-05, "loss": 0.5896, "step": 304 }, { "epoch": 0.3015323776569451, "grad_norm": 1.0443009138407497, "learning_rate": 4.9981678270428736e-05, "loss": 0.5106, "step": 305 }, { "epoch": 0.3025210084033613, "grad_norm": 9.934238307237306, "learning_rate": 4.996335654085746e-05, "loss": 1.0811, "step": 306 }, { "epoch": 0.3035096391497776, "grad_norm": 1.24702218903648, "learning_rate": 4.994503481128619e-05, "loss": 0.5241, "step": 307 }, { "epoch": 0.3044982698961938, "grad_norm": 2.39545914119442, "learning_rate": 4.9926713081714915e-05, "loss": 0.5745, "step": 308 }, { "epoch": 0.30548690064260997, "grad_norm": 1.056356704808135, "learning_rate": 4.990839135214365e-05, "loss": 0.511, "step": 309 }, { "epoch": 0.3064755313890262, "grad_norm": 0.9762355331558236, "learning_rate": 4.9890069622572374e-05, "loss": 0.5398, "step": 310 }, { "epoch": 0.3074641621354424, "grad_norm": 8.533878147960262, "learning_rate": 4.98717478930011e-05, "loss": 1.2985, "step": 311 }, { "epoch": 0.3084527928818586, "grad_norm": 27.862817098297064, "learning_rate": 4.985342616342983e-05, "loss": 0.921, "step": 312 }, { "epoch": 0.3094414236282748, "grad_norm": 3.8883047974065748, "learning_rate": 4.983510443385856e-05, "loss": 0.5581, "step": 313 }, { "epoch": 0.31043005437469107, "grad_norm": 3.8074945159766713, "learning_rate": 4.981678270428729e-05, "loss": 0.6145, "step": 314 }, { "epoch": 0.31141868512110726, "grad_norm": 20.816248176947298, "learning_rate": 4.979846097471601e-05, "loss": 0.8715, "step": 315 }, { "epoch": 0.31240731586752346, "grad_norm": 50.914925319898806, "learning_rate": 4.9780139245144747e-05, "loss": 1.3217, "step": 316 }, { "epoch": 0.3133959466139397, "grad_norm": 62.08473298168534, "learning_rate": 4.976181751557347e-05, "loss": 1.3666, "step": 317 }, { "epoch": 0.3143845773603559, "grad_norm": 2.14944730597795, "learning_rate": 4.9743495786002206e-05, "loss": 0.5983, "step": 318 }, { "epoch": 0.3153732081067721, "grad_norm": 0.9522634866712261, "learning_rate": 4.9725174056430926e-05, "loss": 0.5185, "step": 319 }, { "epoch": 0.31636183885318836, "grad_norm": 3.4573614326659943, "learning_rate": 4.970685232685966e-05, "loss": 0.8373, "step": 320 }, { "epoch": 0.31735046959960456, "grad_norm": 2.8006031724823077, "learning_rate": 4.9688530597288385e-05, "loss": 0.5604, "step": 321 }, { "epoch": 0.31833910034602075, "grad_norm": 1.958350569153921, "learning_rate": 4.967020886771712e-05, "loss": 0.5349, "step": 322 }, { "epoch": 0.31932773109243695, "grad_norm": 1.1464016116839237, "learning_rate": 4.9651887138145845e-05, "loss": 0.5576, "step": 323 }, { "epoch": 0.3203163618388532, "grad_norm": 3.937258961809421, "learning_rate": 4.963356540857457e-05, "loss": 0.6373, "step": 324 }, { "epoch": 0.3213049925852694, "grad_norm": 2.9402277705745186, "learning_rate": 4.96152436790033e-05, "loss": 0.5855, "step": 325 }, { "epoch": 0.3222936233316856, "grad_norm": 2.1246284395831876, "learning_rate": 4.959692194943203e-05, "loss": 0.6194, "step": 326 }, { "epoch": 0.32328225407810185, "grad_norm": 1.0760143140372522, "learning_rate": 4.957860021986076e-05, "loss": 0.546, "step": 327 }, { "epoch": 0.32427088482451805, "grad_norm": 1.2230451947768197, "learning_rate": 4.9560278490289484e-05, "loss": 0.4959, "step": 328 }, { "epoch": 0.32525951557093424, "grad_norm": 1.1837540741097599, "learning_rate": 4.954195676071822e-05, "loss": 0.5648, "step": 329 }, { "epoch": 0.3262481463173505, "grad_norm": 2.3417335393662926, "learning_rate": 4.9523635031146943e-05, "loss": 0.5527, "step": 330 }, { "epoch": 0.3272367770637667, "grad_norm": 1.246411720677882, "learning_rate": 4.950531330157567e-05, "loss": 0.5667, "step": 331 }, { "epoch": 0.3282254078101829, "grad_norm": 2.8208051591189522, "learning_rate": 4.9486991572004396e-05, "loss": 0.6102, "step": 332 }, { "epoch": 0.3292140385565991, "grad_norm": 8.095312978513194, "learning_rate": 4.946866984243313e-05, "loss": 0.6145, "step": 333 }, { "epoch": 0.33020266930301534, "grad_norm": 2.334131615677423, "learning_rate": 4.9450348112861856e-05, "loss": 0.583, "step": 334 }, { "epoch": 0.33119130004943154, "grad_norm": 2.738557242742542, "learning_rate": 4.943202638329059e-05, "loss": 0.5432, "step": 335 }, { "epoch": 0.33217993079584773, "grad_norm": 0.9665382045294144, "learning_rate": 4.941370465371931e-05, "loss": 0.5188, "step": 336 }, { "epoch": 0.333168561542264, "grad_norm": 0.9179732522385432, "learning_rate": 4.939538292414804e-05, "loss": 0.5456, "step": 337 }, { "epoch": 0.3341571922886802, "grad_norm": 0.8557404028051055, "learning_rate": 4.937706119457677e-05, "loss": 0.5869, "step": 338 }, { "epoch": 0.3351458230350964, "grad_norm": 0.8390627319130548, "learning_rate": 4.93587394650055e-05, "loss": 0.5489, "step": 339 }, { "epoch": 0.33613445378151263, "grad_norm": 0.81221672946298, "learning_rate": 4.934041773543423e-05, "loss": 0.5413, "step": 340 }, { "epoch": 0.33712308452792883, "grad_norm": 5.506077329541011, "learning_rate": 4.9322096005862954e-05, "loss": 0.6194, "step": 341 }, { "epoch": 0.338111715274345, "grad_norm": 0.8520796298923052, "learning_rate": 4.930377427629169e-05, "loss": 0.5526, "step": 342 }, { "epoch": 0.3391003460207612, "grad_norm": 0.8293806408970669, "learning_rate": 4.9285452546720414e-05, "loss": 0.5296, "step": 343 }, { "epoch": 0.3400889767671775, "grad_norm": 0.671896837049396, "learning_rate": 4.926713081714914e-05, "loss": 0.5536, "step": 344 }, { "epoch": 0.3410776075135937, "grad_norm": 1.0473051630255334, "learning_rate": 4.924880908757787e-05, "loss": 0.5929, "step": 345 }, { "epoch": 0.34206623826000987, "grad_norm": 0.8180767067303919, "learning_rate": 4.92304873580066e-05, "loss": 0.5017, "step": 346 }, { "epoch": 0.3430548690064261, "grad_norm": 0.7275663332400587, "learning_rate": 4.9212165628435326e-05, "loss": 0.5034, "step": 347 }, { "epoch": 0.3440434997528423, "grad_norm": 0.6762888506646861, "learning_rate": 4.919384389886405e-05, "loss": 0.5428, "step": 348 }, { "epoch": 0.3450321304992585, "grad_norm": 6.027676887139397, "learning_rate": 4.917552216929278e-05, "loss": 0.6113, "step": 349 }, { "epoch": 0.3460207612456747, "grad_norm": 1.1618979799699611, "learning_rate": 4.915720043972151e-05, "loss": 0.5266, "step": 350 }, { "epoch": 0.34700939199209097, "grad_norm": 1.2922539177975536, "learning_rate": 4.913887871015024e-05, "loss": 0.5587, "step": 351 }, { "epoch": 0.34799802273850716, "grad_norm": 4.806741343165558, "learning_rate": 4.912055698057897e-05, "loss": 0.6281, "step": 352 }, { "epoch": 0.34898665348492336, "grad_norm": 1.370505594440967, "learning_rate": 4.91022352510077e-05, "loss": 0.5625, "step": 353 }, { "epoch": 0.3499752842313396, "grad_norm": 5.52384587274037, "learning_rate": 4.9083913521436425e-05, "loss": 0.5955, "step": 354 }, { "epoch": 0.3509639149777558, "grad_norm": 1.079741514545984, "learning_rate": 4.906559179186516e-05, "loss": 0.5593, "step": 355 }, { "epoch": 0.351952545724172, "grad_norm": 1.1946751624194836, "learning_rate": 4.9047270062293885e-05, "loss": 0.5481, "step": 356 }, { "epoch": 0.35294117647058826, "grad_norm": 0.9095714116060707, "learning_rate": 4.902894833272261e-05, "loss": 0.5433, "step": 357 }, { "epoch": 0.35392980721700446, "grad_norm": 1.801990308880758, "learning_rate": 4.901062660315134e-05, "loss": 0.5542, "step": 358 }, { "epoch": 0.35491843796342065, "grad_norm": 1.258970166660417, "learning_rate": 4.899230487358007e-05, "loss": 0.5458, "step": 359 }, { "epoch": 0.35590706870983685, "grad_norm": 1.4202808150884736, "learning_rate": 4.89739831440088e-05, "loss": 0.5461, "step": 360 }, { "epoch": 0.3568956994562531, "grad_norm": 6.605459450792524, "learning_rate": 4.8955661414437523e-05, "loss": 0.6458, "step": 361 }, { "epoch": 0.3578843302026693, "grad_norm": 1.4110414013944033, "learning_rate": 4.893733968486625e-05, "loss": 0.5732, "step": 362 }, { "epoch": 0.3588729609490855, "grad_norm": 0.9145065326642036, "learning_rate": 4.891901795529498e-05, "loss": 0.5606, "step": 363 }, { "epoch": 0.35986159169550175, "grad_norm": 10.893261859158297, "learning_rate": 4.8900696225723716e-05, "loss": 0.57, "step": 364 }, { "epoch": 0.36085022244191794, "grad_norm": 4.973558055340178, "learning_rate": 4.8882374496152436e-05, "loss": 0.573, "step": 365 }, { "epoch": 0.36183885318833414, "grad_norm": 1.6742458318258424, "learning_rate": 4.886405276658117e-05, "loss": 0.5516, "step": 366 }, { "epoch": 0.3628274839347504, "grad_norm": 3.493425075123204, "learning_rate": 4.8845731037009895e-05, "loss": 0.5859, "step": 367 }, { "epoch": 0.3638161146811666, "grad_norm": 14.21901001046267, "learning_rate": 4.882740930743863e-05, "loss": 0.5506, "step": 368 }, { "epoch": 0.3648047454275828, "grad_norm": 2.227483276941566, "learning_rate": 4.8809087577867355e-05, "loss": 0.5741, "step": 369 }, { "epoch": 0.365793376173999, "grad_norm": 1.2727597180153734, "learning_rate": 4.879076584829608e-05, "loss": 0.5504, "step": 370 }, { "epoch": 0.36678200692041524, "grad_norm": 1.3449253250865592, "learning_rate": 4.877244411872481e-05, "loss": 0.5278, "step": 371 }, { "epoch": 0.36777063766683143, "grad_norm": 1.6964318780035699, "learning_rate": 4.875412238915354e-05, "loss": 0.4871, "step": 372 }, { "epoch": 0.36875926841324763, "grad_norm": 1.1735457979340223, "learning_rate": 4.873580065958227e-05, "loss": 0.5793, "step": 373 }, { "epoch": 0.3697478991596639, "grad_norm": 9.348994436747693, "learning_rate": 4.8717478930010994e-05, "loss": 0.7241, "step": 374 }, { "epoch": 0.3707365299060801, "grad_norm": 1.5526346927714094, "learning_rate": 4.869915720043972e-05, "loss": 0.5711, "step": 375 }, { "epoch": 0.3717251606524963, "grad_norm": 1.0316327786226993, "learning_rate": 4.8680835470868454e-05, "loss": 0.5527, "step": 376 }, { "epoch": 0.37271379139891253, "grad_norm": 0.8533195614446781, "learning_rate": 4.866251374129718e-05, "loss": 0.5109, "step": 377 }, { "epoch": 0.3737024221453287, "grad_norm": 1.4021944837160423, "learning_rate": 4.8644192011725906e-05, "loss": 0.5695, "step": 378 }, { "epoch": 0.3746910528917449, "grad_norm": 8.319784797994807, "learning_rate": 4.862587028215464e-05, "loss": 0.6758, "step": 379 }, { "epoch": 0.3756796836381611, "grad_norm": 1.0359300374989289, "learning_rate": 4.8607548552583366e-05, "loss": 0.5261, "step": 380 }, { "epoch": 0.3766683143845774, "grad_norm": 0.8801320225747099, "learning_rate": 4.85892268230121e-05, "loss": 0.5894, "step": 381 }, { "epoch": 0.37765694513099357, "grad_norm": 0.9154778069057681, "learning_rate": 4.857090509344082e-05, "loss": 0.5503, "step": 382 }, { "epoch": 0.37864557587740977, "grad_norm": 0.9100468396649973, "learning_rate": 4.855258336386955e-05, "loss": 0.5438, "step": 383 }, { "epoch": 0.379634206623826, "grad_norm": 0.81687539536218, "learning_rate": 4.853426163429828e-05, "loss": 0.546, "step": 384 }, { "epoch": 0.3806228373702422, "grad_norm": 0.870957446782063, "learning_rate": 4.851593990472701e-05, "loss": 0.5069, "step": 385 }, { "epoch": 0.3816114681166584, "grad_norm": 0.8822975045395368, "learning_rate": 4.849761817515573e-05, "loss": 0.5162, "step": 386 }, { "epoch": 0.38260009886307467, "grad_norm": 0.5867330976079286, "learning_rate": 4.8479296445584464e-05, "loss": 0.5064, "step": 387 }, { "epoch": 0.38358872960949086, "grad_norm": 0.9751846014420066, "learning_rate": 4.84609747160132e-05, "loss": 0.5216, "step": 388 }, { "epoch": 0.38457736035590706, "grad_norm": 0.6124030682332463, "learning_rate": 4.8442652986441924e-05, "loss": 0.5089, "step": 389 }, { "epoch": 0.38556599110232326, "grad_norm": 0.890888926265597, "learning_rate": 4.842433125687065e-05, "loss": 0.5334, "step": 390 }, { "epoch": 0.3865546218487395, "grad_norm": 0.8109466655366843, "learning_rate": 4.840600952729938e-05, "loss": 0.5161, "step": 391 }, { "epoch": 0.3875432525951557, "grad_norm": 0.6869684608537004, "learning_rate": 4.838768779772811e-05, "loss": 0.5487, "step": 392 }, { "epoch": 0.3885318833415719, "grad_norm": 0.8909878537645322, "learning_rate": 4.8369366068156837e-05, "loss": 0.5303, "step": 393 }, { "epoch": 0.38952051408798816, "grad_norm": 0.7593734906075879, "learning_rate": 4.835104433858556e-05, "loss": 0.5747, "step": 394 }, { "epoch": 0.39050914483440435, "grad_norm": 0.6604551939977313, "learning_rate": 4.833272260901429e-05, "loss": 0.4894, "step": 395 }, { "epoch": 0.39149777558082055, "grad_norm": 0.6729998498406504, "learning_rate": 4.831440087944302e-05, "loss": 0.5121, "step": 396 }, { "epoch": 0.3924864063272368, "grad_norm": 0.6770115776696115, "learning_rate": 4.829607914987175e-05, "loss": 0.5017, "step": 397 }, { "epoch": 0.393475037073653, "grad_norm": 1.6852210427605963, "learning_rate": 4.827775742030048e-05, "loss": 0.5807, "step": 398 }, { "epoch": 0.3944636678200692, "grad_norm": 0.9540894947303474, "learning_rate": 4.82594356907292e-05, "loss": 0.527, "step": 399 }, { "epoch": 0.3954522985664854, "grad_norm": 0.6085596012402876, "learning_rate": 4.8241113961157935e-05, "loss": 0.5451, "step": 400 }, { "epoch": 0.39644092931290165, "grad_norm": 0.9691647951983359, "learning_rate": 4.822279223158667e-05, "loss": 0.5313, "step": 401 }, { "epoch": 0.39742956005931784, "grad_norm": 0.5563474587392323, "learning_rate": 4.8204470502015395e-05, "loss": 0.5357, "step": 402 }, { "epoch": 0.39841819080573404, "grad_norm": 0.8526905644782354, "learning_rate": 4.818614877244412e-05, "loss": 0.5287, "step": 403 }, { "epoch": 0.3994068215521503, "grad_norm": 5.072789334007214, "learning_rate": 4.816782704287285e-05, "loss": 0.822, "step": 404 }, { "epoch": 0.4003954522985665, "grad_norm": 4.455107299455752, "learning_rate": 4.814950531330158e-05, "loss": 0.6164, "step": 405 }, { "epoch": 0.4013840830449827, "grad_norm": 1.7413572848724335, "learning_rate": 4.813118358373031e-05, "loss": 0.562, "step": 406 }, { "epoch": 0.40237271379139894, "grad_norm": 0.7170689449242919, "learning_rate": 4.8112861854159033e-05, "loss": 0.5769, "step": 407 }, { "epoch": 0.40336134453781514, "grad_norm": 1.2414838074909595, "learning_rate": 4.809454012458776e-05, "loss": 0.5148, "step": 408 }, { "epoch": 0.40434997528423133, "grad_norm": 0.9518688499270709, "learning_rate": 4.807621839501649e-05, "loss": 0.5428, "step": 409 }, { "epoch": 0.40533860603064753, "grad_norm": 0.9825415197299213, "learning_rate": 4.805789666544522e-05, "loss": 0.5245, "step": 410 }, { "epoch": 0.4063272367770638, "grad_norm": 0.9504309702014101, "learning_rate": 4.8039574935873946e-05, "loss": 0.4817, "step": 411 }, { "epoch": 0.40731586752348, "grad_norm": 0.6581408631820181, "learning_rate": 4.802125320630268e-05, "loss": 0.5354, "step": 412 }, { "epoch": 0.4083044982698962, "grad_norm": 0.8410138003787695, "learning_rate": 4.8002931476731406e-05, "loss": 0.5029, "step": 413 }, { "epoch": 0.40929312901631243, "grad_norm": 0.653246534636777, "learning_rate": 4.798460974716014e-05, "loss": 0.5155, "step": 414 }, { "epoch": 0.4102817597627286, "grad_norm": 0.7822933168414329, "learning_rate": 4.7966288017588865e-05, "loss": 0.4818, "step": 415 }, { "epoch": 0.4112703905091448, "grad_norm": 7.8566528764552634, "learning_rate": 4.794796628801759e-05, "loss": 0.7637, "step": 416 }, { "epoch": 0.4122590212555611, "grad_norm": 0.8118299252896028, "learning_rate": 4.792964455844632e-05, "loss": 0.5769, "step": 417 }, { "epoch": 0.4132476520019773, "grad_norm": 105.29491419417695, "learning_rate": 4.791132282887505e-05, "loss": 1.1404, "step": 418 }, { "epoch": 0.41423628274839347, "grad_norm": 1.1948281579268618, "learning_rate": 4.789300109930378e-05, "loss": 0.5511, "step": 419 }, { "epoch": 0.41522491349480967, "grad_norm": 0.6478102117459191, "learning_rate": 4.7874679369732504e-05, "loss": 0.4957, "step": 420 }, { "epoch": 0.4162135442412259, "grad_norm": 2.163398064113745, "learning_rate": 4.785635764016123e-05, "loss": 0.5171, "step": 421 }, { "epoch": 0.4172021749876421, "grad_norm": 1.2813302357263308, "learning_rate": 4.7838035910589964e-05, "loss": 0.4808, "step": 422 }, { "epoch": 0.4181908057340583, "grad_norm": 0.9059532396001785, "learning_rate": 4.781971418101869e-05, "loss": 0.502, "step": 423 }, { "epoch": 0.41917943648047457, "grad_norm": 0.8673855130986834, "learning_rate": 4.7801392451447416e-05, "loss": 0.5712, "step": 424 }, { "epoch": 0.42016806722689076, "grad_norm": 0.9503106868793048, "learning_rate": 4.778307072187615e-05, "loss": 0.5448, "step": 425 }, { "epoch": 0.42115669797330696, "grad_norm": 0.6879929813132313, "learning_rate": 4.7764748992304876e-05, "loss": 0.4851, "step": 426 }, { "epoch": 0.42214532871972316, "grad_norm": 1.082941455391773, "learning_rate": 4.774642726273361e-05, "loss": 0.5225, "step": 427 }, { "epoch": 0.4231339594661394, "grad_norm": 0.7332415468147727, "learning_rate": 4.772810553316233e-05, "loss": 0.4687, "step": 428 }, { "epoch": 0.4241225902125556, "grad_norm": 0.7124186401454005, "learning_rate": 4.770978380359106e-05, "loss": 0.5182, "step": 429 }, { "epoch": 0.4251112209589718, "grad_norm": 0.711883368342981, "learning_rate": 4.769146207401979e-05, "loss": 0.4774, "step": 430 }, { "epoch": 0.42609985170538806, "grad_norm": 0.6845930726644581, "learning_rate": 4.767314034444852e-05, "loss": 0.5301, "step": 431 }, { "epoch": 0.42708848245180425, "grad_norm": 0.9721003240859732, "learning_rate": 4.765481861487724e-05, "loss": 0.522, "step": 432 }, { "epoch": 0.42807711319822045, "grad_norm": 0.6712527185988681, "learning_rate": 4.7636496885305975e-05, "loss": 0.5705, "step": 433 }, { "epoch": 0.4290657439446367, "grad_norm": 0.7960082292386055, "learning_rate": 4.76181751557347e-05, "loss": 0.5127, "step": 434 }, { "epoch": 0.4300543746910529, "grad_norm": 0.8398381402299684, "learning_rate": 4.7599853426163434e-05, "loss": 0.5685, "step": 435 }, { "epoch": 0.4310430054374691, "grad_norm": 16.341402763469212, "learning_rate": 4.758153169659216e-05, "loss": 1.0064, "step": 436 }, { "epoch": 0.4320316361838853, "grad_norm": 1.0879341858163396, "learning_rate": 4.756320996702089e-05, "loss": 0.5549, "step": 437 }, { "epoch": 0.43302026693030155, "grad_norm": 0.7089215799980899, "learning_rate": 4.754488823744962e-05, "loss": 0.5146, "step": 438 }, { "epoch": 0.43400889767671774, "grad_norm": 0.9380484610004953, "learning_rate": 4.7526566507878347e-05, "loss": 0.5828, "step": 439 }, { "epoch": 0.43499752842313394, "grad_norm": 0.7320109663834132, "learning_rate": 4.750824477830707e-05, "loss": 0.5273, "step": 440 }, { "epoch": 0.4359861591695502, "grad_norm": 0.728230791180507, "learning_rate": 4.74899230487358e-05, "loss": 0.5502, "step": 441 }, { "epoch": 0.4369747899159664, "grad_norm": 0.7460617672226004, "learning_rate": 4.747160131916453e-05, "loss": 0.4956, "step": 442 }, { "epoch": 0.4379634206623826, "grad_norm": 0.6547872847025556, "learning_rate": 4.745327958959326e-05, "loss": 0.582, "step": 443 }, { "epoch": 0.43895205140879884, "grad_norm": 0.6938611847179136, "learning_rate": 4.743495786002199e-05, "loss": 0.5592, "step": 444 }, { "epoch": 0.43994068215521503, "grad_norm": 0.810796814100502, "learning_rate": 4.741663613045071e-05, "loss": 0.5305, "step": 445 }, { "epoch": 0.44092931290163123, "grad_norm": 0.5787015961503812, "learning_rate": 4.7398314400879445e-05, "loss": 0.523, "step": 446 }, { "epoch": 0.44191794364804743, "grad_norm": 0.8843150627446372, "learning_rate": 4.737999267130817e-05, "loss": 0.4969, "step": 447 }, { "epoch": 0.4429065743944637, "grad_norm": 0.5647358102995981, "learning_rate": 4.7361670941736905e-05, "loss": 0.4799, "step": 448 }, { "epoch": 0.4438952051408799, "grad_norm": 0.8190593059814746, "learning_rate": 4.734334921216563e-05, "loss": 0.5361, "step": 449 }, { "epoch": 0.4448838358872961, "grad_norm": 0.7430357677495286, "learning_rate": 4.732502748259436e-05, "loss": 0.5257, "step": 450 }, { "epoch": 0.4458724666337123, "grad_norm": 1.0967741197112155, "learning_rate": 4.730670575302309e-05, "loss": 0.5436, "step": 451 }, { "epoch": 0.4468610973801285, "grad_norm": 0.8343780159768259, "learning_rate": 4.728838402345182e-05, "loss": 0.5478, "step": 452 }, { "epoch": 0.4478497281265447, "grad_norm": 2.7154289486397847, "learning_rate": 4.7270062293880544e-05, "loss": 0.5298, "step": 453 }, { "epoch": 0.448838358872961, "grad_norm": 0.831794483966034, "learning_rate": 4.725174056430927e-05, "loss": 0.5784, "step": 454 }, { "epoch": 0.44982698961937717, "grad_norm": 0.7109774402496122, "learning_rate": 4.7233418834738e-05, "loss": 0.5024, "step": 455 }, { "epoch": 0.45081562036579337, "grad_norm": 0.7137902935851209, "learning_rate": 4.721509710516673e-05, "loss": 0.568, "step": 456 }, { "epoch": 0.45180425111220957, "grad_norm": 0.9003094838040753, "learning_rate": 4.7196775375595456e-05, "loss": 0.5666, "step": 457 }, { "epoch": 0.4527928818586258, "grad_norm": 0.5836229136142835, "learning_rate": 4.717845364602418e-05, "loss": 0.5333, "step": 458 }, { "epoch": 0.453781512605042, "grad_norm": 1.1321088306672311, "learning_rate": 4.7160131916452916e-05, "loss": 0.5073, "step": 459 }, { "epoch": 0.4547701433514582, "grad_norm": 0.9153987301324248, "learning_rate": 4.714181018688165e-05, "loss": 0.5323, "step": 460 }, { "epoch": 0.45575877409787446, "grad_norm": 0.7964972809790274, "learning_rate": 4.7123488457310375e-05, "loss": 0.5482, "step": 461 }, { "epoch": 0.45674740484429066, "grad_norm": 0.8616127517216328, "learning_rate": 4.71051667277391e-05, "loss": 0.5165, "step": 462 }, { "epoch": 0.45773603559070686, "grad_norm": 0.6915162587768408, "learning_rate": 4.708684499816783e-05, "loss": 0.5523, "step": 463 }, { "epoch": 0.4587246663371231, "grad_norm": 0.7913445857212036, "learning_rate": 4.706852326859656e-05, "loss": 0.5007, "step": 464 }, { "epoch": 0.4597132970835393, "grad_norm": 0.7664317878793733, "learning_rate": 4.705020153902529e-05, "loss": 0.5484, "step": 465 }, { "epoch": 0.4607019278299555, "grad_norm": 0.6260843470544474, "learning_rate": 4.7031879809454014e-05, "loss": 0.5017, "step": 466 }, { "epoch": 0.4616905585763717, "grad_norm": 0.7810408565614969, "learning_rate": 4.701355807988274e-05, "loss": 0.5106, "step": 467 }, { "epoch": 0.46267918932278795, "grad_norm": 0.7060120633759918, "learning_rate": 4.6995236350311474e-05, "loss": 0.5379, "step": 468 }, { "epoch": 0.46366782006920415, "grad_norm": 0.7083089860935861, "learning_rate": 4.69769146207402e-05, "loss": 0.5339, "step": 469 }, { "epoch": 0.46465645081562035, "grad_norm": 1.875666223282253, "learning_rate": 4.6958592891168927e-05, "loss": 0.5232, "step": 470 }, { "epoch": 0.4656450815620366, "grad_norm": 12.461465203338479, "learning_rate": 4.694027116159766e-05, "loss": 0.5539, "step": 471 }, { "epoch": 0.4666337123084528, "grad_norm": 1.1236022550326692, "learning_rate": 4.6921949432026386e-05, "loss": 0.5434, "step": 472 }, { "epoch": 0.467622343054869, "grad_norm": 0.6689339022683986, "learning_rate": 4.690362770245512e-05, "loss": 0.5018, "step": 473 }, { "epoch": 0.46861097380128525, "grad_norm": 19.07793845971655, "learning_rate": 4.688530597288384e-05, "loss": 0.5417, "step": 474 }, { "epoch": 0.46959960454770144, "grad_norm": 1.3162132111355245, "learning_rate": 4.686698424331257e-05, "loss": 0.5472, "step": 475 }, { "epoch": 0.47058823529411764, "grad_norm": 1.044379523607098, "learning_rate": 4.68486625137413e-05, "loss": 0.5702, "step": 476 }, { "epoch": 0.47157686604053384, "grad_norm": 1.9337321099853593, "learning_rate": 4.683034078417003e-05, "loss": 0.5191, "step": 477 }, { "epoch": 0.4725654967869501, "grad_norm": 1.3909912914056803, "learning_rate": 4.681201905459875e-05, "loss": 0.5582, "step": 478 }, { "epoch": 0.4735541275333663, "grad_norm": 1.7972243506992336, "learning_rate": 4.6793697325027485e-05, "loss": 0.4857, "step": 479 }, { "epoch": 0.4745427582797825, "grad_norm": 1.0876156684983036, "learning_rate": 4.677537559545621e-05, "loss": 0.5668, "step": 480 }, { "epoch": 0.47553138902619874, "grad_norm": 0.862437854467378, "learning_rate": 4.6757053865884944e-05, "loss": 0.4909, "step": 481 }, { "epoch": 0.47652001977261493, "grad_norm": 0.8831979198261652, "learning_rate": 4.673873213631367e-05, "loss": 0.5014, "step": 482 }, { "epoch": 0.47750865051903113, "grad_norm": 3.589309285899447, "learning_rate": 4.67204104067424e-05, "loss": 0.7209, "step": 483 }, { "epoch": 0.4784972812654474, "grad_norm": 3.6265710592064315, "learning_rate": 4.670208867717113e-05, "loss": 0.5497, "step": 484 }, { "epoch": 0.4794859120118636, "grad_norm": 1.2208629586326059, "learning_rate": 4.668376694759986e-05, "loss": 0.5053, "step": 485 }, { "epoch": 0.4804745427582798, "grad_norm": 2.150086926541622, "learning_rate": 4.666544521802858e-05, "loss": 0.5339, "step": 486 }, { "epoch": 0.481463173504696, "grad_norm": 1.328122920055381, "learning_rate": 4.664712348845731e-05, "loss": 0.5341, "step": 487 }, { "epoch": 0.4824518042511122, "grad_norm": 1.0286778235641112, "learning_rate": 4.662880175888604e-05, "loss": 0.5744, "step": 488 }, { "epoch": 0.4834404349975284, "grad_norm": 2.8922180092825953, "learning_rate": 4.661048002931477e-05, "loss": 0.571, "step": 489 }, { "epoch": 0.4844290657439446, "grad_norm": 15.822491081434347, "learning_rate": 4.65921582997435e-05, "loss": 0.773, "step": 490 }, { "epoch": 0.4854176964903609, "grad_norm": 1.7229896402672253, "learning_rate": 4.657383657017222e-05, "loss": 0.5819, "step": 491 }, { "epoch": 0.48640632723677707, "grad_norm": 1.2343899256750996, "learning_rate": 4.6555514840600955e-05, "loss": 0.5729, "step": 492 }, { "epoch": 0.48739495798319327, "grad_norm": 1.0137447789945886, "learning_rate": 4.653719311102968e-05, "loss": 0.5475, "step": 493 }, { "epoch": 0.48838358872960946, "grad_norm": 1.2182934817867745, "learning_rate": 4.6518871381458415e-05, "loss": 0.5082, "step": 494 }, { "epoch": 0.4893722194760257, "grad_norm": 1.0150225263336468, "learning_rate": 4.650054965188714e-05, "loss": 0.4768, "step": 495 }, { "epoch": 0.4903608502224419, "grad_norm": 0.9148424559760188, "learning_rate": 4.648222792231587e-05, "loss": 0.5214, "step": 496 }, { "epoch": 0.4913494809688581, "grad_norm": 0.8637302886923566, "learning_rate": 4.64639061927446e-05, "loss": 0.4658, "step": 497 }, { "epoch": 0.49233811171527436, "grad_norm": 0.9635101045512545, "learning_rate": 4.644558446317333e-05, "loss": 0.5273, "step": 498 }, { "epoch": 0.49332674246169056, "grad_norm": 0.7011766560309427, "learning_rate": 4.6427262733602054e-05, "loss": 0.5998, "step": 499 }, { "epoch": 0.49431537320810676, "grad_norm": 0.8815888260096139, "learning_rate": 4.640894100403078e-05, "loss": 0.5537, "step": 500 }, { "epoch": 0.495304003954523, "grad_norm": 2.4626766225831, "learning_rate": 4.639061927445951e-05, "loss": 0.5782, "step": 501 }, { "epoch": 0.4962926347009392, "grad_norm": 0.7334316074666641, "learning_rate": 4.637229754488824e-05, "loss": 0.5242, "step": 502 }, { "epoch": 0.4972812654473554, "grad_norm": 0.7337131151339925, "learning_rate": 4.6353975815316966e-05, "loss": 0.5234, "step": 503 }, { "epoch": 0.4982698961937716, "grad_norm": 0.6725658646581332, "learning_rate": 4.633565408574569e-05, "loss": 0.5988, "step": 504 }, { "epoch": 0.49925852694018785, "grad_norm": 0.7359172023871537, "learning_rate": 4.6317332356174426e-05, "loss": 0.5406, "step": 505 }, { "epoch": 0.5002471576866041, "grad_norm": 0.572006511397366, "learning_rate": 4.629901062660315e-05, "loss": 0.559, "step": 506 }, { "epoch": 0.5012357884330203, "grad_norm": 0.704459053728086, "learning_rate": 4.6280688897031885e-05, "loss": 0.5355, "step": 507 }, { "epoch": 0.5022244191794365, "grad_norm": 0.5318163749996885, "learning_rate": 4.626236716746061e-05, "loss": 0.4787, "step": 508 }, { "epoch": 0.5032130499258527, "grad_norm": 3.9438529197801673, "learning_rate": 4.624404543788934e-05, "loss": 0.6349, "step": 509 }, { "epoch": 0.5042016806722689, "grad_norm": 1.0548773523753974, "learning_rate": 4.622572370831807e-05, "loss": 0.5232, "step": 510 }, { "epoch": 0.5051903114186851, "grad_norm": 0.7366038224004618, "learning_rate": 4.62074019787468e-05, "loss": 0.507, "step": 511 }, { "epoch": 0.5061789421651013, "grad_norm": 0.701653385438351, "learning_rate": 4.6189080249175524e-05, "loss": 0.5094, "step": 512 }, { "epoch": 0.5071675729115176, "grad_norm": 3.3109434746110873, "learning_rate": 4.617075851960425e-05, "loss": 0.5684, "step": 513 }, { "epoch": 0.5081562036579338, "grad_norm": 1.877063634394585, "learning_rate": 4.6152436790032984e-05, "loss": 0.4779, "step": 514 }, { "epoch": 0.50914483440435, "grad_norm": 0.8673399879449802, "learning_rate": 4.613411506046171e-05, "loss": 0.5435, "step": 515 }, { "epoch": 0.5101334651507662, "grad_norm": 0.930978176135324, "learning_rate": 4.6115793330890437e-05, "loss": 0.5067, "step": 516 }, { "epoch": 0.5111220958971824, "grad_norm": 0.623152066888893, "learning_rate": 4.609747160131916e-05, "loss": 0.4967, "step": 517 }, { "epoch": 0.5121107266435986, "grad_norm": 1.2007785802381739, "learning_rate": 4.6079149871747896e-05, "loss": 0.6132, "step": 518 }, { "epoch": 0.5130993573900148, "grad_norm": 0.6308715569639965, "learning_rate": 4.606082814217663e-05, "loss": 0.5564, "step": 519 }, { "epoch": 0.5140879881364311, "grad_norm": 1.0180576512965347, "learning_rate": 4.604250641260535e-05, "loss": 0.5091, "step": 520 }, { "epoch": 0.5150766188828473, "grad_norm": 0.7470505377129495, "learning_rate": 4.602418468303408e-05, "loss": 0.5267, "step": 521 }, { "epoch": 0.5160652496292635, "grad_norm": 0.8338613392274956, "learning_rate": 4.600586295346281e-05, "loss": 0.5217, "step": 522 }, { "epoch": 0.5170538803756797, "grad_norm": 0.9348463799331983, "learning_rate": 4.598754122389154e-05, "loss": 0.5081, "step": 523 }, { "epoch": 0.5180425111220959, "grad_norm": 0.6431027917760584, "learning_rate": 4.596921949432026e-05, "loss": 0.4968, "step": 524 }, { "epoch": 0.5190311418685121, "grad_norm": 0.5600224627776736, "learning_rate": 4.5950897764748995e-05, "loss": 0.5173, "step": 525 }, { "epoch": 0.5200197726149284, "grad_norm": 0.7346275389418706, "learning_rate": 4.593257603517772e-05, "loss": 0.544, "step": 526 }, { "epoch": 0.5210084033613446, "grad_norm": 0.9248031780130603, "learning_rate": 4.5914254305606454e-05, "loss": 0.5873, "step": 527 }, { "epoch": 0.5219970341077608, "grad_norm": 1.2558048138626847, "learning_rate": 4.589593257603518e-05, "loss": 0.5104, "step": 528 }, { "epoch": 0.522985664854177, "grad_norm": 1.1178847557731268, "learning_rate": 4.587761084646391e-05, "loss": 0.4637, "step": 529 }, { "epoch": 0.5239742956005932, "grad_norm": 0.7775657052535542, "learning_rate": 4.5859289116892634e-05, "loss": 0.5091, "step": 530 }, { "epoch": 0.5249629263470094, "grad_norm": 1.2126744196506467, "learning_rate": 4.584096738732137e-05, "loss": 0.4948, "step": 531 }, { "epoch": 0.5259515570934256, "grad_norm": 0.7987670497029579, "learning_rate": 4.582264565775009e-05, "loss": 0.5287, "step": 532 }, { "epoch": 0.5269401878398419, "grad_norm": 0.6304141948634191, "learning_rate": 4.580432392817882e-05, "loss": 0.5248, "step": 533 }, { "epoch": 0.5279288185862581, "grad_norm": 0.6704881274328388, "learning_rate": 4.578600219860755e-05, "loss": 0.5655, "step": 534 }, { "epoch": 0.5289174493326743, "grad_norm": 0.6360094554421949, "learning_rate": 4.576768046903628e-05, "loss": 0.5253, "step": 535 }, { "epoch": 0.5299060800790905, "grad_norm": 0.6052854917298444, "learning_rate": 4.574935873946501e-05, "loss": 0.5861, "step": 536 }, { "epoch": 0.5308947108255067, "grad_norm": 0.5636452740308454, "learning_rate": 4.573103700989373e-05, "loss": 0.5458, "step": 537 }, { "epoch": 0.5318833415719229, "grad_norm": 0.6045535259068093, "learning_rate": 4.5712715280322465e-05, "loss": 0.5122, "step": 538 }, { "epoch": 0.532871972318339, "grad_norm": 0.48888739388896557, "learning_rate": 4.569439355075119e-05, "loss": 0.5682, "step": 539 }, { "epoch": 0.5338606030647554, "grad_norm": 0.6700670719741539, "learning_rate": 4.5676071821179925e-05, "loss": 0.5217, "step": 540 }, { "epoch": 0.5348492338111716, "grad_norm": 0.5638479162869483, "learning_rate": 4.5657750091608644e-05, "loss": 0.5376, "step": 541 }, { "epoch": 0.5358378645575878, "grad_norm": 0.552884957182551, "learning_rate": 4.563942836203738e-05, "loss": 0.5278, "step": 542 }, { "epoch": 0.536826495304004, "grad_norm": 0.5797579473114043, "learning_rate": 4.562110663246611e-05, "loss": 0.537, "step": 543 }, { "epoch": 0.5378151260504201, "grad_norm": 0.5593244478532714, "learning_rate": 4.560278490289484e-05, "loss": 0.5049, "step": 544 }, { "epoch": 0.5388037567968363, "grad_norm": 0.5943422088333956, "learning_rate": 4.5584463173323564e-05, "loss": 0.5468, "step": 545 }, { "epoch": 0.5397923875432526, "grad_norm": 0.6281746956915036, "learning_rate": 4.556614144375229e-05, "loss": 0.5514, "step": 546 }, { "epoch": 0.5407810182896688, "grad_norm": 0.4933590047601997, "learning_rate": 4.554781971418102e-05, "loss": 0.5151, "step": 547 }, { "epoch": 0.541769649036085, "grad_norm": 2.761677801068474, "learning_rate": 4.552949798460975e-05, "loss": 0.4919, "step": 548 }, { "epoch": 0.5427582797825012, "grad_norm": 0.9719762046748948, "learning_rate": 4.5511176255038476e-05, "loss": 0.4894, "step": 549 }, { "epoch": 0.5437469105289174, "grad_norm": 0.5662456292961024, "learning_rate": 4.54928545254672e-05, "loss": 0.4809, "step": 550 }, { "epoch": 0.5447355412753336, "grad_norm": 0.706709101413056, "learning_rate": 4.5474532795895936e-05, "loss": 0.5588, "step": 551 }, { "epoch": 0.5457241720217498, "grad_norm": 0.7156919473131014, "learning_rate": 4.545621106632466e-05, "loss": 0.5157, "step": 552 }, { "epoch": 0.5467128027681661, "grad_norm": 0.6036065236344407, "learning_rate": 4.5437889336753395e-05, "loss": 0.5133, "step": 553 }, { "epoch": 0.5477014335145823, "grad_norm": 0.7044000500260265, "learning_rate": 4.5419567607182115e-05, "loss": 0.5501, "step": 554 }, { "epoch": 0.5486900642609985, "grad_norm": 0.6182877384612294, "learning_rate": 4.540124587761085e-05, "loss": 0.5067, "step": 555 }, { "epoch": 0.5496786950074147, "grad_norm": 2.0788487564246636, "learning_rate": 4.538292414803958e-05, "loss": 0.5297, "step": 556 }, { "epoch": 0.5506673257538309, "grad_norm": 0.662316393460378, "learning_rate": 4.536460241846831e-05, "loss": 0.5077, "step": 557 }, { "epoch": 0.5516559565002471, "grad_norm": 0.7143055727254721, "learning_rate": 4.5346280688897034e-05, "loss": 0.5507, "step": 558 }, { "epoch": 0.5526445872466633, "grad_norm": 0.4692225956856845, "learning_rate": 4.532795895932576e-05, "loss": 0.5077, "step": 559 }, { "epoch": 0.5536332179930796, "grad_norm": 0.6556094248380152, "learning_rate": 4.5309637229754494e-05, "loss": 0.5483, "step": 560 }, { "epoch": 0.5546218487394958, "grad_norm": 0.5079816686651779, "learning_rate": 4.529131550018322e-05, "loss": 0.538, "step": 561 }, { "epoch": 0.555610479485912, "grad_norm": 0.7150133830530668, "learning_rate": 4.527299377061195e-05, "loss": 0.5132, "step": 562 }, { "epoch": 0.5565991102323282, "grad_norm": 0.44210371117001124, "learning_rate": 4.525467204104067e-05, "loss": 0.5225, "step": 563 }, { "epoch": 0.5575877409787444, "grad_norm": 0.6594280272816959, "learning_rate": 4.5236350311469406e-05, "loss": 0.5057, "step": 564 }, { "epoch": 0.5585763717251606, "grad_norm": 2.2223909735124723, "learning_rate": 4.521802858189813e-05, "loss": 0.5586, "step": 565 }, { "epoch": 0.5595650024715768, "grad_norm": 1.0813305532136213, "learning_rate": 4.519970685232686e-05, "loss": 0.4975, "step": 566 }, { "epoch": 0.5605536332179931, "grad_norm": 0.7031140135360182, "learning_rate": 4.518138512275559e-05, "loss": 0.5044, "step": 567 }, { "epoch": 0.5615422639644093, "grad_norm": 0.6595460596796324, "learning_rate": 4.516306339318432e-05, "loss": 0.4891, "step": 568 }, { "epoch": 0.5625308947108255, "grad_norm": 0.5856122636762489, "learning_rate": 4.514474166361305e-05, "loss": 0.5295, "step": 569 }, { "epoch": 0.5635195254572417, "grad_norm": 0.5596907719426581, "learning_rate": 4.512641993404177e-05, "loss": 0.4728, "step": 570 }, { "epoch": 0.5645081562036579, "grad_norm": 0.7593489935886681, "learning_rate": 4.5108098204470505e-05, "loss": 0.4959, "step": 571 }, { "epoch": 0.5654967869500741, "grad_norm": 0.48378178829248153, "learning_rate": 4.508977647489923e-05, "loss": 0.5101, "step": 572 }, { "epoch": 0.5664854176964904, "grad_norm": 0.7291911555971496, "learning_rate": 4.5071454745327964e-05, "loss": 0.5657, "step": 573 }, { "epoch": 0.5674740484429066, "grad_norm": 12.678113347844269, "learning_rate": 4.505313301575669e-05, "loss": 0.7829, "step": 574 }, { "epoch": 0.5684626791893228, "grad_norm": 1.1933332626422042, "learning_rate": 4.503481128618542e-05, "loss": 0.4989, "step": 575 }, { "epoch": 0.569451309935739, "grad_norm": 0.5858502371740975, "learning_rate": 4.5016489556614144e-05, "loss": 0.4801, "step": 576 }, { "epoch": 0.5704399406821552, "grad_norm": 0.8836483908080288, "learning_rate": 4.499816782704288e-05, "loss": 0.5304, "step": 577 }, { "epoch": 0.5714285714285714, "grad_norm": 0.6381131374611547, "learning_rate": 4.49798460974716e-05, "loss": 0.4962, "step": 578 }, { "epoch": 0.5724172021749876, "grad_norm": 0.694148836797196, "learning_rate": 4.496152436790033e-05, "loss": 0.5038, "step": 579 }, { "epoch": 0.5734058329214039, "grad_norm": 0.8507242512011981, "learning_rate": 4.494320263832906e-05, "loss": 0.5348, "step": 580 }, { "epoch": 0.5743944636678201, "grad_norm": 0.542359071937966, "learning_rate": 4.492488090875779e-05, "loss": 0.5081, "step": 581 }, { "epoch": 0.5753830944142363, "grad_norm": 1.0479106038523704, "learning_rate": 4.490655917918652e-05, "loss": 0.5442, "step": 582 }, { "epoch": 0.5763717251606525, "grad_norm": 0.5904221518138637, "learning_rate": 4.488823744961524e-05, "loss": 0.5688, "step": 583 }, { "epoch": 0.5773603559070687, "grad_norm": 0.9848943299146304, "learning_rate": 4.4869915720043975e-05, "loss": 0.5266, "step": 584 }, { "epoch": 0.5783489866534849, "grad_norm": 0.5402146123339285, "learning_rate": 4.48515939904727e-05, "loss": 0.4911, "step": 585 }, { "epoch": 0.5793376173999011, "grad_norm": 0.8399243557581442, "learning_rate": 4.4833272260901435e-05, "loss": 0.5299, "step": 586 }, { "epoch": 0.5803262481463174, "grad_norm": 0.9734946892237011, "learning_rate": 4.4814950531330155e-05, "loss": 0.5472, "step": 587 }, { "epoch": 0.5813148788927336, "grad_norm": 0.7602072590138316, "learning_rate": 4.479662880175889e-05, "loss": 0.5497, "step": 588 }, { "epoch": 0.5823035096391498, "grad_norm": 0.7201868291423408, "learning_rate": 4.4778307072187614e-05, "loss": 0.5337, "step": 589 }, { "epoch": 0.583292140385566, "grad_norm": 0.649237043452001, "learning_rate": 4.475998534261635e-05, "loss": 0.5349, "step": 590 }, { "epoch": 0.5842807711319822, "grad_norm": 4.78855872490126, "learning_rate": 4.4741663613045074e-05, "loss": 0.5989, "step": 591 }, { "epoch": 0.5852694018783984, "grad_norm": 0.9285021265990379, "learning_rate": 4.47233418834738e-05, "loss": 0.5266, "step": 592 }, { "epoch": 0.5862580326248147, "grad_norm": 0.7995364512746768, "learning_rate": 4.4705020153902533e-05, "loss": 0.5433, "step": 593 }, { "epoch": 0.5872466633712309, "grad_norm": 1.5912058330908954, "learning_rate": 4.468669842433126e-05, "loss": 0.5061, "step": 594 }, { "epoch": 0.5882352941176471, "grad_norm": 0.643362851050352, "learning_rate": 4.4668376694759986e-05, "loss": 0.5331, "step": 595 }, { "epoch": 0.5892239248640633, "grad_norm": 0.6201871879796232, "learning_rate": 4.465005496518871e-05, "loss": 0.4574, "step": 596 }, { "epoch": 0.5902125556104795, "grad_norm": 0.5602195565043749, "learning_rate": 4.4631733235617446e-05, "loss": 0.5314, "step": 597 }, { "epoch": 0.5912011863568957, "grad_norm": 0.5800014525670484, "learning_rate": 4.461341150604617e-05, "loss": 0.5139, "step": 598 }, { "epoch": 0.5921898171033119, "grad_norm": 0.5389804197360742, "learning_rate": 4.45950897764749e-05, "loss": 0.4639, "step": 599 }, { "epoch": 0.5931784478497282, "grad_norm": 0.6009502163466464, "learning_rate": 4.4576768046903625e-05, "loss": 0.5421, "step": 600 }, { "epoch": 0.5941670785961444, "grad_norm": 0.5823232850495574, "learning_rate": 4.455844631733236e-05, "loss": 0.5845, "step": 601 }, { "epoch": 0.5951557093425606, "grad_norm": 0.6331978704280957, "learning_rate": 4.454012458776109e-05, "loss": 0.4376, "step": 602 }, { "epoch": 0.5961443400889768, "grad_norm": 0.5676577193708788, "learning_rate": 4.452180285818982e-05, "loss": 0.4773, "step": 603 }, { "epoch": 0.597132970835393, "grad_norm": 0.5515245354211675, "learning_rate": 4.4503481128618544e-05, "loss": 0.5291, "step": 604 }, { "epoch": 0.5981216015818092, "grad_norm": 1.307907407882031, "learning_rate": 4.448515939904727e-05, "loss": 0.4837, "step": 605 }, { "epoch": 0.5991102323282254, "grad_norm": 0.5321419586253535, "learning_rate": 4.4466837669476004e-05, "loss": 0.5371, "step": 606 }, { "epoch": 0.6000988630746417, "grad_norm": 0.6658924909776882, "learning_rate": 4.444851593990473e-05, "loss": 0.5513, "step": 607 }, { "epoch": 0.6010874938210579, "grad_norm": 0.7488591498529322, "learning_rate": 4.443019421033346e-05, "loss": 0.5501, "step": 608 }, { "epoch": 0.6020761245674741, "grad_norm": 0.4989300670102643, "learning_rate": 4.441187248076218e-05, "loss": 0.5155, "step": 609 }, { "epoch": 0.6030647553138903, "grad_norm": 0.592956507030979, "learning_rate": 4.4393550751190916e-05, "loss": 0.5721, "step": 610 }, { "epoch": 0.6040533860603065, "grad_norm": 0.57930555577889, "learning_rate": 4.437522902161964e-05, "loss": 0.566, "step": 611 }, { "epoch": 0.6050420168067226, "grad_norm": 0.5958639469873257, "learning_rate": 4.435690729204837e-05, "loss": 0.5416, "step": 612 }, { "epoch": 0.606030647553139, "grad_norm": 0.5658514410790444, "learning_rate": 4.4338585562477096e-05, "loss": 0.5111, "step": 613 }, { "epoch": 0.6070192782995552, "grad_norm": 0.4360677158948701, "learning_rate": 4.432026383290583e-05, "loss": 0.4631, "step": 614 }, { "epoch": 0.6080079090459714, "grad_norm": 0.673042467529231, "learning_rate": 4.430194210333456e-05, "loss": 0.5111, "step": 615 }, { "epoch": 0.6089965397923875, "grad_norm": 0.5325269067461875, "learning_rate": 4.428362037376328e-05, "loss": 0.5491, "step": 616 }, { "epoch": 0.6099851705388037, "grad_norm": 0.5191732120409539, "learning_rate": 4.4265298644192015e-05, "loss": 0.5039, "step": 617 }, { "epoch": 0.6109738012852199, "grad_norm": 0.7777992654613309, "learning_rate": 4.424697691462074e-05, "loss": 0.5007, "step": 618 }, { "epoch": 0.6119624320316361, "grad_norm": 0.5553270219417682, "learning_rate": 4.4228655185049474e-05, "loss": 0.5128, "step": 619 }, { "epoch": 0.6129510627780524, "grad_norm": 0.5381845615724501, "learning_rate": 4.42103334554782e-05, "loss": 0.515, "step": 620 }, { "epoch": 0.6139396935244686, "grad_norm": 0.46375821394953454, "learning_rate": 4.419201172590693e-05, "loss": 0.4936, "step": 621 }, { "epoch": 0.6149283242708848, "grad_norm": 0.5922237999968413, "learning_rate": 4.4173689996335654e-05, "loss": 0.5263, "step": 622 }, { "epoch": 0.615916955017301, "grad_norm": 0.4744481456646715, "learning_rate": 4.415536826676439e-05, "loss": 0.4784, "step": 623 }, { "epoch": 0.6169055857637172, "grad_norm": 0.9887800404042643, "learning_rate": 4.413704653719311e-05, "loss": 0.559, "step": 624 }, { "epoch": 0.6178942165101334, "grad_norm": 0.7341047813395088, "learning_rate": 4.411872480762184e-05, "loss": 0.5152, "step": 625 }, { "epoch": 0.6188828472565496, "grad_norm": 0.5630009486688937, "learning_rate": 4.410040307805057e-05, "loss": 0.5004, "step": 626 }, { "epoch": 0.6198714780029659, "grad_norm": 0.6318238494681988, "learning_rate": 4.40820813484793e-05, "loss": 0.5316, "step": 627 }, { "epoch": 0.6208601087493821, "grad_norm": 0.5579146615666924, "learning_rate": 4.406375961890803e-05, "loss": 0.5193, "step": 628 }, { "epoch": 0.6218487394957983, "grad_norm": 0.7706028403912982, "learning_rate": 4.404543788933675e-05, "loss": 0.4841, "step": 629 }, { "epoch": 0.6228373702422145, "grad_norm": 0.49904941483020226, "learning_rate": 4.4027116159765485e-05, "loss": 0.4999, "step": 630 }, { "epoch": 0.6238260009886307, "grad_norm": 0.5924370964483197, "learning_rate": 4.400879443019421e-05, "loss": 0.4814, "step": 631 }, { "epoch": 0.6248146317350469, "grad_norm": 0.5783763036472963, "learning_rate": 4.3990472700622945e-05, "loss": 0.5149, "step": 632 }, { "epoch": 0.6258032624814631, "grad_norm": 0.4650970031115923, "learning_rate": 4.3972150971051665e-05, "loss": 0.5755, "step": 633 }, { "epoch": 0.6267918932278794, "grad_norm": 0.6752162485448957, "learning_rate": 4.39538292414804e-05, "loss": 0.5202, "step": 634 }, { "epoch": 0.6277805239742956, "grad_norm": 0.5777551920592214, "learning_rate": 4.3935507511909124e-05, "loss": 0.5524, "step": 635 }, { "epoch": 0.6287691547207118, "grad_norm": 0.8197722076033177, "learning_rate": 4.391718578233786e-05, "loss": 0.547, "step": 636 }, { "epoch": 0.629757785467128, "grad_norm": 0.55509276538108, "learning_rate": 4.3898864052766584e-05, "loss": 0.485, "step": 637 }, { "epoch": 0.6307464162135442, "grad_norm": 0.7574527928927102, "learning_rate": 4.388054232319531e-05, "loss": 0.5381, "step": 638 }, { "epoch": 0.6317350469599604, "grad_norm": 0.5112845835454353, "learning_rate": 4.3862220593624043e-05, "loss": 0.5237, "step": 639 }, { "epoch": 0.6327236777063767, "grad_norm": 0.6443813567657862, "learning_rate": 4.384389886405277e-05, "loss": 0.4809, "step": 640 }, { "epoch": 0.6337123084527929, "grad_norm": 0.648128225196569, "learning_rate": 4.3825577134481496e-05, "loss": 0.4948, "step": 641 }, { "epoch": 0.6347009391992091, "grad_norm": 0.6543498156745187, "learning_rate": 4.380725540491022e-05, "loss": 0.4925, "step": 642 }, { "epoch": 0.6356895699456253, "grad_norm": 0.7048319246643159, "learning_rate": 4.3788933675338956e-05, "loss": 0.5039, "step": 643 }, { "epoch": 0.6366782006920415, "grad_norm": 0.5841973329297562, "learning_rate": 4.377061194576768e-05, "loss": 0.5139, "step": 644 }, { "epoch": 0.6376668314384577, "grad_norm": 0.6007965850331226, "learning_rate": 4.375229021619641e-05, "loss": 0.5261, "step": 645 }, { "epoch": 0.6386554621848739, "grad_norm": 0.44023196537390197, "learning_rate": 4.3733968486625135e-05, "loss": 0.5519, "step": 646 }, { "epoch": 0.6396440929312902, "grad_norm": 0.5397681998748973, "learning_rate": 4.371564675705387e-05, "loss": 0.4872, "step": 647 }, { "epoch": 0.6406327236777064, "grad_norm": 0.43501628982048585, "learning_rate": 4.3697325027482595e-05, "loss": 0.5303, "step": 648 }, { "epoch": 0.6416213544241226, "grad_norm": 0.6441159995945562, "learning_rate": 4.367900329791133e-05, "loss": 0.5648, "step": 649 }, { "epoch": 0.6426099851705388, "grad_norm": 0.46292382074282434, "learning_rate": 4.3660681568340054e-05, "loss": 0.4928, "step": 650 }, { "epoch": 0.643598615916955, "grad_norm": 0.5650286385501193, "learning_rate": 4.364235983876878e-05, "loss": 0.5259, "step": 651 }, { "epoch": 0.6445872466633712, "grad_norm": 0.6020758058704381, "learning_rate": 4.3624038109197514e-05, "loss": 0.4893, "step": 652 }, { "epoch": 0.6455758774097874, "grad_norm": 0.5420047946803517, "learning_rate": 4.360571637962624e-05, "loss": 0.5157, "step": 653 }, { "epoch": 0.6465645081562037, "grad_norm": 0.5319044160367246, "learning_rate": 4.358739465005497e-05, "loss": 0.4442, "step": 654 }, { "epoch": 0.6475531389026199, "grad_norm": 0.6337001516487323, "learning_rate": 4.356907292048369e-05, "loss": 0.5642, "step": 655 }, { "epoch": 0.6485417696490361, "grad_norm": 3.089647222677914, "learning_rate": 4.3550751190912426e-05, "loss": 0.5822, "step": 656 }, { "epoch": 0.6495304003954523, "grad_norm": 1.077249510260834, "learning_rate": 4.353242946134115e-05, "loss": 0.4718, "step": 657 }, { "epoch": 0.6505190311418685, "grad_norm": 0.5398952412185243, "learning_rate": 4.351410773176988e-05, "loss": 0.4875, "step": 658 }, { "epoch": 0.6515076618882847, "grad_norm": 1.420997472854509, "learning_rate": 4.3495786002198606e-05, "loss": 0.4793, "step": 659 }, { "epoch": 0.652496292634701, "grad_norm": 0.8170021648888403, "learning_rate": 4.347746427262734e-05, "loss": 0.492, "step": 660 }, { "epoch": 0.6534849233811172, "grad_norm": 0.8372618593431737, "learning_rate": 4.3459142543056065e-05, "loss": 0.4994, "step": 661 }, { "epoch": 0.6544735541275334, "grad_norm": 0.5951025617104139, "learning_rate": 4.344082081348479e-05, "loss": 0.5364, "step": 662 }, { "epoch": 0.6554621848739496, "grad_norm": 0.9453051267871362, "learning_rate": 4.3422499083913525e-05, "loss": 0.4848, "step": 663 }, { "epoch": 0.6564508156203658, "grad_norm": 0.5159708375649645, "learning_rate": 4.340417735434225e-05, "loss": 0.4852, "step": 664 }, { "epoch": 0.657439446366782, "grad_norm": 0.7342093692323344, "learning_rate": 4.3385855624770985e-05, "loss": 0.5436, "step": 665 }, { "epoch": 0.6584280771131982, "grad_norm": 0.6272644741815578, "learning_rate": 4.336753389519971e-05, "loss": 0.5006, "step": 666 }, { "epoch": 0.6594167078596145, "grad_norm": 0.6071505224621178, "learning_rate": 4.334921216562844e-05, "loss": 0.4705, "step": 667 }, { "epoch": 0.6604053386060307, "grad_norm": 0.6709192010699455, "learning_rate": 4.3330890436057164e-05, "loss": 0.5719, "step": 668 }, { "epoch": 0.6613939693524469, "grad_norm": 0.5305901328371826, "learning_rate": 4.33125687064859e-05, "loss": 0.4442, "step": 669 }, { "epoch": 0.6623826000988631, "grad_norm": 0.5309034575037989, "learning_rate": 4.3294246976914623e-05, "loss": 0.482, "step": 670 }, { "epoch": 0.6633712308452793, "grad_norm": 0.5348600980090499, "learning_rate": 4.327592524734335e-05, "loss": 0.4789, "step": 671 }, { "epoch": 0.6643598615916955, "grad_norm": 0.5932298411164915, "learning_rate": 4.3257603517772076e-05, "loss": 0.4821, "step": 672 }, { "epoch": 0.6653484923381117, "grad_norm": 0.4670153459943543, "learning_rate": 4.323928178820081e-05, "loss": 0.5308, "step": 673 }, { "epoch": 0.666337123084528, "grad_norm": 0.464933092712643, "learning_rate": 4.322096005862954e-05, "loss": 0.4885, "step": 674 }, { "epoch": 0.6673257538309442, "grad_norm": 0.497874952939138, "learning_rate": 4.320263832905826e-05, "loss": 0.5086, "step": 675 }, { "epoch": 0.6683143845773604, "grad_norm": 0.47619173747527355, "learning_rate": 4.3184316599486995e-05, "loss": 0.5365, "step": 676 }, { "epoch": 0.6693030153237766, "grad_norm": 0.5080542712982548, "learning_rate": 4.316599486991572e-05, "loss": 0.5222, "step": 677 }, { "epoch": 0.6702916460701928, "grad_norm": 0.5741524203751168, "learning_rate": 4.3147673140344455e-05, "loss": 0.4972, "step": 678 }, { "epoch": 0.671280276816609, "grad_norm": 0.5046484714368985, "learning_rate": 4.3129351410773175e-05, "loss": 0.4486, "step": 679 }, { "epoch": 0.6722689075630253, "grad_norm": 0.572953739650896, "learning_rate": 4.311102968120191e-05, "loss": 0.5118, "step": 680 }, { "epoch": 0.6732575383094415, "grad_norm": 0.4758197032345209, "learning_rate": 4.3092707951630634e-05, "loss": 0.5624, "step": 681 }, { "epoch": 0.6742461690558577, "grad_norm": 0.5703661175746134, "learning_rate": 4.307438622205937e-05, "loss": 0.5021, "step": 682 }, { "epoch": 0.6752347998022739, "grad_norm": 0.513215525511048, "learning_rate": 4.3056064492488094e-05, "loss": 0.5021, "step": 683 }, { "epoch": 0.67622343054869, "grad_norm": 2.7518980327017317, "learning_rate": 4.303774276291682e-05, "loss": 0.4697, "step": 684 }, { "epoch": 0.6772120612951062, "grad_norm": 0.7944324516136357, "learning_rate": 4.301942103334555e-05, "loss": 0.5676, "step": 685 }, { "epoch": 0.6782006920415224, "grad_norm": 0.5977857051688232, "learning_rate": 4.300109930377428e-05, "loss": 0.5258, "step": 686 }, { "epoch": 0.6791893227879388, "grad_norm": 0.561294175545027, "learning_rate": 4.2982777574203006e-05, "loss": 0.4852, "step": 687 }, { "epoch": 0.680177953534355, "grad_norm": 0.6885593289289706, "learning_rate": 4.296445584463173e-05, "loss": 0.5416, "step": 688 }, { "epoch": 0.6811665842807711, "grad_norm": 0.47980928583436866, "learning_rate": 4.2946134115060466e-05, "loss": 0.483, "step": 689 }, { "epoch": 0.6821552150271873, "grad_norm": 0.6700531279997685, "learning_rate": 4.292781238548919e-05, "loss": 0.5596, "step": 690 }, { "epoch": 0.6831438457736035, "grad_norm": 0.7000939964251864, "learning_rate": 4.290949065591792e-05, "loss": 0.5706, "step": 691 }, { "epoch": 0.6841324765200197, "grad_norm": 0.7232545368791979, "learning_rate": 4.2891168926346645e-05, "loss": 0.4989, "step": 692 }, { "epoch": 0.6851211072664359, "grad_norm": 0.6734109645088125, "learning_rate": 4.287284719677538e-05, "loss": 0.5452, "step": 693 }, { "epoch": 0.6861097380128522, "grad_norm": 0.898340481785558, "learning_rate": 4.2854525467204105e-05, "loss": 0.5039, "step": 694 }, { "epoch": 0.6870983687592684, "grad_norm": 0.6400976104367452, "learning_rate": 4.283620373763284e-05, "loss": 0.5096, "step": 695 }, { "epoch": 0.6880869995056846, "grad_norm": 0.7330862839756865, "learning_rate": 4.281788200806156e-05, "loss": 0.4855, "step": 696 }, { "epoch": 0.6890756302521008, "grad_norm": 0.6760571434307273, "learning_rate": 4.279956027849029e-05, "loss": 0.4857, "step": 697 }, { "epoch": 0.690064260998517, "grad_norm": 1.298074468220026, "learning_rate": 4.2781238548919024e-05, "loss": 0.514, "step": 698 }, { "epoch": 0.6910528917449332, "grad_norm": 0.7996821796590299, "learning_rate": 4.276291681934775e-05, "loss": 0.4865, "step": 699 }, { "epoch": 0.6920415224913494, "grad_norm": 1.0399718629763757, "learning_rate": 4.274459508977648e-05, "loss": 0.5236, "step": 700 }, { "epoch": 0.6930301532377657, "grad_norm": 0.6220856151288175, "learning_rate": 4.27262733602052e-05, "loss": 0.4586, "step": 701 }, { "epoch": 0.6940187839841819, "grad_norm": 0.6352603627556183, "learning_rate": 4.2707951630633937e-05, "loss": 0.5101, "step": 702 }, { "epoch": 0.6950074147305981, "grad_norm": 0.6078916139780475, "learning_rate": 4.268962990106266e-05, "loss": 0.4946, "step": 703 }, { "epoch": 0.6959960454770143, "grad_norm": 0.560249982201983, "learning_rate": 4.267130817149139e-05, "loss": 0.4937, "step": 704 }, { "epoch": 0.6969846762234305, "grad_norm": 0.7296214310456001, "learning_rate": 4.2652986441920116e-05, "loss": 0.5543, "step": 705 }, { "epoch": 0.6979733069698467, "grad_norm": 0.5375934811724911, "learning_rate": 4.263466471234885e-05, "loss": 0.5199, "step": 706 }, { "epoch": 0.698961937716263, "grad_norm": 0.6627354103029831, "learning_rate": 4.2616342982777575e-05, "loss": 0.5113, "step": 707 }, { "epoch": 0.6999505684626792, "grad_norm": 0.5041317977617901, "learning_rate": 4.25980212532063e-05, "loss": 0.4941, "step": 708 }, { "epoch": 0.7009391992090954, "grad_norm": 0.6488545110538267, "learning_rate": 4.257969952363503e-05, "loss": 0.5389, "step": 709 }, { "epoch": 0.7019278299555116, "grad_norm": 0.5358610710531121, "learning_rate": 4.256137779406376e-05, "loss": 0.4681, "step": 710 }, { "epoch": 0.7029164607019278, "grad_norm": 0.5422155806475425, "learning_rate": 4.2543056064492495e-05, "loss": 0.5312, "step": 711 }, { "epoch": 0.703905091448344, "grad_norm": 0.5108820913797038, "learning_rate": 4.252473433492122e-05, "loss": 0.4757, "step": 712 }, { "epoch": 0.7048937221947602, "grad_norm": 0.5724194207682355, "learning_rate": 4.250641260534995e-05, "loss": 0.4863, "step": 713 }, { "epoch": 0.7058823529411765, "grad_norm": 2.7248273954755113, "learning_rate": 4.2488090875778674e-05, "loss": 0.5735, "step": 714 }, { "epoch": 0.7068709836875927, "grad_norm": 0.741648878607506, "learning_rate": 4.246976914620741e-05, "loss": 0.4983, "step": 715 }, { "epoch": 0.7078596144340089, "grad_norm": 0.5486661676519939, "learning_rate": 4.2451447416636133e-05, "loss": 0.4953, "step": 716 }, { "epoch": 0.7088482451804251, "grad_norm": 0.70203530633674, "learning_rate": 4.243312568706486e-05, "loss": 0.5183, "step": 717 }, { "epoch": 0.7098368759268413, "grad_norm": 0.6033209579306077, "learning_rate": 4.2414803957493586e-05, "loss": 0.5672, "step": 718 }, { "epoch": 0.7108255066732575, "grad_norm": 0.5711360756317189, "learning_rate": 4.239648222792232e-05, "loss": 0.4974, "step": 719 }, { "epoch": 0.7118141374196737, "grad_norm": 0.7424210509192726, "learning_rate": 4.2378160498351046e-05, "loss": 0.5056, "step": 720 }, { "epoch": 0.71280276816609, "grad_norm": 0.4244295252690348, "learning_rate": 4.235983876877977e-05, "loss": 0.4577, "step": 721 }, { "epoch": 0.7137913989125062, "grad_norm": 0.7968712584380067, "learning_rate": 4.2341517039208506e-05, "loss": 0.5464, "step": 722 }, { "epoch": 0.7147800296589224, "grad_norm": 0.5348622784525965, "learning_rate": 4.232319530963723e-05, "loss": 0.5119, "step": 723 }, { "epoch": 0.7157686604053386, "grad_norm": 0.5761367644286391, "learning_rate": 4.2304873580065965e-05, "loss": 0.491, "step": 724 }, { "epoch": 0.7167572911517548, "grad_norm": 0.6770258176306798, "learning_rate": 4.2286551850494685e-05, "loss": 0.4801, "step": 725 }, { "epoch": 0.717745921898171, "grad_norm": 0.5734888742704597, "learning_rate": 4.226823012092342e-05, "loss": 0.4844, "step": 726 }, { "epoch": 0.7187345526445873, "grad_norm": 0.6380538129719566, "learning_rate": 4.2249908391352144e-05, "loss": 0.493, "step": 727 }, { "epoch": 0.7197231833910035, "grad_norm": 0.5165154576462443, "learning_rate": 4.223158666178088e-05, "loss": 0.5045, "step": 728 }, { "epoch": 0.7207118141374197, "grad_norm": 0.6319656637799026, "learning_rate": 4.2213264932209604e-05, "loss": 0.4613, "step": 729 }, { "epoch": 0.7217004448838359, "grad_norm": 0.48300661066440276, "learning_rate": 4.219494320263833e-05, "loss": 0.5154, "step": 730 }, { "epoch": 0.7226890756302521, "grad_norm": 0.48132738073953424, "learning_rate": 4.217662147306706e-05, "loss": 0.5471, "step": 731 }, { "epoch": 0.7236777063766683, "grad_norm": 0.5460823538420072, "learning_rate": 4.215829974349579e-05, "loss": 0.5051, "step": 732 }, { "epoch": 0.7246663371230845, "grad_norm": 0.5400595004945816, "learning_rate": 4.2139978013924516e-05, "loss": 0.4673, "step": 733 }, { "epoch": 0.7256549678695008, "grad_norm": 0.5115334211946606, "learning_rate": 4.212165628435324e-05, "loss": 0.5368, "step": 734 }, { "epoch": 0.726643598615917, "grad_norm": 0.6100526864601407, "learning_rate": 4.2103334554781976e-05, "loss": 0.5208, "step": 735 }, { "epoch": 0.7276322293623332, "grad_norm": 0.5650086142058912, "learning_rate": 4.20850128252107e-05, "loss": 0.5016, "step": 736 }, { "epoch": 0.7286208601087494, "grad_norm": 0.49786229767661394, "learning_rate": 4.206669109563943e-05, "loss": 0.4757, "step": 737 }, { "epoch": 0.7296094908551656, "grad_norm": 0.5991380807727567, "learning_rate": 4.2048369366068155e-05, "loss": 0.4878, "step": 738 }, { "epoch": 0.7305981216015818, "grad_norm": 0.5459170309250957, "learning_rate": 4.203004763649689e-05, "loss": 0.4957, "step": 739 }, { "epoch": 0.731586752347998, "grad_norm": 0.6876826009835283, "learning_rate": 4.2011725906925615e-05, "loss": 0.4701, "step": 740 }, { "epoch": 0.7325753830944143, "grad_norm": 0.506291577810582, "learning_rate": 4.199340417735435e-05, "loss": 0.512, "step": 741 }, { "epoch": 0.7335640138408305, "grad_norm": 0.542921274032451, "learning_rate": 4.197508244778307e-05, "loss": 0.4706, "step": 742 }, { "epoch": 0.7345526445872467, "grad_norm": 0.4019511843907105, "learning_rate": 4.19567607182118e-05, "loss": 0.515, "step": 743 }, { "epoch": 0.7355412753336629, "grad_norm": 0.5981119154146153, "learning_rate": 4.193843898864053e-05, "loss": 0.5464, "step": 744 }, { "epoch": 0.7365299060800791, "grad_norm": 0.5586664897448791, "learning_rate": 4.192011725906926e-05, "loss": 0.5829, "step": 745 }, { "epoch": 0.7375185368264953, "grad_norm": 0.4449824107705569, "learning_rate": 4.190179552949799e-05, "loss": 0.4668, "step": 746 }, { "epoch": 0.7385071675729116, "grad_norm": 0.9376661499520814, "learning_rate": 4.1883473799926713e-05, "loss": 0.5056, "step": 747 }, { "epoch": 0.7394957983193278, "grad_norm": 0.4628525149851496, "learning_rate": 4.1865152070355447e-05, "loss": 0.5189, "step": 748 }, { "epoch": 0.740484429065744, "grad_norm": 0.4819560360324785, "learning_rate": 4.184683034078417e-05, "loss": 0.4744, "step": 749 }, { "epoch": 0.7414730598121602, "grad_norm": 0.4471500587801672, "learning_rate": 4.18285086112129e-05, "loss": 0.5418, "step": 750 }, { "epoch": 0.7424616905585764, "grad_norm": 0.5114074773411583, "learning_rate": 4.1810186881641626e-05, "loss": 0.5191, "step": 751 }, { "epoch": 0.7434503213049926, "grad_norm": 0.5318534442554298, "learning_rate": 4.179186515207036e-05, "loss": 0.4781, "step": 752 }, { "epoch": 0.7444389520514088, "grad_norm": 1.4660383792082003, "learning_rate": 4.1773543422499085e-05, "loss": 0.5485, "step": 753 }, { "epoch": 0.7454275827978251, "grad_norm": 0.7525533254242762, "learning_rate": 4.175522169292781e-05, "loss": 0.5097, "step": 754 }, { "epoch": 0.7464162135442413, "grad_norm": 0.4422012540261371, "learning_rate": 4.173689996335654e-05, "loss": 0.5251, "step": 755 }, { "epoch": 0.7474048442906575, "grad_norm": 0.557081195949443, "learning_rate": 4.171857823378527e-05, "loss": 0.4895, "step": 756 }, { "epoch": 0.7483934750370737, "grad_norm": 0.5485652337278741, "learning_rate": 4.1700256504214005e-05, "loss": 0.5001, "step": 757 }, { "epoch": 0.7493821057834898, "grad_norm": 0.6280140501666162, "learning_rate": 4.168193477464273e-05, "loss": 0.5445, "step": 758 }, { "epoch": 0.750370736529906, "grad_norm": 0.6075936978039935, "learning_rate": 4.166361304507146e-05, "loss": 0.5566, "step": 759 }, { "epoch": 0.7513593672763222, "grad_norm": 0.539112758403566, "learning_rate": 4.1645291315500184e-05, "loss": 0.5237, "step": 760 }, { "epoch": 0.7523479980227386, "grad_norm": 0.5026988284619105, "learning_rate": 4.162696958592892e-05, "loss": 0.5132, "step": 761 }, { "epoch": 0.7533366287691547, "grad_norm": 0.5925892433004932, "learning_rate": 4.1608647856357644e-05, "loss": 0.4999, "step": 762 }, { "epoch": 0.754325259515571, "grad_norm": 0.5242418866262498, "learning_rate": 4.159032612678637e-05, "loss": 0.5841, "step": 763 }, { "epoch": 0.7553138902619871, "grad_norm": 0.6161581639561535, "learning_rate": 4.1572004397215096e-05, "loss": 0.5062, "step": 764 }, { "epoch": 0.7563025210084033, "grad_norm": 0.6153915002755543, "learning_rate": 4.155368266764383e-05, "loss": 0.5537, "step": 765 }, { "epoch": 0.7572911517548195, "grad_norm": 0.4979894215149821, "learning_rate": 4.1535360938072556e-05, "loss": 0.5315, "step": 766 }, { "epoch": 0.7582797825012358, "grad_norm": 1.9169320568701518, "learning_rate": 4.151703920850128e-05, "loss": 0.5245, "step": 767 }, { "epoch": 0.759268413247652, "grad_norm": 6.658128213043368, "learning_rate": 4.149871747893001e-05, "loss": 0.572, "step": 768 }, { "epoch": 0.7602570439940682, "grad_norm": 0.9429330921723152, "learning_rate": 4.148039574935874e-05, "loss": 0.5689, "step": 769 }, { "epoch": 0.7612456747404844, "grad_norm": 0.5456304664411259, "learning_rate": 4.1462074019787475e-05, "loss": 0.4567, "step": 770 }, { "epoch": 0.7622343054869006, "grad_norm": 0.9468180363040658, "learning_rate": 4.1443752290216195e-05, "loss": 0.5098, "step": 771 }, { "epoch": 0.7632229362333168, "grad_norm": 0.4381567263561604, "learning_rate": 4.142543056064493e-05, "loss": 0.4691, "step": 772 }, { "epoch": 0.764211566979733, "grad_norm": 0.8475094401168158, "learning_rate": 4.1407108831073654e-05, "loss": 0.4964, "step": 773 }, { "epoch": 0.7652001977261493, "grad_norm": 0.6116905587289241, "learning_rate": 4.138878710150239e-05, "loss": 0.5096, "step": 774 }, { "epoch": 0.7661888284725655, "grad_norm": 0.6585820467314685, "learning_rate": 4.1370465371931114e-05, "loss": 0.4502, "step": 775 }, { "epoch": 0.7671774592189817, "grad_norm": 0.5724545188633945, "learning_rate": 4.135214364235984e-05, "loss": 0.4888, "step": 776 }, { "epoch": 0.7681660899653979, "grad_norm": 1.1580759675915724, "learning_rate": 4.133382191278857e-05, "loss": 0.542, "step": 777 }, { "epoch": 0.7691547207118141, "grad_norm": 0.53763530252112, "learning_rate": 4.13155001832173e-05, "loss": 0.4843, "step": 778 }, { "epoch": 0.7701433514582303, "grad_norm": 0.6308092383865201, "learning_rate": 4.1297178453646027e-05, "loss": 0.5277, "step": 779 }, { "epoch": 0.7711319822046465, "grad_norm": 0.5766836613511642, "learning_rate": 4.127885672407475e-05, "loss": 0.554, "step": 780 }, { "epoch": 0.7721206129510628, "grad_norm": 0.6511075387816251, "learning_rate": 4.1260534994503486e-05, "loss": 0.4902, "step": 781 }, { "epoch": 0.773109243697479, "grad_norm": 0.5066655346841276, "learning_rate": 4.124221326493221e-05, "loss": 0.491, "step": 782 }, { "epoch": 0.7740978744438952, "grad_norm": 0.5744239167093317, "learning_rate": 4.122389153536094e-05, "loss": 0.4821, "step": 783 }, { "epoch": 0.7750865051903114, "grad_norm": 0.498462219287242, "learning_rate": 4.1205569805789665e-05, "loss": 0.5279, "step": 784 }, { "epoch": 0.7760751359367276, "grad_norm": 0.5862879122585035, "learning_rate": 4.11872480762184e-05, "loss": 0.5133, "step": 785 }, { "epoch": 0.7770637666831438, "grad_norm": 0.4770133031399091, "learning_rate": 4.1168926346647125e-05, "loss": 0.5415, "step": 786 }, { "epoch": 0.77805239742956, "grad_norm": 0.618444037581815, "learning_rate": 4.115060461707586e-05, "loss": 0.4798, "step": 787 }, { "epoch": 0.7790410281759763, "grad_norm": 0.5064972595382464, "learning_rate": 4.113228288750458e-05, "loss": 0.4957, "step": 788 }, { "epoch": 0.7800296589223925, "grad_norm": 0.5037168691389556, "learning_rate": 4.111396115793331e-05, "loss": 0.4877, "step": 789 }, { "epoch": 0.7810182896688087, "grad_norm": 0.45574774249271016, "learning_rate": 4.109563942836204e-05, "loss": 0.5516, "step": 790 }, { "epoch": 0.7820069204152249, "grad_norm": 0.4531184469912075, "learning_rate": 4.107731769879077e-05, "loss": 0.5194, "step": 791 }, { "epoch": 0.7829955511616411, "grad_norm": 0.44963311683670065, "learning_rate": 4.105899596921949e-05, "loss": 0.5126, "step": 792 }, { "epoch": 0.7839841819080573, "grad_norm": 0.8571489750096418, "learning_rate": 4.1040674239648223e-05, "loss": 0.4921, "step": 793 }, { "epoch": 0.7849728126544736, "grad_norm": 0.5508154806060637, "learning_rate": 4.102235251007696e-05, "loss": 0.5401, "step": 794 }, { "epoch": 0.7859614434008898, "grad_norm": 0.4700104372855393, "learning_rate": 4.100403078050568e-05, "loss": 0.5168, "step": 795 }, { "epoch": 0.786950074147306, "grad_norm": 0.5976912629127711, "learning_rate": 4.098570905093441e-05, "loss": 0.4971, "step": 796 }, { "epoch": 0.7879387048937222, "grad_norm": 0.4713020972246892, "learning_rate": 4.0967387321363136e-05, "loss": 0.4792, "step": 797 }, { "epoch": 0.7889273356401384, "grad_norm": 0.5168180520110578, "learning_rate": 4.094906559179187e-05, "loss": 0.4665, "step": 798 }, { "epoch": 0.7899159663865546, "grad_norm": 0.4461318555834143, "learning_rate": 4.0930743862220596e-05, "loss": 0.4733, "step": 799 }, { "epoch": 0.7909045971329708, "grad_norm": 0.5687505157914393, "learning_rate": 4.091242213264932e-05, "loss": 0.538, "step": 800 }, { "epoch": 0.7918932278793871, "grad_norm": 0.5218657521755182, "learning_rate": 4.089410040307805e-05, "loss": 0.5512, "step": 801 }, { "epoch": 0.7928818586258033, "grad_norm": 0.5010341479190303, "learning_rate": 4.087577867350678e-05, "loss": 0.4688, "step": 802 }, { "epoch": 0.7938704893722195, "grad_norm": 0.5840580434430013, "learning_rate": 4.085745694393551e-05, "loss": 0.4944, "step": 803 }, { "epoch": 0.7948591201186357, "grad_norm": 0.4743055480485977, "learning_rate": 4.083913521436424e-05, "loss": 0.5524, "step": 804 }, { "epoch": 0.7958477508650519, "grad_norm": 0.48051795979640577, "learning_rate": 4.082081348479297e-05, "loss": 0.4819, "step": 805 }, { "epoch": 0.7968363816114681, "grad_norm": 0.4665283161802883, "learning_rate": 4.0802491755221694e-05, "loss": 0.464, "step": 806 }, { "epoch": 0.7978250123578843, "grad_norm": 0.53799276032993, "learning_rate": 4.078417002565043e-05, "loss": 0.4996, "step": 807 }, { "epoch": 0.7988136431043006, "grad_norm": 0.46724376330824136, "learning_rate": 4.0765848296079154e-05, "loss": 0.5136, "step": 808 }, { "epoch": 0.7998022738507168, "grad_norm": 0.6158635888655473, "learning_rate": 4.074752656650788e-05, "loss": 0.5074, "step": 809 }, { "epoch": 0.800790904597133, "grad_norm": 0.6145872789795157, "learning_rate": 4.0729204836936606e-05, "loss": 0.4764, "step": 810 }, { "epoch": 0.8017795353435492, "grad_norm": 0.6240710052445666, "learning_rate": 4.071088310736534e-05, "loss": 0.5471, "step": 811 }, { "epoch": 0.8027681660899654, "grad_norm": 0.688393278778474, "learning_rate": 4.0692561377794066e-05, "loss": 0.5514, "step": 812 }, { "epoch": 0.8037567968363816, "grad_norm": 0.5787162901759351, "learning_rate": 4.067423964822279e-05, "loss": 0.4752, "step": 813 }, { "epoch": 0.8047454275827979, "grad_norm": 0.5817989836906865, "learning_rate": 4.065591791865152e-05, "loss": 0.5215, "step": 814 }, { "epoch": 0.8057340583292141, "grad_norm": 0.6140370023158416, "learning_rate": 4.063759618908025e-05, "loss": 0.4963, "step": 815 }, { "epoch": 0.8067226890756303, "grad_norm": 0.6447215019129773, "learning_rate": 4.061927445950898e-05, "loss": 0.4935, "step": 816 }, { "epoch": 0.8077113198220465, "grad_norm": 0.5094256290053805, "learning_rate": 4.0600952729937705e-05, "loss": 0.4939, "step": 817 }, { "epoch": 0.8086999505684627, "grad_norm": 0.6612886264041997, "learning_rate": 4.058263100036644e-05, "loss": 0.5022, "step": 818 }, { "epoch": 0.8096885813148789, "grad_norm": 0.5803600238042551, "learning_rate": 4.0564309270795165e-05, "loss": 0.4502, "step": 819 }, { "epoch": 0.8106772120612951, "grad_norm": 0.4828902692341567, "learning_rate": 4.05459875412239e-05, "loss": 0.4856, "step": 820 }, { "epoch": 0.8116658428077114, "grad_norm": 0.5481096041815683, "learning_rate": 4.0527665811652624e-05, "loss": 0.5037, "step": 821 }, { "epoch": 0.8126544735541276, "grad_norm": 0.5521601562583286, "learning_rate": 4.050934408208135e-05, "loss": 0.5243, "step": 822 }, { "epoch": 0.8136431043005438, "grad_norm": 0.4010624505991008, "learning_rate": 4.049102235251008e-05, "loss": 0.5151, "step": 823 }, { "epoch": 0.81463173504696, "grad_norm": 0.5597607966455216, "learning_rate": 4.047270062293881e-05, "loss": 0.4758, "step": 824 }, { "epoch": 0.8156203657933762, "grad_norm": 0.4946946923197819, "learning_rate": 4.045437889336754e-05, "loss": 0.5274, "step": 825 }, { "epoch": 0.8166089965397924, "grad_norm": 0.46327895967672256, "learning_rate": 4.043605716379626e-05, "loss": 0.478, "step": 826 }, { "epoch": 0.8175976272862086, "grad_norm": 0.999567969257603, "learning_rate": 4.041773543422499e-05, "loss": 0.5598, "step": 827 }, { "epoch": 0.8185862580326249, "grad_norm": 4.778016351396734, "learning_rate": 4.039941370465372e-05, "loss": 0.5991, "step": 828 }, { "epoch": 0.819574888779041, "grad_norm": 0.7459528721453509, "learning_rate": 4.038109197508245e-05, "loss": 0.5234, "step": 829 }, { "epoch": 0.8205635195254573, "grad_norm": 0.5617172011306228, "learning_rate": 4.0362770245511175e-05, "loss": 0.5145, "step": 830 }, { "epoch": 0.8215521502718734, "grad_norm": 0.834501118256013, "learning_rate": 4.034444851593991e-05, "loss": 0.5442, "step": 831 }, { "epoch": 0.8225407810182896, "grad_norm": 0.6173466176531238, "learning_rate": 4.0326126786368635e-05, "loss": 0.5526, "step": 832 }, { "epoch": 0.8235294117647058, "grad_norm": 0.6388628059786791, "learning_rate": 4.030780505679737e-05, "loss": 0.5049, "step": 833 }, { "epoch": 0.8245180425111222, "grad_norm": 0.7270226357086785, "learning_rate": 4.028948332722609e-05, "loss": 0.5314, "step": 834 }, { "epoch": 0.8255066732575383, "grad_norm": 0.617262750801938, "learning_rate": 4.027116159765482e-05, "loss": 0.4564, "step": 835 }, { "epoch": 0.8264953040039545, "grad_norm": 0.7886545447759482, "learning_rate": 4.025283986808355e-05, "loss": 0.5341, "step": 836 }, { "epoch": 0.8274839347503707, "grad_norm": 1.292714412975818, "learning_rate": 4.023451813851228e-05, "loss": 0.4904, "step": 837 }, { "epoch": 0.8284725654967869, "grad_norm": 0.8968021659196027, "learning_rate": 4.0216196408941e-05, "loss": 0.5022, "step": 838 }, { "epoch": 0.8294611962432031, "grad_norm": 0.5014605751162985, "learning_rate": 4.0197874679369734e-05, "loss": 0.5331, "step": 839 }, { "epoch": 0.8304498269896193, "grad_norm": 0.7637173827566388, "learning_rate": 4.017955294979846e-05, "loss": 0.5599, "step": 840 }, { "epoch": 0.8314384577360356, "grad_norm": 0.5981153407129368, "learning_rate": 4.016123122022719e-05, "loss": 0.5231, "step": 841 }, { "epoch": 0.8324270884824518, "grad_norm": 0.609919059129479, "learning_rate": 4.014290949065592e-05, "loss": 0.4893, "step": 842 }, { "epoch": 0.833415719228868, "grad_norm": 0.6487534557793767, "learning_rate": 4.0124587761084646e-05, "loss": 0.4899, "step": 843 }, { "epoch": 0.8344043499752842, "grad_norm": 0.5900194643572374, "learning_rate": 4.010626603151338e-05, "loss": 0.4993, "step": 844 }, { "epoch": 0.8353929807217004, "grad_norm": 0.6346737651652127, "learning_rate": 4.0087944301942106e-05, "loss": 0.4832, "step": 845 }, { "epoch": 0.8363816114681166, "grad_norm": 0.5511838188325354, "learning_rate": 4.006962257237083e-05, "loss": 0.4934, "step": 846 }, { "epoch": 0.8373702422145328, "grad_norm": 0.6723071856311088, "learning_rate": 4.005130084279956e-05, "loss": 0.5009, "step": 847 }, { "epoch": 0.8383588729609491, "grad_norm": 0.5479387543883424, "learning_rate": 4.003297911322829e-05, "loss": 0.5054, "step": 848 }, { "epoch": 0.8393475037073653, "grad_norm": 0.5716610507020627, "learning_rate": 4.001465738365702e-05, "loss": 0.5252, "step": 849 }, { "epoch": 0.8403361344537815, "grad_norm": 0.4471584752934043, "learning_rate": 3.999633565408575e-05, "loss": 0.455, "step": 850 }, { "epoch": 0.8413247652001977, "grad_norm": 0.7785821000889109, "learning_rate": 3.997801392451447e-05, "loss": 0.5471, "step": 851 }, { "epoch": 0.8423133959466139, "grad_norm": 0.4856333733748894, "learning_rate": 3.9959692194943204e-05, "loss": 0.4718, "step": 852 }, { "epoch": 0.8433020266930301, "grad_norm": 0.53794979474513, "learning_rate": 3.994137046537194e-05, "loss": 0.5023, "step": 853 }, { "epoch": 0.8442906574394463, "grad_norm": 0.5643431673593808, "learning_rate": 3.9923048735800664e-05, "loss": 0.47, "step": 854 }, { "epoch": 0.8452792881858626, "grad_norm": 0.4309721054618446, "learning_rate": 3.990472700622939e-05, "loss": 0.5069, "step": 855 }, { "epoch": 0.8462679189322788, "grad_norm": 0.5191277768328939, "learning_rate": 3.9886405276658117e-05, "loss": 0.535, "step": 856 }, { "epoch": 0.847256549678695, "grad_norm": 0.5469906290485024, "learning_rate": 3.986808354708685e-05, "loss": 0.4989, "step": 857 }, { "epoch": 0.8482451804251112, "grad_norm": 0.5086073359190132, "learning_rate": 3.9849761817515576e-05, "loss": 0.4937, "step": 858 }, { "epoch": 0.8492338111715274, "grad_norm": 2.821180385091994, "learning_rate": 3.98314400879443e-05, "loss": 0.5118, "step": 859 }, { "epoch": 0.8502224419179436, "grad_norm": 4.849004797748899, "learning_rate": 3.981311835837303e-05, "loss": 0.5974, "step": 860 }, { "epoch": 0.8512110726643599, "grad_norm": 0.8665644468970722, "learning_rate": 3.979479662880176e-05, "loss": 0.497, "step": 861 }, { "epoch": 0.8521997034107761, "grad_norm": 1.6619363020813245, "learning_rate": 3.977647489923049e-05, "loss": 0.5374, "step": 862 }, { "epoch": 0.8531883341571923, "grad_norm": 0.7919501598134441, "learning_rate": 3.9758153169659215e-05, "loss": 0.4692, "step": 863 }, { "epoch": 0.8541769649036085, "grad_norm": 0.9380004713383713, "learning_rate": 3.973983144008794e-05, "loss": 0.5452, "step": 864 }, { "epoch": 0.8551655956500247, "grad_norm": 0.6129594972063509, "learning_rate": 3.9721509710516675e-05, "loss": 0.5227, "step": 865 }, { "epoch": 0.8561542263964409, "grad_norm": 0.8483625435848314, "learning_rate": 3.970318798094541e-05, "loss": 0.4729, "step": 866 }, { "epoch": 0.8571428571428571, "grad_norm": 0.6783410410509974, "learning_rate": 3.9684866251374134e-05, "loss": 0.4839, "step": 867 }, { "epoch": 0.8581314878892734, "grad_norm": 1.823632758075281, "learning_rate": 3.966654452180286e-05, "loss": 0.5121, "step": 868 }, { "epoch": 0.8591201186356896, "grad_norm": 0.8186988183426722, "learning_rate": 3.964822279223159e-05, "loss": 0.5425, "step": 869 }, { "epoch": 0.8601087493821058, "grad_norm": 11.053376056966838, "learning_rate": 3.962990106266032e-05, "loss": 0.5776, "step": 870 }, { "epoch": 0.861097380128522, "grad_norm": 1.4490227378224407, "learning_rate": 3.961157933308905e-05, "loss": 0.5103, "step": 871 }, { "epoch": 0.8620860108749382, "grad_norm": 0.6561285292175026, "learning_rate": 3.959325760351777e-05, "loss": 0.4975, "step": 872 }, { "epoch": 0.8630746416213544, "grad_norm": 1.15301308813239, "learning_rate": 3.95749358739465e-05, "loss": 0.5044, "step": 873 }, { "epoch": 0.8640632723677706, "grad_norm": 0.6474659772257714, "learning_rate": 3.955661414437523e-05, "loss": 0.5094, "step": 874 }, { "epoch": 0.8650519031141869, "grad_norm": 1.0424038226611325, "learning_rate": 3.953829241480396e-05, "loss": 0.5125, "step": 875 }, { "epoch": 0.8660405338606031, "grad_norm": 0.5010083736051416, "learning_rate": 3.9519970685232686e-05, "loss": 0.5012, "step": 876 }, { "epoch": 0.8670291646070193, "grad_norm": 0.8917671354450384, "learning_rate": 3.950164895566142e-05, "loss": 0.4903, "step": 877 }, { "epoch": 0.8680177953534355, "grad_norm": 0.49584884609414404, "learning_rate": 3.9483327226090145e-05, "loss": 0.4651, "step": 878 }, { "epoch": 0.8690064260998517, "grad_norm": 0.7803743763028045, "learning_rate": 3.946500549651888e-05, "loss": 0.4615, "step": 879 }, { "epoch": 0.8699950568462679, "grad_norm": 0.47087173937303817, "learning_rate": 3.94466837669476e-05, "loss": 0.5059, "step": 880 }, { "epoch": 0.8709836875926842, "grad_norm": 0.7223810327702713, "learning_rate": 3.942836203737633e-05, "loss": 0.5003, "step": 881 }, { "epoch": 0.8719723183391004, "grad_norm": 0.5480628713067308, "learning_rate": 3.941004030780506e-05, "loss": 0.5095, "step": 882 }, { "epoch": 0.8729609490855166, "grad_norm": 0.7189694993076244, "learning_rate": 3.939171857823379e-05, "loss": 0.5197, "step": 883 }, { "epoch": 0.8739495798319328, "grad_norm": 0.6887490345492323, "learning_rate": 3.937339684866251e-05, "loss": 0.5226, "step": 884 }, { "epoch": 0.874938210578349, "grad_norm": 0.7250003441509966, "learning_rate": 3.9355075119091244e-05, "loss": 0.5185, "step": 885 }, { "epoch": 0.8759268413247652, "grad_norm": 0.7041262878528474, "learning_rate": 3.933675338951997e-05, "loss": 0.5102, "step": 886 }, { "epoch": 0.8769154720711814, "grad_norm": 0.69303468542175, "learning_rate": 3.93184316599487e-05, "loss": 0.5077, "step": 887 }, { "epoch": 0.8779041028175977, "grad_norm": 0.686320650438643, "learning_rate": 3.930010993037743e-05, "loss": 0.5213, "step": 888 }, { "epoch": 0.8788927335640139, "grad_norm": 0.7943728065722955, "learning_rate": 3.9281788200806156e-05, "loss": 0.5203, "step": 889 }, { "epoch": 0.8798813643104301, "grad_norm": 0.5022744867496698, "learning_rate": 3.926346647123489e-05, "loss": 0.523, "step": 890 }, { "epoch": 0.8808699950568463, "grad_norm": 0.7430315212181137, "learning_rate": 3.9245144741663616e-05, "loss": 0.4788, "step": 891 }, { "epoch": 0.8818586258032625, "grad_norm": 0.7309222155599202, "learning_rate": 3.922682301209234e-05, "loss": 0.4885, "step": 892 }, { "epoch": 0.8828472565496787, "grad_norm": 0.5624859987750184, "learning_rate": 3.920850128252107e-05, "loss": 0.5184, "step": 893 }, { "epoch": 0.8838358872960949, "grad_norm": 0.8504452322913478, "learning_rate": 3.91901795529498e-05, "loss": 0.537, "step": 894 }, { "epoch": 0.8848245180425112, "grad_norm": 0.42589801477452915, "learning_rate": 3.917185782337853e-05, "loss": 0.4789, "step": 895 }, { "epoch": 0.8858131487889274, "grad_norm": 0.6358218694712313, "learning_rate": 3.915353609380726e-05, "loss": 0.4822, "step": 896 }, { "epoch": 0.8868017795353436, "grad_norm": 0.5544022709214241, "learning_rate": 3.913521436423598e-05, "loss": 0.5121, "step": 897 }, { "epoch": 0.8877904102817598, "grad_norm": 0.5279110578153374, "learning_rate": 3.9116892634664714e-05, "loss": 0.5441, "step": 898 }, { "epoch": 0.888779041028176, "grad_norm": 0.5467716983991355, "learning_rate": 3.909857090509344e-05, "loss": 0.5209, "step": 899 }, { "epoch": 0.8897676717745922, "grad_norm": 0.5534012238370134, "learning_rate": 3.9080249175522174e-05, "loss": 0.4834, "step": 900 }, { "epoch": 0.8907563025210085, "grad_norm": 0.5828273713347715, "learning_rate": 3.90619274459509e-05, "loss": 0.5022, "step": 901 }, { "epoch": 0.8917449332674247, "grad_norm": 0.6695379326313492, "learning_rate": 3.904360571637963e-05, "loss": 0.4942, "step": 902 }, { "epoch": 0.8927335640138409, "grad_norm": 0.6139048169844047, "learning_rate": 3.902528398680836e-05, "loss": 0.4732, "step": 903 }, { "epoch": 0.893722194760257, "grad_norm": 0.6325378085948202, "learning_rate": 3.9006962257237086e-05, "loss": 0.5322, "step": 904 }, { "epoch": 0.8947108255066732, "grad_norm": 0.5627569355476826, "learning_rate": 3.898864052766581e-05, "loss": 0.5514, "step": 905 }, { "epoch": 0.8956994562530894, "grad_norm": 0.547334139055531, "learning_rate": 3.897031879809454e-05, "loss": 0.4735, "step": 906 }, { "epoch": 0.8966880869995056, "grad_norm": 0.5273317369652111, "learning_rate": 3.895199706852327e-05, "loss": 0.4899, "step": 907 }, { "epoch": 0.897676717745922, "grad_norm": 0.5515478292335528, "learning_rate": 3.8933675338952e-05, "loss": 0.486, "step": 908 }, { "epoch": 0.8986653484923381, "grad_norm": 0.5153623397931197, "learning_rate": 3.8915353609380725e-05, "loss": 0.5212, "step": 909 }, { "epoch": 0.8996539792387543, "grad_norm": 0.5162715158538681, "learning_rate": 3.889703187980945e-05, "loss": 0.5078, "step": 910 }, { "epoch": 0.9006426099851705, "grad_norm": 0.5536561035567681, "learning_rate": 3.8878710150238185e-05, "loss": 0.5406, "step": 911 }, { "epoch": 0.9016312407315867, "grad_norm": 0.4824353469720794, "learning_rate": 3.886038842066692e-05, "loss": 0.4765, "step": 912 }, { "epoch": 0.9026198714780029, "grad_norm": 0.5537740261288194, "learning_rate": 3.8842066691095644e-05, "loss": 0.4755, "step": 913 }, { "epoch": 0.9036085022244191, "grad_norm": 0.5481616928501789, "learning_rate": 3.882374496152437e-05, "loss": 0.5004, "step": 914 }, { "epoch": 0.9045971329708354, "grad_norm": 0.612065169369696, "learning_rate": 3.88054232319531e-05, "loss": 0.4489, "step": 915 }, { "epoch": 0.9055857637172516, "grad_norm": 0.4298423809646432, "learning_rate": 3.878710150238183e-05, "loss": 0.4681, "step": 916 }, { "epoch": 0.9065743944636678, "grad_norm": 0.5512959804650045, "learning_rate": 3.876877977281056e-05, "loss": 0.5274, "step": 917 }, { "epoch": 0.907563025210084, "grad_norm": 0.5897212646019293, "learning_rate": 3.875045804323928e-05, "loss": 0.504, "step": 918 }, { "epoch": 0.9085516559565002, "grad_norm": 0.47157716831564717, "learning_rate": 3.873213631366801e-05, "loss": 0.488, "step": 919 }, { "epoch": 0.9095402867029164, "grad_norm": 0.5233349848390669, "learning_rate": 3.871381458409674e-05, "loss": 0.448, "step": 920 }, { "epoch": 0.9105289174493326, "grad_norm": 0.5487715443395557, "learning_rate": 3.869549285452547e-05, "loss": 0.5281, "step": 921 }, { "epoch": 0.9115175481957489, "grad_norm": 0.49622667064733267, "learning_rate": 3.8677171124954196e-05, "loss": 0.4864, "step": 922 }, { "epoch": 0.9125061789421651, "grad_norm": 0.5740741185952918, "learning_rate": 3.865884939538292e-05, "loss": 0.5248, "step": 923 }, { "epoch": 0.9134948096885813, "grad_norm": 0.653374518881781, "learning_rate": 3.8640527665811655e-05, "loss": 0.4869, "step": 924 }, { "epoch": 0.9144834404349975, "grad_norm": 0.5791234658946921, "learning_rate": 3.862220593624039e-05, "loss": 0.4946, "step": 925 }, { "epoch": 0.9154720711814137, "grad_norm": 0.609502069173124, "learning_rate": 3.860388420666911e-05, "loss": 0.5034, "step": 926 }, { "epoch": 0.9164607019278299, "grad_norm": 0.6590093439201093, "learning_rate": 3.858556247709784e-05, "loss": 0.5217, "step": 927 }, { "epoch": 0.9174493326742462, "grad_norm": 0.48775454135544083, "learning_rate": 3.856724074752657e-05, "loss": 0.4394, "step": 928 }, { "epoch": 0.9184379634206624, "grad_norm": 0.5414568069015901, "learning_rate": 3.85489190179553e-05, "loss": 0.4814, "step": 929 }, { "epoch": 0.9194265941670786, "grad_norm": 0.5317303788623311, "learning_rate": 3.853059728838402e-05, "loss": 0.4352, "step": 930 }, { "epoch": 0.9204152249134948, "grad_norm": 1.0224709074649452, "learning_rate": 3.8512275558812754e-05, "loss": 0.4884, "step": 931 }, { "epoch": 0.921403855659911, "grad_norm": 0.5862613046459262, "learning_rate": 3.849395382924148e-05, "loss": 0.4243, "step": 932 }, { "epoch": 0.9223924864063272, "grad_norm": 0.7127937471090146, "learning_rate": 3.847563209967021e-05, "loss": 0.521, "step": 933 }, { "epoch": 0.9233811171527434, "grad_norm": 0.515223434283003, "learning_rate": 3.845731037009894e-05, "loss": 0.4614, "step": 934 }, { "epoch": 0.9243697478991597, "grad_norm": 0.5958211371586949, "learning_rate": 3.8438988640527666e-05, "loss": 0.5211, "step": 935 }, { "epoch": 0.9253583786455759, "grad_norm": 0.618425163266337, "learning_rate": 3.84206669109564e-05, "loss": 0.515, "step": 936 }, { "epoch": 0.9263470093919921, "grad_norm": 0.5711072515022357, "learning_rate": 3.8402345181385126e-05, "loss": 0.4784, "step": 937 }, { "epoch": 0.9273356401384083, "grad_norm": 0.543487472976375, "learning_rate": 3.838402345181385e-05, "loss": 0.474, "step": 938 }, { "epoch": 0.9283242708848245, "grad_norm": 0.6321318885428366, "learning_rate": 3.836570172224258e-05, "loss": 0.5254, "step": 939 }, { "epoch": 0.9293129016312407, "grad_norm": 0.5494964672524449, "learning_rate": 3.834737999267131e-05, "loss": 0.4905, "step": 940 }, { "epoch": 0.9303015323776569, "grad_norm": 0.5456400727287741, "learning_rate": 3.832905826310004e-05, "loss": 0.5427, "step": 941 }, { "epoch": 0.9312901631240732, "grad_norm": 5.6361054156738115, "learning_rate": 3.831073653352877e-05, "loss": 0.5201, "step": 942 }, { "epoch": 0.9322787938704894, "grad_norm": 0.8377402610207537, "learning_rate": 3.829241480395749e-05, "loss": 0.511, "step": 943 }, { "epoch": 0.9332674246169056, "grad_norm": 0.5601168795131766, "learning_rate": 3.8274093074386224e-05, "loss": 0.455, "step": 944 }, { "epoch": 0.9342560553633218, "grad_norm": 0.8070861984026463, "learning_rate": 3.825577134481495e-05, "loss": 0.5128, "step": 945 }, { "epoch": 0.935244686109738, "grad_norm": 0.7530306838360581, "learning_rate": 3.8237449615243684e-05, "loss": 0.5234, "step": 946 }, { "epoch": 0.9362333168561542, "grad_norm": 0.45488250350275, "learning_rate": 3.8219127885672404e-05, "loss": 0.471, "step": 947 }, { "epoch": 0.9372219476025705, "grad_norm": 0.7376442506087357, "learning_rate": 3.820080615610114e-05, "loss": 0.4856, "step": 948 }, { "epoch": 0.9382105783489867, "grad_norm": 0.601090929329931, "learning_rate": 3.818248442652987e-05, "loss": 0.5101, "step": 949 }, { "epoch": 0.9391992090954029, "grad_norm": 0.7186606277376796, "learning_rate": 3.8164162696958596e-05, "loss": 0.5184, "step": 950 }, { "epoch": 0.9401878398418191, "grad_norm": 0.504247077151111, "learning_rate": 3.814584096738732e-05, "loss": 0.4715, "step": 951 }, { "epoch": 0.9411764705882353, "grad_norm": 0.5174350749668243, "learning_rate": 3.812751923781605e-05, "loss": 0.453, "step": 952 }, { "epoch": 0.9421651013346515, "grad_norm": 0.703771273286328, "learning_rate": 3.810919750824478e-05, "loss": 0.4973, "step": 953 }, { "epoch": 0.9431537320810677, "grad_norm": 0.5007184184522951, "learning_rate": 3.809087577867351e-05, "loss": 0.5115, "step": 954 }, { "epoch": 0.944142362827484, "grad_norm": 0.6235225760291775, "learning_rate": 3.8072554049102235e-05, "loss": 0.5104, "step": 955 }, { "epoch": 0.9451309935739002, "grad_norm": 0.5359628307446125, "learning_rate": 3.805423231953096e-05, "loss": 0.4799, "step": 956 }, { "epoch": 0.9461196243203164, "grad_norm": 0.44346748101027506, "learning_rate": 3.8035910589959695e-05, "loss": 0.4962, "step": 957 }, { "epoch": 0.9471082550667326, "grad_norm": 0.6126253946486948, "learning_rate": 3.801758886038842e-05, "loss": 0.5211, "step": 958 }, { "epoch": 0.9480968858131488, "grad_norm": 0.4531518501861476, "learning_rate": 3.7999267130817154e-05, "loss": 0.5253, "step": 959 }, { "epoch": 0.949085516559565, "grad_norm": 0.5242696895952674, "learning_rate": 3.798094540124588e-05, "loss": 0.5011, "step": 960 }, { "epoch": 0.9500741473059812, "grad_norm": 0.534859489396952, "learning_rate": 3.796262367167461e-05, "loss": 0.5101, "step": 961 }, { "epoch": 0.9510627780523975, "grad_norm": 0.3932790722334226, "learning_rate": 3.794430194210334e-05, "loss": 0.5038, "step": 962 }, { "epoch": 0.9520514087988137, "grad_norm": 0.541173221680243, "learning_rate": 3.792598021253207e-05, "loss": 0.4456, "step": 963 }, { "epoch": 0.9530400395452299, "grad_norm": 0.4283593518533953, "learning_rate": 3.790765848296079e-05, "loss": 0.5148, "step": 964 }, { "epoch": 0.9540286702916461, "grad_norm": 0.4894292700732316, "learning_rate": 3.788933675338952e-05, "loss": 0.5076, "step": 965 }, { "epoch": 0.9550173010380623, "grad_norm": 0.5809797308015682, "learning_rate": 3.787101502381825e-05, "loss": 0.5097, "step": 966 }, { "epoch": 0.9560059317844785, "grad_norm": 1.7686832196275746, "learning_rate": 3.785269329424698e-05, "loss": 0.4368, "step": 967 }, { "epoch": 0.9569945625308948, "grad_norm": 0.8188935124093534, "learning_rate": 3.7834371564675706e-05, "loss": 0.4856, "step": 968 }, { "epoch": 0.957983193277311, "grad_norm": 0.6237046182922688, "learning_rate": 3.781604983510443e-05, "loss": 0.5437, "step": 969 }, { "epoch": 0.9589718240237272, "grad_norm": 0.7227293528398483, "learning_rate": 3.7797728105533165e-05, "loss": 0.4834, "step": 970 }, { "epoch": 0.9599604547701434, "grad_norm": 0.6685898268000396, "learning_rate": 3.777940637596189e-05, "loss": 0.4777, "step": 971 }, { "epoch": 0.9609490855165596, "grad_norm": 0.6116281599289481, "learning_rate": 3.776108464639062e-05, "loss": 0.4805, "step": 972 }, { "epoch": 0.9619377162629758, "grad_norm": 0.7063173566109168, "learning_rate": 3.774276291681935e-05, "loss": 0.5856, "step": 973 }, { "epoch": 0.962926347009392, "grad_norm": 0.587248882287278, "learning_rate": 3.772444118724808e-05, "loss": 0.4968, "step": 974 }, { "epoch": 0.9639149777558083, "grad_norm": 0.6068673116003089, "learning_rate": 3.770611945767681e-05, "loss": 0.5248, "step": 975 }, { "epoch": 0.9649036085022245, "grad_norm": 0.7576096357683711, "learning_rate": 3.768779772810553e-05, "loss": 0.4911, "step": 976 }, { "epoch": 0.9658922392486406, "grad_norm": 0.7285592789597706, "learning_rate": 3.7669475998534264e-05, "loss": 0.5082, "step": 977 }, { "epoch": 0.9668808699950568, "grad_norm": 0.6735343724182004, "learning_rate": 3.765115426896299e-05, "loss": 0.5158, "step": 978 }, { "epoch": 0.967869500741473, "grad_norm": 0.695681195078194, "learning_rate": 3.7632832539391723e-05, "loss": 0.5005, "step": 979 }, { "epoch": 0.9688581314878892, "grad_norm": 0.5127425892997801, "learning_rate": 3.761451080982045e-05, "loss": 0.4979, "step": 980 }, { "epoch": 0.9698467622343054, "grad_norm": 0.7012076481656422, "learning_rate": 3.7596189080249176e-05, "loss": 0.508, "step": 981 }, { "epoch": 0.9708353929807217, "grad_norm": 0.49834383958260375, "learning_rate": 3.75778673506779e-05, "loss": 0.4844, "step": 982 }, { "epoch": 0.9718240237271379, "grad_norm": 0.6722140778617647, "learning_rate": 3.7559545621106636e-05, "loss": 0.4844, "step": 983 }, { "epoch": 0.9728126544735541, "grad_norm": 0.5153644615672031, "learning_rate": 3.754122389153536e-05, "loss": 0.5768, "step": 984 }, { "epoch": 0.9738012852199703, "grad_norm": 0.6144340512262525, "learning_rate": 3.752290216196409e-05, "loss": 0.5293, "step": 985 }, { "epoch": 0.9747899159663865, "grad_norm": 0.49519946210962484, "learning_rate": 3.750458043239282e-05, "loss": 0.4758, "step": 986 }, { "epoch": 0.9757785467128027, "grad_norm": 0.6307564268070287, "learning_rate": 3.748625870282155e-05, "loss": 0.4846, "step": 987 }, { "epoch": 0.9767671774592189, "grad_norm": 0.6329343237954675, "learning_rate": 3.746793697325028e-05, "loss": 0.4938, "step": 988 }, { "epoch": 0.9777558082056352, "grad_norm": 0.4815452206203413, "learning_rate": 3.7449615243679e-05, "loss": 0.5055, "step": 989 }, { "epoch": 0.9787444389520514, "grad_norm": 0.5250625619328858, "learning_rate": 3.7431293514107734e-05, "loss": 0.4718, "step": 990 }, { "epoch": 0.9797330696984676, "grad_norm": 0.6130287772488747, "learning_rate": 3.741297178453646e-05, "loss": 0.5108, "step": 991 }, { "epoch": 0.9807217004448838, "grad_norm": 0.5603014772060368, "learning_rate": 3.7394650054965194e-05, "loss": 0.5128, "step": 992 }, { "epoch": 0.9817103311913, "grad_norm": 0.44559158276708183, "learning_rate": 3.7376328325393914e-05, "loss": 0.5608, "step": 993 }, { "epoch": 0.9826989619377162, "grad_norm": 0.4895370973212571, "learning_rate": 3.735800659582265e-05, "loss": 0.4625, "step": 994 }, { "epoch": 0.9836875926841325, "grad_norm": 0.4663008732919852, "learning_rate": 3.733968486625137e-05, "loss": 0.4892, "step": 995 }, { "epoch": 0.9846762234305487, "grad_norm": 0.8453781619707287, "learning_rate": 3.7321363136680106e-05, "loss": 0.4378, "step": 996 }, { "epoch": 0.9856648541769649, "grad_norm": 0.5268629937650361, "learning_rate": 3.730304140710883e-05, "loss": 0.5439, "step": 997 }, { "epoch": 0.9866534849233811, "grad_norm": 0.6049735128942173, "learning_rate": 3.728471967753756e-05, "loss": 0.4759, "step": 998 }, { "epoch": 0.9876421156697973, "grad_norm": 0.5091543984803096, "learning_rate": 3.726639794796629e-05, "loss": 0.4767, "step": 999 }, { "epoch": 0.9886307464162135, "grad_norm": 0.5502200745076581, "learning_rate": 3.724807621839502e-05, "loss": 0.4689, "step": 1000 }, { "epoch": 0.9896193771626297, "grad_norm": 0.4986661368441836, "learning_rate": 3.7229754488823745e-05, "loss": 0.5338, "step": 1001 }, { "epoch": 0.990608007909046, "grad_norm": 0.46419435638877266, "learning_rate": 3.721143275925247e-05, "loss": 0.4258, "step": 1002 }, { "epoch": 0.9915966386554622, "grad_norm": 0.4977875807906908, "learning_rate": 3.7193111029681205e-05, "loss": 0.472, "step": 1003 }, { "epoch": 0.9925852694018784, "grad_norm": 0.5608694255553063, "learning_rate": 3.717478930010993e-05, "loss": 0.4537, "step": 1004 }, { "epoch": 0.9935739001482946, "grad_norm": 0.5738045395914029, "learning_rate": 3.7156467570538664e-05, "loss": 0.49, "step": 1005 }, { "epoch": 0.9945625308947108, "grad_norm": 0.5135910613113412, "learning_rate": 3.7138145840967384e-05, "loss": 0.453, "step": 1006 }, { "epoch": 0.995551161641127, "grad_norm": 0.6629176804932837, "learning_rate": 3.711982411139612e-05, "loss": 0.496, "step": 1007 }, { "epoch": 0.9965397923875432, "grad_norm": 0.5956193876641982, "learning_rate": 3.710150238182485e-05, "loss": 0.5376, "step": 1008 }, { "epoch": 0.9975284231339595, "grad_norm": 0.7369363351926873, "learning_rate": 3.708318065225358e-05, "loss": 0.4624, "step": 1009 }, { "epoch": 0.9985170538803757, "grad_norm": 0.6900680735101773, "learning_rate": 3.70648589226823e-05, "loss": 0.5496, "step": 1010 }, { "epoch": 0.9995056846267919, "grad_norm": 0.4881128305199494, "learning_rate": 3.704653719311103e-05, "loss": 0.4633, "step": 1011 }, { "epoch": 1.0, "grad_norm": 0.4881128305199494, "learning_rate": 3.702821546353976e-05, "loss": 0.4589, "step": 1012 }, { "epoch": 1.0009886307464162, "grad_norm": 0.8633781362529162, "learning_rate": 3.700989373396849e-05, "loss": 0.4316, "step": 1013 }, { "epoch": 1.0019772614928324, "grad_norm": 0.541995233782162, "learning_rate": 3.6991572004397216e-05, "loss": 0.4118, "step": 1014 }, { "epoch": 1.0029658922392486, "grad_norm": 0.5816837886913223, "learning_rate": 3.697325027482594e-05, "loss": 0.4098, "step": 1015 }, { "epoch": 1.0039545229856648, "grad_norm": 0.5974413371080508, "learning_rate": 3.6954928545254675e-05, "loss": 0.7168, "step": 1016 }, { "epoch": 1.004943153732081, "grad_norm": 10.36265230923502, "learning_rate": 3.69366068156834e-05, "loss": 0.3769, "step": 1017 }, { "epoch": 1.0059317844784972, "grad_norm": 0.8579662181234357, "learning_rate": 3.691828508611213e-05, "loss": 0.4141, "step": 1018 }, { "epoch": 1.0069204152249136, "grad_norm": 0.531670650474628, "learning_rate": 3.6899963356540855e-05, "loss": 0.3911, "step": 1019 }, { "epoch": 1.0079090459713298, "grad_norm": 0.5990428649067933, "learning_rate": 3.688164162696959e-05, "loss": 0.4011, "step": 1020 }, { "epoch": 1.008897676717746, "grad_norm": 0.8124875667383584, "learning_rate": 3.686331989739832e-05, "loss": 0.4447, "step": 1021 }, { "epoch": 1.0098863074641622, "grad_norm": 0.6746244296152569, "learning_rate": 3.684499816782704e-05, "loss": 0.3803, "step": 1022 }, { "epoch": 1.0108749382105784, "grad_norm": 0.5517043163728408, "learning_rate": 3.6826676438255774e-05, "loss": 0.3974, "step": 1023 }, { "epoch": 1.0118635689569946, "grad_norm": 0.5837793004243023, "learning_rate": 3.68083547086845e-05, "loss": 0.4181, "step": 1024 }, { "epoch": 1.0128521997034108, "grad_norm": 0.5157910908400077, "learning_rate": 3.6790032979113233e-05, "loss": 0.3921, "step": 1025 }, { "epoch": 1.013840830449827, "grad_norm": 0.4297180138275274, "learning_rate": 3.677171124954196e-05, "loss": 0.424, "step": 1026 }, { "epoch": 1.0148294611962432, "grad_norm": 4.485698423526859, "learning_rate": 3.6753389519970686e-05, "loss": 0.5973, "step": 1027 }, { "epoch": 1.0158180919426594, "grad_norm": 0.6307135183301575, "learning_rate": 3.673506779039941e-05, "loss": 0.3902, "step": 1028 }, { "epoch": 1.0168067226890756, "grad_norm": 0.49015710636605997, "learning_rate": 3.6716746060828146e-05, "loss": 0.4422, "step": 1029 }, { "epoch": 1.0177953534354918, "grad_norm": 0.599392996549095, "learning_rate": 3.669842433125687e-05, "loss": 0.3928, "step": 1030 }, { "epoch": 1.018783984181908, "grad_norm": 0.5833092218809854, "learning_rate": 3.66801026016856e-05, "loss": 0.375, "step": 1031 }, { "epoch": 1.0197726149283244, "grad_norm": 0.478076931791154, "learning_rate": 3.666178087211433e-05, "loss": 0.4139, "step": 1032 }, { "epoch": 1.0207612456747406, "grad_norm": 0.6549259893499045, "learning_rate": 3.664345914254306e-05, "loss": 0.4402, "step": 1033 }, { "epoch": 1.0217498764211568, "grad_norm": 0.5823550685466264, "learning_rate": 3.662513741297179e-05, "loss": 0.4282, "step": 1034 }, { "epoch": 1.022738507167573, "grad_norm": 0.5065545819258918, "learning_rate": 3.660681568340051e-05, "loss": 0.418, "step": 1035 }, { "epoch": 1.0237271379139892, "grad_norm": 0.558093046151218, "learning_rate": 3.6588493953829244e-05, "loss": 0.3721, "step": 1036 }, { "epoch": 1.0247157686604054, "grad_norm": 0.4887736155463212, "learning_rate": 3.657017222425797e-05, "loss": 0.4103, "step": 1037 }, { "epoch": 1.0257043994068216, "grad_norm": 3.2886682129723317, "learning_rate": 3.6551850494686704e-05, "loss": 0.4126, "step": 1038 }, { "epoch": 1.0266930301532378, "grad_norm": 0.6166876965490495, "learning_rate": 3.6533528765115424e-05, "loss": 0.4234, "step": 1039 }, { "epoch": 1.027681660899654, "grad_norm": 0.4674873173398825, "learning_rate": 3.651520703554416e-05, "loss": 0.3993, "step": 1040 }, { "epoch": 1.0286702916460702, "grad_norm": 0.6233838200536495, "learning_rate": 3.649688530597288e-05, "loss": 0.3929, "step": 1041 }, { "epoch": 1.0296589223924864, "grad_norm": 0.5933680157677127, "learning_rate": 3.6478563576401616e-05, "loss": 0.4043, "step": 1042 }, { "epoch": 1.0306475531389026, "grad_norm": 0.49655348023560436, "learning_rate": 3.646024184683034e-05, "loss": 0.4265, "step": 1043 }, { "epoch": 1.0316361838853187, "grad_norm": 0.5690507881720489, "learning_rate": 3.644192011725907e-05, "loss": 0.3715, "step": 1044 }, { "epoch": 1.032624814631735, "grad_norm": 0.6444338552002088, "learning_rate": 3.64235983876878e-05, "loss": 0.3901, "step": 1045 }, { "epoch": 1.0336134453781514, "grad_norm": 0.41326677683115876, "learning_rate": 3.640527665811653e-05, "loss": 0.4275, "step": 1046 }, { "epoch": 1.0346020761245676, "grad_norm": 0.5017144279408321, "learning_rate": 3.6386954928545255e-05, "loss": 0.4041, "step": 1047 }, { "epoch": 1.0355907068709838, "grad_norm": 0.5416657819287108, "learning_rate": 3.636863319897398e-05, "loss": 0.4448, "step": 1048 }, { "epoch": 1.0365793376174, "grad_norm": 0.4347095232247909, "learning_rate": 3.6350311469402715e-05, "loss": 0.417, "step": 1049 }, { "epoch": 1.0375679683638162, "grad_norm": 0.5132452457263779, "learning_rate": 3.633198973983144e-05, "loss": 0.4191, "step": 1050 }, { "epoch": 1.0385565991102323, "grad_norm": 0.4935850508539199, "learning_rate": 3.6313668010260175e-05, "loss": 0.3837, "step": 1051 }, { "epoch": 1.0395452298566485, "grad_norm": 0.4401078057982667, "learning_rate": 3.6295346280688894e-05, "loss": 0.4136, "step": 1052 }, { "epoch": 1.0405338606030647, "grad_norm": 0.5010699369163379, "learning_rate": 3.627702455111763e-05, "loss": 0.3672, "step": 1053 }, { "epoch": 1.041522491349481, "grad_norm": 0.8922127282637141, "learning_rate": 3.6258702821546354e-05, "loss": 0.3824, "step": 1054 }, { "epoch": 1.0425111220958971, "grad_norm": 0.5133456899301762, "learning_rate": 3.624038109197509e-05, "loss": 0.4034, "step": 1055 }, { "epoch": 1.0434997528423133, "grad_norm": 0.41734018785865995, "learning_rate": 3.6222059362403813e-05, "loss": 0.4267, "step": 1056 }, { "epoch": 1.0444883835887295, "grad_norm": 0.4490941424660359, "learning_rate": 3.620373763283254e-05, "loss": 0.3941, "step": 1057 }, { "epoch": 1.0454770143351457, "grad_norm": 0.4612035861833097, "learning_rate": 3.618541590326127e-05, "loss": 0.3773, "step": 1058 }, { "epoch": 1.046465645081562, "grad_norm": 0.49863400855632817, "learning_rate": 3.616709417369e-05, "loss": 0.4382, "step": 1059 }, { "epoch": 1.0474542758279783, "grad_norm": 0.4985410036725187, "learning_rate": 3.6148772444118726e-05, "loss": 0.415, "step": 1060 }, { "epoch": 1.0484429065743945, "grad_norm": 0.46070338241077713, "learning_rate": 3.613045071454745e-05, "loss": 0.4182, "step": 1061 }, { "epoch": 1.0494315373208107, "grad_norm": 0.48822872924404737, "learning_rate": 3.6112128984976185e-05, "loss": 0.4243, "step": 1062 }, { "epoch": 1.050420168067227, "grad_norm": 0.4780678734013317, "learning_rate": 3.609380725540491e-05, "loss": 0.4766, "step": 1063 }, { "epoch": 1.0514087988136431, "grad_norm": 0.4810399526570658, "learning_rate": 3.607548552583364e-05, "loss": 0.3953, "step": 1064 }, { "epoch": 1.0523974295600593, "grad_norm": 0.47951274236131447, "learning_rate": 3.6057163796262365e-05, "loss": 0.3716, "step": 1065 }, { "epoch": 1.0533860603064755, "grad_norm": 0.4459274880375414, "learning_rate": 3.60388420666911e-05, "loss": 0.4223, "step": 1066 }, { "epoch": 1.0543746910528917, "grad_norm": 0.4123405397591178, "learning_rate": 3.602052033711983e-05, "loss": 0.4269, "step": 1067 }, { "epoch": 1.055363321799308, "grad_norm": 0.5131345243193345, "learning_rate": 3.600219860754855e-05, "loss": 0.4034, "step": 1068 }, { "epoch": 1.0563519525457241, "grad_norm": 0.5016767616301848, "learning_rate": 3.5983876877977284e-05, "loss": 0.4854, "step": 1069 }, { "epoch": 1.0573405832921403, "grad_norm": 13.360128383584842, "learning_rate": 3.596555514840601e-05, "loss": 0.3862, "step": 1070 }, { "epoch": 1.0583292140385565, "grad_norm": 0.7088120693608745, "learning_rate": 3.5947233418834744e-05, "loss": 0.3887, "step": 1071 }, { "epoch": 1.0593178447849727, "grad_norm": 0.8552050582253385, "learning_rate": 3.592891168926347e-05, "loss": 0.4375, "step": 1072 }, { "epoch": 1.0603064755313891, "grad_norm": 0.5709344364077757, "learning_rate": 3.5910589959692196e-05, "loss": 0.3812, "step": 1073 }, { "epoch": 1.0612951062778053, "grad_norm": 1.5296790150728625, "learning_rate": 3.589226823012092e-05, "loss": 0.4203, "step": 1074 }, { "epoch": 1.0622837370242215, "grad_norm": 0.7496618113699111, "learning_rate": 3.5873946500549656e-05, "loss": 0.3445, "step": 1075 }, { "epoch": 1.0632723677706377, "grad_norm": 0.4671268540813679, "learning_rate": 3.585562477097838e-05, "loss": 0.4061, "step": 1076 }, { "epoch": 1.064260998517054, "grad_norm": 0.7509716957509666, "learning_rate": 3.583730304140711e-05, "loss": 0.4387, "step": 1077 }, { "epoch": 1.06524962926347, "grad_norm": 0.5216976148450676, "learning_rate": 3.5818981311835835e-05, "loss": 0.3954, "step": 1078 }, { "epoch": 1.0662382600098863, "grad_norm": 0.581376132726836, "learning_rate": 3.580065958226457e-05, "loss": 0.3991, "step": 1079 }, { "epoch": 1.0672268907563025, "grad_norm": 0.6328510720868211, "learning_rate": 3.57823378526933e-05, "loss": 0.4486, "step": 1080 }, { "epoch": 1.0682155215027187, "grad_norm": 0.5765359256542131, "learning_rate": 3.576401612312202e-05, "loss": 0.4007, "step": 1081 }, { "epoch": 1.069204152249135, "grad_norm": 0.5551438937291179, "learning_rate": 3.5745694393550754e-05, "loss": 0.3756, "step": 1082 }, { "epoch": 1.070192782995551, "grad_norm": 0.6624374063201511, "learning_rate": 3.572737266397948e-05, "loss": 0.4544, "step": 1083 }, { "epoch": 1.0711814137419673, "grad_norm": 0.6747618888306202, "learning_rate": 3.5709050934408214e-05, "loss": 0.3879, "step": 1084 }, { "epoch": 1.0721700444883835, "grad_norm": 0.5947085011351032, "learning_rate": 3.5690729204836934e-05, "loss": 0.4368, "step": 1085 }, { "epoch": 1.0731586752348, "grad_norm": 1.1026169496026146, "learning_rate": 3.567240747526567e-05, "loss": 0.4281, "step": 1086 }, { "epoch": 1.074147305981216, "grad_norm": 0.6589462642342916, "learning_rate": 3.565408574569439e-05, "loss": 0.4499, "step": 1087 }, { "epoch": 1.0751359367276323, "grad_norm": 0.572303491984068, "learning_rate": 3.5635764016123127e-05, "loss": 0.3752, "step": 1088 }, { "epoch": 1.0761245674740485, "grad_norm": 0.46463008519322124, "learning_rate": 3.561744228655185e-05, "loss": 0.3814, "step": 1089 }, { "epoch": 1.0771131982204647, "grad_norm": 0.45642934229718213, "learning_rate": 3.559912055698058e-05, "loss": 0.3918, "step": 1090 }, { "epoch": 1.078101828966881, "grad_norm": 0.45009649925760514, "learning_rate": 3.558079882740931e-05, "loss": 0.4094, "step": 1091 }, { "epoch": 1.079090459713297, "grad_norm": 0.4600715071788134, "learning_rate": 3.556247709783804e-05, "loss": 0.4079, "step": 1092 }, { "epoch": 1.0800790904597133, "grad_norm": 0.4748923789206349, "learning_rate": 3.5544155368266765e-05, "loss": 0.4006, "step": 1093 }, { "epoch": 1.0810677212061295, "grad_norm": 0.49711354716039824, "learning_rate": 3.552583363869549e-05, "loss": 0.4568, "step": 1094 }, { "epoch": 1.0820563519525457, "grad_norm": 0.4701081548187508, "learning_rate": 3.5507511909124225e-05, "loss": 0.4068, "step": 1095 }, { "epoch": 1.0830449826989619, "grad_norm": 0.504549571795409, "learning_rate": 3.548919017955295e-05, "loss": 0.3784, "step": 1096 }, { "epoch": 1.084033613445378, "grad_norm": 0.514340100358308, "learning_rate": 3.547086844998168e-05, "loss": 0.4118, "step": 1097 }, { "epoch": 1.0850222441917943, "grad_norm": 0.4643121966786987, "learning_rate": 3.5452546720410404e-05, "loss": 0.3828, "step": 1098 }, { "epoch": 1.0860108749382107, "grad_norm": 0.5555263926038552, "learning_rate": 3.543422499083914e-05, "loss": 0.4439, "step": 1099 }, { "epoch": 1.0869995056846269, "grad_norm": 0.4746024284510746, "learning_rate": 3.5415903261267864e-05, "loss": 0.418, "step": 1100 }, { "epoch": 1.087988136431043, "grad_norm": 0.5242948396261634, "learning_rate": 3.53975815316966e-05, "loss": 0.4337, "step": 1101 }, { "epoch": 1.0889767671774593, "grad_norm": 0.5268528572946369, "learning_rate": 3.537925980212532e-05, "loss": 0.4349, "step": 1102 }, { "epoch": 1.0899653979238755, "grad_norm": 0.4472711810360087, "learning_rate": 3.536093807255405e-05, "loss": 0.3982, "step": 1103 }, { "epoch": 1.0909540286702917, "grad_norm": 0.48761767888380425, "learning_rate": 3.534261634298278e-05, "loss": 0.4149, "step": 1104 }, { "epoch": 1.0919426594167079, "grad_norm": 0.8433809795264677, "learning_rate": 3.532429461341151e-05, "loss": 0.3908, "step": 1105 }, { "epoch": 1.092931290163124, "grad_norm": 0.4351886595172912, "learning_rate": 3.5305972883840236e-05, "loss": 0.3765, "step": 1106 }, { "epoch": 1.0939199209095403, "grad_norm": 0.5543024408889665, "learning_rate": 3.528765115426896e-05, "loss": 0.3739, "step": 1107 }, { "epoch": 1.0949085516559565, "grad_norm": 0.4336437720242299, "learning_rate": 3.5269329424697696e-05, "loss": 0.3998, "step": 1108 }, { "epoch": 1.0958971824023727, "grad_norm": 0.47334273829803475, "learning_rate": 3.525100769512642e-05, "loss": 0.3985, "step": 1109 }, { "epoch": 1.0968858131487889, "grad_norm": 0.521356482304534, "learning_rate": 3.523268596555515e-05, "loss": 0.4424, "step": 1110 }, { "epoch": 1.097874443895205, "grad_norm": 0.5309064821128673, "learning_rate": 3.5214364235983875e-05, "loss": 0.4652, "step": 1111 }, { "epoch": 1.0988630746416213, "grad_norm": 0.4006816268284708, "learning_rate": 3.519604250641261e-05, "loss": 0.3731, "step": 1112 }, { "epoch": 1.0998517053880377, "grad_norm": 0.5284272621774074, "learning_rate": 3.5177720776841334e-05, "loss": 0.3621, "step": 1113 }, { "epoch": 1.1008403361344539, "grad_norm": 0.426695200183691, "learning_rate": 3.515939904727006e-05, "loss": 0.4022, "step": 1114 }, { "epoch": 1.10182896688087, "grad_norm": 0.5192808950206832, "learning_rate": 3.5141077317698794e-05, "loss": 0.4387, "step": 1115 }, { "epoch": 1.1028175976272863, "grad_norm": 0.49238711567617927, "learning_rate": 3.512275558812752e-05, "loss": 0.396, "step": 1116 }, { "epoch": 1.1038062283737025, "grad_norm": 0.5107394875020088, "learning_rate": 3.5104433858556254e-05, "loss": 0.3586, "step": 1117 }, { "epoch": 1.1047948591201187, "grad_norm": 0.4270132253829762, "learning_rate": 3.508611212898498e-05, "loss": 0.3948, "step": 1118 }, { "epoch": 1.1057834898665349, "grad_norm": 0.5258065533127412, "learning_rate": 3.5067790399413706e-05, "loss": 0.4535, "step": 1119 }, { "epoch": 1.106772120612951, "grad_norm": 1.1067326873617762, "learning_rate": 3.504946866984243e-05, "loss": 0.433, "step": 1120 }, { "epoch": 1.1077607513593672, "grad_norm": 0.5346849873046929, "learning_rate": 3.5031146940271166e-05, "loss": 0.4613, "step": 1121 }, { "epoch": 1.1087493821057834, "grad_norm": 0.4372297448840709, "learning_rate": 3.501282521069989e-05, "loss": 0.4228, "step": 1122 }, { "epoch": 1.1097380128521996, "grad_norm": 0.5423980972999004, "learning_rate": 3.499450348112862e-05, "loss": 0.4203, "step": 1123 }, { "epoch": 1.1107266435986158, "grad_norm": 0.40963362303959683, "learning_rate": 3.4976181751557345e-05, "loss": 0.4246, "step": 1124 }, { "epoch": 1.111715274345032, "grad_norm": 4.179651803548241, "learning_rate": 3.495786002198608e-05, "loss": 0.4215, "step": 1125 }, { "epoch": 1.1127039050914482, "grad_norm": 0.9540663938544639, "learning_rate": 3.4939538292414805e-05, "loss": 0.4275, "step": 1126 }, { "epoch": 1.1136925358378647, "grad_norm": 0.7297605532751592, "learning_rate": 3.492121656284353e-05, "loss": 0.3936, "step": 1127 }, { "epoch": 1.1146811665842808, "grad_norm": 0.5844739872108731, "learning_rate": 3.4902894833272265e-05, "loss": 0.3804, "step": 1128 }, { "epoch": 1.115669797330697, "grad_norm": 0.4746758759371227, "learning_rate": 3.488457310370099e-05, "loss": 0.4284, "step": 1129 }, { "epoch": 1.1166584280771132, "grad_norm": 0.5439675475541536, "learning_rate": 3.4866251374129724e-05, "loss": 0.3758, "step": 1130 }, { "epoch": 1.1176470588235294, "grad_norm": 0.45098455388911085, "learning_rate": 3.4847929644558444e-05, "loss": 0.3853, "step": 1131 }, { "epoch": 1.1186356895699456, "grad_norm": 0.5369238402689367, "learning_rate": 3.482960791498718e-05, "loss": 0.3841, "step": 1132 }, { "epoch": 1.1196243203163618, "grad_norm": 0.42076263807702574, "learning_rate": 3.4811286185415903e-05, "loss": 0.4332, "step": 1133 }, { "epoch": 1.120612951062778, "grad_norm": 0.5247830734585238, "learning_rate": 3.479296445584464e-05, "loss": 0.3823, "step": 1134 }, { "epoch": 1.1216015818091942, "grad_norm": 0.3981584543877383, "learning_rate": 3.477464272627336e-05, "loss": 0.3842, "step": 1135 }, { "epoch": 1.1225902125556104, "grad_norm": 0.4900534371066377, "learning_rate": 3.475632099670209e-05, "loss": 0.3737, "step": 1136 }, { "epoch": 1.1235788433020266, "grad_norm": 0.4651157203545069, "learning_rate": 3.4737999267130816e-05, "loss": 0.4074, "step": 1137 }, { "epoch": 1.1245674740484428, "grad_norm": 0.4659819531316393, "learning_rate": 3.471967753755955e-05, "loss": 0.3788, "step": 1138 }, { "epoch": 1.125556104794859, "grad_norm": 0.49099630176051356, "learning_rate": 3.4701355807988275e-05, "loss": 0.4259, "step": 1139 }, { "epoch": 1.1265447355412754, "grad_norm": 0.4569924298785774, "learning_rate": 3.4683034078417e-05, "loss": 0.4028, "step": 1140 }, { "epoch": 1.1275333662876916, "grad_norm": 0.45793120948617716, "learning_rate": 3.4664712348845735e-05, "loss": 0.4108, "step": 1141 }, { "epoch": 1.1285219970341078, "grad_norm": 0.5273019137627184, "learning_rate": 3.464639061927446e-05, "loss": 0.4645, "step": 1142 }, { "epoch": 1.129510627780524, "grad_norm": 0.4774861649651099, "learning_rate": 3.462806888970319e-05, "loss": 0.41, "step": 1143 }, { "epoch": 1.1304992585269402, "grad_norm": 0.47005340241569027, "learning_rate": 3.4609747160131914e-05, "loss": 0.4309, "step": 1144 }, { "epoch": 1.1314878892733564, "grad_norm": 0.469464746322135, "learning_rate": 3.459142543056065e-05, "loss": 0.4029, "step": 1145 }, { "epoch": 1.1324765200197726, "grad_norm": 0.44206309979284936, "learning_rate": 3.4573103700989374e-05, "loss": 0.4216, "step": 1146 }, { "epoch": 1.1334651507661888, "grad_norm": 0.6674198403005634, "learning_rate": 3.455478197141811e-05, "loss": 0.3854, "step": 1147 }, { "epoch": 1.134453781512605, "grad_norm": 0.41410562483801694, "learning_rate": 3.453646024184683e-05, "loss": 0.381, "step": 1148 }, { "epoch": 1.1354424122590212, "grad_norm": 1.5341883997217969, "learning_rate": 3.451813851227556e-05, "loss": 0.4446, "step": 1149 }, { "epoch": 1.1364310430054374, "grad_norm": 0.6230364043897253, "learning_rate": 3.4499816782704286e-05, "loss": 0.3895, "step": 1150 }, { "epoch": 1.1374196737518536, "grad_norm": 0.5179558217565707, "learning_rate": 3.448149505313302e-05, "loss": 0.4354, "step": 1151 }, { "epoch": 1.1384083044982698, "grad_norm": 0.5930759228439343, "learning_rate": 3.4463173323561746e-05, "loss": 0.347, "step": 1152 }, { "epoch": 1.1393969352446862, "grad_norm": 0.6121468484720709, "learning_rate": 3.444485159399047e-05, "loss": 0.3874, "step": 1153 }, { "epoch": 1.1403855659911024, "grad_norm": 0.5929303054074163, "learning_rate": 3.4426529864419206e-05, "loss": 0.404, "step": 1154 }, { "epoch": 1.1413741967375186, "grad_norm": 0.4356473047570349, "learning_rate": 3.440820813484793e-05, "loss": 0.4255, "step": 1155 }, { "epoch": 1.1423628274839348, "grad_norm": 0.6816895112486464, "learning_rate": 3.438988640527666e-05, "loss": 0.4001, "step": 1156 }, { "epoch": 1.143351458230351, "grad_norm": 0.47548358696892945, "learning_rate": 3.4371564675705385e-05, "loss": 0.4096, "step": 1157 }, { "epoch": 1.1443400889767672, "grad_norm": 0.5076813095785352, "learning_rate": 3.435324294613412e-05, "loss": 0.3877, "step": 1158 }, { "epoch": 1.1453287197231834, "grad_norm": 0.48823508133480414, "learning_rate": 3.4334921216562844e-05, "loss": 0.4212, "step": 1159 }, { "epoch": 1.1463173504695996, "grad_norm": 0.5274637166733133, "learning_rate": 3.431659948699157e-05, "loss": 0.3894, "step": 1160 }, { "epoch": 1.1473059812160158, "grad_norm": 0.3998512690739394, "learning_rate": 3.42982777574203e-05, "loss": 0.4188, "step": 1161 }, { "epoch": 1.148294611962432, "grad_norm": 0.5893719649560664, "learning_rate": 3.427995602784903e-05, "loss": 0.4362, "step": 1162 }, { "epoch": 1.1492832427088482, "grad_norm": 0.44735785420756236, "learning_rate": 3.4261634298277764e-05, "loss": 0.3909, "step": 1163 }, { "epoch": 1.1502718734552644, "grad_norm": 0.5241800369662903, "learning_rate": 3.424331256870649e-05, "loss": 0.3858, "step": 1164 }, { "epoch": 1.1512605042016806, "grad_norm": 0.5313407221349512, "learning_rate": 3.4224990839135217e-05, "loss": 0.4055, "step": 1165 }, { "epoch": 1.152249134948097, "grad_norm": 0.48846580082068963, "learning_rate": 3.420666910956394e-05, "loss": 0.3688, "step": 1166 }, { "epoch": 1.1532377656945132, "grad_norm": 0.4900279176013573, "learning_rate": 3.4188347379992676e-05, "loss": 0.4148, "step": 1167 }, { "epoch": 1.1542263964409294, "grad_norm": 0.6267911909563195, "learning_rate": 3.41700256504214e-05, "loss": 0.4146, "step": 1168 }, { "epoch": 1.1552150271873456, "grad_norm": 0.5027302154237945, "learning_rate": 3.415170392085013e-05, "loss": 0.3901, "step": 1169 }, { "epoch": 1.1562036579337618, "grad_norm": 0.5160801945035289, "learning_rate": 3.4133382191278855e-05, "loss": 0.3748, "step": 1170 }, { "epoch": 1.157192288680178, "grad_norm": 0.5599435221241195, "learning_rate": 3.411506046170759e-05, "loss": 0.3786, "step": 1171 }, { "epoch": 1.1581809194265942, "grad_norm": 0.490745897262082, "learning_rate": 3.4096738732136315e-05, "loss": 0.4074, "step": 1172 }, { "epoch": 1.1591695501730104, "grad_norm": 0.6230276629965269, "learning_rate": 3.407841700256504e-05, "loss": 0.381, "step": 1173 }, { "epoch": 1.1601581809194266, "grad_norm": 0.5475890492383286, "learning_rate": 3.4060095272993775e-05, "loss": 0.3927, "step": 1174 }, { "epoch": 1.1611468116658428, "grad_norm": 0.4944111648941994, "learning_rate": 3.40417735434225e-05, "loss": 0.4614, "step": 1175 }, { "epoch": 1.162135442412259, "grad_norm": 0.5745983226331672, "learning_rate": 3.4023451813851234e-05, "loss": 0.3904, "step": 1176 }, { "epoch": 1.1631240731586752, "grad_norm": 0.4880528473468052, "learning_rate": 3.4005130084279954e-05, "loss": 0.3872, "step": 1177 }, { "epoch": 1.1641127039050914, "grad_norm": 0.49145353713402845, "learning_rate": 3.398680835470869e-05, "loss": 0.4041, "step": 1178 }, { "epoch": 1.1651013346515078, "grad_norm": 0.46970386714321466, "learning_rate": 3.3968486625137414e-05, "loss": 0.4479, "step": 1179 }, { "epoch": 1.1660899653979238, "grad_norm": 0.473463044385959, "learning_rate": 3.395016489556615e-05, "loss": 0.3657, "step": 1180 }, { "epoch": 1.1670785961443402, "grad_norm": 0.4656985549669563, "learning_rate": 3.393184316599487e-05, "loss": 0.396, "step": 1181 }, { "epoch": 1.1680672268907564, "grad_norm": 0.4272810043595414, "learning_rate": 3.39135214364236e-05, "loss": 0.3852, "step": 1182 }, { "epoch": 1.1690558576371726, "grad_norm": 0.4326399590807857, "learning_rate": 3.3895199706852326e-05, "loss": 0.3759, "step": 1183 }, { "epoch": 1.1700444883835888, "grad_norm": 0.4387815086099449, "learning_rate": 3.387687797728106e-05, "loss": 0.4238, "step": 1184 }, { "epoch": 1.171033119130005, "grad_norm": 0.48892032267884594, "learning_rate": 3.3858556247709786e-05, "loss": 0.4068, "step": 1185 }, { "epoch": 1.1720217498764212, "grad_norm": 0.4223332125129285, "learning_rate": 3.384023451813851e-05, "loss": 0.3411, "step": 1186 }, { "epoch": 1.1730103806228374, "grad_norm": 0.5356115770130613, "learning_rate": 3.3821912788567245e-05, "loss": 0.4636, "step": 1187 }, { "epoch": 1.1739990113692536, "grad_norm": 0.48450457257877855, "learning_rate": 3.380359105899597e-05, "loss": 0.3995, "step": 1188 }, { "epoch": 1.1749876421156698, "grad_norm": 0.5795060141617989, "learning_rate": 3.37852693294247e-05, "loss": 0.3691, "step": 1189 }, { "epoch": 1.175976272862086, "grad_norm": 0.49981885183797176, "learning_rate": 3.3766947599853424e-05, "loss": 0.391, "step": 1190 }, { "epoch": 1.1769649036085021, "grad_norm": 0.8576851507274152, "learning_rate": 3.374862587028216e-05, "loss": 0.4269, "step": 1191 }, { "epoch": 1.1779535343549183, "grad_norm": 0.504953404444672, "learning_rate": 3.3730304140710884e-05, "loss": 0.3571, "step": 1192 }, { "epoch": 1.1789421651013345, "grad_norm": 0.5600575818384391, "learning_rate": 3.371198241113962e-05, "loss": 0.4266, "step": 1193 }, { "epoch": 1.179930795847751, "grad_norm": 6.721173145300414, "learning_rate": 3.369366068156834e-05, "loss": 0.4496, "step": 1194 }, { "epoch": 1.1809194265941672, "grad_norm": 0.6701645895347984, "learning_rate": 3.367533895199707e-05, "loss": 0.4308, "step": 1195 }, { "epoch": 1.1819080573405834, "grad_norm": 0.5634590580429855, "learning_rate": 3.3657017222425796e-05, "loss": 0.3754, "step": 1196 }, { "epoch": 1.1828966880869995, "grad_norm": 0.4659566309221283, "learning_rate": 3.363869549285453e-05, "loss": 0.395, "step": 1197 }, { "epoch": 1.1838853188334157, "grad_norm": 0.6422359952810714, "learning_rate": 3.3620373763283256e-05, "loss": 0.3972, "step": 1198 }, { "epoch": 1.184873949579832, "grad_norm": 0.4888351373999051, "learning_rate": 3.360205203371198e-05, "loss": 0.4228, "step": 1199 }, { "epoch": 1.1858625803262481, "grad_norm": 0.5230815459123017, "learning_rate": 3.3583730304140716e-05, "loss": 0.3689, "step": 1200 }, { "epoch": 1.1868512110726643, "grad_norm": 0.4753947508502005, "learning_rate": 3.356540857456944e-05, "loss": 0.4139, "step": 1201 }, { "epoch": 1.1878398418190805, "grad_norm": 0.502982836153419, "learning_rate": 3.354708684499817e-05, "loss": 0.4505, "step": 1202 }, { "epoch": 1.1888284725654967, "grad_norm": 0.47673048437152654, "learning_rate": 3.3528765115426895e-05, "loss": 0.3998, "step": 1203 }, { "epoch": 1.189817103311913, "grad_norm": 0.45929798547703826, "learning_rate": 3.351044338585563e-05, "loss": 0.4099, "step": 1204 }, { "epoch": 1.1908057340583291, "grad_norm": 0.5253853312077976, "learning_rate": 3.3492121656284355e-05, "loss": 0.4048, "step": 1205 }, { "epoch": 1.1917943648047453, "grad_norm": 0.4780680664247436, "learning_rate": 3.347379992671308e-05, "loss": 0.4071, "step": 1206 }, { "epoch": 1.1927829955511617, "grad_norm": 0.8011062236091037, "learning_rate": 3.345547819714181e-05, "loss": 0.4377, "step": 1207 }, { "epoch": 1.193771626297578, "grad_norm": 17.624042153705222, "learning_rate": 3.343715646757054e-05, "loss": 0.4987, "step": 1208 }, { "epoch": 1.1947602570439941, "grad_norm": 0.5584606817185883, "learning_rate": 3.341883473799927e-05, "loss": 0.3841, "step": 1209 }, { "epoch": 1.1957488877904103, "grad_norm": 0.4296694752616605, "learning_rate": 3.3400513008428e-05, "loss": 0.4106, "step": 1210 }, { "epoch": 1.1967375185368265, "grad_norm": 0.5172777435228926, "learning_rate": 3.338219127885673e-05, "loss": 0.3984, "step": 1211 }, { "epoch": 1.1977261492832427, "grad_norm": 0.5091784952146166, "learning_rate": 3.336386954928545e-05, "loss": 0.4024, "step": 1212 }, { "epoch": 1.198714780029659, "grad_norm": 0.7881746689964182, "learning_rate": 3.3345547819714186e-05, "loss": 0.3836, "step": 1213 }, { "epoch": 1.1997034107760751, "grad_norm": 0.6064548790019155, "learning_rate": 3.332722609014291e-05, "loss": 0.4111, "step": 1214 }, { "epoch": 1.2006920415224913, "grad_norm": 0.4566511040609773, "learning_rate": 3.330890436057164e-05, "loss": 0.4085, "step": 1215 }, { "epoch": 1.2016806722689075, "grad_norm": 0.5180154695516163, "learning_rate": 3.3290582631000365e-05, "loss": 0.4181, "step": 1216 }, { "epoch": 1.2026693030153237, "grad_norm": 0.48999052958370926, "learning_rate": 3.32722609014291e-05, "loss": 0.4563, "step": 1217 }, { "epoch": 1.20365793376174, "grad_norm": 0.44070663775988356, "learning_rate": 3.3253939171857825e-05, "loss": 0.3679, "step": 1218 }, { "epoch": 1.204646564508156, "grad_norm": 0.5014148228132058, "learning_rate": 3.323561744228655e-05, "loss": 0.4645, "step": 1219 }, { "epoch": 1.2056351952545725, "grad_norm": 0.4175068415061516, "learning_rate": 3.321729571271528e-05, "loss": 0.4244, "step": 1220 }, { "epoch": 1.2066238260009887, "grad_norm": 0.5270845720645412, "learning_rate": 3.319897398314401e-05, "loss": 0.4186, "step": 1221 }, { "epoch": 1.207612456747405, "grad_norm": 0.42927683752896717, "learning_rate": 3.3180652253572744e-05, "loss": 0.3956, "step": 1222 }, { "epoch": 1.2086010874938211, "grad_norm": 0.4590510281677793, "learning_rate": 3.3162330524001464e-05, "loss": 0.4029, "step": 1223 }, { "epoch": 1.2095897182402373, "grad_norm": 0.4838151607464785, "learning_rate": 3.31440087944302e-05, "loss": 0.3972, "step": 1224 }, { "epoch": 1.2105783489866535, "grad_norm": 0.4071252322246172, "learning_rate": 3.3125687064858924e-05, "loss": 0.3963, "step": 1225 }, { "epoch": 1.2115669797330697, "grad_norm": 0.6017611727838739, "learning_rate": 3.310736533528766e-05, "loss": 0.4405, "step": 1226 }, { "epoch": 1.212555610479486, "grad_norm": 0.5043943157981741, "learning_rate": 3.308904360571638e-05, "loss": 0.396, "step": 1227 }, { "epoch": 1.213544241225902, "grad_norm": 0.4629540031278141, "learning_rate": 3.307072187614511e-05, "loss": 0.4196, "step": 1228 }, { "epoch": 1.2145328719723183, "grad_norm": 0.5496789992081548, "learning_rate": 3.3052400146573836e-05, "loss": 0.3943, "step": 1229 }, { "epoch": 1.2155215027187345, "grad_norm": 0.49183355370477005, "learning_rate": 3.303407841700257e-05, "loss": 0.4218, "step": 1230 }, { "epoch": 1.2165101334651507, "grad_norm": 0.5660606739463244, "learning_rate": 3.3015756687431296e-05, "loss": 0.4069, "step": 1231 }, { "epoch": 1.2174987642115669, "grad_norm": 0.6148638222961746, "learning_rate": 3.299743495786002e-05, "loss": 0.4294, "step": 1232 }, { "epoch": 1.2184873949579833, "grad_norm": 0.4973348196802497, "learning_rate": 3.297911322828875e-05, "loss": 0.4191, "step": 1233 }, { "epoch": 1.2194760257043995, "grad_norm": 0.5840994139320763, "learning_rate": 3.296079149871748e-05, "loss": 0.4855, "step": 1234 }, { "epoch": 1.2204646564508157, "grad_norm": 0.44804974063881164, "learning_rate": 3.294246976914621e-05, "loss": 0.4149, "step": 1235 }, { "epoch": 1.221453287197232, "grad_norm": 0.4621006049378892, "learning_rate": 3.2924148039574935e-05, "loss": 0.3704, "step": 1236 }, { "epoch": 1.222441917943648, "grad_norm": 0.4648225484263692, "learning_rate": 3.290582631000367e-05, "loss": 0.3991, "step": 1237 }, { "epoch": 1.2234305486900643, "grad_norm": 0.4242105666038702, "learning_rate": 3.2887504580432394e-05, "loss": 0.4196, "step": 1238 }, { "epoch": 1.2244191794364805, "grad_norm": 0.4583284855844578, "learning_rate": 3.286918285086113e-05, "loss": 0.3953, "step": 1239 }, { "epoch": 1.2254078101828967, "grad_norm": 0.4841069495131265, "learning_rate": 3.285086112128985e-05, "loss": 0.3656, "step": 1240 }, { "epoch": 1.2263964409293129, "grad_norm": 0.5074554090268437, "learning_rate": 3.283253939171858e-05, "loss": 0.3752, "step": 1241 }, { "epoch": 1.227385071675729, "grad_norm": 0.45872179622232856, "learning_rate": 3.2814217662147307e-05, "loss": 0.4057, "step": 1242 }, { "epoch": 1.2283737024221453, "grad_norm": 0.4432771328812737, "learning_rate": 3.279589593257604e-05, "loss": 0.4352, "step": 1243 }, { "epoch": 1.2293623331685615, "grad_norm": 0.484683520708316, "learning_rate": 3.277757420300476e-05, "loss": 0.38, "step": 1244 }, { "epoch": 1.2303509639149777, "grad_norm": 0.3896884484132767, "learning_rate": 3.275925247343349e-05, "loss": 0.4488, "step": 1245 }, { "epoch": 1.231339594661394, "grad_norm": 0.5529367350130314, "learning_rate": 3.2740930743862226e-05, "loss": 0.3776, "step": 1246 }, { "epoch": 1.23232822540781, "grad_norm": 0.41207855674469784, "learning_rate": 3.272260901429095e-05, "loss": 0.3997, "step": 1247 }, { "epoch": 1.2333168561542265, "grad_norm": 0.5021723914311218, "learning_rate": 3.270428728471968e-05, "loss": 0.4058, "step": 1248 }, { "epoch": 1.2343054869006427, "grad_norm": 0.4204777732661184, "learning_rate": 3.2685965555148405e-05, "loss": 0.4182, "step": 1249 }, { "epoch": 1.2352941176470589, "grad_norm": 0.4855448333943084, "learning_rate": 3.266764382557714e-05, "loss": 0.3897, "step": 1250 }, { "epoch": 1.236282748393475, "grad_norm": 0.5129472597566673, "learning_rate": 3.2649322096005865e-05, "loss": 0.4158, "step": 1251 }, { "epoch": 1.2372713791398913, "grad_norm": 0.4234038969588133, "learning_rate": 3.263100036643459e-05, "loss": 0.4003, "step": 1252 }, { "epoch": 1.2382600098863075, "grad_norm": 0.5418140967630688, "learning_rate": 3.261267863686332e-05, "loss": 0.8289, "step": 1253 }, { "epoch": 1.2392486406327237, "grad_norm": 59.127092941719205, "learning_rate": 3.259435690729205e-05, "loss": 0.406, "step": 1254 }, { "epoch": 1.2402372713791399, "grad_norm": 0.5709955151818443, "learning_rate": 3.257603517772078e-05, "loss": 0.4329, "step": 1255 }, { "epoch": 1.241225902125556, "grad_norm": 0.5184014229574909, "learning_rate": 3.255771344814951e-05, "loss": 0.3768, "step": 1256 }, { "epoch": 1.2422145328719723, "grad_norm": 0.46126630082257514, "learning_rate": 3.253939171857823e-05, "loss": 0.4664, "step": 1257 }, { "epoch": 1.2432031636183885, "grad_norm": 0.6969826343308186, "learning_rate": 3.252106998900696e-05, "loss": 0.4004, "step": 1258 }, { "epoch": 1.2441917943648046, "grad_norm": 0.5295120507712621, "learning_rate": 3.2502748259435696e-05, "loss": 0.4113, "step": 1259 }, { "epoch": 1.2451804251112208, "grad_norm": 0.5672479755746335, "learning_rate": 3.248442652986442e-05, "loss": 0.3823, "step": 1260 }, { "epoch": 1.2461690558576373, "grad_norm": 0.44567007445150447, "learning_rate": 3.246610480029315e-05, "loss": 0.4243, "step": 1261 }, { "epoch": 1.2471576866040535, "grad_norm": 0.5954620878221879, "learning_rate": 3.2447783070721876e-05, "loss": 0.3989, "step": 1262 }, { "epoch": 1.2481463173504697, "grad_norm": 0.5164118996480238, "learning_rate": 3.242946134115061e-05, "loss": 0.4164, "step": 1263 }, { "epoch": 1.2491349480968859, "grad_norm": 0.5642356073391426, "learning_rate": 3.2411139611579335e-05, "loss": 0.3646, "step": 1264 }, { "epoch": 1.250123578843302, "grad_norm": 0.5496356897347335, "learning_rate": 3.239281788200806e-05, "loss": 0.4021, "step": 1265 }, { "epoch": 1.2511122095897182, "grad_norm": 0.6192087192002566, "learning_rate": 3.237449615243679e-05, "loss": 0.4301, "step": 1266 }, { "epoch": 1.2521008403361344, "grad_norm": 0.49523632572783405, "learning_rate": 3.235617442286552e-05, "loss": 0.3701, "step": 1267 }, { "epoch": 1.2530894710825506, "grad_norm": 0.6056988400549271, "learning_rate": 3.233785269329425e-05, "loss": 0.4034, "step": 1268 }, { "epoch": 1.2540781018289668, "grad_norm": 0.6138127838378273, "learning_rate": 3.2319530963722974e-05, "loss": 0.419, "step": 1269 }, { "epoch": 1.255066732575383, "grad_norm": 0.5941423059788324, "learning_rate": 3.230120923415171e-05, "loss": 0.3682, "step": 1270 }, { "epoch": 1.2560553633217992, "grad_norm": 0.5157860886532021, "learning_rate": 3.2282887504580434e-05, "loss": 0.4638, "step": 1271 }, { "epoch": 1.2570439940682157, "grad_norm": 0.56887794112486, "learning_rate": 3.226456577500917e-05, "loss": 0.4096, "step": 1272 }, { "epoch": 1.2580326248146316, "grad_norm": 0.5740135888963899, "learning_rate": 3.224624404543789e-05, "loss": 0.4205, "step": 1273 }, { "epoch": 1.259021255561048, "grad_norm": 0.46044177602421005, "learning_rate": 3.222792231586662e-05, "loss": 0.4263, "step": 1274 }, { "epoch": 1.2600098863074642, "grad_norm": 0.4912874945664819, "learning_rate": 3.2209600586295346e-05, "loss": 0.3998, "step": 1275 }, { "epoch": 1.2609985170538804, "grad_norm": 0.555151465548005, "learning_rate": 3.219127885672408e-05, "loss": 0.3923, "step": 1276 }, { "epoch": 1.2619871478002966, "grad_norm": 0.46060860474112914, "learning_rate": 3.2172957127152806e-05, "loss": 0.4514, "step": 1277 }, { "epoch": 1.2629757785467128, "grad_norm": 0.4554628642216183, "learning_rate": 3.215463539758153e-05, "loss": 0.3973, "step": 1278 }, { "epoch": 1.263964409293129, "grad_norm": 0.476770872739123, "learning_rate": 3.213631366801026e-05, "loss": 0.401, "step": 1279 }, { "epoch": 1.2649530400395452, "grad_norm": 0.43478066416666505, "learning_rate": 3.211799193843899e-05, "loss": 0.3888, "step": 1280 }, { "epoch": 1.2659416707859614, "grad_norm": 0.5206516501520284, "learning_rate": 3.209967020886772e-05, "loss": 0.4376, "step": 1281 }, { "epoch": 1.2669303015323776, "grad_norm": 0.5111983248366821, "learning_rate": 3.2081348479296445e-05, "loss": 0.3967, "step": 1282 }, { "epoch": 1.2679189322787938, "grad_norm": 0.48550964083002573, "learning_rate": 3.206302674972518e-05, "loss": 0.4146, "step": 1283 }, { "epoch": 1.26890756302521, "grad_norm": 0.5566992312122225, "learning_rate": 3.2044705020153904e-05, "loss": 0.3983, "step": 1284 }, { "epoch": 1.2698961937716264, "grad_norm": 0.45115507798874444, "learning_rate": 3.202638329058264e-05, "loss": 0.3816, "step": 1285 }, { "epoch": 1.2708848245180424, "grad_norm": 0.5006595897688888, "learning_rate": 3.200806156101136e-05, "loss": 0.3885, "step": 1286 }, { "epoch": 1.2718734552644588, "grad_norm": 0.46478397767724117, "learning_rate": 3.198973983144009e-05, "loss": 0.3768, "step": 1287 }, { "epoch": 1.2728620860108748, "grad_norm": 0.4662182654395551, "learning_rate": 3.197141810186882e-05, "loss": 0.3835, "step": 1288 }, { "epoch": 1.2738507167572912, "grad_norm": 0.465371567431321, "learning_rate": 3.195309637229755e-05, "loss": 0.3749, "step": 1289 }, { "epoch": 1.2748393475037074, "grad_norm": 0.5670290184512599, "learning_rate": 3.193477464272627e-05, "loss": 0.3944, "step": 1290 }, { "epoch": 1.2758279782501236, "grad_norm": 0.46501614677422276, "learning_rate": 3.1916452913155e-05, "loss": 0.4382, "step": 1291 }, { "epoch": 1.2768166089965398, "grad_norm": 0.40662798825046886, "learning_rate": 3.189813118358373e-05, "loss": 0.4017, "step": 1292 }, { "epoch": 1.277805239742956, "grad_norm": 0.49081206446731274, "learning_rate": 3.187980945401246e-05, "loss": 0.4165, "step": 1293 }, { "epoch": 1.2787938704893722, "grad_norm": 0.546590577740249, "learning_rate": 3.186148772444119e-05, "loss": 0.4624, "step": 1294 }, { "epoch": 1.2797825012357884, "grad_norm": 0.3996466598612872, "learning_rate": 3.1843165994869915e-05, "loss": 0.4102, "step": 1295 }, { "epoch": 1.2807711319822046, "grad_norm": 0.5407591087034463, "learning_rate": 3.182484426529865e-05, "loss": 0.3683, "step": 1296 }, { "epoch": 1.2817597627286208, "grad_norm": 1.5152277351784715, "learning_rate": 3.1806522535727375e-05, "loss": 0.4579, "step": 1297 }, { "epoch": 1.282748393475037, "grad_norm": 0.5886976672564956, "learning_rate": 3.17882008061561e-05, "loss": 0.3517, "step": 1298 }, { "epoch": 1.2837370242214532, "grad_norm": 0.47643284473457603, "learning_rate": 3.176987907658483e-05, "loss": 0.3566, "step": 1299 }, { "epoch": 1.2847256549678696, "grad_norm": 0.4520368560623289, "learning_rate": 3.175155734701356e-05, "loss": 0.4143, "step": 1300 }, { "epoch": 1.2857142857142856, "grad_norm": 0.5656951145258373, "learning_rate": 3.173323561744229e-05, "loss": 0.4143, "step": 1301 }, { "epoch": 1.286702916460702, "grad_norm": 0.4771887050859622, "learning_rate": 3.171491388787102e-05, "loss": 0.4546, "step": 1302 }, { "epoch": 1.2876915472071182, "grad_norm": 0.4967656368315062, "learning_rate": 3.169659215829974e-05, "loss": 0.3908, "step": 1303 }, { "epoch": 1.2886801779535344, "grad_norm": 0.6321552754087176, "learning_rate": 3.167827042872847e-05, "loss": 0.386, "step": 1304 }, { "epoch": 1.2896688086999506, "grad_norm": 0.5620899608766924, "learning_rate": 3.16599486991572e-05, "loss": 0.3873, "step": 1305 }, { "epoch": 1.2906574394463668, "grad_norm": 0.48887349226838034, "learning_rate": 3.164162696958593e-05, "loss": 0.4238, "step": 1306 }, { "epoch": 1.291646070192783, "grad_norm": 0.6071293597209868, "learning_rate": 3.162330524001466e-05, "loss": 0.3994, "step": 1307 }, { "epoch": 1.2926347009391992, "grad_norm": 0.4175223455560433, "learning_rate": 3.1604983510443386e-05, "loss": 0.4285, "step": 1308 }, { "epoch": 1.2936233316856154, "grad_norm": 0.5199309668301119, "learning_rate": 3.158666178087212e-05, "loss": 0.4099, "step": 1309 }, { "epoch": 1.2946119624320316, "grad_norm": 4.267186886835991, "learning_rate": 3.1568340051300845e-05, "loss": 0.5247, "step": 1310 }, { "epoch": 1.2956005931784478, "grad_norm": 0.4779055867615723, "learning_rate": 3.155001832172957e-05, "loss": 0.4896, "step": 1311 }, { "epoch": 1.296589223924864, "grad_norm": 0.46437720272395633, "learning_rate": 3.15316965921583e-05, "loss": 0.3516, "step": 1312 }, { "epoch": 1.2975778546712804, "grad_norm": 0.41876335891738437, "learning_rate": 3.151337486258703e-05, "loss": 0.433, "step": 1313 }, { "epoch": 1.2985664854176964, "grad_norm": 0.48392926896854743, "learning_rate": 3.149505313301576e-05, "loss": 0.4522, "step": 1314 }, { "epoch": 1.2995551161641128, "grad_norm": 0.3918876745429913, "learning_rate": 3.1476731403444484e-05, "loss": 0.4156, "step": 1315 }, { "epoch": 1.300543746910529, "grad_norm": 0.3739370540127078, "learning_rate": 3.145840967387321e-05, "loss": 0.4198, "step": 1316 }, { "epoch": 1.3015323776569452, "grad_norm": 0.5457851106009632, "learning_rate": 3.1440087944301944e-05, "loss": 0.41, "step": 1317 }, { "epoch": 1.3025210084033614, "grad_norm": 0.4201227985959441, "learning_rate": 3.142176621473068e-05, "loss": 0.3679, "step": 1318 }, { "epoch": 1.3035096391497776, "grad_norm": 0.449043645786765, "learning_rate": 3.14034444851594e-05, "loss": 0.4242, "step": 1319 }, { "epoch": 1.3044982698961938, "grad_norm": 0.4527174966211094, "learning_rate": 3.138512275558813e-05, "loss": 0.4653, "step": 1320 }, { "epoch": 1.30548690064261, "grad_norm": 0.4658321252472473, "learning_rate": 3.1366801026016856e-05, "loss": 0.4029, "step": 1321 }, { "epoch": 1.3064755313890262, "grad_norm": 0.47710067768034164, "learning_rate": 3.134847929644559e-05, "loss": 0.4573, "step": 1322 }, { "epoch": 1.3074641621354424, "grad_norm": 0.40897490109372336, "learning_rate": 3.1330157566874316e-05, "loss": 0.3783, "step": 1323 }, { "epoch": 1.3084527928818586, "grad_norm": 0.43048787831099794, "learning_rate": 3.131183583730304e-05, "loss": 0.3516, "step": 1324 }, { "epoch": 1.3094414236282748, "grad_norm": 0.5107224905477062, "learning_rate": 3.129351410773177e-05, "loss": 0.4223, "step": 1325 }, { "epoch": 1.3104300543746912, "grad_norm": 0.4516855348679064, "learning_rate": 3.12751923781605e-05, "loss": 0.4021, "step": 1326 }, { "epoch": 1.3114186851211072, "grad_norm": 0.461839562644497, "learning_rate": 3.125687064858923e-05, "loss": 0.4107, "step": 1327 }, { "epoch": 1.3124073158675236, "grad_norm": 0.5878632399650365, "learning_rate": 3.1238548919017955e-05, "loss": 0.3933, "step": 1328 }, { "epoch": 1.3133959466139398, "grad_norm": 0.4179257373766747, "learning_rate": 3.122022718944669e-05, "loss": 0.4238, "step": 1329 }, { "epoch": 1.314384577360356, "grad_norm": 0.47036285652774024, "learning_rate": 3.1201905459875414e-05, "loss": 0.3983, "step": 1330 }, { "epoch": 1.3153732081067722, "grad_norm": 0.4945738645334576, "learning_rate": 3.118358373030415e-05, "loss": 0.4088, "step": 1331 }, { "epoch": 1.3163618388531884, "grad_norm": 0.45510503671344105, "learning_rate": 3.116526200073287e-05, "loss": 0.4058, "step": 1332 }, { "epoch": 1.3173504695996046, "grad_norm": 3.9999916120479004, "learning_rate": 3.11469402711616e-05, "loss": 0.4081, "step": 1333 }, { "epoch": 1.3183391003460208, "grad_norm": 0.5433031881444589, "learning_rate": 3.112861854159033e-05, "loss": 0.3989, "step": 1334 }, { "epoch": 1.319327731092437, "grad_norm": 0.43671811790428505, "learning_rate": 3.111029681201906e-05, "loss": 0.42, "step": 1335 }, { "epoch": 1.3203163618388531, "grad_norm": 0.4184028403344784, "learning_rate": 3.109197508244778e-05, "loss": 0.415, "step": 1336 }, { "epoch": 1.3213049925852693, "grad_norm": 0.5079977075082068, "learning_rate": 3.107365335287651e-05, "loss": 0.4234, "step": 1337 }, { "epoch": 1.3222936233316855, "grad_norm": 0.4561008449204876, "learning_rate": 3.105533162330524e-05, "loss": 0.4642, "step": 1338 }, { "epoch": 1.323282254078102, "grad_norm": 0.5583392710194877, "learning_rate": 3.103700989373397e-05, "loss": 0.4708, "step": 1339 }, { "epoch": 1.324270884824518, "grad_norm": 0.46862780419454314, "learning_rate": 3.10186881641627e-05, "loss": 0.4265, "step": 1340 }, { "epoch": 1.3252595155709344, "grad_norm": 0.4176709299786641, "learning_rate": 3.1000366434591425e-05, "loss": 0.4007, "step": 1341 }, { "epoch": 1.3262481463173506, "grad_norm": 0.4571100165226851, "learning_rate": 3.098204470502016e-05, "loss": 0.3926, "step": 1342 }, { "epoch": 1.3272367770637667, "grad_norm": 0.4352790802510749, "learning_rate": 3.0963722975448885e-05, "loss": 0.4273, "step": 1343 }, { "epoch": 1.328225407810183, "grad_norm": 0.44195900004427335, "learning_rate": 3.094540124587761e-05, "loss": 0.4339, "step": 1344 }, { "epoch": 1.3292140385565991, "grad_norm": 0.45770312201004365, "learning_rate": 3.092707951630634e-05, "loss": 0.4294, "step": 1345 }, { "epoch": 1.3302026693030153, "grad_norm": 0.5017517770960911, "learning_rate": 3.090875778673507e-05, "loss": 0.4593, "step": 1346 }, { "epoch": 1.3311913000494315, "grad_norm": 0.41657012292599577, "learning_rate": 3.08904360571638e-05, "loss": 0.3848, "step": 1347 }, { "epoch": 1.3321799307958477, "grad_norm": 0.4740856581049128, "learning_rate": 3.087211432759253e-05, "loss": 0.3943, "step": 1348 }, { "epoch": 1.333168561542264, "grad_norm": 0.4780682463130518, "learning_rate": 3.085379259802125e-05, "loss": 0.4135, "step": 1349 }, { "epoch": 1.3341571922886801, "grad_norm": 0.41425852862906426, "learning_rate": 3.083547086844998e-05, "loss": 0.4063, "step": 1350 }, { "epoch": 1.3351458230350963, "grad_norm": 0.4152563199617288, "learning_rate": 3.081714913887871e-05, "loss": 0.4371, "step": 1351 }, { "epoch": 1.3361344537815127, "grad_norm": 0.43415775516448757, "learning_rate": 3.079882740930744e-05, "loss": 0.4187, "step": 1352 }, { "epoch": 1.3371230845279287, "grad_norm": 0.42741375010546784, "learning_rate": 3.078050567973617e-05, "loss": 0.4458, "step": 1353 }, { "epoch": 1.3381117152743451, "grad_norm": 0.3826266934969745, "learning_rate": 3.0762183950164896e-05, "loss": 0.395, "step": 1354 }, { "epoch": 1.3391003460207611, "grad_norm": 0.4444801990109976, "learning_rate": 3.074386222059363e-05, "loss": 0.3782, "step": 1355 }, { "epoch": 1.3400889767671775, "grad_norm": 0.39248534592415374, "learning_rate": 3.0725540491022355e-05, "loss": 0.4302, "step": 1356 }, { "epoch": 1.3410776075135937, "grad_norm": 0.4776022675712131, "learning_rate": 3.070721876145108e-05, "loss": 0.4411, "step": 1357 }, { "epoch": 1.34206623826001, "grad_norm": 0.4619472362659103, "learning_rate": 3.068889703187981e-05, "loss": 0.4228, "step": 1358 }, { "epoch": 1.3430548690064261, "grad_norm": 0.5265078337328719, "learning_rate": 3.067057530230854e-05, "loss": 0.4232, "step": 1359 }, { "epoch": 1.3440434997528423, "grad_norm": 0.4433814105017831, "learning_rate": 3.065225357273727e-05, "loss": 0.3926, "step": 1360 }, { "epoch": 1.3450321304992585, "grad_norm": 0.4916383343657681, "learning_rate": 3.0633931843165994e-05, "loss": 0.4476, "step": 1361 }, { "epoch": 1.3460207612456747, "grad_norm": 0.5018234762609542, "learning_rate": 3.061561011359472e-05, "loss": 0.3989, "step": 1362 }, { "epoch": 1.347009391992091, "grad_norm": 0.4136058020135965, "learning_rate": 3.0597288384023454e-05, "loss": 0.3983, "step": 1363 }, { "epoch": 1.347998022738507, "grad_norm": 0.47294096088352344, "learning_rate": 3.057896665445218e-05, "loss": 0.4073, "step": 1364 }, { "epoch": 1.3489866534849233, "grad_norm": 0.5317939176248584, "learning_rate": 3.0560644924880913e-05, "loss": 0.4163, "step": 1365 }, { "epoch": 1.3499752842313395, "grad_norm": 0.4338976456062952, "learning_rate": 3.054232319530964e-05, "loss": 0.3801, "step": 1366 }, { "epoch": 1.350963914977756, "grad_norm": 0.46308817187735013, "learning_rate": 3.0524001465738366e-05, "loss": 0.4088, "step": 1367 }, { "epoch": 1.351952545724172, "grad_norm": 0.4706197525430812, "learning_rate": 3.0505679736167096e-05, "loss": 0.3898, "step": 1368 }, { "epoch": 1.3529411764705883, "grad_norm": 0.4256611148326342, "learning_rate": 3.0487358006595822e-05, "loss": 0.3928, "step": 1369 }, { "epoch": 1.3539298072170045, "grad_norm": 0.43342088299072884, "learning_rate": 3.0469036277024556e-05, "loss": 0.3842, "step": 1370 }, { "epoch": 1.3549184379634207, "grad_norm": 0.4177025967866133, "learning_rate": 3.045071454745328e-05, "loss": 0.384, "step": 1371 }, { "epoch": 1.355907068709837, "grad_norm": 0.4237050422324884, "learning_rate": 3.0432392817882012e-05, "loss": 0.407, "step": 1372 }, { "epoch": 1.356895699456253, "grad_norm": 0.46001566392518645, "learning_rate": 3.0414071088310735e-05, "loss": 0.3644, "step": 1373 }, { "epoch": 1.3578843302026693, "grad_norm": 0.40901426355182147, "learning_rate": 3.0395749358739468e-05, "loss": 0.3757, "step": 1374 }, { "epoch": 1.3588729609490855, "grad_norm": 0.4272997791257551, "learning_rate": 3.037742762916819e-05, "loss": 0.3845, "step": 1375 }, { "epoch": 1.3598615916955017, "grad_norm": 0.5277711746587774, "learning_rate": 3.0359105899596924e-05, "loss": 0.3737, "step": 1376 }, { "epoch": 1.360850222441918, "grad_norm": 0.5227697028857449, "learning_rate": 3.0340784170025654e-05, "loss": 0.4221, "step": 1377 }, { "epoch": 1.361838853188334, "grad_norm": 0.486242578140482, "learning_rate": 3.032246244045438e-05, "loss": 0.4704, "step": 1378 }, { "epoch": 1.3628274839347503, "grad_norm": 0.4033866685232076, "learning_rate": 3.030414071088311e-05, "loss": 0.3695, "step": 1379 }, { "epoch": 1.3638161146811667, "grad_norm": 0.5345651343378519, "learning_rate": 3.0285818981311837e-05, "loss": 0.3892, "step": 1380 }, { "epoch": 1.3648047454275827, "grad_norm": 0.47325544510857287, "learning_rate": 3.0267497251740567e-05, "loss": 0.3872, "step": 1381 }, { "epoch": 1.365793376173999, "grad_norm": 0.5099356747295937, "learning_rate": 3.0249175522169293e-05, "loss": 0.3814, "step": 1382 }, { "epoch": 1.3667820069204153, "grad_norm": 0.4057338047440771, "learning_rate": 3.0230853792598023e-05, "loss": 0.3873, "step": 1383 }, { "epoch": 1.3677706376668315, "grad_norm": 0.7473490364852073, "learning_rate": 3.021253206302675e-05, "loss": 0.4005, "step": 1384 }, { "epoch": 1.3687592684132477, "grad_norm": 0.48978724079856245, "learning_rate": 3.019421033345548e-05, "loss": 0.4519, "step": 1385 }, { "epoch": 1.3697478991596639, "grad_norm": 0.4243356424797571, "learning_rate": 3.0175888603884205e-05, "loss": 0.3829, "step": 1386 }, { "epoch": 1.37073652990608, "grad_norm": 0.4691687510490797, "learning_rate": 3.0157566874312935e-05, "loss": 0.3905, "step": 1387 }, { "epoch": 1.3717251606524963, "grad_norm": 0.45946908004291753, "learning_rate": 3.013924514474166e-05, "loss": 0.4024, "step": 1388 }, { "epoch": 1.3727137913989125, "grad_norm": 0.5023960748658135, "learning_rate": 3.0120923415170395e-05, "loss": 0.3742, "step": 1389 }, { "epoch": 1.3737024221453287, "grad_norm": 0.36816368450556713, "learning_rate": 3.0102601685599125e-05, "loss": 0.4114, "step": 1390 }, { "epoch": 1.3746910528917449, "grad_norm": 0.6879619600779466, "learning_rate": 3.008427995602785e-05, "loss": 0.3504, "step": 1391 }, { "epoch": 1.375679683638161, "grad_norm": 0.5388847535333136, "learning_rate": 3.006595822645658e-05, "loss": 0.4105, "step": 1392 }, { "epoch": 1.3766683143845775, "grad_norm": 0.43779996842972885, "learning_rate": 3.0047636496885307e-05, "loss": 0.4242, "step": 1393 }, { "epoch": 1.3776569451309935, "grad_norm": 0.4795112637040601, "learning_rate": 3.0029314767314037e-05, "loss": 0.425, "step": 1394 }, { "epoch": 1.3786455758774099, "grad_norm": 0.4969719190630632, "learning_rate": 3.0010993037742764e-05, "loss": 0.3827, "step": 1395 }, { "epoch": 1.379634206623826, "grad_norm": 0.3675434470598756, "learning_rate": 2.9992671308171493e-05, "loss": 0.4035, "step": 1396 }, { "epoch": 1.3806228373702423, "grad_norm": 0.49383640033501514, "learning_rate": 2.997434957860022e-05, "loss": 0.4378, "step": 1397 }, { "epoch": 1.3816114681166585, "grad_norm": 0.48486731087862633, "learning_rate": 2.995602784902895e-05, "loss": 0.3906, "step": 1398 }, { "epoch": 1.3826000988630747, "grad_norm": 0.4097542036678895, "learning_rate": 2.9937706119457676e-05, "loss": 0.3843, "step": 1399 }, { "epoch": 1.3835887296094909, "grad_norm": 0.5051631819266683, "learning_rate": 2.9919384389886406e-05, "loss": 0.4699, "step": 1400 }, { "epoch": 1.384577360355907, "grad_norm": 0.4198425751229584, "learning_rate": 2.990106266031514e-05, "loss": 0.4055, "step": 1401 }, { "epoch": 1.3855659911023233, "grad_norm": 0.44146231211279435, "learning_rate": 2.9882740930743862e-05, "loss": 0.4114, "step": 1402 }, { "epoch": 1.3865546218487395, "grad_norm": 0.444770850682391, "learning_rate": 2.9864419201172595e-05, "loss": 0.4025, "step": 1403 }, { "epoch": 1.3875432525951557, "grad_norm": 0.4041460947580985, "learning_rate": 2.9846097471601318e-05, "loss": 0.3837, "step": 1404 }, { "epoch": 1.3885318833415718, "grad_norm": 0.9975805031051895, "learning_rate": 2.982777574203005e-05, "loss": 0.3909, "step": 1405 }, { "epoch": 1.3895205140879883, "grad_norm": 0.441334219705333, "learning_rate": 2.9809454012458778e-05, "loss": 0.4227, "step": 1406 }, { "epoch": 1.3905091448344042, "grad_norm": 0.426260953993668, "learning_rate": 2.9791132282887508e-05, "loss": 0.3926, "step": 1407 }, { "epoch": 1.3914977755808207, "grad_norm": 0.40767382730220714, "learning_rate": 2.9772810553316234e-05, "loss": 0.4134, "step": 1408 }, { "epoch": 1.3924864063272369, "grad_norm": 0.44312511950212763, "learning_rate": 2.9754488823744964e-05, "loss": 0.3968, "step": 1409 }, { "epoch": 1.393475037073653, "grad_norm": 0.41386324737404107, "learning_rate": 2.973616709417369e-05, "loss": 0.4176, "step": 1410 }, { "epoch": 1.3944636678200693, "grad_norm": 0.46309820681206665, "learning_rate": 2.971784536460242e-05, "loss": 0.4144, "step": 1411 }, { "epoch": 1.3954522985664854, "grad_norm": 0.43320545142209255, "learning_rate": 2.9699523635031147e-05, "loss": 0.3914, "step": 1412 }, { "epoch": 1.3964409293129016, "grad_norm": 0.3877742370789793, "learning_rate": 2.9681201905459876e-05, "loss": 0.3906, "step": 1413 }, { "epoch": 1.3974295600593178, "grad_norm": 0.5477162837726974, "learning_rate": 2.9662880175888606e-05, "loss": 0.4015, "step": 1414 }, { "epoch": 1.398418190805734, "grad_norm": 0.5390536600210483, "learning_rate": 2.9644558446317333e-05, "loss": 0.3822, "step": 1415 }, { "epoch": 1.3994068215521502, "grad_norm": 0.4368173869111797, "learning_rate": 2.9626236716746066e-05, "loss": 0.3779, "step": 1416 }, { "epoch": 1.4003954522985664, "grad_norm": 0.4370090587121356, "learning_rate": 2.960791498717479e-05, "loss": 0.4116, "step": 1417 }, { "epoch": 1.4013840830449826, "grad_norm": 0.42922543146310366, "learning_rate": 2.9589593257603522e-05, "loss": 0.4043, "step": 1418 }, { "epoch": 1.402372713791399, "grad_norm": 0.4299377112060891, "learning_rate": 2.9571271528032245e-05, "loss": 0.3936, "step": 1419 }, { "epoch": 1.403361344537815, "grad_norm": 0.43608316927933105, "learning_rate": 2.9552949798460978e-05, "loss": 0.4171, "step": 1420 }, { "epoch": 1.4043499752842314, "grad_norm": 0.42263149818550055, "learning_rate": 2.95346280688897e-05, "loss": 0.3944, "step": 1421 }, { "epoch": 1.4053386060306474, "grad_norm": 0.4755706926751286, "learning_rate": 2.9516306339318434e-05, "loss": 0.4098, "step": 1422 }, { "epoch": 1.4063272367770638, "grad_norm": 0.472713263857781, "learning_rate": 2.9497984609747157e-05, "loss": 0.4283, "step": 1423 }, { "epoch": 1.40731586752348, "grad_norm": 0.47667038007238577, "learning_rate": 2.947966288017589e-05, "loss": 0.4224, "step": 1424 }, { "epoch": 1.4083044982698962, "grad_norm": 0.4374359814380336, "learning_rate": 2.946134115060462e-05, "loss": 0.3833, "step": 1425 }, { "epoch": 1.4092931290163124, "grad_norm": 1.1281600108268426, "learning_rate": 2.9443019421033347e-05, "loss": 0.396, "step": 1426 }, { "epoch": 1.4102817597627286, "grad_norm": 0.45769066205716, "learning_rate": 2.9424697691462077e-05, "loss": 0.3943, "step": 1427 }, { "epoch": 1.4112703905091448, "grad_norm": 0.42682946479997985, "learning_rate": 2.9406375961890803e-05, "loss": 0.4048, "step": 1428 }, { "epoch": 1.412259021255561, "grad_norm": 0.4272054502534407, "learning_rate": 2.9388054232319533e-05, "loss": 0.4141, "step": 1429 }, { "epoch": 1.4132476520019772, "grad_norm": 0.4163314132777008, "learning_rate": 2.936973250274826e-05, "loss": 0.4108, "step": 1430 }, { "epoch": 1.4142362827483934, "grad_norm": 0.47240918972929635, "learning_rate": 2.935141077317699e-05, "loss": 0.3765, "step": 1431 }, { "epoch": 1.4152249134948096, "grad_norm": 0.40276843794670175, "learning_rate": 2.9333089043605716e-05, "loss": 0.393, "step": 1432 }, { "epoch": 1.4162135442412258, "grad_norm": 0.45635568700737, "learning_rate": 2.9314767314034445e-05, "loss": 0.3892, "step": 1433 }, { "epoch": 1.4172021749876422, "grad_norm": 0.37741092285924105, "learning_rate": 2.9296445584463172e-05, "loss": 0.4059, "step": 1434 }, { "epoch": 1.4181908057340582, "grad_norm": 0.37502877212854535, "learning_rate": 2.9278123854891905e-05, "loss": 0.3828, "step": 1435 }, { "epoch": 1.4191794364804746, "grad_norm": 0.43935881417383515, "learning_rate": 2.9259802125320628e-05, "loss": 0.3788, "step": 1436 }, { "epoch": 1.4201680672268908, "grad_norm": 0.46865200154019476, "learning_rate": 2.924148039574936e-05, "loss": 0.3624, "step": 1437 }, { "epoch": 1.421156697973307, "grad_norm": 0.3699007330378844, "learning_rate": 2.922315866617809e-05, "loss": 0.3969, "step": 1438 }, { "epoch": 1.4221453287197232, "grad_norm": 0.43503836306174276, "learning_rate": 2.9204836936606817e-05, "loss": 0.3796, "step": 1439 }, { "epoch": 1.4231339594661394, "grad_norm": 0.3700681440277868, "learning_rate": 2.9186515207035547e-05, "loss": 0.406, "step": 1440 }, { "epoch": 1.4241225902125556, "grad_norm": 0.38594737834830545, "learning_rate": 2.9168193477464274e-05, "loss": 0.4187, "step": 1441 }, { "epoch": 1.4251112209589718, "grad_norm": 0.4099159353246273, "learning_rate": 2.9149871747893003e-05, "loss": 0.4261, "step": 1442 }, { "epoch": 1.426099851705388, "grad_norm": 0.4349778874597987, "learning_rate": 2.913155001832173e-05, "loss": 0.4072, "step": 1443 }, { "epoch": 1.4270884824518042, "grad_norm": 0.4616806187978402, "learning_rate": 2.911322828875046e-05, "loss": 0.428, "step": 1444 }, { "epoch": 1.4280771131982204, "grad_norm": 0.4647002533853401, "learning_rate": 2.9094906559179186e-05, "loss": 0.4269, "step": 1445 }, { "epoch": 1.4290657439446366, "grad_norm": 0.4498429078135831, "learning_rate": 2.9076584829607916e-05, "loss": 0.3735, "step": 1446 }, { "epoch": 1.430054374691053, "grad_norm": 0.411554118934773, "learning_rate": 2.9058263100036642e-05, "loss": 0.3509, "step": 1447 }, { "epoch": 1.431043005437469, "grad_norm": 0.4066845327431252, "learning_rate": 2.9039941370465372e-05, "loss": 0.3862, "step": 1448 }, { "epoch": 1.4320316361838854, "grad_norm": 0.3870140495434619, "learning_rate": 2.9021619640894105e-05, "loss": 0.4351, "step": 1449 }, { "epoch": 1.4330202669303016, "grad_norm": 0.4616640852520161, "learning_rate": 2.900329791132283e-05, "loss": 0.3956, "step": 1450 }, { "epoch": 1.4340088976767178, "grad_norm": 0.556345236394565, "learning_rate": 2.898497618175156e-05, "loss": 0.4037, "step": 1451 }, { "epoch": 1.434997528423134, "grad_norm": 0.4076334918549623, "learning_rate": 2.8966654452180288e-05, "loss": 0.3895, "step": 1452 }, { "epoch": 1.4359861591695502, "grad_norm": 0.5010027240485496, "learning_rate": 2.8948332722609018e-05, "loss": 0.4166, "step": 1453 }, { "epoch": 1.4369747899159664, "grad_norm": 0.4714606603056528, "learning_rate": 2.8930010993037744e-05, "loss": 0.3907, "step": 1454 }, { "epoch": 1.4379634206623826, "grad_norm": 0.4068727523579981, "learning_rate": 2.8911689263466474e-05, "loss": 0.435, "step": 1455 }, { "epoch": 1.4389520514087988, "grad_norm": 0.45375175012445673, "learning_rate": 2.88933675338952e-05, "loss": 0.4101, "step": 1456 }, { "epoch": 1.439940682155215, "grad_norm": 0.38577232059116356, "learning_rate": 2.887504580432393e-05, "loss": 0.354, "step": 1457 }, { "epoch": 1.4409293129016312, "grad_norm": 0.5295377123908974, "learning_rate": 2.8856724074752657e-05, "loss": 0.3734, "step": 1458 }, { "epoch": 1.4419179436480474, "grad_norm": 0.49634293568885707, "learning_rate": 2.8838402345181386e-05, "loss": 0.3972, "step": 1459 }, { "epoch": 1.4429065743944638, "grad_norm": 0.4456021438722438, "learning_rate": 2.8820080615610116e-05, "loss": 0.4136, "step": 1460 }, { "epoch": 1.4438952051408798, "grad_norm": 0.5152990728222527, "learning_rate": 2.8801758886038843e-05, "loss": 0.4101, "step": 1461 }, { "epoch": 1.4448838358872962, "grad_norm": 0.49737259581958904, "learning_rate": 2.8783437156467576e-05, "loss": 0.3775, "step": 1462 }, { "epoch": 1.4458724666337124, "grad_norm": 0.5007663894375608, "learning_rate": 2.87651154268963e-05, "loss": 0.4363, "step": 1463 }, { "epoch": 1.4468610973801286, "grad_norm": 0.4724529669286991, "learning_rate": 2.8746793697325032e-05, "loss": 0.3596, "step": 1464 }, { "epoch": 1.4478497281265448, "grad_norm": 0.4102205134681722, "learning_rate": 2.8728471967753755e-05, "loss": 0.4083, "step": 1465 }, { "epoch": 1.448838358872961, "grad_norm": 0.4927613544637606, "learning_rate": 2.8710150238182488e-05, "loss": 0.412, "step": 1466 }, { "epoch": 1.4498269896193772, "grad_norm": 0.4866787305607417, "learning_rate": 2.869182850861121e-05, "loss": 0.4122, "step": 1467 }, { "epoch": 1.4508156203657934, "grad_norm": 0.41416308050466516, "learning_rate": 2.8673506779039945e-05, "loss": 0.3714, "step": 1468 }, { "epoch": 1.4518042511122096, "grad_norm": 0.4942705851935734, "learning_rate": 2.8655185049468668e-05, "loss": 0.3888, "step": 1469 }, { "epoch": 1.4527928818586258, "grad_norm": 0.4345708018222794, "learning_rate": 2.86368633198974e-05, "loss": 0.357, "step": 1470 }, { "epoch": 1.453781512605042, "grad_norm": 0.35508259878323506, "learning_rate": 2.8618541590326127e-05, "loss": 0.3454, "step": 1471 }, { "epoch": 1.4547701433514582, "grad_norm": 0.45904347457657824, "learning_rate": 2.8600219860754857e-05, "loss": 0.4179, "step": 1472 }, { "epoch": 1.4557587740978746, "grad_norm": 0.5164106179647712, "learning_rate": 2.8581898131183587e-05, "loss": 0.3896, "step": 1473 }, { "epoch": 1.4567474048442905, "grad_norm": 0.40059896558947966, "learning_rate": 2.8563576401612313e-05, "loss": 0.4056, "step": 1474 }, { "epoch": 1.457736035590707, "grad_norm": 0.5372716933188573, "learning_rate": 2.8545254672041043e-05, "loss": 0.4274, "step": 1475 }, { "epoch": 1.4587246663371232, "grad_norm": 0.40725754977374223, "learning_rate": 2.852693294246977e-05, "loss": 0.4179, "step": 1476 }, { "epoch": 1.4597132970835394, "grad_norm": 0.44390518615798935, "learning_rate": 2.85086112128985e-05, "loss": 0.4097, "step": 1477 }, { "epoch": 1.4607019278299556, "grad_norm": 0.48615961968656457, "learning_rate": 2.8490289483327226e-05, "loss": 0.36, "step": 1478 }, { "epoch": 1.4616905585763718, "grad_norm": 0.3714349943918567, "learning_rate": 2.8471967753755955e-05, "loss": 0.378, "step": 1479 }, { "epoch": 1.462679189322788, "grad_norm": 0.4058671619642228, "learning_rate": 2.8453646024184682e-05, "loss": 0.4081, "step": 1480 }, { "epoch": 1.4636678200692042, "grad_norm": 0.5055842159551689, "learning_rate": 2.8435324294613415e-05, "loss": 0.3841, "step": 1481 }, { "epoch": 1.4646564508156203, "grad_norm": 0.47797252087167624, "learning_rate": 2.8417002565042138e-05, "loss": 0.4038, "step": 1482 }, { "epoch": 1.4656450815620365, "grad_norm": 0.44493513719100947, "learning_rate": 2.839868083547087e-05, "loss": 0.3899, "step": 1483 }, { "epoch": 1.4666337123084527, "grad_norm": 0.5372533121822897, "learning_rate": 2.83803591058996e-05, "loss": 0.4216, "step": 1484 }, { "epoch": 1.467622343054869, "grad_norm": 0.4429336992125347, "learning_rate": 2.8362037376328327e-05, "loss": 0.4481, "step": 1485 }, { "epoch": 1.4686109738012854, "grad_norm": 0.4376024304523968, "learning_rate": 2.8343715646757057e-05, "loss": 0.3983, "step": 1486 }, { "epoch": 1.4695996045477013, "grad_norm": 0.4682188352448761, "learning_rate": 2.8325393917185784e-05, "loss": 0.3802, "step": 1487 }, { "epoch": 1.4705882352941178, "grad_norm": 0.39119610113045195, "learning_rate": 2.8307072187614514e-05, "loss": 0.4119, "step": 1488 }, { "epoch": 1.4715768660405337, "grad_norm": 0.5137951214431529, "learning_rate": 2.828875045804324e-05, "loss": 0.4288, "step": 1489 }, { "epoch": 1.4725654967869501, "grad_norm": 3.103776243925187, "learning_rate": 2.827042872847197e-05, "loss": 0.3754, "step": 1490 }, { "epoch": 1.4735541275333663, "grad_norm": 0.4879054924978363, "learning_rate": 2.8252106998900696e-05, "loss": 0.3798, "step": 1491 }, { "epoch": 1.4745427582797825, "grad_norm": 0.7217635109257813, "learning_rate": 2.8233785269329426e-05, "loss": 0.4278, "step": 1492 }, { "epoch": 1.4755313890261987, "grad_norm": 0.4986536237719154, "learning_rate": 2.8215463539758152e-05, "loss": 0.4065, "step": 1493 }, { "epoch": 1.476520019772615, "grad_norm": 0.9989100749488137, "learning_rate": 2.8197141810186882e-05, "loss": 0.3953, "step": 1494 }, { "epoch": 1.4775086505190311, "grad_norm": 0.46437297385662046, "learning_rate": 2.817882008061561e-05, "loss": 0.3635, "step": 1495 }, { "epoch": 1.4784972812654473, "grad_norm": 0.3997210054617243, "learning_rate": 2.816049835104434e-05, "loss": 0.3832, "step": 1496 }, { "epoch": 1.4794859120118635, "grad_norm": 0.41933975236331383, "learning_rate": 2.814217662147307e-05, "loss": 0.4178, "step": 1497 }, { "epoch": 1.4804745427582797, "grad_norm": 0.4811522606067737, "learning_rate": 2.8123854891901798e-05, "loss": 0.389, "step": 1498 }, { "epoch": 1.481463173504696, "grad_norm": 0.4132122123585108, "learning_rate": 2.8105533162330528e-05, "loss": 0.3853, "step": 1499 }, { "epoch": 1.4824518042511121, "grad_norm": 0.4979391858140964, "learning_rate": 2.8087211432759254e-05, "loss": 0.4074, "step": 1500 }, { "epoch": 1.4834404349975285, "grad_norm": 0.4494998588537741, "learning_rate": 2.8068889703187984e-05, "loss": 0.3883, "step": 1501 }, { "epoch": 1.4844290657439445, "grad_norm": 0.5114359012300849, "learning_rate": 2.805056797361671e-05, "loss": 0.3797, "step": 1502 }, { "epoch": 1.485417696490361, "grad_norm": 0.41034372016482507, "learning_rate": 2.803224624404544e-05, "loss": 0.4268, "step": 1503 }, { "epoch": 1.4864063272367771, "grad_norm": 0.4476158043314473, "learning_rate": 2.8013924514474167e-05, "loss": 0.3766, "step": 1504 }, { "epoch": 1.4873949579831933, "grad_norm": 0.41090035728085006, "learning_rate": 2.7995602784902896e-05, "loss": 0.4117, "step": 1505 }, { "epoch": 1.4883835887296095, "grad_norm": 0.4506400265519517, "learning_rate": 2.7977281055331623e-05, "loss": 0.4091, "step": 1506 }, { "epoch": 1.4893722194760257, "grad_norm": 0.4899830888032862, "learning_rate": 2.7958959325760353e-05, "loss": 0.4369, "step": 1507 }, { "epoch": 1.490360850222442, "grad_norm": 0.43671611709874125, "learning_rate": 2.7940637596189086e-05, "loss": 0.4037, "step": 1508 }, { "epoch": 1.491349480968858, "grad_norm": 0.5167892154691446, "learning_rate": 2.792231586661781e-05, "loss": 0.3827, "step": 1509 }, { "epoch": 1.4923381117152743, "grad_norm": 0.371922751234388, "learning_rate": 2.7903994137046542e-05, "loss": 0.3957, "step": 1510 }, { "epoch": 1.4933267424616905, "grad_norm": 0.6108575614148054, "learning_rate": 2.7885672407475265e-05, "loss": 0.4541, "step": 1511 }, { "epoch": 1.4943153732081067, "grad_norm": 0.41429154591364836, "learning_rate": 2.7867350677904e-05, "loss": 0.4111, "step": 1512 }, { "epoch": 1.495304003954523, "grad_norm": 0.596547081300597, "learning_rate": 2.784902894833272e-05, "loss": 0.3919, "step": 1513 }, { "epoch": 1.4962926347009393, "grad_norm": 0.5094539608133031, "learning_rate": 2.7830707218761455e-05, "loss": 0.3805, "step": 1514 }, { "epoch": 1.4972812654473553, "grad_norm": 0.56961460141681, "learning_rate": 2.7812385489190178e-05, "loss": 0.3813, "step": 1515 }, { "epoch": 1.4982698961937717, "grad_norm": 0.46339538421967663, "learning_rate": 2.779406375961891e-05, "loss": 0.4113, "step": 1516 }, { "epoch": 1.499258526940188, "grad_norm": 0.5812882943435661, "learning_rate": 2.7775742030047637e-05, "loss": 0.3955, "step": 1517 }, { "epoch": 1.500247157686604, "grad_norm": 0.47243958692012183, "learning_rate": 2.7757420300476367e-05, "loss": 0.3819, "step": 1518 }, { "epoch": 1.5012357884330203, "grad_norm": 0.5343270404175471, "learning_rate": 2.7739098570905093e-05, "loss": 0.4002, "step": 1519 }, { "epoch": 1.5022244191794365, "grad_norm": 0.48978763196581326, "learning_rate": 2.7720776841333823e-05, "loss": 0.4108, "step": 1520 }, { "epoch": 1.5032130499258527, "grad_norm": 0.4584060551574947, "learning_rate": 2.7702455111762553e-05, "loss": 0.4027, "step": 1521 }, { "epoch": 1.504201680672269, "grad_norm": 0.5314473004612787, "learning_rate": 2.768413338219128e-05, "loss": 0.4127, "step": 1522 }, { "epoch": 1.505190311418685, "grad_norm": 0.5028433150434978, "learning_rate": 2.766581165262001e-05, "loss": 0.3966, "step": 1523 }, { "epoch": 1.5061789421651013, "grad_norm": 0.4643288049813141, "learning_rate": 2.7647489923048736e-05, "loss": 0.4748, "step": 1524 }, { "epoch": 1.5071675729115177, "grad_norm": 0.40581260323859836, "learning_rate": 2.7629168193477466e-05, "loss": 0.3693, "step": 1525 }, { "epoch": 1.5081562036579337, "grad_norm": 0.46423684511384644, "learning_rate": 2.7610846463906192e-05, "loss": 0.3647, "step": 1526 }, { "epoch": 1.50914483440435, "grad_norm": 0.37491623934334417, "learning_rate": 2.7592524734334925e-05, "loss": 0.3793, "step": 1527 }, { "epoch": 1.510133465150766, "grad_norm": 0.42816971887652916, "learning_rate": 2.7574203004763648e-05, "loss": 0.422, "step": 1528 }, { "epoch": 1.5111220958971825, "grad_norm": 0.4751355807445936, "learning_rate": 2.755588127519238e-05, "loss": 0.3806, "step": 1529 }, { "epoch": 1.5121107266435985, "grad_norm": 0.4411839547269056, "learning_rate": 2.7537559545621104e-05, "loss": 0.3461, "step": 1530 }, { "epoch": 1.5130993573900149, "grad_norm": 0.38518242850077455, "learning_rate": 2.7519237816049838e-05, "loss": 0.3713, "step": 1531 }, { "epoch": 1.514087988136431, "grad_norm": 0.5156378658334005, "learning_rate": 2.7500916086478567e-05, "loss": 0.4222, "step": 1532 }, { "epoch": 1.5150766188828473, "grad_norm": 0.44661701686926886, "learning_rate": 2.7482594356907294e-05, "loss": 0.3903, "step": 1533 }, { "epoch": 1.5160652496292635, "grad_norm": 0.3544520482430884, "learning_rate": 2.7464272627336024e-05, "loss": 0.4139, "step": 1534 }, { "epoch": 1.5170538803756797, "grad_norm": 0.4915153065702854, "learning_rate": 2.744595089776475e-05, "loss": 0.3847, "step": 1535 }, { "epoch": 1.5180425111220959, "grad_norm": 0.4089527960840625, "learning_rate": 2.742762916819348e-05, "loss": 0.4333, "step": 1536 }, { "epoch": 1.519031141868512, "grad_norm": 0.614587175836764, "learning_rate": 2.7409307438622206e-05, "loss": 0.399, "step": 1537 }, { "epoch": 1.5200197726149285, "grad_norm": 0.45550662372604833, "learning_rate": 2.7390985709050936e-05, "loss": 0.441, "step": 1538 }, { "epoch": 1.5210084033613445, "grad_norm": 0.4824956195108096, "learning_rate": 2.7372663979479662e-05, "loss": 0.4104, "step": 1539 }, { "epoch": 1.5219970341077609, "grad_norm": 0.46708521242933027, "learning_rate": 2.7354342249908392e-05, "loss": 0.3851, "step": 1540 }, { "epoch": 1.5229856648541769, "grad_norm": 0.46393531963664086, "learning_rate": 2.733602052033712e-05, "loss": 0.43, "step": 1541 }, { "epoch": 1.5239742956005933, "grad_norm": 0.4921231829673673, "learning_rate": 2.731769879076585e-05, "loss": 0.4068, "step": 1542 }, { "epoch": 1.5249629263470093, "grad_norm": 0.4347199942936415, "learning_rate": 2.7299377061194575e-05, "loss": 0.3934, "step": 1543 }, { "epoch": 1.5259515570934257, "grad_norm": 0.4057898630632911, "learning_rate": 2.7281055331623305e-05, "loss": 0.3884, "step": 1544 }, { "epoch": 1.5269401878398419, "grad_norm": 0.42667667061674325, "learning_rate": 2.7262733602052038e-05, "loss": 0.4065, "step": 1545 }, { "epoch": 1.527928818586258, "grad_norm": 0.4488997684290841, "learning_rate": 2.7244411872480764e-05, "loss": 0.3876, "step": 1546 }, { "epoch": 1.5289174493326743, "grad_norm": 0.4336831490379225, "learning_rate": 2.7226090142909494e-05, "loss": 0.3865, "step": 1547 }, { "epoch": 1.5299060800790905, "grad_norm": 0.4686451574373533, "learning_rate": 2.720776841333822e-05, "loss": 0.3865, "step": 1548 }, { "epoch": 1.5308947108255067, "grad_norm": 0.39695352946571816, "learning_rate": 2.718944668376695e-05, "loss": 0.423, "step": 1549 }, { "epoch": 1.5318833415719229, "grad_norm": 0.5716269351677139, "learning_rate": 2.7171124954195677e-05, "loss": 0.4009, "step": 1550 }, { "epoch": 1.532871972318339, "grad_norm": 0.4587099837052053, "learning_rate": 2.7152803224624407e-05, "loss": 0.4496, "step": 1551 }, { "epoch": 1.5338606030647552, "grad_norm": 0.4631352660544808, "learning_rate": 2.7134481495053133e-05, "loss": 0.3833, "step": 1552 }, { "epoch": 1.5348492338111717, "grad_norm": 0.5129221035772229, "learning_rate": 2.7116159765481863e-05, "loss": 0.3785, "step": 1553 }, { "epoch": 1.5358378645575876, "grad_norm": 0.4325122899001676, "learning_rate": 2.709783803591059e-05, "loss": 0.3668, "step": 1554 }, { "epoch": 1.536826495304004, "grad_norm": 0.5239351028759107, "learning_rate": 2.707951630633932e-05, "loss": 0.3796, "step": 1555 }, { "epoch": 1.53781512605042, "grad_norm": 0.4672290219768155, "learning_rate": 2.7061194576768052e-05, "loss": 0.398, "step": 1556 }, { "epoch": 1.5388037567968365, "grad_norm": 0.49589231342523293, "learning_rate": 2.7042872847196775e-05, "loss": 0.3967, "step": 1557 }, { "epoch": 1.5397923875432526, "grad_norm": 0.41649128689603704, "learning_rate": 2.702455111762551e-05, "loss": 0.3854, "step": 1558 }, { "epoch": 1.5407810182896688, "grad_norm": 0.44211366879564284, "learning_rate": 2.700622938805423e-05, "loss": 0.4066, "step": 1559 }, { "epoch": 1.541769649036085, "grad_norm": 0.5255125526417789, "learning_rate": 2.6987907658482965e-05, "loss": 0.3911, "step": 1560 }, { "epoch": 1.5427582797825012, "grad_norm": 0.3585096582291703, "learning_rate": 2.6969585928911688e-05, "loss": 0.3851, "step": 1561 }, { "epoch": 1.5437469105289174, "grad_norm": 0.4557113587278548, "learning_rate": 2.695126419934042e-05, "loss": 0.4115, "step": 1562 }, { "epoch": 1.5447355412753336, "grad_norm": 0.5117357709104579, "learning_rate": 2.6932942469769147e-05, "loss": 0.4057, "step": 1563 }, { "epoch": 1.5457241720217498, "grad_norm": 0.4203491673321529, "learning_rate": 2.6914620740197877e-05, "loss": 0.3908, "step": 1564 }, { "epoch": 1.546712802768166, "grad_norm": 0.5204307305060335, "learning_rate": 2.6896299010626604e-05, "loss": 0.3709, "step": 1565 }, { "epoch": 1.5477014335145824, "grad_norm": 0.526740798881196, "learning_rate": 2.6877977281055333e-05, "loss": 0.451, "step": 1566 }, { "epoch": 1.5486900642609984, "grad_norm": 0.4578379362180909, "learning_rate": 2.685965555148406e-05, "loss": 0.4173, "step": 1567 }, { "epoch": 1.5496786950074148, "grad_norm": 0.5616856153852952, "learning_rate": 2.684133382191279e-05, "loss": 0.355, "step": 1568 }, { "epoch": 1.5506673257538308, "grad_norm": 0.3835955282164102, "learning_rate": 2.682301209234152e-05, "loss": 0.3777, "step": 1569 }, { "epoch": 1.5516559565002472, "grad_norm": 0.7208717650221993, "learning_rate": 2.6804690362770246e-05, "loss": 0.397, "step": 1570 }, { "epoch": 1.5526445872466632, "grad_norm": 0.3853466170336842, "learning_rate": 2.6786368633198976e-05, "loss": 0.3977, "step": 1571 }, { "epoch": 1.5536332179930796, "grad_norm": 0.5046699217091802, "learning_rate": 2.6768046903627702e-05, "loss": 0.3999, "step": 1572 }, { "epoch": 1.5546218487394958, "grad_norm": 0.42074241549368513, "learning_rate": 2.6749725174056435e-05, "loss": 0.4363, "step": 1573 }, { "epoch": 1.555610479485912, "grad_norm": 0.41829320975141493, "learning_rate": 2.6731403444485158e-05, "loss": 0.4069, "step": 1574 }, { "epoch": 1.5565991102323282, "grad_norm": 0.40181383043360847, "learning_rate": 2.671308171491389e-05, "loss": 0.3912, "step": 1575 }, { "epoch": 1.5575877409787444, "grad_norm": 0.3934136259577379, "learning_rate": 2.6694759985342614e-05, "loss": 0.4208, "step": 1576 }, { "epoch": 1.5585763717251606, "grad_norm": 0.38033001297621777, "learning_rate": 2.6676438255771348e-05, "loss": 0.4152, "step": 1577 }, { "epoch": 1.5595650024715768, "grad_norm": 0.4113357550225746, "learning_rate": 2.665811652620007e-05, "loss": 0.4021, "step": 1578 }, { "epoch": 1.5605536332179932, "grad_norm": 0.35734672492848346, "learning_rate": 2.6639794796628804e-05, "loss": 0.3754, "step": 1579 }, { "epoch": 1.5615422639644092, "grad_norm": 0.3911806867594729, "learning_rate": 2.6621473067057534e-05, "loss": 0.3457, "step": 1580 }, { "epoch": 1.5625308947108256, "grad_norm": 0.3991150699950756, "learning_rate": 2.660315133748626e-05, "loss": 0.3567, "step": 1581 }, { "epoch": 1.5635195254572416, "grad_norm": 0.35326735393067965, "learning_rate": 2.658482960791499e-05, "loss": 0.3659, "step": 1582 }, { "epoch": 1.564508156203658, "grad_norm": 0.4255209356643333, "learning_rate": 2.6566507878343716e-05, "loss": 0.4292, "step": 1583 }, { "epoch": 1.565496786950074, "grad_norm": 0.4353740970659335, "learning_rate": 2.6548186148772446e-05, "loss": 0.4002, "step": 1584 }, { "epoch": 1.5664854176964904, "grad_norm": 0.42081879951436735, "learning_rate": 2.6529864419201173e-05, "loss": 0.3782, "step": 1585 }, { "epoch": 1.5674740484429066, "grad_norm": 0.3972031919119579, "learning_rate": 2.6511542689629902e-05, "loss": 0.3745, "step": 1586 }, { "epoch": 1.5684626791893228, "grad_norm": 0.3969095309957812, "learning_rate": 2.649322096005863e-05, "loss": 0.4093, "step": 1587 }, { "epoch": 1.569451309935739, "grad_norm": 0.4561685715882719, "learning_rate": 2.647489923048736e-05, "loss": 0.4179, "step": 1588 }, { "epoch": 1.5704399406821552, "grad_norm": 0.3905192588914526, "learning_rate": 2.6456577500916085e-05, "loss": 0.4024, "step": 1589 }, { "epoch": 1.5714285714285714, "grad_norm": 0.4880112714785334, "learning_rate": 2.6438255771344815e-05, "loss": 0.4267, "step": 1590 }, { "epoch": 1.5724172021749876, "grad_norm": 0.44144264018529167, "learning_rate": 2.641993404177354e-05, "loss": 0.3923, "step": 1591 }, { "epoch": 1.573405832921404, "grad_norm": 0.4520891954712245, "learning_rate": 2.6401612312202274e-05, "loss": 0.3512, "step": 1592 }, { "epoch": 1.57439446366782, "grad_norm": 0.3968382938238775, "learning_rate": 2.6383290582631004e-05, "loss": 0.3881, "step": 1593 }, { "epoch": 1.5753830944142364, "grad_norm": 0.4291691262159773, "learning_rate": 2.636496885305973e-05, "loss": 0.433, "step": 1594 }, { "epoch": 1.5763717251606524, "grad_norm": 0.41935775226347205, "learning_rate": 2.634664712348846e-05, "loss": 0.3922, "step": 1595 }, { "epoch": 1.5773603559070688, "grad_norm": 0.43360261823368007, "learning_rate": 2.6328325393917187e-05, "loss": 0.396, "step": 1596 }, { "epoch": 1.5783489866534848, "grad_norm": 0.4383713270861738, "learning_rate": 2.6310003664345917e-05, "loss": 0.3887, "step": 1597 }, { "epoch": 1.5793376173999012, "grad_norm": 0.36390497709690867, "learning_rate": 2.6291681934774643e-05, "loss": 0.4425, "step": 1598 }, { "epoch": 1.5803262481463174, "grad_norm": 0.45135046474611445, "learning_rate": 2.6273360205203373e-05, "loss": 0.3862, "step": 1599 }, { "epoch": 1.5813148788927336, "grad_norm": 0.92954368995101, "learning_rate": 2.62550384756321e-05, "loss": 0.3756, "step": 1600 }, { "epoch": 1.5823035096391498, "grad_norm": 0.40958295171463, "learning_rate": 2.623671674606083e-05, "loss": 0.3979, "step": 1601 }, { "epoch": 1.583292140385566, "grad_norm": 0.41452893354607, "learning_rate": 2.6218395016489556e-05, "loss": 0.4288, "step": 1602 }, { "epoch": 1.5842807711319822, "grad_norm": 0.48473094536207595, "learning_rate": 2.6200073286918285e-05, "loss": 0.3971, "step": 1603 }, { "epoch": 1.5852694018783984, "grad_norm": 0.42085590958203406, "learning_rate": 2.618175155734702e-05, "loss": 0.4016, "step": 1604 }, { "epoch": 1.5862580326248148, "grad_norm": 0.4669388384357012, "learning_rate": 2.616342982777574e-05, "loss": 0.3646, "step": 1605 }, { "epoch": 1.5872466633712308, "grad_norm": 0.8315584679703752, "learning_rate": 2.6145108098204475e-05, "loss": 0.3803, "step": 1606 }, { "epoch": 1.5882352941176472, "grad_norm": 0.38156186820815857, "learning_rate": 2.6126786368633198e-05, "loss": 0.4005, "step": 1607 }, { "epoch": 1.5892239248640632, "grad_norm": 0.5310062095974154, "learning_rate": 2.610846463906193e-05, "loss": 0.4068, "step": 1608 }, { "epoch": 1.5902125556104796, "grad_norm": 0.42807078833352297, "learning_rate": 2.6090142909490657e-05, "loss": 0.4144, "step": 1609 }, { "epoch": 1.5912011863568956, "grad_norm": 0.46074314056849425, "learning_rate": 2.6071821179919387e-05, "loss": 0.3554, "step": 1610 }, { "epoch": 1.592189817103312, "grad_norm": 1.8133458142893883, "learning_rate": 2.6053499450348114e-05, "loss": 0.4515, "step": 1611 }, { "epoch": 1.5931784478497282, "grad_norm": 0.5048943121065849, "learning_rate": 2.6035177720776843e-05, "loss": 0.4008, "step": 1612 }, { "epoch": 1.5941670785961444, "grad_norm": 0.4657591101871947, "learning_rate": 2.601685599120557e-05, "loss": 0.4229, "step": 1613 }, { "epoch": 1.5951557093425606, "grad_norm": 0.5183467852607514, "learning_rate": 2.59985342616343e-05, "loss": 0.4144, "step": 1614 }, { "epoch": 1.5961443400889768, "grad_norm": 0.44071678691580635, "learning_rate": 2.598021253206303e-05, "loss": 0.4117, "step": 1615 }, { "epoch": 1.597132970835393, "grad_norm": 0.40235668016483833, "learning_rate": 2.5961890802491756e-05, "loss": 0.42, "step": 1616 }, { "epoch": 1.5981216015818092, "grad_norm": 0.5277066211320178, "learning_rate": 2.5943569072920486e-05, "loss": 0.4076, "step": 1617 }, { "epoch": 1.5991102323282254, "grad_norm": 0.36175350180593563, "learning_rate": 2.5925247343349212e-05, "loss": 0.3701, "step": 1618 }, { "epoch": 1.6000988630746416, "grad_norm": 0.3959549290443186, "learning_rate": 2.5906925613777945e-05, "loss": 0.3866, "step": 1619 }, { "epoch": 1.601087493821058, "grad_norm": 0.38483902641881595, "learning_rate": 2.5888603884206668e-05, "loss": 0.4304, "step": 1620 }, { "epoch": 1.602076124567474, "grad_norm": 0.421933415535332, "learning_rate": 2.58702821546354e-05, "loss": 0.3657, "step": 1621 }, { "epoch": 1.6030647553138904, "grad_norm": 0.37841821347024, "learning_rate": 2.5851960425064125e-05, "loss": 0.4089, "step": 1622 }, { "epoch": 1.6040533860603063, "grad_norm": 0.4931090146334283, "learning_rate": 2.5833638695492858e-05, "loss": 0.3816, "step": 1623 }, { "epoch": 1.6050420168067228, "grad_norm": 0.4521718183941848, "learning_rate": 2.581531696592158e-05, "loss": 0.3824, "step": 1624 }, { "epoch": 1.606030647553139, "grad_norm": 0.4206126560719891, "learning_rate": 2.5796995236350314e-05, "loss": 0.4141, "step": 1625 }, { "epoch": 1.6070192782995552, "grad_norm": 0.40268309810586955, "learning_rate": 2.5778673506779037e-05, "loss": 0.4155, "step": 1626 }, { "epoch": 1.6080079090459714, "grad_norm": 0.4816197776174261, "learning_rate": 2.576035177720777e-05, "loss": 0.4273, "step": 1627 }, { "epoch": 1.6089965397923875, "grad_norm": 0.4087791283535, "learning_rate": 2.57420300476365e-05, "loss": 0.4462, "step": 1628 }, { "epoch": 1.6099851705388037, "grad_norm": 0.43409196529537075, "learning_rate": 2.5723708318065226e-05, "loss": 0.4048, "step": 1629 }, { "epoch": 1.61097380128522, "grad_norm": 0.47886453083503044, "learning_rate": 2.5705386588493956e-05, "loss": 0.4233, "step": 1630 }, { "epoch": 1.6119624320316361, "grad_norm": 0.40022228703666884, "learning_rate": 2.5687064858922683e-05, "loss": 0.3658, "step": 1631 }, { "epoch": 1.6129510627780523, "grad_norm": 0.5732323102107635, "learning_rate": 2.5668743129351412e-05, "loss": 0.4252, "step": 1632 }, { "epoch": 1.6139396935244688, "grad_norm": 0.4275189816534108, "learning_rate": 2.565042139978014e-05, "loss": 0.4857, "step": 1633 }, { "epoch": 1.6149283242708847, "grad_norm": 2.7664515807974825, "learning_rate": 2.563209967020887e-05, "loss": 0.3946, "step": 1634 }, { "epoch": 1.6159169550173011, "grad_norm": 0.47740952603627956, "learning_rate": 2.5613777940637595e-05, "loss": 0.3569, "step": 1635 }, { "epoch": 1.6169055857637171, "grad_norm": 0.40553609004295255, "learning_rate": 2.5595456211066325e-05, "loss": 0.3968, "step": 1636 }, { "epoch": 1.6178942165101335, "grad_norm": 0.5446213252351468, "learning_rate": 2.557713448149505e-05, "loss": 0.4128, "step": 1637 }, { "epoch": 1.6188828472565495, "grad_norm": 0.4095982608814269, "learning_rate": 2.5558812751923784e-05, "loss": 0.4258, "step": 1638 }, { "epoch": 1.619871478002966, "grad_norm": 0.5117643060127223, "learning_rate": 2.5540491022352514e-05, "loss": 0.4283, "step": 1639 }, { "epoch": 1.6208601087493821, "grad_norm": 1.6625336983020944, "learning_rate": 2.552216929278124e-05, "loss": 0.437, "step": 1640 }, { "epoch": 1.6218487394957983, "grad_norm": 0.5156974297095375, "learning_rate": 2.550384756320997e-05, "loss": 0.409, "step": 1641 }, { "epoch": 1.6228373702422145, "grad_norm": 0.48650815962335153, "learning_rate": 2.5485525833638697e-05, "loss": 0.3825, "step": 1642 }, { "epoch": 1.6238260009886307, "grad_norm": 0.4021513167015719, "learning_rate": 2.5467204104067427e-05, "loss": 0.4277, "step": 1643 }, { "epoch": 1.624814631735047, "grad_norm": 0.5020147328835822, "learning_rate": 2.5448882374496153e-05, "loss": 0.429, "step": 1644 }, { "epoch": 1.6258032624814631, "grad_norm": 0.4523159525038009, "learning_rate": 2.5430560644924883e-05, "loss": 0.3986, "step": 1645 }, { "epoch": 1.6267918932278795, "grad_norm": 0.7784192628078034, "learning_rate": 2.541223891535361e-05, "loss": 0.3761, "step": 1646 }, { "epoch": 1.6277805239742955, "grad_norm": 0.47795886340761057, "learning_rate": 2.539391718578234e-05, "loss": 0.4274, "step": 1647 }, { "epoch": 1.628769154720712, "grad_norm": 0.5147293673069718, "learning_rate": 2.5375595456211066e-05, "loss": 0.393, "step": 1648 }, { "epoch": 1.629757785467128, "grad_norm": 0.42791844516397193, "learning_rate": 2.5357273726639795e-05, "loss": 0.4304, "step": 1649 }, { "epoch": 1.6307464162135443, "grad_norm": 0.5185729621910314, "learning_rate": 2.5338951997068522e-05, "loss": 0.4517, "step": 1650 }, { "epoch": 1.6317350469599603, "grad_norm": 0.4792853861358086, "learning_rate": 2.532063026749725e-05, "loss": 0.4469, "step": 1651 }, { "epoch": 1.6327236777063767, "grad_norm": 0.4834453492296931, "learning_rate": 2.5302308537925985e-05, "loss": 0.4038, "step": 1652 }, { "epoch": 1.633712308452793, "grad_norm": 0.4722652896419518, "learning_rate": 2.5283986808354708e-05, "loss": 0.3522, "step": 1653 }, { "epoch": 1.6347009391992091, "grad_norm": 0.40105850893211614, "learning_rate": 2.526566507878344e-05, "loss": 0.3588, "step": 1654 }, { "epoch": 1.6356895699456253, "grad_norm": 0.4197033883977299, "learning_rate": 2.5247343349212167e-05, "loss": 0.3885, "step": 1655 }, { "epoch": 1.6366782006920415, "grad_norm": 0.4100143580970242, "learning_rate": 2.5229021619640897e-05, "loss": 0.4245, "step": 1656 }, { "epoch": 1.6376668314384577, "grad_norm": 0.3997437938914889, "learning_rate": 2.5210699890069624e-05, "loss": 0.4216, "step": 1657 }, { "epoch": 1.638655462184874, "grad_norm": 0.41454663322526747, "learning_rate": 2.5192378160498353e-05, "loss": 0.3922, "step": 1658 }, { "epoch": 1.6396440929312903, "grad_norm": 0.4211346984278495, "learning_rate": 2.517405643092708e-05, "loss": 0.4067, "step": 1659 }, { "epoch": 1.6406327236777063, "grad_norm": 0.4151763724243206, "learning_rate": 2.515573470135581e-05, "loss": 0.397, "step": 1660 }, { "epoch": 1.6416213544241227, "grad_norm": 0.42506578283934277, "learning_rate": 2.5137412971784536e-05, "loss": 0.4012, "step": 1661 }, { "epoch": 1.6426099851705387, "grad_norm": 0.4379673176662196, "learning_rate": 2.5119091242213266e-05, "loss": 0.3987, "step": 1662 }, { "epoch": 1.643598615916955, "grad_norm": 0.40625558071253093, "learning_rate": 2.5100769512641996e-05, "loss": 0.3859, "step": 1663 }, { "epoch": 1.644587246663371, "grad_norm": 0.5840267843067266, "learning_rate": 2.5082447783070722e-05, "loss": 0.3454, "step": 1664 }, { "epoch": 1.6455758774097875, "grad_norm": 0.4339461928480405, "learning_rate": 2.5064126053499455e-05, "loss": 0.3883, "step": 1665 }, { "epoch": 1.6465645081562037, "grad_norm": 0.45034271166876527, "learning_rate": 2.504580432392818e-05, "loss": 0.4222, "step": 1666 }, { "epoch": 1.64755313890262, "grad_norm": 0.5028462944221008, "learning_rate": 2.502748259435691e-05, "loss": 0.4194, "step": 1667 }, { "epoch": 1.648541769649036, "grad_norm": 0.38530024504191945, "learning_rate": 2.5009160864785635e-05, "loss": 0.4039, "step": 1668 }, { "epoch": 1.6495304003954523, "grad_norm": 0.4712414855043585, "learning_rate": 2.4990839135214368e-05, "loss": 0.4385, "step": 1669 }, { "epoch": 1.6505190311418685, "grad_norm": 0.3905810952039061, "learning_rate": 2.4972517405643094e-05, "loss": 0.3811, "step": 1670 }, { "epoch": 1.6515076618882847, "grad_norm": 0.39645913073415345, "learning_rate": 2.4954195676071824e-05, "loss": 0.4139, "step": 1671 }, { "epoch": 1.652496292634701, "grad_norm": 0.43698911144494107, "learning_rate": 2.493587394650055e-05, "loss": 0.4206, "step": 1672 }, { "epoch": 1.653484923381117, "grad_norm": 0.4371984606315748, "learning_rate": 2.491755221692928e-05, "loss": 0.3975, "step": 1673 }, { "epoch": 1.6544735541275335, "grad_norm": 0.43011553834831445, "learning_rate": 2.4899230487358007e-05, "loss": 0.3942, "step": 1674 }, { "epoch": 1.6554621848739495, "grad_norm": 0.3721640123552812, "learning_rate": 2.4880908757786736e-05, "loss": 0.3975, "step": 1675 }, { "epoch": 1.656450815620366, "grad_norm": 0.4747876872451059, "learning_rate": 2.4862587028215463e-05, "loss": 0.3847, "step": 1676 }, { "epoch": 1.6574394463667819, "grad_norm": 0.38027194519346086, "learning_rate": 2.4844265298644193e-05, "loss": 0.3788, "step": 1677 }, { "epoch": 1.6584280771131983, "grad_norm": 0.4508516150367328, "learning_rate": 2.4825943569072922e-05, "loss": 0.3616, "step": 1678 }, { "epoch": 1.6594167078596145, "grad_norm": 0.3803706814279655, "learning_rate": 2.480762183950165e-05, "loss": 0.3497, "step": 1679 }, { "epoch": 1.6604053386060307, "grad_norm": 0.423786571723237, "learning_rate": 2.478930010993038e-05, "loss": 0.4228, "step": 1680 }, { "epoch": 1.6613939693524469, "grad_norm": 2.1611494042929396, "learning_rate": 2.477097838035911e-05, "loss": 0.3682, "step": 1681 }, { "epoch": 1.662382600098863, "grad_norm": 0.40466781878101676, "learning_rate": 2.4752656650787835e-05, "loss": 0.4156, "step": 1682 }, { "epoch": 1.6633712308452793, "grad_norm": 0.4058235074559792, "learning_rate": 2.4734334921216565e-05, "loss": 0.4221, "step": 1683 }, { "epoch": 1.6643598615916955, "grad_norm": 0.46122683896111627, "learning_rate": 2.4716013191645295e-05, "loss": 0.3814, "step": 1684 }, { "epoch": 1.6653484923381117, "grad_norm": 0.4583540948741572, "learning_rate": 2.469769146207402e-05, "loss": 0.4018, "step": 1685 }, { "epoch": 1.6663371230845279, "grad_norm": 0.4140372114229349, "learning_rate": 2.467936973250275e-05, "loss": 0.3753, "step": 1686 }, { "epoch": 1.6673257538309443, "grad_norm": 0.4525593967149509, "learning_rate": 2.4661048002931477e-05, "loss": 0.3969, "step": 1687 }, { "epoch": 1.6683143845773603, "grad_norm": 0.464744768430069, "learning_rate": 2.4642726273360207e-05, "loss": 0.3807, "step": 1688 }, { "epoch": 1.6693030153237767, "grad_norm": 0.44718805735362294, "learning_rate": 2.4624404543788933e-05, "loss": 0.4182, "step": 1689 }, { "epoch": 1.6702916460701926, "grad_norm": 0.40644001219568815, "learning_rate": 2.4606082814217663e-05, "loss": 0.3937, "step": 1690 }, { "epoch": 1.671280276816609, "grad_norm": 0.5170250923249823, "learning_rate": 2.458776108464639e-05, "loss": 0.4285, "step": 1691 }, { "epoch": 1.6722689075630253, "grad_norm": 0.513329640416937, "learning_rate": 2.456943935507512e-05, "loss": 0.4114, "step": 1692 }, { "epoch": 1.6732575383094415, "grad_norm": 0.4880517499767201, "learning_rate": 2.455111762550385e-05, "loss": 0.3919, "step": 1693 }, { "epoch": 1.6742461690558577, "grad_norm": 0.5212895035611711, "learning_rate": 2.453279589593258e-05, "loss": 0.3944, "step": 1694 }, { "epoch": 1.6752347998022739, "grad_norm": 0.4260019127836852, "learning_rate": 2.4514474166361305e-05, "loss": 0.3544, "step": 1695 }, { "epoch": 1.67622343054869, "grad_norm": 0.43546704326466656, "learning_rate": 2.4496152436790035e-05, "loss": 0.3759, "step": 1696 }, { "epoch": 1.6772120612951062, "grad_norm": 0.5341439035881296, "learning_rate": 2.4477830707218762e-05, "loss": 0.3937, "step": 1697 }, { "epoch": 1.6782006920415224, "grad_norm": 0.46971407790265535, "learning_rate": 2.445950897764749e-05, "loss": 0.3924, "step": 1698 }, { "epoch": 1.6791893227879386, "grad_norm": 0.4666699252543721, "learning_rate": 2.4441187248076218e-05, "loss": 0.4236, "step": 1699 }, { "epoch": 1.680177953534355, "grad_norm": 0.46297576198947665, "learning_rate": 2.4422865518504948e-05, "loss": 0.4111, "step": 1700 }, { "epoch": 1.681166584280771, "grad_norm": 0.43920700791196987, "learning_rate": 2.4404543788933678e-05, "loss": 0.429, "step": 1701 }, { "epoch": 1.6821552150271875, "grad_norm": 0.4425388567246449, "learning_rate": 2.4386222059362404e-05, "loss": 0.423, "step": 1702 }, { "epoch": 1.6831438457736034, "grad_norm": 0.504396108944337, "learning_rate": 2.4367900329791134e-05, "loss": 0.4541, "step": 1703 }, { "epoch": 1.6841324765200198, "grad_norm": 0.38629905249767643, "learning_rate": 2.434957860021986e-05, "loss": 0.4456, "step": 1704 }, { "epoch": 1.6851211072664358, "grad_norm": 0.42255590367847, "learning_rate": 2.433125687064859e-05, "loss": 0.3161, "step": 1705 }, { "epoch": 1.6861097380128522, "grad_norm": 9.279413565817112, "learning_rate": 2.431293514107732e-05, "loss": 0.4019, "step": 1706 }, { "epoch": 1.6870983687592684, "grad_norm": 0.5115032573505213, "learning_rate": 2.429461341150605e-05, "loss": 0.4016, "step": 1707 }, { "epoch": 1.6880869995056846, "grad_norm": 0.6125061411000199, "learning_rate": 2.4276291681934776e-05, "loss": 0.4173, "step": 1708 }, { "epoch": 1.6890756302521008, "grad_norm": 0.40034097829937065, "learning_rate": 2.4257969952363506e-05, "loss": 0.3928, "step": 1709 }, { "epoch": 1.690064260998517, "grad_norm": 0.558700379013928, "learning_rate": 2.4239648222792232e-05, "loss": 0.3917, "step": 1710 }, { "epoch": 1.6910528917449332, "grad_norm": 0.5213021159154378, "learning_rate": 2.4221326493220962e-05, "loss": 0.4169, "step": 1711 }, { "epoch": 1.6920415224913494, "grad_norm": 0.42444867290823024, "learning_rate": 2.420300476364969e-05, "loss": 0.3955, "step": 1712 }, { "epoch": 1.6930301532377658, "grad_norm": 0.6301852396073467, "learning_rate": 2.4184683034078418e-05, "loss": 0.368, "step": 1713 }, { "epoch": 1.6940187839841818, "grad_norm": 0.43853223753735604, "learning_rate": 2.4166361304507145e-05, "loss": 0.3824, "step": 1714 }, { "epoch": 1.6950074147305982, "grad_norm": 0.4227705602239579, "learning_rate": 2.4148039574935874e-05, "loss": 0.3902, "step": 1715 }, { "epoch": 1.6959960454770142, "grad_norm": 0.5599932443918436, "learning_rate": 2.41297178453646e-05, "loss": 0.39, "step": 1716 }, { "epoch": 1.6969846762234306, "grad_norm": 0.4605264249554903, "learning_rate": 2.4111396115793334e-05, "loss": 0.3668, "step": 1717 }, { "epoch": 1.6979733069698466, "grad_norm": 0.4166433036526539, "learning_rate": 2.409307438622206e-05, "loss": 0.3731, "step": 1718 }, { "epoch": 1.698961937716263, "grad_norm": 0.5020488057760636, "learning_rate": 2.407475265665079e-05, "loss": 0.3796, "step": 1719 }, { "epoch": 1.6999505684626792, "grad_norm": 0.4814379075294116, "learning_rate": 2.4056430927079517e-05, "loss": 0.3899, "step": 1720 }, { "epoch": 1.7009391992090954, "grad_norm": 0.41308950212104645, "learning_rate": 2.4038109197508247e-05, "loss": 0.3883, "step": 1721 }, { "epoch": 1.7019278299555116, "grad_norm": 0.4621467295711173, "learning_rate": 2.4019787467936973e-05, "loss": 0.3956, "step": 1722 }, { "epoch": 1.7029164607019278, "grad_norm": 0.4359495625224401, "learning_rate": 2.4001465738365703e-05, "loss": 0.364, "step": 1723 }, { "epoch": 1.703905091448344, "grad_norm": 0.4264377309299344, "learning_rate": 2.3983144008794433e-05, "loss": 0.392, "step": 1724 }, { "epoch": 1.7048937221947602, "grad_norm": 0.40499618340346094, "learning_rate": 2.396482227922316e-05, "loss": 0.3609, "step": 1725 }, { "epoch": 1.7058823529411766, "grad_norm": 0.5397084817880251, "learning_rate": 2.394650054965189e-05, "loss": 0.3858, "step": 1726 }, { "epoch": 1.7068709836875926, "grad_norm": 0.4186205614816439, "learning_rate": 2.3928178820080615e-05, "loss": 0.4257, "step": 1727 }, { "epoch": 1.707859614434009, "grad_norm": 0.7056941693726977, "learning_rate": 2.3909857090509345e-05, "loss": 0.3913, "step": 1728 }, { "epoch": 1.708848245180425, "grad_norm": 0.5286884567339488, "learning_rate": 2.3891535360938075e-05, "loss": 0.3697, "step": 1729 }, { "epoch": 1.7098368759268414, "grad_norm": 0.3784718462794333, "learning_rate": 2.3873213631366805e-05, "loss": 0.3498, "step": 1730 }, { "epoch": 1.7108255066732574, "grad_norm": 0.46278906777742407, "learning_rate": 2.385489190179553e-05, "loss": 0.3996, "step": 1731 }, { "epoch": 1.7118141374196738, "grad_norm": 0.40881123085003773, "learning_rate": 2.383657017222426e-05, "loss": 0.4097, "step": 1732 }, { "epoch": 1.71280276816609, "grad_norm": 0.37954107493848216, "learning_rate": 2.3818248442652987e-05, "loss": 0.4089, "step": 1733 }, { "epoch": 1.7137913989125062, "grad_norm": 0.4235644663302972, "learning_rate": 2.3799926713081717e-05, "loss": 0.4087, "step": 1734 }, { "epoch": 1.7147800296589224, "grad_norm": 0.375409077398812, "learning_rate": 2.3781604983510443e-05, "loss": 0.4098, "step": 1735 }, { "epoch": 1.7157686604053386, "grad_norm": 0.4013282768368208, "learning_rate": 2.3763283253939173e-05, "loss": 0.4611, "step": 1736 }, { "epoch": 1.7167572911517548, "grad_norm": 0.42812258914339685, "learning_rate": 2.37449615243679e-05, "loss": 0.3901, "step": 1737 }, { "epoch": 1.717745921898171, "grad_norm": 0.39121395903660927, "learning_rate": 2.372663979479663e-05, "loss": 0.3816, "step": 1738 }, { "epoch": 1.7187345526445874, "grad_norm": 0.40658071534681106, "learning_rate": 2.3708318065225356e-05, "loss": 0.4132, "step": 1739 }, { "epoch": 1.7197231833910034, "grad_norm": 0.41409405317664205, "learning_rate": 2.3689996335654086e-05, "loss": 0.351, "step": 1740 }, { "epoch": 1.7207118141374198, "grad_norm": 0.46282285745589746, "learning_rate": 2.3671674606082816e-05, "loss": 0.3909, "step": 1741 }, { "epoch": 1.7217004448838358, "grad_norm": 0.42871344347331874, "learning_rate": 2.3653352876511545e-05, "loss": 0.4117, "step": 1742 }, { "epoch": 1.7226890756302522, "grad_norm": 0.43367976541959785, "learning_rate": 2.3635031146940272e-05, "loss": 0.3868, "step": 1743 }, { "epoch": 1.7236777063766682, "grad_norm": 0.8340268987953244, "learning_rate": 2.3616709417369e-05, "loss": 0.4059, "step": 1744 }, { "epoch": 1.7246663371230846, "grad_norm": 0.47887969919550666, "learning_rate": 2.3598387687797728e-05, "loss": 0.4088, "step": 1745 }, { "epoch": 1.7256549678695008, "grad_norm": 0.4600009377694261, "learning_rate": 2.3580065958226458e-05, "loss": 0.3726, "step": 1746 }, { "epoch": 1.726643598615917, "grad_norm": 0.4394834932717993, "learning_rate": 2.3561744228655188e-05, "loss": 0.3982, "step": 1747 }, { "epoch": 1.7276322293623332, "grad_norm": 0.42660372365177784, "learning_rate": 2.3543422499083914e-05, "loss": 0.3666, "step": 1748 }, { "epoch": 1.7286208601087494, "grad_norm": 0.4105890660793924, "learning_rate": 2.3525100769512644e-05, "loss": 0.3943, "step": 1749 }, { "epoch": 1.7296094908551656, "grad_norm": 0.4214308858125806, "learning_rate": 2.350677903994137e-05, "loss": 0.4035, "step": 1750 }, { "epoch": 1.7305981216015818, "grad_norm": 0.3557297175589846, "learning_rate": 2.34884573103701e-05, "loss": 0.3497, "step": 1751 }, { "epoch": 1.731586752347998, "grad_norm": 0.4618387309477068, "learning_rate": 2.347013558079883e-05, "loss": 0.3921, "step": 1752 }, { "epoch": 1.7325753830944142, "grad_norm": 0.4233725554545632, "learning_rate": 2.345181385122756e-05, "loss": 0.3504, "step": 1753 }, { "epoch": 1.7335640138408306, "grad_norm": 0.3939435196529794, "learning_rate": 2.3433492121656286e-05, "loss": 0.3776, "step": 1754 }, { "epoch": 1.7345526445872466, "grad_norm": 0.45856071632339207, "learning_rate": 2.3415170392085016e-05, "loss": 0.4147, "step": 1755 }, { "epoch": 1.735541275333663, "grad_norm": 0.5309912172149773, "learning_rate": 2.3396848662513742e-05, "loss": 0.4617, "step": 1756 }, { "epoch": 1.736529906080079, "grad_norm": 0.4212350527426451, "learning_rate": 2.3378526932942472e-05, "loss": 0.4295, "step": 1757 }, { "epoch": 1.7375185368264954, "grad_norm": 0.45168823978830624, "learning_rate": 2.33602052033712e-05, "loss": 0.3755, "step": 1758 }, { "epoch": 1.7385071675729116, "grad_norm": 0.4222445918652455, "learning_rate": 2.334188347379993e-05, "loss": 0.4085, "step": 1759 }, { "epoch": 1.7394957983193278, "grad_norm": 0.45247188668684607, "learning_rate": 2.3323561744228655e-05, "loss": 0.3609, "step": 1760 }, { "epoch": 1.740484429065744, "grad_norm": 0.39681245346902994, "learning_rate": 2.3305240014657385e-05, "loss": 0.3768, "step": 1761 }, { "epoch": 1.7414730598121602, "grad_norm": 0.3982514509417074, "learning_rate": 2.328691828508611e-05, "loss": 0.4201, "step": 1762 }, { "epoch": 1.7424616905585764, "grad_norm": 0.4504189750771721, "learning_rate": 2.326859655551484e-05, "loss": 0.3484, "step": 1763 }, { "epoch": 1.7434503213049926, "grad_norm": 0.49152624029214176, "learning_rate": 2.325027482594357e-05, "loss": 0.3694, "step": 1764 }, { "epoch": 1.7444389520514088, "grad_norm": 0.4121047852097205, "learning_rate": 2.32319530963723e-05, "loss": 0.3999, "step": 1765 }, { "epoch": 1.745427582797825, "grad_norm": 0.5441045548619896, "learning_rate": 2.3213631366801027e-05, "loss": 0.3911, "step": 1766 }, { "epoch": 1.7464162135442414, "grad_norm": 0.4108305755896109, "learning_rate": 2.3195309637229757e-05, "loss": 0.3646, "step": 1767 }, { "epoch": 1.7474048442906573, "grad_norm": 0.41737754675297384, "learning_rate": 2.3176987907658483e-05, "loss": 0.395, "step": 1768 }, { "epoch": 1.7483934750370738, "grad_norm": 0.4150316472121021, "learning_rate": 2.3158666178087213e-05, "loss": 0.3383, "step": 1769 }, { "epoch": 1.7493821057834897, "grad_norm": 0.39702856623789656, "learning_rate": 2.3140344448515943e-05, "loss": 0.4022, "step": 1770 }, { "epoch": 1.7503707365299062, "grad_norm": 0.40516138394776213, "learning_rate": 2.312202271894467e-05, "loss": 0.3996, "step": 1771 }, { "epoch": 1.7513593672763221, "grad_norm": 0.39762393557434283, "learning_rate": 2.31037009893734e-05, "loss": 0.4106, "step": 1772 }, { "epoch": 1.7523479980227386, "grad_norm": 0.46186273484647006, "learning_rate": 2.3085379259802125e-05, "loss": 0.3792, "step": 1773 }, { "epoch": 1.7533366287691547, "grad_norm": 0.39237452846154863, "learning_rate": 2.3067057530230855e-05, "loss": 0.3605, "step": 1774 }, { "epoch": 1.754325259515571, "grad_norm": 0.36877700573737, "learning_rate": 2.304873580065958e-05, "loss": 0.4116, "step": 1775 }, { "epoch": 1.7553138902619871, "grad_norm": 0.39378753624957163, "learning_rate": 2.3030414071088315e-05, "loss": 0.3939, "step": 1776 }, { "epoch": 1.7563025210084033, "grad_norm": 0.4355987524545655, "learning_rate": 2.301209234151704e-05, "loss": 0.4179, "step": 1777 }, { "epoch": 1.7572911517548195, "grad_norm": 0.4911840325290055, "learning_rate": 2.299377061194577e-05, "loss": 0.4296, "step": 1778 }, { "epoch": 1.7582797825012357, "grad_norm": 0.3960822279217589, "learning_rate": 2.2975448882374497e-05, "loss": 0.3822, "step": 1779 }, { "epoch": 1.7592684132476522, "grad_norm": 0.5199670339468879, "learning_rate": 2.2957127152803227e-05, "loss": 0.4231, "step": 1780 }, { "epoch": 1.7602570439940681, "grad_norm": 0.40567151974553317, "learning_rate": 2.2938805423231954e-05, "loss": 0.3897, "step": 1781 }, { "epoch": 1.7612456747404845, "grad_norm": 0.40375841398343415, "learning_rate": 2.2920483693660683e-05, "loss": 0.3808, "step": 1782 }, { "epoch": 1.7622343054869005, "grad_norm": 0.3922238505381371, "learning_rate": 2.290216196408941e-05, "loss": 0.376, "step": 1783 }, { "epoch": 1.763222936233317, "grad_norm": 0.44494797247877094, "learning_rate": 2.288384023451814e-05, "loss": 0.4069, "step": 1784 }, { "epoch": 1.764211566979733, "grad_norm": 0.41733919109667755, "learning_rate": 2.2865518504946866e-05, "loss": 0.4014, "step": 1785 }, { "epoch": 1.7652001977261493, "grad_norm": 0.4191347646330663, "learning_rate": 2.2847196775375596e-05, "loss": 0.3727, "step": 1786 }, { "epoch": 1.7661888284725655, "grad_norm": 0.45690382944007973, "learning_rate": 2.2828875045804322e-05, "loss": 0.4409, "step": 1787 }, { "epoch": 1.7671774592189817, "grad_norm": 0.3727295780768645, "learning_rate": 2.2810553316233055e-05, "loss": 0.3864, "step": 1788 }, { "epoch": 1.768166089965398, "grad_norm": 0.5002074369304106, "learning_rate": 2.2792231586661782e-05, "loss": 0.3826, "step": 1789 }, { "epoch": 1.7691547207118141, "grad_norm": 0.34898434542471624, "learning_rate": 2.277390985709051e-05, "loss": 0.4184, "step": 1790 }, { "epoch": 1.7701433514582303, "grad_norm": 0.43943958183024484, "learning_rate": 2.2755588127519238e-05, "loss": 0.3819, "step": 1791 }, { "epoch": 1.7711319822046465, "grad_norm": 0.4049123301582885, "learning_rate": 2.2737266397947968e-05, "loss": 0.3787, "step": 1792 }, { "epoch": 1.772120612951063, "grad_norm": 0.414183301867182, "learning_rate": 2.2718944668376698e-05, "loss": 0.3965, "step": 1793 }, { "epoch": 1.773109243697479, "grad_norm": 0.44134575378778584, "learning_rate": 2.2700622938805424e-05, "loss": 0.4069, "step": 1794 }, { "epoch": 1.7740978744438953, "grad_norm": 0.40040328095949096, "learning_rate": 2.2682301209234154e-05, "loss": 0.3791, "step": 1795 }, { "epoch": 1.7750865051903113, "grad_norm": 0.38546124363593853, "learning_rate": 2.266397947966288e-05, "loss": 0.3959, "step": 1796 }, { "epoch": 1.7760751359367277, "grad_norm": 0.398644429096709, "learning_rate": 2.264565775009161e-05, "loss": 0.4082, "step": 1797 }, { "epoch": 1.7770637666831437, "grad_norm": 0.35714848357969653, "learning_rate": 2.2627336020520337e-05, "loss": 0.3756, "step": 1798 }, { "epoch": 1.7780523974295601, "grad_norm": 0.41668738095465974, "learning_rate": 2.2609014290949066e-05, "loss": 0.3948, "step": 1799 }, { "epoch": 1.7790410281759763, "grad_norm": 0.3632505009175128, "learning_rate": 2.2590692561377796e-05, "loss": 0.4234, "step": 1800 }, { "epoch": 1.7800296589223925, "grad_norm": 0.3709178397100954, "learning_rate": 2.2572370831806526e-05, "loss": 0.3949, "step": 1801 }, { "epoch": 1.7810182896688087, "grad_norm": 0.3734671886850258, "learning_rate": 2.2554049102235252e-05, "loss": 0.4171, "step": 1802 }, { "epoch": 1.782006920415225, "grad_norm": 0.3855530952579648, "learning_rate": 2.2535727372663982e-05, "loss": 0.3993, "step": 1803 }, { "epoch": 1.782995551161641, "grad_norm": 0.3594222151619567, "learning_rate": 2.251740564309271e-05, "loss": 0.3868, "step": 1804 }, { "epoch": 1.7839841819080573, "grad_norm": 0.3917616648474125, "learning_rate": 2.249908391352144e-05, "loss": 0.3503, "step": 1805 }, { "epoch": 1.7849728126544737, "grad_norm": 0.4830258763380352, "learning_rate": 2.2480762183950165e-05, "loss": 0.411, "step": 1806 }, { "epoch": 1.7859614434008897, "grad_norm": 0.4047392516322923, "learning_rate": 2.2462440454378895e-05, "loss": 0.4486, "step": 1807 }, { "epoch": 1.7869500741473061, "grad_norm": 0.42527894403677624, "learning_rate": 2.244411872480762e-05, "loss": 0.3831, "step": 1808 }, { "epoch": 1.787938704893722, "grad_norm": 0.4351291092660424, "learning_rate": 2.242579699523635e-05, "loss": 0.3879, "step": 1809 }, { "epoch": 1.7889273356401385, "grad_norm": 0.45281798297457104, "learning_rate": 2.2407475265665077e-05, "loss": 0.3955, "step": 1810 }, { "epoch": 1.7899159663865545, "grad_norm": 0.38668378269316833, "learning_rate": 2.2389153536093807e-05, "loss": 0.3683, "step": 1811 }, { "epoch": 1.790904597132971, "grad_norm": 0.46235193460894697, "learning_rate": 2.2370831806522537e-05, "loss": 0.3572, "step": 1812 }, { "epoch": 1.791893227879387, "grad_norm": 0.44899427678203924, "learning_rate": 2.2352510076951267e-05, "loss": 0.4053, "step": 1813 }, { "epoch": 1.7928818586258033, "grad_norm": 0.5519762562047122, "learning_rate": 2.2334188347379993e-05, "loss": 0.4065, "step": 1814 }, { "epoch": 1.7938704893722195, "grad_norm": 0.3943784563467948, "learning_rate": 2.2315866617808723e-05, "loss": 0.3995, "step": 1815 }, { "epoch": 1.7948591201186357, "grad_norm": 0.4668506285944307, "learning_rate": 2.229754488823745e-05, "loss": 0.4135, "step": 1816 }, { "epoch": 1.7958477508650519, "grad_norm": 0.4202497031877333, "learning_rate": 2.227922315866618e-05, "loss": 0.3584, "step": 1817 }, { "epoch": 1.796836381611468, "grad_norm": 0.4659130780105852, "learning_rate": 2.226090142909491e-05, "loss": 0.4103, "step": 1818 }, { "epoch": 1.7978250123578843, "grad_norm": 0.5213853371500113, "learning_rate": 2.2242579699523635e-05, "loss": 0.4664, "step": 1819 }, { "epoch": 1.7988136431043005, "grad_norm": 0.40951427305230137, "learning_rate": 2.2224257969952365e-05, "loss": 0.3792, "step": 1820 }, { "epoch": 1.799802273850717, "grad_norm": 0.508979649960008, "learning_rate": 2.220593624038109e-05, "loss": 0.3529, "step": 1821 }, { "epoch": 1.8007909045971329, "grad_norm": 0.4620000741761235, "learning_rate": 2.218761451080982e-05, "loss": 0.4077, "step": 1822 }, { "epoch": 1.8017795353435493, "grad_norm": 0.5103311066082106, "learning_rate": 2.2169292781238548e-05, "loss": 0.4059, "step": 1823 }, { "epoch": 1.8027681660899653, "grad_norm": 0.44883479928282066, "learning_rate": 2.215097105166728e-05, "loss": 0.3612, "step": 1824 }, { "epoch": 1.8037567968363817, "grad_norm": 0.4921908476798029, "learning_rate": 2.2132649322096007e-05, "loss": 0.3611, "step": 1825 }, { "epoch": 1.8047454275827979, "grad_norm": 0.5528524272509014, "learning_rate": 2.2114327592524737e-05, "loss": 0.3892, "step": 1826 }, { "epoch": 1.805734058329214, "grad_norm": 0.3719563132245558, "learning_rate": 2.2096005862953464e-05, "loss": 0.3672, "step": 1827 }, { "epoch": 1.8067226890756303, "grad_norm": 0.8501087366170521, "learning_rate": 2.2077684133382193e-05, "loss": 0.3829, "step": 1828 }, { "epoch": 1.8077113198220465, "grad_norm": 0.47985908712605296, "learning_rate": 2.205936240381092e-05, "loss": 0.4178, "step": 1829 }, { "epoch": 1.8086999505684627, "grad_norm": 0.4497639457292069, "learning_rate": 2.204104067423965e-05, "loss": 0.3869, "step": 1830 }, { "epoch": 1.8096885813148789, "grad_norm": 0.46991664186049353, "learning_rate": 2.2022718944668376e-05, "loss": 0.3883, "step": 1831 }, { "epoch": 1.810677212061295, "grad_norm": 0.45579770546675846, "learning_rate": 2.2004397215097106e-05, "loss": 0.3688, "step": 1832 }, { "epoch": 1.8116658428077113, "grad_norm": 0.47592531627346, "learning_rate": 2.1986075485525832e-05, "loss": 0.4081, "step": 1833 }, { "epoch": 1.8126544735541277, "grad_norm": 0.4101515314065871, "learning_rate": 2.1967753755954562e-05, "loss": 0.3878, "step": 1834 }, { "epoch": 1.8136431043005437, "grad_norm": 0.46767362761705156, "learning_rate": 2.1949432026383292e-05, "loss": 0.3596, "step": 1835 }, { "epoch": 1.81463173504696, "grad_norm": 0.4421324204623352, "learning_rate": 2.1931110296812022e-05, "loss": 0.4216, "step": 1836 }, { "epoch": 1.815620365793376, "grad_norm": 0.43635860485749495, "learning_rate": 2.1912788567240748e-05, "loss": 0.4013, "step": 1837 }, { "epoch": 1.8166089965397925, "grad_norm": 0.4636915673167522, "learning_rate": 2.1894466837669478e-05, "loss": 0.3773, "step": 1838 }, { "epoch": 1.8175976272862084, "grad_norm": 0.3900520157247378, "learning_rate": 2.1876145108098204e-05, "loss": 0.4229, "step": 1839 }, { "epoch": 1.8185862580326249, "grad_norm": 0.6303887555214883, "learning_rate": 2.1857823378526934e-05, "loss": 0.4167, "step": 1840 }, { "epoch": 1.819574888779041, "grad_norm": 0.5203035216613996, "learning_rate": 2.1839501648955664e-05, "loss": 0.4185, "step": 1841 }, { "epoch": 1.8205635195254573, "grad_norm": 0.3994426147126881, "learning_rate": 2.182117991938439e-05, "loss": 0.3798, "step": 1842 }, { "epoch": 1.8215521502718734, "grad_norm": 0.5446895031160309, "learning_rate": 2.180285818981312e-05, "loss": 0.3735, "step": 1843 }, { "epoch": 1.8225407810182896, "grad_norm": 0.4041011505552412, "learning_rate": 2.1784536460241847e-05, "loss": 0.4339, "step": 1844 }, { "epoch": 1.8235294117647058, "grad_norm": 0.4088690998511784, "learning_rate": 2.1766214730670576e-05, "loss": 0.3842, "step": 1845 }, { "epoch": 1.824518042511122, "grad_norm": 0.4830772317609623, "learning_rate": 2.1747893001099303e-05, "loss": 0.3922, "step": 1846 }, { "epoch": 1.8255066732575385, "grad_norm": 0.43398465007152726, "learning_rate": 2.1729571271528033e-05, "loss": 0.3828, "step": 1847 }, { "epoch": 1.8264953040039544, "grad_norm": 0.43334358148258806, "learning_rate": 2.1711249541956762e-05, "loss": 0.3933, "step": 1848 }, { "epoch": 1.8274839347503709, "grad_norm": 0.3721549817560843, "learning_rate": 2.1692927812385492e-05, "loss": 0.3869, "step": 1849 }, { "epoch": 1.8284725654967868, "grad_norm": 0.4966235111987048, "learning_rate": 2.167460608281422e-05, "loss": 0.3706, "step": 1850 }, { "epoch": 1.8294611962432032, "grad_norm": 0.38224264064484786, "learning_rate": 2.165628435324295e-05, "loss": 0.4115, "step": 1851 }, { "epoch": 1.8304498269896192, "grad_norm": 0.4199813131412609, "learning_rate": 2.1637962623671675e-05, "loss": 0.4236, "step": 1852 }, { "epoch": 1.8314384577360356, "grad_norm": 0.41300926260155457, "learning_rate": 2.1619640894100405e-05, "loss": 0.3868, "step": 1853 }, { "epoch": 1.8324270884824518, "grad_norm": 0.4314336204133963, "learning_rate": 2.160131916452913e-05, "loss": 0.3978, "step": 1854 }, { "epoch": 1.833415719228868, "grad_norm": 0.44386907253961577, "learning_rate": 2.158299743495786e-05, "loss": 0.3477, "step": 1855 }, { "epoch": 1.8344043499752842, "grad_norm": 7.789884702273126, "learning_rate": 2.1564675705386587e-05, "loss": 0.3635, "step": 1856 }, { "epoch": 1.8353929807217004, "grad_norm": 0.5395792334137349, "learning_rate": 2.1546353975815317e-05, "loss": 0.3914, "step": 1857 }, { "epoch": 1.8363816114681166, "grad_norm": 0.504124864266562, "learning_rate": 2.1528032246244047e-05, "loss": 0.42, "step": 1858 }, { "epoch": 1.8373702422145328, "grad_norm": 0.42602009801588436, "learning_rate": 2.1509710516672773e-05, "loss": 0.4168, "step": 1859 }, { "epoch": 1.8383588729609492, "grad_norm": 0.5033663599747783, "learning_rate": 2.1491388787101503e-05, "loss": 0.414, "step": 1860 }, { "epoch": 1.8393475037073652, "grad_norm": 0.4059936764165169, "learning_rate": 2.1473067057530233e-05, "loss": 0.4104, "step": 1861 }, { "epoch": 1.8403361344537816, "grad_norm": 0.4702468491521716, "learning_rate": 2.145474532795896e-05, "loss": 0.4212, "step": 1862 }, { "epoch": 1.8413247652001976, "grad_norm": 0.3825761047755286, "learning_rate": 2.143642359838769e-05, "loss": 0.3716, "step": 1863 }, { "epoch": 1.842313395946614, "grad_norm": 0.43755364131811325, "learning_rate": 2.141810186881642e-05, "loss": 0.4018, "step": 1864 }, { "epoch": 1.84330202669303, "grad_norm": 1.028275909571067, "learning_rate": 2.1399780139245145e-05, "loss": 0.421, "step": 1865 }, { "epoch": 1.8442906574394464, "grad_norm": 0.5305379627869363, "learning_rate": 2.1381458409673875e-05, "loss": 0.4026, "step": 1866 }, { "epoch": 1.8452792881858626, "grad_norm": 0.42554031015753946, "learning_rate": 2.13631366801026e-05, "loss": 0.3979, "step": 1867 }, { "epoch": 1.8462679189322788, "grad_norm": 0.4189500932924751, "learning_rate": 2.134481495053133e-05, "loss": 0.3642, "step": 1868 }, { "epoch": 1.847256549678695, "grad_norm": 0.40635176128575934, "learning_rate": 2.1326493220960058e-05, "loss": 0.3876, "step": 1869 }, { "epoch": 1.8482451804251112, "grad_norm": 1.3917250769318816, "learning_rate": 2.1308171491388788e-05, "loss": 0.3927, "step": 1870 }, { "epoch": 1.8492338111715274, "grad_norm": 0.6977302112271614, "learning_rate": 2.1289849761817514e-05, "loss": 0.3953, "step": 1871 }, { "epoch": 1.8502224419179436, "grad_norm": 0.4275769544748631, "learning_rate": 2.1271528032246247e-05, "loss": 0.415, "step": 1872 }, { "epoch": 1.85121107266436, "grad_norm": 0.4608364947333012, "learning_rate": 2.1253206302674974e-05, "loss": 0.3889, "step": 1873 }, { "epoch": 1.852199703410776, "grad_norm": 0.4449306337525076, "learning_rate": 2.1234884573103704e-05, "loss": 0.4183, "step": 1874 }, { "epoch": 1.8531883341571924, "grad_norm": 0.38936858556224063, "learning_rate": 2.121656284353243e-05, "loss": 0.3996, "step": 1875 }, { "epoch": 1.8541769649036084, "grad_norm": 0.39027583815299394, "learning_rate": 2.119824111396116e-05, "loss": 0.4207, "step": 1876 }, { "epoch": 1.8551655956500248, "grad_norm": 1.507043698569568, "learning_rate": 2.1179919384389886e-05, "loss": 0.3882, "step": 1877 }, { "epoch": 1.8561542263964408, "grad_norm": 0.4459949156712435, "learning_rate": 2.1161597654818616e-05, "loss": 0.364, "step": 1878 }, { "epoch": 1.8571428571428572, "grad_norm": 0.3790644589857312, "learning_rate": 2.1143275925247342e-05, "loss": 0.3915, "step": 1879 }, { "epoch": 1.8581314878892734, "grad_norm": 0.4581785590029793, "learning_rate": 2.1124954195676072e-05, "loss": 0.4228, "step": 1880 }, { "epoch": 1.8591201186356896, "grad_norm": 0.5388614204728224, "learning_rate": 2.1106632466104802e-05, "loss": 0.4177, "step": 1881 }, { "epoch": 1.8601087493821058, "grad_norm": 0.35468186915170435, "learning_rate": 2.108831073653353e-05, "loss": 0.403, "step": 1882 }, { "epoch": 1.861097380128522, "grad_norm": 0.49129068414802907, "learning_rate": 2.1069989006962258e-05, "loss": 0.3907, "step": 1883 }, { "epoch": 1.8620860108749382, "grad_norm": 0.46821275806847973, "learning_rate": 2.1051667277390988e-05, "loss": 0.4169, "step": 1884 }, { "epoch": 1.8630746416213544, "grad_norm": 0.43117126487737645, "learning_rate": 2.1033345547819714e-05, "loss": 0.4231, "step": 1885 }, { "epoch": 1.8640632723677706, "grad_norm": 0.49842379725635766, "learning_rate": 2.1015023818248444e-05, "loss": 0.4289, "step": 1886 }, { "epoch": 1.8650519031141868, "grad_norm": 0.4766648844522579, "learning_rate": 2.0996702088677174e-05, "loss": 0.4032, "step": 1887 }, { "epoch": 1.8660405338606032, "grad_norm": 0.47421834874242447, "learning_rate": 2.09783803591059e-05, "loss": 0.4274, "step": 1888 }, { "epoch": 1.8670291646070192, "grad_norm": 0.5872274824398597, "learning_rate": 2.096005862953463e-05, "loss": 0.4038, "step": 1889 }, { "epoch": 1.8680177953534356, "grad_norm": 0.47633875627667355, "learning_rate": 2.0941736899963357e-05, "loss": 0.4337, "step": 1890 }, { "epoch": 1.8690064260998516, "grad_norm": 0.46620979420838377, "learning_rate": 2.0923415170392087e-05, "loss": 0.445, "step": 1891 }, { "epoch": 1.869995056846268, "grad_norm": 0.4666482470796989, "learning_rate": 2.0905093440820813e-05, "loss": 0.4015, "step": 1892 }, { "epoch": 1.8709836875926842, "grad_norm": 0.4559735826616051, "learning_rate": 2.0886771711249543e-05, "loss": 0.3768, "step": 1893 }, { "epoch": 1.8719723183391004, "grad_norm": 0.41809760074565916, "learning_rate": 2.086844998167827e-05, "loss": 0.4364, "step": 1894 }, { "epoch": 1.8729609490855166, "grad_norm": 0.3456636545630197, "learning_rate": 2.0850128252107002e-05, "loss": 0.3554, "step": 1895 }, { "epoch": 1.8739495798319328, "grad_norm": 0.5340991846749612, "learning_rate": 2.083180652253573e-05, "loss": 0.3922, "step": 1896 }, { "epoch": 1.874938210578349, "grad_norm": 0.44420437262876167, "learning_rate": 2.081348479296446e-05, "loss": 0.4062, "step": 1897 }, { "epoch": 1.8759268413247652, "grad_norm": 0.39084204129225064, "learning_rate": 2.0795163063393185e-05, "loss": 0.3941, "step": 1898 }, { "epoch": 1.8769154720711814, "grad_norm": 0.4538150341231665, "learning_rate": 2.0776841333821915e-05, "loss": 0.3762, "step": 1899 }, { "epoch": 1.8779041028175976, "grad_norm": 0.4105570098816077, "learning_rate": 2.075851960425064e-05, "loss": 0.3992, "step": 1900 }, { "epoch": 1.878892733564014, "grad_norm": 0.4398378377453247, "learning_rate": 2.074019787467937e-05, "loss": 0.3768, "step": 1901 }, { "epoch": 1.87988136431043, "grad_norm": 0.42635946863806945, "learning_rate": 2.0721876145108097e-05, "loss": 0.3833, "step": 1902 }, { "epoch": 1.8808699950568464, "grad_norm": 0.4155830382623376, "learning_rate": 2.0703554415536827e-05, "loss": 0.3702, "step": 1903 }, { "epoch": 1.8818586258032624, "grad_norm": 2.223237779316283, "learning_rate": 2.0685232685965557e-05, "loss": 0.4083, "step": 1904 }, { "epoch": 1.8828472565496788, "grad_norm": 0.4729351569985199, "learning_rate": 2.0666910956394283e-05, "loss": 0.4336, "step": 1905 }, { "epoch": 1.8838358872960947, "grad_norm": 0.4656654044909552, "learning_rate": 2.0648589226823013e-05, "loss": 0.3975, "step": 1906 }, { "epoch": 1.8848245180425112, "grad_norm": 0.3860000518569276, "learning_rate": 2.0630267497251743e-05, "loss": 0.3949, "step": 1907 }, { "epoch": 1.8858131487889274, "grad_norm": 0.5713765677021372, "learning_rate": 2.061194576768047e-05, "loss": 0.4204, "step": 1908 }, { "epoch": 1.8868017795353436, "grad_norm": 0.44233786013550336, "learning_rate": 2.05936240381092e-05, "loss": 0.3821, "step": 1909 }, { "epoch": 1.8877904102817598, "grad_norm": 0.34070127543327544, "learning_rate": 2.057530230853793e-05, "loss": 0.3905, "step": 1910 }, { "epoch": 1.888779041028176, "grad_norm": 0.46020656833458873, "learning_rate": 2.0556980578966656e-05, "loss": 0.405, "step": 1911 }, { "epoch": 1.8897676717745922, "grad_norm": 0.4326058661074342, "learning_rate": 2.0538658849395385e-05, "loss": 0.4057, "step": 1912 }, { "epoch": 1.8907563025210083, "grad_norm": 0.4174219064846593, "learning_rate": 2.0520337119824112e-05, "loss": 0.3393, "step": 1913 }, { "epoch": 1.8917449332674248, "grad_norm": 0.8230735011506927, "learning_rate": 2.050201539025284e-05, "loss": 0.3276, "step": 1914 }, { "epoch": 1.8927335640138407, "grad_norm": 0.4642341591196125, "learning_rate": 2.0483693660681568e-05, "loss": 0.363, "step": 1915 }, { "epoch": 1.8937221947602572, "grad_norm": 0.49893367022705376, "learning_rate": 2.0465371931110298e-05, "loss": 0.4254, "step": 1916 }, { "epoch": 1.8947108255066731, "grad_norm": 0.3945357054801614, "learning_rate": 2.0447050201539024e-05, "loss": 0.3544, "step": 1917 }, { "epoch": 1.8956994562530896, "grad_norm": 0.4201676979077517, "learning_rate": 2.0428728471967754e-05, "loss": 0.3902, "step": 1918 }, { "epoch": 1.8966880869995055, "grad_norm": 0.4475248307682215, "learning_rate": 2.0410406742396484e-05, "loss": 0.4088, "step": 1919 }, { "epoch": 1.897676717745922, "grad_norm": 0.4489632529503379, "learning_rate": 2.0392085012825214e-05, "loss": 0.3899, "step": 1920 }, { "epoch": 1.8986653484923381, "grad_norm": 0.39073362336621753, "learning_rate": 2.037376328325394e-05, "loss": 0.3624, "step": 1921 }, { "epoch": 1.8996539792387543, "grad_norm": 0.4812598722588095, "learning_rate": 2.035544155368267e-05, "loss": 0.3862, "step": 1922 }, { "epoch": 1.9006426099851705, "grad_norm": 0.7089801657453119, "learning_rate": 2.0337119824111396e-05, "loss": 0.3776, "step": 1923 }, { "epoch": 1.9016312407315867, "grad_norm": 0.39325011699460505, "learning_rate": 2.0318798094540126e-05, "loss": 0.3857, "step": 1924 }, { "epoch": 1.902619871478003, "grad_norm": 0.494718732062733, "learning_rate": 2.0300476364968852e-05, "loss": 0.4304, "step": 1925 }, { "epoch": 1.9036085022244191, "grad_norm": 0.7412859882243775, "learning_rate": 2.0282154635397582e-05, "loss": 0.4203, "step": 1926 }, { "epoch": 1.9045971329708355, "grad_norm": 0.41204548755950393, "learning_rate": 2.0263832905826312e-05, "loss": 0.3937, "step": 1927 }, { "epoch": 1.9055857637172515, "grad_norm": 0.37823112207745757, "learning_rate": 2.024551117625504e-05, "loss": 0.418, "step": 1928 }, { "epoch": 1.906574394463668, "grad_norm": 0.5270243390499773, "learning_rate": 2.022718944668377e-05, "loss": 0.4191, "step": 1929 }, { "epoch": 1.907563025210084, "grad_norm": 0.4073646912648506, "learning_rate": 2.0208867717112495e-05, "loss": 0.4139, "step": 1930 }, { "epoch": 1.9085516559565003, "grad_norm": 0.4162723049197515, "learning_rate": 2.0190545987541225e-05, "loss": 0.39, "step": 1931 }, { "epoch": 1.9095402867029163, "grad_norm": 0.41112277797876734, "learning_rate": 2.0172224257969954e-05, "loss": 0.7736, "step": 1932 }, { "epoch": 1.9105289174493327, "grad_norm": 16.554549864032225, "learning_rate": 2.0153902528398684e-05, "loss": 0.39, "step": 1933 }, { "epoch": 1.911517548195749, "grad_norm": 0.4048996284849255, "learning_rate": 2.013558079882741e-05, "loss": 0.4192, "step": 1934 }, { "epoch": 1.9125061789421651, "grad_norm": 0.39334953370830594, "learning_rate": 2.011725906925614e-05, "loss": 0.418, "step": 1935 }, { "epoch": 1.9134948096885813, "grad_norm": 0.3717923867698007, "learning_rate": 2.0098937339684867e-05, "loss": 0.3985, "step": 1936 }, { "epoch": 1.9144834404349975, "grad_norm": 0.3429823539806353, "learning_rate": 2.0080615610113597e-05, "loss": 0.3961, "step": 1937 }, { "epoch": 1.9154720711814137, "grad_norm": 0.4336258506869759, "learning_rate": 2.0062293880542323e-05, "loss": 0.4007, "step": 1938 }, { "epoch": 1.91646070192783, "grad_norm": 0.3850257238593946, "learning_rate": 2.0043972150971053e-05, "loss": 0.3817, "step": 1939 }, { "epoch": 1.9174493326742463, "grad_norm": 0.3727204921701167, "learning_rate": 2.002565042139978e-05, "loss": 0.413, "step": 1940 }, { "epoch": 1.9184379634206623, "grad_norm": 0.4414996364257584, "learning_rate": 2.000732869182851e-05, "loss": 0.4057, "step": 1941 }, { "epoch": 1.9194265941670787, "grad_norm": 0.4072444874179425, "learning_rate": 1.9989006962257235e-05, "loss": 0.4001, "step": 1942 }, { "epoch": 1.9204152249134947, "grad_norm": 0.36037935938784904, "learning_rate": 1.997068523268597e-05, "loss": 0.3638, "step": 1943 }, { "epoch": 1.9214038556599111, "grad_norm": 0.3604296360849209, "learning_rate": 1.9952363503114695e-05, "loss": 0.3673, "step": 1944 }, { "epoch": 1.922392486406327, "grad_norm": 0.4412408554510627, "learning_rate": 1.9934041773543425e-05, "loss": 0.33, "step": 1945 }, { "epoch": 1.9233811171527435, "grad_norm": 0.3268884676785922, "learning_rate": 1.991572004397215e-05, "loss": 0.3646, "step": 1946 }, { "epoch": 1.9243697478991597, "grad_norm": 0.35266819437280916, "learning_rate": 1.989739831440088e-05, "loss": 0.3533, "step": 1947 }, { "epoch": 1.925358378645576, "grad_norm": 0.4741792221865709, "learning_rate": 1.9879076584829608e-05, "loss": 0.4036, "step": 1948 }, { "epoch": 1.926347009391992, "grad_norm": 0.4356630650165645, "learning_rate": 1.9860754855258337e-05, "loss": 0.3522, "step": 1949 }, { "epoch": 1.9273356401384083, "grad_norm": 0.3577131222758982, "learning_rate": 1.9842433125687067e-05, "loss": 0.3651, "step": 1950 }, { "epoch": 1.9283242708848245, "grad_norm": 0.36048164067349486, "learning_rate": 1.9824111396115794e-05, "loss": 0.4116, "step": 1951 }, { "epoch": 1.9293129016312407, "grad_norm": 0.46526039579172646, "learning_rate": 1.9805789666544523e-05, "loss": 0.3962, "step": 1952 }, { "epoch": 1.930301532377657, "grad_norm": 0.41038563504461395, "learning_rate": 1.978746793697325e-05, "loss": 0.3898, "step": 1953 }, { "epoch": 1.931290163124073, "grad_norm": 0.3939988474803238, "learning_rate": 1.976914620740198e-05, "loss": 0.3995, "step": 1954 }, { "epoch": 1.9322787938704895, "grad_norm": 0.42633278842919997, "learning_rate": 1.975082447783071e-05, "loss": 0.4045, "step": 1955 }, { "epoch": 1.9332674246169055, "grad_norm": 0.45855875590467016, "learning_rate": 1.973250274825944e-05, "loss": 0.4027, "step": 1956 }, { "epoch": 1.934256055363322, "grad_norm": 0.39585678271934954, "learning_rate": 1.9714181018688166e-05, "loss": 0.3874, "step": 1957 }, { "epoch": 1.9352446861097379, "grad_norm": 0.457049006550548, "learning_rate": 1.9695859289116895e-05, "loss": 0.373, "step": 1958 }, { "epoch": 1.9362333168561543, "grad_norm": 0.41484705113758413, "learning_rate": 1.9677537559545622e-05, "loss": 0.3652, "step": 1959 }, { "epoch": 1.9372219476025705, "grad_norm": 0.4492303275857904, "learning_rate": 1.965921582997435e-05, "loss": 0.4088, "step": 1960 }, { "epoch": 1.9382105783489867, "grad_norm": 0.42539365943210217, "learning_rate": 1.9640894100403078e-05, "loss": 0.3621, "step": 1961 }, { "epoch": 1.9391992090954029, "grad_norm": 0.47214431173670385, "learning_rate": 1.9622572370831808e-05, "loss": 0.395, "step": 1962 }, { "epoch": 1.940187839841819, "grad_norm": 0.42459554932409566, "learning_rate": 1.9604250641260534e-05, "loss": 0.3485, "step": 1963 }, { "epoch": 1.9411764705882353, "grad_norm": 0.44439908271047907, "learning_rate": 1.9585928911689264e-05, "loss": 0.3816, "step": 1964 }, { "epoch": 1.9421651013346515, "grad_norm": 0.4325805261907603, "learning_rate": 1.956760718211799e-05, "loss": 0.3868, "step": 1965 }, { "epoch": 1.9431537320810677, "grad_norm": 0.43345970341790446, "learning_rate": 1.954928545254672e-05, "loss": 0.3896, "step": 1966 }, { "epoch": 1.9441423628274839, "grad_norm": 0.4614611116767667, "learning_rate": 1.953096372297545e-05, "loss": 0.3765, "step": 1967 }, { "epoch": 1.9451309935739003, "grad_norm": 0.4088398660836556, "learning_rate": 1.951264199340418e-05, "loss": 0.4392, "step": 1968 }, { "epoch": 1.9461196243203163, "grad_norm": 0.4015215391537035, "learning_rate": 1.9494320263832906e-05, "loss": 0.4059, "step": 1969 }, { "epoch": 1.9471082550667327, "grad_norm": 0.39730676287362293, "learning_rate": 1.9475998534261636e-05, "loss": 0.4153, "step": 1970 }, { "epoch": 1.9480968858131487, "grad_norm": 0.36031601843191513, "learning_rate": 1.9457676804690363e-05, "loss": 0.4076, "step": 1971 }, { "epoch": 1.949085516559565, "grad_norm": 0.4033129059729962, "learning_rate": 1.9439355075119092e-05, "loss": 0.3755, "step": 1972 }, { "epoch": 1.950074147305981, "grad_norm": 0.43803541379459277, "learning_rate": 1.9421033345547822e-05, "loss": 0.4326, "step": 1973 }, { "epoch": 1.9510627780523975, "grad_norm": 0.43257226073578925, "learning_rate": 1.940271161597655e-05, "loss": 0.4111, "step": 1974 }, { "epoch": 1.9520514087988137, "grad_norm": 0.5744165688616281, "learning_rate": 1.938438988640528e-05, "loss": 0.4051, "step": 1975 }, { "epoch": 1.9530400395452299, "grad_norm": 0.5507505844997722, "learning_rate": 1.9366068156834005e-05, "loss": 0.4343, "step": 1976 }, { "epoch": 1.954028670291646, "grad_norm": 0.47393498269332246, "learning_rate": 1.9347746427262735e-05, "loss": 0.3923, "step": 1977 }, { "epoch": 1.9550173010380623, "grad_norm": 0.4121553927817406, "learning_rate": 1.932942469769146e-05, "loss": 0.4078, "step": 1978 }, { "epoch": 1.9560059317844785, "grad_norm": 0.5557362310952685, "learning_rate": 1.9311102968120194e-05, "loss": 0.4527, "step": 1979 }, { "epoch": 1.9569945625308947, "grad_norm": 0.47551461210214935, "learning_rate": 1.929278123854892e-05, "loss": 0.3799, "step": 1980 }, { "epoch": 1.957983193277311, "grad_norm": 0.4498041142739656, "learning_rate": 1.927445950897765e-05, "loss": 0.3808, "step": 1981 }, { "epoch": 1.958971824023727, "grad_norm": 0.48579353433612504, "learning_rate": 1.9256137779406377e-05, "loss": 0.4358, "step": 1982 }, { "epoch": 1.9599604547701435, "grad_norm": 0.5176696521823142, "learning_rate": 1.9237816049835107e-05, "loss": 0.3642, "step": 1983 }, { "epoch": 1.9609490855165594, "grad_norm": 0.5613284870151625, "learning_rate": 1.9219494320263833e-05, "loss": 0.4079, "step": 1984 }, { "epoch": 1.9619377162629759, "grad_norm": 0.3996102150117655, "learning_rate": 1.9201172590692563e-05, "loss": 0.3955, "step": 1985 }, { "epoch": 1.9629263470093918, "grad_norm": 0.5677124475724988, "learning_rate": 1.918285086112129e-05, "loss": 0.3874, "step": 1986 }, { "epoch": 1.9639149777558083, "grad_norm": 0.4423995621898426, "learning_rate": 1.916452913155002e-05, "loss": 0.484, "step": 1987 }, { "epoch": 1.9649036085022245, "grad_norm": 3.0358736233164327, "learning_rate": 1.9146207401978746e-05, "loss": 0.3925, "step": 1988 }, { "epoch": 1.9658922392486406, "grad_norm": 0.47300937729605047, "learning_rate": 1.9127885672407475e-05, "loss": 0.3961, "step": 1989 }, { "epoch": 1.9668808699950568, "grad_norm": 0.4710541560337362, "learning_rate": 1.9109563942836202e-05, "loss": 0.3877, "step": 1990 }, { "epoch": 1.967869500741473, "grad_norm": 0.3776564004278724, "learning_rate": 1.9091242213264935e-05, "loss": 0.4199, "step": 1991 }, { "epoch": 1.9688581314878892, "grad_norm": 0.41143144050548275, "learning_rate": 1.907292048369366e-05, "loss": 0.3753, "step": 1992 }, { "epoch": 1.9698467622343054, "grad_norm": 0.45020150408902315, "learning_rate": 1.905459875412239e-05, "loss": 0.4172, "step": 1993 }, { "epoch": 1.9708353929807219, "grad_norm": 0.36859434872632524, "learning_rate": 1.9036277024551118e-05, "loss": 0.3953, "step": 1994 }, { "epoch": 1.9718240237271378, "grad_norm": 0.45381380790074854, "learning_rate": 1.9017955294979847e-05, "loss": 0.419, "step": 1995 }, { "epoch": 1.9728126544735543, "grad_norm": 7.046107500119414, "learning_rate": 1.8999633565408577e-05, "loss": 1.0816, "step": 1996 }, { "epoch": 1.9738012852199702, "grad_norm": 14.376257734366982, "learning_rate": 1.8981311835837304e-05, "loss": 0.4053, "step": 1997 }, { "epoch": 1.9747899159663866, "grad_norm": 0.6912975140925988, "learning_rate": 1.8962990106266033e-05, "loss": 0.4057, "step": 1998 }, { "epoch": 1.9757785467128026, "grad_norm": 0.4528673059663746, "learning_rate": 1.894466837669476e-05, "loss": 0.4415, "step": 1999 }, { "epoch": 1.976767177459219, "grad_norm": 0.4710319818580347, "learning_rate": 1.892634664712349e-05, "loss": 0.3994, "step": 2000 }, { "epoch": 1.9777558082056352, "grad_norm": 0.4990443888663984, "learning_rate": 1.8908024917552216e-05, "loss": 0.3493, "step": 2001 }, { "epoch": 1.9787444389520514, "grad_norm": 0.4518481005001932, "learning_rate": 1.8889703187980946e-05, "loss": 0.3484, "step": 2002 }, { "epoch": 1.9797330696984676, "grad_norm": 0.4612688334358858, "learning_rate": 1.8871381458409676e-05, "loss": 0.3991, "step": 2003 }, { "epoch": 1.9807217004448838, "grad_norm": 0.44163077971885617, "learning_rate": 1.8853059728838405e-05, "loss": 0.3846, "step": 2004 }, { "epoch": 1.9817103311913, "grad_norm": 0.4798464045161417, "learning_rate": 1.8834737999267132e-05, "loss": 0.3946, "step": 2005 }, { "epoch": 1.9826989619377162, "grad_norm": 0.40649638480642747, "learning_rate": 1.8816416269695862e-05, "loss": 0.3448, "step": 2006 }, { "epoch": 1.9836875926841326, "grad_norm": 0.3836654995287432, "learning_rate": 1.8798094540124588e-05, "loss": 0.3638, "step": 2007 }, { "epoch": 1.9846762234305486, "grad_norm": 0.4879033014867591, "learning_rate": 1.8779772810553318e-05, "loss": 0.3723, "step": 2008 }, { "epoch": 1.985664854176965, "grad_norm": 0.40160577156157173, "learning_rate": 1.8761451080982044e-05, "loss": 0.4057, "step": 2009 }, { "epoch": 1.986653484923381, "grad_norm": 0.5042892493619028, "learning_rate": 1.8743129351410774e-05, "loss": 0.4091, "step": 2010 }, { "epoch": 1.9876421156697974, "grad_norm": 0.4445646337138756, "learning_rate": 1.87248076218395e-05, "loss": 0.4115, "step": 2011 }, { "epoch": 1.9886307464162134, "grad_norm": 0.387988371103682, "learning_rate": 1.870648589226823e-05, "loss": 0.3877, "step": 2012 }, { "epoch": 1.9896193771626298, "grad_norm": 0.45908497404093895, "learning_rate": 1.8688164162696957e-05, "loss": 0.3958, "step": 2013 }, { "epoch": 1.990608007909046, "grad_norm": 0.3925231641862922, "learning_rate": 1.8669842433125687e-05, "loss": 0.3718, "step": 2014 }, { "epoch": 1.9915966386554622, "grad_norm": 0.4059923712651453, "learning_rate": 1.8651520703554416e-05, "loss": 0.408, "step": 2015 }, { "epoch": 1.9925852694018784, "grad_norm": 0.5446924217541459, "learning_rate": 1.8633198973983146e-05, "loss": 0.3941, "step": 2016 }, { "epoch": 1.9935739001482946, "grad_norm": 0.3832258567401457, "learning_rate": 1.8614877244411873e-05, "loss": 0.385, "step": 2017 }, { "epoch": 1.9945625308947108, "grad_norm": 0.4930508602243161, "learning_rate": 1.8596555514840602e-05, "loss": 0.394, "step": 2018 }, { "epoch": 1.995551161641127, "grad_norm": 0.4678269230729438, "learning_rate": 1.8578233785269332e-05, "loss": 0.4334, "step": 2019 }, { "epoch": 1.9965397923875432, "grad_norm": 0.43778840172878075, "learning_rate": 1.855991205569806e-05, "loss": 0.3902, "step": 2020 }, { "epoch": 1.9975284231339594, "grad_norm": 0.4411202959166341, "learning_rate": 1.854159032612679e-05, "loss": 0.4075, "step": 2021 }, { "epoch": 1.9985170538803758, "grad_norm": 0.41058917860187455, "learning_rate": 1.8523268596555515e-05, "loss": 0.3866, "step": 2022 }, { "epoch": 1.9995056846267918, "grad_norm": 0.46744588573090634, "learning_rate": 1.8504946866984245e-05, "loss": 0.4228, "step": 2023 }, { "epoch": 2.0, "grad_norm": 0.6945448348782393, "learning_rate": 1.848662513741297e-05, "loss": 0.3402, "step": 2024 }, { "epoch": 2.0009886307464164, "grad_norm": 0.6201512222238776, "learning_rate": 1.84683034078417e-05, "loss": 0.3131, "step": 2025 }, { "epoch": 2.0019772614928324, "grad_norm": 0.4808897503000627, "learning_rate": 1.8449981678270427e-05, "loss": 0.3015, "step": 2026 }, { "epoch": 2.002965892239249, "grad_norm": 0.5432426754105668, "learning_rate": 1.843165994869916e-05, "loss": 0.3037, "step": 2027 }, { "epoch": 2.003954522985665, "grad_norm": 0.611122697443419, "learning_rate": 1.8413338219127887e-05, "loss": 0.2967, "step": 2028 }, { "epoch": 2.004943153732081, "grad_norm": 0.4503893023262218, "learning_rate": 1.8395016489556617e-05, "loss": 0.2867, "step": 2029 }, { "epoch": 2.005931784478497, "grad_norm": 0.42820451162402745, "learning_rate": 1.8376694759985343e-05, "loss": 0.2927, "step": 2030 }, { "epoch": 2.0069204152249136, "grad_norm": 0.44492153257293415, "learning_rate": 1.8358373030414073e-05, "loss": 0.2997, "step": 2031 }, { "epoch": 2.0079090459713296, "grad_norm": 0.37554971144462446, "learning_rate": 1.83400513008428e-05, "loss": 0.2901, "step": 2032 }, { "epoch": 2.008897676717746, "grad_norm": 0.4254926220112015, "learning_rate": 1.832172957127153e-05, "loss": 0.287, "step": 2033 }, { "epoch": 2.009886307464162, "grad_norm": 0.43912464580412264, "learning_rate": 1.8303407841700256e-05, "loss": 0.2902, "step": 2034 }, { "epoch": 2.0108749382105784, "grad_norm": 0.35666205841719395, "learning_rate": 1.8285086112128985e-05, "loss": 0.2984, "step": 2035 }, { "epoch": 2.0118635689569944, "grad_norm": 0.4107115610059268, "learning_rate": 1.8266764382557712e-05, "loss": 0.272, "step": 2036 }, { "epoch": 2.012852199703411, "grad_norm": 0.44681951740821174, "learning_rate": 1.824844265298644e-05, "loss": 0.2949, "step": 2037 }, { "epoch": 2.013840830449827, "grad_norm": 0.3711991689239265, "learning_rate": 1.823012092341517e-05, "loss": 0.301, "step": 2038 }, { "epoch": 2.014829461196243, "grad_norm": 0.4066339538453939, "learning_rate": 1.82117991938439e-05, "loss": 0.2955, "step": 2039 }, { "epoch": 2.0158180919426596, "grad_norm": 0.3870912537826698, "learning_rate": 1.8193477464272628e-05, "loss": 0.2895, "step": 2040 }, { "epoch": 2.0168067226890756, "grad_norm": 0.36742267005149015, "learning_rate": 1.8175155734701357e-05, "loss": 0.2995, "step": 2041 }, { "epoch": 2.017795353435492, "grad_norm": 0.3589903300460581, "learning_rate": 1.8156834005130087e-05, "loss": 0.2681, "step": 2042 }, { "epoch": 2.018783984181908, "grad_norm": 0.3676370063170235, "learning_rate": 1.8138512275558814e-05, "loss": 0.2737, "step": 2043 }, { "epoch": 2.0197726149283244, "grad_norm": 0.42674436493048756, "learning_rate": 1.8120190545987544e-05, "loss": 0.2969, "step": 2044 }, { "epoch": 2.0207612456747404, "grad_norm": 0.37461787691906495, "learning_rate": 1.810186881641627e-05, "loss": 0.3064, "step": 2045 }, { "epoch": 2.0217498764211568, "grad_norm": 0.3476116469264459, "learning_rate": 1.8083547086845e-05, "loss": 0.288, "step": 2046 }, { "epoch": 2.0227385071675728, "grad_norm": 0.34460565866439313, "learning_rate": 1.8065225357273726e-05, "loss": 0.2856, "step": 2047 }, { "epoch": 2.023727137913989, "grad_norm": 0.3773587670405607, "learning_rate": 1.8046903627702456e-05, "loss": 0.3093, "step": 2048 }, { "epoch": 2.024715768660405, "grad_norm": 0.40942679187722647, "learning_rate": 1.8028581898131182e-05, "loss": 0.3193, "step": 2049 }, { "epoch": 2.0257043994068216, "grad_norm": 0.37887705042355907, "learning_rate": 1.8010260168559916e-05, "loss": 0.3082, "step": 2050 }, { "epoch": 2.026693030153238, "grad_norm": 0.37848361875593467, "learning_rate": 1.7991938438988642e-05, "loss": 0.3016, "step": 2051 }, { "epoch": 2.027681660899654, "grad_norm": 0.32490381617513986, "learning_rate": 1.7973616709417372e-05, "loss": 0.2713, "step": 2052 }, { "epoch": 2.0286702916460704, "grad_norm": 0.3934609734219772, "learning_rate": 1.7955294979846098e-05, "loss": 0.2946, "step": 2053 }, { "epoch": 2.0296589223924864, "grad_norm": 0.3857342473202583, "learning_rate": 1.7936973250274828e-05, "loss": 0.3142, "step": 2054 }, { "epoch": 2.0306475531389028, "grad_norm": 0.37234926099509674, "learning_rate": 1.7918651520703554e-05, "loss": 0.2965, "step": 2055 }, { "epoch": 2.0316361838853187, "grad_norm": 0.33465448106539125, "learning_rate": 1.7900329791132284e-05, "loss": 0.2805, "step": 2056 }, { "epoch": 2.032624814631735, "grad_norm": 0.32386265257330005, "learning_rate": 1.788200806156101e-05, "loss": 0.2819, "step": 2057 }, { "epoch": 2.033613445378151, "grad_norm": 0.3318290801048338, "learning_rate": 1.786368633198974e-05, "loss": 0.2846, "step": 2058 }, { "epoch": 2.0346020761245676, "grad_norm": 0.34889015913004695, "learning_rate": 1.7845364602418467e-05, "loss": 0.2583, "step": 2059 }, { "epoch": 2.0355907068709835, "grad_norm": 0.3730282349854314, "learning_rate": 1.7827042872847197e-05, "loss": 0.2791, "step": 2060 }, { "epoch": 2.0365793376174, "grad_norm": 0.3720941385369664, "learning_rate": 1.7808721143275926e-05, "loss": 0.2573, "step": 2061 }, { "epoch": 2.037567968363816, "grad_norm": 0.9012078944121036, "learning_rate": 1.7790399413704656e-05, "loss": 0.2862, "step": 2062 }, { "epoch": 2.0385565991102323, "grad_norm": 0.3563506285073204, "learning_rate": 1.7772077684133383e-05, "loss": 0.2975, "step": 2063 }, { "epoch": 2.0395452298566488, "grad_norm": 0.42716739482822474, "learning_rate": 1.7753755954562113e-05, "loss": 0.3178, "step": 2064 }, { "epoch": 2.0405338606030647, "grad_norm": 0.34843757293302247, "learning_rate": 1.773543422499084e-05, "loss": 0.2624, "step": 2065 }, { "epoch": 2.041522491349481, "grad_norm": 0.34309248750852184, "learning_rate": 1.771711249541957e-05, "loss": 0.2947, "step": 2066 }, { "epoch": 2.042511122095897, "grad_norm": 0.3717718461370227, "learning_rate": 1.76987907658483e-05, "loss": 0.2764, "step": 2067 }, { "epoch": 2.0434997528423136, "grad_norm": 0.36392053376380873, "learning_rate": 1.7680469036277025e-05, "loss": 0.2885, "step": 2068 }, { "epoch": 2.0444883835887295, "grad_norm": 1.308789224179621, "learning_rate": 1.7662147306705755e-05, "loss": 0.2832, "step": 2069 }, { "epoch": 2.045477014335146, "grad_norm": 0.32328658288452705, "learning_rate": 1.764382557713448e-05, "loss": 0.2587, "step": 2070 }, { "epoch": 2.046465645081562, "grad_norm": 0.36158305829139864, "learning_rate": 1.762550384756321e-05, "loss": 0.2945, "step": 2071 }, { "epoch": 2.0474542758279783, "grad_norm": 0.34193159607226287, "learning_rate": 1.7607182117991937e-05, "loss": 0.2604, "step": 2072 }, { "epoch": 2.0484429065743943, "grad_norm": 0.3753593069282091, "learning_rate": 1.7588860388420667e-05, "loss": 0.2799, "step": 2073 }, { "epoch": 2.0494315373208107, "grad_norm": 0.8793554922820945, "learning_rate": 1.7570538658849397e-05, "loss": 0.2951, "step": 2074 }, { "epoch": 2.0504201680672267, "grad_norm": 0.3431279483385732, "learning_rate": 1.7552216929278127e-05, "loss": 0.2715, "step": 2075 }, { "epoch": 2.051408798813643, "grad_norm": 0.3981015356930174, "learning_rate": 1.7533895199706853e-05, "loss": 0.3005, "step": 2076 }, { "epoch": 2.052397429560059, "grad_norm": 0.4327877331372407, "learning_rate": 1.7515573470135583e-05, "loss": 0.2912, "step": 2077 }, { "epoch": 2.0533860603064755, "grad_norm": 0.3847902069146942, "learning_rate": 1.749725174056431e-05, "loss": 0.304, "step": 2078 }, { "epoch": 2.054374691052892, "grad_norm": 0.365841822014409, "learning_rate": 1.747893001099304e-05, "loss": 0.2775, "step": 2079 }, { "epoch": 2.055363321799308, "grad_norm": 0.39186948734696975, "learning_rate": 1.7460608281421766e-05, "loss": 0.2945, "step": 2080 }, { "epoch": 2.0563519525457243, "grad_norm": 0.38469559001270315, "learning_rate": 1.7442286551850495e-05, "loss": 0.2786, "step": 2081 }, { "epoch": 2.0573405832921403, "grad_norm": 0.3807974587390052, "learning_rate": 1.7423964822279222e-05, "loss": 0.2797, "step": 2082 }, { "epoch": 2.0583292140385567, "grad_norm": 0.3680470165208605, "learning_rate": 1.7405643092707952e-05, "loss": 0.3081, "step": 2083 }, { "epoch": 2.0593178447849727, "grad_norm": 0.4439029925655337, "learning_rate": 1.738732136313668e-05, "loss": 0.2815, "step": 2084 }, { "epoch": 2.060306475531389, "grad_norm": 0.34314842249858846, "learning_rate": 1.7368999633565408e-05, "loss": 0.2572, "step": 2085 }, { "epoch": 2.061295106277805, "grad_norm": 0.39636487821638955, "learning_rate": 1.7350677903994138e-05, "loss": 0.3214, "step": 2086 }, { "epoch": 2.0622837370242215, "grad_norm": 0.3516390157739844, "learning_rate": 1.7332356174422868e-05, "loss": 0.2814, "step": 2087 }, { "epoch": 2.0632723677706375, "grad_norm": 0.3776723570178149, "learning_rate": 1.7314034444851594e-05, "loss": 0.3037, "step": 2088 }, { "epoch": 2.064260998517054, "grad_norm": 0.3545316953034309, "learning_rate": 1.7295712715280324e-05, "loss": 0.2543, "step": 2089 }, { "epoch": 2.06524962926347, "grad_norm": 0.33182514033707033, "learning_rate": 1.7277390985709054e-05, "loss": 0.2787, "step": 2090 }, { "epoch": 2.0662382600098863, "grad_norm": 0.4227026721974439, "learning_rate": 1.725906925613778e-05, "loss": 0.2646, "step": 2091 }, { "epoch": 2.0672268907563027, "grad_norm": 0.3327821238472474, "learning_rate": 1.724074752656651e-05, "loss": 0.2676, "step": 2092 }, { "epoch": 2.0682155215027187, "grad_norm": 0.37491321092573826, "learning_rate": 1.7222425796995236e-05, "loss": 0.2869, "step": 2093 }, { "epoch": 2.069204152249135, "grad_norm": 0.3567773957121631, "learning_rate": 1.7204104067423966e-05, "loss": 0.2979, "step": 2094 }, { "epoch": 2.070192782995551, "grad_norm": 0.34981015836929336, "learning_rate": 1.7185782337852692e-05, "loss": 0.2813, "step": 2095 }, { "epoch": 2.0711814137419675, "grad_norm": 0.33176701826950666, "learning_rate": 1.7167460608281422e-05, "loss": 0.2607, "step": 2096 }, { "epoch": 2.0721700444883835, "grad_norm": 0.3749240861679635, "learning_rate": 1.714913887871015e-05, "loss": 0.3404, "step": 2097 }, { "epoch": 2.0731586752348, "grad_norm": 0.3484473144074298, "learning_rate": 1.7130817149138882e-05, "loss": 0.2928, "step": 2098 }, { "epoch": 2.074147305981216, "grad_norm": 0.3804173613964582, "learning_rate": 1.7112495419567608e-05, "loss": 0.2823, "step": 2099 }, { "epoch": 2.0751359367276323, "grad_norm": 0.34137770366628306, "learning_rate": 1.7094173689996338e-05, "loss": 0.2835, "step": 2100 }, { "epoch": 2.0761245674740483, "grad_norm": 0.34756721891059333, "learning_rate": 1.7075851960425065e-05, "loss": 0.2899, "step": 2101 }, { "epoch": 2.0771131982204647, "grad_norm": 0.3854267112426629, "learning_rate": 1.7057530230853794e-05, "loss": 0.2898, "step": 2102 }, { "epoch": 2.0781018289668807, "grad_norm": 0.3694193274179835, "learning_rate": 1.703920850128252e-05, "loss": 0.3066, "step": 2103 }, { "epoch": 2.079090459713297, "grad_norm": 0.34109207904549443, "learning_rate": 1.702088677171125e-05, "loss": 0.2827, "step": 2104 }, { "epoch": 2.0800790904597135, "grad_norm": 0.35146299809715603, "learning_rate": 1.7002565042139977e-05, "loss": 0.3, "step": 2105 }, { "epoch": 2.0810677212061295, "grad_norm": 0.3653311233859651, "learning_rate": 1.6984243312568707e-05, "loss": 0.2712, "step": 2106 }, { "epoch": 2.082056351952546, "grad_norm": 0.3449223521584276, "learning_rate": 1.6965921582997437e-05, "loss": 0.2724, "step": 2107 }, { "epoch": 2.083044982698962, "grad_norm": 0.3695684755708202, "learning_rate": 1.6947599853426163e-05, "loss": 0.2955, "step": 2108 }, { "epoch": 2.0840336134453783, "grad_norm": 0.34768151371490624, "learning_rate": 1.6929278123854893e-05, "loss": 0.2664, "step": 2109 }, { "epoch": 2.0850222441917943, "grad_norm": 0.38800676605755785, "learning_rate": 1.6910956394283623e-05, "loss": 0.2938, "step": 2110 }, { "epoch": 2.0860108749382107, "grad_norm": 0.38183985342126264, "learning_rate": 1.689263466471235e-05, "loss": 0.2888, "step": 2111 }, { "epoch": 2.0869995056846267, "grad_norm": 0.3145750066742417, "learning_rate": 1.687431293514108e-05, "loss": 0.2675, "step": 2112 }, { "epoch": 2.087988136431043, "grad_norm": 0.3299342929995307, "learning_rate": 1.685599120556981e-05, "loss": 0.286, "step": 2113 }, { "epoch": 2.088976767177459, "grad_norm": 0.3339129694551416, "learning_rate": 1.6837669475998535e-05, "loss": 0.2849, "step": 2114 }, { "epoch": 2.0899653979238755, "grad_norm": 0.33904548483464103, "learning_rate": 1.6819347746427265e-05, "loss": 0.2673, "step": 2115 }, { "epoch": 2.0909540286702915, "grad_norm": 0.3361968419866841, "learning_rate": 1.680102601685599e-05, "loss": 0.2875, "step": 2116 }, { "epoch": 2.091942659416708, "grad_norm": 0.3656092023464879, "learning_rate": 1.678270428728472e-05, "loss": 0.3131, "step": 2117 }, { "epoch": 2.092931290163124, "grad_norm": 0.32361628357574584, "learning_rate": 1.6764382557713447e-05, "loss": 0.3084, "step": 2118 }, { "epoch": 2.0939199209095403, "grad_norm": 0.3336293398555863, "learning_rate": 1.6746060828142177e-05, "loss": 0.2957, "step": 2119 }, { "epoch": 2.0949085516559567, "grad_norm": 0.3480087355486201, "learning_rate": 1.6727739098570904e-05, "loss": 0.2952, "step": 2120 }, { "epoch": 2.0958971824023727, "grad_norm": 0.3463536093747808, "learning_rate": 1.6709417368999634e-05, "loss": 0.2853, "step": 2121 }, { "epoch": 2.096885813148789, "grad_norm": 0.31583285484981294, "learning_rate": 1.6691095639428363e-05, "loss": 0.2766, "step": 2122 }, { "epoch": 2.097874443895205, "grad_norm": 0.3502402999031485, "learning_rate": 1.6672773909857093e-05, "loss": 0.2835, "step": 2123 }, { "epoch": 2.0988630746416215, "grad_norm": 0.34869200070835643, "learning_rate": 1.665445218028582e-05, "loss": 0.3176, "step": 2124 }, { "epoch": 2.0998517053880374, "grad_norm": 0.3206863793155926, "learning_rate": 1.663613045071455e-05, "loss": 0.2914, "step": 2125 }, { "epoch": 2.100840336134454, "grad_norm": 0.35741955165330735, "learning_rate": 1.6617808721143276e-05, "loss": 0.286, "step": 2126 }, { "epoch": 2.10182896688087, "grad_norm": 0.3547912520335088, "learning_rate": 1.6599486991572006e-05, "loss": 0.2969, "step": 2127 }, { "epoch": 2.1028175976272863, "grad_norm": 0.321173496945262, "learning_rate": 1.6581165262000732e-05, "loss": 0.2863, "step": 2128 }, { "epoch": 2.1038062283737022, "grad_norm": 0.3216953093407526, "learning_rate": 1.6562843532429462e-05, "loss": 0.2753, "step": 2129 }, { "epoch": 2.1047948591201187, "grad_norm": 0.3347571598640675, "learning_rate": 1.654452180285819e-05, "loss": 0.2951, "step": 2130 }, { "epoch": 2.1057834898665346, "grad_norm": 0.3313206667797741, "learning_rate": 1.6526200073286918e-05, "loss": 0.2811, "step": 2131 }, { "epoch": 2.106772120612951, "grad_norm": 0.3441664763638713, "learning_rate": 1.6507878343715648e-05, "loss": 0.2951, "step": 2132 }, { "epoch": 2.1077607513593675, "grad_norm": 0.2977296404773225, "learning_rate": 1.6489556614144374e-05, "loss": 0.2639, "step": 2133 }, { "epoch": 2.1087493821057834, "grad_norm": 0.3315548982278328, "learning_rate": 1.6471234884573104e-05, "loss": 0.2817, "step": 2134 }, { "epoch": 2.1097380128522, "grad_norm": 0.4840256992795094, "learning_rate": 1.6452913155001834e-05, "loss": 0.3236, "step": 2135 }, { "epoch": 2.110726643598616, "grad_norm": 0.3295926543925444, "learning_rate": 1.6434591425430564e-05, "loss": 0.2979, "step": 2136 }, { "epoch": 2.1117152743450323, "grad_norm": 0.3221011512837361, "learning_rate": 1.641626969585929e-05, "loss": 0.2836, "step": 2137 }, { "epoch": 2.1127039050914482, "grad_norm": 0.35397598018640686, "learning_rate": 1.639794796628802e-05, "loss": 0.2646, "step": 2138 }, { "epoch": 2.1136925358378647, "grad_norm": 0.3305645778678562, "learning_rate": 1.6379626236716746e-05, "loss": 0.2811, "step": 2139 }, { "epoch": 2.1146811665842806, "grad_norm": 0.3409458812447619, "learning_rate": 1.6361304507145476e-05, "loss": 0.2831, "step": 2140 }, { "epoch": 2.115669797330697, "grad_norm": 0.34250110458570676, "learning_rate": 1.6342982777574203e-05, "loss": 0.3023, "step": 2141 }, { "epoch": 2.116658428077113, "grad_norm": 0.33833187830882516, "learning_rate": 1.6324661048002932e-05, "loss": 0.2768, "step": 2142 }, { "epoch": 2.1176470588235294, "grad_norm": 0.3495890272466183, "learning_rate": 1.630633931843166e-05, "loss": 0.296, "step": 2143 }, { "epoch": 2.1186356895699454, "grad_norm": 0.3322195363578974, "learning_rate": 1.628801758886039e-05, "loss": 0.3045, "step": 2144 }, { "epoch": 2.119624320316362, "grad_norm": 0.3423112265825593, "learning_rate": 1.6269695859289115e-05, "loss": 0.3215, "step": 2145 }, { "epoch": 2.1206129510627783, "grad_norm": 0.3230158872118203, "learning_rate": 1.6251374129717848e-05, "loss": 0.2679, "step": 2146 }, { "epoch": 2.1216015818091942, "grad_norm": 0.3280344504608779, "learning_rate": 1.6233052400146575e-05, "loss": 0.2687, "step": 2147 }, { "epoch": 2.1225902125556106, "grad_norm": 0.34539361446477546, "learning_rate": 1.6214730670575304e-05, "loss": 0.3066, "step": 2148 }, { "epoch": 2.1235788433020266, "grad_norm": 0.5169298330152396, "learning_rate": 1.619640894100403e-05, "loss": 0.2851, "step": 2149 }, { "epoch": 2.124567474048443, "grad_norm": 0.3724697321464871, "learning_rate": 1.617808721143276e-05, "loss": 0.2953, "step": 2150 }, { "epoch": 2.125556104794859, "grad_norm": 0.3550322363497184, "learning_rate": 1.6159765481861487e-05, "loss": 0.2675, "step": 2151 }, { "epoch": 2.1265447355412754, "grad_norm": 0.3442487570669081, "learning_rate": 1.6141443752290217e-05, "loss": 0.2885, "step": 2152 }, { "epoch": 2.1275333662876914, "grad_norm": 0.3774241275151995, "learning_rate": 1.6123122022718947e-05, "loss": 0.3131, "step": 2153 }, { "epoch": 2.128521997034108, "grad_norm": 0.33410909981229503, "learning_rate": 1.6104800293147673e-05, "loss": 0.2884, "step": 2154 }, { "epoch": 2.129510627780524, "grad_norm": 1.2133165092726412, "learning_rate": 1.6086478563576403e-05, "loss": 0.2971, "step": 2155 }, { "epoch": 2.13049925852694, "grad_norm": 0.3616199120471439, "learning_rate": 1.606815683400513e-05, "loss": 0.3037, "step": 2156 }, { "epoch": 2.131487889273356, "grad_norm": 0.3635036512950771, "learning_rate": 1.604983510443386e-05, "loss": 0.313, "step": 2157 }, { "epoch": 2.1324765200197726, "grad_norm": 0.33721124993259577, "learning_rate": 1.603151337486259e-05, "loss": 0.2736, "step": 2158 }, { "epoch": 2.133465150766189, "grad_norm": 0.3163702219866928, "learning_rate": 1.601319164529132e-05, "loss": 0.2667, "step": 2159 }, { "epoch": 2.134453781512605, "grad_norm": 0.32981166179549937, "learning_rate": 1.5994869915720045e-05, "loss": 0.2828, "step": 2160 }, { "epoch": 2.1354424122590214, "grad_norm": 0.3508237528928884, "learning_rate": 1.5976548186148775e-05, "loss": 0.2979, "step": 2161 }, { "epoch": 2.1364310430054374, "grad_norm": 0.3414465369758898, "learning_rate": 1.59582264565775e-05, "loss": 0.2832, "step": 2162 }, { "epoch": 2.137419673751854, "grad_norm": 0.38692703347944857, "learning_rate": 1.593990472700623e-05, "loss": 0.3235, "step": 2163 }, { "epoch": 2.13840830449827, "grad_norm": 0.33251392624071985, "learning_rate": 1.5921582997434958e-05, "loss": 0.2633, "step": 2164 }, { "epoch": 2.139396935244686, "grad_norm": 0.34199251404953085, "learning_rate": 1.5903261267863687e-05, "loss": 0.2746, "step": 2165 }, { "epoch": 2.140385565991102, "grad_norm": 0.3483421433224605, "learning_rate": 1.5884939538292414e-05, "loss": 0.2757, "step": 2166 }, { "epoch": 2.1413741967375186, "grad_norm": 0.3358027255987268, "learning_rate": 1.5866617808721144e-05, "loss": 0.2662, "step": 2167 }, { "epoch": 2.1423628274839346, "grad_norm": 0.33456488995777134, "learning_rate": 1.584829607914987e-05, "loss": 0.2884, "step": 2168 }, { "epoch": 2.143351458230351, "grad_norm": 0.3515615949600769, "learning_rate": 1.58299743495786e-05, "loss": 0.3164, "step": 2169 }, { "epoch": 2.144340088976767, "grad_norm": 0.35833939309492396, "learning_rate": 1.581165262000733e-05, "loss": 0.2866, "step": 2170 }, { "epoch": 2.1453287197231834, "grad_norm": 0.34767964805676704, "learning_rate": 1.579333089043606e-05, "loss": 0.2797, "step": 2171 }, { "epoch": 2.1463173504696, "grad_norm": 0.3516708001115529, "learning_rate": 1.5775009160864786e-05, "loss": 0.2662, "step": 2172 }, { "epoch": 2.147305981216016, "grad_norm": 0.32295177453894813, "learning_rate": 1.5756687431293516e-05, "loss": 0.279, "step": 2173 }, { "epoch": 2.148294611962432, "grad_norm": 0.3784496675389675, "learning_rate": 1.5738365701722242e-05, "loss": 0.3108, "step": 2174 }, { "epoch": 2.149283242708848, "grad_norm": 0.3615499441644552, "learning_rate": 1.5720043972150972e-05, "loss": 0.2761, "step": 2175 }, { "epoch": 2.1502718734552646, "grad_norm": 0.35922383574025385, "learning_rate": 1.57017222425797e-05, "loss": 0.2816, "step": 2176 }, { "epoch": 2.1512605042016806, "grad_norm": 0.345985516301648, "learning_rate": 1.5683400513008428e-05, "loss": 0.3106, "step": 2177 }, { "epoch": 2.152249134948097, "grad_norm": 0.35840050148605745, "learning_rate": 1.5665078783437158e-05, "loss": 0.3118, "step": 2178 }, { "epoch": 2.153237765694513, "grad_norm": 0.34223450024861585, "learning_rate": 1.5646757053865884e-05, "loss": 0.2674, "step": 2179 }, { "epoch": 2.1542263964409294, "grad_norm": 0.3451029541157785, "learning_rate": 1.5628435324294614e-05, "loss": 0.3012, "step": 2180 }, { "epoch": 2.1552150271873454, "grad_norm": 0.35896835074108063, "learning_rate": 1.5610113594723344e-05, "loss": 0.3034, "step": 2181 }, { "epoch": 2.156203657933762, "grad_norm": 0.410107397011747, "learning_rate": 1.5591791865152074e-05, "loss": 0.3229, "step": 2182 }, { "epoch": 2.1571922886801778, "grad_norm": 0.34021333109885105, "learning_rate": 1.55734701355808e-05, "loss": 0.2918, "step": 2183 }, { "epoch": 2.158180919426594, "grad_norm": 0.31292383520376227, "learning_rate": 1.555514840600953e-05, "loss": 0.2881, "step": 2184 }, { "epoch": 2.1591695501730106, "grad_norm": 0.3268101190734101, "learning_rate": 1.5536826676438256e-05, "loss": 0.2765, "step": 2185 }, { "epoch": 2.1601581809194266, "grad_norm": 0.37507909014895696, "learning_rate": 1.5518504946866986e-05, "loss": 0.3011, "step": 2186 }, { "epoch": 2.161146811665843, "grad_norm": 0.35433766662136124, "learning_rate": 1.5500183217295713e-05, "loss": 0.2996, "step": 2187 }, { "epoch": 2.162135442412259, "grad_norm": 0.340464758224724, "learning_rate": 1.5481861487724442e-05, "loss": 0.2736, "step": 2188 }, { "epoch": 2.1631240731586754, "grad_norm": 0.3329144891245448, "learning_rate": 1.546353975815317e-05, "loss": 0.2629, "step": 2189 }, { "epoch": 2.1641127039050914, "grad_norm": 0.3386842106897477, "learning_rate": 1.54452180285819e-05, "loss": 0.2556, "step": 2190 }, { "epoch": 2.165101334651508, "grad_norm": 0.3295850637588111, "learning_rate": 1.5426896299010625e-05, "loss": 0.2776, "step": 2191 }, { "epoch": 2.1660899653979238, "grad_norm": 0.32059674590299875, "learning_rate": 1.5408574569439355e-05, "loss": 0.2866, "step": 2192 }, { "epoch": 2.16707859614434, "grad_norm": 0.33882761970245745, "learning_rate": 1.5390252839868085e-05, "loss": 0.2998, "step": 2193 }, { "epoch": 2.168067226890756, "grad_norm": 0.33967694372077806, "learning_rate": 1.5371931110296814e-05, "loss": 0.2805, "step": 2194 }, { "epoch": 2.1690558576371726, "grad_norm": 0.3244032869326362, "learning_rate": 1.535360938072554e-05, "loss": 0.2876, "step": 2195 }, { "epoch": 2.1700444883835885, "grad_norm": 0.2875510514354305, "learning_rate": 1.533528765115427e-05, "loss": 0.2631, "step": 2196 }, { "epoch": 2.171033119130005, "grad_norm": 0.31052264429692533, "learning_rate": 1.5316965921582997e-05, "loss": 0.2666, "step": 2197 }, { "epoch": 2.1720217498764214, "grad_norm": 0.3534845893464135, "learning_rate": 1.5298644192011727e-05, "loss": 0.2959, "step": 2198 }, { "epoch": 2.1730103806228374, "grad_norm": 0.32588821411313007, "learning_rate": 1.5280322462440457e-05, "loss": 0.2886, "step": 2199 }, { "epoch": 2.1739990113692538, "grad_norm": 0.31999025503331063, "learning_rate": 1.5262000732869183e-05, "loss": 0.27, "step": 2200 }, { "epoch": 2.1749876421156698, "grad_norm": 0.3591433687832965, "learning_rate": 1.5243679003297911e-05, "loss": 0.2696, "step": 2201 }, { "epoch": 2.175976272862086, "grad_norm": 0.3469727112801614, "learning_rate": 1.522535727372664e-05, "loss": 0.31, "step": 2202 }, { "epoch": 2.176964903608502, "grad_norm": 0.3101293484107506, "learning_rate": 1.5207035544155367e-05, "loss": 0.279, "step": 2203 }, { "epoch": 2.1779535343549186, "grad_norm": 0.3452312590028626, "learning_rate": 1.5188713814584096e-05, "loss": 0.2974, "step": 2204 }, { "epoch": 2.1789421651013345, "grad_norm": 0.3596215728630589, "learning_rate": 1.5170392085012827e-05, "loss": 0.3028, "step": 2205 }, { "epoch": 2.179930795847751, "grad_norm": 0.31030668948068363, "learning_rate": 1.5152070355441555e-05, "loss": 0.2762, "step": 2206 }, { "epoch": 2.180919426594167, "grad_norm": 0.3438426919826336, "learning_rate": 1.5133748625870283e-05, "loss": 0.2942, "step": 2207 }, { "epoch": 2.1819080573405834, "grad_norm": 0.34706813423781385, "learning_rate": 1.5115426896299011e-05, "loss": 0.3085, "step": 2208 }, { "epoch": 2.1828966880869993, "grad_norm": 0.303603961144676, "learning_rate": 1.509710516672774e-05, "loss": 0.2846, "step": 2209 }, { "epoch": 2.1838853188334157, "grad_norm": 0.32187636527268415, "learning_rate": 1.5078783437156468e-05, "loss": 0.29, "step": 2210 }, { "epoch": 2.184873949579832, "grad_norm": 0.3245746761154007, "learning_rate": 1.5060461707585197e-05, "loss": 0.2694, "step": 2211 }, { "epoch": 2.185862580326248, "grad_norm": 0.3329194603566512, "learning_rate": 1.5042139978013926e-05, "loss": 0.2908, "step": 2212 }, { "epoch": 2.1868512110726646, "grad_norm": 0.36192381228190973, "learning_rate": 1.5023818248442654e-05, "loss": 0.2987, "step": 2213 }, { "epoch": 2.1878398418190805, "grad_norm": 0.3352258738770613, "learning_rate": 1.5005496518871382e-05, "loss": 0.2727, "step": 2214 }, { "epoch": 2.188828472565497, "grad_norm": 0.34482450414530225, "learning_rate": 1.498717478930011e-05, "loss": 0.2964, "step": 2215 }, { "epoch": 2.189817103311913, "grad_norm": 0.302908089733451, "learning_rate": 1.4968853059728838e-05, "loss": 0.2674, "step": 2216 }, { "epoch": 2.1908057340583293, "grad_norm": 0.35241614271721977, "learning_rate": 1.495053133015757e-05, "loss": 0.3025, "step": 2217 }, { "epoch": 2.1917943648047453, "grad_norm": 0.34818882713972193, "learning_rate": 1.4932209600586298e-05, "loss": 0.2947, "step": 2218 }, { "epoch": 2.1927829955511617, "grad_norm": 0.4032929232868348, "learning_rate": 1.4913887871015026e-05, "loss": 0.2931, "step": 2219 }, { "epoch": 2.1937716262975777, "grad_norm": 0.4013441440166708, "learning_rate": 1.4895566141443754e-05, "loss": 0.2803, "step": 2220 }, { "epoch": 2.194760257043994, "grad_norm": 0.3927935903200586, "learning_rate": 1.4877244411872482e-05, "loss": 0.2911, "step": 2221 }, { "epoch": 2.19574888779041, "grad_norm": 0.36716193585271095, "learning_rate": 1.485892268230121e-05, "loss": 0.3019, "step": 2222 }, { "epoch": 2.1967375185368265, "grad_norm": 0.3152287005563823, "learning_rate": 1.4840600952729938e-05, "loss": 0.268, "step": 2223 }, { "epoch": 2.1977261492832425, "grad_norm": 0.3484469821235995, "learning_rate": 1.4822279223158666e-05, "loss": 0.3268, "step": 2224 }, { "epoch": 2.198714780029659, "grad_norm": 0.5720224776218088, "learning_rate": 1.4803957493587394e-05, "loss": 0.2967, "step": 2225 }, { "epoch": 2.1997034107760753, "grad_norm": 0.3527272213092796, "learning_rate": 1.4785635764016123e-05, "loss": 0.2889, "step": 2226 }, { "epoch": 2.2006920415224913, "grad_norm": 0.3458392601793998, "learning_rate": 1.476731403444485e-05, "loss": 0.3146, "step": 2227 }, { "epoch": 2.2016806722689077, "grad_norm": 0.3347190093636909, "learning_rate": 1.4748992304873579e-05, "loss": 0.3, "step": 2228 }, { "epoch": 2.2026693030153237, "grad_norm": 0.3273411892974918, "learning_rate": 1.473067057530231e-05, "loss": 0.3235, "step": 2229 }, { "epoch": 2.20365793376174, "grad_norm": 0.348298380538476, "learning_rate": 1.4712348845731038e-05, "loss": 0.2848, "step": 2230 }, { "epoch": 2.204646564508156, "grad_norm": 0.3114224137463308, "learning_rate": 1.4694027116159766e-05, "loss": 0.2716, "step": 2231 }, { "epoch": 2.2056351952545725, "grad_norm": 0.36633076473870163, "learning_rate": 1.4675705386588495e-05, "loss": 0.3042, "step": 2232 }, { "epoch": 2.2066238260009885, "grad_norm": 3.8311138547687196, "learning_rate": 1.4657383657017223e-05, "loss": 0.3469, "step": 2233 }, { "epoch": 2.207612456747405, "grad_norm": 0.3797920700355059, "learning_rate": 1.4639061927445952e-05, "loss": 0.3055, "step": 2234 }, { "epoch": 2.208601087493821, "grad_norm": 0.3673325669763765, "learning_rate": 1.462074019787468e-05, "loss": 0.3091, "step": 2235 }, { "epoch": 2.2095897182402373, "grad_norm": 1.2145674437195548, "learning_rate": 1.4602418468303409e-05, "loss": 0.3023, "step": 2236 }, { "epoch": 2.2105783489866533, "grad_norm": 0.3181218722556578, "learning_rate": 1.4584096738732137e-05, "loss": 0.2793, "step": 2237 }, { "epoch": 2.2115669797330697, "grad_norm": 0.32625279357356496, "learning_rate": 1.4565775009160865e-05, "loss": 0.2751, "step": 2238 }, { "epoch": 2.2125556104794857, "grad_norm": 0.3386148366920936, "learning_rate": 1.4547453279589593e-05, "loss": 0.2662, "step": 2239 }, { "epoch": 2.213544241225902, "grad_norm": 0.30599500408253993, "learning_rate": 1.4529131550018321e-05, "loss": 0.2771, "step": 2240 }, { "epoch": 2.2145328719723185, "grad_norm": 0.3083241308552393, "learning_rate": 1.4510809820447053e-05, "loss": 0.3023, "step": 2241 }, { "epoch": 2.2155215027187345, "grad_norm": 0.32965254791152737, "learning_rate": 1.449248809087578e-05, "loss": 0.2957, "step": 2242 }, { "epoch": 2.216510133465151, "grad_norm": 0.31005964645678497, "learning_rate": 1.4474166361304509e-05, "loss": 0.2433, "step": 2243 }, { "epoch": 2.217498764211567, "grad_norm": 0.3626603940806798, "learning_rate": 1.4455844631733237e-05, "loss": 0.2979, "step": 2244 }, { "epoch": 2.2184873949579833, "grad_norm": 0.3372381801684875, "learning_rate": 1.4437522902161965e-05, "loss": 0.2804, "step": 2245 }, { "epoch": 2.2194760257043993, "grad_norm": 0.3710383367709695, "learning_rate": 1.4419201172590693e-05, "loss": 0.2838, "step": 2246 }, { "epoch": 2.2204646564508157, "grad_norm": 0.3657446456603297, "learning_rate": 1.4400879443019421e-05, "loss": 0.292, "step": 2247 }, { "epoch": 2.2214532871972317, "grad_norm": 0.33967289290671343, "learning_rate": 1.438255771344815e-05, "loss": 0.2908, "step": 2248 }, { "epoch": 2.222441917943648, "grad_norm": 0.3715056121323138, "learning_rate": 1.4364235983876878e-05, "loss": 0.2599, "step": 2249 }, { "epoch": 2.223430548690064, "grad_norm": 0.3376070833674275, "learning_rate": 1.4345914254305606e-05, "loss": 0.3027, "step": 2250 }, { "epoch": 2.2244191794364805, "grad_norm": 0.3193607973964519, "learning_rate": 1.4327592524734334e-05, "loss": 0.2849, "step": 2251 }, { "epoch": 2.2254078101828965, "grad_norm": 0.3372234298789546, "learning_rate": 1.4309270795163064e-05, "loss": 0.2808, "step": 2252 }, { "epoch": 2.226396440929313, "grad_norm": 0.4111726474001905, "learning_rate": 1.4290949065591793e-05, "loss": 0.282, "step": 2253 }, { "epoch": 2.2273850716757293, "grad_norm": 0.3524875550351587, "learning_rate": 1.4272627336020521e-05, "loss": 0.2981, "step": 2254 }, { "epoch": 2.2283737024221453, "grad_norm": 0.3437069005243047, "learning_rate": 1.425430560644925e-05, "loss": 0.3093, "step": 2255 }, { "epoch": 2.2293623331685617, "grad_norm": 0.34445109803646057, "learning_rate": 1.4235983876877978e-05, "loss": 0.2943, "step": 2256 }, { "epoch": 2.2303509639149777, "grad_norm": 0.33397204914435236, "learning_rate": 1.4217662147306708e-05, "loss": 0.2841, "step": 2257 }, { "epoch": 2.231339594661394, "grad_norm": 0.42874423339786766, "learning_rate": 1.4199340417735436e-05, "loss": 0.3024, "step": 2258 }, { "epoch": 2.23232822540781, "grad_norm": 0.37556690211331895, "learning_rate": 1.4181018688164164e-05, "loss": 0.3261, "step": 2259 }, { "epoch": 2.2333168561542265, "grad_norm": 0.31395357601723406, "learning_rate": 1.4162696958592892e-05, "loss": 0.2631, "step": 2260 }, { "epoch": 2.2343054869006425, "grad_norm": 0.33567939038528155, "learning_rate": 1.414437522902162e-05, "loss": 0.2776, "step": 2261 }, { "epoch": 2.235294117647059, "grad_norm": 0.3380892712218959, "learning_rate": 1.4126053499450348e-05, "loss": 0.2805, "step": 2262 }, { "epoch": 2.236282748393475, "grad_norm": 0.33834936028556534, "learning_rate": 1.4107731769879076e-05, "loss": 0.2697, "step": 2263 }, { "epoch": 2.2372713791398913, "grad_norm": 0.3193372083479065, "learning_rate": 1.4089410040307804e-05, "loss": 0.2661, "step": 2264 }, { "epoch": 2.2382600098863072, "grad_norm": 0.31683920057303, "learning_rate": 1.4071088310736536e-05, "loss": 0.2814, "step": 2265 }, { "epoch": 2.2392486406327237, "grad_norm": 0.3453272957786612, "learning_rate": 1.4052766581165264e-05, "loss": 0.3, "step": 2266 }, { "epoch": 2.24023727137914, "grad_norm": 0.3615411446614653, "learning_rate": 1.4034444851593992e-05, "loss": 0.2894, "step": 2267 }, { "epoch": 2.241225902125556, "grad_norm": 0.38954515035349363, "learning_rate": 1.401612312202272e-05, "loss": 0.297, "step": 2268 }, { "epoch": 2.2422145328719725, "grad_norm": 0.3426612053009388, "learning_rate": 1.3997801392451448e-05, "loss": 0.2972, "step": 2269 }, { "epoch": 2.2432031636183885, "grad_norm": 0.33373480820233165, "learning_rate": 1.3979479662880176e-05, "loss": 0.3031, "step": 2270 }, { "epoch": 2.244191794364805, "grad_norm": 0.36750708414983263, "learning_rate": 1.3961157933308904e-05, "loss": 0.2957, "step": 2271 }, { "epoch": 2.245180425111221, "grad_norm": 0.31893478258594676, "learning_rate": 1.3942836203737633e-05, "loss": 0.259, "step": 2272 }, { "epoch": 2.2461690558576373, "grad_norm": 0.30160862814700623, "learning_rate": 1.392451447416636e-05, "loss": 0.2588, "step": 2273 }, { "epoch": 2.2471576866040532, "grad_norm": 0.37772426093160955, "learning_rate": 1.3906192744595089e-05, "loss": 0.2926, "step": 2274 }, { "epoch": 2.2481463173504697, "grad_norm": 0.3678910260190973, "learning_rate": 1.3887871015023819e-05, "loss": 0.2818, "step": 2275 }, { "epoch": 2.2491349480968856, "grad_norm": 0.33147174358157827, "learning_rate": 1.3869549285452547e-05, "loss": 0.2533, "step": 2276 }, { "epoch": 2.250123578843302, "grad_norm": 0.3252518200940282, "learning_rate": 1.3851227555881277e-05, "loss": 0.2736, "step": 2277 }, { "epoch": 2.251112209589718, "grad_norm": 0.4047456736797285, "learning_rate": 1.3832905826310005e-05, "loss": 0.2817, "step": 2278 }, { "epoch": 2.2521008403361344, "grad_norm": 0.36961126496846636, "learning_rate": 1.3814584096738733e-05, "loss": 0.298, "step": 2279 }, { "epoch": 2.253089471082551, "grad_norm": 0.37624148125324014, "learning_rate": 1.3796262367167463e-05, "loss": 0.25, "step": 2280 }, { "epoch": 2.254078101828967, "grad_norm": 0.37646823732669815, "learning_rate": 1.377794063759619e-05, "loss": 0.2848, "step": 2281 }, { "epoch": 2.2550667325753833, "grad_norm": 0.3483961988505596, "learning_rate": 1.3759618908024919e-05, "loss": 0.2871, "step": 2282 }, { "epoch": 2.2560553633217992, "grad_norm": 0.3457434111190312, "learning_rate": 1.3741297178453647e-05, "loss": 0.3122, "step": 2283 }, { "epoch": 2.2570439940682157, "grad_norm": 0.35387153727739934, "learning_rate": 1.3722975448882375e-05, "loss": 0.2952, "step": 2284 }, { "epoch": 2.2580326248146316, "grad_norm": 0.34582544122814723, "learning_rate": 1.3704653719311103e-05, "loss": 0.2866, "step": 2285 }, { "epoch": 2.259021255561048, "grad_norm": 0.31238899408197174, "learning_rate": 1.3686331989739831e-05, "loss": 0.2745, "step": 2286 }, { "epoch": 2.260009886307464, "grad_norm": 0.3375486248798471, "learning_rate": 1.366801026016856e-05, "loss": 0.2621, "step": 2287 }, { "epoch": 2.2609985170538804, "grad_norm": 0.3465418042869245, "learning_rate": 1.3649688530597287e-05, "loss": 0.2808, "step": 2288 }, { "epoch": 2.2619871478002964, "grad_norm": 0.3434891728663934, "learning_rate": 1.3631366801026019e-05, "loss": 0.3019, "step": 2289 }, { "epoch": 2.262975778546713, "grad_norm": 0.3186862851048164, "learning_rate": 1.3613045071454747e-05, "loss": 0.2993, "step": 2290 }, { "epoch": 2.263964409293129, "grad_norm": 0.39233238929237935, "learning_rate": 1.3594723341883475e-05, "loss": 0.3048, "step": 2291 }, { "epoch": 2.2649530400395452, "grad_norm": 0.32923582941203255, "learning_rate": 1.3576401612312203e-05, "loss": 0.2866, "step": 2292 }, { "epoch": 2.2659416707859616, "grad_norm": 0.30636820349784977, "learning_rate": 1.3558079882740931e-05, "loss": 0.2886, "step": 2293 }, { "epoch": 2.2669303015323776, "grad_norm": 0.3443632326423214, "learning_rate": 1.353975815316966e-05, "loss": 0.2477, "step": 2294 }, { "epoch": 2.267918932278794, "grad_norm": 0.35856778872287665, "learning_rate": 1.3521436423598388e-05, "loss": 0.3023, "step": 2295 }, { "epoch": 2.26890756302521, "grad_norm": 0.3385993483649333, "learning_rate": 1.3503114694027116e-05, "loss": 0.2838, "step": 2296 }, { "epoch": 2.2698961937716264, "grad_norm": 0.3209886730075033, "learning_rate": 1.3484792964455844e-05, "loss": 0.2751, "step": 2297 }, { "epoch": 2.2708848245180424, "grad_norm": 0.34650049110701814, "learning_rate": 1.3466471234884574e-05, "loss": 0.2881, "step": 2298 }, { "epoch": 2.271873455264459, "grad_norm": 0.33219446797227853, "learning_rate": 1.3448149505313302e-05, "loss": 0.2475, "step": 2299 }, { "epoch": 2.272862086010875, "grad_norm": 0.3942884605526493, "learning_rate": 1.342982777574203e-05, "loss": 0.2795, "step": 2300 }, { "epoch": 2.2738507167572912, "grad_norm": 0.34028011697223143, "learning_rate": 1.341150604617076e-05, "loss": 0.2994, "step": 2301 }, { "epoch": 2.274839347503707, "grad_norm": 0.31565683909848, "learning_rate": 1.3393184316599488e-05, "loss": 0.2701, "step": 2302 }, { "epoch": 2.2758279782501236, "grad_norm": 0.37080120048451826, "learning_rate": 1.3374862587028218e-05, "loss": 0.2998, "step": 2303 }, { "epoch": 2.2768166089965396, "grad_norm": 0.35446851961909187, "learning_rate": 1.3356540857456946e-05, "loss": 0.2866, "step": 2304 }, { "epoch": 2.277805239742956, "grad_norm": 0.3184124314460603, "learning_rate": 1.3338219127885674e-05, "loss": 0.2841, "step": 2305 }, { "epoch": 2.2787938704893724, "grad_norm": 0.35822478818371767, "learning_rate": 1.3319897398314402e-05, "loss": 0.3063, "step": 2306 }, { "epoch": 2.2797825012357884, "grad_norm": 0.3000960990301747, "learning_rate": 1.330157566874313e-05, "loss": 0.283, "step": 2307 }, { "epoch": 2.280771131982205, "grad_norm": 0.33677231775195166, "learning_rate": 1.3283253939171858e-05, "loss": 0.2777, "step": 2308 }, { "epoch": 2.281759762728621, "grad_norm": 0.33743786399976317, "learning_rate": 1.3264932209600586e-05, "loss": 0.2912, "step": 2309 }, { "epoch": 2.282748393475037, "grad_norm": 0.33282455633235913, "learning_rate": 1.3246610480029314e-05, "loss": 0.2965, "step": 2310 }, { "epoch": 2.283737024221453, "grad_norm": 0.34302843433469016, "learning_rate": 1.3228288750458042e-05, "loss": 0.2804, "step": 2311 }, { "epoch": 2.2847256549678696, "grad_norm": 0.6070749013442661, "learning_rate": 1.320996702088677e-05, "loss": 0.2833, "step": 2312 }, { "epoch": 2.2857142857142856, "grad_norm": 0.31983684652596006, "learning_rate": 1.3191645291315502e-05, "loss": 0.2704, "step": 2313 }, { "epoch": 2.286702916460702, "grad_norm": 0.33921932397252746, "learning_rate": 1.317332356174423e-05, "loss": 0.2683, "step": 2314 }, { "epoch": 2.287691547207118, "grad_norm": 0.3512182090905402, "learning_rate": 1.3155001832172958e-05, "loss": 0.2816, "step": 2315 }, { "epoch": 2.2886801779535344, "grad_norm": 0.34336783092752493, "learning_rate": 1.3136680102601686e-05, "loss": 0.2712, "step": 2316 }, { "epoch": 2.2896688086999504, "grad_norm": 0.40045236712289206, "learning_rate": 1.3118358373030415e-05, "loss": 0.2842, "step": 2317 }, { "epoch": 2.290657439446367, "grad_norm": 0.35104527485669973, "learning_rate": 1.3100036643459143e-05, "loss": 0.2756, "step": 2318 }, { "epoch": 2.291646070192783, "grad_norm": 0.3429152287856765, "learning_rate": 1.308171491388787e-05, "loss": 0.3028, "step": 2319 }, { "epoch": 2.292634700939199, "grad_norm": 0.3432145493541532, "learning_rate": 1.3063393184316599e-05, "loss": 0.2967, "step": 2320 }, { "epoch": 2.2936233316856156, "grad_norm": 0.363313819752954, "learning_rate": 1.3045071454745329e-05, "loss": 0.2681, "step": 2321 }, { "epoch": 2.2946119624320316, "grad_norm": 0.32337939654689013, "learning_rate": 1.3026749725174057e-05, "loss": 0.303, "step": 2322 }, { "epoch": 2.295600593178448, "grad_norm": 0.34434487521017226, "learning_rate": 1.3008427995602785e-05, "loss": 0.3277, "step": 2323 }, { "epoch": 2.296589223924864, "grad_norm": 0.3506164698412244, "learning_rate": 1.2990106266031515e-05, "loss": 0.3032, "step": 2324 }, { "epoch": 2.2975778546712804, "grad_norm": 0.32600312642689594, "learning_rate": 1.2971784536460243e-05, "loss": 0.3007, "step": 2325 }, { "epoch": 2.2985664854176964, "grad_norm": 0.3164400051835193, "learning_rate": 1.2953462806888973e-05, "loss": 0.2929, "step": 2326 }, { "epoch": 2.299555116164113, "grad_norm": 0.3228718552067326, "learning_rate": 1.29351410773177e-05, "loss": 0.2795, "step": 2327 }, { "epoch": 2.3005437469105288, "grad_norm": 0.32354846991995173, "learning_rate": 1.2916819347746429e-05, "loss": 0.293, "step": 2328 }, { "epoch": 2.301532377656945, "grad_norm": 0.3316587563580182, "learning_rate": 1.2898497618175157e-05, "loss": 0.291, "step": 2329 }, { "epoch": 2.302521008403361, "grad_norm": 0.33295081893680767, "learning_rate": 1.2880175888603885e-05, "loss": 0.281, "step": 2330 }, { "epoch": 2.3035096391497776, "grad_norm": 0.3376186789744833, "learning_rate": 1.2861854159032613e-05, "loss": 0.2941, "step": 2331 }, { "epoch": 2.304498269896194, "grad_norm": 0.33596461848670905, "learning_rate": 1.2843532429461341e-05, "loss": 0.2902, "step": 2332 }, { "epoch": 2.30548690064261, "grad_norm": 0.30341239585680646, "learning_rate": 1.282521069989007e-05, "loss": 0.2687, "step": 2333 }, { "epoch": 2.3064755313890264, "grad_norm": 0.3166288001637802, "learning_rate": 1.2806888970318798e-05, "loss": 0.3013, "step": 2334 }, { "epoch": 2.3074641621354424, "grad_norm": 0.337744344123231, "learning_rate": 1.2788567240747526e-05, "loss": 0.2667, "step": 2335 }, { "epoch": 2.308452792881859, "grad_norm": 0.3325893877153519, "learning_rate": 1.2770245511176257e-05, "loss": 0.3121, "step": 2336 }, { "epoch": 2.3094414236282748, "grad_norm": 0.30317368672898504, "learning_rate": 1.2751923781604985e-05, "loss": 0.2758, "step": 2337 }, { "epoch": 2.310430054374691, "grad_norm": 0.32992810546373674, "learning_rate": 1.2733602052033713e-05, "loss": 0.3059, "step": 2338 }, { "epoch": 2.311418685121107, "grad_norm": 0.34705961831436516, "learning_rate": 1.2715280322462441e-05, "loss": 0.275, "step": 2339 }, { "epoch": 2.3124073158675236, "grad_norm": 0.32314441600454347, "learning_rate": 1.269695859289117e-05, "loss": 0.2769, "step": 2340 }, { "epoch": 2.3133959466139395, "grad_norm": 0.34135313829612696, "learning_rate": 1.2678636863319898e-05, "loss": 0.2927, "step": 2341 }, { "epoch": 2.314384577360356, "grad_norm": 0.3283354410576416, "learning_rate": 1.2660315133748626e-05, "loss": 0.3054, "step": 2342 }, { "epoch": 2.315373208106772, "grad_norm": 0.3101381034025676, "learning_rate": 1.2641993404177354e-05, "loss": 0.2725, "step": 2343 }, { "epoch": 2.3163618388531884, "grad_norm": 0.2938859270026189, "learning_rate": 1.2623671674606084e-05, "loss": 0.2687, "step": 2344 }, { "epoch": 2.317350469599605, "grad_norm": 0.34715366813620985, "learning_rate": 1.2605349945034812e-05, "loss": 0.3118, "step": 2345 }, { "epoch": 2.3183391003460208, "grad_norm": 0.3361944233286493, "learning_rate": 1.258702821546354e-05, "loss": 0.2638, "step": 2346 }, { "epoch": 2.3193277310924367, "grad_norm": 0.33023193701483244, "learning_rate": 1.2568706485892268e-05, "loss": 0.2795, "step": 2347 }, { "epoch": 2.320316361838853, "grad_norm": 0.32335089295822445, "learning_rate": 1.2550384756320998e-05, "loss": 0.3213, "step": 2348 }, { "epoch": 2.3213049925852696, "grad_norm": 0.3406275657126936, "learning_rate": 1.2532063026749728e-05, "loss": 0.288, "step": 2349 }, { "epoch": 2.3222936233316855, "grad_norm": 0.5377794468857815, "learning_rate": 1.2513741297178456e-05, "loss": 0.2717, "step": 2350 }, { "epoch": 2.323282254078102, "grad_norm": 0.3581101905092646, "learning_rate": 1.2495419567607184e-05, "loss": 0.2982, "step": 2351 }, { "epoch": 2.324270884824518, "grad_norm": 0.3115553585742111, "learning_rate": 1.2477097838035912e-05, "loss": 0.2909, "step": 2352 }, { "epoch": 2.3252595155709344, "grad_norm": 0.33619540290463046, "learning_rate": 1.245877610846464e-05, "loss": 0.2918, "step": 2353 }, { "epoch": 2.3262481463173503, "grad_norm": 0.3223842011538079, "learning_rate": 1.2440454378893368e-05, "loss": 0.282, "step": 2354 }, { "epoch": 2.3272367770637667, "grad_norm": 0.31077230217137536, "learning_rate": 1.2422132649322096e-05, "loss": 0.2644, "step": 2355 }, { "epoch": 2.3282254078101827, "grad_norm": 4.151638979311808, "learning_rate": 1.2403810919750824e-05, "loss": 0.4117, "step": 2356 }, { "epoch": 2.329214038556599, "grad_norm": 0.44854402700667395, "learning_rate": 1.2385489190179554e-05, "loss": 0.2851, "step": 2357 }, { "epoch": 2.3302026693030156, "grad_norm": 0.3422003094518195, "learning_rate": 1.2367167460608282e-05, "loss": 0.3046, "step": 2358 }, { "epoch": 2.3311913000494315, "grad_norm": 0.3464286043462282, "learning_rate": 1.234884573103701e-05, "loss": 0.2706, "step": 2359 }, { "epoch": 2.3321799307958475, "grad_norm": 2.233666719129464, "learning_rate": 1.2330524001465739e-05, "loss": 0.2878, "step": 2360 }, { "epoch": 2.333168561542264, "grad_norm": 0.45292602257302, "learning_rate": 1.2312202271894467e-05, "loss": 0.3043, "step": 2361 }, { "epoch": 2.3341571922886803, "grad_norm": 0.3386840873949116, "learning_rate": 1.2293880542323195e-05, "loss": 0.2811, "step": 2362 }, { "epoch": 2.3351458230350963, "grad_norm": 0.3727094629717422, "learning_rate": 1.2275558812751925e-05, "loss": 0.3095, "step": 2363 }, { "epoch": 2.3361344537815127, "grad_norm": 0.381840424781254, "learning_rate": 1.2257237083180653e-05, "loss": 0.2535, "step": 2364 }, { "epoch": 2.3371230845279287, "grad_norm": 0.37720835915495293, "learning_rate": 1.2238915353609381e-05, "loss": 0.2944, "step": 2365 }, { "epoch": 2.338111715274345, "grad_norm": 0.3650224166701438, "learning_rate": 1.2220593624038109e-05, "loss": 0.3166, "step": 2366 }, { "epoch": 2.339100346020761, "grad_norm": 0.37047301236881475, "learning_rate": 1.2202271894466839e-05, "loss": 0.2774, "step": 2367 }, { "epoch": 2.3400889767671775, "grad_norm": 0.4144830895358065, "learning_rate": 1.2183950164895567e-05, "loss": 0.304, "step": 2368 }, { "epoch": 2.3410776075135935, "grad_norm": 0.37472972984064684, "learning_rate": 1.2165628435324295e-05, "loss": 0.2823, "step": 2369 }, { "epoch": 2.34206623826001, "grad_norm": 0.40462677947909065, "learning_rate": 1.2147306705753025e-05, "loss": 0.2974, "step": 2370 }, { "epoch": 2.3430548690064263, "grad_norm": 0.35744603696127614, "learning_rate": 1.2128984976181753e-05, "loss": 0.2707, "step": 2371 }, { "epoch": 2.3440434997528423, "grad_norm": 4.8052573435203145, "learning_rate": 1.2110663246610481e-05, "loss": 0.544, "step": 2372 }, { "epoch": 2.3450321304992583, "grad_norm": 0.4618616215514522, "learning_rate": 1.2092341517039209e-05, "loss": 0.2824, "step": 2373 }, { "epoch": 2.3460207612456747, "grad_norm": 0.38904580061238386, "learning_rate": 1.2074019787467937e-05, "loss": 0.2806, "step": 2374 }, { "epoch": 2.347009391992091, "grad_norm": 0.3391516790447318, "learning_rate": 1.2055698057896667e-05, "loss": 0.2724, "step": 2375 }, { "epoch": 2.347998022738507, "grad_norm": 0.4261336732096131, "learning_rate": 1.2037376328325395e-05, "loss": 0.294, "step": 2376 }, { "epoch": 2.3489866534849235, "grad_norm": 0.4159112502903862, "learning_rate": 1.2019054598754123e-05, "loss": 0.2802, "step": 2377 }, { "epoch": 2.3499752842313395, "grad_norm": 0.35523678686869825, "learning_rate": 1.2000732869182851e-05, "loss": 0.285, "step": 2378 }, { "epoch": 2.350963914977756, "grad_norm": 0.4294627901131109, "learning_rate": 1.198241113961158e-05, "loss": 0.2847, "step": 2379 }, { "epoch": 2.351952545724172, "grad_norm": 0.36966395363042337, "learning_rate": 1.1964089410040308e-05, "loss": 0.2689, "step": 2380 }, { "epoch": 2.3529411764705883, "grad_norm": 0.3825426712004643, "learning_rate": 1.1945767680469037e-05, "loss": 0.2859, "step": 2381 }, { "epoch": 2.3539298072170043, "grad_norm": 0.356939040728489, "learning_rate": 1.1927445950897766e-05, "loss": 0.2798, "step": 2382 }, { "epoch": 2.3549184379634207, "grad_norm": 0.4210457475718919, "learning_rate": 1.1909124221326494e-05, "loss": 0.2992, "step": 2383 }, { "epoch": 2.3559070687098367, "grad_norm": 0.36704383588267286, "learning_rate": 1.1890802491755222e-05, "loss": 0.2894, "step": 2384 }, { "epoch": 2.356895699456253, "grad_norm": 0.3720671934615416, "learning_rate": 1.187248076218395e-05, "loss": 0.3032, "step": 2385 }, { "epoch": 2.357884330202669, "grad_norm": 0.418783353318987, "learning_rate": 1.1854159032612678e-05, "loss": 0.297, "step": 2386 }, { "epoch": 2.3588729609490855, "grad_norm": 0.32324503359198964, "learning_rate": 1.1835837303041408e-05, "loss": 0.2869, "step": 2387 }, { "epoch": 2.359861591695502, "grad_norm": 0.3222947007864621, "learning_rate": 1.1817515573470136e-05, "loss": 0.2808, "step": 2388 }, { "epoch": 2.360850222441918, "grad_norm": 0.32513185679631496, "learning_rate": 1.1799193843898864e-05, "loss": 0.2777, "step": 2389 }, { "epoch": 2.3618388531883343, "grad_norm": 0.33420574192632374, "learning_rate": 1.1780872114327594e-05, "loss": 0.2611, "step": 2390 }, { "epoch": 2.3628274839347503, "grad_norm": 0.28922630359171053, "learning_rate": 1.1762550384756322e-05, "loss": 0.2539, "step": 2391 }, { "epoch": 2.3638161146811667, "grad_norm": 0.35114249745180376, "learning_rate": 1.174422865518505e-05, "loss": 0.303, "step": 2392 }, { "epoch": 2.3648047454275827, "grad_norm": 0.33678191923227835, "learning_rate": 1.172590692561378e-05, "loss": 0.2677, "step": 2393 }, { "epoch": 2.365793376173999, "grad_norm": 0.33424409518240067, "learning_rate": 1.1707585196042508e-05, "loss": 0.2903, "step": 2394 }, { "epoch": 2.366782006920415, "grad_norm": 0.3571597649411409, "learning_rate": 1.1689263466471236e-05, "loss": 0.2914, "step": 2395 }, { "epoch": 2.3677706376668315, "grad_norm": 0.35451348764632445, "learning_rate": 1.1670941736899964e-05, "loss": 0.2807, "step": 2396 }, { "epoch": 2.3687592684132475, "grad_norm": 0.30501613708896275, "learning_rate": 1.1652620007328692e-05, "loss": 0.2695, "step": 2397 }, { "epoch": 2.369747899159664, "grad_norm": 0.3273416180285976, "learning_rate": 1.163429827775742e-05, "loss": 0.2731, "step": 2398 }, { "epoch": 2.37073652990608, "grad_norm": 0.34047057042602646, "learning_rate": 1.161597654818615e-05, "loss": 0.2997, "step": 2399 }, { "epoch": 2.3717251606524963, "grad_norm": 0.3447228992041485, "learning_rate": 1.1597654818614878e-05, "loss": 0.3014, "step": 2400 }, { "epoch": 2.3727137913989127, "grad_norm": 0.3374175127059564, "learning_rate": 1.1579333089043606e-05, "loss": 0.2532, "step": 2401 }, { "epoch": 2.3737024221453287, "grad_norm": 0.32105141884408156, "learning_rate": 1.1561011359472335e-05, "loss": 0.2583, "step": 2402 }, { "epoch": 2.374691052891745, "grad_norm": 0.3358668365984708, "learning_rate": 1.1542689629901063e-05, "loss": 0.3119, "step": 2403 }, { "epoch": 2.375679683638161, "grad_norm": 0.33019855827017863, "learning_rate": 1.152436790032979e-05, "loss": 0.2618, "step": 2404 }, { "epoch": 2.3766683143845775, "grad_norm": 0.3221792998003085, "learning_rate": 1.150604617075852e-05, "loss": 0.2849, "step": 2405 }, { "epoch": 2.3776569451309935, "grad_norm": 0.3184805378019957, "learning_rate": 1.1487724441187249e-05, "loss": 0.282, "step": 2406 }, { "epoch": 2.37864557587741, "grad_norm": 0.3161524906027362, "learning_rate": 1.1469402711615977e-05, "loss": 0.2741, "step": 2407 }, { "epoch": 2.379634206623826, "grad_norm": 0.3374729324339922, "learning_rate": 1.1451080982044705e-05, "loss": 0.2837, "step": 2408 }, { "epoch": 2.3806228373702423, "grad_norm": 0.3082925808901268, "learning_rate": 1.1432759252473433e-05, "loss": 0.2859, "step": 2409 }, { "epoch": 2.3816114681166582, "grad_norm": 0.3679929679160637, "learning_rate": 1.1414437522902161e-05, "loss": 0.3161, "step": 2410 }, { "epoch": 2.3826000988630747, "grad_norm": 0.3262306480596806, "learning_rate": 1.1396115793330891e-05, "loss": 0.2876, "step": 2411 }, { "epoch": 2.3835887296094906, "grad_norm": 0.30107700812051064, "learning_rate": 1.1377794063759619e-05, "loss": 0.2734, "step": 2412 }, { "epoch": 2.384577360355907, "grad_norm": 0.3094398598036531, "learning_rate": 1.1359472334188349e-05, "loss": 0.2895, "step": 2413 }, { "epoch": 2.3855659911023235, "grad_norm": 0.313249240222792, "learning_rate": 1.1341150604617077e-05, "loss": 0.2791, "step": 2414 }, { "epoch": 2.3865546218487395, "grad_norm": 10.20533227722941, "learning_rate": 1.1322828875045805e-05, "loss": 0.5257, "step": 2415 }, { "epoch": 2.387543252595156, "grad_norm": 0.34807857903913353, "learning_rate": 1.1304507145474533e-05, "loss": 0.2885, "step": 2416 }, { "epoch": 2.388531883341572, "grad_norm": 0.3241634013885061, "learning_rate": 1.1286185415903263e-05, "loss": 0.2847, "step": 2417 }, { "epoch": 2.3895205140879883, "grad_norm": 0.35801046616246196, "learning_rate": 1.1267863686331991e-05, "loss": 0.2844, "step": 2418 }, { "epoch": 2.3905091448344042, "grad_norm": 0.32844225824999296, "learning_rate": 1.124954195676072e-05, "loss": 0.2957, "step": 2419 }, { "epoch": 2.3914977755808207, "grad_norm": 0.4133599069287874, "learning_rate": 1.1231220227189447e-05, "loss": 0.275, "step": 2420 }, { "epoch": 2.3924864063272366, "grad_norm": 0.33395408388199194, "learning_rate": 1.1212898497618175e-05, "loss": 0.2838, "step": 2421 }, { "epoch": 2.393475037073653, "grad_norm": 0.3257106576700763, "learning_rate": 1.1194576768046904e-05, "loss": 0.2697, "step": 2422 }, { "epoch": 2.394463667820069, "grad_norm": 0.3202076274979032, "learning_rate": 1.1176255038475633e-05, "loss": 0.2716, "step": 2423 }, { "epoch": 2.3954522985664854, "grad_norm": 0.3499728748574284, "learning_rate": 1.1157933308904361e-05, "loss": 0.2834, "step": 2424 }, { "epoch": 2.3964409293129014, "grad_norm": 0.3504866226387898, "learning_rate": 1.113961157933309e-05, "loss": 0.2989, "step": 2425 }, { "epoch": 2.397429560059318, "grad_norm": 0.3205952140387686, "learning_rate": 1.1121289849761818e-05, "loss": 0.2889, "step": 2426 }, { "epoch": 2.3984181908057343, "grad_norm": 0.32321307674435, "learning_rate": 1.1102968120190546e-05, "loss": 0.2862, "step": 2427 }, { "epoch": 2.3994068215521502, "grad_norm": 0.33284471232010626, "learning_rate": 1.1084646390619274e-05, "loss": 0.2846, "step": 2428 }, { "epoch": 2.4003954522985667, "grad_norm": 0.30713200220732884, "learning_rate": 1.1066324661048004e-05, "loss": 0.2595, "step": 2429 }, { "epoch": 2.4013840830449826, "grad_norm": 0.315256136828381, "learning_rate": 1.1048002931476732e-05, "loss": 0.2718, "step": 2430 }, { "epoch": 2.402372713791399, "grad_norm": 0.323483016304508, "learning_rate": 1.102968120190546e-05, "loss": 0.2988, "step": 2431 }, { "epoch": 2.403361344537815, "grad_norm": 0.30042357223780175, "learning_rate": 1.1011359472334188e-05, "loss": 0.2936, "step": 2432 }, { "epoch": 2.4043499752842314, "grad_norm": 0.3143088447984726, "learning_rate": 1.0993037742762916e-05, "loss": 0.2568, "step": 2433 }, { "epoch": 2.4053386060306474, "grad_norm": 0.3318334068617343, "learning_rate": 1.0974716013191646e-05, "loss": 0.3042, "step": 2434 }, { "epoch": 2.406327236777064, "grad_norm": 0.33461704992812624, "learning_rate": 1.0956394283620374e-05, "loss": 0.2989, "step": 2435 }, { "epoch": 2.40731586752348, "grad_norm": 0.32239299225973184, "learning_rate": 1.0938072554049102e-05, "loss": 0.2784, "step": 2436 }, { "epoch": 2.4083044982698962, "grad_norm": 0.3115916740120738, "learning_rate": 1.0919750824477832e-05, "loss": 0.3087, "step": 2437 }, { "epoch": 2.409293129016312, "grad_norm": 0.32762316353477405, "learning_rate": 1.090142909490656e-05, "loss": 0.3145, "step": 2438 }, { "epoch": 2.4102817597627286, "grad_norm": 0.33206975474249284, "learning_rate": 1.0883107365335288e-05, "loss": 0.3022, "step": 2439 }, { "epoch": 2.411270390509145, "grad_norm": 0.30825467468358253, "learning_rate": 1.0864785635764016e-05, "loss": 0.2651, "step": 2440 }, { "epoch": 2.412259021255561, "grad_norm": 0.34789133232738184, "learning_rate": 1.0846463906192746e-05, "loss": 0.306, "step": 2441 }, { "epoch": 2.4132476520019774, "grad_norm": 0.325340347657917, "learning_rate": 1.0828142176621474e-05, "loss": 0.2624, "step": 2442 }, { "epoch": 2.4142362827483934, "grad_norm": 0.33689829950892625, "learning_rate": 1.0809820447050202e-05, "loss": 0.3007, "step": 2443 }, { "epoch": 2.41522491349481, "grad_norm": 0.33286441622499435, "learning_rate": 1.079149871747893e-05, "loss": 0.282, "step": 2444 }, { "epoch": 2.416213544241226, "grad_norm": 0.3434431938890706, "learning_rate": 1.0773176987907659e-05, "loss": 0.2986, "step": 2445 }, { "epoch": 2.4172021749876422, "grad_norm": 0.3143567548038031, "learning_rate": 1.0754855258336387e-05, "loss": 0.2811, "step": 2446 }, { "epoch": 2.418190805734058, "grad_norm": 0.33440281028976626, "learning_rate": 1.0736533528765116e-05, "loss": 0.2744, "step": 2447 }, { "epoch": 2.4191794364804746, "grad_norm": 0.355969620204421, "learning_rate": 1.0718211799193845e-05, "loss": 0.269, "step": 2448 }, { "epoch": 2.4201680672268906, "grad_norm": 0.3353002108310941, "learning_rate": 1.0699890069622573e-05, "loss": 0.2634, "step": 2449 }, { "epoch": 2.421156697973307, "grad_norm": 0.3144798401517659, "learning_rate": 1.06815683400513e-05, "loss": 0.2937, "step": 2450 }, { "epoch": 2.422145328719723, "grad_norm": 0.3153955180961104, "learning_rate": 1.0663246610480029e-05, "loss": 0.2757, "step": 2451 }, { "epoch": 2.4231339594661394, "grad_norm": 0.3336958677861515, "learning_rate": 1.0644924880908757e-05, "loss": 0.2952, "step": 2452 }, { "epoch": 2.424122590212556, "grad_norm": 0.33363202063272085, "learning_rate": 1.0626603151337487e-05, "loss": 0.2915, "step": 2453 }, { "epoch": 2.425111220958972, "grad_norm": 0.3230216310011584, "learning_rate": 1.0608281421766215e-05, "loss": 0.2756, "step": 2454 }, { "epoch": 2.426099851705388, "grad_norm": 0.29826636545529017, "learning_rate": 1.0589959692194943e-05, "loss": 0.287, "step": 2455 }, { "epoch": 2.427088482451804, "grad_norm": 0.358989675417826, "learning_rate": 1.0571637962623671e-05, "loss": 0.3024, "step": 2456 }, { "epoch": 2.4280771131982206, "grad_norm": 0.3245685791751705, "learning_rate": 1.0553316233052401e-05, "loss": 0.2628, "step": 2457 }, { "epoch": 2.4290657439446366, "grad_norm": 0.3982224891745618, "learning_rate": 1.0534994503481129e-05, "loss": 0.3142, "step": 2458 }, { "epoch": 2.430054374691053, "grad_norm": 0.3145271449807848, "learning_rate": 1.0516672773909857e-05, "loss": 0.2761, "step": 2459 }, { "epoch": 2.431043005437469, "grad_norm": 0.3667931120978127, "learning_rate": 1.0498351044338587e-05, "loss": 0.2779, "step": 2460 }, { "epoch": 2.4320316361838854, "grad_norm": 0.3379040194726811, "learning_rate": 1.0480029314767315e-05, "loss": 0.3136, "step": 2461 }, { "epoch": 2.4330202669303014, "grad_norm": 0.30979966306147366, "learning_rate": 1.0461707585196043e-05, "loss": 0.2904, "step": 2462 }, { "epoch": 2.434008897676718, "grad_norm": 0.3185894881873933, "learning_rate": 1.0443385855624771e-05, "loss": 0.2815, "step": 2463 }, { "epoch": 2.4349975284231338, "grad_norm": 0.46127240024854993, "learning_rate": 1.0425064126053501e-05, "loss": 0.3137, "step": 2464 }, { "epoch": 2.43598615916955, "grad_norm": 0.3314860245837639, "learning_rate": 1.040674239648223e-05, "loss": 0.2998, "step": 2465 }, { "epoch": 2.4369747899159666, "grad_norm": 0.32486960695414846, "learning_rate": 1.0388420666910957e-05, "loss": 0.2677, "step": 2466 }, { "epoch": 2.4379634206623826, "grad_norm": 0.3138060624986125, "learning_rate": 1.0370098937339686e-05, "loss": 0.2869, "step": 2467 }, { "epoch": 2.438952051408799, "grad_norm": 0.31674601792734935, "learning_rate": 1.0351777207768414e-05, "loss": 0.2972, "step": 2468 }, { "epoch": 2.439940682155215, "grad_norm": 0.34854744303151786, "learning_rate": 1.0333455478197142e-05, "loss": 0.2609, "step": 2469 }, { "epoch": 2.4409293129016314, "grad_norm": 0.3332938764520655, "learning_rate": 1.0315133748625872e-05, "loss": 0.286, "step": 2470 }, { "epoch": 2.4419179436480474, "grad_norm": 0.3132788916597391, "learning_rate": 1.02968120190546e-05, "loss": 0.2939, "step": 2471 }, { "epoch": 2.442906574394464, "grad_norm": 0.3161395406707777, "learning_rate": 1.0278490289483328e-05, "loss": 0.2853, "step": 2472 }, { "epoch": 2.4438952051408798, "grad_norm": 0.31890454413587493, "learning_rate": 1.0260168559912056e-05, "loss": 0.2661, "step": 2473 }, { "epoch": 2.444883835887296, "grad_norm": 0.31538050922955235, "learning_rate": 1.0241846830340784e-05, "loss": 0.2752, "step": 2474 }, { "epoch": 2.445872466633712, "grad_norm": 0.3139089900998475, "learning_rate": 1.0223525100769512e-05, "loss": 0.2843, "step": 2475 }, { "epoch": 2.4468610973801286, "grad_norm": 0.29996628368374506, "learning_rate": 1.0205203371198242e-05, "loss": 0.2601, "step": 2476 }, { "epoch": 2.4478497281265446, "grad_norm": 0.33793789595043405, "learning_rate": 1.018688164162697e-05, "loss": 0.3103, "step": 2477 }, { "epoch": 2.448838358872961, "grad_norm": 0.31824791183682277, "learning_rate": 1.0168559912055698e-05, "loss": 0.2747, "step": 2478 }, { "epoch": 2.4498269896193774, "grad_norm": 0.31606536182592215, "learning_rate": 1.0150238182484426e-05, "loss": 0.2511, "step": 2479 }, { "epoch": 2.4508156203657934, "grad_norm": 0.3270577996490876, "learning_rate": 1.0131916452913156e-05, "loss": 0.2896, "step": 2480 }, { "epoch": 2.4518042511122093, "grad_norm": 1.872027170770951, "learning_rate": 1.0113594723341884e-05, "loss": 0.3115, "step": 2481 }, { "epoch": 2.4527928818586258, "grad_norm": 0.2971487094494043, "learning_rate": 1.0095272993770612e-05, "loss": 0.2538, "step": 2482 }, { "epoch": 2.453781512605042, "grad_norm": 0.32772487851650356, "learning_rate": 1.0076951264199342e-05, "loss": 0.3012, "step": 2483 }, { "epoch": 2.454770143351458, "grad_norm": 0.33840468141239227, "learning_rate": 1.005862953462807e-05, "loss": 0.3265, "step": 2484 }, { "epoch": 2.4557587740978746, "grad_norm": 0.33353075921511366, "learning_rate": 1.0040307805056798e-05, "loss": 0.2875, "step": 2485 }, { "epoch": 2.4567474048442905, "grad_norm": 0.3325196566471947, "learning_rate": 1.0021986075485526e-05, "loss": 0.2947, "step": 2486 }, { "epoch": 2.457736035590707, "grad_norm": 0.34829615624680715, "learning_rate": 1.0003664345914255e-05, "loss": 0.2907, "step": 2487 }, { "epoch": 2.458724666337123, "grad_norm": 0.33297644947824656, "learning_rate": 9.985342616342984e-06, "loss": 0.2927, "step": 2488 }, { "epoch": 2.4597132970835394, "grad_norm": 0.30874299544544437, "learning_rate": 9.967020886771712e-06, "loss": 0.2772, "step": 2489 }, { "epoch": 2.4607019278299553, "grad_norm": 0.3363990155563431, "learning_rate": 9.94869915720044e-06, "loss": 0.2743, "step": 2490 }, { "epoch": 2.4616905585763718, "grad_norm": 0.33524031913803465, "learning_rate": 9.930377427629169e-06, "loss": 0.2973, "step": 2491 }, { "epoch": 2.462679189322788, "grad_norm": 0.2961452583216925, "learning_rate": 9.912055698057897e-06, "loss": 0.2555, "step": 2492 }, { "epoch": 2.463667820069204, "grad_norm": 0.3298898490970977, "learning_rate": 9.893733968486625e-06, "loss": 0.2929, "step": 2493 }, { "epoch": 2.46465645081562, "grad_norm": 0.3179803502877565, "learning_rate": 9.875412238915355e-06, "loss": 0.2572, "step": 2494 }, { "epoch": 2.4656450815620365, "grad_norm": 0.3490713932556183, "learning_rate": 9.857090509344083e-06, "loss": 0.2792, "step": 2495 }, { "epoch": 2.466633712308453, "grad_norm": 1.5132389557252333, "learning_rate": 9.838768779772811e-06, "loss": 0.2965, "step": 2496 }, { "epoch": 2.467622343054869, "grad_norm": 0.3315470609485268, "learning_rate": 9.820447050201539e-06, "loss": 0.3012, "step": 2497 }, { "epoch": 2.4686109738012854, "grad_norm": 0.355089971572738, "learning_rate": 9.802125320630267e-06, "loss": 0.2852, "step": 2498 }, { "epoch": 2.4695996045477013, "grad_norm": 0.2956764480730307, "learning_rate": 9.783803591058995e-06, "loss": 0.2593, "step": 2499 }, { "epoch": 2.4705882352941178, "grad_norm": 0.320709032054276, "learning_rate": 9.765481861487725e-06, "loss": 0.2915, "step": 2500 }, { "epoch": 2.4715768660405337, "grad_norm": 0.34076577786956136, "learning_rate": 9.747160131916453e-06, "loss": 0.2818, "step": 2501 }, { "epoch": 2.47256549678695, "grad_norm": 0.32186110600724155, "learning_rate": 9.728838402345181e-06, "loss": 0.2872, "step": 2502 }, { "epoch": 2.473554127533366, "grad_norm": 0.33285795424790504, "learning_rate": 9.710516672773911e-06, "loss": 0.3124, "step": 2503 }, { "epoch": 2.4745427582797825, "grad_norm": 0.3183178071554399, "learning_rate": 9.69219494320264e-06, "loss": 0.3012, "step": 2504 }, { "epoch": 2.475531389026199, "grad_norm": 0.3775444238309756, "learning_rate": 9.673873213631367e-06, "loss": 0.2782, "step": 2505 }, { "epoch": 2.476520019772615, "grad_norm": 0.3309115839232878, "learning_rate": 9.655551484060097e-06, "loss": 0.2922, "step": 2506 }, { "epoch": 2.477508650519031, "grad_norm": 0.31495666833252545, "learning_rate": 9.637229754488825e-06, "loss": 0.2843, "step": 2507 }, { "epoch": 2.4784972812654473, "grad_norm": 0.33631541803078935, "learning_rate": 9.618908024917553e-06, "loss": 0.2855, "step": 2508 }, { "epoch": 2.4794859120118637, "grad_norm": 0.3052461306657151, "learning_rate": 9.600586295346281e-06, "loss": 0.282, "step": 2509 }, { "epoch": 2.4804745427582797, "grad_norm": 0.34400968034417323, "learning_rate": 9.58226456577501e-06, "loss": 0.2986, "step": 2510 }, { "epoch": 2.481463173504696, "grad_norm": 0.3222751475712991, "learning_rate": 9.563942836203738e-06, "loss": 0.2886, "step": 2511 }, { "epoch": 2.482451804251112, "grad_norm": 0.34516313824385436, "learning_rate": 9.545621106632467e-06, "loss": 0.2991, "step": 2512 }, { "epoch": 2.4834404349975285, "grad_norm": 0.3585696555529004, "learning_rate": 9.527299377061196e-06, "loss": 0.2644, "step": 2513 }, { "epoch": 2.4844290657439445, "grad_norm": 0.3462543271999876, "learning_rate": 9.508977647489924e-06, "loss": 0.2838, "step": 2514 }, { "epoch": 2.485417696490361, "grad_norm": 0.3433441739516674, "learning_rate": 9.490655917918652e-06, "loss": 0.2796, "step": 2515 }, { "epoch": 2.486406327236777, "grad_norm": 0.32719115427143136, "learning_rate": 9.47233418834738e-06, "loss": 0.2719, "step": 2516 }, { "epoch": 2.4873949579831933, "grad_norm": 0.31560220424562857, "learning_rate": 9.454012458776108e-06, "loss": 0.2821, "step": 2517 }, { "epoch": 2.4883835887296093, "grad_norm": 0.4602762978098691, "learning_rate": 9.435690729204838e-06, "loss": 0.2957, "step": 2518 }, { "epoch": 2.4893722194760257, "grad_norm": 0.3307520801528609, "learning_rate": 9.417368999633566e-06, "loss": 0.2935, "step": 2519 }, { "epoch": 2.4903608502224417, "grad_norm": 0.3317295386423108, "learning_rate": 9.399047270062294e-06, "loss": 0.2613, "step": 2520 }, { "epoch": 2.491349480968858, "grad_norm": 0.6536768664291642, "learning_rate": 9.380725540491022e-06, "loss": 0.2715, "step": 2521 }, { "epoch": 2.4923381117152745, "grad_norm": 0.34315778951883485, "learning_rate": 9.36240381091975e-06, "loss": 0.2785, "step": 2522 }, { "epoch": 2.4933267424616905, "grad_norm": 0.3337364104854179, "learning_rate": 9.344082081348478e-06, "loss": 0.3036, "step": 2523 }, { "epoch": 2.494315373208107, "grad_norm": 0.5544267518631636, "learning_rate": 9.325760351777208e-06, "loss": 0.2964, "step": 2524 }, { "epoch": 2.495304003954523, "grad_norm": 0.3112532170177589, "learning_rate": 9.307438622205936e-06, "loss": 0.2695, "step": 2525 }, { "epoch": 2.4962926347009393, "grad_norm": 0.31569627933225386, "learning_rate": 9.289116892634666e-06, "loss": 0.2611, "step": 2526 }, { "epoch": 2.4972812654473553, "grad_norm": 0.32351174125715704, "learning_rate": 9.270795163063394e-06, "loss": 0.3006, "step": 2527 }, { "epoch": 2.4982698961937717, "grad_norm": 0.32420104269865235, "learning_rate": 9.252473433492122e-06, "loss": 0.2756, "step": 2528 }, { "epoch": 2.4992585269401877, "grad_norm": 0.30202549559626013, "learning_rate": 9.23415170392085e-06, "loss": 0.2649, "step": 2529 }, { "epoch": 2.500247157686604, "grad_norm": 0.3425811319663481, "learning_rate": 9.21582997434958e-06, "loss": 0.2909, "step": 2530 }, { "epoch": 2.5012357884330205, "grad_norm": 0.3324140292856398, "learning_rate": 9.197508244778308e-06, "loss": 0.2751, "step": 2531 }, { "epoch": 2.5022244191794365, "grad_norm": 0.3052740474401645, "learning_rate": 9.179186515207036e-06, "loss": 0.2901, "step": 2532 }, { "epoch": 2.5032130499258525, "grad_norm": 0.3063870078299709, "learning_rate": 9.160864785635765e-06, "loss": 0.2545, "step": 2533 }, { "epoch": 2.504201680672269, "grad_norm": 0.32525533681675955, "learning_rate": 9.142543056064493e-06, "loss": 0.2749, "step": 2534 }, { "epoch": 2.5051903114186853, "grad_norm": 0.3119432150078339, "learning_rate": 9.12422132649322e-06, "loss": 0.2805, "step": 2535 }, { "epoch": 2.5061789421651013, "grad_norm": 0.3032182860595709, "learning_rate": 9.10589959692195e-06, "loss": 0.2582, "step": 2536 }, { "epoch": 2.5071675729115177, "grad_norm": 0.3191459306327782, "learning_rate": 9.087577867350679e-06, "loss": 0.2953, "step": 2537 }, { "epoch": 2.5081562036579337, "grad_norm": 0.31277723215009356, "learning_rate": 9.069256137779407e-06, "loss": 0.293, "step": 2538 }, { "epoch": 2.50914483440435, "grad_norm": 0.3272650864884885, "learning_rate": 9.050934408208135e-06, "loss": 0.294, "step": 2539 }, { "epoch": 2.510133465150766, "grad_norm": 0.3240347681572221, "learning_rate": 9.032612678636863e-06, "loss": 0.2961, "step": 2540 }, { "epoch": 2.5111220958971825, "grad_norm": 0.32098487478133814, "learning_rate": 9.014290949065591e-06, "loss": 0.3086, "step": 2541 }, { "epoch": 2.5121107266435985, "grad_norm": 0.31549981448914943, "learning_rate": 8.995969219494321e-06, "loss": 0.3016, "step": 2542 }, { "epoch": 2.513099357390015, "grad_norm": 0.31858254501324246, "learning_rate": 8.977647489923049e-06, "loss": 0.2789, "step": 2543 }, { "epoch": 2.5140879881364313, "grad_norm": 0.30064984700041403, "learning_rate": 8.959325760351777e-06, "loss": 0.2712, "step": 2544 }, { "epoch": 2.5150766188828473, "grad_norm": 0.3585327666285049, "learning_rate": 8.941004030780505e-06, "loss": 0.3167, "step": 2545 }, { "epoch": 2.5160652496292633, "grad_norm": 0.3052779487455974, "learning_rate": 8.922682301209233e-06, "loss": 0.285, "step": 2546 }, { "epoch": 2.5170538803756797, "grad_norm": 0.31057089656738607, "learning_rate": 8.904360571637963e-06, "loss": 0.2807, "step": 2547 }, { "epoch": 2.518042511122096, "grad_norm": 0.32541036902377873, "learning_rate": 8.886038842066691e-06, "loss": 0.2856, "step": 2548 }, { "epoch": 2.519031141868512, "grad_norm": 0.3247393431284627, "learning_rate": 8.86771711249542e-06, "loss": 0.2583, "step": 2549 }, { "epoch": 2.5200197726149285, "grad_norm": 0.299261015778785, "learning_rate": 8.84939538292415e-06, "loss": 0.2955, "step": 2550 }, { "epoch": 2.5210084033613445, "grad_norm": 0.29241554383707935, "learning_rate": 8.831073653352877e-06, "loss": 0.2671, "step": 2551 }, { "epoch": 2.521997034107761, "grad_norm": 0.3078403003841863, "learning_rate": 8.812751923781605e-06, "loss": 0.2644, "step": 2552 }, { "epoch": 2.522985664854177, "grad_norm": 0.31440782068762757, "learning_rate": 8.794430194210334e-06, "loss": 0.2876, "step": 2553 }, { "epoch": 2.5239742956005933, "grad_norm": 0.31191149173438903, "learning_rate": 8.776108464639063e-06, "loss": 0.2765, "step": 2554 }, { "epoch": 2.5249629263470093, "grad_norm": 0.30669868723814125, "learning_rate": 8.757786735067792e-06, "loss": 0.2629, "step": 2555 }, { "epoch": 2.5259515570934257, "grad_norm": 0.3044238013064673, "learning_rate": 8.73946500549652e-06, "loss": 0.2637, "step": 2556 }, { "epoch": 2.526940187839842, "grad_norm": 0.3214906086905622, "learning_rate": 8.721143275925248e-06, "loss": 0.28, "step": 2557 }, { "epoch": 2.527928818586258, "grad_norm": 0.3184244126450762, "learning_rate": 8.702821546353976e-06, "loss": 0.2928, "step": 2558 }, { "epoch": 2.528917449332674, "grad_norm": 8.976042313952151, "learning_rate": 8.684499816782704e-06, "loss": 0.4582, "step": 2559 }, { "epoch": 2.5299060800790905, "grad_norm": 0.35616773054745837, "learning_rate": 8.666178087211434e-06, "loss": 0.289, "step": 2560 }, { "epoch": 2.530894710825507, "grad_norm": 0.3316808052918025, "learning_rate": 8.647856357640162e-06, "loss": 0.3031, "step": 2561 }, { "epoch": 2.531883341571923, "grad_norm": 0.3163496390342216, "learning_rate": 8.62953462806889e-06, "loss": 0.2665, "step": 2562 }, { "epoch": 2.532871972318339, "grad_norm": 0.29115954669892974, "learning_rate": 8.611212898497618e-06, "loss": 0.2715, "step": 2563 }, { "epoch": 2.5338606030647552, "grad_norm": 0.3365173458660393, "learning_rate": 8.592891168926346e-06, "loss": 0.2927, "step": 2564 }, { "epoch": 2.5348492338111717, "grad_norm": 0.33337092569542204, "learning_rate": 8.574569439355074e-06, "loss": 0.2752, "step": 2565 }, { "epoch": 2.5358378645575876, "grad_norm": 0.34283075666154017, "learning_rate": 8.556247709783804e-06, "loss": 0.3137, "step": 2566 }, { "epoch": 2.536826495304004, "grad_norm": 0.3249937869043709, "learning_rate": 8.537925980212532e-06, "loss": 0.2697, "step": 2567 }, { "epoch": 2.53781512605042, "grad_norm": 0.31671467622725424, "learning_rate": 8.51960425064126e-06, "loss": 0.2867, "step": 2568 }, { "epoch": 2.5388037567968365, "grad_norm": 0.3328714908277569, "learning_rate": 8.501282521069988e-06, "loss": 0.2819, "step": 2569 }, { "epoch": 2.539792387543253, "grad_norm": 0.3506131478653219, "learning_rate": 8.482960791498718e-06, "loss": 0.2952, "step": 2570 }, { "epoch": 2.540781018289669, "grad_norm": 0.3221572320808237, "learning_rate": 8.464639061927446e-06, "loss": 0.2574, "step": 2571 }, { "epoch": 2.541769649036085, "grad_norm": 0.31368802002205787, "learning_rate": 8.446317332356175e-06, "loss": 0.2655, "step": 2572 }, { "epoch": 2.5427582797825012, "grad_norm": 0.9199245788536203, "learning_rate": 8.427995602784904e-06, "loss": 0.302, "step": 2573 }, { "epoch": 2.5437469105289177, "grad_norm": 0.33908922474426684, "learning_rate": 8.409673873213632e-06, "loss": 0.3151, "step": 2574 }, { "epoch": 2.5447355412753336, "grad_norm": 0.343598731668652, "learning_rate": 8.39135214364236e-06, "loss": 0.27, "step": 2575 }, { "epoch": 2.5457241720217496, "grad_norm": 0.32640981335407493, "learning_rate": 8.373030414071089e-06, "loss": 0.2747, "step": 2576 }, { "epoch": 2.546712802768166, "grad_norm": 0.3288403308419439, "learning_rate": 8.354708684499817e-06, "loss": 0.3017, "step": 2577 }, { "epoch": 2.5477014335145824, "grad_norm": 0.316101634967927, "learning_rate": 8.336386954928547e-06, "loss": 0.2762, "step": 2578 }, { "epoch": 2.5486900642609984, "grad_norm": 0.36155888043656026, "learning_rate": 8.318065225357275e-06, "loss": 0.2713, "step": 2579 }, { "epoch": 2.549678695007415, "grad_norm": 0.31381320306887706, "learning_rate": 8.299743495786003e-06, "loss": 0.2649, "step": 2580 }, { "epoch": 2.550667325753831, "grad_norm": 0.3255755412079104, "learning_rate": 8.281421766214731e-06, "loss": 0.2891, "step": 2581 }, { "epoch": 2.5516559565002472, "grad_norm": 0.30113061207598246, "learning_rate": 8.263100036643459e-06, "loss": 0.2808, "step": 2582 }, { "epoch": 2.552644587246663, "grad_norm": 0.2941843792083445, "learning_rate": 8.244778307072187e-06, "loss": 0.239, "step": 2583 }, { "epoch": 2.5536332179930796, "grad_norm": 0.29485903364446775, "learning_rate": 8.226456577500917e-06, "loss": 0.2871, "step": 2584 }, { "epoch": 2.5546218487394956, "grad_norm": 0.3262254836931687, "learning_rate": 8.208134847929645e-06, "loss": 0.2718, "step": 2585 }, { "epoch": 2.555610479485912, "grad_norm": 0.30972143066243957, "learning_rate": 8.189813118358373e-06, "loss": 0.2744, "step": 2586 }, { "epoch": 2.5565991102323284, "grad_norm": 0.29934557247516674, "learning_rate": 8.171491388787101e-06, "loss": 0.2609, "step": 2587 }, { "epoch": 2.5575877409787444, "grad_norm": 0.2927344401820182, "learning_rate": 8.15316965921583e-06, "loss": 0.2697, "step": 2588 }, { "epoch": 2.5585763717251604, "grad_norm": 0.34524467876533405, "learning_rate": 8.134847929644557e-06, "loss": 0.2866, "step": 2589 }, { "epoch": 2.559565002471577, "grad_norm": 1.013125486574032, "learning_rate": 8.116526200073287e-06, "loss": 0.299, "step": 2590 }, { "epoch": 2.5605536332179932, "grad_norm": 0.3452870350593378, "learning_rate": 8.098204470502015e-06, "loss": 0.3271, "step": 2591 }, { "epoch": 2.561542263964409, "grad_norm": 0.3101299818763407, "learning_rate": 8.079882740930744e-06, "loss": 0.2647, "step": 2592 }, { "epoch": 2.5625308947108256, "grad_norm": 0.3047560484495522, "learning_rate": 8.061561011359473e-06, "loss": 0.2777, "step": 2593 }, { "epoch": 2.5635195254572416, "grad_norm": 0.3027974993832518, "learning_rate": 8.043239281788201e-06, "loss": 0.2799, "step": 2594 }, { "epoch": 2.564508156203658, "grad_norm": 0.301174157770394, "learning_rate": 8.02491755221693e-06, "loss": 0.2648, "step": 2595 }, { "epoch": 2.565496786950074, "grad_norm": 0.3120117564772233, "learning_rate": 8.00659582264566e-06, "loss": 0.2967, "step": 2596 }, { "epoch": 2.5664854176964904, "grad_norm": 0.3310996959952828, "learning_rate": 7.988274093074387e-06, "loss": 0.2716, "step": 2597 }, { "epoch": 2.5674740484429064, "grad_norm": 0.3106284453871339, "learning_rate": 7.969952363503116e-06, "loss": 0.2781, "step": 2598 }, { "epoch": 2.568462679189323, "grad_norm": 0.3208642289090898, "learning_rate": 7.951630633931844e-06, "loss": 0.296, "step": 2599 }, { "epoch": 2.5694513099357392, "grad_norm": 0.3708288699828045, "learning_rate": 7.933308904360572e-06, "loss": 0.3128, "step": 2600 }, { "epoch": 2.570439940682155, "grad_norm": 0.3339403109809998, "learning_rate": 7.9149871747893e-06, "loss": 0.2911, "step": 2601 }, { "epoch": 2.571428571428571, "grad_norm": 0.33782674365244636, "learning_rate": 7.89666544521803e-06, "loss": 0.2707, "step": 2602 }, { "epoch": 2.5724172021749876, "grad_norm": 0.3432380924147312, "learning_rate": 7.878343715646758e-06, "loss": 0.2937, "step": 2603 }, { "epoch": 2.573405832921404, "grad_norm": 0.3163414117087477, "learning_rate": 7.860021986075486e-06, "loss": 0.2881, "step": 2604 }, { "epoch": 2.57439446366782, "grad_norm": 0.34327320825167656, "learning_rate": 7.841700256504214e-06, "loss": 0.3111, "step": 2605 }, { "epoch": 2.5753830944142364, "grad_norm": 0.3233618558750646, "learning_rate": 7.823378526932942e-06, "loss": 0.2889, "step": 2606 }, { "epoch": 2.5763717251606524, "grad_norm": 0.29402531847440083, "learning_rate": 7.805056797361672e-06, "loss": 0.2785, "step": 2607 }, { "epoch": 2.577360355907069, "grad_norm": 0.3056889968036359, "learning_rate": 7.7867350677904e-06, "loss": 0.2732, "step": 2608 }, { "epoch": 2.5783489866534848, "grad_norm": 0.32700691333116977, "learning_rate": 7.768413338219128e-06, "loss": 0.2715, "step": 2609 }, { "epoch": 2.579337617399901, "grad_norm": 0.2941882915234888, "learning_rate": 7.750091608647856e-06, "loss": 0.2664, "step": 2610 }, { "epoch": 2.580326248146317, "grad_norm": 0.27863819169737636, "learning_rate": 7.731769879076584e-06, "loss": 0.2581, "step": 2611 }, { "epoch": 2.5813148788927336, "grad_norm": 0.3013311673741528, "learning_rate": 7.713448149505313e-06, "loss": 0.3048, "step": 2612 }, { "epoch": 2.58230350963915, "grad_norm": 0.3211019510846337, "learning_rate": 7.695126419934042e-06, "loss": 0.2973, "step": 2613 }, { "epoch": 2.583292140385566, "grad_norm": 0.2922780682840003, "learning_rate": 7.67680469036277e-06, "loss": 0.2691, "step": 2614 }, { "epoch": 2.584280771131982, "grad_norm": 0.30770493776999386, "learning_rate": 7.658482960791499e-06, "loss": 0.2866, "step": 2615 }, { "epoch": 2.5852694018783984, "grad_norm": 0.3043175190359127, "learning_rate": 7.640161231220228e-06, "loss": 0.2996, "step": 2616 }, { "epoch": 2.586258032624815, "grad_norm": 0.3158147495037161, "learning_rate": 7.621839501648956e-06, "loss": 0.3083, "step": 2617 }, { "epoch": 2.5872466633712308, "grad_norm": 0.3334325665658252, "learning_rate": 7.603517772077684e-06, "loss": 0.2964, "step": 2618 }, { "epoch": 2.588235294117647, "grad_norm": 0.3274990211234987, "learning_rate": 7.5851960425064135e-06, "loss": 0.2788, "step": 2619 }, { "epoch": 2.589223924864063, "grad_norm": 0.2952399361488048, "learning_rate": 7.566874312935142e-06, "loss": 0.2795, "step": 2620 }, { "epoch": 2.5902125556104796, "grad_norm": 0.30513983843132403, "learning_rate": 7.54855258336387e-06, "loss": 0.2862, "step": 2621 }, { "epoch": 2.5912011863568956, "grad_norm": 0.3431173977139085, "learning_rate": 7.530230853792599e-06, "loss": 0.2975, "step": 2622 }, { "epoch": 2.592189817103312, "grad_norm": 0.34342824248943415, "learning_rate": 7.511909124221327e-06, "loss": 0.2932, "step": 2623 }, { "epoch": 2.593178447849728, "grad_norm": 0.30772046607803927, "learning_rate": 7.493587394650055e-06, "loss": 0.2756, "step": 2624 }, { "epoch": 2.5941670785961444, "grad_norm": 0.3014901066323891, "learning_rate": 7.475265665078785e-06, "loss": 0.2865, "step": 2625 }, { "epoch": 2.595155709342561, "grad_norm": 0.32070292761595176, "learning_rate": 7.456943935507513e-06, "loss": 0.2825, "step": 2626 }, { "epoch": 2.5961443400889768, "grad_norm": 0.3251322140735928, "learning_rate": 7.438622205936241e-06, "loss": 0.2817, "step": 2627 }, { "epoch": 2.5971329708353927, "grad_norm": 0.33337714413521063, "learning_rate": 7.420300476364969e-06, "loss": 0.2937, "step": 2628 }, { "epoch": 2.598121601581809, "grad_norm": 0.29146692259228896, "learning_rate": 7.401978746793697e-06, "loss": 0.2912, "step": 2629 }, { "epoch": 2.5991102323282256, "grad_norm": 0.316170166018179, "learning_rate": 7.383657017222425e-06, "loss": 0.3032, "step": 2630 }, { "epoch": 2.6000988630746416, "grad_norm": 0.32083474651873917, "learning_rate": 7.365335287651155e-06, "loss": 0.296, "step": 2631 }, { "epoch": 2.601087493821058, "grad_norm": 0.32325437226350506, "learning_rate": 7.347013558079883e-06, "loss": 0.2627, "step": 2632 }, { "epoch": 2.602076124567474, "grad_norm": 0.3010962708882428, "learning_rate": 7.328691828508611e-06, "loss": 0.2801, "step": 2633 }, { "epoch": 2.6030647553138904, "grad_norm": 0.29279488124455455, "learning_rate": 7.31037009893734e-06, "loss": 0.2768, "step": 2634 }, { "epoch": 2.6040533860603063, "grad_norm": 0.3102591356796067, "learning_rate": 7.292048369366068e-06, "loss": 0.2727, "step": 2635 }, { "epoch": 2.6050420168067228, "grad_norm": 0.3031981422545899, "learning_rate": 7.2737266397947965e-06, "loss": 0.2751, "step": 2636 }, { "epoch": 2.6060306475531387, "grad_norm": 0.308976392033661, "learning_rate": 7.255404910223526e-06, "loss": 0.2759, "step": 2637 }, { "epoch": 2.607019278299555, "grad_norm": 0.3204197457193874, "learning_rate": 7.2370831806522544e-06, "loss": 0.2784, "step": 2638 }, { "epoch": 2.6080079090459716, "grad_norm": 0.34528850352110446, "learning_rate": 7.2187614510809825e-06, "loss": 0.3084, "step": 2639 }, { "epoch": 2.6089965397923875, "grad_norm": 0.3150014447656134, "learning_rate": 7.200439721509711e-06, "loss": 0.2972, "step": 2640 }, { "epoch": 2.6099851705388035, "grad_norm": 0.37642029885751715, "learning_rate": 7.182117991938439e-06, "loss": 0.2828, "step": 2641 }, { "epoch": 2.61097380128522, "grad_norm": 0.3039827388335417, "learning_rate": 7.163796262367167e-06, "loss": 0.2674, "step": 2642 }, { "epoch": 2.6119624320316364, "grad_norm": 0.3475953349615929, "learning_rate": 7.145474532795897e-06, "loss": 0.2812, "step": 2643 }, { "epoch": 2.6129510627780523, "grad_norm": 0.3274633357798315, "learning_rate": 7.127152803224625e-06, "loss": 0.2721, "step": 2644 }, { "epoch": 2.6139396935244688, "grad_norm": 0.320777153944434, "learning_rate": 7.108831073653354e-06, "loss": 0.2783, "step": 2645 }, { "epoch": 2.6149283242708847, "grad_norm": 0.3389660776084452, "learning_rate": 7.090509344082082e-06, "loss": 0.325, "step": 2646 }, { "epoch": 2.615916955017301, "grad_norm": 0.34169444615287276, "learning_rate": 7.07218761451081e-06, "loss": 0.2863, "step": 2647 }, { "epoch": 2.616905585763717, "grad_norm": 0.34613267020192295, "learning_rate": 7.053865884939538e-06, "loss": 0.3015, "step": 2648 }, { "epoch": 2.6178942165101335, "grad_norm": 0.3291669460131398, "learning_rate": 7.035544155368268e-06, "loss": 0.2799, "step": 2649 }, { "epoch": 2.6188828472565495, "grad_norm": 0.29546741227293477, "learning_rate": 7.017222425796996e-06, "loss": 0.2833, "step": 2650 }, { "epoch": 2.619871478002966, "grad_norm": 0.3119680934234821, "learning_rate": 6.998900696225724e-06, "loss": 0.2617, "step": 2651 }, { "epoch": 2.6208601087493824, "grad_norm": 0.3079286125776011, "learning_rate": 6.980578966654452e-06, "loss": 0.2611, "step": 2652 }, { "epoch": 2.6218487394957983, "grad_norm": 0.31733622354200564, "learning_rate": 6.96225723708318e-06, "loss": 0.2639, "step": 2653 }, { "epoch": 2.6228373702422143, "grad_norm": 0.3258297584820773, "learning_rate": 6.943935507511909e-06, "loss": 0.3066, "step": 2654 }, { "epoch": 2.6238260009886307, "grad_norm": 0.3216831193898818, "learning_rate": 6.925613777940638e-06, "loss": 0.274, "step": 2655 }, { "epoch": 2.624814631735047, "grad_norm": 0.3060936021918322, "learning_rate": 6.907292048369366e-06, "loss": 0.2972, "step": 2656 }, { "epoch": 2.625803262481463, "grad_norm": 0.30271837250635975, "learning_rate": 6.888970318798095e-06, "loss": 0.278, "step": 2657 }, { "epoch": 2.6267918932278795, "grad_norm": 0.3426774758658574, "learning_rate": 6.8706485892268234e-06, "loss": 0.2936, "step": 2658 }, { "epoch": 2.6277805239742955, "grad_norm": 0.3299082234305938, "learning_rate": 6.8523268596555516e-06, "loss": 0.2782, "step": 2659 }, { "epoch": 2.628769154720712, "grad_norm": 0.3033900205329443, "learning_rate": 6.83400513008428e-06, "loss": 0.2854, "step": 2660 }, { "epoch": 2.629757785467128, "grad_norm": 0.3265503435401548, "learning_rate": 6.8156834005130095e-06, "loss": 0.2985, "step": 2661 }, { "epoch": 2.6307464162135443, "grad_norm": 0.691707581326077, "learning_rate": 6.797361670941738e-06, "loss": 0.2899, "step": 2662 }, { "epoch": 2.6317350469599603, "grad_norm": 0.31063874680769077, "learning_rate": 6.779039941370466e-06, "loss": 0.2865, "step": 2663 }, { "epoch": 2.6327236777063767, "grad_norm": 0.3249445938996524, "learning_rate": 6.760718211799194e-06, "loss": 0.2716, "step": 2664 }, { "epoch": 2.633712308452793, "grad_norm": 0.303764761798986, "learning_rate": 6.742396482227922e-06, "loss": 0.2498, "step": 2665 }, { "epoch": 2.634700939199209, "grad_norm": 0.317059754847548, "learning_rate": 6.724074752656651e-06, "loss": 0.2499, "step": 2666 }, { "epoch": 2.635689569945625, "grad_norm": 0.30501149153694807, "learning_rate": 6.70575302308538e-06, "loss": 0.2915, "step": 2667 }, { "epoch": 2.6366782006920415, "grad_norm": 0.3126897209592263, "learning_rate": 6.687431293514109e-06, "loss": 0.284, "step": 2668 }, { "epoch": 2.637666831438458, "grad_norm": 0.30016647200239577, "learning_rate": 6.669109563942837e-06, "loss": 0.2741, "step": 2669 }, { "epoch": 2.638655462184874, "grad_norm": 0.31930825620124975, "learning_rate": 6.650787834371565e-06, "loss": 0.2874, "step": 2670 }, { "epoch": 2.6396440929312903, "grad_norm": 0.3038798329592333, "learning_rate": 6.632466104800293e-06, "loss": 0.2651, "step": 2671 }, { "epoch": 2.6406327236777063, "grad_norm": 0.2721960222877931, "learning_rate": 6.614144375229021e-06, "loss": 0.2558, "step": 2672 }, { "epoch": 2.6416213544241227, "grad_norm": 0.2959145842755548, "learning_rate": 6.595822645657751e-06, "loss": 0.2585, "step": 2673 }, { "epoch": 2.6426099851705387, "grad_norm": 0.30512704489216913, "learning_rate": 6.577500916086479e-06, "loss": 0.2818, "step": 2674 }, { "epoch": 2.643598615916955, "grad_norm": 0.33358084872382526, "learning_rate": 6.559179186515207e-06, "loss": 0.2913, "step": 2675 }, { "epoch": 2.644587246663371, "grad_norm": 0.3024186878760609, "learning_rate": 6.540857456943935e-06, "loss": 0.2748, "step": 2676 }, { "epoch": 2.6455758774097875, "grad_norm": 0.316593912632956, "learning_rate": 6.522535727372664e-06, "loss": 0.2872, "step": 2677 }, { "epoch": 2.646564508156204, "grad_norm": 0.3003491744096471, "learning_rate": 6.5042139978013925e-06, "loss": 0.2808, "step": 2678 }, { "epoch": 2.64755313890262, "grad_norm": 0.42995959947123835, "learning_rate": 6.485892268230121e-06, "loss": 0.2953, "step": 2679 }, { "epoch": 2.648541769649036, "grad_norm": 0.32568532257525984, "learning_rate": 6.46757053865885e-06, "loss": 0.2921, "step": 2680 }, { "epoch": 2.6495304003954523, "grad_norm": 0.3070991878832122, "learning_rate": 6.4492488090875785e-06, "loss": 0.2854, "step": 2681 }, { "epoch": 2.6505190311418687, "grad_norm": 0.2951808809020992, "learning_rate": 6.430927079516307e-06, "loss": 0.2728, "step": 2682 }, { "epoch": 2.6515076618882847, "grad_norm": 0.29655124249506976, "learning_rate": 6.412605349945035e-06, "loss": 0.2813, "step": 2683 }, { "epoch": 2.652496292634701, "grad_norm": 0.31579785580798697, "learning_rate": 6.394283620373763e-06, "loss": 0.2844, "step": 2684 }, { "epoch": 2.653484923381117, "grad_norm": 0.3081110657515338, "learning_rate": 6.375961890802493e-06, "loss": 0.2994, "step": 2685 }, { "epoch": 2.6544735541275335, "grad_norm": 0.2962983681194998, "learning_rate": 6.357640161231221e-06, "loss": 0.2913, "step": 2686 }, { "epoch": 2.6554621848739495, "grad_norm": 0.2898951718807583, "learning_rate": 6.339318431659949e-06, "loss": 0.2665, "step": 2687 }, { "epoch": 2.656450815620366, "grad_norm": 0.2890457813360233, "learning_rate": 6.320996702088677e-06, "loss": 0.2897, "step": 2688 }, { "epoch": 2.657439446366782, "grad_norm": 0.3329670284018998, "learning_rate": 6.302674972517406e-06, "loss": 0.3233, "step": 2689 }, { "epoch": 2.6584280771131983, "grad_norm": 0.3342070409771942, "learning_rate": 6.284353242946134e-06, "loss": 0.3048, "step": 2690 }, { "epoch": 2.6594167078596147, "grad_norm": 0.31406661453040885, "learning_rate": 6.266031513374864e-06, "loss": 0.2602, "step": 2691 }, { "epoch": 2.6604053386060307, "grad_norm": 0.2951339112535458, "learning_rate": 6.247709783803592e-06, "loss": 0.28, "step": 2692 }, { "epoch": 2.6613939693524467, "grad_norm": 0.3411187244850766, "learning_rate": 6.22938805423232e-06, "loss": 0.2845, "step": 2693 }, { "epoch": 2.662382600098863, "grad_norm": 0.3227447977007505, "learning_rate": 6.211066324661048e-06, "loss": 0.2973, "step": 2694 }, { "epoch": 2.6633712308452795, "grad_norm": 0.3062512801393021, "learning_rate": 6.192744595089777e-06, "loss": 0.2859, "step": 2695 }, { "epoch": 2.6643598615916955, "grad_norm": 0.3230420006680011, "learning_rate": 6.174422865518505e-06, "loss": 0.2808, "step": 2696 }, { "epoch": 2.6653484923381114, "grad_norm": 0.3270940334644268, "learning_rate": 6.156101135947233e-06, "loss": 0.2909, "step": 2697 }, { "epoch": 2.666337123084528, "grad_norm": 0.29480162519408437, "learning_rate": 6.137779406375962e-06, "loss": 0.27, "step": 2698 }, { "epoch": 2.6673257538309443, "grad_norm": 0.3058018623817684, "learning_rate": 6.1194576768046904e-06, "loss": 0.2853, "step": 2699 }, { "epoch": 2.6683143845773603, "grad_norm": 0.3011188507016692, "learning_rate": 6.101135947233419e-06, "loss": 0.2735, "step": 2700 }, { "epoch": 2.6693030153237767, "grad_norm": 0.3311460882960713, "learning_rate": 6.0828142176621475e-06, "loss": 0.2696, "step": 2701 }, { "epoch": 2.6702916460701926, "grad_norm": 0.3347937313000275, "learning_rate": 6.0644924880908765e-06, "loss": 0.2992, "step": 2702 }, { "epoch": 2.671280276816609, "grad_norm": 0.32025084970425927, "learning_rate": 6.0461707585196046e-06, "loss": 0.2983, "step": 2703 }, { "epoch": 2.6722689075630255, "grad_norm": 0.29700714484221685, "learning_rate": 6.0278490289483335e-06, "loss": 0.273, "step": 2704 }, { "epoch": 2.6732575383094415, "grad_norm": 0.3497052748212935, "learning_rate": 6.009527299377062e-06, "loss": 0.2987, "step": 2705 }, { "epoch": 2.6742461690558574, "grad_norm": 0.3415703694470868, "learning_rate": 5.99120556980579e-06, "loss": 0.2509, "step": 2706 }, { "epoch": 2.675234799802274, "grad_norm": 0.3520787494669564, "learning_rate": 5.972883840234519e-06, "loss": 0.304, "step": 2707 }, { "epoch": 2.6762234305486903, "grad_norm": 0.2988731035057533, "learning_rate": 5.954562110663247e-06, "loss": 0.2691, "step": 2708 }, { "epoch": 2.6772120612951062, "grad_norm": 0.2980819163641365, "learning_rate": 5.936240381091975e-06, "loss": 0.2796, "step": 2709 }, { "epoch": 2.6782006920415222, "grad_norm": 0.32906160461653505, "learning_rate": 5.917918651520704e-06, "loss": 0.2947, "step": 2710 }, { "epoch": 2.6791893227879386, "grad_norm": 0.33125324537672834, "learning_rate": 5.899596921949432e-06, "loss": 0.3132, "step": 2711 }, { "epoch": 2.680177953534355, "grad_norm": 0.3249654575763053, "learning_rate": 5.881275192378161e-06, "loss": 0.3188, "step": 2712 }, { "epoch": 2.681166584280771, "grad_norm": 0.3515434358481219, "learning_rate": 5.86295346280689e-06, "loss": 0.337, "step": 2713 }, { "epoch": 2.6821552150271875, "grad_norm": 0.43241991953364783, "learning_rate": 5.844631733235618e-06, "loss": 0.2814, "step": 2714 }, { "epoch": 2.6831438457736034, "grad_norm": 0.30799054307489926, "learning_rate": 5.826310003664346e-06, "loss": 0.2907, "step": 2715 }, { "epoch": 2.68413247652002, "grad_norm": 0.29910262119525627, "learning_rate": 5.807988274093075e-06, "loss": 0.2834, "step": 2716 }, { "epoch": 2.685121107266436, "grad_norm": 0.35325903940541264, "learning_rate": 5.789666544521803e-06, "loss": 0.303, "step": 2717 }, { "epoch": 2.6861097380128522, "grad_norm": 0.3278857867262599, "learning_rate": 5.771344814950531e-06, "loss": 0.28, "step": 2718 }, { "epoch": 2.687098368759268, "grad_norm": 0.307020248771263, "learning_rate": 5.75302308537926e-06, "loss": 0.2989, "step": 2719 }, { "epoch": 2.6880869995056846, "grad_norm": 0.3340716105070889, "learning_rate": 5.734701355807988e-06, "loss": 0.3097, "step": 2720 }, { "epoch": 2.689075630252101, "grad_norm": 0.30523033093524155, "learning_rate": 5.7163796262367165e-06, "loss": 0.2593, "step": 2721 }, { "epoch": 2.690064260998517, "grad_norm": 0.28899840953033434, "learning_rate": 5.6980578966654455e-06, "loss": 0.2635, "step": 2722 }, { "epoch": 2.691052891744933, "grad_norm": 0.32886062010904576, "learning_rate": 5.679736167094174e-06, "loss": 0.2991, "step": 2723 }, { "epoch": 2.6920415224913494, "grad_norm": 0.3036999746435608, "learning_rate": 5.6614144375229025e-06, "loss": 0.2976, "step": 2724 }, { "epoch": 2.693030153237766, "grad_norm": 0.3223306354319073, "learning_rate": 5.6430927079516315e-06, "loss": 0.2876, "step": 2725 }, { "epoch": 2.694018783984182, "grad_norm": 0.3079834197905964, "learning_rate": 5.62477097838036e-06, "loss": 0.2868, "step": 2726 }, { "epoch": 2.6950074147305982, "grad_norm": 0.30680672608980813, "learning_rate": 5.606449248809088e-06, "loss": 0.2903, "step": 2727 }, { "epoch": 2.695996045477014, "grad_norm": 0.31408716284482846, "learning_rate": 5.588127519237817e-06, "loss": 0.2824, "step": 2728 }, { "epoch": 2.6969846762234306, "grad_norm": 0.49501145527353263, "learning_rate": 5.569805789666545e-06, "loss": 0.307, "step": 2729 }, { "epoch": 2.6979733069698466, "grad_norm": 0.30281030812738857, "learning_rate": 5.551484060095273e-06, "loss": 0.2799, "step": 2730 }, { "epoch": 2.698961937716263, "grad_norm": 0.29747182118476384, "learning_rate": 5.533162330524002e-06, "loss": 0.2851, "step": 2731 }, { "epoch": 2.699950568462679, "grad_norm": 0.29881061871379, "learning_rate": 5.51484060095273e-06, "loss": 0.2742, "step": 2732 }, { "epoch": 2.7009391992090954, "grad_norm": 0.30889029423172204, "learning_rate": 5.496518871381458e-06, "loss": 0.2576, "step": 2733 }, { "epoch": 2.701927829955512, "grad_norm": 0.32366645808770395, "learning_rate": 5.478197141810187e-06, "loss": 0.3046, "step": 2734 }, { "epoch": 2.702916460701928, "grad_norm": 0.2990448597915902, "learning_rate": 5.459875412238916e-06, "loss": 0.3207, "step": 2735 }, { "epoch": 2.703905091448344, "grad_norm": 0.3024723742965304, "learning_rate": 5.441553682667644e-06, "loss": 0.3012, "step": 2736 }, { "epoch": 2.70489372219476, "grad_norm": 0.29439025854613593, "learning_rate": 5.423231953096373e-06, "loss": 0.2647, "step": 2737 }, { "epoch": 2.7058823529411766, "grad_norm": 0.3063881365518898, "learning_rate": 5.404910223525101e-06, "loss": 0.2809, "step": 2738 }, { "epoch": 2.7068709836875926, "grad_norm": 0.30369628955365363, "learning_rate": 5.386588493953829e-06, "loss": 0.2733, "step": 2739 }, { "epoch": 2.707859614434009, "grad_norm": 0.33591527244160957, "learning_rate": 5.368266764382558e-06, "loss": 0.3027, "step": 2740 }, { "epoch": 2.708848245180425, "grad_norm": 0.30680918555726916, "learning_rate": 5.349945034811286e-06, "loss": 0.2758, "step": 2741 }, { "epoch": 2.7098368759268414, "grad_norm": 0.2813078338013892, "learning_rate": 5.3316233052400145e-06, "loss": 0.2538, "step": 2742 }, { "epoch": 2.7108255066732574, "grad_norm": 0.29870335838456863, "learning_rate": 5.3133015756687434e-06, "loss": 0.2868, "step": 2743 }, { "epoch": 2.711814137419674, "grad_norm": 0.2867588454105059, "learning_rate": 5.2949798460974715e-06, "loss": 0.2628, "step": 2744 }, { "epoch": 2.71280276816609, "grad_norm": 0.3111382878896453, "learning_rate": 5.2766581165262005e-06, "loss": 0.284, "step": 2745 }, { "epoch": 2.713791398912506, "grad_norm": 0.2782360058801156, "learning_rate": 5.258336386954929e-06, "loss": 0.2568, "step": 2746 }, { "epoch": 2.7147800296589226, "grad_norm": 0.3125095605993743, "learning_rate": 5.2400146573836576e-06, "loss": 0.3131, "step": 2747 }, { "epoch": 2.7157686604053386, "grad_norm": 0.3096073941456396, "learning_rate": 5.221692927812386e-06, "loss": 0.277, "step": 2748 }, { "epoch": 2.7167572911517546, "grad_norm": 0.3039242318592737, "learning_rate": 5.203371198241115e-06, "loss": 0.2797, "step": 2749 }, { "epoch": 2.717745921898171, "grad_norm": 0.308809474714669, "learning_rate": 5.185049468669843e-06, "loss": 0.2782, "step": 2750 }, { "epoch": 2.7187345526445874, "grad_norm": 0.30715752564299836, "learning_rate": 5.166727739098571e-06, "loss": 0.3124, "step": 2751 }, { "epoch": 2.7197231833910034, "grad_norm": 0.3208141712493757, "learning_rate": 5.1484060095273e-06, "loss": 0.2924, "step": 2752 }, { "epoch": 2.72071181413742, "grad_norm": 0.30497585552278034, "learning_rate": 5.130084279956028e-06, "loss": 0.2984, "step": 2753 }, { "epoch": 2.721700444883836, "grad_norm": 0.2919143101099683, "learning_rate": 5.111762550384756e-06, "loss": 0.2849, "step": 2754 }, { "epoch": 2.722689075630252, "grad_norm": 0.2763961920970215, "learning_rate": 5.093440820813485e-06, "loss": 0.2549, "step": 2755 }, { "epoch": 2.723677706376668, "grad_norm": 0.30196158356093095, "learning_rate": 5.075119091242213e-06, "loss": 0.2689, "step": 2756 }, { "epoch": 2.7246663371230846, "grad_norm": 0.329953953765354, "learning_rate": 5.056797361670942e-06, "loss": 0.3052, "step": 2757 }, { "epoch": 2.7256549678695006, "grad_norm": 0.303388467253725, "learning_rate": 5.038475632099671e-06, "loss": 0.3083, "step": 2758 }, { "epoch": 2.726643598615917, "grad_norm": 0.29072342701813175, "learning_rate": 5.020153902528399e-06, "loss": 0.2702, "step": 2759 }, { "epoch": 2.7276322293623334, "grad_norm": 0.32416410456824696, "learning_rate": 5.001832172957127e-06, "loss": 0.289, "step": 2760 }, { "epoch": 2.7286208601087494, "grad_norm": 0.2901154413919831, "learning_rate": 4.983510443385856e-06, "loss": 0.2651, "step": 2761 }, { "epoch": 2.7296094908551654, "grad_norm": 0.3259120562375597, "learning_rate": 4.965188713814584e-06, "loss": 0.276, "step": 2762 }, { "epoch": 2.7305981216015818, "grad_norm": 0.2888623489431081, "learning_rate": 4.9468669842433124e-06, "loss": 0.3009, "step": 2763 }, { "epoch": 2.731586752347998, "grad_norm": 0.3009954455497413, "learning_rate": 4.928545254672041e-06, "loss": 0.2968, "step": 2764 }, { "epoch": 2.732575383094414, "grad_norm": 0.5336422998216938, "learning_rate": 4.9102235251007695e-06, "loss": 0.3035, "step": 2765 }, { "epoch": 2.7335640138408306, "grad_norm": 2.2800076942780145, "learning_rate": 4.891901795529498e-06, "loss": 0.3332, "step": 2766 }, { "epoch": 2.7345526445872466, "grad_norm": 0.2940705145609994, "learning_rate": 4.873580065958227e-06, "loss": 0.2616, "step": 2767 }, { "epoch": 2.735541275333663, "grad_norm": 0.31719350026302434, "learning_rate": 4.8552583363869555e-06, "loss": 0.2905, "step": 2768 }, { "epoch": 2.736529906080079, "grad_norm": 0.32838230951685377, "learning_rate": 4.836936606815684e-06, "loss": 0.2866, "step": 2769 }, { "epoch": 2.7375185368264954, "grad_norm": 0.30453297452077094, "learning_rate": 4.818614877244413e-06, "loss": 0.2758, "step": 2770 }, { "epoch": 2.7385071675729113, "grad_norm": 0.2832893952970659, "learning_rate": 4.800293147673141e-06, "loss": 0.2323, "step": 2771 }, { "epoch": 2.7394957983193278, "grad_norm": 0.304211339237199, "learning_rate": 4.781971418101869e-06, "loss": 0.2807, "step": 2772 }, { "epoch": 2.740484429065744, "grad_norm": 0.2815321012974396, "learning_rate": 4.763649688530598e-06, "loss": 0.269, "step": 2773 }, { "epoch": 2.74147305981216, "grad_norm": 0.31381715929839665, "learning_rate": 4.745327958959326e-06, "loss": 0.2725, "step": 2774 }, { "epoch": 2.742461690558576, "grad_norm": 0.33219832423903867, "learning_rate": 4.727006229388054e-06, "loss": 0.2897, "step": 2775 }, { "epoch": 2.7434503213049926, "grad_norm": 0.3239846259448096, "learning_rate": 4.708684499816783e-06, "loss": 0.2994, "step": 2776 }, { "epoch": 2.744438952051409, "grad_norm": 0.30709592870919855, "learning_rate": 4.690362770245511e-06, "loss": 0.3085, "step": 2777 }, { "epoch": 2.745427582797825, "grad_norm": 0.30500417099689064, "learning_rate": 4.672041040674239e-06, "loss": 0.2936, "step": 2778 }, { "epoch": 2.7464162135442414, "grad_norm": 0.3237981226816196, "learning_rate": 4.653719311102968e-06, "loss": 0.31, "step": 2779 }, { "epoch": 2.7474048442906573, "grad_norm": 0.30809995259166034, "learning_rate": 4.635397581531697e-06, "loss": 0.2806, "step": 2780 }, { "epoch": 2.7483934750370738, "grad_norm": 0.2965603607602377, "learning_rate": 4.617075851960425e-06, "loss": 0.2955, "step": 2781 }, { "epoch": 2.7493821057834897, "grad_norm": 0.29483983169024547, "learning_rate": 4.598754122389154e-06, "loss": 0.2831, "step": 2782 }, { "epoch": 2.750370736529906, "grad_norm": 0.3014381041937235, "learning_rate": 4.580432392817882e-06, "loss": 0.2989, "step": 2783 }, { "epoch": 2.751359367276322, "grad_norm": 0.32087829546355895, "learning_rate": 4.56211066324661e-06, "loss": 0.2775, "step": 2784 }, { "epoch": 2.7523479980227386, "grad_norm": 0.29683904012448953, "learning_rate": 4.543788933675339e-06, "loss": 0.2843, "step": 2785 }, { "epoch": 2.753336628769155, "grad_norm": 0.28605057413446605, "learning_rate": 4.5254672041040675e-06, "loss": 0.2885, "step": 2786 }, { "epoch": 2.754325259515571, "grad_norm": 0.48666263734794, "learning_rate": 4.507145474532796e-06, "loss": 0.2704, "step": 2787 }, { "epoch": 2.755313890261987, "grad_norm": 0.331248566505627, "learning_rate": 4.4888237449615246e-06, "loss": 0.292, "step": 2788 }, { "epoch": 2.7563025210084033, "grad_norm": 0.29946630838737504, "learning_rate": 4.470502015390253e-06, "loss": 0.2677, "step": 2789 }, { "epoch": 2.7572911517548198, "grad_norm": 0.3053906963001733, "learning_rate": 4.452180285818982e-06, "loss": 0.2709, "step": 2790 }, { "epoch": 2.7582797825012357, "grad_norm": 0.3171353240144877, "learning_rate": 4.43385855624771e-06, "loss": 0.2871, "step": 2791 }, { "epoch": 2.759268413247652, "grad_norm": 0.29250853329035387, "learning_rate": 4.415536826676439e-06, "loss": 0.2769, "step": 2792 }, { "epoch": 2.760257043994068, "grad_norm": 0.2919477279790293, "learning_rate": 4.397215097105167e-06, "loss": 0.2517, "step": 2793 }, { "epoch": 2.7612456747404845, "grad_norm": 0.3162099150632416, "learning_rate": 4.378893367533896e-06, "loss": 0.303, "step": 2794 }, { "epoch": 2.7622343054869005, "grad_norm": 0.31372497319680437, "learning_rate": 4.360571637962624e-06, "loss": 0.3032, "step": 2795 }, { "epoch": 2.763222936233317, "grad_norm": 0.30366895106744646, "learning_rate": 4.342249908391352e-06, "loss": 0.2775, "step": 2796 }, { "epoch": 2.764211566979733, "grad_norm": 0.2967195534400123, "learning_rate": 4.323928178820081e-06, "loss": 0.2737, "step": 2797 }, { "epoch": 2.7652001977261493, "grad_norm": 0.29909623923768647, "learning_rate": 4.305606449248809e-06, "loss": 0.2645, "step": 2798 }, { "epoch": 2.7661888284725658, "grad_norm": 0.2885534731763224, "learning_rate": 4.287284719677537e-06, "loss": 0.27, "step": 2799 }, { "epoch": 2.7671774592189817, "grad_norm": 0.29954695452351704, "learning_rate": 4.268962990106266e-06, "loss": 0.2839, "step": 2800 }, { "epoch": 2.7681660899653977, "grad_norm": 0.27761410811891923, "learning_rate": 4.250641260534994e-06, "loss": 0.2656, "step": 2801 }, { "epoch": 2.769154720711814, "grad_norm": 0.284624884141742, "learning_rate": 4.232319530963723e-06, "loss": 0.2536, "step": 2802 }, { "epoch": 2.7701433514582305, "grad_norm": 0.29425280768685147, "learning_rate": 4.213997801392452e-06, "loss": 0.2692, "step": 2803 }, { "epoch": 2.7711319822046465, "grad_norm": 0.3064386546396075, "learning_rate": 4.19567607182118e-06, "loss": 0.2721, "step": 2804 }, { "epoch": 2.772120612951063, "grad_norm": 0.290776872466533, "learning_rate": 4.177354342249908e-06, "loss": 0.2753, "step": 2805 }, { "epoch": 2.773109243697479, "grad_norm": 0.3092005398421125, "learning_rate": 4.159032612678637e-06, "loss": 0.293, "step": 2806 }, { "epoch": 2.7740978744438953, "grad_norm": 0.29789363452478407, "learning_rate": 4.1407108831073654e-06, "loss": 0.2943, "step": 2807 }, { "epoch": 2.7750865051903113, "grad_norm": 0.2895660156413966, "learning_rate": 4.1223891535360936e-06, "loss": 0.2873, "step": 2808 }, { "epoch": 2.7760751359367277, "grad_norm": 0.3001902108798764, "learning_rate": 4.1040674239648225e-06, "loss": 0.2815, "step": 2809 }, { "epoch": 2.7770637666831437, "grad_norm": 0.3212950611664459, "learning_rate": 4.085745694393551e-06, "loss": 0.2993, "step": 2810 }, { "epoch": 2.77805239742956, "grad_norm": 0.37121490248467875, "learning_rate": 4.067423964822279e-06, "loss": 0.2997, "step": 2811 }, { "epoch": 2.7790410281759765, "grad_norm": 0.2833956353421675, "learning_rate": 4.049102235251008e-06, "loss": 0.2434, "step": 2812 }, { "epoch": 2.7800296589223925, "grad_norm": 0.3005404114117958, "learning_rate": 4.030780505679737e-06, "loss": 0.2785, "step": 2813 }, { "epoch": 2.7810182896688085, "grad_norm": 0.2871011080968174, "learning_rate": 4.012458776108465e-06, "loss": 0.2926, "step": 2814 }, { "epoch": 2.782006920415225, "grad_norm": 0.3073910105846964, "learning_rate": 3.994137046537194e-06, "loss": 0.2911, "step": 2815 }, { "epoch": 2.7829955511616413, "grad_norm": 2.406649412383568, "learning_rate": 3.975815316965922e-06, "loss": 0.3521, "step": 2816 }, { "epoch": 2.7839841819080573, "grad_norm": 0.29919814721720034, "learning_rate": 3.95749358739465e-06, "loss": 0.2834, "step": 2817 }, { "epoch": 2.7849728126544737, "grad_norm": 0.3091635576637203, "learning_rate": 3.939171857823379e-06, "loss": 0.2706, "step": 2818 }, { "epoch": 2.7859614434008897, "grad_norm": 0.29163551600985593, "learning_rate": 3.920850128252107e-06, "loss": 0.2764, "step": 2819 }, { "epoch": 2.786950074147306, "grad_norm": 0.31892428716822796, "learning_rate": 3.902528398680836e-06, "loss": 0.2857, "step": 2820 }, { "epoch": 2.787938704893722, "grad_norm": 0.2983161701837488, "learning_rate": 3.884206669109564e-06, "loss": 0.3191, "step": 2821 }, { "epoch": 2.7889273356401385, "grad_norm": 0.2945983326725661, "learning_rate": 3.865884939538292e-06, "loss": 0.2732, "step": 2822 }, { "epoch": 2.7899159663865545, "grad_norm": 0.2854216538858774, "learning_rate": 3.847563209967021e-06, "loss": 0.2598, "step": 2823 }, { "epoch": 2.790904597132971, "grad_norm": 0.30940851794219776, "learning_rate": 3.829241480395749e-06, "loss": 0.2973, "step": 2824 }, { "epoch": 2.7918932278793873, "grad_norm": 0.31693127253611064, "learning_rate": 3.810919750824478e-06, "loss": 0.3035, "step": 2825 }, { "epoch": 2.7928818586258033, "grad_norm": 0.314949790257459, "learning_rate": 3.7925980212532068e-06, "loss": 0.2988, "step": 2826 }, { "epoch": 2.7938704893722193, "grad_norm": 0.2897505260919834, "learning_rate": 3.774276291681935e-06, "loss": 0.2826, "step": 2827 }, { "epoch": 2.7948591201186357, "grad_norm": 0.2907920825664742, "learning_rate": 3.7559545621106634e-06, "loss": 0.268, "step": 2828 }, { "epoch": 2.795847750865052, "grad_norm": 0.29463209466411605, "learning_rate": 3.7376328325393924e-06, "loss": 0.2641, "step": 2829 }, { "epoch": 2.796836381611468, "grad_norm": 0.31890496404378504, "learning_rate": 3.7193111029681205e-06, "loss": 0.3074, "step": 2830 }, { "epoch": 2.797825012357884, "grad_norm": 0.2814476554713294, "learning_rate": 3.7009893733968486e-06, "loss": 0.2693, "step": 2831 }, { "epoch": 2.7988136431043005, "grad_norm": 0.31433206867978186, "learning_rate": 3.6826676438255776e-06, "loss": 0.3047, "step": 2832 }, { "epoch": 2.799802273850717, "grad_norm": 0.2998461901750344, "learning_rate": 3.6643459142543057e-06, "loss": 0.2823, "step": 2833 }, { "epoch": 2.800790904597133, "grad_norm": 0.29878739949662597, "learning_rate": 3.646024184683034e-06, "loss": 0.2883, "step": 2834 }, { "epoch": 2.8017795353435493, "grad_norm": 0.27820641055110523, "learning_rate": 3.627702455111763e-06, "loss": 0.2819, "step": 2835 }, { "epoch": 2.8027681660899653, "grad_norm": 0.2784579118467304, "learning_rate": 3.6093807255404913e-06, "loss": 0.2623, "step": 2836 }, { "epoch": 2.8037567968363817, "grad_norm": 0.2900020527996704, "learning_rate": 3.5910589959692194e-06, "loss": 0.2778, "step": 2837 }, { "epoch": 2.804745427582798, "grad_norm": 0.2839227517636407, "learning_rate": 3.5727372663979483e-06, "loss": 0.2775, "step": 2838 }, { "epoch": 2.805734058329214, "grad_norm": 0.3190445685498285, "learning_rate": 3.554415536826677e-06, "loss": 0.294, "step": 2839 }, { "epoch": 2.80672268907563, "grad_norm": 0.29019581358598234, "learning_rate": 3.536093807255405e-06, "loss": 0.2856, "step": 2840 }, { "epoch": 2.8077113198220465, "grad_norm": 0.28942319002803263, "learning_rate": 3.517772077684134e-06, "loss": 0.2872, "step": 2841 }, { "epoch": 2.808699950568463, "grad_norm": 0.29180367468251495, "learning_rate": 3.499450348112862e-06, "loss": 0.2909, "step": 2842 }, { "epoch": 2.809688581314879, "grad_norm": 0.28238528529375073, "learning_rate": 3.48112861854159e-06, "loss": 0.285, "step": 2843 }, { "epoch": 2.810677212061295, "grad_norm": 0.3110911523389197, "learning_rate": 3.462806888970319e-06, "loss": 0.271, "step": 2844 }, { "epoch": 2.8116658428077113, "grad_norm": 0.3080348778353728, "learning_rate": 3.4444851593990477e-06, "loss": 0.2855, "step": 2845 }, { "epoch": 2.8126544735541277, "grad_norm": 0.3090054861121094, "learning_rate": 3.4261634298277758e-06, "loss": 0.2907, "step": 2846 }, { "epoch": 2.8136431043005437, "grad_norm": 0.2981235018217657, "learning_rate": 3.4078417002565047e-06, "loss": 0.2849, "step": 2847 }, { "epoch": 2.81463173504696, "grad_norm": 0.3070535923469625, "learning_rate": 3.389519970685233e-06, "loss": 0.3175, "step": 2848 }, { "epoch": 2.815620365793376, "grad_norm": 0.31872260089643123, "learning_rate": 3.371198241113961e-06, "loss": 0.3025, "step": 2849 }, { "epoch": 2.8166089965397925, "grad_norm": 0.2873119092670171, "learning_rate": 3.35287651154269e-06, "loss": 0.2659, "step": 2850 }, { "epoch": 2.8175976272862084, "grad_norm": 0.29305838647058685, "learning_rate": 3.3345547819714185e-06, "loss": 0.2841, "step": 2851 }, { "epoch": 2.818586258032625, "grad_norm": 0.27865375795273195, "learning_rate": 3.3162330524001466e-06, "loss": 0.2371, "step": 2852 }, { "epoch": 2.819574888779041, "grad_norm": 0.29685890263043924, "learning_rate": 3.2979113228288755e-06, "loss": 0.2822, "step": 2853 }, { "epoch": 2.8205635195254573, "grad_norm": 0.29280218216374765, "learning_rate": 3.2795895932576036e-06, "loss": 0.2784, "step": 2854 }, { "epoch": 2.8215521502718737, "grad_norm": 0.2873332167053172, "learning_rate": 3.261267863686332e-06, "loss": 0.2842, "step": 2855 }, { "epoch": 2.8225407810182896, "grad_norm": 0.29996842832311515, "learning_rate": 3.2429461341150607e-06, "loss": 0.2843, "step": 2856 }, { "epoch": 2.8235294117647056, "grad_norm": 0.29154676661838425, "learning_rate": 3.2246244045437892e-06, "loss": 0.2583, "step": 2857 }, { "epoch": 2.824518042511122, "grad_norm": 0.2896081905816264, "learning_rate": 3.2063026749725174e-06, "loss": 0.2734, "step": 2858 }, { "epoch": 2.8255066732575385, "grad_norm": 0.2961651815875584, "learning_rate": 3.1879809454012463e-06, "loss": 0.2927, "step": 2859 }, { "epoch": 2.8264953040039544, "grad_norm": 0.33812713307994324, "learning_rate": 3.1696592158299744e-06, "loss": 0.278, "step": 2860 }, { "epoch": 2.827483934750371, "grad_norm": 3.067100986982271, "learning_rate": 3.151337486258703e-06, "loss": 0.3809, "step": 2861 }, { "epoch": 2.828472565496787, "grad_norm": 0.30828349037104297, "learning_rate": 3.133015756687432e-06, "loss": 0.2482, "step": 2862 }, { "epoch": 2.8294611962432032, "grad_norm": 0.29856845207992416, "learning_rate": 3.11469402711616e-06, "loss": 0.2689, "step": 2863 }, { "epoch": 2.830449826989619, "grad_norm": 0.29163396193964775, "learning_rate": 3.0963722975448886e-06, "loss": 0.2694, "step": 2864 }, { "epoch": 2.8314384577360356, "grad_norm": 0.29863396806175235, "learning_rate": 3.0780505679736167e-06, "loss": 0.3138, "step": 2865 }, { "epoch": 2.8324270884824516, "grad_norm": 0.28624995367024547, "learning_rate": 3.0597288384023452e-06, "loss": 0.2682, "step": 2866 }, { "epoch": 2.833415719228868, "grad_norm": 0.309440201478211, "learning_rate": 3.0414071088310737e-06, "loss": 0.2905, "step": 2867 }, { "epoch": 2.8344043499752845, "grad_norm": 0.286138050570904, "learning_rate": 3.0230853792598023e-06, "loss": 0.2713, "step": 2868 }, { "epoch": 2.8353929807217004, "grad_norm": 0.3038322552538627, "learning_rate": 3.004763649688531e-06, "loss": 0.3244, "step": 2869 }, { "epoch": 2.8363816114681164, "grad_norm": 0.30560014829901244, "learning_rate": 2.9864419201172594e-06, "loss": 0.2663, "step": 2870 }, { "epoch": 2.837370242214533, "grad_norm": 0.28375216570968204, "learning_rate": 2.9681201905459875e-06, "loss": 0.2697, "step": 2871 }, { "epoch": 2.8383588729609492, "grad_norm": 0.31839001723987304, "learning_rate": 2.949798460974716e-06, "loss": 0.3012, "step": 2872 }, { "epoch": 2.839347503707365, "grad_norm": 0.3054488179079099, "learning_rate": 2.931476731403445e-06, "loss": 0.2806, "step": 2873 }, { "epoch": 2.8403361344537816, "grad_norm": 0.28393631080892706, "learning_rate": 2.913155001832173e-06, "loss": 0.2698, "step": 2874 }, { "epoch": 2.8413247652001976, "grad_norm": 0.29606934349232905, "learning_rate": 2.8948332722609016e-06, "loss": 0.2845, "step": 2875 }, { "epoch": 2.842313395946614, "grad_norm": 0.33074248448334154, "learning_rate": 2.87651154268963e-06, "loss": 0.2902, "step": 2876 }, { "epoch": 2.84330202669303, "grad_norm": 0.2949536945300778, "learning_rate": 2.8581898131183583e-06, "loss": 0.3088, "step": 2877 }, { "epoch": 2.8442906574394464, "grad_norm": 0.31156861958831755, "learning_rate": 2.839868083547087e-06, "loss": 0.3094, "step": 2878 }, { "epoch": 2.8452792881858624, "grad_norm": 0.29070429191147557, "learning_rate": 2.8215463539758157e-06, "loss": 0.2819, "step": 2879 }, { "epoch": 2.846267918932279, "grad_norm": 0.2729793628610079, "learning_rate": 2.803224624404544e-06, "loss": 0.262, "step": 2880 }, { "epoch": 2.8472565496786952, "grad_norm": 0.3083990328614314, "learning_rate": 2.7849028948332724e-06, "loss": 0.2872, "step": 2881 }, { "epoch": 2.848245180425111, "grad_norm": 0.29228655272497106, "learning_rate": 2.766581165262001e-06, "loss": 0.2788, "step": 2882 }, { "epoch": 2.849233811171527, "grad_norm": 0.2818725448553874, "learning_rate": 2.748259435690729e-06, "loss": 0.2678, "step": 2883 }, { "epoch": 2.8502224419179436, "grad_norm": 0.3148711592552682, "learning_rate": 2.729937706119458e-06, "loss": 0.2933, "step": 2884 }, { "epoch": 2.85121107266436, "grad_norm": 0.29370870884551437, "learning_rate": 2.7116159765481865e-06, "loss": 0.2452, "step": 2885 }, { "epoch": 2.852199703410776, "grad_norm": 0.2927040103880256, "learning_rate": 2.6932942469769146e-06, "loss": 0.2667, "step": 2886 }, { "epoch": 2.8531883341571924, "grad_norm": 0.32909253767276975, "learning_rate": 2.674972517405643e-06, "loss": 0.2922, "step": 2887 }, { "epoch": 2.8541769649036084, "grad_norm": 0.3171164142013659, "learning_rate": 2.6566507878343717e-06, "loss": 0.285, "step": 2888 }, { "epoch": 2.855165595650025, "grad_norm": 0.3147797052247942, "learning_rate": 2.6383290582631003e-06, "loss": 0.2861, "step": 2889 }, { "epoch": 2.856154226396441, "grad_norm": 0.2898327005797937, "learning_rate": 2.6200073286918288e-06, "loss": 0.2742, "step": 2890 }, { "epoch": 2.857142857142857, "grad_norm": 0.27614608324366974, "learning_rate": 2.6016855991205573e-06, "loss": 0.2671, "step": 2891 }, { "epoch": 2.858131487889273, "grad_norm": 0.2848771337035191, "learning_rate": 2.5833638695492854e-06, "loss": 0.2734, "step": 2892 }, { "epoch": 2.8591201186356896, "grad_norm": 0.29085571671479155, "learning_rate": 2.565042139978014e-06, "loss": 0.296, "step": 2893 }, { "epoch": 2.860108749382106, "grad_norm": 0.2971108362426078, "learning_rate": 2.5467204104067425e-06, "loss": 0.2886, "step": 2894 }, { "epoch": 2.861097380128522, "grad_norm": 0.29946583692457474, "learning_rate": 2.528398680835471e-06, "loss": 0.2799, "step": 2895 }, { "epoch": 2.862086010874938, "grad_norm": 0.30676129468652846, "learning_rate": 2.5100769512641996e-06, "loss": 0.3082, "step": 2896 }, { "epoch": 2.8630746416213544, "grad_norm": 0.2943647176004645, "learning_rate": 2.491755221692928e-06, "loss": 0.3017, "step": 2897 }, { "epoch": 2.864063272367771, "grad_norm": 0.2949390765591675, "learning_rate": 2.4734334921216562e-06, "loss": 0.2768, "step": 2898 }, { "epoch": 2.865051903114187, "grad_norm": 0.2901377315574012, "learning_rate": 2.4551117625503848e-06, "loss": 0.2941, "step": 2899 }, { "epoch": 2.866040533860603, "grad_norm": 0.2938980967780369, "learning_rate": 2.4367900329791133e-06, "loss": 0.2967, "step": 2900 }, { "epoch": 2.867029164607019, "grad_norm": 0.301636143282697, "learning_rate": 2.418468303407842e-06, "loss": 0.2986, "step": 2901 }, { "epoch": 2.8680177953534356, "grad_norm": 0.2917655068803605, "learning_rate": 2.4001465738365704e-06, "loss": 0.284, "step": 2902 }, { "epoch": 2.8690064260998516, "grad_norm": 0.30013486101157383, "learning_rate": 2.381824844265299e-06, "loss": 0.2682, "step": 2903 }, { "epoch": 2.869995056846268, "grad_norm": 0.27281759305715125, "learning_rate": 2.363503114694027e-06, "loss": 0.2651, "step": 2904 }, { "epoch": 2.870983687592684, "grad_norm": 0.2948669693828221, "learning_rate": 2.3451813851227555e-06, "loss": 0.2993, "step": 2905 }, { "epoch": 2.8719723183391004, "grad_norm": 0.2914048294564349, "learning_rate": 2.326859655551484e-06, "loss": 0.2921, "step": 2906 }, { "epoch": 2.872960949085517, "grad_norm": 0.31773255236633785, "learning_rate": 2.3085379259802126e-06, "loss": 0.2464, "step": 2907 }, { "epoch": 2.8739495798319328, "grad_norm": 0.2848047819951922, "learning_rate": 2.290216196408941e-06, "loss": 0.2652, "step": 2908 }, { "epoch": 2.8749382105783488, "grad_norm": 0.2959760073247724, "learning_rate": 2.2718944668376697e-06, "loss": 0.2881, "step": 2909 }, { "epoch": 2.875926841324765, "grad_norm": 0.2782489792456424, "learning_rate": 2.253572737266398e-06, "loss": 0.2562, "step": 2910 }, { "epoch": 2.8769154720711816, "grad_norm": 0.2941158787240688, "learning_rate": 2.2352510076951263e-06, "loss": 0.2732, "step": 2911 }, { "epoch": 2.8779041028175976, "grad_norm": 0.2783981972426223, "learning_rate": 2.216929278123855e-06, "loss": 0.2828, "step": 2912 }, { "epoch": 2.878892733564014, "grad_norm": 0.3630386667863461, "learning_rate": 2.1986075485525834e-06, "loss": 0.2791, "step": 2913 }, { "epoch": 2.87988136431043, "grad_norm": 0.2732116226552905, "learning_rate": 2.180285818981312e-06, "loss": 0.245, "step": 2914 }, { "epoch": 2.8808699950568464, "grad_norm": 0.28944481468453553, "learning_rate": 2.1619640894100405e-06, "loss": 0.2726, "step": 2915 }, { "epoch": 2.8818586258032624, "grad_norm": 0.2969144526376977, "learning_rate": 2.1436423598387686e-06, "loss": 0.2712, "step": 2916 }, { "epoch": 2.8828472565496788, "grad_norm": 0.29546216714635126, "learning_rate": 2.125320630267497e-06, "loss": 0.3155, "step": 2917 }, { "epoch": 2.8838358872960947, "grad_norm": 0.29166972583905015, "learning_rate": 2.106998900696226e-06, "loss": 0.2708, "step": 2918 }, { "epoch": 2.884824518042511, "grad_norm": 0.2907796184853829, "learning_rate": 2.088677171124954e-06, "loss": 0.262, "step": 2919 }, { "epoch": 2.8858131487889276, "grad_norm": 0.2857404614059104, "learning_rate": 2.0703554415536827e-06, "loss": 0.2737, "step": 2920 }, { "epoch": 2.8868017795353436, "grad_norm": 0.3029009473299206, "learning_rate": 2.0520337119824113e-06, "loss": 0.2764, "step": 2921 }, { "epoch": 2.8877904102817595, "grad_norm": 0.2786093028027722, "learning_rate": 2.0337119824111394e-06, "loss": 0.2627, "step": 2922 }, { "epoch": 2.888779041028176, "grad_norm": 0.2793719512418242, "learning_rate": 2.0153902528398683e-06, "loss": 0.2478, "step": 2923 }, { "epoch": 2.8897676717745924, "grad_norm": 0.2843242151956319, "learning_rate": 1.997068523268597e-06, "loss": 0.2547, "step": 2924 }, { "epoch": 2.8907563025210083, "grad_norm": 0.2762252138226284, "learning_rate": 1.978746793697325e-06, "loss": 0.2679, "step": 2925 }, { "epoch": 2.8917449332674248, "grad_norm": 0.3262195045735497, "learning_rate": 1.9604250641260535e-06, "loss": 0.2979, "step": 2926 }, { "epoch": 2.8927335640138407, "grad_norm": 0.27679389462478543, "learning_rate": 1.942103334554782e-06, "loss": 0.2704, "step": 2927 }, { "epoch": 2.893722194760257, "grad_norm": 0.297839330282876, "learning_rate": 1.9237816049835106e-06, "loss": 0.2808, "step": 2928 }, { "epoch": 2.894710825506673, "grad_norm": 0.27536667700707473, "learning_rate": 1.905459875412239e-06, "loss": 0.289, "step": 2929 }, { "epoch": 2.8956994562530896, "grad_norm": 0.30509699045379424, "learning_rate": 1.8871381458409674e-06, "loss": 0.2781, "step": 2930 }, { "epoch": 2.8966880869995055, "grad_norm": 0.2858844278725378, "learning_rate": 1.8688164162696962e-06, "loss": 0.2783, "step": 2931 }, { "epoch": 2.897676717745922, "grad_norm": 0.2995405466446339, "learning_rate": 1.8504946866984243e-06, "loss": 0.3012, "step": 2932 }, { "epoch": 2.8986653484923384, "grad_norm": 0.2888532651134583, "learning_rate": 1.8321729571271528e-06, "loss": 0.2611, "step": 2933 }, { "epoch": 2.8996539792387543, "grad_norm": 0.30587696421008354, "learning_rate": 1.8138512275558816e-06, "loss": 0.2689, "step": 2934 }, { "epoch": 2.9006426099851703, "grad_norm": 0.286252972139845, "learning_rate": 1.7955294979846097e-06, "loss": 0.2813, "step": 2935 }, { "epoch": 2.9016312407315867, "grad_norm": 0.2792775617875418, "learning_rate": 1.7772077684133384e-06, "loss": 0.2977, "step": 2936 }, { "epoch": 2.902619871478003, "grad_norm": 0.2898139586810527, "learning_rate": 1.758886038842067e-06, "loss": 0.2765, "step": 2937 }, { "epoch": 2.903608502224419, "grad_norm": 0.29777574400798346, "learning_rate": 1.740564309270795e-06, "loss": 0.2986, "step": 2938 }, { "epoch": 2.9045971329708355, "grad_norm": 0.3095900395288344, "learning_rate": 1.7222425796995238e-06, "loss": 0.2913, "step": 2939 }, { "epoch": 2.9055857637172515, "grad_norm": 0.287806999513699, "learning_rate": 1.7039208501282524e-06, "loss": 0.2624, "step": 2940 }, { "epoch": 2.906574394463668, "grad_norm": 0.2892554839858409, "learning_rate": 1.6855991205569805e-06, "loss": 0.3031, "step": 2941 }, { "epoch": 2.907563025210084, "grad_norm": 0.29207080193398904, "learning_rate": 1.6672773909857092e-06, "loss": 0.2862, "step": 2942 }, { "epoch": 2.9085516559565003, "grad_norm": 0.2852504436691857, "learning_rate": 1.6489556614144378e-06, "loss": 0.271, "step": 2943 }, { "epoch": 2.9095402867029163, "grad_norm": 0.2897704479300311, "learning_rate": 1.630633931843166e-06, "loss": 0.2834, "step": 2944 }, { "epoch": 2.9105289174493327, "grad_norm": 0.2807777834829169, "learning_rate": 1.6123122022718946e-06, "loss": 0.2817, "step": 2945 }, { "epoch": 2.911517548195749, "grad_norm": 0.2916674350409026, "learning_rate": 1.5939904727006232e-06, "loss": 0.2787, "step": 2946 }, { "epoch": 2.912506178942165, "grad_norm": 0.28956768148103973, "learning_rate": 1.5756687431293515e-06, "loss": 0.2784, "step": 2947 }, { "epoch": 2.913494809688581, "grad_norm": 0.2944463159778755, "learning_rate": 1.55734701355808e-06, "loss": 0.2966, "step": 2948 }, { "epoch": 2.9144834404349975, "grad_norm": 0.2893729641052645, "learning_rate": 1.5390252839868083e-06, "loss": 0.2749, "step": 2949 }, { "epoch": 2.915472071181414, "grad_norm": 0.2710023464824464, "learning_rate": 1.5207035544155369e-06, "loss": 0.2919, "step": 2950 }, { "epoch": 2.91646070192783, "grad_norm": 0.2917149843071587, "learning_rate": 1.5023818248442654e-06, "loss": 0.2848, "step": 2951 }, { "epoch": 2.9174493326742463, "grad_norm": 0.29937679381431836, "learning_rate": 1.4840600952729937e-06, "loss": 0.271, "step": 2952 }, { "epoch": 2.9184379634206623, "grad_norm": 0.28835187200962836, "learning_rate": 1.4657383657017225e-06, "loss": 0.2961, "step": 2953 }, { "epoch": 2.9194265941670787, "grad_norm": 0.2845933100350811, "learning_rate": 1.4474166361304508e-06, "loss": 0.2877, "step": 2954 }, { "epoch": 2.9204152249134947, "grad_norm": 0.29044834083025695, "learning_rate": 1.4290949065591791e-06, "loss": 0.2796, "step": 2955 }, { "epoch": 2.921403855659911, "grad_norm": 0.2982870109017702, "learning_rate": 1.4107731769879079e-06, "loss": 0.31, "step": 2956 }, { "epoch": 2.922392486406327, "grad_norm": 0.3020697515880698, "learning_rate": 1.3924514474166362e-06, "loss": 0.2774, "step": 2957 }, { "epoch": 2.9233811171527435, "grad_norm": 0.2797027352430909, "learning_rate": 1.3741297178453645e-06, "loss": 0.2658, "step": 2958 }, { "epoch": 2.92436974789916, "grad_norm": 0.2849297845537985, "learning_rate": 1.3558079882740933e-06, "loss": 0.2795, "step": 2959 }, { "epoch": 2.925358378645576, "grad_norm": 0.27213914052911925, "learning_rate": 1.3374862587028216e-06, "loss": 0.2647, "step": 2960 }, { "epoch": 2.926347009391992, "grad_norm": 0.2823689923550878, "learning_rate": 1.3191645291315501e-06, "loss": 0.2726, "step": 2961 }, { "epoch": 2.9273356401384083, "grad_norm": 0.2850868384516706, "learning_rate": 1.3008427995602787e-06, "loss": 0.2792, "step": 2962 }, { "epoch": 2.9283242708848247, "grad_norm": 0.3000078646387433, "learning_rate": 1.282521069989007e-06, "loss": 0.2986, "step": 2963 }, { "epoch": 2.9293129016312407, "grad_norm": 0.30074257376539354, "learning_rate": 1.2641993404177355e-06, "loss": 0.2786, "step": 2964 }, { "epoch": 2.9303015323776567, "grad_norm": 0.30823558744452667, "learning_rate": 1.245877610846464e-06, "loss": 0.298, "step": 2965 }, { "epoch": 2.931290163124073, "grad_norm": 0.28269619123980594, "learning_rate": 1.2275558812751924e-06, "loss": 0.2744, "step": 2966 }, { "epoch": 2.9322787938704895, "grad_norm": 0.2940789980740601, "learning_rate": 1.209234151703921e-06, "loss": 0.277, "step": 2967 }, { "epoch": 2.9332674246169055, "grad_norm": 0.2787649268769888, "learning_rate": 1.1909124221326494e-06, "loss": 0.2673, "step": 2968 }, { "epoch": 2.934256055363322, "grad_norm": 0.2897651150944215, "learning_rate": 1.1725906925613778e-06, "loss": 0.304, "step": 2969 }, { "epoch": 2.935244686109738, "grad_norm": 0.28203602458052285, "learning_rate": 1.1542689629901063e-06, "loss": 0.2728, "step": 2970 }, { "epoch": 2.9362333168561543, "grad_norm": 0.29053621218435155, "learning_rate": 1.1359472334188348e-06, "loss": 0.256, "step": 2971 }, { "epoch": 2.9372219476025707, "grad_norm": 0.3031587690533328, "learning_rate": 1.1176255038475632e-06, "loss": 0.2812, "step": 2972 }, { "epoch": 2.9382105783489867, "grad_norm": 0.2921898479872661, "learning_rate": 1.0993037742762917e-06, "loss": 0.2696, "step": 2973 }, { "epoch": 2.9391992090954027, "grad_norm": 0.303108337200625, "learning_rate": 1.0809820447050202e-06, "loss": 0.3026, "step": 2974 }, { "epoch": 2.940187839841819, "grad_norm": 0.28240563434183136, "learning_rate": 1.0626603151337486e-06, "loss": 0.2679, "step": 2975 }, { "epoch": 2.9411764705882355, "grad_norm": 0.2908896624561574, "learning_rate": 1.044338585562477e-06, "loss": 0.296, "step": 2976 }, { "epoch": 2.9421651013346515, "grad_norm": 0.2876688295393858, "learning_rate": 1.0260168559912056e-06, "loss": 0.2872, "step": 2977 }, { "epoch": 2.9431537320810675, "grad_norm": 0.29286892995409664, "learning_rate": 1.0076951264199342e-06, "loss": 0.2967, "step": 2978 }, { "epoch": 2.944142362827484, "grad_norm": 0.2748746969583461, "learning_rate": 9.893733968486625e-07, "loss": 0.2813, "step": 2979 }, { "epoch": 2.9451309935739003, "grad_norm": 0.27357898235533845, "learning_rate": 9.71051667277391e-07, "loss": 0.2787, "step": 2980 }, { "epoch": 2.9461196243203163, "grad_norm": 0.2822821083479917, "learning_rate": 9.527299377061195e-07, "loss": 0.2794, "step": 2981 }, { "epoch": 2.9471082550667327, "grad_norm": 0.28469452041874754, "learning_rate": 9.344082081348481e-07, "loss": 0.3067, "step": 2982 }, { "epoch": 2.9480968858131487, "grad_norm": 0.2827965583906571, "learning_rate": 9.160864785635764e-07, "loss": 0.2861, "step": 2983 }, { "epoch": 2.949085516559565, "grad_norm": 0.28384603761165944, "learning_rate": 8.977647489923048e-07, "loss": 0.2865, "step": 2984 }, { "epoch": 2.950074147305981, "grad_norm": 0.283113740983401, "learning_rate": 8.794430194210335e-07, "loss": 0.2946, "step": 2985 }, { "epoch": 2.9510627780523975, "grad_norm": 0.29152164696493166, "learning_rate": 8.611212898497619e-07, "loss": 0.288, "step": 2986 }, { "epoch": 2.9520514087988134, "grad_norm": 0.2770035853553244, "learning_rate": 8.427995602784902e-07, "loss": 0.2624, "step": 2987 }, { "epoch": 2.95304003954523, "grad_norm": 0.28200732428485664, "learning_rate": 8.244778307072189e-07, "loss": 0.2908, "step": 2988 }, { "epoch": 2.9540286702916463, "grad_norm": 0.2623646426043654, "learning_rate": 8.061561011359473e-07, "loss": 0.2726, "step": 2989 }, { "epoch": 2.9550173010380623, "grad_norm": 0.2955719043727139, "learning_rate": 7.878343715646757e-07, "loss": 0.2893, "step": 2990 }, { "epoch": 2.9560059317844782, "grad_norm": 0.28859414562797453, "learning_rate": 7.695126419934042e-07, "loss": 0.2643, "step": 2991 }, { "epoch": 2.9569945625308947, "grad_norm": 0.28318953956991405, "learning_rate": 7.511909124221327e-07, "loss": 0.2685, "step": 2992 }, { "epoch": 2.957983193277311, "grad_norm": 0.2884475111559657, "learning_rate": 7.328691828508612e-07, "loss": 0.2393, "step": 2993 }, { "epoch": 2.958971824023727, "grad_norm": 0.29507333623813287, "learning_rate": 7.145474532795896e-07, "loss": 0.2962, "step": 2994 }, { "epoch": 2.9599604547701435, "grad_norm": 0.29323680623094134, "learning_rate": 6.962257237083181e-07, "loss": 0.2891, "step": 2995 }, { "epoch": 2.9609490855165594, "grad_norm": 0.29481717546595543, "learning_rate": 6.779039941370466e-07, "loss": 0.2833, "step": 2996 }, { "epoch": 2.961937716262976, "grad_norm": 0.27905578936611214, "learning_rate": 6.595822645657751e-07, "loss": 0.2649, "step": 2997 }, { "epoch": 2.962926347009392, "grad_norm": 0.28722032836502315, "learning_rate": 6.412605349945035e-07, "loss": 0.2959, "step": 2998 }, { "epoch": 2.9639149777558083, "grad_norm": 0.2732982792500891, "learning_rate": 6.22938805423232e-07, "loss": 0.2635, "step": 2999 }, { "epoch": 2.9649036085022242, "grad_norm": 0.28504446250971643, "learning_rate": 6.046170758519605e-07, "loss": 0.2968, "step": 3000 }, { "epoch": 2.9658922392486406, "grad_norm": 0.28107691427098874, "learning_rate": 5.862953462806889e-07, "loss": 0.2836, "step": 3001 }, { "epoch": 2.966880869995057, "grad_norm": 0.2939073091900629, "learning_rate": 5.679736167094174e-07, "loss": 0.2817, "step": 3002 }, { "epoch": 2.967869500741473, "grad_norm": 0.27471478960711765, "learning_rate": 5.496518871381459e-07, "loss": 0.2717, "step": 3003 }, { "epoch": 2.968858131487889, "grad_norm": 0.2882002861571052, "learning_rate": 5.313301575668743e-07, "loss": 0.2942, "step": 3004 }, { "epoch": 2.9698467622343054, "grad_norm": 0.28624180714526437, "learning_rate": 5.130084279956028e-07, "loss": 0.2869, "step": 3005 }, { "epoch": 2.970835392980722, "grad_norm": 0.2765996467485786, "learning_rate": 4.946866984243312e-07, "loss": 0.2829, "step": 3006 }, { "epoch": 2.971824023727138, "grad_norm": 0.2887601116038397, "learning_rate": 4.763649688530597e-07, "loss": 0.302, "step": 3007 }, { "epoch": 2.9728126544735543, "grad_norm": 0.2740399141623958, "learning_rate": 4.580432392817882e-07, "loss": 0.267, "step": 3008 }, { "epoch": 2.9738012852199702, "grad_norm": 0.295054050720032, "learning_rate": 4.3972150971051674e-07, "loss": 0.2876, "step": 3009 }, { "epoch": 2.9747899159663866, "grad_norm": 0.3049382005793767, "learning_rate": 4.213997801392451e-07, "loss": 0.2503, "step": 3010 }, { "epoch": 2.9757785467128026, "grad_norm": 0.30661083606694933, "learning_rate": 4.0307805056797366e-07, "loss": 0.3235, "step": 3011 }, { "epoch": 2.976767177459219, "grad_norm": 0.26990954312526305, "learning_rate": 3.847563209967021e-07, "loss": 0.2513, "step": 3012 }, { "epoch": 2.977755808205635, "grad_norm": 0.2700302426122201, "learning_rate": 3.664345914254306e-07, "loss": 0.2703, "step": 3013 }, { "epoch": 2.9787444389520514, "grad_norm": 0.26888263659344397, "learning_rate": 3.4811286185415905e-07, "loss": 0.2397, "step": 3014 }, { "epoch": 2.979733069698468, "grad_norm": 0.2791475028227216, "learning_rate": 3.2979113228288753e-07, "loss": 0.2445, "step": 3015 }, { "epoch": 2.980721700444884, "grad_norm": 0.2960497262763875, "learning_rate": 3.11469402711616e-07, "loss": 0.2959, "step": 3016 }, { "epoch": 2.9817103311913, "grad_norm": 0.966875029237952, "learning_rate": 2.9314767314034444e-07, "loss": 0.2813, "step": 3017 }, { "epoch": 2.982698961937716, "grad_norm": 0.33175331762799953, "learning_rate": 2.748259435690729e-07, "loss": 0.2829, "step": 3018 }, { "epoch": 2.9836875926841326, "grad_norm": 0.28904903910648905, "learning_rate": 2.565042139978014e-07, "loss": 0.2575, "step": 3019 }, { "epoch": 2.9846762234305486, "grad_norm": 0.27957684413687184, "learning_rate": 2.3818248442652986e-07, "loss": 0.2957, "step": 3020 }, { "epoch": 2.985664854176965, "grad_norm": 0.271576091357759, "learning_rate": 2.1986075485525837e-07, "loss": 0.296, "step": 3021 }, { "epoch": 2.986653484923381, "grad_norm": 0.2809319456231787, "learning_rate": 2.0153902528398683e-07, "loss": 0.2789, "step": 3022 }, { "epoch": 2.9876421156697974, "grad_norm": 0.2824880114956861, "learning_rate": 1.832172957127153e-07, "loss": 0.2842, "step": 3023 }, { "epoch": 2.9886307464162134, "grad_norm": 0.2852175870155898, "learning_rate": 1.6489556614144377e-07, "loss": 0.2873, "step": 3024 }, { "epoch": 2.98961937716263, "grad_norm": 0.26856670410794437, "learning_rate": 1.4657383657017222e-07, "loss": 0.2571, "step": 3025 }, { "epoch": 2.990608007909046, "grad_norm": 0.2792774973008445, "learning_rate": 1.282521069989007e-07, "loss": 0.2384, "step": 3026 }, { "epoch": 2.991596638655462, "grad_norm": 0.4717846091693085, "learning_rate": 1.0993037742762919e-07, "loss": 0.3169, "step": 3027 }, { "epoch": 2.9925852694018786, "grad_norm": 0.28169411150683166, "learning_rate": 9.160864785635765e-08, "loss": 0.2741, "step": 3028 }, { "epoch": 2.9935739001482946, "grad_norm": 0.28646070903663673, "learning_rate": 7.328691828508611e-08, "loss": 0.2942, "step": 3029 }, { "epoch": 2.9945625308947106, "grad_norm": 0.2859334240770583, "learning_rate": 5.496518871381459e-08, "loss": 0.2714, "step": 3030 }, { "epoch": 2.995551161641127, "grad_norm": 0.28559163046061836, "learning_rate": 3.6643459142543055e-08, "loss": 0.2678, "step": 3031 }, { "epoch": 2.9965397923875434, "grad_norm": 0.3968803306091079, "learning_rate": 1.8321729571271528e-08, "loss": 0.282, "step": 3032 }, { "epoch": 2.9975284231339594, "grad_norm": 0.29035755393628715, "learning_rate": 0.0, "loss": 0.3029, "step": 3033 }, { "epoch": 2.9975284231339594, "step": 3033, "total_flos": 2.583628845240287e+18, "train_loss": 0.4116285385381574, "train_runtime": 175906.6842, "train_samples_per_second": 0.276, "train_steps_per_second": 0.017 } ], "logging_steps": 1, "max_steps": 3033, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.583628845240287e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }