diff --git "a/checkpoint-2592/trainer_state.json" "b/checkpoint-2592/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2592/trainer_state.json" @@ -0,0 +1,18177 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998071359691417, + "eval_steps": 500, + "global_step": 2592, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003857280617164899, + "grad_norm": 1.3028969772220882, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.4718, + "step": 1 + }, + { + "epoch": 0.0007714561234329798, + "grad_norm": 1.3665860686662479, + "learning_rate": 5.000000000000001e-07, + "loss": 2.3575, + "step": 2 + }, + { + "epoch": 0.0011571841851494697, + "grad_norm": 1.3998982983137798, + "learning_rate": 7.5e-07, + "loss": 2.5221, + "step": 3 + }, + { + "epoch": 0.0015429122468659595, + "grad_norm": 1.4627027358629756, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.4767, + "step": 4 + }, + { + "epoch": 0.0019286403085824494, + "grad_norm": 1.4038200232517333, + "learning_rate": 1.25e-06, + "loss": 2.4538, + "step": 5 + }, + { + "epoch": 0.0023143683702989393, + "grad_norm": 1.3419044955414192, + "learning_rate": 1.5e-06, + "loss": 2.3874, + "step": 6 + }, + { + "epoch": 0.002700096432015429, + "grad_norm": 1.3860959919367515, + "learning_rate": 1.75e-06, + "loss": 2.4335, + "step": 7 + }, + { + "epoch": 0.003085824493731919, + "grad_norm": 1.35282104723839, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.4681, + "step": 8 + }, + { + "epoch": 0.0034715525554484088, + "grad_norm": 1.281614923512164, + "learning_rate": 2.25e-06, + "loss": 2.4464, + "step": 9 + }, + { + "epoch": 0.003857280617164899, + "grad_norm": 1.4254055492712447, + "learning_rate": 2.5e-06, + "loss": 2.4931, + "step": 10 + }, + { + "epoch": 0.004243008678881389, + "grad_norm": 1.3481677623706914, + "learning_rate": 2.7500000000000004e-06, + "loss": 2.4665, + "step": 11 + }, + { + "epoch": 0.004628736740597879, + "grad_norm": 1.2672866006199177, + "learning_rate": 3e-06, + "loss": 2.4167, + "step": 12 + }, + { + "epoch": 0.005014464802314368, + "grad_norm": 1.390249615802629, + "learning_rate": 3.2500000000000002e-06, + "loss": 2.4674, + "step": 13 + }, + { + "epoch": 0.005400192864030858, + "grad_norm": 1.305099944034975, + "learning_rate": 3.5e-06, + "loss": 2.3945, + "step": 14 + }, + { + "epoch": 0.0057859209257473485, + "grad_norm": 1.2796827824031904, + "learning_rate": 3.7500000000000005e-06, + "loss": 2.4574, + "step": 15 + }, + { + "epoch": 0.006171648987463838, + "grad_norm": 1.281187496954195, + "learning_rate": 4.000000000000001e-06, + "loss": 2.4474, + "step": 16 + }, + { + "epoch": 0.006557377049180328, + "grad_norm": 1.2706764452637, + "learning_rate": 4.25e-06, + "loss": 2.4759, + "step": 17 + }, + { + "epoch": 0.0069431051108968175, + "grad_norm": 1.1124376648153766, + "learning_rate": 4.5e-06, + "loss": 2.4301, + "step": 18 + }, + { + "epoch": 0.007328833172613307, + "grad_norm": 1.1285940335012594, + "learning_rate": 4.75e-06, + "loss": 2.4671, + "step": 19 + }, + { + "epoch": 0.007714561234329798, + "grad_norm": 1.0971409205597384, + "learning_rate": 5e-06, + "loss": 2.4581, + "step": 20 + }, + { + "epoch": 0.008100289296046287, + "grad_norm": 1.0101028043054117, + "learning_rate": 5.2500000000000006e-06, + "loss": 2.4187, + "step": 21 + }, + { + "epoch": 0.008486017357762778, + "grad_norm": 1.0183351895502508, + "learning_rate": 5.500000000000001e-06, + "loss": 2.4967, + 
"step": 22 + }, + { + "epoch": 0.008871745419479268, + "grad_norm": 0.9615041393264294, + "learning_rate": 5.75e-06, + "loss": 2.4402, + "step": 23 + }, + { + "epoch": 0.009257473481195757, + "grad_norm": 0.8512923775098707, + "learning_rate": 6e-06, + "loss": 2.4582, + "step": 24 + }, + { + "epoch": 0.009643201542912247, + "grad_norm": 0.7092461493595851, + "learning_rate": 6.25e-06, + "loss": 2.4808, + "step": 25 + }, + { + "epoch": 0.010028929604628737, + "grad_norm": 0.6819046573220403, + "learning_rate": 6.5000000000000004e-06, + "loss": 2.3906, + "step": 26 + }, + { + "epoch": 0.010414657666345226, + "grad_norm": 0.7115437695404054, + "learning_rate": 6.750000000000001e-06, + "loss": 2.407, + "step": 27 + }, + { + "epoch": 0.010800385728061716, + "grad_norm": 0.76903326342247, + "learning_rate": 7e-06, + "loss": 2.4771, + "step": 28 + }, + { + "epoch": 0.011186113789778206, + "grad_norm": 0.6338741014214005, + "learning_rate": 7.25e-06, + "loss": 2.4336, + "step": 29 + }, + { + "epoch": 0.011571841851494697, + "grad_norm": 0.640435250241746, + "learning_rate": 7.500000000000001e-06, + "loss": 2.408, + "step": 30 + }, + { + "epoch": 0.011957569913211187, + "grad_norm": 0.6176702235125447, + "learning_rate": 7.75e-06, + "loss": 2.4004, + "step": 31 + }, + { + "epoch": 0.012343297974927676, + "grad_norm": 0.609835959019172, + "learning_rate": 8.000000000000001e-06, + "loss": 2.3579, + "step": 32 + }, + { + "epoch": 0.012729026036644166, + "grad_norm": 0.5619355195918498, + "learning_rate": 8.25e-06, + "loss": 2.4959, + "step": 33 + }, + { + "epoch": 0.013114754098360656, + "grad_norm": 0.5757497035883407, + "learning_rate": 8.5e-06, + "loss": 2.3962, + "step": 34 + }, + { + "epoch": 0.013500482160077145, + "grad_norm": 0.5810404938379216, + "learning_rate": 8.750000000000001e-06, + "loss": 2.4677, + "step": 35 + }, + { + "epoch": 0.013886210221793635, + "grad_norm": 0.5710345213527928, + "learning_rate": 9e-06, + "loss": 2.341, + "step": 36 + }, + { + "epoch": 0.014271938283510125, + "grad_norm": 0.6565216595062554, + "learning_rate": 9.250000000000001e-06, + "loss": 2.4151, + "step": 37 + }, + { + "epoch": 0.014657666345226614, + "grad_norm": 0.5518571592310421, + "learning_rate": 9.5e-06, + "loss": 2.3074, + "step": 38 + }, + { + "epoch": 0.015043394406943106, + "grad_norm": 0.5187339842543488, + "learning_rate": 9.75e-06, + "loss": 2.4212, + "step": 39 + }, + { + "epoch": 0.015429122468659595, + "grad_norm": 0.5563829699152197, + "learning_rate": 1e-05, + "loss": 2.4743, + "step": 40 + }, + { + "epoch": 0.015814850530376085, + "grad_norm": 0.5041845776833512, + "learning_rate": 9.999996211403454e-06, + "loss": 2.3575, + "step": 41 + }, + { + "epoch": 0.016200578592092573, + "grad_norm": 0.48969986698988993, + "learning_rate": 9.999984845619553e-06, + "loss": 2.4481, + "step": 42 + }, + { + "epoch": 0.016586306653809064, + "grad_norm": 0.4802875056014886, + "learning_rate": 9.999965902665524e-06, + "loss": 2.4324, + "step": 43 + }, + { + "epoch": 0.016972034715525556, + "grad_norm": 0.4966005227112398, + "learning_rate": 9.999939382570075e-06, + "loss": 2.4681, + "step": 44 + }, + { + "epoch": 0.017357762777242044, + "grad_norm": 0.504548661198233, + "learning_rate": 9.999905285373392e-06, + "loss": 2.3829, + "step": 45 + }, + { + "epoch": 0.017743490838958535, + "grad_norm": 0.49530725000256803, + "learning_rate": 9.999863611127149e-06, + "loss": 2.3726, + "step": 46 + }, + { + "epoch": 0.018129218900675023, + "grad_norm": 0.496879485860879, + "learning_rate": 9.999814359894501e-06, 
+ "loss": 2.355, + "step": 47 + }, + { + "epoch": 0.018514946962391515, + "grad_norm": 0.5349294236673756, + "learning_rate": 9.999757531750086e-06, + "loss": 2.3913, + "step": 48 + }, + { + "epoch": 0.018900675024108003, + "grad_norm": 0.48220878118791843, + "learning_rate": 9.999693126780022e-06, + "loss": 2.3789, + "step": 49 + }, + { + "epoch": 0.019286403085824494, + "grad_norm": 0.4778862019205235, + "learning_rate": 9.99962114508191e-06, + "loss": 2.4296, + "step": 50 + }, + { + "epoch": 0.019672131147540985, + "grad_norm": 0.4973185957898752, + "learning_rate": 9.999541586764836e-06, + "loss": 2.4465, + "step": 51 + }, + { + "epoch": 0.020057859209257473, + "grad_norm": 0.49336298166277864, + "learning_rate": 9.999454451949364e-06, + "loss": 2.4485, + "step": 52 + }, + { + "epoch": 0.020443587270973965, + "grad_norm": 0.48791115925851264, + "learning_rate": 9.999359740767545e-06, + "loss": 2.4737, + "step": 53 + }, + { + "epoch": 0.020829315332690453, + "grad_norm": 0.49124507643462106, + "learning_rate": 9.999257453362903e-06, + "loss": 2.364, + "step": 54 + }, + { + "epoch": 0.021215043394406944, + "grad_norm": 0.5260600198155415, + "learning_rate": 9.999147589890452e-06, + "loss": 2.3577, + "step": 55 + }, + { + "epoch": 0.021600771456123432, + "grad_norm": 0.4646617505959831, + "learning_rate": 9.999030150516681e-06, + "loss": 2.4449, + "step": 56 + }, + { + "epoch": 0.021986499517839923, + "grad_norm": 0.4841418940374766, + "learning_rate": 9.998905135419564e-06, + "loss": 2.4372, + "step": 57 + }, + { + "epoch": 0.02237222757955641, + "grad_norm": 0.48770580532041086, + "learning_rate": 9.998772544788552e-06, + "loss": 2.4455, + "step": 58 + }, + { + "epoch": 0.022757955641272903, + "grad_norm": 0.5108619805179353, + "learning_rate": 9.99863237882458e-06, + "loss": 2.4247, + "step": 59 + }, + { + "epoch": 0.023143683702989394, + "grad_norm": 0.48302169372077075, + "learning_rate": 9.998484637740058e-06, + "loss": 2.3931, + "step": 60 + }, + { + "epoch": 0.023529411764705882, + "grad_norm": 0.491126215051955, + "learning_rate": 9.998329321758882e-06, + "loss": 2.402, + "step": 61 + }, + { + "epoch": 0.023915139826422373, + "grad_norm": 0.46744989065036346, + "learning_rate": 9.998166431116421e-06, + "loss": 2.354, + "step": 62 + }, + { + "epoch": 0.02430086788813886, + "grad_norm": 0.5088343527314104, + "learning_rate": 9.997995966059526e-06, + "loss": 2.4644, + "step": 63 + }, + { + "epoch": 0.024686595949855353, + "grad_norm": 0.4601911278579166, + "learning_rate": 9.997817926846528e-06, + "loss": 2.3642, + "step": 64 + }, + { + "epoch": 0.02507232401157184, + "grad_norm": 0.45558237724734235, + "learning_rate": 9.997632313747236e-06, + "loss": 2.3585, + "step": 65 + }, + { + "epoch": 0.025458052073288332, + "grad_norm": 0.5098804291523603, + "learning_rate": 9.99743912704293e-06, + "loss": 2.4082, + "step": 66 + }, + { + "epoch": 0.02584378013500482, + "grad_norm": 0.48981215111458465, + "learning_rate": 9.997238367026376e-06, + "loss": 2.3659, + "step": 67 + }, + { + "epoch": 0.02622950819672131, + "grad_norm": 0.46146276348442455, + "learning_rate": 9.997030034001815e-06, + "loss": 2.4289, + "step": 68 + }, + { + "epoch": 0.026615236258437803, + "grad_norm": 0.4866095040257039, + "learning_rate": 9.99681412828496e-06, + "loss": 2.4155, + "step": 69 + }, + { + "epoch": 0.02700096432015429, + "grad_norm": 0.44699856443081387, + "learning_rate": 9.996590650203003e-06, + "loss": 2.418, + "step": 70 + }, + { + "epoch": 0.027386692381870782, + "grad_norm": 0.45814399872315087, 
+ "learning_rate": 9.996359600094612e-06, + "loss": 2.3706, + "step": 71 + }, + { + "epoch": 0.02777242044358727, + "grad_norm": 0.4581038253559308, + "learning_rate": 9.99612097830993e-06, + "loss": 2.3944, + "step": 72 + }, + { + "epoch": 0.02815814850530376, + "grad_norm": 0.4436937845767424, + "learning_rate": 9.995874785210573e-06, + "loss": 2.3962, + "step": 73 + }, + { + "epoch": 0.02854387656702025, + "grad_norm": 0.5058877161626817, + "learning_rate": 9.995621021169632e-06, + "loss": 2.4237, + "step": 74 + }, + { + "epoch": 0.02892960462873674, + "grad_norm": 0.4793468391613926, + "learning_rate": 9.99535968657167e-06, + "loss": 2.4446, + "step": 75 + }, + { + "epoch": 0.02931533269045323, + "grad_norm": 0.4796523581998484, + "learning_rate": 9.995090781812724e-06, + "loss": 2.4206, + "step": 76 + }, + { + "epoch": 0.02970106075216972, + "grad_norm": 0.49382659971704934, + "learning_rate": 9.994814307300302e-06, + "loss": 2.3688, + "step": 77 + }, + { + "epoch": 0.03008678881388621, + "grad_norm": 0.5288191844079528, + "learning_rate": 9.994530263453385e-06, + "loss": 2.4649, + "step": 78 + }, + { + "epoch": 0.0304725168756027, + "grad_norm": 0.46921741488140223, + "learning_rate": 9.994238650702425e-06, + "loss": 2.3567, + "step": 79 + }, + { + "epoch": 0.03085824493731919, + "grad_norm": 0.4840280310683048, + "learning_rate": 9.993939469489342e-06, + "loss": 2.4211, + "step": 80 + }, + { + "epoch": 0.03124397299903568, + "grad_norm": 0.5169497417227851, + "learning_rate": 9.993632720267526e-06, + "loss": 2.4632, + "step": 81 + }, + { + "epoch": 0.03162970106075217, + "grad_norm": 0.6302728557829981, + "learning_rate": 9.993318403501838e-06, + "loss": 2.525, + "step": 82 + }, + { + "epoch": 0.03201542912246866, + "grad_norm": 0.46673916252446257, + "learning_rate": 9.992996519668603e-06, + "loss": 2.3906, + "step": 83 + }, + { + "epoch": 0.032401157184185146, + "grad_norm": 0.4938456663759333, + "learning_rate": 9.99266706925562e-06, + "loss": 2.4276, + "step": 84 + }, + { + "epoch": 0.03278688524590164, + "grad_norm": 0.4561673092917819, + "learning_rate": 9.99233005276215e-06, + "loss": 2.4089, + "step": 85 + }, + { + "epoch": 0.03317261330761813, + "grad_norm": 0.4775094525872891, + "learning_rate": 9.991985470698918e-06, + "loss": 2.4591, + "step": 86 + }, + { + "epoch": 0.03355834136933462, + "grad_norm": 0.4450031576193394, + "learning_rate": 9.99163332358812e-06, + "loss": 2.3826, + "step": 87 + }, + { + "epoch": 0.03394406943105111, + "grad_norm": 0.4728950026349946, + "learning_rate": 9.991273611963413e-06, + "loss": 2.3936, + "step": 88 + }, + { + "epoch": 0.0343297974927676, + "grad_norm": 0.4504962222392687, + "learning_rate": 9.990906336369917e-06, + "loss": 2.4611, + "step": 89 + }, + { + "epoch": 0.03471552555448409, + "grad_norm": 0.45006031421593823, + "learning_rate": 9.990531497364215e-06, + "loss": 2.3998, + "step": 90 + }, + { + "epoch": 0.035101253616200576, + "grad_norm": 0.45430590871658183, + "learning_rate": 9.990149095514354e-06, + "loss": 2.4285, + "step": 91 + }, + { + "epoch": 0.03548698167791707, + "grad_norm": 0.48930021919773625, + "learning_rate": 9.98975913139984e-06, + "loss": 2.4687, + "step": 92 + }, + { + "epoch": 0.03587270973963356, + "grad_norm": 0.4458311534356548, + "learning_rate": 9.989361605611638e-06, + "loss": 2.439, + "step": 93 + }, + { + "epoch": 0.036258437801350046, + "grad_norm": 0.4829210847535642, + "learning_rate": 9.988956518752178e-06, + "loss": 2.3716, + "step": 94 + }, + { + "epoch": 0.03664416586306654, + "grad_norm": 
0.4860495403190378, + "learning_rate": 9.988543871435342e-06, + "loss": 2.3246, + "step": 95 + }, + { + "epoch": 0.03702989392478303, + "grad_norm": 0.4590978918787668, + "learning_rate": 9.98812366428647e-06, + "loss": 2.3632, + "step": 96 + }, + { + "epoch": 0.03741562198649952, + "grad_norm": 0.5170375903870793, + "learning_rate": 9.98769589794236e-06, + "loss": 2.3509, + "step": 97 + }, + { + "epoch": 0.037801350048216005, + "grad_norm": 0.4930988306444351, + "learning_rate": 9.987260573051268e-06, + "loss": 2.4131, + "step": 98 + }, + { + "epoch": 0.0381870781099325, + "grad_norm": 0.5027035108292636, + "learning_rate": 9.986817690272902e-06, + "loss": 2.3941, + "step": 99 + }, + { + "epoch": 0.03857280617164899, + "grad_norm": 0.5253968168941702, + "learning_rate": 9.986367250278423e-06, + "loss": 2.3307, + "step": 100 + }, + { + "epoch": 0.038958534233365476, + "grad_norm": 0.4719637441013078, + "learning_rate": 9.985909253750446e-06, + "loss": 2.3666, + "step": 101 + }, + { + "epoch": 0.03934426229508197, + "grad_norm": 0.4712787856936807, + "learning_rate": 9.985443701383035e-06, + "loss": 2.3941, + "step": 102 + }, + { + "epoch": 0.03972999035679846, + "grad_norm": 0.46114806739089415, + "learning_rate": 9.984970593881706e-06, + "loss": 2.4337, + "step": 103 + }, + { + "epoch": 0.040115718418514947, + "grad_norm": 0.46850436431850573, + "learning_rate": 9.984489931963429e-06, + "loss": 2.4107, + "step": 104 + }, + { + "epoch": 0.040501446480231434, + "grad_norm": 0.49937095248671337, + "learning_rate": 9.984001716356611e-06, + "loss": 2.382, + "step": 105 + }, + { + "epoch": 0.04088717454194793, + "grad_norm": 0.4597593612632286, + "learning_rate": 9.983505947801115e-06, + "loss": 2.4465, + "step": 106 + }, + { + "epoch": 0.04127290260366442, + "grad_norm": 0.5102293641150926, + "learning_rate": 9.983002627048248e-06, + "loss": 2.4051, + "step": 107 + }, + { + "epoch": 0.041658630665380905, + "grad_norm": 0.48329948878166523, + "learning_rate": 9.982491754860763e-06, + "loss": 2.4089, + "step": 108 + }, + { + "epoch": 0.04204435872709739, + "grad_norm": 0.4985851779622545, + "learning_rate": 9.981973332012856e-06, + "loss": 2.3836, + "step": 109 + }, + { + "epoch": 0.04243008678881389, + "grad_norm": 0.46047059896927356, + "learning_rate": 9.981447359290162e-06, + "loss": 2.3619, + "step": 110 + }, + { + "epoch": 0.042815814850530376, + "grad_norm": 0.45941412104146895, + "learning_rate": 9.980913837489763e-06, + "loss": 2.4467, + "step": 111 + }, + { + "epoch": 0.043201542912246864, + "grad_norm": 0.5279912896817367, + "learning_rate": 9.980372767420179e-06, + "loss": 2.3873, + "step": 112 + }, + { + "epoch": 0.04358727097396336, + "grad_norm": 0.5173284181396758, + "learning_rate": 9.979824149901365e-06, + "loss": 2.3712, + "step": 113 + }, + { + "epoch": 0.04397299903567985, + "grad_norm": 0.46425124191934264, + "learning_rate": 9.979267985764717e-06, + "loss": 2.3972, + "step": 114 + }, + { + "epoch": 0.044358727097396335, + "grad_norm": 0.47210175255183745, + "learning_rate": 9.978704275853073e-06, + "loss": 2.4281, + "step": 115 + }, + { + "epoch": 0.04474445515911282, + "grad_norm": 0.5117270444340153, + "learning_rate": 9.978133021020697e-06, + "loss": 2.5032, + "step": 116 + }, + { + "epoch": 0.04513018322082932, + "grad_norm": 0.45237840748789165, + "learning_rate": 9.977554222133293e-06, + "loss": 2.2994, + "step": 117 + }, + { + "epoch": 0.045515911282545805, + "grad_norm": 0.4829428321396551, + "learning_rate": 9.97696788006799e-06, + "loss": 2.3987, + "step": 118 + 
}, + { + "epoch": 0.04590163934426229, + "grad_norm": 0.4576774127420072, + "learning_rate": 9.976373995713358e-06, + "loss": 2.3653, + "step": 119 + }, + { + "epoch": 0.04628736740597879, + "grad_norm": 0.4743705723346769, + "learning_rate": 9.97577256996939e-06, + "loss": 2.4184, + "step": 120 + }, + { + "epoch": 0.046673095467695276, + "grad_norm": 0.46090426942328017, + "learning_rate": 9.975163603747513e-06, + "loss": 2.3988, + "step": 121 + }, + { + "epoch": 0.047058823529411764, + "grad_norm": 0.452297730740023, + "learning_rate": 9.974547097970576e-06, + "loss": 2.4127, + "step": 122 + }, + { + "epoch": 0.04744455159112825, + "grad_norm": 0.45844755127239656, + "learning_rate": 9.973923053572854e-06, + "loss": 2.3684, + "step": 123 + }, + { + "epoch": 0.04783027965284475, + "grad_norm": 0.49492845581087924, + "learning_rate": 9.97329147150005e-06, + "loss": 2.4074, + "step": 124 + }, + { + "epoch": 0.048216007714561235, + "grad_norm": 0.484978985690606, + "learning_rate": 9.972652352709287e-06, + "loss": 2.468, + "step": 125 + }, + { + "epoch": 0.04860173577627772, + "grad_norm": 0.4611298393733667, + "learning_rate": 9.972005698169112e-06, + "loss": 2.4323, + "step": 126 + }, + { + "epoch": 0.04898746383799421, + "grad_norm": 0.4646255015376031, + "learning_rate": 9.971351508859488e-06, + "loss": 2.4634, + "step": 127 + }, + { + "epoch": 0.049373191899710706, + "grad_norm": 0.46220388970928555, + "learning_rate": 9.970689785771798e-06, + "loss": 2.3982, + "step": 128 + }, + { + "epoch": 0.049758919961427193, + "grad_norm": 0.47071092914994284, + "learning_rate": 9.970020529908846e-06, + "loss": 2.4697, + "step": 129 + }, + { + "epoch": 0.05014464802314368, + "grad_norm": 0.5604351159632398, + "learning_rate": 9.969343742284847e-06, + "loss": 2.3377, + "step": 130 + }, + { + "epoch": 0.050530376084860176, + "grad_norm": 0.47620334301858896, + "learning_rate": 9.968659423925429e-06, + "loss": 2.3535, + "step": 131 + }, + { + "epoch": 0.050916104146576664, + "grad_norm": 0.4828800048233955, + "learning_rate": 9.96796757586764e-06, + "loss": 2.4521, + "step": 132 + }, + { + "epoch": 0.05130183220829315, + "grad_norm": 0.4595892831688092, + "learning_rate": 9.967268199159926e-06, + "loss": 2.3431, + "step": 133 + }, + { + "epoch": 0.05168756027000964, + "grad_norm": 0.46535351807962866, + "learning_rate": 9.96656129486215e-06, + "loss": 2.336, + "step": 134 + }, + { + "epoch": 0.052073288331726135, + "grad_norm": 0.5597994767184472, + "learning_rate": 9.96584686404559e-06, + "loss": 2.399, + "step": 135 + }, + { + "epoch": 0.05245901639344262, + "grad_norm": 0.5182784238006419, + "learning_rate": 9.965124907792916e-06, + "loss": 2.419, + "step": 136 + }, + { + "epoch": 0.05284474445515911, + "grad_norm": 0.484806699314445, + "learning_rate": 9.964395427198208e-06, + "loss": 2.4725, + "step": 137 + }, + { + "epoch": 0.053230472516875606, + "grad_norm": 0.46748733607259835, + "learning_rate": 9.963658423366951e-06, + "loss": 2.3721, + "step": 138 + }, + { + "epoch": 0.053616200578592094, + "grad_norm": 0.45804476035104996, + "learning_rate": 9.962913897416029e-06, + "loss": 2.4362, + "step": 139 + }, + { + "epoch": 0.05400192864030858, + "grad_norm": 0.5644951235988059, + "learning_rate": 9.962161850473723e-06, + "loss": 2.4108, + "step": 140 + }, + { + "epoch": 0.05438765670202507, + "grad_norm": 0.4584390238284688, + "learning_rate": 9.961402283679718e-06, + "loss": 2.3989, + "step": 141 + }, + { + "epoch": 0.054773384763741564, + "grad_norm": 0.4539282651014262, + "learning_rate": 
9.960635198185088e-06, + "loss": 2.3916, + "step": 142 + }, + { + "epoch": 0.05515911282545805, + "grad_norm": 0.4413326116727136, + "learning_rate": 9.959860595152305e-06, + "loss": 2.3971, + "step": 143 + }, + { + "epoch": 0.05554484088717454, + "grad_norm": 0.4883760828911208, + "learning_rate": 9.95907847575523e-06, + "loss": 2.3996, + "step": 144 + }, + { + "epoch": 0.055930568948891035, + "grad_norm": 0.4509831906670727, + "learning_rate": 9.958288841179121e-06, + "loss": 2.3887, + "step": 145 + }, + { + "epoch": 0.05631629701060752, + "grad_norm": 0.48204373539948087, + "learning_rate": 9.957491692620618e-06, + "loss": 2.3994, + "step": 146 + }, + { + "epoch": 0.05670202507232401, + "grad_norm": 0.5064694164808567, + "learning_rate": 9.956687031287752e-06, + "loss": 2.4212, + "step": 147 + }, + { + "epoch": 0.0570877531340405, + "grad_norm": 0.47286697840128156, + "learning_rate": 9.955874858399936e-06, + "loss": 2.4378, + "step": 148 + }, + { + "epoch": 0.057473481195756994, + "grad_norm": 0.45461008312404527, + "learning_rate": 9.955055175187971e-06, + "loss": 2.427, + "step": 149 + }, + { + "epoch": 0.05785920925747348, + "grad_norm": 0.5304097965230556, + "learning_rate": 9.954227982894034e-06, + "loss": 2.3624, + "step": 150 + }, + { + "epoch": 0.05824493731918997, + "grad_norm": 0.48513100057264075, + "learning_rate": 9.953393282771686e-06, + "loss": 2.3925, + "step": 151 + }, + { + "epoch": 0.05863066538090646, + "grad_norm": 0.5064590654892867, + "learning_rate": 9.952551076085864e-06, + "loss": 2.4262, + "step": 152 + }, + { + "epoch": 0.05901639344262295, + "grad_norm": 0.4579406686580277, + "learning_rate": 9.951701364112877e-06, + "loss": 2.311, + "step": 153 + }, + { + "epoch": 0.05940212150433944, + "grad_norm": 0.48021046645886806, + "learning_rate": 9.950844148140414e-06, + "loss": 2.414, + "step": 154 + }, + { + "epoch": 0.05978784956605593, + "grad_norm": 0.4384289042787429, + "learning_rate": 9.949979429467534e-06, + "loss": 2.4737, + "step": 155 + }, + { + "epoch": 0.06017357762777242, + "grad_norm": 0.4860862476128489, + "learning_rate": 9.949107209404664e-06, + "loss": 2.4112, + "step": 156 + }, + { + "epoch": 0.06055930568948891, + "grad_norm": 0.47596173584160284, + "learning_rate": 9.948227489273601e-06, + "loss": 2.4056, + "step": 157 + }, + { + "epoch": 0.0609450337512054, + "grad_norm": 0.5025215866684043, + "learning_rate": 9.947340270407504e-06, + "loss": 2.4447, + "step": 158 + }, + { + "epoch": 0.06133076181292189, + "grad_norm": 0.46084735188375686, + "learning_rate": 9.946445554150902e-06, + "loss": 2.462, + "step": 159 + }, + { + "epoch": 0.06171648987463838, + "grad_norm": 0.45982518950698714, + "learning_rate": 9.945543341859681e-06, + "loss": 2.3593, + "step": 160 + }, + { + "epoch": 0.06210221793635487, + "grad_norm": 0.4945780654463559, + "learning_rate": 9.94463363490109e-06, + "loss": 2.3348, + "step": 161 + }, + { + "epoch": 0.06248794599807136, + "grad_norm": 0.4872729237867549, + "learning_rate": 9.94371643465373e-06, + "loss": 2.392, + "step": 162 + }, + { + "epoch": 0.06287367405978785, + "grad_norm": 0.4605048563245808, + "learning_rate": 9.942791742507565e-06, + "loss": 2.4605, + "step": 163 + }, + { + "epoch": 0.06325940212150434, + "grad_norm": 0.4482486921811318, + "learning_rate": 9.94185955986391e-06, + "loss": 2.4488, + "step": 164 + }, + { + "epoch": 0.06364513018322084, + "grad_norm": 0.47037373849736897, + "learning_rate": 9.940919888135428e-06, + "loss": 2.3973, + "step": 165 + }, + { + "epoch": 0.06403085824493732, + 
"grad_norm": 0.45236768246003195, + "learning_rate": 9.939972728746134e-06, + "loss": 2.3402, + "step": 166 + }, + { + "epoch": 0.06441658630665381, + "grad_norm": 0.4779222209932742, + "learning_rate": 9.939018083131391e-06, + "loss": 2.363, + "step": 167 + }, + { + "epoch": 0.06480231436837029, + "grad_norm": 0.4901328716880792, + "learning_rate": 9.938055952737908e-06, + "loss": 2.3983, + "step": 168 + }, + { + "epoch": 0.06518804243008679, + "grad_norm": 0.4750880213070407, + "learning_rate": 9.937086339023731e-06, + "loss": 2.4079, + "step": 169 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 0.4706342992387472, + "learning_rate": 9.93610924345825e-06, + "loss": 2.3022, + "step": 170 + }, + { + "epoch": 0.06595949855351976, + "grad_norm": 0.50942295471628, + "learning_rate": 9.935124667522196e-06, + "loss": 2.4003, + "step": 171 + }, + { + "epoch": 0.06634522661523626, + "grad_norm": 0.4648678544986133, + "learning_rate": 9.934132612707631e-06, + "loss": 2.3952, + "step": 172 + }, + { + "epoch": 0.06673095467695275, + "grad_norm": 0.4849676435310516, + "learning_rate": 9.933133080517956e-06, + "loss": 2.3687, + "step": 173 + }, + { + "epoch": 0.06711668273866923, + "grad_norm": 0.45490408653760767, + "learning_rate": 9.932126072467897e-06, + "loss": 2.3959, + "step": 174 + }, + { + "epoch": 0.06750241080038573, + "grad_norm": 0.46281412003095096, + "learning_rate": 9.931111590083516e-06, + "loss": 2.4556, + "step": 175 + }, + { + "epoch": 0.06788813886210222, + "grad_norm": 0.5152142848860477, + "learning_rate": 9.930089634902197e-06, + "loss": 2.4501, + "step": 176 + }, + { + "epoch": 0.0682738669238187, + "grad_norm": 0.4688795671612781, + "learning_rate": 9.92906020847265e-06, + "loss": 2.4516, + "step": 177 + }, + { + "epoch": 0.0686595949855352, + "grad_norm": 0.45994214817232393, + "learning_rate": 9.92802331235491e-06, + "loss": 2.4451, + "step": 178 + }, + { + "epoch": 0.0690453230472517, + "grad_norm": 0.4621313038771384, + "learning_rate": 9.926978948120327e-06, + "loss": 2.4197, + "step": 179 + }, + { + "epoch": 0.06943105110896818, + "grad_norm": 0.4582006758782177, + "learning_rate": 9.925927117351573e-06, + "loss": 2.4572, + "step": 180 + }, + { + "epoch": 0.06981677917068467, + "grad_norm": 0.4779378215804943, + "learning_rate": 9.92486782164263e-06, + "loss": 2.3648, + "step": 181 + }, + { + "epoch": 0.07020250723240115, + "grad_norm": 0.44552167365942835, + "learning_rate": 9.923801062598799e-06, + "loss": 2.3999, + "step": 182 + }, + { + "epoch": 0.07058823529411765, + "grad_norm": 0.4962343901077161, + "learning_rate": 9.922726841836685e-06, + "loss": 2.3891, + "step": 183 + }, + { + "epoch": 0.07097396335583414, + "grad_norm": 0.4859300553836987, + "learning_rate": 9.921645160984205e-06, + "loss": 2.3188, + "step": 184 + }, + { + "epoch": 0.07135969141755062, + "grad_norm": 0.49044136281300504, + "learning_rate": 9.92055602168058e-06, + "loss": 2.4622, + "step": 185 + }, + { + "epoch": 0.07174541947926712, + "grad_norm": 0.5242947128137062, + "learning_rate": 9.919459425576334e-06, + "loss": 2.3797, + "step": 186 + }, + { + "epoch": 0.07213114754098361, + "grad_norm": 0.4982936128608122, + "learning_rate": 9.918355374333292e-06, + "loss": 2.4064, + "step": 187 + }, + { + "epoch": 0.07251687560270009, + "grad_norm": 0.47856529619520927, + "learning_rate": 9.917243869624573e-06, + "loss": 2.2904, + "step": 188 + }, + { + "epoch": 0.07290260366441659, + "grad_norm": 0.4760751948697406, + "learning_rate": 9.916124913134594e-06, + "loss": 2.3582, + "step": 189 + 
}, + { + "epoch": 0.07328833172613308, + "grad_norm": 0.4639298555212939, + "learning_rate": 9.91499850655907e-06, + "loss": 2.3503, + "step": 190 + }, + { + "epoch": 0.07367405978784956, + "grad_norm": 0.4807769790534427, + "learning_rate": 9.913864651604996e-06, + "loss": 2.399, + "step": 191 + }, + { + "epoch": 0.07405978784956606, + "grad_norm": 0.5044607391473807, + "learning_rate": 9.91272334999066e-06, + "loss": 2.3792, + "step": 192 + }, + { + "epoch": 0.07444551591128254, + "grad_norm": 0.4575543809543949, + "learning_rate": 9.911574603445637e-06, + "loss": 2.3502, + "step": 193 + }, + { + "epoch": 0.07483124397299903, + "grad_norm": 0.46442861911525574, + "learning_rate": 9.91041841371078e-06, + "loss": 2.3847, + "step": 194 + }, + { + "epoch": 0.07521697203471553, + "grad_norm": 0.4663103425438184, + "learning_rate": 9.909254782538225e-06, + "loss": 2.3995, + "step": 195 + }, + { + "epoch": 0.07560270009643201, + "grad_norm": 0.502061724200111, + "learning_rate": 9.908083711691383e-06, + "loss": 2.3949, + "step": 196 + }, + { + "epoch": 0.0759884281581485, + "grad_norm": 0.4772691500713791, + "learning_rate": 9.906905202944939e-06, + "loss": 2.3846, + "step": 197 + }, + { + "epoch": 0.076374156219865, + "grad_norm": 0.4568898979377072, + "learning_rate": 9.905719258084852e-06, + "loss": 2.3305, + "step": 198 + }, + { + "epoch": 0.07675988428158148, + "grad_norm": 0.4249948751433217, + "learning_rate": 9.904525878908347e-06, + "loss": 2.3695, + "step": 199 + }, + { + "epoch": 0.07714561234329798, + "grad_norm": 0.48786089598149546, + "learning_rate": 9.903325067223918e-06, + "loss": 2.35, + "step": 200 + }, + { + "epoch": 0.07753134040501447, + "grad_norm": 0.47807928221591933, + "learning_rate": 9.902116824851323e-06, + "loss": 2.3781, + "step": 201 + }, + { + "epoch": 0.07791706846673095, + "grad_norm": 0.46074562043721295, + "learning_rate": 9.900901153621576e-06, + "loss": 2.3326, + "step": 202 + }, + { + "epoch": 0.07830279652844745, + "grad_norm": 0.4785080056291243, + "learning_rate": 9.899678055376955e-06, + "loss": 2.352, + "step": 203 + }, + { + "epoch": 0.07868852459016394, + "grad_norm": 0.49822216149179815, + "learning_rate": 9.898447531970989e-06, + "loss": 2.329, + "step": 204 + }, + { + "epoch": 0.07907425265188042, + "grad_norm": 0.5074455882320913, + "learning_rate": 9.897209585268459e-06, + "loss": 2.2767, + "step": 205 + }, + { + "epoch": 0.07945998071359692, + "grad_norm": 0.48299990133600534, + "learning_rate": 9.8959642171454e-06, + "loss": 2.3657, + "step": 206 + }, + { + "epoch": 0.0798457087753134, + "grad_norm": 0.4606233819521811, + "learning_rate": 9.89471142948909e-06, + "loss": 2.4016, + "step": 207 + }, + { + "epoch": 0.08023143683702989, + "grad_norm": 0.44478840251929497, + "learning_rate": 9.893451224198051e-06, + "loss": 2.3426, + "step": 208 + }, + { + "epoch": 0.08061716489874639, + "grad_norm": 0.46445194930483236, + "learning_rate": 9.892183603182048e-06, + "loss": 2.3325, + "step": 209 + }, + { + "epoch": 0.08100289296046287, + "grad_norm": 0.5371784356002836, + "learning_rate": 9.890908568362083e-06, + "loss": 2.4046, + "step": 210 + }, + { + "epoch": 0.08138862102217936, + "grad_norm": 0.4971218959499928, + "learning_rate": 9.889626121670391e-06, + "loss": 2.3784, + "step": 211 + }, + { + "epoch": 0.08177434908389586, + "grad_norm": 0.4783572091434314, + "learning_rate": 9.888336265050443e-06, + "loss": 2.3865, + "step": 212 + }, + { + "epoch": 0.08216007714561234, + "grad_norm": 0.510470355835948, + "learning_rate": 
9.887039000456937e-06, + "loss": 2.3624, + "step": 213 + }, + { + "epoch": 0.08254580520732883, + "grad_norm": 0.4613120488085547, + "learning_rate": 9.885734329855798e-06, + "loss": 2.4258, + "step": 214 + }, + { + "epoch": 0.08293153326904533, + "grad_norm": 0.43709866085432875, + "learning_rate": 9.884422255224175e-06, + "loss": 2.308, + "step": 215 + }, + { + "epoch": 0.08331726133076181, + "grad_norm": 0.4857474219872402, + "learning_rate": 9.883102778550434e-06, + "loss": 2.3864, + "step": 216 + }, + { + "epoch": 0.0837029893924783, + "grad_norm": 0.5003412853512436, + "learning_rate": 9.881775901834164e-06, + "loss": 2.4105, + "step": 217 + }, + { + "epoch": 0.08408871745419479, + "grad_norm": 0.48105392183158313, + "learning_rate": 9.880441627086163e-06, + "loss": 2.4384, + "step": 218 + }, + { + "epoch": 0.08447444551591128, + "grad_norm": 0.46812824003346093, + "learning_rate": 9.879099956328443e-06, + "loss": 2.4241, + "step": 219 + }, + { + "epoch": 0.08486017357762778, + "grad_norm": 0.5071249854343619, + "learning_rate": 9.877750891594224e-06, + "loss": 2.3748, + "step": 220 + }, + { + "epoch": 0.08524590163934426, + "grad_norm": 0.47055633644804373, + "learning_rate": 9.876394434927931e-06, + "loss": 2.3642, + "step": 221 + }, + { + "epoch": 0.08563162970106075, + "grad_norm": 0.4913503211525076, + "learning_rate": 9.875030588385192e-06, + "loss": 2.3467, + "step": 222 + }, + { + "epoch": 0.08601735776277725, + "grad_norm": 0.4742540637522417, + "learning_rate": 9.873659354032829e-06, + "loss": 2.345, + "step": 223 + }, + { + "epoch": 0.08640308582449373, + "grad_norm": 0.4627514802436143, + "learning_rate": 9.872280733948867e-06, + "loss": 2.4574, + "step": 224 + }, + { + "epoch": 0.08678881388621022, + "grad_norm": 0.45535902185916416, + "learning_rate": 9.87089473022252e-06, + "loss": 2.4138, + "step": 225 + }, + { + "epoch": 0.08717454194792672, + "grad_norm": 0.43825210670158116, + "learning_rate": 9.869501344954188e-06, + "loss": 2.4522, + "step": 226 + }, + { + "epoch": 0.0875602700096432, + "grad_norm": 0.5354119753902198, + "learning_rate": 9.868100580255466e-06, + "loss": 2.3617, + "step": 227 + }, + { + "epoch": 0.0879459980713597, + "grad_norm": 0.46706788377779374, + "learning_rate": 9.866692438249124e-06, + "loss": 2.4118, + "step": 228 + }, + { + "epoch": 0.08833172613307617, + "grad_norm": 0.4471121597083322, + "learning_rate": 9.865276921069113e-06, + "loss": 2.3571, + "step": 229 + }, + { + "epoch": 0.08871745419479267, + "grad_norm": 0.4572982101835507, + "learning_rate": 9.863854030860566e-06, + "loss": 2.3805, + "step": 230 + }, + { + "epoch": 0.08910318225650916, + "grad_norm": 0.45200242655619743, + "learning_rate": 9.862423769779784e-06, + "loss": 2.4114, + "step": 231 + }, + { + "epoch": 0.08948891031822565, + "grad_norm": 0.48459327556912984, + "learning_rate": 9.86098613999424e-06, + "loss": 2.4248, + "step": 232 + }, + { + "epoch": 0.08987463837994214, + "grad_norm": 0.4382958973086269, + "learning_rate": 9.859541143682573e-06, + "loss": 2.3683, + "step": 233 + }, + { + "epoch": 0.09026036644165863, + "grad_norm": 0.5249998518789483, + "learning_rate": 9.858088783034587e-06, + "loss": 2.3673, + "step": 234 + }, + { + "epoch": 0.09064609450337512, + "grad_norm": 0.49347712460532517, + "learning_rate": 9.856629060251247e-06, + "loss": 2.486, + "step": 235 + }, + { + "epoch": 0.09103182256509161, + "grad_norm": 0.5340059340122717, + "learning_rate": 9.855161977544672e-06, + "loss": 2.4404, + "step": 236 + }, + { + "epoch": 0.0914175506268081, + 
"grad_norm": 0.5052074730677552, + "learning_rate": 9.853687537138132e-06, + "loss": 2.3487, + "step": 237 + }, + { + "epoch": 0.09180327868852459, + "grad_norm": 0.4584681855122188, + "learning_rate": 9.852205741266058e-06, + "loss": 2.4361, + "step": 238 + }, + { + "epoch": 0.09218900675024108, + "grad_norm": 0.45006330044968135, + "learning_rate": 9.850716592174016e-06, + "loss": 2.4544, + "step": 239 + }, + { + "epoch": 0.09257473481195758, + "grad_norm": 0.45259872735656165, + "learning_rate": 9.849220092118721e-06, + "loss": 2.2847, + "step": 240 + }, + { + "epoch": 0.09296046287367406, + "grad_norm": 0.4868831359383384, + "learning_rate": 9.847716243368027e-06, + "loss": 2.3389, + "step": 241 + }, + { + "epoch": 0.09334619093539055, + "grad_norm": 0.4760449473372184, + "learning_rate": 9.846205048200926e-06, + "loss": 2.3317, + "step": 242 + }, + { + "epoch": 0.09373191899710703, + "grad_norm": 0.4438227065202477, + "learning_rate": 9.844686508907538e-06, + "loss": 2.3582, + "step": 243 + }, + { + "epoch": 0.09411764705882353, + "grad_norm": 0.43517075741433026, + "learning_rate": 9.84316062778912e-06, + "loss": 2.365, + "step": 244 + }, + { + "epoch": 0.09450337512054002, + "grad_norm": 0.5025767914527827, + "learning_rate": 9.841627407158048e-06, + "loss": 2.3178, + "step": 245 + }, + { + "epoch": 0.0948891031822565, + "grad_norm": 0.4864659521213612, + "learning_rate": 9.840086849337825e-06, + "loss": 2.3858, + "step": 246 + }, + { + "epoch": 0.095274831243973, + "grad_norm": 0.44385232721166673, + "learning_rate": 9.838538956663073e-06, + "loss": 2.3473, + "step": 247 + }, + { + "epoch": 0.0956605593056895, + "grad_norm": 0.4353216339544687, + "learning_rate": 9.836983731479526e-06, + "loss": 2.3667, + "step": 248 + }, + { + "epoch": 0.09604628736740597, + "grad_norm": 0.4719586700058661, + "learning_rate": 9.835421176144035e-06, + "loss": 2.3507, + "step": 249 + }, + { + "epoch": 0.09643201542912247, + "grad_norm": 0.4721523839328503, + "learning_rate": 9.833851293024555e-06, + "loss": 2.4036, + "step": 250 + }, + { + "epoch": 0.09681774349083896, + "grad_norm": 0.43112104586716343, + "learning_rate": 9.832274084500147e-06, + "loss": 2.3394, + "step": 251 + }, + { + "epoch": 0.09720347155255545, + "grad_norm": 0.4645093917726468, + "learning_rate": 9.830689552960974e-06, + "loss": 2.3434, + "step": 252 + }, + { + "epoch": 0.09758919961427194, + "grad_norm": 0.4464501744666932, + "learning_rate": 9.829097700808298e-06, + "loss": 2.268, + "step": 253 + }, + { + "epoch": 0.09797492767598842, + "grad_norm": 0.4412531990241394, + "learning_rate": 9.827498530454473e-06, + "loss": 2.3434, + "step": 254 + }, + { + "epoch": 0.09836065573770492, + "grad_norm": 0.48563901834289575, + "learning_rate": 9.825892044322942e-06, + "loss": 2.4006, + "step": 255 + }, + { + "epoch": 0.09874638379942141, + "grad_norm": 0.48283553769044785, + "learning_rate": 9.824278244848236e-06, + "loss": 2.4335, + "step": 256 + }, + { + "epoch": 0.09913211186113789, + "grad_norm": 0.43055299801280267, + "learning_rate": 9.82265713447597e-06, + "loss": 2.3679, + "step": 257 + }, + { + "epoch": 0.09951783992285439, + "grad_norm": 0.449769379434095, + "learning_rate": 9.821028715662838e-06, + "loss": 2.3132, + "step": 258 + }, + { + "epoch": 0.09990356798457088, + "grad_norm": 0.5217734914013734, + "learning_rate": 9.819392990876605e-06, + "loss": 2.3516, + "step": 259 + }, + { + "epoch": 0.10028929604628736, + "grad_norm": 0.4615898258764757, + "learning_rate": 9.817749962596115e-06, + "loss": 2.3833, + "step": 
260 + }, + { + "epoch": 0.10067502410800386, + "grad_norm": 0.48582752923842243, + "learning_rate": 9.816099633311278e-06, + "loss": 2.3628, + "step": 261 + }, + { + "epoch": 0.10106075216972035, + "grad_norm": 0.4559771187990709, + "learning_rate": 9.814442005523062e-06, + "loss": 2.4158, + "step": 262 + }, + { + "epoch": 0.10144648023143683, + "grad_norm": 0.489757834124081, + "learning_rate": 9.812777081743505e-06, + "loss": 2.4279, + "step": 263 + }, + { + "epoch": 0.10183220829315333, + "grad_norm": 0.44309092173930786, + "learning_rate": 9.811104864495691e-06, + "loss": 2.3007, + "step": 264 + }, + { + "epoch": 0.10221793635486982, + "grad_norm": 0.47803882641272966, + "learning_rate": 9.809425356313769e-06, + "loss": 2.3382, + "step": 265 + }, + { + "epoch": 0.1026036644165863, + "grad_norm": 0.46596798590577343, + "learning_rate": 9.807738559742927e-06, + "loss": 2.3615, + "step": 266 + }, + { + "epoch": 0.1029893924783028, + "grad_norm": 0.4664501923411451, + "learning_rate": 9.806044477339403e-06, + "loss": 2.3507, + "step": 267 + }, + { + "epoch": 0.10337512054001928, + "grad_norm": 0.49023888589795483, + "learning_rate": 9.804343111670472e-06, + "loss": 2.3983, + "step": 268 + }, + { + "epoch": 0.10376084860173578, + "grad_norm": 0.45763647245985856, + "learning_rate": 9.802634465314454e-06, + "loss": 2.3629, + "step": 269 + }, + { + "epoch": 0.10414657666345227, + "grad_norm": 0.4723827861961394, + "learning_rate": 9.800918540860693e-06, + "loss": 2.3405, + "step": 270 + }, + { + "epoch": 0.10453230472516875, + "grad_norm": 0.44017857265774585, + "learning_rate": 9.799195340909569e-06, + "loss": 2.3689, + "step": 271 + }, + { + "epoch": 0.10491803278688525, + "grad_norm": 0.4656051971642151, + "learning_rate": 9.797464868072489e-06, + "loss": 2.3007, + "step": 272 + }, + { + "epoch": 0.10530376084860174, + "grad_norm": 0.47987588123571606, + "learning_rate": 9.795727124971872e-06, + "loss": 2.406, + "step": 273 + }, + { + "epoch": 0.10568948891031822, + "grad_norm": 0.5056657398354122, + "learning_rate": 9.793982114241165e-06, + "loss": 2.3804, + "step": 274 + }, + { + "epoch": 0.10607521697203472, + "grad_norm": 0.47845361621935306, + "learning_rate": 9.792229838524825e-06, + "loss": 2.343, + "step": 275 + }, + { + "epoch": 0.10646094503375121, + "grad_norm": 0.48207101891115595, + "learning_rate": 9.790470300478318e-06, + "loss": 2.3679, + "step": 276 + }, + { + "epoch": 0.10684667309546769, + "grad_norm": 0.44257531338996314, + "learning_rate": 9.788703502768115e-06, + "loss": 2.3744, + "step": 277 + }, + { + "epoch": 0.10723240115718419, + "grad_norm": 0.45301627449323617, + "learning_rate": 9.786929448071688e-06, + "loss": 2.3729, + "step": 278 + }, + { + "epoch": 0.10761812921890067, + "grad_norm": 0.4469254649273637, + "learning_rate": 9.785148139077511e-06, + "loss": 2.3602, + "step": 279 + }, + { + "epoch": 0.10800385728061716, + "grad_norm": 0.4484779739046592, + "learning_rate": 9.783359578485047e-06, + "loss": 2.4075, + "step": 280 + }, + { + "epoch": 0.10838958534233366, + "grad_norm": 0.4636230876300128, + "learning_rate": 9.78156376900475e-06, + "loss": 2.4094, + "step": 281 + }, + { + "epoch": 0.10877531340405014, + "grad_norm": 0.4746029120882621, + "learning_rate": 9.77976071335806e-06, + "loss": 2.3912, + "step": 282 + }, + { + "epoch": 0.10916104146576663, + "grad_norm": 0.4644868421831817, + "learning_rate": 9.777950414277394e-06, + "loss": 2.3936, + "step": 283 + }, + { + "epoch": 0.10954676952748313, + "grad_norm": 0.5069332456975906, + "learning_rate": 
9.776132874506153e-06, + "loss": 2.3874, + "step": 284 + }, + { + "epoch": 0.10993249758919961, + "grad_norm": 0.448790481721612, + "learning_rate": 9.774308096798704e-06, + "loss": 2.4064, + "step": 285 + }, + { + "epoch": 0.1103182256509161, + "grad_norm": 0.5221704977963175, + "learning_rate": 9.772476083920388e-06, + "loss": 2.3179, + "step": 286 + }, + { + "epoch": 0.1107039537126326, + "grad_norm": 0.45028528688105157, + "learning_rate": 9.770636838647505e-06, + "loss": 2.333, + "step": 287 + }, + { + "epoch": 0.11108968177434908, + "grad_norm": 0.46213043493703676, + "learning_rate": 9.768790363767321e-06, + "loss": 2.3342, + "step": 288 + }, + { + "epoch": 0.11147540983606558, + "grad_norm": 0.48597786762721035, + "learning_rate": 9.766936662078056e-06, + "loss": 2.3701, + "step": 289 + }, + { + "epoch": 0.11186113789778207, + "grad_norm": 0.47811060819847545, + "learning_rate": 9.76507573638888e-06, + "loss": 2.407, + "step": 290 + }, + { + "epoch": 0.11224686595949855, + "grad_norm": 0.47693681660455023, + "learning_rate": 9.763207589519909e-06, + "loss": 2.4047, + "step": 291 + }, + { + "epoch": 0.11263259402121505, + "grad_norm": 0.44714431772174534, + "learning_rate": 9.761332224302209e-06, + "loss": 2.3458, + "step": 292 + }, + { + "epoch": 0.11301832208293153, + "grad_norm": 0.4706222187538263, + "learning_rate": 9.759449643577779e-06, + "loss": 2.3536, + "step": 293 + }, + { + "epoch": 0.11340405014464802, + "grad_norm": 0.4822820286599736, + "learning_rate": 9.757559850199554e-06, + "loss": 2.3624, + "step": 294 + }, + { + "epoch": 0.11378977820636452, + "grad_norm": 0.4527420181209796, + "learning_rate": 9.755662847031402e-06, + "loss": 2.3276, + "step": 295 + }, + { + "epoch": 0.114175506268081, + "grad_norm": 0.5009015927230964, + "learning_rate": 9.753758636948112e-06, + "loss": 2.3411, + "step": 296 + }, + { + "epoch": 0.11456123432979749, + "grad_norm": 0.45340300327958766, + "learning_rate": 9.751847222835399e-06, + "loss": 2.3366, + "step": 297 + }, + { + "epoch": 0.11494696239151399, + "grad_norm": 0.4521806041401524, + "learning_rate": 9.749928607589894e-06, + "loss": 2.3661, + "step": 298 + }, + { + "epoch": 0.11533269045323047, + "grad_norm": 0.43386796678166034, + "learning_rate": 9.74800279411914e-06, + "loss": 2.4343, + "step": 299 + }, + { + "epoch": 0.11571841851494696, + "grad_norm": 0.5073054891521527, + "learning_rate": 9.74606978534159e-06, + "loss": 2.3711, + "step": 300 + }, + { + "epoch": 0.11610414657666346, + "grad_norm": 0.48069295086138375, + "learning_rate": 9.744129584186599e-06, + "loss": 2.3452, + "step": 301 + }, + { + "epoch": 0.11648987463837994, + "grad_norm": 0.4983738490461878, + "learning_rate": 9.742182193594424e-06, + "loss": 2.3582, + "step": 302 + }, + { + "epoch": 0.11687560270009643, + "grad_norm": 0.5346958331811935, + "learning_rate": 9.740227616516215e-06, + "loss": 2.4356, + "step": 303 + }, + { + "epoch": 0.11726133076181292, + "grad_norm": 0.4880172814003539, + "learning_rate": 9.738265855914014e-06, + "loss": 2.3423, + "step": 304 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 0.4717538049526971, + "learning_rate": 9.736296914760747e-06, + "loss": 2.3923, + "step": 305 + }, + { + "epoch": 0.1180327868852459, + "grad_norm": 0.4971984961224734, + "learning_rate": 9.734320796040226e-06, + "loss": 2.4046, + "step": 306 + }, + { + "epoch": 0.11841851494696239, + "grad_norm": 0.511356699564154, + "learning_rate": 9.732337502747137e-06, + "loss": 2.3471, + "step": 307 + }, + { + "epoch": 0.11880424300867888, + 
"grad_norm": 0.5462753805792145, + "learning_rate": 9.730347037887041e-06, + "loss": 2.2914, + "step": 308 + }, + { + "epoch": 0.11918997107039538, + "grad_norm": 0.5658159223894998, + "learning_rate": 9.728349404476361e-06, + "loss": 2.3901, + "step": 309 + }, + { + "epoch": 0.11957569913211186, + "grad_norm": 0.48142167178073736, + "learning_rate": 9.726344605542388e-06, + "loss": 2.3021, + "step": 310 + }, + { + "epoch": 0.11996142719382835, + "grad_norm": 0.4797926831647081, + "learning_rate": 9.724332644123278e-06, + "loss": 2.371, + "step": 311 + }, + { + "epoch": 0.12034715525554485, + "grad_norm": 0.4786174999989658, + "learning_rate": 9.722313523268028e-06, + "loss": 2.3725, + "step": 312 + }, + { + "epoch": 0.12073288331726133, + "grad_norm": 0.4438762134466573, + "learning_rate": 9.720287246036495e-06, + "loss": 2.2856, + "step": 313 + }, + { + "epoch": 0.12111861137897782, + "grad_norm": 0.50049840606718, + "learning_rate": 9.718253815499379e-06, + "loss": 2.3902, + "step": 314 + }, + { + "epoch": 0.12150433944069432, + "grad_norm": 0.476365793733286, + "learning_rate": 9.716213234738216e-06, + "loss": 2.3777, + "step": 315 + }, + { + "epoch": 0.1218900675024108, + "grad_norm": 0.4488991949236611, + "learning_rate": 9.714165506845381e-06, + "loss": 2.3435, + "step": 316 + }, + { + "epoch": 0.1222757955641273, + "grad_norm": 0.4587270176274335, + "learning_rate": 9.712110634924083e-06, + "loss": 2.3591, + "step": 317 + }, + { + "epoch": 0.12266152362584377, + "grad_norm": 0.4538941606082488, + "learning_rate": 9.710048622088354e-06, + "loss": 2.237, + "step": 318 + }, + { + "epoch": 0.12304725168756027, + "grad_norm": 0.4704173813002971, + "learning_rate": 9.707979471463045e-06, + "loss": 2.3786, + "step": 319 + }, + { + "epoch": 0.12343297974927676, + "grad_norm": 0.5109877934613372, + "learning_rate": 9.705903186183828e-06, + "loss": 2.3326, + "step": 320 + }, + { + "epoch": 0.12381870781099324, + "grad_norm": 0.46072430320991625, + "learning_rate": 9.703819769397187e-06, + "loss": 2.4276, + "step": 321 + }, + { + "epoch": 0.12420443587270974, + "grad_norm": 0.5050060879426643, + "learning_rate": 9.70172922426041e-06, + "loss": 2.3332, + "step": 322 + }, + { + "epoch": 0.12459016393442623, + "grad_norm": 0.4821117791422475, + "learning_rate": 9.699631553941591e-06, + "loss": 2.364, + "step": 323 + }, + { + "epoch": 0.12497589199614272, + "grad_norm": 0.46553118925707043, + "learning_rate": 9.697526761619621e-06, + "loss": 2.3143, + "step": 324 + }, + { + "epoch": 0.1253616200578592, + "grad_norm": 0.5019839429179425, + "learning_rate": 9.695414850484187e-06, + "loss": 2.3849, + "step": 325 + }, + { + "epoch": 0.1257473481195757, + "grad_norm": 0.4665907597446247, + "learning_rate": 9.693295823735754e-06, + "loss": 2.3864, + "step": 326 + }, + { + "epoch": 0.1261330761812922, + "grad_norm": 0.47054309847305004, + "learning_rate": 9.69116968458558e-06, + "loss": 2.4386, + "step": 327 + }, + { + "epoch": 0.12651880424300868, + "grad_norm": 0.5694549688899382, + "learning_rate": 9.689036436255698e-06, + "loss": 2.3638, + "step": 328 + }, + { + "epoch": 0.12690453230472518, + "grad_norm": 0.4993630278897394, + "learning_rate": 9.686896081978916e-06, + "loss": 2.3018, + "step": 329 + }, + { + "epoch": 0.12729026036644167, + "grad_norm": 0.46646727753640355, + "learning_rate": 9.68474862499881e-06, + "loss": 2.3632, + "step": 330 + }, + { + "epoch": 0.12767598842815814, + "grad_norm": 0.5514409158628053, + "learning_rate": 9.682594068569717e-06, + "loss": 2.3586, + "step": 331 + }, + 
{ + "epoch": 0.12806171648987463, + "grad_norm": 0.5365834653256637, + "learning_rate": 9.680432415956736e-06, + "loss": 2.283, + "step": 332 + }, + { + "epoch": 0.12844744455159113, + "grad_norm": 0.46818705457501164, + "learning_rate": 9.67826367043572e-06, + "loss": 2.3932, + "step": 333 + }, + { + "epoch": 0.12883317261330762, + "grad_norm": 0.4541066736608887, + "learning_rate": 9.676087835293267e-06, + "loss": 2.3231, + "step": 334 + }, + { + "epoch": 0.12921890067502412, + "grad_norm": 0.5282386505767401, + "learning_rate": 9.673904913826723e-06, + "loss": 2.2421, + "step": 335 + }, + { + "epoch": 0.12960462873674058, + "grad_norm": 0.468829947274691, + "learning_rate": 9.671714909344175e-06, + "loss": 2.3366, + "step": 336 + }, + { + "epoch": 0.12999035679845708, + "grad_norm": 0.44396215189823446, + "learning_rate": 9.669517825164435e-06, + "loss": 2.4577, + "step": 337 + }, + { + "epoch": 0.13037608486017357, + "grad_norm": 0.45822960137335506, + "learning_rate": 9.66731366461705e-06, + "loss": 2.3348, + "step": 338 + }, + { + "epoch": 0.13076181292189007, + "grad_norm": 0.49070250854985425, + "learning_rate": 9.665102431042294e-06, + "loss": 2.3269, + "step": 339 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 0.4750510730638908, + "learning_rate": 9.66288412779115e-06, + "loss": 2.3423, + "step": 340 + }, + { + "epoch": 0.13153326904532306, + "grad_norm": 0.4491421521169457, + "learning_rate": 9.660658758225328e-06, + "loss": 2.3735, + "step": 341 + }, + { + "epoch": 0.13191899710703953, + "grad_norm": 0.48144278294663, + "learning_rate": 9.658426325717231e-06, + "loss": 2.3237, + "step": 342 + }, + { + "epoch": 0.13230472516875602, + "grad_norm": 0.4458502716413071, + "learning_rate": 9.656186833649978e-06, + "loss": 2.3359, + "step": 343 + }, + { + "epoch": 0.13269045323047252, + "grad_norm": 0.47471916997622327, + "learning_rate": 9.653940285417381e-06, + "loss": 2.2852, + "step": 344 + }, + { + "epoch": 0.133076181292189, + "grad_norm": 0.44132656234673406, + "learning_rate": 9.651686684423946e-06, + "loss": 2.4675, + "step": 345 + }, + { + "epoch": 0.1334619093539055, + "grad_norm": 0.448204840397715, + "learning_rate": 9.649426034084866e-06, + "loss": 2.4185, + "step": 346 + }, + { + "epoch": 0.13384763741562197, + "grad_norm": 0.4783505363815993, + "learning_rate": 9.64715833782602e-06, + "loss": 2.3576, + "step": 347 + }, + { + "epoch": 0.13423336547733847, + "grad_norm": 0.5392006855126578, + "learning_rate": 9.644883599083959e-06, + "loss": 2.3052, + "step": 348 + }, + { + "epoch": 0.13461909353905496, + "grad_norm": 0.5398502095090069, + "learning_rate": 9.642601821305911e-06, + "loss": 2.3771, + "step": 349 + }, + { + "epoch": 0.13500482160077146, + "grad_norm": 0.4652744324669575, + "learning_rate": 9.640313007949774e-06, + "loss": 2.3416, + "step": 350 + }, + { + "epoch": 0.13539054966248795, + "grad_norm": 0.4874516740981523, + "learning_rate": 9.638017162484099e-06, + "loss": 2.3085, + "step": 351 + }, + { + "epoch": 0.13577627772420445, + "grad_norm": 0.5146225636471995, + "learning_rate": 9.635714288388103e-06, + "loss": 2.3684, + "step": 352 + }, + { + "epoch": 0.13616200578592091, + "grad_norm": 0.5324693443633871, + "learning_rate": 9.633404389151647e-06, + "loss": 2.3293, + "step": 353 + }, + { + "epoch": 0.1365477338476374, + "grad_norm": 0.5005897826018193, + "learning_rate": 9.631087468275242e-06, + "loss": 2.3957, + "step": 354 + }, + { + "epoch": 0.1369334619093539, + "grad_norm": 0.4794002915682946, + "learning_rate": 9.628763529270042e-06, + 
"loss": 2.3003, + "step": 355 + }, + { + "epoch": 0.1373191899710704, + "grad_norm": 0.4519655989539701, + "learning_rate": 9.626432575657834e-06, + "loss": 2.3054, + "step": 356 + }, + { + "epoch": 0.1377049180327869, + "grad_norm": 0.45990015572061077, + "learning_rate": 9.624094610971031e-06, + "loss": 2.3092, + "step": 357 + }, + { + "epoch": 0.1380906460945034, + "grad_norm": 0.4575290225254486, + "learning_rate": 9.621749638752677e-06, + "loss": 2.3802, + "step": 358 + }, + { + "epoch": 0.13847637415621986, + "grad_norm": 0.48239530007607073, + "learning_rate": 9.619397662556434e-06, + "loss": 2.4129, + "step": 359 + }, + { + "epoch": 0.13886210221793635, + "grad_norm": 0.4722242819640516, + "learning_rate": 9.617038685946578e-06, + "loss": 2.3015, + "step": 360 + }, + { + "epoch": 0.13924783027965285, + "grad_norm": 0.4717278397903987, + "learning_rate": 9.614672712497994e-06, + "loss": 2.3601, + "step": 361 + }, + { + "epoch": 0.13963355834136934, + "grad_norm": 0.4629947753534378, + "learning_rate": 9.612299745796166e-06, + "loss": 2.3506, + "step": 362 + }, + { + "epoch": 0.14001928640308584, + "grad_norm": 0.44161467357227746, + "learning_rate": 9.609919789437181e-06, + "loss": 2.3704, + "step": 363 + }, + { + "epoch": 0.1404050144648023, + "grad_norm": 0.48893954585393296, + "learning_rate": 9.60753284702772e-06, + "loss": 2.343, + "step": 364 + }, + { + "epoch": 0.1407907425265188, + "grad_norm": 0.48320802138822283, + "learning_rate": 9.605138922185044e-06, + "loss": 2.2883, + "step": 365 + }, + { + "epoch": 0.1411764705882353, + "grad_norm": 0.46256366347211225, + "learning_rate": 9.602738018536999e-06, + "loss": 2.3546, + "step": 366 + }, + { + "epoch": 0.1415621986499518, + "grad_norm": 0.47369112795205537, + "learning_rate": 9.600330139722009e-06, + "loss": 2.338, + "step": 367 + }, + { + "epoch": 0.14194792671166828, + "grad_norm": 0.44509196448459337, + "learning_rate": 9.597915289389067e-06, + "loss": 2.3689, + "step": 368 + }, + { + "epoch": 0.14233365477338478, + "grad_norm": 0.5206361761868962, + "learning_rate": 9.595493471197728e-06, + "loss": 2.3584, + "step": 369 + }, + { + "epoch": 0.14271938283510124, + "grad_norm": 0.4494183534270254, + "learning_rate": 9.59306468881811e-06, + "loss": 2.3922, + "step": 370 + }, + { + "epoch": 0.14310511089681774, + "grad_norm": 0.46343082392561685, + "learning_rate": 9.590628945930884e-06, + "loss": 2.3408, + "step": 371 + }, + { + "epoch": 0.14349083895853423, + "grad_norm": 0.5078625730082782, + "learning_rate": 9.58818624622727e-06, + "loss": 2.2884, + "step": 372 + }, + { + "epoch": 0.14387656702025073, + "grad_norm": 0.4584681472733906, + "learning_rate": 9.585736593409025e-06, + "loss": 2.3047, + "step": 373 + }, + { + "epoch": 0.14426229508196722, + "grad_norm": 0.4505415778870253, + "learning_rate": 9.583279991188452e-06, + "loss": 2.362, + "step": 374 + }, + { + "epoch": 0.1446480231436837, + "grad_norm": 0.4473492838799735, + "learning_rate": 9.58081644328838e-06, + "loss": 2.4563, + "step": 375 + }, + { + "epoch": 0.14503375120540019, + "grad_norm": 0.4564603087597555, + "learning_rate": 9.578345953442163e-06, + "loss": 2.2288, + "step": 376 + }, + { + "epoch": 0.14541947926711668, + "grad_norm": 0.43177932208894787, + "learning_rate": 9.575868525393678e-06, + "loss": 2.2975, + "step": 377 + }, + { + "epoch": 0.14580520732883318, + "grad_norm": 0.46636470820623344, + "learning_rate": 9.573384162897316e-06, + "loss": 2.4027, + "step": 378 + }, + { + "epoch": 0.14619093539054967, + "grad_norm": 0.4782220814127971, + 
"learning_rate": 9.570892869717973e-06, + "loss": 2.3228, + "step": 379 + }, + { + "epoch": 0.14657666345226616, + "grad_norm": 0.47904333240288893, + "learning_rate": 9.568394649631055e-06, + "loss": 2.345, + "step": 380 + }, + { + "epoch": 0.14696239151398263, + "grad_norm": 0.5044958437706698, + "learning_rate": 9.565889506422457e-06, + "loss": 2.3638, + "step": 381 + }, + { + "epoch": 0.14734811957569913, + "grad_norm": 0.49190167391071266, + "learning_rate": 9.56337744388857e-06, + "loss": 2.3272, + "step": 382 + }, + { + "epoch": 0.14773384763741562, + "grad_norm": 0.44996933951384155, + "learning_rate": 9.560858465836276e-06, + "loss": 2.3653, + "step": 383 + }, + { + "epoch": 0.14811957569913212, + "grad_norm": 0.4587857596998676, + "learning_rate": 9.558332576082925e-06, + "loss": 2.3414, + "step": 384 + }, + { + "epoch": 0.1485053037608486, + "grad_norm": 0.5012824701110264, + "learning_rate": 9.555799778456352e-06, + "loss": 2.3932, + "step": 385 + }, + { + "epoch": 0.14889103182256508, + "grad_norm": 0.5113006222659796, + "learning_rate": 9.553260076794854e-06, + "loss": 2.3353, + "step": 386 + }, + { + "epoch": 0.14927675988428157, + "grad_norm": 0.48958486992708833, + "learning_rate": 9.550713474947195e-06, + "loss": 2.3772, + "step": 387 + }, + { + "epoch": 0.14966248794599807, + "grad_norm": 0.43811850438162614, + "learning_rate": 9.548159976772593e-06, + "loss": 2.3487, + "step": 388 + }, + { + "epoch": 0.15004821600771456, + "grad_norm": 0.46168083383138425, + "learning_rate": 9.545599586140717e-06, + "loss": 2.3849, + "step": 389 + }, + { + "epoch": 0.15043394406943106, + "grad_norm": 0.49256260847551697, + "learning_rate": 9.543032306931683e-06, + "loss": 2.3543, + "step": 390 + }, + { + "epoch": 0.15081967213114755, + "grad_norm": 0.473167833158212, + "learning_rate": 9.540458143036043e-06, + "loss": 2.3816, + "step": 391 + }, + { + "epoch": 0.15120540019286402, + "grad_norm": 0.4742238894783086, + "learning_rate": 9.537877098354787e-06, + "loss": 2.3936, + "step": 392 + }, + { + "epoch": 0.15159112825458051, + "grad_norm": 0.5024024944905685, + "learning_rate": 9.535289176799327e-06, + "loss": 2.3507, + "step": 393 + }, + { + "epoch": 0.151976856316297, + "grad_norm": 0.4856201150929417, + "learning_rate": 9.532694382291502e-06, + "loss": 2.2789, + "step": 394 + }, + { + "epoch": 0.1523625843780135, + "grad_norm": 0.472014061842457, + "learning_rate": 9.530092718763563e-06, + "loss": 2.3341, + "step": 395 + }, + { + "epoch": 0.15274831243973, + "grad_norm": 0.42326984525803946, + "learning_rate": 9.527484190158171e-06, + "loss": 2.3542, + "step": 396 + }, + { + "epoch": 0.15313404050144647, + "grad_norm": 0.49125522387471887, + "learning_rate": 9.52486880042839e-06, + "loss": 2.312, + "step": 397 + }, + { + "epoch": 0.15351976856316296, + "grad_norm": 0.4598836914194489, + "learning_rate": 9.522246553537684e-06, + "loss": 2.3529, + "step": 398 + }, + { + "epoch": 0.15390549662487946, + "grad_norm": 0.4521553323843294, + "learning_rate": 9.51961745345991e-06, + "loss": 2.2969, + "step": 399 + }, + { + "epoch": 0.15429122468659595, + "grad_norm": 0.4796362607034985, + "learning_rate": 9.5169815041793e-06, + "loss": 2.2865, + "step": 400 + }, + { + "epoch": 0.15467695274831245, + "grad_norm": 0.45184163779831893, + "learning_rate": 9.514338709690479e-06, + "loss": 2.3817, + "step": 401 + }, + { + "epoch": 0.15506268081002894, + "grad_norm": 0.45974094003422233, + "learning_rate": 9.51168907399844e-06, + "loss": 2.3112, + "step": 402 + }, + { + "epoch": 
0.1554484088717454, + "grad_norm": 0.4892632061678719, + "learning_rate": 9.509032601118541e-06, + "loss": 2.3575, + "step": 403 + }, + { + "epoch": 0.1558341369334619, + "grad_norm": 0.48317202108320073, + "learning_rate": 9.506369295076505e-06, + "loss": 2.3653, + "step": 404 + }, + { + "epoch": 0.1562198649951784, + "grad_norm": 0.460662253665198, + "learning_rate": 9.50369915990841e-06, + "loss": 2.3577, + "step": 405 + }, + { + "epoch": 0.1566055930568949, + "grad_norm": 0.47046930881397986, + "learning_rate": 9.50102219966068e-06, + "loss": 2.2641, + "step": 406 + }, + { + "epoch": 0.1569913211186114, + "grad_norm": 0.4667940344677699, + "learning_rate": 9.498338418390084e-06, + "loss": 2.3379, + "step": 407 + }, + { + "epoch": 0.15737704918032788, + "grad_norm": 0.4517494208806375, + "learning_rate": 9.495647820163725e-06, + "loss": 2.4353, + "step": 408 + }, + { + "epoch": 0.15776277724204435, + "grad_norm": 0.5185786383663151, + "learning_rate": 9.492950409059046e-06, + "loss": 2.3267, + "step": 409 + }, + { + "epoch": 0.15814850530376084, + "grad_norm": 0.4580602346169873, + "learning_rate": 9.490246189163804e-06, + "loss": 2.347, + "step": 410 + }, + { + "epoch": 0.15853423336547734, + "grad_norm": 0.46822479368086606, + "learning_rate": 9.487535164576078e-06, + "loss": 2.3668, + "step": 411 + }, + { + "epoch": 0.15891996142719383, + "grad_norm": 0.4400590003473538, + "learning_rate": 9.484817339404261e-06, + "loss": 2.3596, + "step": 412 + }, + { + "epoch": 0.15930568948891033, + "grad_norm": 0.5087509111696566, + "learning_rate": 9.482092717767051e-06, + "loss": 2.3396, + "step": 413 + }, + { + "epoch": 0.1596914175506268, + "grad_norm": 0.4679149494631726, + "learning_rate": 9.479361303793441e-06, + "loss": 2.317, + "step": 414 + }, + { + "epoch": 0.1600771456123433, + "grad_norm": 0.5143950303506667, + "learning_rate": 9.476623101622723e-06, + "loss": 2.3757, + "step": 415 + }, + { + "epoch": 0.16046287367405979, + "grad_norm": 0.4899194637705207, + "learning_rate": 9.473878115404477e-06, + "loss": 2.3438, + "step": 416 + }, + { + "epoch": 0.16084860173577628, + "grad_norm": 0.4324465415290763, + "learning_rate": 9.471126349298557e-06, + "loss": 2.3271, + "step": 417 + }, + { + "epoch": 0.16123432979749278, + "grad_norm": 0.4959757578143956, + "learning_rate": 9.468367807475098e-06, + "loss": 2.373, + "step": 418 + }, + { + "epoch": 0.16162005785920927, + "grad_norm": 0.45871196145856946, + "learning_rate": 9.465602494114501e-06, + "loss": 2.3363, + "step": 419 + }, + { + "epoch": 0.16200578592092574, + "grad_norm": 0.44081619492718654, + "learning_rate": 9.462830413407427e-06, + "loss": 2.388, + "step": 420 + }, + { + "epoch": 0.16239151398264223, + "grad_norm": 0.4834495637227687, + "learning_rate": 9.460051569554797e-06, + "loss": 2.3146, + "step": 421 + }, + { + "epoch": 0.16277724204435873, + "grad_norm": 0.4472145307722556, + "learning_rate": 9.457265966767774e-06, + "loss": 2.34, + "step": 422 + }, + { + "epoch": 0.16316297010607522, + "grad_norm": 0.45602302896948904, + "learning_rate": 9.454473609267774e-06, + "loss": 2.4182, + "step": 423 + }, + { + "epoch": 0.16354869816779172, + "grad_norm": 0.4630513057117173, + "learning_rate": 9.451674501286436e-06, + "loss": 2.3345, + "step": 424 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 0.5332933685367667, + "learning_rate": 9.448868647065644e-06, + "loss": 2.3917, + "step": 425 + }, + { + "epoch": 0.16432015429122468, + "grad_norm": 0.44731427171532107, + "learning_rate": 9.44605605085749e-06, + "loss": 
2.3259, + "step": 426 + }, + { + "epoch": 0.16470588235294117, + "grad_norm": 0.45357761505856337, + "learning_rate": 9.443236716924297e-06, + "loss": 2.4065, + "step": 427 + }, + { + "epoch": 0.16509161041465767, + "grad_norm": 0.47800651336465766, + "learning_rate": 9.440410649538592e-06, + "loss": 2.2823, + "step": 428 + }, + { + "epoch": 0.16547733847637416, + "grad_norm": 0.5054964717603495, + "learning_rate": 9.437577852983103e-06, + "loss": 2.4017, + "step": 429 + }, + { + "epoch": 0.16586306653809066, + "grad_norm": 0.43849628461745427, + "learning_rate": 9.434738331550763e-06, + "loss": 2.3686, + "step": 430 + }, + { + "epoch": 0.16624879459980713, + "grad_norm": 0.4487380662190444, + "learning_rate": 9.43189208954469e-06, + "loss": 2.2904, + "step": 431 + }, + { + "epoch": 0.16663452266152362, + "grad_norm": 0.49527336714809694, + "learning_rate": 9.42903913127819e-06, + "loss": 2.3772, + "step": 432 + }, + { + "epoch": 0.16702025072324012, + "grad_norm": 0.43855103001961837, + "learning_rate": 9.426179461074745e-06, + "loss": 2.3655, + "step": 433 + }, + { + "epoch": 0.1674059787849566, + "grad_norm": 0.4441094939078043, + "learning_rate": 9.423313083268013e-06, + "loss": 2.3571, + "step": 434 + }, + { + "epoch": 0.1677917068466731, + "grad_norm": 0.45172643249146127, + "learning_rate": 9.42044000220181e-06, + "loss": 2.3865, + "step": 435 + }, + { + "epoch": 0.16817743490838957, + "grad_norm": 0.45725013139103915, + "learning_rate": 9.417560222230115e-06, + "loss": 2.3225, + "step": 436 + }, + { + "epoch": 0.16856316297010607, + "grad_norm": 0.4495590545821153, + "learning_rate": 9.41467374771706e-06, + "loss": 2.384, + "step": 437 + }, + { + "epoch": 0.16894889103182256, + "grad_norm": 0.4348734627267952, + "learning_rate": 9.411780583036915e-06, + "loss": 2.2426, + "step": 438 + }, + { + "epoch": 0.16933461909353906, + "grad_norm": 0.44935831762246375, + "learning_rate": 9.4088807325741e-06, + "loss": 2.3759, + "step": 439 + }, + { + "epoch": 0.16972034715525555, + "grad_norm": 0.46552305380180337, + "learning_rate": 9.405974200723156e-06, + "loss": 2.3293, + "step": 440 + }, + { + "epoch": 0.17010607521697205, + "grad_norm": 0.45321579183130734, + "learning_rate": 9.403060991888753e-06, + "loss": 2.42, + "step": 441 + }, + { + "epoch": 0.17049180327868851, + "grad_norm": 0.47121176667252546, + "learning_rate": 9.400141110485684e-06, + "loss": 2.3055, + "step": 442 + }, + { + "epoch": 0.170877531340405, + "grad_norm": 0.4628982145702491, + "learning_rate": 9.397214560938845e-06, + "loss": 2.3334, + "step": 443 + }, + { + "epoch": 0.1712632594021215, + "grad_norm": 0.4520120405725651, + "learning_rate": 9.394281347683247e-06, + "loss": 2.3796, + "step": 444 + }, + { + "epoch": 0.171648987463838, + "grad_norm": 0.4614216676717978, + "learning_rate": 9.391341475163992e-06, + "loss": 2.3848, + "step": 445 + }, + { + "epoch": 0.1720347155255545, + "grad_norm": 0.44564394784034217, + "learning_rate": 9.388394947836278e-06, + "loss": 2.3898, + "step": 446 + }, + { + "epoch": 0.17242044358727096, + "grad_norm": 0.47489566440126735, + "learning_rate": 9.385441770165385e-06, + "loss": 2.2923, + "step": 447 + }, + { + "epoch": 0.17280617164898746, + "grad_norm": 0.4749715130955814, + "learning_rate": 9.382481946626673e-06, + "loss": 2.3554, + "step": 448 + }, + { + "epoch": 0.17319189971070395, + "grad_norm": 0.4561615049418411, + "learning_rate": 9.379515481705572e-06, + "loss": 2.2604, + "step": 449 + }, + { + "epoch": 0.17357762777242045, + "grad_norm": 0.45020708135658016, + 
"learning_rate": 9.37654237989758e-06, + "loss": 2.3612, + "step": 450 + }, + { + "epoch": 0.17396335583413694, + "grad_norm": 0.4388756109677834, + "learning_rate": 9.373562645708244e-06, + "loss": 2.3052, + "step": 451 + }, + { + "epoch": 0.17434908389585344, + "grad_norm": 0.47173029790030807, + "learning_rate": 9.370576283653178e-06, + "loss": 2.3531, + "step": 452 + }, + { + "epoch": 0.1747348119575699, + "grad_norm": 0.4551478961521674, + "learning_rate": 9.367583298258022e-06, + "loss": 2.32, + "step": 453 + }, + { + "epoch": 0.1751205400192864, + "grad_norm": 0.5107186970646902, + "learning_rate": 9.364583694058467e-06, + "loss": 2.3383, + "step": 454 + }, + { + "epoch": 0.1755062680810029, + "grad_norm": 0.49295732309731005, + "learning_rate": 9.361577475600225e-06, + "loss": 2.3149, + "step": 455 + }, + { + "epoch": 0.1758919961427194, + "grad_norm": 0.45013631985174496, + "learning_rate": 9.358564647439037e-06, + "loss": 2.3459, + "step": 456 + }, + { + "epoch": 0.17627772420443588, + "grad_norm": 0.4400225469735529, + "learning_rate": 9.355545214140661e-06, + "loss": 2.2976, + "step": 457 + }, + { + "epoch": 0.17666345226615235, + "grad_norm": 0.44432846198913595, + "learning_rate": 9.352519180280862e-06, + "loss": 2.3229, + "step": 458 + }, + { + "epoch": 0.17704918032786884, + "grad_norm": 0.4568845745655981, + "learning_rate": 9.349486550445405e-06, + "loss": 2.3313, + "step": 459 + }, + { + "epoch": 0.17743490838958534, + "grad_norm": 0.4922752298047154, + "learning_rate": 9.34644732923006e-06, + "loss": 2.2953, + "step": 460 + }, + { + "epoch": 0.17782063645130183, + "grad_norm": 0.4478173878901112, + "learning_rate": 9.343401521240576e-06, + "loss": 2.3652, + "step": 461 + }, + { + "epoch": 0.17820636451301833, + "grad_norm": 0.4392171343396795, + "learning_rate": 9.34034913109269e-06, + "loss": 2.3811, + "step": 462 + }, + { + "epoch": 0.17859209257473482, + "grad_norm": 0.43781933893230074, + "learning_rate": 9.337290163412112e-06, + "loss": 2.3914, + "step": 463 + }, + { + "epoch": 0.1789778206364513, + "grad_norm": 0.46351158158141825, + "learning_rate": 9.33422462283452e-06, + "loss": 2.3988, + "step": 464 + }, + { + "epoch": 0.17936354869816779, + "grad_norm": 0.5066065947328378, + "learning_rate": 9.33115251400555e-06, + "loss": 2.3164, + "step": 465 + }, + { + "epoch": 0.17974927675988428, + "grad_norm": 0.4179985733238741, + "learning_rate": 9.328073841580797e-06, + "loss": 2.382, + "step": 466 + }, + { + "epoch": 0.18013500482160077, + "grad_norm": 0.4633632643057099, + "learning_rate": 9.3249886102258e-06, + "loss": 2.4012, + "step": 467 + }, + { + "epoch": 0.18052073288331727, + "grad_norm": 0.43953321607847123, + "learning_rate": 9.321896824616036e-06, + "loss": 2.3646, + "step": 468 + }, + { + "epoch": 0.18090646094503376, + "grad_norm": 0.47262124365709346, + "learning_rate": 9.318798489436917e-06, + "loss": 2.3455, + "step": 469 + }, + { + "epoch": 0.18129218900675023, + "grad_norm": 0.4734091829521179, + "learning_rate": 9.315693609383782e-06, + "loss": 2.4445, + "step": 470 + }, + { + "epoch": 0.18167791706846673, + "grad_norm": 0.5357820111129492, + "learning_rate": 9.312582189161882e-06, + "loss": 2.3196, + "step": 471 + }, + { + "epoch": 0.18206364513018322, + "grad_norm": 0.4531939491222502, + "learning_rate": 9.309464233486386e-06, + "loss": 2.3352, + "step": 472 + }, + { + "epoch": 0.18244937319189972, + "grad_norm": 0.46164565898099374, + "learning_rate": 9.306339747082364e-06, + "loss": 2.3742, + "step": 473 + }, + { + "epoch": 
0.1828351012536162, + "grad_norm": 0.46841595728918883, + "learning_rate": 9.303208734684785e-06, + "loss": 2.3534, + "step": 474 + }, + { + "epoch": 0.18322082931533268, + "grad_norm": 0.47295947245811865, + "learning_rate": 9.300071201038503e-06, + "loss": 2.3432, + "step": 475 + }, + { + "epoch": 0.18360655737704917, + "grad_norm": 0.4343256543024035, + "learning_rate": 9.29692715089826e-06, + "loss": 2.3627, + "step": 476 + }, + { + "epoch": 0.18399228543876567, + "grad_norm": 0.4689889558698709, + "learning_rate": 9.29377658902867e-06, + "loss": 2.3754, + "step": 477 + }, + { + "epoch": 0.18437801350048216, + "grad_norm": 0.47896804482222666, + "learning_rate": 9.290619520204216e-06, + "loss": 2.3702, + "step": 478 + }, + { + "epoch": 0.18476374156219866, + "grad_norm": 0.4482334033726198, + "learning_rate": 9.287455949209243e-06, + "loss": 2.2704, + "step": 479 + }, + { + "epoch": 0.18514946962391515, + "grad_norm": 0.47595219738664213, + "learning_rate": 9.284285880837947e-06, + "loss": 2.3589, + "step": 480 + }, + { + "epoch": 0.18553519768563162, + "grad_norm": 0.5474764319097681, + "learning_rate": 9.281109319894374e-06, + "loss": 2.3885, + "step": 481 + }, + { + "epoch": 0.18592092574734811, + "grad_norm": 0.45726139926742926, + "learning_rate": 9.277926271192405e-06, + "loss": 2.3444, + "step": 482 + }, + { + "epoch": 0.1863066538090646, + "grad_norm": 0.45950159400239354, + "learning_rate": 9.274736739555757e-06, + "loss": 2.3134, + "step": 483 + }, + { + "epoch": 0.1866923818707811, + "grad_norm": 0.4510519833063498, + "learning_rate": 9.271540729817969e-06, + "loss": 2.3257, + "step": 484 + }, + { + "epoch": 0.1870781099324976, + "grad_norm": 0.47307586476649705, + "learning_rate": 9.268338246822395e-06, + "loss": 2.3307, + "step": 485 + }, + { + "epoch": 0.18746383799421407, + "grad_norm": 0.4863114816770395, + "learning_rate": 9.265129295422205e-06, + "loss": 2.3995, + "step": 486 + }, + { + "epoch": 0.18784956605593056, + "grad_norm": 0.4962829228709082, + "learning_rate": 9.261913880480367e-06, + "loss": 2.3765, + "step": 487 + }, + { + "epoch": 0.18823529411764706, + "grad_norm": 0.43479392629654684, + "learning_rate": 9.258692006869644e-06, + "loss": 2.3637, + "step": 488 + }, + { + "epoch": 0.18862102217936355, + "grad_norm": 0.460315396760295, + "learning_rate": 9.255463679472587e-06, + "loss": 2.3175, + "step": 489 + }, + { + "epoch": 0.18900675024108005, + "grad_norm": 0.45600398012066895, + "learning_rate": 9.252228903181529e-06, + "loss": 2.3246, + "step": 490 + }, + { + "epoch": 0.18939247830279654, + "grad_norm": 0.4473255725474842, + "learning_rate": 9.248987682898576e-06, + "loss": 2.3491, + "step": 491 + }, + { + "epoch": 0.189778206364513, + "grad_norm": 0.46269118307905355, + "learning_rate": 9.245740023535596e-06, + "loss": 2.2622, + "step": 492 + }, + { + "epoch": 0.1901639344262295, + "grad_norm": 0.4925059596550583, + "learning_rate": 9.24248593001422e-06, + "loss": 2.2895, + "step": 493 + }, + { + "epoch": 0.190549662487946, + "grad_norm": 0.4773842445251738, + "learning_rate": 9.239225407265824e-06, + "loss": 2.3874, + "step": 494 + }, + { + "epoch": 0.1909353905496625, + "grad_norm": 0.4662931552622624, + "learning_rate": 9.235958460231533e-06, + "loss": 2.298, + "step": 495 + }, + { + "epoch": 0.191321118611379, + "grad_norm": 0.4619381336604932, + "learning_rate": 9.232685093862206e-06, + "loss": 2.381, + "step": 496 + }, + { + "epoch": 0.19170684667309545, + "grad_norm": 0.5298956071333619, + "learning_rate": 9.229405313118423e-06, + "loss": 
2.3675, + "step": 497 + }, + { + "epoch": 0.19209257473481195, + "grad_norm": 0.4743693788571214, + "learning_rate": 9.226119122970495e-06, + "loss": 2.3474, + "step": 498 + }, + { + "epoch": 0.19247830279652844, + "grad_norm": 0.5161243598264611, + "learning_rate": 9.22282652839844e-06, + "loss": 2.3182, + "step": 499 + }, + { + "epoch": 0.19286403085824494, + "grad_norm": 0.46825367215183233, + "learning_rate": 9.219527534391983e-06, + "loss": 2.3798, + "step": 500 + }, + { + "epoch": 0.19324975891996143, + "grad_norm": 0.49111756652037414, + "learning_rate": 9.216222145950548e-06, + "loss": 2.3427, + "step": 501 + }, + { + "epoch": 0.19363548698167793, + "grad_norm": 0.4372455755844022, + "learning_rate": 9.212910368083246e-06, + "loss": 2.352, + "step": 502 + }, + { + "epoch": 0.1940212150433944, + "grad_norm": 0.444514189504708, + "learning_rate": 9.209592205808874e-06, + "loss": 2.314, + "step": 503 + }, + { + "epoch": 0.1944069431051109, + "grad_norm": 0.4494673362799293, + "learning_rate": 9.206267664155906e-06, + "loss": 2.3621, + "step": 504 + }, + { + "epoch": 0.19479267116682739, + "grad_norm": 0.4854587757083868, + "learning_rate": 9.202936748162479e-06, + "loss": 2.3142, + "step": 505 + }, + { + "epoch": 0.19517839922854388, + "grad_norm": 0.42915591124069197, + "learning_rate": 9.19959946287639e-06, + "loss": 2.3916, + "step": 506 + }, + { + "epoch": 0.19556412729026038, + "grad_norm": 0.4490181754160567, + "learning_rate": 9.19625581335509e-06, + "loss": 2.3163, + "step": 507 + }, + { + "epoch": 0.19594985535197684, + "grad_norm": 0.4415330391865188, + "learning_rate": 9.192905804665677e-06, + "loss": 2.3438, + "step": 508 + }, + { + "epoch": 0.19633558341369334, + "grad_norm": 0.4649406985335145, + "learning_rate": 9.189549441884883e-06, + "loss": 2.3441, + "step": 509 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 0.4619548519344354, + "learning_rate": 9.18618673009907e-06, + "loss": 2.3707, + "step": 510 + }, + { + "epoch": 0.19710703953712633, + "grad_norm": 0.4378739732903529, + "learning_rate": 9.182817674404218e-06, + "loss": 2.2881, + "step": 511 + }, + { + "epoch": 0.19749276759884282, + "grad_norm": 0.43622883307093985, + "learning_rate": 9.179442279905927e-06, + "loss": 2.3662, + "step": 512 + }, + { + "epoch": 0.19787849566055932, + "grad_norm": 0.41856212113307684, + "learning_rate": 9.176060551719402e-06, + "loss": 2.4293, + "step": 513 + }, + { + "epoch": 0.19826422372227578, + "grad_norm": 0.45985586879947066, + "learning_rate": 9.17267249496944e-06, + "loss": 2.3028, + "step": 514 + }, + { + "epoch": 0.19864995178399228, + "grad_norm": 0.47187581156802855, + "learning_rate": 9.169278114790437e-06, + "loss": 2.3083, + "step": 515 + }, + { + "epoch": 0.19903567984570877, + "grad_norm": 0.46813467893089566, + "learning_rate": 9.165877416326365e-06, + "loss": 2.4425, + "step": 516 + }, + { + "epoch": 0.19942140790742527, + "grad_norm": 0.430401839388525, + "learning_rate": 9.162470404730776e-06, + "loss": 2.3625, + "step": 517 + }, + { + "epoch": 0.19980713596914176, + "grad_norm": 0.45099082208073454, + "learning_rate": 9.159057085166785e-06, + "loss": 2.3849, + "step": 518 + }, + { + "epoch": 0.20019286403085826, + "grad_norm": 0.45837731641693436, + "learning_rate": 9.15563746280707e-06, + "loss": 2.3482, + "step": 519 + }, + { + "epoch": 0.20057859209257473, + "grad_norm": 0.4453193647370109, + "learning_rate": 9.152211542833856e-06, + "loss": 2.3383, + "step": 520 + }, + { + "epoch": 0.20096432015429122, + "grad_norm": 0.5345378341413923, + 
"learning_rate": 9.148779330438919e-06, + "loss": 2.3119, + "step": 521 + }, + { + "epoch": 0.20135004821600772, + "grad_norm": 0.48502988720256984, + "learning_rate": 9.145340830823562e-06, + "loss": 2.2941, + "step": 522 + }, + { + "epoch": 0.2017357762777242, + "grad_norm": 0.45826176855332257, + "learning_rate": 9.141896049198622e-06, + "loss": 2.3359, + "step": 523 + }, + { + "epoch": 0.2021215043394407, + "grad_norm": 0.458602324657938, + "learning_rate": 9.138444990784455e-06, + "loss": 2.2403, + "step": 524 + }, + { + "epoch": 0.20250723240115717, + "grad_norm": 0.4370961113251191, + "learning_rate": 9.134987660810925e-06, + "loss": 2.3572, + "step": 525 + }, + { + "epoch": 0.20289296046287367, + "grad_norm": 0.4644845727347313, + "learning_rate": 9.131524064517405e-06, + "loss": 2.3709, + "step": 526 + }, + { + "epoch": 0.20327868852459016, + "grad_norm": 0.4649486219929489, + "learning_rate": 9.128054207152765e-06, + "loss": 2.3325, + "step": 527 + }, + { + "epoch": 0.20366441658630666, + "grad_norm": 0.4215258238013722, + "learning_rate": 9.124578093975358e-06, + "loss": 2.2895, + "step": 528 + }, + { + "epoch": 0.20405014464802315, + "grad_norm": 0.44795826208091805, + "learning_rate": 9.12109573025302e-06, + "loss": 2.422, + "step": 529 + }, + { + "epoch": 0.20443587270973965, + "grad_norm": 0.471764689731237, + "learning_rate": 9.117607121263063e-06, + "loss": 2.288, + "step": 530 + }, + { + "epoch": 0.2048216007714561, + "grad_norm": 0.45638319685158746, + "learning_rate": 9.114112272292255e-06, + "loss": 2.3147, + "step": 531 + }, + { + "epoch": 0.2052073288331726, + "grad_norm": 0.4922252882046052, + "learning_rate": 9.110611188636828e-06, + "loss": 2.2945, + "step": 532 + }, + { + "epoch": 0.2055930568948891, + "grad_norm": 0.4522653884951574, + "learning_rate": 9.107103875602458e-06, + "loss": 2.3377, + "step": 533 + }, + { + "epoch": 0.2059787849566056, + "grad_norm": 0.41728328170335205, + "learning_rate": 9.103590338504264e-06, + "loss": 2.3368, + "step": 534 + }, + { + "epoch": 0.2063645130183221, + "grad_norm": 0.46950092641132146, + "learning_rate": 9.100070582666796e-06, + "loss": 2.3087, + "step": 535 + }, + { + "epoch": 0.20675024108003856, + "grad_norm": 0.4641571541323398, + "learning_rate": 9.096544613424026e-06, + "loss": 2.3273, + "step": 536 + }, + { + "epoch": 0.20713596914175506, + "grad_norm": 0.449176581685245, + "learning_rate": 9.093012436119345e-06, + "loss": 2.2831, + "step": 537 + }, + { + "epoch": 0.20752169720347155, + "grad_norm": 0.44605553704842194, + "learning_rate": 9.089474056105552e-06, + "loss": 2.3976, + "step": 538 + }, + { + "epoch": 0.20790742526518805, + "grad_norm": 0.47316784820593766, + "learning_rate": 9.085929478744841e-06, + "loss": 2.3183, + "step": 539 + }, + { + "epoch": 0.20829315332690454, + "grad_norm": 0.46460066029125063, + "learning_rate": 9.082378709408805e-06, + "loss": 2.3535, + "step": 540 + }, + { + "epoch": 0.20867888138862103, + "grad_norm": 0.4811002373146888, + "learning_rate": 9.078821753478417e-06, + "loss": 2.3269, + "step": 541 + }, + { + "epoch": 0.2090646094503375, + "grad_norm": 0.49989065114093767, + "learning_rate": 9.075258616344025e-06, + "loss": 2.4009, + "step": 542 + }, + { + "epoch": 0.209450337512054, + "grad_norm": 0.4984731432556212, + "learning_rate": 9.071689303405343e-06, + "loss": 2.3227, + "step": 543 + }, + { + "epoch": 0.2098360655737705, + "grad_norm": 0.49437252214019445, + "learning_rate": 9.068113820071447e-06, + "loss": 2.3462, + "step": 544 + }, + { + "epoch": 0.210221793635487, 
+ "grad_norm": 0.4515422763419102, + "learning_rate": 9.064532171760762e-06, + "loss": 2.4018, + "step": 545 + }, + { + "epoch": 0.21060752169720348, + "grad_norm": 0.4713555212878107, + "learning_rate": 9.060944363901057e-06, + "loss": 2.327, + "step": 546 + }, + { + "epoch": 0.21099324975891995, + "grad_norm": 0.4506689586670132, + "learning_rate": 9.057350401929433e-06, + "loss": 2.3214, + "step": 547 + }, + { + "epoch": 0.21137897782063644, + "grad_norm": 0.43907614616137236, + "learning_rate": 9.053750291292321e-06, + "loss": 2.323, + "step": 548 + }, + { + "epoch": 0.21176470588235294, + "grad_norm": 0.5266396975771747, + "learning_rate": 9.050144037445465e-06, + "loss": 2.2813, + "step": 549 + }, + { + "epoch": 0.21215043394406943, + "grad_norm": 0.4664813996758015, + "learning_rate": 9.046531645853924e-06, + "loss": 2.3815, + "step": 550 + }, + { + "epoch": 0.21253616200578593, + "grad_norm": 0.4201578229894165, + "learning_rate": 9.042913121992053e-06, + "loss": 2.3655, + "step": 551 + }, + { + "epoch": 0.21292189006750242, + "grad_norm": 0.4435148066995547, + "learning_rate": 9.039288471343505e-06, + "loss": 2.3591, + "step": 552 + }, + { + "epoch": 0.2133076181292189, + "grad_norm": 0.47388724861237486, + "learning_rate": 9.035657699401215e-06, + "loss": 2.3069, + "step": 553 + }, + { + "epoch": 0.21369334619093538, + "grad_norm": 0.4435532778834711, + "learning_rate": 9.032020811667395e-06, + "loss": 2.3913, + "step": 554 + }, + { + "epoch": 0.21407907425265188, + "grad_norm": 0.46191975386537937, + "learning_rate": 9.028377813653525e-06, + "loss": 2.3414, + "step": 555 + }, + { + "epoch": 0.21446480231436837, + "grad_norm": 0.4808909866476146, + "learning_rate": 9.024728710880345e-06, + "loss": 2.3831, + "step": 556 + }, + { + "epoch": 0.21485053037608487, + "grad_norm": 0.4586450639319388, + "learning_rate": 9.021073508877845e-06, + "loss": 2.3432, + "step": 557 + }, + { + "epoch": 0.21523625843780134, + "grad_norm": 0.42942847956441976, + "learning_rate": 9.017412213185261e-06, + "loss": 2.3106, + "step": 558 + }, + { + "epoch": 0.21562198649951783, + "grad_norm": 0.4130686749756181, + "learning_rate": 9.013744829351063e-06, + "loss": 2.3833, + "step": 559 + }, + { + "epoch": 0.21600771456123433, + "grad_norm": 0.45046846934747004, + "learning_rate": 9.010071362932945e-06, + "loss": 2.3834, + "step": 560 + }, + { + "epoch": 0.21639344262295082, + "grad_norm": 0.4973014254267896, + "learning_rate": 9.006391819497815e-06, + "loss": 2.3622, + "step": 561 + }, + { + "epoch": 0.21677917068466732, + "grad_norm": 0.448815959873419, + "learning_rate": 9.002706204621802e-06, + "loss": 2.2579, + "step": 562 + }, + { + "epoch": 0.2171648987463838, + "grad_norm": 0.48175635182232734, + "learning_rate": 8.999014523890228e-06, + "loss": 2.371, + "step": 563 + }, + { + "epoch": 0.21755062680810028, + "grad_norm": 0.44560435917971575, + "learning_rate": 8.995316782897605e-06, + "loss": 2.3395, + "step": 564 + }, + { + "epoch": 0.21793635486981677, + "grad_norm": 0.4655634371173414, + "learning_rate": 8.991612987247635e-06, + "loss": 2.3734, + "step": 565 + }, + { + "epoch": 0.21832208293153327, + "grad_norm": 0.40798212196098665, + "learning_rate": 8.987903142553194e-06, + "loss": 2.2733, + "step": 566 + }, + { + "epoch": 0.21870781099324976, + "grad_norm": 0.46181147562124997, + "learning_rate": 8.984187254436321e-06, + "loss": 2.357, + "step": 567 + }, + { + "epoch": 0.21909353905496626, + "grad_norm": 0.460719819698626, + "learning_rate": 8.98046532852822e-06, + "loss": 2.378, + "step": 
568 + }, + { + "epoch": 0.21947926711668275, + "grad_norm": 0.4549844145982879, + "learning_rate": 8.976737370469237e-06, + "loss": 2.3645, + "step": 569 + }, + { + "epoch": 0.21986499517839922, + "grad_norm": 0.4472909074531031, + "learning_rate": 8.973003385908867e-06, + "loss": 2.343, + "step": 570 + }, + { + "epoch": 0.22025072324011571, + "grad_norm": 0.49221278008360814, + "learning_rate": 8.969263380505732e-06, + "loss": 2.3563, + "step": 571 + }, + { + "epoch": 0.2206364513018322, + "grad_norm": 0.4921528083374167, + "learning_rate": 8.965517359927583e-06, + "loss": 2.3788, + "step": 572 + }, + { + "epoch": 0.2210221793635487, + "grad_norm": 0.43757484012513737, + "learning_rate": 8.961765329851284e-06, + "loss": 2.2875, + "step": 573 + }, + { + "epoch": 0.2214079074252652, + "grad_norm": 0.4839031556215558, + "learning_rate": 8.958007295962802e-06, + "loss": 2.3777, + "step": 574 + }, + { + "epoch": 0.22179363548698167, + "grad_norm": 0.4596815016724665, + "learning_rate": 8.954243263957214e-06, + "loss": 2.2989, + "step": 575 + }, + { + "epoch": 0.22217936354869816, + "grad_norm": 0.43764815076903374, + "learning_rate": 8.950473239538672e-06, + "loss": 2.2998, + "step": 576 + }, + { + "epoch": 0.22256509161041466, + "grad_norm": 0.4579255766738418, + "learning_rate": 8.946697228420422e-06, + "loss": 2.3002, + "step": 577 + }, + { + "epoch": 0.22295081967213115, + "grad_norm": 0.4588483179120956, + "learning_rate": 8.942915236324775e-06, + "loss": 2.3287, + "step": 578 + }, + { + "epoch": 0.22333654773384765, + "grad_norm": 0.46612698165947597, + "learning_rate": 8.93912726898311e-06, + "loss": 2.3349, + "step": 579 + }, + { + "epoch": 0.22372227579556414, + "grad_norm": 0.457613146049226, + "learning_rate": 8.935333332135853e-06, + "loss": 2.3414, + "step": 580 + }, + { + "epoch": 0.2241080038572806, + "grad_norm": 0.44860534787088, + "learning_rate": 8.93153343153249e-06, + "loss": 2.3232, + "step": 581 + }, + { + "epoch": 0.2244937319189971, + "grad_norm": 0.4937219548706247, + "learning_rate": 8.927727572931532e-06, + "loss": 2.3366, + "step": 582 + }, + { + "epoch": 0.2248794599807136, + "grad_norm": 0.4932265294129161, + "learning_rate": 8.923915762100525e-06, + "loss": 2.3102, + "step": 583 + }, + { + "epoch": 0.2252651880424301, + "grad_norm": 0.44934181403039386, + "learning_rate": 8.920098004816035e-06, + "loss": 2.3293, + "step": 584 + }, + { + "epoch": 0.2256509161041466, + "grad_norm": 0.45126599523522487, + "learning_rate": 8.916274306863642e-06, + "loss": 2.3543, + "step": 585 + }, + { + "epoch": 0.22603664416586305, + "grad_norm": 0.5065377259132144, + "learning_rate": 8.91244467403792e-06, + "loss": 2.3546, + "step": 586 + }, + { + "epoch": 0.22642237222757955, + "grad_norm": 0.4778584518441631, + "learning_rate": 8.908609112142444e-06, + "loss": 2.3189, + "step": 587 + }, + { + "epoch": 0.22680810028929604, + "grad_norm": 0.4822383520476654, + "learning_rate": 8.904767626989774e-06, + "loss": 2.3384, + "step": 588 + }, + { + "epoch": 0.22719382835101254, + "grad_norm": 0.4856551848734807, + "learning_rate": 8.900920224401446e-06, + "loss": 2.3957, + "step": 589 + }, + { + "epoch": 0.22757955641272903, + "grad_norm": 0.4153806184547439, + "learning_rate": 8.897066910207958e-06, + "loss": 2.307, + "step": 590 + }, + { + "epoch": 0.22796528447444553, + "grad_norm": 0.46317031367156986, + "learning_rate": 8.893207690248776e-06, + "loss": 2.3562, + "step": 591 + }, + { + "epoch": 0.228351012536162, + "grad_norm": 0.48489527142084543, + "learning_rate": 
8.88934257037231e-06, + "loss": 2.3807, + "step": 592 + }, + { + "epoch": 0.2287367405978785, + "grad_norm": 0.4528141570294088, + "learning_rate": 8.88547155643591e-06, + "loss": 2.2668, + "step": 593 + }, + { + "epoch": 0.22912246865959499, + "grad_norm": 0.4251865157625066, + "learning_rate": 8.88159465430586e-06, + "loss": 2.2951, + "step": 594 + }, + { + "epoch": 0.22950819672131148, + "grad_norm": 0.46093546704278265, + "learning_rate": 8.877711869857368e-06, + "loss": 2.3992, + "step": 595 + }, + { + "epoch": 0.22989392478302798, + "grad_norm": 0.4433096544674703, + "learning_rate": 8.873823208974557e-06, + "loss": 2.3377, + "step": 596 + }, + { + "epoch": 0.23027965284474444, + "grad_norm": 0.46530553084201687, + "learning_rate": 8.869928677550453e-06, + "loss": 2.3484, + "step": 597 + }, + { + "epoch": 0.23066538090646094, + "grad_norm": 0.46108298845071083, + "learning_rate": 8.866028281486978e-06, + "loss": 2.3731, + "step": 598 + }, + { + "epoch": 0.23105110896817743, + "grad_norm": 0.43532198423080554, + "learning_rate": 8.862122026694944e-06, + "loss": 2.3332, + "step": 599 + }, + { + "epoch": 0.23143683702989393, + "grad_norm": 0.44454741853143076, + "learning_rate": 8.85820991909404e-06, + "loss": 2.2882, + "step": 600 + }, + { + "epoch": 0.23182256509161042, + "grad_norm": 0.46404386426637784, + "learning_rate": 8.854291964612824e-06, + "loss": 2.3363, + "step": 601 + }, + { + "epoch": 0.23220829315332692, + "grad_norm": 0.4398707058122081, + "learning_rate": 8.850368169188717e-06, + "loss": 2.3552, + "step": 602 + }, + { + "epoch": 0.23259402121504338, + "grad_norm": 0.45305515998890616, + "learning_rate": 8.84643853876799e-06, + "loss": 2.3186, + "step": 603 + }, + { + "epoch": 0.23297974927675988, + "grad_norm": 0.4821684943054939, + "learning_rate": 8.842503079305757e-06, + "loss": 2.3453, + "step": 604 + }, + { + "epoch": 0.23336547733847637, + "grad_norm": 0.4470144221350872, + "learning_rate": 8.838561796765964e-06, + "loss": 2.3314, + "step": 605 + }, + { + "epoch": 0.23375120540019287, + "grad_norm": 0.4798478803514554, + "learning_rate": 8.834614697121384e-06, + "loss": 2.299, + "step": 606 + }, + { + "epoch": 0.23413693346190936, + "grad_norm": 0.45620806995059565, + "learning_rate": 8.830661786353602e-06, + "loss": 2.3667, + "step": 607 + }, + { + "epoch": 0.23452266152362583, + "grad_norm": 0.4363843604095868, + "learning_rate": 8.826703070453014e-06, + "loss": 2.3131, + "step": 608 + }, + { + "epoch": 0.23490838958534233, + "grad_norm": 0.41484049221741226, + "learning_rate": 8.82273855541881e-06, + "loss": 2.3871, + "step": 609 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 0.43654986338390206, + "learning_rate": 8.81876824725897e-06, + "loss": 2.4467, + "step": 610 + }, + { + "epoch": 0.23567984570877532, + "grad_norm": 0.43244037227389426, + "learning_rate": 8.814792151990253e-06, + "loss": 2.3314, + "step": 611 + }, + { + "epoch": 0.2360655737704918, + "grad_norm": 0.4471834289259313, + "learning_rate": 8.810810275638183e-06, + "loss": 2.2777, + "step": 612 + }, + { + "epoch": 0.2364513018322083, + "grad_norm": 0.4610781584149346, + "learning_rate": 8.806822624237055e-06, + "loss": 2.2824, + "step": 613 + }, + { + "epoch": 0.23683702989392477, + "grad_norm": 0.5055359126910268, + "learning_rate": 8.802829203829904e-06, + "loss": 2.3409, + "step": 614 + }, + { + "epoch": 0.23722275795564127, + "grad_norm": 0.431466867002683, + "learning_rate": 8.798830020468519e-06, + "loss": 2.3704, + "step": 615 + }, + { + "epoch": 0.23760848601735776, + 
"grad_norm": 0.41289024443297107, + "learning_rate": 8.794825080213415e-06, + "loss": 2.2887, + "step": 616 + }, + { + "epoch": 0.23799421407907426, + "grad_norm": 0.42480354215245325, + "learning_rate": 8.79081438913383e-06, + "loss": 2.2875, + "step": 617 + }, + { + "epoch": 0.23837994214079075, + "grad_norm": 0.4368624737440716, + "learning_rate": 8.786797953307725e-06, + "loss": 2.3376, + "step": 618 + }, + { + "epoch": 0.23876567020250722, + "grad_norm": 0.4443608049799916, + "learning_rate": 8.78277577882176e-06, + "loss": 2.2969, + "step": 619 + }, + { + "epoch": 0.2391513982642237, + "grad_norm": 0.45500903232533707, + "learning_rate": 8.778747871771293e-06, + "loss": 2.3506, + "step": 620 + }, + { + "epoch": 0.2395371263259402, + "grad_norm": 0.4491929565912108, + "learning_rate": 8.774714238260369e-06, + "loss": 2.4093, + "step": 621 + }, + { + "epoch": 0.2399228543876567, + "grad_norm": 0.4826105022745694, + "learning_rate": 8.770674884401714e-06, + "loss": 2.3438, + "step": 622 + }, + { + "epoch": 0.2403085824493732, + "grad_norm": 0.4541889182729078, + "learning_rate": 8.766629816316722e-06, + "loss": 2.2042, + "step": 623 + }, + { + "epoch": 0.2406943105110897, + "grad_norm": 0.4576938178983255, + "learning_rate": 8.76257904013544e-06, + "loss": 2.3415, + "step": 624 + }, + { + "epoch": 0.24108003857280616, + "grad_norm": 0.4234180305215837, + "learning_rate": 8.758522561996577e-06, + "loss": 2.3312, + "step": 625 + }, + { + "epoch": 0.24146576663452265, + "grad_norm": 0.44449556958449665, + "learning_rate": 8.754460388047472e-06, + "loss": 2.381, + "step": 626 + }, + { + "epoch": 0.24185149469623915, + "grad_norm": 0.4432541863472157, + "learning_rate": 8.750392524444102e-06, + "loss": 2.3367, + "step": 627 + }, + { + "epoch": 0.24223722275795564, + "grad_norm": 0.44148252274322086, + "learning_rate": 8.746318977351066e-06, + "loss": 2.3565, + "step": 628 + }, + { + "epoch": 0.24262295081967214, + "grad_norm": 0.5134089752365603, + "learning_rate": 8.742239752941572e-06, + "loss": 2.303, + "step": 629 + }, + { + "epoch": 0.24300867888138863, + "grad_norm": 0.4443950399081667, + "learning_rate": 8.738154857397437e-06, + "loss": 2.3333, + "step": 630 + }, + { + "epoch": 0.2433944069431051, + "grad_norm": 0.42103159116790523, + "learning_rate": 8.734064296909066e-06, + "loss": 2.3859, + "step": 631 + }, + { + "epoch": 0.2437801350048216, + "grad_norm": 0.42843587691048385, + "learning_rate": 8.729968077675454e-06, + "loss": 2.3206, + "step": 632 + }, + { + "epoch": 0.2441658630665381, + "grad_norm": 0.4830071126848421, + "learning_rate": 8.725866205904173e-06, + "loss": 2.3747, + "step": 633 + }, + { + "epoch": 0.2445515911282546, + "grad_norm": 0.49859578340636046, + "learning_rate": 8.721758687811353e-06, + "loss": 2.388, + "step": 634 + }, + { + "epoch": 0.24493731918997108, + "grad_norm": 0.4439955166387394, + "learning_rate": 8.717645529621686e-06, + "loss": 2.3674, + "step": 635 + }, + { + "epoch": 0.24532304725168755, + "grad_norm": 0.4229037183311655, + "learning_rate": 8.713526737568415e-06, + "loss": 2.3056, + "step": 636 + }, + { + "epoch": 0.24570877531340404, + "grad_norm": 0.4383644039158399, + "learning_rate": 8.709402317893312e-06, + "loss": 2.3957, + "step": 637 + }, + { + "epoch": 0.24609450337512054, + "grad_norm": 0.42455327869988235, + "learning_rate": 8.705272276846684e-06, + "loss": 2.3569, + "step": 638 + }, + { + "epoch": 0.24648023143683703, + "grad_norm": 0.45921277553192413, + "learning_rate": 8.701136620687355e-06, + "loss": 2.3386, + "step": 639 + 
}, + { + "epoch": 0.24686595949855353, + "grad_norm": 0.43121730879012, + "learning_rate": 8.696995355682656e-06, + "loss": 2.3687, + "step": 640 + }, + { + "epoch": 0.24725168756027002, + "grad_norm": 0.4462369019493956, + "learning_rate": 8.692848488108423e-06, + "loss": 2.3777, + "step": 641 + }, + { + "epoch": 0.2476374156219865, + "grad_norm": 0.43436360192004747, + "learning_rate": 8.688696024248977e-06, + "loss": 2.3914, + "step": 642 + }, + { + "epoch": 0.24802314368370298, + "grad_norm": 0.4137768504976057, + "learning_rate": 8.684537970397122e-06, + "loss": 2.4054, + "step": 643 + }, + { + "epoch": 0.24840887174541948, + "grad_norm": 0.4415096763888693, + "learning_rate": 8.680374332854134e-06, + "loss": 2.3903, + "step": 644 + }, + { + "epoch": 0.24879459980713597, + "grad_norm": 0.4556463786464995, + "learning_rate": 8.676205117929752e-06, + "loss": 2.3369, + "step": 645 + }, + { + "epoch": 0.24918032786885247, + "grad_norm": 0.4182380195172059, + "learning_rate": 8.672030331942163e-06, + "loss": 2.3101, + "step": 646 + }, + { + "epoch": 0.24956605593056894, + "grad_norm": 0.43948879831910515, + "learning_rate": 8.667849981217997e-06, + "loss": 2.3495, + "step": 647 + }, + { + "epoch": 0.24995178399228543, + "grad_norm": 0.431206271755024, + "learning_rate": 8.663664072092324e-06, + "loss": 2.3322, + "step": 648 + }, + { + "epoch": 0.25033751205400195, + "grad_norm": 0.4628389056590261, + "learning_rate": 8.659472610908628e-06, + "loss": 2.2407, + "step": 649 + }, + { + "epoch": 0.2507232401157184, + "grad_norm": 0.42026399292897054, + "learning_rate": 8.655275604018813e-06, + "loss": 2.3461, + "step": 650 + }, + { + "epoch": 0.2511089681774349, + "grad_norm": 0.46049227256605096, + "learning_rate": 8.651073057783185e-06, + "loss": 2.2959, + "step": 651 + }, + { + "epoch": 0.2514946962391514, + "grad_norm": 0.44202962984695643, + "learning_rate": 8.646864978570445e-06, + "loss": 2.3031, + "step": 652 + }, + { + "epoch": 0.2518804243008679, + "grad_norm": 0.4360453070528761, + "learning_rate": 8.64265137275768e-06, + "loss": 2.3151, + "step": 653 + }, + { + "epoch": 0.2522661523625844, + "grad_norm": 0.444762011719982, + "learning_rate": 8.638432246730351e-06, + "loss": 2.2721, + "step": 654 + }, + { + "epoch": 0.25265188042430087, + "grad_norm": 0.4147806152584227, + "learning_rate": 8.634207606882282e-06, + "loss": 2.2823, + "step": 655 + }, + { + "epoch": 0.25303760848601736, + "grad_norm": 0.4529975286905433, + "learning_rate": 8.629977459615655e-06, + "loss": 2.3199, + "step": 656 + }, + { + "epoch": 0.25342333654773386, + "grad_norm": 0.44515481137345064, + "learning_rate": 8.625741811341001e-06, + "loss": 2.328, + "step": 657 + }, + { + "epoch": 0.25380906460945035, + "grad_norm": 0.44598812762748385, + "learning_rate": 8.621500668477184e-06, + "loss": 2.3782, + "step": 658 + }, + { + "epoch": 0.25419479267116685, + "grad_norm": 0.45731308991486175, + "learning_rate": 8.617254037451396e-06, + "loss": 2.3809, + "step": 659 + }, + { + "epoch": 0.25458052073288334, + "grad_norm": 0.45291014192182655, + "learning_rate": 8.613001924699146e-06, + "loss": 2.3475, + "step": 660 + }, + { + "epoch": 0.2549662487945998, + "grad_norm": 0.4594502645503529, + "learning_rate": 8.60874433666425e-06, + "loss": 2.2756, + "step": 661 + }, + { + "epoch": 0.2553519768563163, + "grad_norm": 0.45670632233299724, + "learning_rate": 8.60448127979882e-06, + "loss": 2.3647, + "step": 662 + }, + { + "epoch": 0.25573770491803277, + "grad_norm": 0.44423047286891265, + "learning_rate": 
8.600212760563257e-06, + "loss": 2.3758, + "step": 663 + }, + { + "epoch": 0.25612343297974927, + "grad_norm": 0.42893572139376235, + "learning_rate": 8.595938785426241e-06, + "loss": 2.3226, + "step": 664 + }, + { + "epoch": 0.25650916104146576, + "grad_norm": 0.47474284251148635, + "learning_rate": 8.591659360864718e-06, + "loss": 2.3082, + "step": 665 + }, + { + "epoch": 0.25689488910318226, + "grad_norm": 0.47126129734527794, + "learning_rate": 8.587374493363895e-06, + "loss": 2.3365, + "step": 666 + }, + { + "epoch": 0.25728061716489875, + "grad_norm": 0.4788897233983171, + "learning_rate": 8.583084189417225e-06, + "loss": 2.4082, + "step": 667 + }, + { + "epoch": 0.25766634522661525, + "grad_norm": 0.44412197119577956, + "learning_rate": 8.578788455526398e-06, + "loss": 2.3528, + "step": 668 + }, + { + "epoch": 0.25805207328833174, + "grad_norm": 0.4333346854250443, + "learning_rate": 8.574487298201337e-06, + "loss": 2.3591, + "step": 669 + }, + { + "epoch": 0.25843780135004824, + "grad_norm": 0.45321598100218463, + "learning_rate": 8.570180723960181e-06, + "loss": 2.3196, + "step": 670 + }, + { + "epoch": 0.25882352941176473, + "grad_norm": 0.4720265187911564, + "learning_rate": 8.565868739329282e-06, + "loss": 2.2847, + "step": 671 + }, + { + "epoch": 0.25920925747348117, + "grad_norm": 0.4266778768608411, + "learning_rate": 8.561551350843185e-06, + "loss": 2.311, + "step": 672 + }, + { + "epoch": 0.25959498553519766, + "grad_norm": 0.41875169594932343, + "learning_rate": 8.557228565044629e-06, + "loss": 2.3319, + "step": 673 + }, + { + "epoch": 0.25998071359691416, + "grad_norm": 0.4576486818375762, + "learning_rate": 8.552900388484527e-06, + "loss": 2.3906, + "step": 674 + }, + { + "epoch": 0.26036644165863065, + "grad_norm": 0.45405522032831847, + "learning_rate": 8.548566827721968e-06, + "loss": 2.3113, + "step": 675 + }, + { + "epoch": 0.26075216972034715, + "grad_norm": 0.45848135978953214, + "learning_rate": 8.544227889324199e-06, + "loss": 2.3568, + "step": 676 + }, + { + "epoch": 0.26113789778206364, + "grad_norm": 0.5325117292327657, + "learning_rate": 8.53988357986661e-06, + "loss": 2.3044, + "step": 677 + }, + { + "epoch": 0.26152362584378014, + "grad_norm": 0.460540990758648, + "learning_rate": 8.535533905932739e-06, + "loss": 2.335, + "step": 678 + }, + { + "epoch": 0.26190935390549663, + "grad_norm": 0.4297960673737978, + "learning_rate": 8.531178874114248e-06, + "loss": 2.3894, + "step": 679 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 0.43060572349815474, + "learning_rate": 8.526818491010922e-06, + "loss": 2.2778, + "step": 680 + }, + { + "epoch": 0.2626808100289296, + "grad_norm": 0.4052028043437063, + "learning_rate": 8.52245276323065e-06, + "loss": 2.271, + "step": 681 + }, + { + "epoch": 0.2630665380906461, + "grad_norm": 0.467624997249077, + "learning_rate": 8.518081697389433e-06, + "loss": 2.3156, + "step": 682 + }, + { + "epoch": 0.26345226615236256, + "grad_norm": 0.456634740268385, + "learning_rate": 8.513705300111344e-06, + "loss": 2.3432, + "step": 683 + }, + { + "epoch": 0.26383799421407905, + "grad_norm": 0.4565601445461014, + "learning_rate": 8.509323578028547e-06, + "loss": 2.3478, + "step": 684 + }, + { + "epoch": 0.26422372227579555, + "grad_norm": 0.43377812392761256, + "learning_rate": 8.504936537781276e-06, + "loss": 2.337, + "step": 685 + }, + { + "epoch": 0.26460945033751204, + "grad_norm": 0.4379959990068602, + "learning_rate": 8.500544186017818e-06, + "loss": 2.3597, + "step": 686 + }, + { + "epoch": 0.26499517839922854, + 
"grad_norm": 0.4393854424171466, + "learning_rate": 8.496146529394515e-06, + "loss": 2.3475, + "step": 687 + }, + { + "epoch": 0.26538090646094503, + "grad_norm": 0.46530108150002264, + "learning_rate": 8.491743574575743e-06, + "loss": 2.3816, + "step": 688 + }, + { + "epoch": 0.2657666345226615, + "grad_norm": 0.42409285117482814, + "learning_rate": 8.487335328233912e-06, + "loss": 2.2594, + "step": 689 + }, + { + "epoch": 0.266152362584378, + "grad_norm": 0.46669962561508027, + "learning_rate": 8.482921797049445e-06, + "loss": 2.3251, + "step": 690 + }, + { + "epoch": 0.2665380906460945, + "grad_norm": 0.45599266566706936, + "learning_rate": 8.478502987710784e-06, + "loss": 2.2822, + "step": 691 + }, + { + "epoch": 0.266923818707811, + "grad_norm": 0.44453326279872246, + "learning_rate": 8.474078906914359e-06, + "loss": 2.2975, + "step": 692 + }, + { + "epoch": 0.2673095467695275, + "grad_norm": 0.4421890363738117, + "learning_rate": 8.469649561364592e-06, + "loss": 2.4006, + "step": 693 + }, + { + "epoch": 0.26769527483124395, + "grad_norm": 0.42181612986205536, + "learning_rate": 8.465214957773886e-06, + "loss": 2.339, + "step": 694 + }, + { + "epoch": 0.26808100289296044, + "grad_norm": 0.4781947811050221, + "learning_rate": 8.46077510286261e-06, + "loss": 2.3552, + "step": 695 + }, + { + "epoch": 0.26846673095467694, + "grad_norm": 0.44218204028140373, + "learning_rate": 8.456330003359093e-06, + "loss": 2.3174, + "step": 696 + }, + { + "epoch": 0.26885245901639343, + "grad_norm": 0.4335540427536611, + "learning_rate": 8.45187966599961e-06, + "loss": 2.3073, + "step": 697 + }, + { + "epoch": 0.2692381870781099, + "grad_norm": 0.4255526529918983, + "learning_rate": 8.447424097528374e-06, + "loss": 2.3241, + "step": 698 + }, + { + "epoch": 0.2696239151398264, + "grad_norm": 0.4519888234246226, + "learning_rate": 8.442963304697522e-06, + "loss": 2.2504, + "step": 699 + }, + { + "epoch": 0.2700096432015429, + "grad_norm": 0.4285496344210807, + "learning_rate": 8.438497294267117e-06, + "loss": 2.3666, + "step": 700 + }, + { + "epoch": 0.2703953712632594, + "grad_norm": 0.46756712739850437, + "learning_rate": 8.434026073005121e-06, + "loss": 2.3016, + "step": 701 + }, + { + "epoch": 0.2707810993249759, + "grad_norm": 0.43224586635772977, + "learning_rate": 8.429549647687396e-06, + "loss": 2.2907, + "step": 702 + }, + { + "epoch": 0.2711668273866924, + "grad_norm": 0.4279270332396626, + "learning_rate": 8.42506802509769e-06, + "loss": 2.327, + "step": 703 + }, + { + "epoch": 0.2715525554484089, + "grad_norm": 0.4806493766376229, + "learning_rate": 8.420581212027625e-06, + "loss": 2.3157, + "step": 704 + }, + { + "epoch": 0.2719382835101254, + "grad_norm": 0.43953840194209576, + "learning_rate": 8.416089215276695e-06, + "loss": 2.2619, + "step": 705 + }, + { + "epoch": 0.27232401157184183, + "grad_norm": 0.4582687236589324, + "learning_rate": 8.411592041652241e-06, + "loss": 2.3504, + "step": 706 + }, + { + "epoch": 0.2727097396335583, + "grad_norm": 0.44503957981450315, + "learning_rate": 8.407089697969458e-06, + "loss": 2.3481, + "step": 707 + }, + { + "epoch": 0.2730954676952748, + "grad_norm": 0.42107672742502356, + "learning_rate": 8.402582191051365e-06, + "loss": 2.3103, + "step": 708 + }, + { + "epoch": 0.2734811957569913, + "grad_norm": 0.45027516032651865, + "learning_rate": 8.398069527728818e-06, + "loss": 2.3194, + "step": 709 + }, + { + "epoch": 0.2738669238187078, + "grad_norm": 0.46235879544669245, + "learning_rate": 8.393551714840477e-06, + "loss": 2.2534, + "step": 710 + }, + 
{ + "epoch": 0.2742526518804243, + "grad_norm": 0.4241828703032586, + "learning_rate": 8.389028759232816e-06, + "loss": 2.3229, + "step": 711 + }, + { + "epoch": 0.2746383799421408, + "grad_norm": 0.3991820998003778, + "learning_rate": 8.38450066776009e-06, + "loss": 2.3235, + "step": 712 + }, + { + "epoch": 0.2750241080038573, + "grad_norm": 0.45056071729226377, + "learning_rate": 8.379967447284348e-06, + "loss": 2.4087, + "step": 713 + }, + { + "epoch": 0.2754098360655738, + "grad_norm": 0.44409452817733797, + "learning_rate": 8.375429104675404e-06, + "loss": 2.3609, + "step": 714 + }, + { + "epoch": 0.2757955641272903, + "grad_norm": 0.45902422826903083, + "learning_rate": 8.370885646810842e-06, + "loss": 2.3387, + "step": 715 + }, + { + "epoch": 0.2761812921890068, + "grad_norm": 0.4678718330955543, + "learning_rate": 8.36633708057599e-06, + "loss": 2.3463, + "step": 716 + }, + { + "epoch": 0.2765670202507232, + "grad_norm": 0.42990641040563493, + "learning_rate": 8.361783412863922e-06, + "loss": 2.3658, + "step": 717 + }, + { + "epoch": 0.2769527483124397, + "grad_norm": 0.44293983886938715, + "learning_rate": 8.357224650575442e-06, + "loss": 2.2432, + "step": 718 + }, + { + "epoch": 0.2773384763741562, + "grad_norm": 0.4748980516116094, + "learning_rate": 8.352660800619075e-06, + "loss": 2.3018, + "step": 719 + }, + { + "epoch": 0.2777242044358727, + "grad_norm": 0.45821392252427534, + "learning_rate": 8.348091869911054e-06, + "loss": 2.378, + "step": 720 + }, + { + "epoch": 0.2781099324975892, + "grad_norm": 0.43505219416127844, + "learning_rate": 8.343517865375314e-06, + "loss": 2.3017, + "step": 721 + }, + { + "epoch": 0.2784956605593057, + "grad_norm": 0.47054578385085105, + "learning_rate": 8.338938793943478e-06, + "loss": 2.3064, + "step": 722 + }, + { + "epoch": 0.2788813886210222, + "grad_norm": 0.5068953478792466, + "learning_rate": 8.334354662554848e-06, + "loss": 2.3934, + "step": 723 + }, + { + "epoch": 0.2792671166827387, + "grad_norm": 0.4274771286480794, + "learning_rate": 8.329765478156394e-06, + "loss": 2.3296, + "step": 724 + }, + { + "epoch": 0.2796528447444552, + "grad_norm": 0.4540017236562187, + "learning_rate": 8.325171247702742e-06, + "loss": 2.4178, + "step": 725 + }, + { + "epoch": 0.28003857280617167, + "grad_norm": 0.43197431133254655, + "learning_rate": 8.320571978156169e-06, + "loss": 2.306, + "step": 726 + }, + { + "epoch": 0.28042430086788817, + "grad_norm": 0.5257629218842567, + "learning_rate": 8.315967676486581e-06, + "loss": 2.3461, + "step": 727 + }, + { + "epoch": 0.2808100289296046, + "grad_norm": 0.42206142604158375, + "learning_rate": 8.311358349671516e-06, + "loss": 2.3429, + "step": 728 + }, + { + "epoch": 0.2811957569913211, + "grad_norm": 0.4310166461530932, + "learning_rate": 8.30674400469613e-06, + "loss": 2.3838, + "step": 729 + }, + { + "epoch": 0.2815814850530376, + "grad_norm": 0.47512136364839236, + "learning_rate": 8.302124648553175e-06, + "loss": 2.2788, + "step": 730 + }, + { + "epoch": 0.2819672131147541, + "grad_norm": 0.44485823670509395, + "learning_rate": 8.297500288243006e-06, + "loss": 2.3716, + "step": 731 + }, + { + "epoch": 0.2823529411764706, + "grad_norm": 0.42194008296317503, + "learning_rate": 8.292870930773551e-06, + "loss": 2.2852, + "step": 732 + }, + { + "epoch": 0.2827386692381871, + "grad_norm": 0.4727245134331024, + "learning_rate": 8.288236583160322e-06, + "loss": 2.4187, + "step": 733 + }, + { + "epoch": 0.2831243972999036, + "grad_norm": 0.4656778918218644, + "learning_rate": 8.283597252426389e-06, + 
"loss": 2.3504, + "step": 734 + }, + { + "epoch": 0.28351012536162007, + "grad_norm": 0.4431636979476998, + "learning_rate": 8.27895294560237e-06, + "loss": 2.3422, + "step": 735 + }, + { + "epoch": 0.28389585342333656, + "grad_norm": 0.41930968524089257, + "learning_rate": 8.274303669726427e-06, + "loss": 2.308, + "step": 736 + }, + { + "epoch": 0.28428158148505306, + "grad_norm": 0.4722213099735777, + "learning_rate": 8.269649431844253e-06, + "loss": 2.359, + "step": 737 + }, + { + "epoch": 0.28466730954676955, + "grad_norm": 0.4884260822490044, + "learning_rate": 8.26499023900906e-06, + "loss": 2.3165, + "step": 738 + }, + { + "epoch": 0.285053037608486, + "grad_norm": 0.453666419747345, + "learning_rate": 8.260326098281567e-06, + "loss": 2.2771, + "step": 739 + }, + { + "epoch": 0.2854387656702025, + "grad_norm": 0.434870597831758, + "learning_rate": 8.255657016729997e-06, + "loss": 2.3319, + "step": 740 + }, + { + "epoch": 0.285824493731919, + "grad_norm": 0.4431025745177239, + "learning_rate": 8.250983001430055e-06, + "loss": 2.3395, + "step": 741 + }, + { + "epoch": 0.2862102217936355, + "grad_norm": 0.47702025242587814, + "learning_rate": 8.24630405946492e-06, + "loss": 2.2912, + "step": 742 + }, + { + "epoch": 0.286595949855352, + "grad_norm": 0.4285271915769268, + "learning_rate": 8.241620197925247e-06, + "loss": 2.3997, + "step": 743 + }, + { + "epoch": 0.28698167791706847, + "grad_norm": 0.4519377530435274, + "learning_rate": 8.23693142390914e-06, + "loss": 2.385, + "step": 744 + }, + { + "epoch": 0.28736740597878496, + "grad_norm": 0.42967581483794326, + "learning_rate": 8.232237744522145e-06, + "loss": 2.3602, + "step": 745 + }, + { + "epoch": 0.28775313404050146, + "grad_norm": 0.4380475395781914, + "learning_rate": 8.227539166877244e-06, + "loss": 2.356, + "step": 746 + }, + { + "epoch": 0.28813886210221795, + "grad_norm": 0.428033800976379, + "learning_rate": 8.222835698094849e-06, + "loss": 2.3886, + "step": 747 + }, + { + "epoch": 0.28852459016393445, + "grad_norm": 0.43342082942139437, + "learning_rate": 8.218127345302775e-06, + "loss": 2.3613, + "step": 748 + }, + { + "epoch": 0.28891031822565094, + "grad_norm": 0.43796959158042176, + "learning_rate": 8.21341411563624e-06, + "loss": 2.2804, + "step": 749 + }, + { + "epoch": 0.2892960462873674, + "grad_norm": 0.4258383538912118, + "learning_rate": 8.208696016237858e-06, + "loss": 2.3471, + "step": 750 + }, + { + "epoch": 0.2896817743490839, + "grad_norm": 0.4527951901212638, + "learning_rate": 8.203973054257614e-06, + "loss": 2.3433, + "step": 751 + }, + { + "epoch": 0.29006750241080037, + "grad_norm": 0.4746672206815161, + "learning_rate": 8.199245236852871e-06, + "loss": 2.3056, + "step": 752 + }, + { + "epoch": 0.29045323047251687, + "grad_norm": 0.446204159927227, + "learning_rate": 8.194512571188347e-06, + "loss": 2.3905, + "step": 753 + }, + { + "epoch": 0.29083895853423336, + "grad_norm": 0.4390420515188638, + "learning_rate": 8.189775064436101e-06, + "loss": 2.2572, + "step": 754 + }, + { + "epoch": 0.29122468659594986, + "grad_norm": 0.4249036856189722, + "learning_rate": 8.18503272377554e-06, + "loss": 2.3421, + "step": 755 + }, + { + "epoch": 0.29161041465766635, + "grad_norm": 0.4477391836717647, + "learning_rate": 8.180285556393384e-06, + "loss": 2.241, + "step": 756 + }, + { + "epoch": 0.29199614271938285, + "grad_norm": 0.45573037235185965, + "learning_rate": 8.175533569483678e-06, + "loss": 2.2949, + "step": 757 + }, + { + "epoch": 0.29238187078109934, + "grad_norm": 0.45332611581297916, + 
"learning_rate": 8.170776770247766e-06, + "loss": 2.2902, + "step": 758 + }, + { + "epoch": 0.29276759884281583, + "grad_norm": 0.42911111909344035, + "learning_rate": 8.166015165894285e-06, + "loss": 2.3321, + "step": 759 + }, + { + "epoch": 0.29315332690453233, + "grad_norm": 0.45635018516913833, + "learning_rate": 8.161248763639154e-06, + "loss": 2.3595, + "step": 760 + }, + { + "epoch": 0.29353905496624877, + "grad_norm": 0.4353529159629687, + "learning_rate": 8.156477570705561e-06, + "loss": 2.3467, + "step": 761 + }, + { + "epoch": 0.29392478302796526, + "grad_norm": 0.43159025437557735, + "learning_rate": 8.151701594323957e-06, + "loss": 2.3273, + "step": 762 + }, + { + "epoch": 0.29431051108968176, + "grad_norm": 0.45204397895329773, + "learning_rate": 8.146920841732045e-06, + "loss": 2.3402, + "step": 763 + }, + { + "epoch": 0.29469623915139825, + "grad_norm": 0.48590496149018464, + "learning_rate": 8.142135320174758e-06, + "loss": 2.3194, + "step": 764 + }, + { + "epoch": 0.29508196721311475, + "grad_norm": 0.44858443291686595, + "learning_rate": 8.13734503690426e-06, + "loss": 2.4044, + "step": 765 + }, + { + "epoch": 0.29546769527483124, + "grad_norm": 0.4780747442685094, + "learning_rate": 8.132549999179934e-06, + "loss": 2.3557, + "step": 766 + }, + { + "epoch": 0.29585342333654774, + "grad_norm": 0.46334226284408603, + "learning_rate": 8.127750214268363e-06, + "loss": 2.3217, + "step": 767 + }, + { + "epoch": 0.29623915139826423, + "grad_norm": 0.428819680186002, + "learning_rate": 8.122945689443328e-06, + "loss": 2.374, + "step": 768 + }, + { + "epoch": 0.29662487945998073, + "grad_norm": 0.49990179942528384, + "learning_rate": 8.11813643198579e-06, + "loss": 2.2795, + "step": 769 + }, + { + "epoch": 0.2970106075216972, + "grad_norm": 0.4869602981354855, + "learning_rate": 8.113322449183884e-06, + "loss": 2.3162, + "step": 770 + }, + { + "epoch": 0.2973963355834137, + "grad_norm": 0.4359106823661919, + "learning_rate": 8.108503748332906e-06, + "loss": 2.2967, + "step": 771 + }, + { + "epoch": 0.29778206364513016, + "grad_norm": 0.46852221277067796, + "learning_rate": 8.1036803367353e-06, + "loss": 2.39, + "step": 772 + }, + { + "epoch": 0.29816779170684665, + "grad_norm": 0.4376965119507577, + "learning_rate": 8.098852221700652e-06, + "loss": 2.3758, + "step": 773 + }, + { + "epoch": 0.29855351976856315, + "grad_norm": 0.42922571975329926, + "learning_rate": 8.094019410545673e-06, + "loss": 2.2821, + "step": 774 + }, + { + "epoch": 0.29893924783027964, + "grad_norm": 0.43239232104124353, + "learning_rate": 8.089181910594191e-06, + "loss": 2.4046, + "step": 775 + }, + { + "epoch": 0.29932497589199614, + "grad_norm": 0.4361179145248521, + "learning_rate": 8.084339729177142e-06, + "loss": 2.3665, + "step": 776 + }, + { + "epoch": 0.29971070395371263, + "grad_norm": 0.4620195317271602, + "learning_rate": 8.079492873632554e-06, + "loss": 2.3647, + "step": 777 + }, + { + "epoch": 0.3000964320154291, + "grad_norm": 0.4292740072549495, + "learning_rate": 8.074641351305539e-06, + "loss": 2.3595, + "step": 778 + }, + { + "epoch": 0.3004821600771456, + "grad_norm": 0.4702921848552218, + "learning_rate": 8.069785169548279e-06, + "loss": 2.3734, + "step": 779 + }, + { + "epoch": 0.3008678881388621, + "grad_norm": 0.47865810368594297, + "learning_rate": 8.064924335720023e-06, + "loss": 2.324, + "step": 780 + }, + { + "epoch": 0.3012536162005786, + "grad_norm": 0.4235903947368299, + "learning_rate": 8.060058857187066e-06, + "loss": 2.3847, + "step": 781 + }, + { + "epoch": 
0.3016393442622951, + "grad_norm": 0.4284741604176135, + "learning_rate": 8.05518874132274e-06, + "loss": 2.3455, + "step": 782 + }, + { + "epoch": 0.30202507232401155, + "grad_norm": 0.4345349921182491, + "learning_rate": 8.050313995507406e-06, + "loss": 2.3149, + "step": 783 + }, + { + "epoch": 0.30241080038572804, + "grad_norm": 0.4687050619045661, + "learning_rate": 8.045434627128446e-06, + "loss": 2.3534, + "step": 784 + }, + { + "epoch": 0.30279652844744454, + "grad_norm": 0.44461581794714267, + "learning_rate": 8.04055064358024e-06, + "loss": 2.3629, + "step": 785 + }, + { + "epoch": 0.30318225650916103, + "grad_norm": 0.4460439925749081, + "learning_rate": 8.035662052264167e-06, + "loss": 2.2856, + "step": 786 + }, + { + "epoch": 0.3035679845708775, + "grad_norm": 0.42462245180475355, + "learning_rate": 8.030768860588585e-06, + "loss": 2.3613, + "step": 787 + }, + { + "epoch": 0.303953712632594, + "grad_norm": 0.45679951273446, + "learning_rate": 8.025871075968828e-06, + "loss": 2.303, + "step": 788 + }, + { + "epoch": 0.3043394406943105, + "grad_norm": 0.42176465264134017, + "learning_rate": 8.020968705827184e-06, + "loss": 2.4177, + "step": 789 + }, + { + "epoch": 0.304725168756027, + "grad_norm": 0.45242146039478087, + "learning_rate": 8.0160617575929e-06, + "loss": 2.3975, + "step": 790 + }, + { + "epoch": 0.3051108968177435, + "grad_norm": 0.45600534039329604, + "learning_rate": 8.01115023870215e-06, + "loss": 2.347, + "step": 791 + }, + { + "epoch": 0.30549662487946, + "grad_norm": 0.444106058641205, + "learning_rate": 8.006234156598043e-06, + "loss": 2.3369, + "step": 792 + }, + { + "epoch": 0.3058823529411765, + "grad_norm": 0.42845035166525575, + "learning_rate": 8.001313518730596e-06, + "loss": 2.3673, + "step": 793 + }, + { + "epoch": 0.30626808100289293, + "grad_norm": 0.4232424401622947, + "learning_rate": 7.996388332556735e-06, + "loss": 2.3429, + "step": 794 + }, + { + "epoch": 0.30665380906460943, + "grad_norm": 0.4488939460799838, + "learning_rate": 7.99145860554028e-06, + "loss": 2.248, + "step": 795 + }, + { + "epoch": 0.3070395371263259, + "grad_norm": 0.4455748705862296, + "learning_rate": 7.986524345151924e-06, + "loss": 2.3008, + "step": 796 + }, + { + "epoch": 0.3074252651880424, + "grad_norm": 0.4425635266461437, + "learning_rate": 7.981585558869244e-06, + "loss": 2.3849, + "step": 797 + }, + { + "epoch": 0.3078109932497589, + "grad_norm": 0.4777966028201153, + "learning_rate": 7.976642254176658e-06, + "loss": 2.3225, + "step": 798 + }, + { + "epoch": 0.3081967213114754, + "grad_norm": 0.47200162397794504, + "learning_rate": 7.97169443856545e-06, + "loss": 2.2937, + "step": 799 + }, + { + "epoch": 0.3085824493731919, + "grad_norm": 0.42909077888030944, + "learning_rate": 7.966742119533724e-06, + "loss": 2.3257, + "step": 800 + }, + { + "epoch": 0.3089681774349084, + "grad_norm": 0.4096126054850406, + "learning_rate": 7.961785304586418e-06, + "loss": 2.3265, + "step": 801 + }, + { + "epoch": 0.3093539054966249, + "grad_norm": 0.4347992134327031, + "learning_rate": 7.956824001235281e-06, + "loss": 2.3526, + "step": 802 + }, + { + "epoch": 0.3097396335583414, + "grad_norm": 0.43862916899554255, + "learning_rate": 7.951858216998863e-06, + "loss": 2.2812, + "step": 803 + }, + { + "epoch": 0.3101253616200579, + "grad_norm": 0.4600817148233885, + "learning_rate": 7.946887959402504e-06, + "loss": 2.33, + "step": 804 + }, + { + "epoch": 0.3105110896817743, + "grad_norm": 0.48101554196914814, + "learning_rate": 7.941913235978329e-06, + "loss": 2.3809, + "step": 805 + 
}, + { + "epoch": 0.3108968177434908, + "grad_norm": 0.4665225951071249, + "learning_rate": 7.936934054265222e-06, + "loss": 2.3546, + "step": 806 + }, + { + "epoch": 0.3112825458052073, + "grad_norm": 0.43534765876665066, + "learning_rate": 7.931950421808828e-06, + "loss": 2.3582, + "step": 807 + }, + { + "epoch": 0.3116682738669238, + "grad_norm": 0.42892045412622404, + "learning_rate": 7.926962346161535e-06, + "loss": 2.3411, + "step": 808 + }, + { + "epoch": 0.3120540019286403, + "grad_norm": 0.42918015973344015, + "learning_rate": 7.921969834882468e-06, + "loss": 2.2586, + "step": 809 + }, + { + "epoch": 0.3124397299903568, + "grad_norm": 0.4571111610981652, + "learning_rate": 7.916972895537471e-06, + "loss": 2.2982, + "step": 810 + }, + { + "epoch": 0.3128254580520733, + "grad_norm": 0.42461745307941356, + "learning_rate": 7.911971535699097e-06, + "loss": 2.3182, + "step": 811 + }, + { + "epoch": 0.3132111861137898, + "grad_norm": 0.46143612577410686, + "learning_rate": 7.9069657629466e-06, + "loss": 2.4201, + "step": 812 + }, + { + "epoch": 0.3135969141755063, + "grad_norm": 0.4515370603709611, + "learning_rate": 7.901955584865923e-06, + "loss": 2.3354, + "step": 813 + }, + { + "epoch": 0.3139826422372228, + "grad_norm": 0.4408669003589643, + "learning_rate": 7.896941009049682e-06, + "loss": 2.3545, + "step": 814 + }, + { + "epoch": 0.31436837029893927, + "grad_norm": 0.43867232073734636, + "learning_rate": 7.891922043097162e-06, + "loss": 2.3471, + "step": 815 + }, + { + "epoch": 0.31475409836065577, + "grad_norm": 0.41494825233536436, + "learning_rate": 7.886898694614292e-06, + "loss": 2.3706, + "step": 816 + }, + { + "epoch": 0.3151398264223722, + "grad_norm": 0.4654886458710088, + "learning_rate": 7.881870971213652e-06, + "loss": 2.3748, + "step": 817 + }, + { + "epoch": 0.3155255544840887, + "grad_norm": 0.42965298008792185, + "learning_rate": 7.876838880514448e-06, + "loss": 2.3053, + "step": 818 + }, + { + "epoch": 0.3159112825458052, + "grad_norm": 0.4838698164022045, + "learning_rate": 7.871802430142506e-06, + "loss": 2.3315, + "step": 819 + }, + { + "epoch": 0.3162970106075217, + "grad_norm": 0.42798835508259303, + "learning_rate": 7.866761627730253e-06, + "loss": 2.3773, + "step": 820 + }, + { + "epoch": 0.3166827386692382, + "grad_norm": 0.47092082650843975, + "learning_rate": 7.86171648091672e-06, + "loss": 2.3291, + "step": 821 + }, + { + "epoch": 0.3170684667309547, + "grad_norm": 0.458184182466206, + "learning_rate": 7.856666997347515e-06, + "loss": 2.3083, + "step": 822 + }, + { + "epoch": 0.3174541947926712, + "grad_norm": 0.5946076914449817, + "learning_rate": 7.851613184674821e-06, + "loss": 2.4127, + "step": 823 + }, + { + "epoch": 0.31783992285438767, + "grad_norm": 0.5024372758017904, + "learning_rate": 7.846555050557381e-06, + "loss": 2.3815, + "step": 824 + }, + { + "epoch": 0.31822565091610416, + "grad_norm": 0.4388051557059938, + "learning_rate": 7.841492602660487e-06, + "loss": 2.3495, + "step": 825 + }, + { + "epoch": 0.31861137897782066, + "grad_norm": 0.4467754067676991, + "learning_rate": 7.836425848655968e-06, + "loss": 2.3293, + "step": 826 + }, + { + "epoch": 0.31899710703953715, + "grad_norm": 0.4586352536542521, + "learning_rate": 7.831354796222178e-06, + "loss": 2.372, + "step": 827 + }, + { + "epoch": 0.3193828351012536, + "grad_norm": 0.45899021692487624, + "learning_rate": 7.826279453043985e-06, + "loss": 2.3055, + "step": 828 + }, + { + "epoch": 0.3197685631629701, + "grad_norm": 0.4625457915914088, + "learning_rate": 7.821199826812764e-06, 
+ "loss": 2.3704, + "step": 829 + }, + { + "epoch": 0.3201542912246866, + "grad_norm": 0.48422010360163636, + "learning_rate": 7.816115925226373e-06, + "loss": 2.4121, + "step": 830 + }, + { + "epoch": 0.3205400192864031, + "grad_norm": 0.4441602028094247, + "learning_rate": 7.811027755989153e-06, + "loss": 2.3545, + "step": 831 + }, + { + "epoch": 0.32092574734811957, + "grad_norm": 0.47419398203385676, + "learning_rate": 7.805935326811913e-06, + "loss": 2.3258, + "step": 832 + }, + { + "epoch": 0.32131147540983607, + "grad_norm": 0.42641661954889865, + "learning_rate": 7.800838645411917e-06, + "loss": 2.3169, + "step": 833 + }, + { + "epoch": 0.32169720347155256, + "grad_norm": 0.4545956560840867, + "learning_rate": 7.795737719512872e-06, + "loss": 2.3415, + "step": 834 + }, + { + "epoch": 0.32208293153326906, + "grad_norm": 0.4596332641427211, + "learning_rate": 7.79063255684492e-06, + "loss": 2.3337, + "step": 835 + }, + { + "epoch": 0.32246865959498555, + "grad_norm": 0.43100947353655417, + "learning_rate": 7.78552316514462e-06, + "loss": 2.3427, + "step": 836 + }, + { + "epoch": 0.32285438765670205, + "grad_norm": 0.4581439366747127, + "learning_rate": 7.78040955215494e-06, + "loss": 2.3594, + "step": 837 + }, + { + "epoch": 0.32324011571841854, + "grad_norm": 0.4395366404437616, + "learning_rate": 7.775291725625252e-06, + "loss": 2.3767, + "step": 838 + }, + { + "epoch": 0.323625843780135, + "grad_norm": 0.44458456150956316, + "learning_rate": 7.7701696933113e-06, + "loss": 2.3406, + "step": 839 + }, + { + "epoch": 0.3240115718418515, + "grad_norm": 0.47111729135319297, + "learning_rate": 7.765043462975217e-06, + "loss": 2.3604, + "step": 840 + }, + { + "epoch": 0.32439729990356797, + "grad_norm": 0.4224299456693926, + "learning_rate": 7.759913042385487e-06, + "loss": 2.3147, + "step": 841 + }, + { + "epoch": 0.32478302796528447, + "grad_norm": 0.4255486761004747, + "learning_rate": 7.754778439316947e-06, + "loss": 2.2993, + "step": 842 + }, + { + "epoch": 0.32516875602700096, + "grad_norm": 0.4250357635068921, + "learning_rate": 7.749639661550775e-06, + "loss": 2.3983, + "step": 843 + }, + { + "epoch": 0.32555448408871746, + "grad_norm": 0.44438113155091435, + "learning_rate": 7.744496716874472e-06, + "loss": 2.263, + "step": 844 + }, + { + "epoch": 0.32594021215043395, + "grad_norm": 0.46758813922495346, + "learning_rate": 7.739349613081854e-06, + "loss": 2.2812, + "step": 845 + }, + { + "epoch": 0.32632594021215044, + "grad_norm": 0.45898502931127505, + "learning_rate": 7.734198357973041e-06, + "loss": 2.3454, + "step": 846 + }, + { + "epoch": 0.32671166827386694, + "grad_norm": 0.44360162517764584, + "learning_rate": 7.729042959354447e-06, + "loss": 2.3242, + "step": 847 + }, + { + "epoch": 0.32709739633558343, + "grad_norm": 0.42832329145176784, + "learning_rate": 7.723883425038759e-06, + "loss": 2.3197, + "step": 848 + }, + { + "epoch": 0.32748312439729993, + "grad_norm": 0.44317044594073346, + "learning_rate": 7.718719762844935e-06, + "loss": 2.2811, + "step": 849 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 0.47226771226988595, + "learning_rate": 7.713551980598189e-06, + "loss": 2.3576, + "step": 850 + }, + { + "epoch": 0.32825458052073286, + "grad_norm": 0.460364931979272, + "learning_rate": 7.708380086129977e-06, + "loss": 2.3019, + "step": 851 + }, + { + "epoch": 0.32864030858244936, + "grad_norm": 0.5167468336564306, + "learning_rate": 7.703204087277989e-06, + "loss": 2.3055, + "step": 852 + }, + { + "epoch": 0.32902603664416585, + "grad_norm": 
0.4936892938530032, + "learning_rate": 7.698023991886133e-06, + "loss": 2.3918, + "step": 853 + }, + { + "epoch": 0.32941176470588235, + "grad_norm": 0.4659719100731399, + "learning_rate": 7.692839807804522e-06, + "loss": 2.3606, + "step": 854 + }, + { + "epoch": 0.32979749276759884, + "grad_norm": 0.41721873745637655, + "learning_rate": 7.687651542889474e-06, + "loss": 2.3027, + "step": 855 + }, + { + "epoch": 0.33018322082931534, + "grad_norm": 0.47568708565046847, + "learning_rate": 7.682459205003484e-06, + "loss": 2.1596, + "step": 856 + }, + { + "epoch": 0.33056894889103183, + "grad_norm": 0.4212915311019565, + "learning_rate": 7.677262802015223e-06, + "loss": 2.3223, + "step": 857 + }, + { + "epoch": 0.33095467695274833, + "grad_norm": 0.41267202978176354, + "learning_rate": 7.672062341799516e-06, + "loss": 2.3424, + "step": 858 + }, + { + "epoch": 0.3313404050144648, + "grad_norm": 0.43703247499268455, + "learning_rate": 7.666857832237343e-06, + "loss": 2.351, + "step": 859 + }, + { + "epoch": 0.3317261330761813, + "grad_norm": 0.4339373605530483, + "learning_rate": 7.661649281215823e-06, + "loss": 2.2267, + "step": 860 + }, + { + "epoch": 0.33211186113789776, + "grad_norm": 0.4095000678783189, + "learning_rate": 7.656436696628194e-06, + "loss": 2.3461, + "step": 861 + }, + { + "epoch": 0.33249758919961425, + "grad_norm": 0.4052235350798568, + "learning_rate": 7.651220086373803e-06, + "loss": 2.371, + "step": 862 + }, + { + "epoch": 0.33288331726133075, + "grad_norm": 0.42506673786742166, + "learning_rate": 7.645999458358107e-06, + "loss": 2.3749, + "step": 863 + }, + { + "epoch": 0.33326904532304724, + "grad_norm": 0.4308040407483772, + "learning_rate": 7.640774820492647e-06, + "loss": 2.3551, + "step": 864 + }, + { + "epoch": 0.33365477338476374, + "grad_norm": 0.44619888704058536, + "learning_rate": 7.635546180695039e-06, + "loss": 2.3325, + "step": 865 + }, + { + "epoch": 0.33404050144648023, + "grad_norm": 0.458016446786189, + "learning_rate": 7.630313546888968e-06, + "loss": 2.3347, + "step": 866 + }, + { + "epoch": 0.3344262295081967, + "grad_norm": 0.4123412862810054, + "learning_rate": 7.625076927004169e-06, + "loss": 2.3084, + "step": 867 + }, + { + "epoch": 0.3348119575699132, + "grad_norm": 0.4705401473635554, + "learning_rate": 7.619836328976416e-06, + "loss": 2.3727, + "step": 868 + }, + { + "epoch": 0.3351976856316297, + "grad_norm": 0.4362543045922105, + "learning_rate": 7.614591760747516e-06, + "loss": 2.3146, + "step": 869 + }, + { + "epoch": 0.3355834136933462, + "grad_norm": 0.4445694236054695, + "learning_rate": 7.6093432302652895e-06, + "loss": 2.308, + "step": 870 + }, + { + "epoch": 0.3359691417550627, + "grad_norm": 0.43128765692864757, + "learning_rate": 7.604090745483562e-06, + "loss": 2.3179, + "step": 871 + }, + { + "epoch": 0.33635486981677915, + "grad_norm": 0.45804986585748414, + "learning_rate": 7.598834314362151e-06, + "loss": 2.3559, + "step": 872 + }, + { + "epoch": 0.33674059787849564, + "grad_norm": 0.43170738334665876, + "learning_rate": 7.593573944866857e-06, + "loss": 2.3143, + "step": 873 + }, + { + "epoch": 0.33712632594021213, + "grad_norm": 0.5067556063547982, + "learning_rate": 7.588309644969445e-06, + "loss": 2.3403, + "step": 874 + }, + { + "epoch": 0.33751205400192863, + "grad_norm": 0.45811233476707985, + "learning_rate": 7.58304142264764e-06, + "loss": 2.3567, + "step": 875 + }, + { + "epoch": 0.3378977820636451, + "grad_norm": 0.5042358345345198, + "learning_rate": 7.57776928588511e-06, + "loss": 2.289, + "step": 876 + }, + { + 
"epoch": 0.3382835101253616, + "grad_norm": 0.47536789766317805, + "learning_rate": 7.572493242671453e-06, + "loss": 2.3533, + "step": 877 + }, + { + "epoch": 0.3386692381870781, + "grad_norm": 0.4330481840781185, + "learning_rate": 7.567213301002189e-06, + "loss": 2.2871, + "step": 878 + }, + { + "epoch": 0.3390549662487946, + "grad_norm": 0.43206128149569756, + "learning_rate": 7.561929468878746e-06, + "loss": 2.3191, + "step": 879 + }, + { + "epoch": 0.3394406943105111, + "grad_norm": 0.43227398982142884, + "learning_rate": 7.556641754308447e-06, + "loss": 2.2718, + "step": 880 + }, + { + "epoch": 0.3398264223722276, + "grad_norm": 0.49571664458087944, + "learning_rate": 7.5513501653045e-06, + "loss": 2.4013, + "step": 881 + }, + { + "epoch": 0.3402121504339441, + "grad_norm": 0.45840248989677757, + "learning_rate": 7.546054709885981e-06, + "loss": 2.2785, + "step": 882 + }, + { + "epoch": 0.34059787849566053, + "grad_norm": 0.5225076487174943, + "learning_rate": 7.540755396077828e-06, + "loss": 2.3079, + "step": 883 + }, + { + "epoch": 0.34098360655737703, + "grad_norm": 0.41229766105030147, + "learning_rate": 7.535452231910829e-06, + "loss": 2.3381, + "step": 884 + }, + { + "epoch": 0.3413693346190935, + "grad_norm": 0.4652528286296387, + "learning_rate": 7.5301452254216e-06, + "loss": 2.361, + "step": 885 + }, + { + "epoch": 0.34175506268081, + "grad_norm": 0.4670802361126979, + "learning_rate": 7.524834384652586e-06, + "loss": 2.3392, + "step": 886 + }, + { + "epoch": 0.3421407907425265, + "grad_norm": 0.4408580017883797, + "learning_rate": 7.519519717652039e-06, + "loss": 2.322, + "step": 887 + }, + { + "epoch": 0.342526518804243, + "grad_norm": 0.4193533845351374, + "learning_rate": 7.514201232474012e-06, + "loss": 2.3312, + "step": 888 + }, + { + "epoch": 0.3429122468659595, + "grad_norm": 0.4159448991368155, + "learning_rate": 7.50887893717834e-06, + "loss": 2.3291, + "step": 889 + }, + { + "epoch": 0.343297974927676, + "grad_norm": 0.41481103356640897, + "learning_rate": 7.503552839830638e-06, + "loss": 2.3178, + "step": 890 + }, + { + "epoch": 0.3436837029893925, + "grad_norm": 0.4491536254248892, + "learning_rate": 7.498222948502277e-06, + "loss": 2.3412, + "step": 891 + }, + { + "epoch": 0.344069431051109, + "grad_norm": 0.43446834098269727, + "learning_rate": 7.492889271270382e-06, + "loss": 2.3726, + "step": 892 + }, + { + "epoch": 0.3444551591128255, + "grad_norm": 0.44546096046509476, + "learning_rate": 7.487551816217813e-06, + "loss": 2.2741, + "step": 893 + }, + { + "epoch": 0.3448408871745419, + "grad_norm": 0.44732584277374626, + "learning_rate": 7.482210591433156e-06, + "loss": 2.3504, + "step": 894 + }, + { + "epoch": 0.3452266152362584, + "grad_norm": 0.44113243248302264, + "learning_rate": 7.4768656050107065e-06, + "loss": 2.305, + "step": 895 + }, + { + "epoch": 0.3456123432979749, + "grad_norm": 0.48335465507229275, + "learning_rate": 7.471516865050468e-06, + "loss": 2.3396, + "step": 896 + }, + { + "epoch": 0.3459980713596914, + "grad_norm": 0.4466799510147099, + "learning_rate": 7.466164379658123e-06, + "loss": 2.3965, + "step": 897 + }, + { + "epoch": 0.3463837994214079, + "grad_norm": 0.43440164723782954, + "learning_rate": 7.4608081569450365e-06, + "loss": 2.3035, + "step": 898 + }, + { + "epoch": 0.3467695274831244, + "grad_norm": 0.4400112286896313, + "learning_rate": 7.455448205028238e-06, + "loss": 2.3103, + "step": 899 + }, + { + "epoch": 0.3471552555448409, + "grad_norm": 0.46608894618304125, + "learning_rate": 7.450084532030402e-06, + "loss": 
2.2765, + "step": 900 + }, + { + "epoch": 0.3475409836065574, + "grad_norm": 0.45086681159300906, + "learning_rate": 7.444717146079845e-06, + "loss": 2.4157, + "step": 901 + }, + { + "epoch": 0.3479267116682739, + "grad_norm": 0.4478101138542285, + "learning_rate": 7.439346055310514e-06, + "loss": 2.3622, + "step": 902 + }, + { + "epoch": 0.3483124397299904, + "grad_norm": 0.41300538182631347, + "learning_rate": 7.433971267861966e-06, + "loss": 2.3724, + "step": 903 + }, + { + "epoch": 0.34869816779170687, + "grad_norm": 0.4158739751317302, + "learning_rate": 7.428592791879361e-06, + "loss": 2.3306, + "step": 904 + }, + { + "epoch": 0.3490838958534233, + "grad_norm": 0.4609002769648654, + "learning_rate": 7.42321063551345e-06, + "loss": 2.2837, + "step": 905 + }, + { + "epoch": 0.3494696239151398, + "grad_norm": 0.4252440776041672, + "learning_rate": 7.41782480692056e-06, + "loss": 2.3061, + "step": 906 + }, + { + "epoch": 0.3498553519768563, + "grad_norm": 0.48234747553660545, + "learning_rate": 7.412435314262585e-06, + "loss": 2.3373, + "step": 907 + }, + { + "epoch": 0.3502410800385728, + "grad_norm": 0.4716302401394498, + "learning_rate": 7.407042165706969e-06, + "loss": 2.32, + "step": 908 + }, + { + "epoch": 0.3506268081002893, + "grad_norm": 0.49459850070567807, + "learning_rate": 7.401645369426697e-06, + "loss": 2.3398, + "step": 909 + }, + { + "epoch": 0.3510125361620058, + "grad_norm": 0.41483151543671365, + "learning_rate": 7.396244933600285e-06, + "loss": 2.3372, + "step": 910 + }, + { + "epoch": 0.3513982642237223, + "grad_norm": 0.4368914093246457, + "learning_rate": 7.390840866411759e-06, + "loss": 2.3382, + "step": 911 + }, + { + "epoch": 0.3517839922854388, + "grad_norm": 0.42396785049832025, + "learning_rate": 7.385433176050654e-06, + "loss": 2.3367, + "step": 912 + }, + { + "epoch": 0.35216972034715527, + "grad_norm": 0.44575975227453146, + "learning_rate": 7.380021870711991e-06, + "loss": 2.392, + "step": 913 + }, + { + "epoch": 0.35255544840887176, + "grad_norm": 0.4987776280098204, + "learning_rate": 7.37460695859627e-06, + "loss": 2.3795, + "step": 914 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 0.4463947309244009, + "learning_rate": 7.369188447909459e-06, + "loss": 2.3268, + "step": 915 + }, + { + "epoch": 0.3533269045323047, + "grad_norm": 0.41673276875071985, + "learning_rate": 7.36376634686298e-06, + "loss": 2.3676, + "step": 916 + }, + { + "epoch": 0.3537126325940212, + "grad_norm": 0.44534856324225514, + "learning_rate": 7.358340663673695e-06, + "loss": 2.3705, + "step": 917 + }, + { + "epoch": 0.3540983606557377, + "grad_norm": 0.460255688689065, + "learning_rate": 7.352911406563888e-06, + "loss": 2.322, + "step": 918 + }, + { + "epoch": 0.3544840887174542, + "grad_norm": 0.43611753646963614, + "learning_rate": 7.347478583761272e-06, + "loss": 2.2735, + "step": 919 + }, + { + "epoch": 0.3548698167791707, + "grad_norm": 0.43948537974215407, + "learning_rate": 7.342042203498952e-06, + "loss": 2.319, + "step": 920 + }, + { + "epoch": 0.35525554484088717, + "grad_norm": 0.45940409785850245, + "learning_rate": 7.3366022740154285e-06, + "loss": 2.3538, + "step": 921 + }, + { + "epoch": 0.35564127290260367, + "grad_norm": 0.46162875647189644, + "learning_rate": 7.331158803554585e-06, + "loss": 2.3226, + "step": 922 + }, + { + "epoch": 0.35602700096432016, + "grad_norm": 0.4368969469152973, + "learning_rate": 7.325711800365662e-06, + "loss": 2.3425, + "step": 923 + }, + { + "epoch": 0.35641272902603666, + "grad_norm": 0.4591835143995513, + "learning_rate": 
7.320261272703259e-06, + "loss": 2.3363, + "step": 924 + }, + { + "epoch": 0.35679845708775315, + "grad_norm": 0.4760097133592221, + "learning_rate": 7.31480722882732e-06, + "loss": 2.3522, + "step": 925 + }, + { + "epoch": 0.35718418514946965, + "grad_norm": 0.46169731625643, + "learning_rate": 7.309349677003111e-06, + "loss": 2.3759, + "step": 926 + }, + { + "epoch": 0.35756991321118614, + "grad_norm": 0.47075847741995125, + "learning_rate": 7.303888625501217e-06, + "loss": 2.3803, + "step": 927 + }, + { + "epoch": 0.3579556412729026, + "grad_norm": 0.44926789399658806, + "learning_rate": 7.298424082597526e-06, + "loss": 2.3828, + "step": 928 + }, + { + "epoch": 0.3583413693346191, + "grad_norm": 0.47082834920460115, + "learning_rate": 7.292956056573217e-06, + "loss": 2.326, + "step": 929 + }, + { + "epoch": 0.35872709739633557, + "grad_norm": 0.43121536673404864, + "learning_rate": 7.28748455571475e-06, + "loss": 2.3573, + "step": 930 + }, + { + "epoch": 0.35911282545805207, + "grad_norm": 0.49565820410327954, + "learning_rate": 7.2820095883138456e-06, + "loss": 2.3946, + "step": 931 + }, + { + "epoch": 0.35949855351976856, + "grad_norm": 0.4462732434716897, + "learning_rate": 7.276531162667484e-06, + "loss": 2.3244, + "step": 932 + }, + { + "epoch": 0.35988428158148505, + "grad_norm": 0.5351326609547389, + "learning_rate": 7.271049287077881e-06, + "loss": 2.2503, + "step": 933 + }, + { + "epoch": 0.36027000964320155, + "grad_norm": 0.4590061951597883, + "learning_rate": 7.265563969852482e-06, + "loss": 2.318, + "step": 934 + }, + { + "epoch": 0.36065573770491804, + "grad_norm": 0.4319454825999862, + "learning_rate": 7.260075219303951e-06, + "loss": 2.3753, + "step": 935 + }, + { + "epoch": 0.36104146576663454, + "grad_norm": 0.41309154995577047, + "learning_rate": 7.254583043750152e-06, + "loss": 2.3311, + "step": 936 + }, + { + "epoch": 0.36142719382835103, + "grad_norm": 0.5407552089493747, + "learning_rate": 7.249087451514137e-06, + "loss": 2.2955, + "step": 937 + }, + { + "epoch": 0.36181292189006753, + "grad_norm": 0.4492520741093502, + "learning_rate": 7.243588450924142e-06, + "loss": 2.2689, + "step": 938 + }, + { + "epoch": 0.36219864995178397, + "grad_norm": 0.4493553262474411, + "learning_rate": 7.238086050313563e-06, + "loss": 2.2745, + "step": 939 + }, + { + "epoch": 0.36258437801350046, + "grad_norm": 0.42785233193380856, + "learning_rate": 7.232580258020952e-06, + "loss": 2.277, + "step": 940 + }, + { + "epoch": 0.36297010607521696, + "grad_norm": 0.43432059382700255, + "learning_rate": 7.227071082389998e-06, + "loss": 2.3192, + "step": 941 + }, + { + "epoch": 0.36335583413693345, + "grad_norm": 0.4330307978504909, + "learning_rate": 7.221558531769519e-06, + "loss": 2.2972, + "step": 942 + }, + { + "epoch": 0.36374156219864995, + "grad_norm": 0.41452292523779316, + "learning_rate": 7.216042614513446e-06, + "loss": 2.302, + "step": 943 + }, + { + "epoch": 0.36412729026036644, + "grad_norm": 0.4538549860302695, + "learning_rate": 7.210523338980814e-06, + "loss": 2.3121, + "step": 944 + }, + { + "epoch": 0.36451301832208294, + "grad_norm": 0.4138770978200875, + "learning_rate": 7.205000713535748e-06, + "loss": 2.3459, + "step": 945 + }, + { + "epoch": 0.36489874638379943, + "grad_norm": 0.4120360593592124, + "learning_rate": 7.199474746547445e-06, + "loss": 2.2938, + "step": 946 + }, + { + "epoch": 0.3652844744455159, + "grad_norm": 0.4247085637801656, + "learning_rate": 7.193945446390169e-06, + "loss": 2.3401, + "step": 947 + }, + { + "epoch": 0.3656702025072324, + 
"grad_norm": 0.41971540037244914, + "learning_rate": 7.1884128214432366e-06, + "loss": 2.3172, + "step": 948 + }, + { + "epoch": 0.3660559305689489, + "grad_norm": 0.47931566387475316, + "learning_rate": 7.182876880091001e-06, + "loss": 2.3714, + "step": 949 + }, + { + "epoch": 0.36644165863066536, + "grad_norm": 0.4609317408192823, + "learning_rate": 7.17733763072284e-06, + "loss": 2.3339, + "step": 950 + }, + { + "epoch": 0.36682738669238185, + "grad_norm": 0.43813545970177914, + "learning_rate": 7.171795081733149e-06, + "loss": 2.3266, + "step": 951 + }, + { + "epoch": 0.36721311475409835, + "grad_norm": 0.4354158690725303, + "learning_rate": 7.1662492415213194e-06, + "loss": 2.3486, + "step": 952 + }, + { + "epoch": 0.36759884281581484, + "grad_norm": 0.4553878583964571, + "learning_rate": 7.160700118491729e-06, + "loss": 2.3382, + "step": 953 + }, + { + "epoch": 0.36798457087753134, + "grad_norm": 0.3973964395326932, + "learning_rate": 7.155147721053736e-06, + "loss": 2.2795, + "step": 954 + }, + { + "epoch": 0.36837029893924783, + "grad_norm": 0.43994093319418587, + "learning_rate": 7.149592057621657e-06, + "loss": 2.3168, + "step": 955 + }, + { + "epoch": 0.3687560270009643, + "grad_norm": 0.4318978015975414, + "learning_rate": 7.14403313661476e-06, + "loss": 2.3598, + "step": 956 + }, + { + "epoch": 0.3691417550626808, + "grad_norm": 0.44106364168307033, + "learning_rate": 7.138470966457247e-06, + "loss": 2.3031, + "step": 957 + }, + { + "epoch": 0.3695274831243973, + "grad_norm": 0.5289909050360543, + "learning_rate": 7.1329055555782455e-06, + "loss": 2.3867, + "step": 958 + }, + { + "epoch": 0.3699132111861138, + "grad_norm": 0.44511966109041345, + "learning_rate": 7.127336912411796e-06, + "loss": 2.3054, + "step": 959 + }, + { + "epoch": 0.3702989392478303, + "grad_norm": 0.4484647822924042, + "learning_rate": 7.1217650453968335e-06, + "loss": 2.327, + "step": 960 + }, + { + "epoch": 0.37068466730954674, + "grad_norm": 0.43351059716778734, + "learning_rate": 7.116189962977182e-06, + "loss": 2.3268, + "step": 961 + }, + { + "epoch": 0.37107039537126324, + "grad_norm": 0.4477952358364692, + "learning_rate": 7.110611673601534e-06, + "loss": 2.2614, + "step": 962 + }, + { + "epoch": 0.37145612343297973, + "grad_norm": 0.41218857307769957, + "learning_rate": 7.105030185723447e-06, + "loss": 2.2849, + "step": 963 + }, + { + "epoch": 0.37184185149469623, + "grad_norm": 0.45707003023428944, + "learning_rate": 7.099445507801324e-06, + "loss": 2.4215, + "step": 964 + }, + { + "epoch": 0.3722275795564127, + "grad_norm": 0.4653349176520788, + "learning_rate": 7.093857648298399e-06, + "loss": 2.3859, + "step": 965 + }, + { + "epoch": 0.3726133076181292, + "grad_norm": 0.49872774030691847, + "learning_rate": 7.0882666156827315e-06, + "loss": 2.3693, + "step": 966 + }, + { + "epoch": 0.3729990356798457, + "grad_norm": 0.424303126549995, + "learning_rate": 7.082672418427189e-06, + "loss": 2.224, + "step": 967 + }, + { + "epoch": 0.3733847637415622, + "grad_norm": 0.45802316947265115, + "learning_rate": 7.0770750650094335e-06, + "loss": 2.3169, + "step": 968 + }, + { + "epoch": 0.3737704918032787, + "grad_norm": 0.45558049307985826, + "learning_rate": 7.07147456391191e-06, + "loss": 2.3528, + "step": 969 + }, + { + "epoch": 0.3741562198649952, + "grad_norm": 0.41730404131935284, + "learning_rate": 7.065870923621832e-06, + "loss": 2.3625, + "step": 970 + }, + { + "epoch": 0.3745419479267117, + "grad_norm": 0.44765780350037143, + "learning_rate": 7.060264152631178e-06, + "loss": 2.3049, + "step": 
971 + }, + { + "epoch": 0.37492767598842813, + "grad_norm": 0.427541289342369, + "learning_rate": 7.0546542594366605e-06, + "loss": 2.3661, + "step": 972 + }, + { + "epoch": 0.37531340405014463, + "grad_norm": 0.457471287514202, + "learning_rate": 7.04904125253973e-06, + "loss": 2.3248, + "step": 973 + }, + { + "epoch": 0.3756991321118611, + "grad_norm": 0.42521854187642844, + "learning_rate": 7.0434251404465536e-06, + "loss": 2.3568, + "step": 974 + }, + { + "epoch": 0.3760848601735776, + "grad_norm": 0.46678355766392204, + "learning_rate": 7.037805931668006e-06, + "loss": 2.2357, + "step": 975 + }, + { + "epoch": 0.3764705882352941, + "grad_norm": 0.4351826329091472, + "learning_rate": 7.03218363471965e-06, + "loss": 2.2827, + "step": 976 + }, + { + "epoch": 0.3768563162970106, + "grad_norm": 0.4214350713148253, + "learning_rate": 7.026558258121734e-06, + "loss": 2.3179, + "step": 977 + }, + { + "epoch": 0.3772420443587271, + "grad_norm": 0.4403774879476315, + "learning_rate": 7.0209298103991705e-06, + "loss": 2.3061, + "step": 978 + }, + { + "epoch": 0.3776277724204436, + "grad_norm": 0.45153347431122515, + "learning_rate": 7.015298300081527e-06, + "loss": 2.3657, + "step": 979 + }, + { + "epoch": 0.3780135004821601, + "grad_norm": 0.4554861232665788, + "learning_rate": 7.0096637357030105e-06, + "loss": 2.3103, + "step": 980 + }, + { + "epoch": 0.3783992285438766, + "grad_norm": 0.433555203257244, + "learning_rate": 7.004026125802458e-06, + "loss": 2.3119, + "step": 981 + }, + { + "epoch": 0.3787849566055931, + "grad_norm": 0.4376671806296453, + "learning_rate": 6.998385478923322e-06, + "loss": 2.3454, + "step": 982 + }, + { + "epoch": 0.3791706846673095, + "grad_norm": 0.46881557305357713, + "learning_rate": 6.992741803613654e-06, + "loss": 2.2832, + "step": 983 + }, + { + "epoch": 0.379556412729026, + "grad_norm": 0.399577766685155, + "learning_rate": 6.987095108426102e-06, + "loss": 2.3893, + "step": 984 + }, + { + "epoch": 0.3799421407907425, + "grad_norm": 0.42203561978643744, + "learning_rate": 6.981445401917883e-06, + "loss": 2.3288, + "step": 985 + }, + { + "epoch": 0.380327868852459, + "grad_norm": 0.4515287192066342, + "learning_rate": 6.975792692650778e-06, + "loss": 2.3583, + "step": 986 + }, + { + "epoch": 0.3807135969141755, + "grad_norm": 0.444049448296168, + "learning_rate": 6.970136989191125e-06, + "loss": 2.3105, + "step": 987 + }, + { + "epoch": 0.381099324975892, + "grad_norm": 0.46122786470791066, + "learning_rate": 6.964478300109796e-06, + "loss": 2.3619, + "step": 988 + }, + { + "epoch": 0.3814850530376085, + "grad_norm": 0.4158567496051892, + "learning_rate": 6.958816633982183e-06, + "loss": 2.3948, + "step": 989 + }, + { + "epoch": 0.381870781099325, + "grad_norm": 0.4177324575146256, + "learning_rate": 6.953151999388196e-06, + "loss": 2.4152, + "step": 990 + }, + { + "epoch": 0.3822565091610415, + "grad_norm": 0.41288361870688683, + "learning_rate": 6.9474844049122415e-06, + "loss": 2.3977, + "step": 991 + }, + { + "epoch": 0.382642237222758, + "grad_norm": 0.4491521001025632, + "learning_rate": 6.94181385914321e-06, + "loss": 2.2437, + "step": 992 + }, + { + "epoch": 0.38302796528447447, + "grad_norm": 0.4394758332260508, + "learning_rate": 6.936140370674465e-06, + "loss": 2.411, + "step": 993 + }, + { + "epoch": 0.3834136933461909, + "grad_norm": 0.4387644067430558, + "learning_rate": 6.930463948103833e-06, + "loss": 2.2535, + "step": 994 + }, + { + "epoch": 0.3837994214079074, + "grad_norm": 0.45901455967029586, + "learning_rate": 6.924784600033579e-06, + 
"loss": 2.2672, + "step": 995 + }, + { + "epoch": 0.3841851494696239, + "grad_norm": 0.4231729373706068, + "learning_rate": 6.91910233507041e-06, + "loss": 2.3228, + "step": 996 + }, + { + "epoch": 0.3845708775313404, + "grad_norm": 0.43524885209076375, + "learning_rate": 6.913417161825449e-06, + "loss": 2.2992, + "step": 997 + }, + { + "epoch": 0.3849566055930569, + "grad_norm": 0.4405235231336523, + "learning_rate": 6.907729088914228e-06, + "loss": 2.2838, + "step": 998 + }, + { + "epoch": 0.3853423336547734, + "grad_norm": 0.44791515209255905, + "learning_rate": 6.90203812495667e-06, + "loss": 2.3547, + "step": 999 + }, + { + "epoch": 0.3857280617164899, + "grad_norm": 0.4365039922356492, + "learning_rate": 6.896344278577083e-06, + "loss": 2.3111, + "step": 1000 + }, + { + "epoch": 0.3861137897782064, + "grad_norm": 0.43431408111788405, + "learning_rate": 6.890647558404144e-06, + "loss": 2.3078, + "step": 1001 + }, + { + "epoch": 0.38649951783992287, + "grad_norm": 0.4569136157900356, + "learning_rate": 6.8849479730708765e-06, + "loss": 2.3569, + "step": 1002 + }, + { + "epoch": 0.38688524590163936, + "grad_norm": 0.39957289809295965, + "learning_rate": 6.87924553121466e-06, + "loss": 2.3248, + "step": 1003 + }, + { + "epoch": 0.38727097396335586, + "grad_norm": 0.4372886094285002, + "learning_rate": 6.873540241477189e-06, + "loss": 2.323, + "step": 1004 + }, + { + "epoch": 0.3876567020250723, + "grad_norm": 0.43480340806009493, + "learning_rate": 6.867832112504482e-06, + "loss": 2.3657, + "step": 1005 + }, + { + "epoch": 0.3880424300867888, + "grad_norm": 0.4658185278735477, + "learning_rate": 6.862121152946858e-06, + "loss": 2.3572, + "step": 1006 + }, + { + "epoch": 0.3884281581485053, + "grad_norm": 0.4359512135855857, + "learning_rate": 6.856407371458927e-06, + "loss": 2.2816, + "step": 1007 + }, + { + "epoch": 0.3888138862102218, + "grad_norm": 0.4386396770138102, + "learning_rate": 6.850690776699574e-06, + "loss": 2.307, + "step": 1008 + }, + { + "epoch": 0.3891996142719383, + "grad_norm": 0.4324053729484432, + "learning_rate": 6.844971377331942e-06, + "loss": 2.3195, + "step": 1009 + }, + { + "epoch": 0.38958534233365477, + "grad_norm": 0.4274709230781966, + "learning_rate": 6.839249182023439e-06, + "loss": 2.2916, + "step": 1010 + }, + { + "epoch": 0.38997107039537127, + "grad_norm": 0.49161756609994206, + "learning_rate": 6.833524199445694e-06, + "loss": 2.33, + "step": 1011 + }, + { + "epoch": 0.39035679845708776, + "grad_norm": 0.4280522034863406, + "learning_rate": 6.8277964382745675e-06, + "loss": 2.286, + "step": 1012 + }, + { + "epoch": 0.39074252651880426, + "grad_norm": 0.4415005657982364, + "learning_rate": 6.822065907190133e-06, + "loss": 2.3205, + "step": 1013 + }, + { + "epoch": 0.39112825458052075, + "grad_norm": 0.4381340413937925, + "learning_rate": 6.816332614876655e-06, + "loss": 2.3975, + "step": 1014 + }, + { + "epoch": 0.39151398264223725, + "grad_norm": 0.42718477861729054, + "learning_rate": 6.810596570022589e-06, + "loss": 2.3751, + "step": 1015 + }, + { + "epoch": 0.3918997107039537, + "grad_norm": 0.41276047458090415, + "learning_rate": 6.804857781320558e-06, + "loss": 2.2694, + "step": 1016 + }, + { + "epoch": 0.3922854387656702, + "grad_norm": 0.4520249379157355, + "learning_rate": 6.799116257467342e-06, + "loss": 2.3873, + "step": 1017 + }, + { + "epoch": 0.3926711668273867, + "grad_norm": 0.5172398184768827, + "learning_rate": 6.79337200716387e-06, + "loss": 2.2735, + "step": 1018 + }, + { + "epoch": 0.39305689488910317, + "grad_norm": 
0.46114562885130483, + "learning_rate": 6.7876250391152e-06, + "loss": 2.391, + "step": 1019 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 0.42465065549191094, + "learning_rate": 6.781875362030512e-06, + "loss": 2.3423, + "step": 1020 + }, + { + "epoch": 0.39382835101253616, + "grad_norm": 0.4704726760079544, + "learning_rate": 6.776122984623086e-06, + "loss": 2.3638, + "step": 1021 + }, + { + "epoch": 0.39421407907425265, + "grad_norm": 0.4369827657610331, + "learning_rate": 6.770367915610295e-06, + "loss": 2.2782, + "step": 1022 + }, + { + "epoch": 0.39459980713596915, + "grad_norm": 0.47649969787580804, + "learning_rate": 6.764610163713597e-06, + "loss": 2.3465, + "step": 1023 + }, + { + "epoch": 0.39498553519768564, + "grad_norm": 0.5111935072885732, + "learning_rate": 6.758849737658508e-06, + "loss": 2.3016, + "step": 1024 + }, + { + "epoch": 0.39537126325940214, + "grad_norm": 0.4909865341420742, + "learning_rate": 6.753086646174602e-06, + "loss": 2.2941, + "step": 1025 + }, + { + "epoch": 0.39575699132111863, + "grad_norm": 0.4606179194384059, + "learning_rate": 6.747320897995493e-06, + "loss": 2.3839, + "step": 1026 + }, + { + "epoch": 0.39614271938283513, + "grad_norm": 0.44722228600393227, + "learning_rate": 6.741552501858814e-06, + "loss": 2.3077, + "step": 1027 + }, + { + "epoch": 0.39652844744455157, + "grad_norm": 0.4364105456771137, + "learning_rate": 6.735781466506216e-06, + "loss": 2.3102, + "step": 1028 + }, + { + "epoch": 0.39691417550626806, + "grad_norm": 0.4790670655719006, + "learning_rate": 6.73000780068335e-06, + "loss": 2.4034, + "step": 1029 + }, + { + "epoch": 0.39729990356798456, + "grad_norm": 0.45100353119796904, + "learning_rate": 6.724231513139853e-06, + "loss": 2.321, + "step": 1030 + }, + { + "epoch": 0.39768563162970105, + "grad_norm": 0.44416370251594317, + "learning_rate": 6.718452612629333e-06, + "loss": 2.2442, + "step": 1031 + }, + { + "epoch": 0.39807135969141755, + "grad_norm": 0.44156998375007717, + "learning_rate": 6.712671107909359e-06, + "loss": 2.3522, + "step": 1032 + }, + { + "epoch": 0.39845708775313404, + "grad_norm": 0.4444707116798702, + "learning_rate": 6.706887007741445e-06, + "loss": 2.3001, + "step": 1033 + }, + { + "epoch": 0.39884281581485054, + "grad_norm": 0.46496092995888816, + "learning_rate": 6.701100320891044e-06, + "loss": 2.3374, + "step": 1034 + }, + { + "epoch": 0.39922854387656703, + "grad_norm": 0.4268218167377186, + "learning_rate": 6.69531105612752e-06, + "loss": 2.2871, + "step": 1035 + }, + { + "epoch": 0.3996142719382835, + "grad_norm": 0.44627263198522205, + "learning_rate": 6.6895192222241534e-06, + "loss": 2.3167, + "step": 1036 + }, + { + "epoch": 0.4, + "grad_norm": 0.4394902467570602, + "learning_rate": 6.683724827958108e-06, + "loss": 2.3015, + "step": 1037 + }, + { + "epoch": 0.4003857280617165, + "grad_norm": 0.44809256036451395, + "learning_rate": 6.677927882110435e-06, + "loss": 2.2743, + "step": 1038 + }, + { + "epoch": 0.40077145612343296, + "grad_norm": 0.4480462093293534, + "learning_rate": 6.672128393466051e-06, + "loss": 2.3038, + "step": 1039 + }, + { + "epoch": 0.40115718418514945, + "grad_norm": 0.40716720447609844, + "learning_rate": 6.666326370813722e-06, + "loss": 2.3595, + "step": 1040 + }, + { + "epoch": 0.40154291224686595, + "grad_norm": 0.3871021637643824, + "learning_rate": 6.66052182294606e-06, + "loss": 2.3098, + "step": 1041 + }, + { + "epoch": 0.40192864030858244, + "grad_norm": 0.4255994407428415, + "learning_rate": 6.654714758659499e-06, + "loss": 2.3425, + "step": 1042 
+ }, + { + "epoch": 0.40231436837029894, + "grad_norm": 0.41962801248829623, + "learning_rate": 6.648905186754292e-06, + "loss": 2.3371, + "step": 1043 + }, + { + "epoch": 0.40270009643201543, + "grad_norm": 0.43733012177467506, + "learning_rate": 6.643093116034486e-06, + "loss": 2.3944, + "step": 1044 + }, + { + "epoch": 0.4030858244937319, + "grad_norm": 0.4558494652602206, + "learning_rate": 6.637278555307915e-06, + "loss": 2.477, + "step": 1045 + }, + { + "epoch": 0.4034715525554484, + "grad_norm": 0.48233116424295136, + "learning_rate": 6.631461513386195e-06, + "loss": 2.337, + "step": 1046 + }, + { + "epoch": 0.4038572806171649, + "grad_norm": 0.451424760143431, + "learning_rate": 6.625641999084689e-06, + "loss": 2.3773, + "step": 1047 + }, + { + "epoch": 0.4042430086788814, + "grad_norm": 0.4144779902411331, + "learning_rate": 6.619820021222518e-06, + "loss": 2.3218, + "step": 1048 + }, + { + "epoch": 0.4046287367405979, + "grad_norm": 0.4327313055254783, + "learning_rate": 6.613995588622533e-06, + "loss": 2.3059, + "step": 1049 + }, + { + "epoch": 0.40501446480231434, + "grad_norm": 0.44106603363727004, + "learning_rate": 6.608168710111301e-06, + "loss": 2.2968, + "step": 1050 + }, + { + "epoch": 0.40540019286403084, + "grad_norm": 0.4297979184946325, + "learning_rate": 6.602339394519101e-06, + "loss": 2.3781, + "step": 1051 + }, + { + "epoch": 0.40578592092574733, + "grad_norm": 0.47658296834396163, + "learning_rate": 6.5965076506799e-06, + "loss": 2.2953, + "step": 1052 + }, + { + "epoch": 0.40617164898746383, + "grad_norm": 0.41497031845158716, + "learning_rate": 6.590673487431352e-06, + "loss": 2.3416, + "step": 1053 + }, + { + "epoch": 0.4065573770491803, + "grad_norm": 0.4179010618522049, + "learning_rate": 6.584836913614769e-06, + "loss": 2.3061, + "step": 1054 + }, + { + "epoch": 0.4069431051108968, + "grad_norm": 0.4500351290200825, + "learning_rate": 6.578997938075126e-06, + "loss": 2.2873, + "step": 1055 + }, + { + "epoch": 0.4073288331726133, + "grad_norm": 0.4432168262062113, + "learning_rate": 6.573156569661026e-06, + "loss": 2.3566, + "step": 1056 + }, + { + "epoch": 0.4077145612343298, + "grad_norm": 0.4326865514231597, + "learning_rate": 6.567312817224707e-06, + "loss": 2.2927, + "step": 1057 + }, + { + "epoch": 0.4081002892960463, + "grad_norm": 0.439714361044198, + "learning_rate": 6.561466689622018e-06, + "loss": 2.3334, + "step": 1058 + }, + { + "epoch": 0.4084860173577628, + "grad_norm": 0.4333011032218123, + "learning_rate": 6.555618195712405e-06, + "loss": 2.2632, + "step": 1059 + }, + { + "epoch": 0.4088717454194793, + "grad_norm": 0.45387751813644217, + "learning_rate": 6.549767344358903e-06, + "loss": 2.3249, + "step": 1060 + }, + { + "epoch": 0.40925747348119573, + "grad_norm": 0.4434742842228654, + "learning_rate": 6.543914144428114e-06, + "loss": 2.337, + "step": 1061 + }, + { + "epoch": 0.4096432015429122, + "grad_norm": 0.447638566703459, + "learning_rate": 6.538058604790209e-06, + "loss": 2.3462, + "step": 1062 + }, + { + "epoch": 0.4100289296046287, + "grad_norm": 0.4790848677092544, + "learning_rate": 6.532200734318896e-06, + "loss": 2.3278, + "step": 1063 + }, + { + "epoch": 0.4104146576663452, + "grad_norm": 0.4827492802460279, + "learning_rate": 6.526340541891418e-06, + "loss": 2.2715, + "step": 1064 + }, + { + "epoch": 0.4108003857280617, + "grad_norm": 0.4257176094164772, + "learning_rate": 6.5204780363885374e-06, + "loss": 2.3327, + "step": 1065 + }, + { + "epoch": 0.4111861137897782, + "grad_norm": 0.43133813252324593, + "learning_rate": 
6.514613226694522e-06, + "loss": 2.4307, + "step": 1066 + }, + { + "epoch": 0.4115718418514947, + "grad_norm": 0.48643274699614625, + "learning_rate": 6.508746121697129e-06, + "loss": 2.3788, + "step": 1067 + }, + { + "epoch": 0.4119575699132112, + "grad_norm": 0.4402660949629646, + "learning_rate": 6.5028767302875974e-06, + "loss": 2.3179, + "step": 1068 + }, + { + "epoch": 0.4123432979749277, + "grad_norm": 0.44133891806353254, + "learning_rate": 6.4970050613606305e-06, + "loss": 2.2797, + "step": 1069 + }, + { + "epoch": 0.4127290260366442, + "grad_norm": 0.4394619756368342, + "learning_rate": 6.491131123814379e-06, + "loss": 2.3056, + "step": 1070 + }, + { + "epoch": 0.4131147540983607, + "grad_norm": 0.4615846772059102, + "learning_rate": 6.485254926550438e-06, + "loss": 2.3564, + "step": 1071 + }, + { + "epoch": 0.4135004821600771, + "grad_norm": 0.4450207637638467, + "learning_rate": 6.479376478473822e-06, + "loss": 2.3143, + "step": 1072 + }, + { + "epoch": 0.4138862102217936, + "grad_norm": 0.4340102860306911, + "learning_rate": 6.473495788492961e-06, + "loss": 2.3714, + "step": 1073 + }, + { + "epoch": 0.4142719382835101, + "grad_norm": 0.443586757260675, + "learning_rate": 6.467612865519674e-06, + "loss": 2.3184, + "step": 1074 + }, + { + "epoch": 0.4146576663452266, + "grad_norm": 0.42694356262634586, + "learning_rate": 6.461727718469175e-06, + "loss": 2.2255, + "step": 1075 + }, + { + "epoch": 0.4150433944069431, + "grad_norm": 0.4516396805934383, + "learning_rate": 6.455840356260041e-06, + "loss": 2.315, + "step": 1076 + }, + { + "epoch": 0.4154291224686596, + "grad_norm": 0.43394672178411253, + "learning_rate": 6.449950787814207e-06, + "loss": 2.3639, + "step": 1077 + }, + { + "epoch": 0.4158148505303761, + "grad_norm": 0.4443342157979516, + "learning_rate": 6.444059022056957e-06, + "loss": 2.3522, + "step": 1078 + }, + { + "epoch": 0.4162005785920926, + "grad_norm": 0.4204045067858496, + "learning_rate": 6.438165067916895e-06, + "loss": 2.3528, + "step": 1079 + }, + { + "epoch": 0.4165863066538091, + "grad_norm": 0.43346000237735166, + "learning_rate": 6.432268934325947e-06, + "loss": 2.3276, + "step": 1080 + }, + { + "epoch": 0.4169720347155256, + "grad_norm": 0.4203595622350257, + "learning_rate": 6.4263706302193455e-06, + "loss": 2.3096, + "step": 1081 + }, + { + "epoch": 0.41735776277724207, + "grad_norm": 0.442932032351111, + "learning_rate": 6.420470164535606e-06, + "loss": 2.3486, + "step": 1082 + }, + { + "epoch": 0.4177434908389585, + "grad_norm": 0.43755358770487474, + "learning_rate": 6.414567546216522e-06, + "loss": 2.3114, + "step": 1083 + }, + { + "epoch": 0.418129218900675, + "grad_norm": 0.45867286880559605, + "learning_rate": 6.408662784207149e-06, + "loss": 2.3757, + "step": 1084 + }, + { + "epoch": 0.4185149469623915, + "grad_norm": 0.4549111483968304, + "learning_rate": 6.402755887455792e-06, + "loss": 2.3097, + "step": 1085 + }, + { + "epoch": 0.418900675024108, + "grad_norm": 0.4190428005930495, + "learning_rate": 6.396846864913992e-06, + "loss": 2.374, + "step": 1086 + }, + { + "epoch": 0.4192864030858245, + "grad_norm": 0.4388628337078911, + "learning_rate": 6.390935725536506e-06, + "loss": 2.2961, + "step": 1087 + }, + { + "epoch": 0.419672131147541, + "grad_norm": 0.44117997063904274, + "learning_rate": 6.385022478281307e-06, + "loss": 2.3508, + "step": 1088 + }, + { + "epoch": 0.4200578592092575, + "grad_norm": 0.42126577551344346, + "learning_rate": 6.379107132109556e-06, + "loss": 2.3741, + "step": 1089 + }, + { + "epoch": 0.420443587270974, + 
"grad_norm": 0.43608915227968786, + "learning_rate": 6.3731896959855955e-06, + "loss": 2.3676, + "step": 1090 + }, + { + "epoch": 0.42082931533269047, + "grad_norm": 0.4468702780714682, + "learning_rate": 6.367270178876941e-06, + "loss": 2.3011, + "step": 1091 + }, + { + "epoch": 0.42121504339440696, + "grad_norm": 0.4604216720014819, + "learning_rate": 6.361348589754255e-06, + "loss": 2.2769, + "step": 1092 + }, + { + "epoch": 0.42160077145612346, + "grad_norm": 0.4332754651757432, + "learning_rate": 6.355424937591341e-06, + "loss": 2.3619, + "step": 1093 + }, + { + "epoch": 0.4219864995178399, + "grad_norm": 0.4590925298486195, + "learning_rate": 6.349499231365132e-06, + "loss": 2.2536, + "step": 1094 + }, + { + "epoch": 0.4223722275795564, + "grad_norm": 0.4497533415984344, + "learning_rate": 6.3435714800556725e-06, + "loss": 2.2562, + "step": 1095 + }, + { + "epoch": 0.4227579556412729, + "grad_norm": 0.4656021835189469, + "learning_rate": 6.337641692646106e-06, + "loss": 2.2969, + "step": 1096 + }, + { + "epoch": 0.4231436837029894, + "grad_norm": 0.4642208333188446, + "learning_rate": 6.331709878122658e-06, + "loss": 2.3359, + "step": 1097 + }, + { + "epoch": 0.4235294117647059, + "grad_norm": 0.4324865356121325, + "learning_rate": 6.325776045474632e-06, + "loss": 2.309, + "step": 1098 + }, + { + "epoch": 0.42391513982642237, + "grad_norm": 0.4431490537166105, + "learning_rate": 6.319840203694388e-06, + "loss": 2.3479, + "step": 1099 + }, + { + "epoch": 0.42430086788813887, + "grad_norm": 0.44518768669815395, + "learning_rate": 6.313902361777327e-06, + "loss": 2.2904, + "step": 1100 + }, + { + "epoch": 0.42468659594985536, + "grad_norm": 0.49563019133481434, + "learning_rate": 6.307962528721887e-06, + "loss": 2.3937, + "step": 1101 + }, + { + "epoch": 0.42507232401157186, + "grad_norm": 0.47713133946510067, + "learning_rate": 6.3020207135295185e-06, + "loss": 2.3491, + "step": 1102 + }, + { + "epoch": 0.42545805207328835, + "grad_norm": 0.4011094627347639, + "learning_rate": 6.296076925204677e-06, + "loss": 2.3313, + "step": 1103 + }, + { + "epoch": 0.42584378013500485, + "grad_norm": 0.4040592127888121, + "learning_rate": 6.290131172754811e-06, + "loss": 2.3553, + "step": 1104 + }, + { + "epoch": 0.4262295081967213, + "grad_norm": 0.4473439863544359, + "learning_rate": 6.284183465190343e-06, + "loss": 2.3219, + "step": 1105 + }, + { + "epoch": 0.4266152362584378, + "grad_norm": 0.46805268291151503, + "learning_rate": 6.278233811524657e-06, + "loss": 2.3188, + "step": 1106 + }, + { + "epoch": 0.4270009643201543, + "grad_norm": 0.4597217826013756, + "learning_rate": 6.272282220774091e-06, + "loss": 2.3134, + "step": 1107 + }, + { + "epoch": 0.42738669238187077, + "grad_norm": 0.42617259659875806, + "learning_rate": 6.266328701957911e-06, + "loss": 2.3124, + "step": 1108 + }, + { + "epoch": 0.42777242044358726, + "grad_norm": 0.44021464574232927, + "learning_rate": 6.260373264098314e-06, + "loss": 2.2858, + "step": 1109 + }, + { + "epoch": 0.42815814850530376, + "grad_norm": 0.47478062965458695, + "learning_rate": 6.2544159162203975e-06, + "loss": 2.3755, + "step": 1110 + }, + { + "epoch": 0.42854387656702025, + "grad_norm": 0.4740157113934357, + "learning_rate": 6.248456667352158e-06, + "loss": 2.2967, + "step": 1111 + }, + { + "epoch": 0.42892960462873675, + "grad_norm": 0.47571505212355597, + "learning_rate": 6.24249552652447e-06, + "loss": 2.4558, + "step": 1112 + }, + { + "epoch": 0.42931533269045324, + "grad_norm": 0.5414223860738109, + "learning_rate": 6.236532502771078e-06, + 
"loss": 2.3475, + "step": 1113 + }, + { + "epoch": 0.42970106075216974, + "grad_norm": 0.45421679017277083, + "learning_rate": 6.230567605128578e-06, + "loss": 2.3586, + "step": 1114 + }, + { + "epoch": 0.43008678881388623, + "grad_norm": 0.4554354798531583, + "learning_rate": 6.2246008426364055e-06, + "loss": 2.4012, + "step": 1115 + }, + { + "epoch": 0.4304725168756027, + "grad_norm": 0.47175937847091437, + "learning_rate": 6.2186322243368236e-06, + "loss": 2.3741, + "step": 1116 + }, + { + "epoch": 0.43085824493731917, + "grad_norm": 0.43380626429841324, + "learning_rate": 6.212661759274908e-06, + "loss": 2.3263, + "step": 1117 + }, + { + "epoch": 0.43124397299903566, + "grad_norm": 0.4472251530815354, + "learning_rate": 6.206689456498529e-06, + "loss": 2.3582, + "step": 1118 + }, + { + "epoch": 0.43162970106075216, + "grad_norm": 0.4768444445219848, + "learning_rate": 6.200715325058349e-06, + "loss": 2.3286, + "step": 1119 + }, + { + "epoch": 0.43201542912246865, + "grad_norm": 0.4296130917301774, + "learning_rate": 6.194739374007792e-06, + "loss": 2.2701, + "step": 1120 + }, + { + "epoch": 0.43240115718418515, + "grad_norm": 0.43050510001729003, + "learning_rate": 6.1887616124030505e-06, + "loss": 2.3901, + "step": 1121 + }, + { + "epoch": 0.43278688524590164, + "grad_norm": 0.42942575073230405, + "learning_rate": 6.182782049303051e-06, + "loss": 2.3512, + "step": 1122 + }, + { + "epoch": 0.43317261330761814, + "grad_norm": 0.47879872044883814, + "learning_rate": 6.176800693769457e-06, + "loss": 2.2915, + "step": 1123 + }, + { + "epoch": 0.43355834136933463, + "grad_norm": 0.448815528010462, + "learning_rate": 6.170817554866646e-06, + "loss": 2.2555, + "step": 1124 + }, + { + "epoch": 0.4339440694310511, + "grad_norm": 0.4227739666249548, + "learning_rate": 6.164832641661698e-06, + "loss": 2.3306, + "step": 1125 + }, + { + "epoch": 0.4343297974927676, + "grad_norm": 0.4485330017422935, + "learning_rate": 6.158845963224377e-06, + "loss": 2.3105, + "step": 1126 + }, + { + "epoch": 0.43471552555448406, + "grad_norm": 0.4584878233301734, + "learning_rate": 6.1528575286271306e-06, + "loss": 2.2823, + "step": 1127 + }, + { + "epoch": 0.43510125361620056, + "grad_norm": 0.4488282278195123, + "learning_rate": 6.1468673469450655e-06, + "loss": 2.4241, + "step": 1128 + }, + { + "epoch": 0.43548698167791705, + "grad_norm": 0.4466795628436329, + "learning_rate": 6.14087542725593e-06, + "loss": 2.3984, + "step": 1129 + }, + { + "epoch": 0.43587270973963355, + "grad_norm": 0.46895294939856697, + "learning_rate": 6.134881778640115e-06, + "loss": 2.4015, + "step": 1130 + }, + { + "epoch": 0.43625843780135004, + "grad_norm": 0.4389431896841497, + "learning_rate": 6.1288864101806225e-06, + "loss": 2.3532, + "step": 1131 + }, + { + "epoch": 0.43664416586306654, + "grad_norm": 0.4506338770546297, + "learning_rate": 6.122889330963069e-06, + "loss": 2.312, + "step": 1132 + }, + { + "epoch": 0.43702989392478303, + "grad_norm": 0.43919560541539265, + "learning_rate": 6.116890550075658e-06, + "loss": 2.3842, + "step": 1133 + }, + { + "epoch": 0.4374156219864995, + "grad_norm": 0.4520050253783449, + "learning_rate": 6.110890076609175e-06, + "loss": 2.3763, + "step": 1134 + }, + { + "epoch": 0.437801350048216, + "grad_norm": 0.44824681576411424, + "learning_rate": 6.10488791965697e-06, + "loss": 2.3638, + "step": 1135 + }, + { + "epoch": 0.4381870781099325, + "grad_norm": 0.4277658180056835, + "learning_rate": 6.098884088314938e-06, + "loss": 2.339, + "step": 1136 + }, + { + "epoch": 0.438572806171649, + 
"grad_norm": 0.4594464904112313, + "learning_rate": 6.092878591681525e-06, + "loss": 2.3469, + "step": 1137 + }, + { + "epoch": 0.4389585342333655, + "grad_norm": 0.4274399567143741, + "learning_rate": 6.086871438857687e-06, + "loss": 2.3536, + "step": 1138 + }, + { + "epoch": 0.43934426229508194, + "grad_norm": 0.4737894356640477, + "learning_rate": 6.080862638946896e-06, + "loss": 2.2823, + "step": 1139 + }, + { + "epoch": 0.43972999035679844, + "grad_norm": 0.43143990698958135, + "learning_rate": 6.074852201055121e-06, + "loss": 2.371, + "step": 1140 + }, + { + "epoch": 0.44011571841851493, + "grad_norm": 0.49600949126416094, + "learning_rate": 6.068840134290811e-06, + "loss": 2.273, + "step": 1141 + }, + { + "epoch": 0.44050144648023143, + "grad_norm": 0.422300559886733, + "learning_rate": 6.062826447764883e-06, + "loss": 2.2668, + "step": 1142 + }, + { + "epoch": 0.4408871745419479, + "grad_norm": 0.4844578511809956, + "learning_rate": 6.056811150590713e-06, + "loss": 2.3362, + "step": 1143 + }, + { + "epoch": 0.4412729026036644, + "grad_norm": 0.434657304710951, + "learning_rate": 6.050794251884112e-06, + "loss": 2.2864, + "step": 1144 + }, + { + "epoch": 0.4416586306653809, + "grad_norm": 0.44665209883883367, + "learning_rate": 6.044775760763321e-06, + "loss": 2.2931, + "step": 1145 + }, + { + "epoch": 0.4420443587270974, + "grad_norm": 0.4371263489152066, + "learning_rate": 6.038755686348993e-06, + "loss": 2.3158, + "step": 1146 + }, + { + "epoch": 0.4424300867888139, + "grad_norm": 0.4391629731269026, + "learning_rate": 6.032734037764184e-06, + "loss": 2.3269, + "step": 1147 + }, + { + "epoch": 0.4428158148505304, + "grad_norm": 0.4942247415917742, + "learning_rate": 6.026710824134331e-06, + "loss": 2.3198, + "step": 1148 + }, + { + "epoch": 0.4432015429122469, + "grad_norm": 0.4050150430809796, + "learning_rate": 6.020686054587244e-06, + "loss": 2.2747, + "step": 1149 + }, + { + "epoch": 0.44358727097396333, + "grad_norm": 0.43967979539812374, + "learning_rate": 6.014659738253091e-06, + "loss": 2.3176, + "step": 1150 + }, + { + "epoch": 0.4439729990356798, + "grad_norm": 0.4412842004726475, + "learning_rate": 6.008631884264387e-06, + "loss": 2.2611, + "step": 1151 + }, + { + "epoch": 0.4443587270973963, + "grad_norm": 0.441535243961793, + "learning_rate": 6.002602501755974e-06, + "loss": 2.2964, + "step": 1152 + }, + { + "epoch": 0.4447444551591128, + "grad_norm": 0.43134500859104097, + "learning_rate": 5.996571599865011e-06, + "loss": 2.343, + "step": 1153 + }, + { + "epoch": 0.4451301832208293, + "grad_norm": 0.43813050565346195, + "learning_rate": 5.9905391877309585e-06, + "loss": 2.346, + "step": 1154 + }, + { + "epoch": 0.4455159112825458, + "grad_norm": 0.44953540142040926, + "learning_rate": 5.9845052744955654e-06, + "loss": 2.3625, + "step": 1155 + }, + { + "epoch": 0.4459016393442623, + "grad_norm": 0.45320640228396486, + "learning_rate": 5.978469869302861e-06, + "loss": 2.355, + "step": 1156 + }, + { + "epoch": 0.4462873674059788, + "grad_norm": 0.4289413336000678, + "learning_rate": 5.972432981299129e-06, + "loss": 2.2561, + "step": 1157 + }, + { + "epoch": 0.4466730954676953, + "grad_norm": 0.42557121234458123, + "learning_rate": 5.9663946196329016e-06, + "loss": 2.3525, + "step": 1158 + }, + { + "epoch": 0.4470588235294118, + "grad_norm": 0.45419794091593085, + "learning_rate": 5.960354793454948e-06, + "loss": 2.3306, + "step": 1159 + }, + { + "epoch": 0.4474445515911283, + "grad_norm": 0.4776081570201772, + "learning_rate": 5.954313511918252e-06, + "loss": 2.379, + 
"step": 1160 + }, + { + "epoch": 0.4478302796528447, + "grad_norm": 0.40265412301439, + "learning_rate": 5.948270784178007e-06, + "loss": 2.3548, + "step": 1161 + }, + { + "epoch": 0.4482160077145612, + "grad_norm": 0.43486616171542647, + "learning_rate": 5.942226619391592e-06, + "loss": 2.2836, + "step": 1162 + }, + { + "epoch": 0.4486017357762777, + "grad_norm": 0.4870799701993185, + "learning_rate": 5.936181026718572e-06, + "loss": 2.3652, + "step": 1163 + }, + { + "epoch": 0.4489874638379942, + "grad_norm": 0.4255862868250421, + "learning_rate": 5.9301340153206685e-06, + "loss": 2.3503, + "step": 1164 + }, + { + "epoch": 0.4493731918997107, + "grad_norm": 0.40623416645588406, + "learning_rate": 5.924085594361758e-06, + "loss": 2.377, + "step": 1165 + }, + { + "epoch": 0.4497589199614272, + "grad_norm": 0.40894215034833015, + "learning_rate": 5.918035773007852e-06, + "loss": 2.3663, + "step": 1166 + }, + { + "epoch": 0.4501446480231437, + "grad_norm": 0.4534195425524319, + "learning_rate": 5.911984560427082e-06, + "loss": 2.3498, + "step": 1167 + }, + { + "epoch": 0.4505303760848602, + "grad_norm": 0.4173435915739408, + "learning_rate": 5.905931965789688e-06, + "loss": 2.3478, + "step": 1168 + }, + { + "epoch": 0.4509161041465767, + "grad_norm": 0.4153750621998993, + "learning_rate": 5.899877998268006e-06, + "loss": 2.393, + "step": 1169 + }, + { + "epoch": 0.4513018322082932, + "grad_norm": 0.47144396929105753, + "learning_rate": 5.893822667036456e-06, + "loss": 2.3126, + "step": 1170 + }, + { + "epoch": 0.45168756027000967, + "grad_norm": 0.4697653920570496, + "learning_rate": 5.887765981271518e-06, + "loss": 2.3625, + "step": 1171 + }, + { + "epoch": 0.4520732883317261, + "grad_norm": 0.4243931648314425, + "learning_rate": 5.881707950151725e-06, + "loss": 2.3607, + "step": 1172 + }, + { + "epoch": 0.4524590163934426, + "grad_norm": 0.41514890362349877, + "learning_rate": 5.875648582857655e-06, + "loss": 2.3081, + "step": 1173 + }, + { + "epoch": 0.4528447444551591, + "grad_norm": 0.45353219387179644, + "learning_rate": 5.869587888571906e-06, + "loss": 2.4277, + "step": 1174 + }, + { + "epoch": 0.4532304725168756, + "grad_norm": 0.3951554178998431, + "learning_rate": 5.863525876479088e-06, + "loss": 2.346, + "step": 1175 + }, + { + "epoch": 0.4536162005785921, + "grad_norm": 0.4298758994384629, + "learning_rate": 5.857462555765809e-06, + "loss": 2.3002, + "step": 1176 + }, + { + "epoch": 0.4540019286403086, + "grad_norm": 0.4437585353827277, + "learning_rate": 5.851397935620659e-06, + "loss": 2.3046, + "step": 1177 + }, + { + "epoch": 0.4543876567020251, + "grad_norm": 0.44068266378563453, + "learning_rate": 5.845332025234195e-06, + "loss": 2.3419, + "step": 1178 + }, + { + "epoch": 0.4547733847637416, + "grad_norm": 0.45015328025310364, + "learning_rate": 5.839264833798937e-06, + "loss": 2.2871, + "step": 1179 + }, + { + "epoch": 0.45515911282545807, + "grad_norm": 0.41377865869977437, + "learning_rate": 5.8331963705093375e-06, + "loss": 2.2378, + "step": 1180 + }, + { + "epoch": 0.45554484088717456, + "grad_norm": 0.4224676221449054, + "learning_rate": 5.82712664456178e-06, + "loss": 2.3075, + "step": 1181 + }, + { + "epoch": 0.45593056894889106, + "grad_norm": 0.43244460378263555, + "learning_rate": 5.8210556651545645e-06, + "loss": 2.3632, + "step": 1182 + }, + { + "epoch": 0.4563162970106075, + "grad_norm": 0.42085739790821947, + "learning_rate": 5.814983441487885e-06, + "loss": 2.3397, + "step": 1183 + }, + { + "epoch": 0.456702025072324, + "grad_norm": 0.4439530858820655, + 
"learning_rate": 5.808909982763825e-06, + "loss": 2.1945, + "step": 1184 + }, + { + "epoch": 0.4570877531340405, + "grad_norm": 0.4462098611435156, + "learning_rate": 5.802835298186337e-06, + "loss": 2.4003, + "step": 1185 + }, + { + "epoch": 0.457473481195757, + "grad_norm": 0.4367403521099576, + "learning_rate": 5.796759396961235e-06, + "loss": 2.3101, + "step": 1186 + }, + { + "epoch": 0.4578592092574735, + "grad_norm": 0.43024092109324674, + "learning_rate": 5.79068228829617e-06, + "loss": 2.2912, + "step": 1187 + }, + { + "epoch": 0.45824493731918997, + "grad_norm": 0.45455438629206824, + "learning_rate": 5.784603981400632e-06, + "loss": 2.394, + "step": 1188 + }, + { + "epoch": 0.45863066538090647, + "grad_norm": 0.45733608566425527, + "learning_rate": 5.77852448548592e-06, + "loss": 2.3055, + "step": 1189 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 0.4484306633332954, + "learning_rate": 5.772443809765138e-06, + "loss": 2.2913, + "step": 1190 + }, + { + "epoch": 0.45940212150433946, + "grad_norm": 0.4697565911997198, + "learning_rate": 5.766361963453174e-06, + "loss": 2.35, + "step": 1191 + }, + { + "epoch": 0.45978784956605595, + "grad_norm": 0.45054113892803993, + "learning_rate": 5.760278955766695e-06, + "loss": 2.3265, + "step": 1192 + }, + { + "epoch": 0.46017357762777245, + "grad_norm": 0.4221795155143893, + "learning_rate": 5.754194795924126e-06, + "loss": 2.2774, + "step": 1193 + }, + { + "epoch": 0.4605593056894889, + "grad_norm": 0.43959673814629013, + "learning_rate": 5.748109493145637e-06, + "loss": 2.3489, + "step": 1194 + }, + { + "epoch": 0.4609450337512054, + "grad_norm": 0.434043983494275, + "learning_rate": 5.742023056653131e-06, + "loss": 2.3185, + "step": 1195 + }, + { + "epoch": 0.4613307618129219, + "grad_norm": 0.4499995133774159, + "learning_rate": 5.735935495670229e-06, + "loss": 2.3143, + "step": 1196 + }, + { + "epoch": 0.46171648987463837, + "grad_norm": 0.43859434091634275, + "learning_rate": 5.7298468194222555e-06, + "loss": 2.2815, + "step": 1197 + }, + { + "epoch": 0.46210221793635486, + "grad_norm": 0.45098456142101523, + "learning_rate": 5.723757037136226e-06, + "loss": 2.3214, + "step": 1198 + }, + { + "epoch": 0.46248794599807136, + "grad_norm": 0.44782039911925825, + "learning_rate": 5.717666158040832e-06, + "loss": 2.3265, + "step": 1199 + }, + { + "epoch": 0.46287367405978785, + "grad_norm": 0.42669035365800917, + "learning_rate": 5.711574191366427e-06, + "loss": 2.3586, + "step": 1200 + }, + { + "epoch": 0.46325940212150435, + "grad_norm": 0.4312352059791168, + "learning_rate": 5.705481146345011e-06, + "loss": 2.3525, + "step": 1201 + }, + { + "epoch": 0.46364513018322084, + "grad_norm": 0.39976774767851975, + "learning_rate": 5.699387032210222e-06, + "loss": 2.3207, + "step": 1202 + }, + { + "epoch": 0.46403085824493734, + "grad_norm": 0.42260370567419747, + "learning_rate": 5.693291858197315e-06, + "loss": 2.3909, + "step": 1203 + }, + { + "epoch": 0.46441658630665383, + "grad_norm": 0.45382296447308357, + "learning_rate": 5.687195633543151e-06, + "loss": 2.3194, + "step": 1204 + }, + { + "epoch": 0.4648023143683703, + "grad_norm": 0.41319552901533824, + "learning_rate": 5.681098367486186e-06, + "loss": 2.3009, + "step": 1205 + }, + { + "epoch": 0.46518804243008677, + "grad_norm": 0.4328173372360925, + "learning_rate": 5.675000069266451e-06, + "loss": 2.326, + "step": 1206 + }, + { + "epoch": 0.46557377049180326, + "grad_norm": 0.4581633461649314, + "learning_rate": 5.6689007481255445e-06, + "loss": 2.3827, + "step": 1207 + }, + { 
+ "epoch": 0.46595949855351976, + "grad_norm": 0.41910554221151053, + "learning_rate": 5.662800413306611e-06, + "loss": 2.3139, + "step": 1208 + }, + { + "epoch": 0.46634522661523625, + "grad_norm": 0.4682743739464939, + "learning_rate": 5.656699074054335e-06, + "loss": 2.2777, + "step": 1209 + }, + { + "epoch": 0.46673095467695275, + "grad_norm": 0.4471323728277224, + "learning_rate": 5.650596739614921e-06, + "loss": 2.2842, + "step": 1210 + }, + { + "epoch": 0.46711668273866924, + "grad_norm": 0.4385459014905679, + "learning_rate": 5.644493419236082e-06, + "loss": 2.3688, + "step": 1211 + }, + { + "epoch": 0.46750241080038574, + "grad_norm": 0.4661959535855007, + "learning_rate": 5.6383891221670275e-06, + "loss": 2.2435, + "step": 1212 + }, + { + "epoch": 0.46788813886210223, + "grad_norm": 0.4187724967770663, + "learning_rate": 5.632283857658442e-06, + "loss": 2.3224, + "step": 1213 + }, + { + "epoch": 0.4682738669238187, + "grad_norm": 0.4309626015977979, + "learning_rate": 5.626177634962482e-06, + "loss": 2.36, + "step": 1214 + }, + { + "epoch": 0.4686595949855352, + "grad_norm": 0.45728438707785074, + "learning_rate": 5.620070463332751e-06, + "loss": 2.3238, + "step": 1215 + }, + { + "epoch": 0.46904532304725166, + "grad_norm": 0.45588519636584973, + "learning_rate": 5.613962352024293e-06, + "loss": 2.3423, + "step": 1216 + }, + { + "epoch": 0.46943105110896816, + "grad_norm": 0.4203095541797606, + "learning_rate": 5.607853310293575e-06, + "loss": 2.3643, + "step": 1217 + }, + { + "epoch": 0.46981677917068465, + "grad_norm": 0.4428037199827808, + "learning_rate": 5.601743347398478e-06, + "loss": 2.3693, + "step": 1218 + }, + { + "epoch": 0.47020250723240115, + "grad_norm": 0.42730369062409596, + "learning_rate": 5.595632472598273e-06, + "loss": 2.2706, + "step": 1219 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.4847039689722017, + "learning_rate": 5.589520695153618e-06, + "loss": 2.3504, + "step": 1220 + }, + { + "epoch": 0.47097396335583414, + "grad_norm": 0.4466785201608283, + "learning_rate": 5.583408024326532e-06, + "loss": 2.3141, + "step": 1221 + }, + { + "epoch": 0.47135969141755063, + "grad_norm": 0.4644702092382401, + "learning_rate": 5.5772944693803975e-06, + "loss": 2.3732, + "step": 1222 + }, + { + "epoch": 0.4717454194792671, + "grad_norm": 0.42841728929128997, + "learning_rate": 5.57118003957993e-06, + "loss": 2.3068, + "step": 1223 + }, + { + "epoch": 0.4721311475409836, + "grad_norm": 0.43004816739382506, + "learning_rate": 5.5650647441911706e-06, + "loss": 2.2982, + "step": 1224 + }, + { + "epoch": 0.4725168756027001, + "grad_norm": 0.4377588605123363, + "learning_rate": 5.558948592481476e-06, + "loss": 2.3656, + "step": 1225 + }, + { + "epoch": 0.4729026036644166, + "grad_norm": 0.4078263685690202, + "learning_rate": 5.552831593719499e-06, + "loss": 2.3767, + "step": 1226 + }, + { + "epoch": 0.47328833172613305, + "grad_norm": 0.4374828944134357, + "learning_rate": 5.546713757175174e-06, + "loss": 2.3627, + "step": 1227 + }, + { + "epoch": 0.47367405978784954, + "grad_norm": 0.4083844779519322, + "learning_rate": 5.540595092119709e-06, + "loss": 2.2955, + "step": 1228 + }, + { + "epoch": 0.47405978784956604, + "grad_norm": 0.427796250465258, + "learning_rate": 5.534475607825566e-06, + "loss": 2.3696, + "step": 1229 + }, + { + "epoch": 0.47444551591128253, + "grad_norm": 0.4883224066551824, + "learning_rate": 5.5283553135664426e-06, + "loss": 2.3231, + "step": 1230 + }, + { + "epoch": 0.47483124397299903, + "grad_norm": 0.44997208402273214, + 
"learning_rate": 5.522234218617274e-06, + "loss": 2.3012, + "step": 1231 + }, + { + "epoch": 0.4752169720347155, + "grad_norm": 0.5034216061553358, + "learning_rate": 5.516112332254203e-06, + "loss": 2.3678, + "step": 1232 + }, + { + "epoch": 0.475602700096432, + "grad_norm": 0.43877172254018615, + "learning_rate": 5.509989663754572e-06, + "loss": 2.3391, + "step": 1233 + }, + { + "epoch": 0.4759884281581485, + "grad_norm": 0.4543499788909993, + "learning_rate": 5.503866222396907e-06, + "loss": 2.2994, + "step": 1234 + }, + { + "epoch": 0.476374156219865, + "grad_norm": 0.45789077129692657, + "learning_rate": 5.497742017460913e-06, + "loss": 2.3188, + "step": 1235 + }, + { + "epoch": 0.4767598842815815, + "grad_norm": 0.4560334384462107, + "learning_rate": 5.491617058227443e-06, + "loss": 2.3405, + "step": 1236 + }, + { + "epoch": 0.477145612343298, + "grad_norm": 0.43544502546982145, + "learning_rate": 5.485491353978496e-06, + "loss": 2.3168, + "step": 1237 + }, + { + "epoch": 0.47753134040501444, + "grad_norm": 0.4295577460446279, + "learning_rate": 5.479364913997203e-06, + "loss": 2.3379, + "step": 1238 + }, + { + "epoch": 0.47791706846673093, + "grad_norm": 0.46521640185997026, + "learning_rate": 5.473237747567805e-06, + "loss": 2.3228, + "step": 1239 + }, + { + "epoch": 0.4783027965284474, + "grad_norm": 0.43381577321765624, + "learning_rate": 5.46710986397565e-06, + "loss": 2.3475, + "step": 1240 + }, + { + "epoch": 0.4786885245901639, + "grad_norm": 0.4866871687241284, + "learning_rate": 5.460981272507168e-06, + "loss": 2.3004, + "step": 1241 + }, + { + "epoch": 0.4790742526518804, + "grad_norm": 0.424201002696731, + "learning_rate": 5.454851982449865e-06, + "loss": 2.2405, + "step": 1242 + }, + { + "epoch": 0.4794599807135969, + "grad_norm": 0.42804105550903804, + "learning_rate": 5.448722003092298e-06, + "loss": 2.337, + "step": 1243 + }, + { + "epoch": 0.4798457087753134, + "grad_norm": 0.4459269229444017, + "learning_rate": 5.442591343724081e-06, + "loss": 2.393, + "step": 1244 + }, + { + "epoch": 0.4802314368370299, + "grad_norm": 0.4278328610127386, + "learning_rate": 5.436460013635848e-06, + "loss": 2.2744, + "step": 1245 + }, + { + "epoch": 0.4806171648987464, + "grad_norm": 0.4463499915836363, + "learning_rate": 5.430328022119255e-06, + "loss": 2.3151, + "step": 1246 + }, + { + "epoch": 0.4810028929604629, + "grad_norm": 0.4356287393133619, + "learning_rate": 5.42419537846696e-06, + "loss": 2.2979, + "step": 1247 + }, + { + "epoch": 0.4813886210221794, + "grad_norm": 0.4507469977004841, + "learning_rate": 5.418062091972604e-06, + "loss": 2.3654, + "step": 1248 + }, + { + "epoch": 0.4817743490838959, + "grad_norm": 0.43202360474966794, + "learning_rate": 5.411928171930812e-06, + "loss": 2.3442, + "step": 1249 + }, + { + "epoch": 0.4821600771456123, + "grad_norm": 0.4556106440128594, + "learning_rate": 5.405793627637157e-06, + "loss": 2.3677, + "step": 1250 + }, + { + "epoch": 0.4825458052073288, + "grad_norm": 0.4481360098922905, + "learning_rate": 5.399658468388169e-06, + "loss": 2.3548, + "step": 1251 + }, + { + "epoch": 0.4829315332690453, + "grad_norm": 0.4255551311824385, + "learning_rate": 5.393522703481303e-06, + "loss": 2.3924, + "step": 1252 + }, + { + "epoch": 0.4833172613307618, + "grad_norm": 0.43423503978903033, + "learning_rate": 5.387386342214934e-06, + "loss": 2.302, + "step": 1253 + }, + { + "epoch": 0.4837029893924783, + "grad_norm": 0.45181001802914283, + "learning_rate": 5.381249393888344e-06, + "loss": 2.3078, + "step": 1254 + }, + { + "epoch": 
0.4840887174541948, + "grad_norm": 0.4141462355372266, + "learning_rate": 5.375111867801698e-06, + "loss": 2.3217, + "step": 1255 + }, + { + "epoch": 0.4844744455159113, + "grad_norm": 0.41247263365108455, + "learning_rate": 5.36897377325604e-06, + "loss": 2.3347, + "step": 1256 + }, + { + "epoch": 0.4848601735776278, + "grad_norm": 0.4523684838820164, + "learning_rate": 5.362835119553278e-06, + "loss": 2.3655, + "step": 1257 + }, + { + "epoch": 0.4852459016393443, + "grad_norm": 0.45940229024333956, + "learning_rate": 5.356695915996162e-06, + "loss": 2.3997, + "step": 1258 + }, + { + "epoch": 0.4856316297010608, + "grad_norm": 0.45153234599778547, + "learning_rate": 5.350556171888281e-06, + "loss": 2.3613, + "step": 1259 + }, + { + "epoch": 0.48601735776277727, + "grad_norm": 0.46289662017840116, + "learning_rate": 5.344415896534039e-06, + "loss": 2.3734, + "step": 1260 + }, + { + "epoch": 0.4864030858244937, + "grad_norm": 0.4176290117254934, + "learning_rate": 5.338275099238647e-06, + "loss": 2.3401, + "step": 1261 + }, + { + "epoch": 0.4867888138862102, + "grad_norm": 0.45879286317120854, + "learning_rate": 5.332133789308104e-06, + "loss": 2.3718, + "step": 1262 + }, + { + "epoch": 0.4871745419479267, + "grad_norm": 0.44584372814567164, + "learning_rate": 5.325991976049191e-06, + "loss": 2.2913, + "step": 1263 + }, + { + "epoch": 0.4875602700096432, + "grad_norm": 0.474275042923422, + "learning_rate": 5.319849668769449e-06, + "loss": 2.3786, + "step": 1264 + }, + { + "epoch": 0.4879459980713597, + "grad_norm": 0.4273666670416358, + "learning_rate": 5.313706876777166e-06, + "loss": 2.3495, + "step": 1265 + }, + { + "epoch": 0.4883317261330762, + "grad_norm": 0.4153343181353523, + "learning_rate": 5.307563609381367e-06, + "loss": 2.3742, + "step": 1266 + }, + { + "epoch": 0.4887174541947927, + "grad_norm": 0.4274131690236131, + "learning_rate": 5.301419875891795e-06, + "loss": 2.3406, + "step": 1267 + }, + { + "epoch": 0.4891031822565092, + "grad_norm": 0.40845967490965335, + "learning_rate": 5.295275685618905e-06, + "loss": 2.2525, + "step": 1268 + }, + { + "epoch": 0.48948891031822567, + "grad_norm": 0.4400933708424202, + "learning_rate": 5.289131047873837e-06, + "loss": 2.3325, + "step": 1269 + }, + { + "epoch": 0.48987463837994216, + "grad_norm": 0.43694952970669976, + "learning_rate": 5.282985971968413e-06, + "loss": 2.2801, + "step": 1270 + }, + { + "epoch": 0.49026036644165866, + "grad_norm": 0.42276340759115294, + "learning_rate": 5.276840467215119e-06, + "loss": 2.3553, + "step": 1271 + }, + { + "epoch": 0.4906460945033751, + "grad_norm": 0.42298935637179624, + "learning_rate": 5.270694542927089e-06, + "loss": 2.3527, + "step": 1272 + }, + { + "epoch": 0.4910318225650916, + "grad_norm": 0.4718174194659704, + "learning_rate": 5.264548208418094e-06, + "loss": 2.3241, + "step": 1273 + }, + { + "epoch": 0.4914175506268081, + "grad_norm": 0.4507159267224731, + "learning_rate": 5.258401473002529e-06, + "loss": 2.3285, + "step": 1274 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 0.4421805779778448, + "learning_rate": 5.252254345995392e-06, + "loss": 2.3861, + "step": 1275 + }, + { + "epoch": 0.4921890067502411, + "grad_norm": 0.440561261668261, + "learning_rate": 5.246106836712277e-06, + "loss": 2.3358, + "step": 1276 + }, + { + "epoch": 0.49257473481195757, + "grad_norm": 0.43081093028252854, + "learning_rate": 5.239958954469358e-06, + "loss": 2.3077, + "step": 1277 + }, + { + "epoch": 0.49296046287367407, + "grad_norm": 0.4127708092181289, + "learning_rate": 
5.233810708583372e-06, + "loss": 2.2792, + "step": 1278 + }, + { + "epoch": 0.49334619093539056, + "grad_norm": 0.4159049601401717, + "learning_rate": 5.227662108371609e-06, + "loss": 2.283, + "step": 1279 + }, + { + "epoch": 0.49373191899710706, + "grad_norm": 0.489563289357969, + "learning_rate": 5.2215131631518945e-06, + "loss": 2.3171, + "step": 1280 + }, + { + "epoch": 0.49411764705882355, + "grad_norm": 0.442079206163343, + "learning_rate": 5.215363882242578e-06, + "loss": 2.3156, + "step": 1281 + }, + { + "epoch": 0.49450337512054005, + "grad_norm": 0.4133470673765935, + "learning_rate": 5.2092142749625165e-06, + "loss": 2.2269, + "step": 1282 + }, + { + "epoch": 0.4948891031822565, + "grad_norm": 0.43427602878160826, + "learning_rate": 5.203064350631064e-06, + "loss": 2.353, + "step": 1283 + }, + { + "epoch": 0.495274831243973, + "grad_norm": 0.4193435283723235, + "learning_rate": 5.196914118568054e-06, + "loss": 2.3391, + "step": 1284 + }, + { + "epoch": 0.4956605593056895, + "grad_norm": 0.4386644430997478, + "learning_rate": 5.190763588093781e-06, + "loss": 2.3025, + "step": 1285 + }, + { + "epoch": 0.49604628736740597, + "grad_norm": 0.40588838928948484, + "learning_rate": 5.184612768529002e-06, + "loss": 2.3091, + "step": 1286 + }, + { + "epoch": 0.49643201542912246, + "grad_norm": 0.4816780898609368, + "learning_rate": 5.178461669194903e-06, + "loss": 2.4023, + "step": 1287 + }, + { + "epoch": 0.49681774349083896, + "grad_norm": 0.420314726568252, + "learning_rate": 5.1723102994130994e-06, + "loss": 2.3143, + "step": 1288 + }, + { + "epoch": 0.49720347155255545, + "grad_norm": 0.46688921048479026, + "learning_rate": 5.166158668505612e-06, + "loss": 2.3528, + "step": 1289 + }, + { + "epoch": 0.49758919961427195, + "grad_norm": 0.42807252967185566, + "learning_rate": 5.1600067857948634e-06, + "loss": 2.3267, + "step": 1290 + }, + { + "epoch": 0.49797492767598844, + "grad_norm": 0.41117819104535525, + "learning_rate": 5.153854660603651e-06, + "loss": 2.2806, + "step": 1291 + }, + { + "epoch": 0.49836065573770494, + "grad_norm": 0.4073016141616718, + "learning_rate": 5.147702302255143e-06, + "loss": 2.2486, + "step": 1292 + }, + { + "epoch": 0.49874638379942143, + "grad_norm": 0.4333639612761763, + "learning_rate": 5.141549720072865e-06, + "loss": 2.2796, + "step": 1293 + }, + { + "epoch": 0.4991321118611379, + "grad_norm": 0.43911100915033385, + "learning_rate": 5.1353969233806735e-06, + "loss": 2.3437, + "step": 1294 + }, + { + "epoch": 0.49951783992285437, + "grad_norm": 0.43606136330998496, + "learning_rate": 5.129243921502756e-06, + "loss": 2.3163, + "step": 1295 + }, + { + "epoch": 0.49990356798457086, + "grad_norm": 0.4643969346558146, + "learning_rate": 5.123090723763607e-06, + "loss": 2.3239, + "step": 1296 + }, + { + "epoch": 0.5002892960462874, + "grad_norm": 0.43901031034161053, + "learning_rate": 5.116937339488023e-06, + "loss": 2.2926, + "step": 1297 + }, + { + "epoch": 0.5006750241080039, + "grad_norm": 0.4297549565643895, + "learning_rate": 5.110783778001077e-06, + "loss": 2.3467, + "step": 1298 + }, + { + "epoch": 0.5010607521697203, + "grad_norm": 0.442465634264455, + "learning_rate": 5.104630048628117e-06, + "loss": 2.3235, + "step": 1299 + }, + { + "epoch": 0.5014464802314368, + "grad_norm": 0.4118556545526013, + "learning_rate": 5.098476160694741e-06, + "loss": 2.3466, + "step": 1300 + }, + { + "epoch": 0.5018322082931533, + "grad_norm": 0.40736736064570456, + "learning_rate": 5.092322123526787e-06, + "loss": 2.3379, + "step": 1301 + }, + { + "epoch": 
0.5022179363548698, + "grad_norm": 0.4059480335225931, + "learning_rate": 5.086167946450323e-06, + "loss": 2.342, + "step": 1302 + }, + { + "epoch": 0.5026036644165863, + "grad_norm": 0.4375340280301096, + "learning_rate": 5.080013638791624e-06, + "loss": 2.3221, + "step": 1303 + }, + { + "epoch": 0.5029893924783028, + "grad_norm": 0.4306039764256995, + "learning_rate": 5.073859209877167e-06, + "loss": 2.2592, + "step": 1304 + }, + { + "epoch": 0.5033751205400193, + "grad_norm": 0.4298973973686701, + "learning_rate": 5.06770466903361e-06, + "loss": 2.2559, + "step": 1305 + }, + { + "epoch": 0.5037608486017358, + "grad_norm": 0.4183112371095239, + "learning_rate": 5.0615500255877835e-06, + "loss": 2.3756, + "step": 1306 + }, + { + "epoch": 0.5041465766634523, + "grad_norm": 0.4493544264927047, + "learning_rate": 5.055395288866672e-06, + "loss": 2.3288, + "step": 1307 + }, + { + "epoch": 0.5045323047251687, + "grad_norm": 0.40468569502294427, + "learning_rate": 5.049240468197401e-06, + "loss": 2.3034, + "step": 1308 + }, + { + "epoch": 0.5049180327868853, + "grad_norm": 0.4137130754137583, + "learning_rate": 5.04308557290722e-06, + "loss": 2.3024, + "step": 1309 + }, + { + "epoch": 0.5053037608486017, + "grad_norm": 0.4400960341336459, + "learning_rate": 5.036930612323501e-06, + "loss": 2.3114, + "step": 1310 + }, + { + "epoch": 0.5056894889103182, + "grad_norm": 0.418064368559306, + "learning_rate": 5.030775595773706e-06, + "loss": 2.3444, + "step": 1311 + }, + { + "epoch": 0.5060752169720347, + "grad_norm": 0.4143452375728194, + "learning_rate": 5.0246205325853824e-06, + "loss": 2.3348, + "step": 1312 + }, + { + "epoch": 0.5064609450337512, + "grad_norm": 0.4520396890728923, + "learning_rate": 5.0184654320861545e-06, + "loss": 2.2867, + "step": 1313 + }, + { + "epoch": 0.5068466730954677, + "grad_norm": 0.42533132932115353, + "learning_rate": 5.0123103036036965e-06, + "loss": 2.3015, + "step": 1314 + }, + { + "epoch": 0.5072324011571842, + "grad_norm": 0.3916690241337248, + "learning_rate": 5.006155156465728e-06, + "loss": 2.3024, + "step": 1315 + }, + { + "epoch": 0.5076181292189007, + "grad_norm": 0.4776736612144727, + "learning_rate": 5e-06, + "loss": 2.3367, + "step": 1316 + }, + { + "epoch": 0.5080038572806171, + "grad_norm": 0.4229583495828416, + "learning_rate": 4.9938448435342725e-06, + "loss": 2.3766, + "step": 1317 + }, + { + "epoch": 0.5083895853423337, + "grad_norm": 0.45185543617759183, + "learning_rate": 4.987689696396305e-06, + "loss": 2.2442, + "step": 1318 + }, + { + "epoch": 0.5087753134040501, + "grad_norm": 0.43226886814619625, + "learning_rate": 4.981534567913848e-06, + "loss": 2.2452, + "step": 1319 + }, + { + "epoch": 0.5091610414657667, + "grad_norm": 0.4527728589734573, + "learning_rate": 4.975379467414621e-06, + "loss": 2.3577, + "step": 1320 + }, + { + "epoch": 0.5095467695274831, + "grad_norm": 0.4792812570785096, + "learning_rate": 4.969224404226296e-06, + "loss": 2.3183, + "step": 1321 + }, + { + "epoch": 0.5099324975891996, + "grad_norm": 0.42420738308445527, + "learning_rate": 4.963069387676499e-06, + "loss": 2.3004, + "step": 1322 + }, + { + "epoch": 0.5103182256509161, + "grad_norm": 0.4599343183213432, + "learning_rate": 4.95691442709278e-06, + "loss": 2.2649, + "step": 1323 + }, + { + "epoch": 0.5107039537126326, + "grad_norm": 0.41604459324505855, + "learning_rate": 4.950759531802602e-06, + "loss": 2.3898, + "step": 1324 + }, + { + "epoch": 0.5110896817743491, + "grad_norm": 0.4228763687924707, + "learning_rate": 4.944604711133329e-06, + "loss": 2.321, 
+ "step": 1325 + }, + { + "epoch": 0.5114754098360655, + "grad_norm": 0.430884235363054, + "learning_rate": 4.938449974412217e-06, + "loss": 2.3314, + "step": 1326 + }, + { + "epoch": 0.5118611378977821, + "grad_norm": 0.46295425261968765, + "learning_rate": 4.932295330966392e-06, + "loss": 2.2935, + "step": 1327 + }, + { + "epoch": 0.5122468659594985, + "grad_norm": 0.4213530809499717, + "learning_rate": 4.926140790122835e-06, + "loss": 2.3326, + "step": 1328 + }, + { + "epoch": 0.5126325940212151, + "grad_norm": 0.4665793372541669, + "learning_rate": 4.919986361208379e-06, + "loss": 2.346, + "step": 1329 + }, + { + "epoch": 0.5130183220829315, + "grad_norm": 0.4456751472800828, + "learning_rate": 4.91383205354968e-06, + "loss": 2.3559, + "step": 1330 + }, + { + "epoch": 0.5134040501446481, + "grad_norm": 0.42134895981722886, + "learning_rate": 4.907677876473214e-06, + "loss": 2.2444, + "step": 1331 + }, + { + "epoch": 0.5137897782063645, + "grad_norm": 0.41934135489864954, + "learning_rate": 4.90152383930526e-06, + "loss": 2.2932, + "step": 1332 + }, + { + "epoch": 0.514175506268081, + "grad_norm": 0.45310057387043157, + "learning_rate": 4.895369951371884e-06, + "loss": 2.3435, + "step": 1333 + }, + { + "epoch": 0.5145612343297975, + "grad_norm": 0.5258865623441964, + "learning_rate": 4.889216221998925e-06, + "loss": 2.3424, + "step": 1334 + }, + { + "epoch": 0.5149469623915139, + "grad_norm": 0.48039153232624293, + "learning_rate": 4.883062660511979e-06, + "loss": 2.3399, + "step": 1335 + }, + { + "epoch": 0.5153326904532305, + "grad_norm": 0.47444154350583756, + "learning_rate": 4.876909276236395e-06, + "loss": 2.3539, + "step": 1336 + }, + { + "epoch": 0.5157184185149469, + "grad_norm": 0.5069800163012861, + "learning_rate": 4.870756078497247e-06, + "loss": 2.379, + "step": 1337 + }, + { + "epoch": 0.5161041465766635, + "grad_norm": 0.49748181116335205, + "learning_rate": 4.864603076619329e-06, + "loss": 2.2928, + "step": 1338 + }, + { + "epoch": 0.5164898746383799, + "grad_norm": 0.42706937479473217, + "learning_rate": 4.858450279927138e-06, + "loss": 2.3086, + "step": 1339 + }, + { + "epoch": 0.5168756027000965, + "grad_norm": 0.41046138972851537, + "learning_rate": 4.852297697744857e-06, + "loss": 2.2277, + "step": 1340 + }, + { + "epoch": 0.5172613307618129, + "grad_norm": 0.4330642896329234, + "learning_rate": 4.84614533939635e-06, + "loss": 2.2658, + "step": 1341 + }, + { + "epoch": 0.5176470588235295, + "grad_norm": 0.43557729756163627, + "learning_rate": 4.839993214205138e-06, + "loss": 2.2969, + "step": 1342 + }, + { + "epoch": 0.5180327868852459, + "grad_norm": 0.4132315515601232, + "learning_rate": 4.83384133149439e-06, + "loss": 2.306, + "step": 1343 + }, + { + "epoch": 0.5184185149469623, + "grad_norm": 0.4725680753577124, + "learning_rate": 4.827689700586902e-06, + "loss": 2.3319, + "step": 1344 + }, + { + "epoch": 0.5188042430086789, + "grad_norm": 0.4024975847853517, + "learning_rate": 4.821538330805098e-06, + "loss": 2.3366, + "step": 1345 + }, + { + "epoch": 0.5191899710703953, + "grad_norm": 0.4771435011033568, + "learning_rate": 4.815387231471001e-06, + "loss": 2.3304, + "step": 1346 + }, + { + "epoch": 0.5195756991321119, + "grad_norm": 0.45140386154895684, + "learning_rate": 4.80923641190622e-06, + "loss": 2.3414, + "step": 1347 + }, + { + "epoch": 0.5199614271938283, + "grad_norm": 0.42681886917358797, + "learning_rate": 4.803085881431949e-06, + "loss": 2.3342, + "step": 1348 + }, + { + "epoch": 0.5203471552555449, + "grad_norm": 0.4437892746591648, + 
"learning_rate": 4.796935649368936e-06, + "loss": 2.2996, + "step": 1349 + }, + { + "epoch": 0.5207328833172613, + "grad_norm": 0.5069638790201045, + "learning_rate": 4.790785725037484e-06, + "loss": 2.3107, + "step": 1350 + }, + { + "epoch": 0.5211186113789779, + "grad_norm": 0.4366235525915503, + "learning_rate": 4.784636117757423e-06, + "loss": 2.2616, + "step": 1351 + }, + { + "epoch": 0.5215043394406943, + "grad_norm": 0.4652015925814833, + "learning_rate": 4.778486836848107e-06, + "loss": 2.3107, + "step": 1352 + }, + { + "epoch": 0.5218900675024108, + "grad_norm": 0.46175852384207916, + "learning_rate": 4.772337891628394e-06, + "loss": 2.2378, + "step": 1353 + }, + { + "epoch": 0.5222757955641273, + "grad_norm": 0.43883478665108466, + "learning_rate": 4.76618929141663e-06, + "loss": 2.3241, + "step": 1354 + }, + { + "epoch": 0.5226615236258437, + "grad_norm": 0.4417539282200209, + "learning_rate": 4.760041045530645e-06, + "loss": 2.2594, + "step": 1355 + }, + { + "epoch": 0.5230472516875603, + "grad_norm": 0.4063010167186865, + "learning_rate": 4.7538931632877254e-06, + "loss": 2.3434, + "step": 1356 + }, + { + "epoch": 0.5234329797492767, + "grad_norm": 0.4384298510879702, + "learning_rate": 4.7477456540046105e-06, + "loss": 2.3348, + "step": 1357 + }, + { + "epoch": 0.5238187078109933, + "grad_norm": 0.45915359256206345, + "learning_rate": 4.741598526997474e-06, + "loss": 2.3056, + "step": 1358 + }, + { + "epoch": 0.5242044358727097, + "grad_norm": 0.43968549360910814, + "learning_rate": 4.7354517915819065e-06, + "loss": 2.336, + "step": 1359 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 0.41796295540394873, + "learning_rate": 4.729305457072913e-06, + "loss": 2.2748, + "step": 1360 + }, + { + "epoch": 0.5249758919961427, + "grad_norm": 0.4362828750825012, + "learning_rate": 4.723159532784883e-06, + "loss": 2.2561, + "step": 1361 + }, + { + "epoch": 0.5253616200578592, + "grad_norm": 0.4371254165879112, + "learning_rate": 4.717014028031589e-06, + "loss": 2.3335, + "step": 1362 + }, + { + "epoch": 0.5257473481195757, + "grad_norm": 0.45665829594986745, + "learning_rate": 4.710868952126166e-06, + "loss": 2.3818, + "step": 1363 + }, + { + "epoch": 0.5261330761812922, + "grad_norm": 0.44081750659388047, + "learning_rate": 4.704724314381097e-06, + "loss": 2.3263, + "step": 1364 + }, + { + "epoch": 0.5265188042430087, + "grad_norm": 0.4437002618568672, + "learning_rate": 4.6985801241082065e-06, + "loss": 2.2988, + "step": 1365 + }, + { + "epoch": 0.5269045323047251, + "grad_norm": 0.4459940408363005, + "learning_rate": 4.692436390618635e-06, + "loss": 2.279, + "step": 1366 + }, + { + "epoch": 0.5272902603664417, + "grad_norm": 0.4433554462879378, + "learning_rate": 4.686293123222837e-06, + "loss": 2.3205, + "step": 1367 + }, + { + "epoch": 0.5276759884281581, + "grad_norm": 0.4391895686696009, + "learning_rate": 4.680150331230552e-06, + "loss": 2.3565, + "step": 1368 + }, + { + "epoch": 0.5280617164898747, + "grad_norm": 0.44910921831985756, + "learning_rate": 4.674008023950809e-06, + "loss": 2.3278, + "step": 1369 + }, + { + "epoch": 0.5284474445515911, + "grad_norm": 0.4183548874298808, + "learning_rate": 4.667866210691897e-06, + "loss": 2.41, + "step": 1370 + }, + { + "epoch": 0.5288331726133076, + "grad_norm": 0.42557100108547463, + "learning_rate": 4.661724900761355e-06, + "loss": 2.2916, + "step": 1371 + }, + { + "epoch": 0.5292189006750241, + "grad_norm": 0.45979466957641174, + "learning_rate": 4.6555841034659625e-06, + "loss": 2.2926, + "step": 1372 + }, + { + "epoch": 
0.5296046287367406, + "grad_norm": 0.40343968385412204, + "learning_rate": 4.6494438281117195e-06, + "loss": 2.3008, + "step": 1373 + }, + { + "epoch": 0.5299903567984571, + "grad_norm": 0.4418053528829375, + "learning_rate": 4.643304084003839e-06, + "loss": 2.2997, + "step": 1374 + }, + { + "epoch": 0.5303760848601736, + "grad_norm": 0.42781136201724485, + "learning_rate": 4.637164880446725e-06, + "loss": 2.2972, + "step": 1375 + }, + { + "epoch": 0.5307618129218901, + "grad_norm": 0.41523373826769044, + "learning_rate": 4.631026226743962e-06, + "loss": 2.3737, + "step": 1376 + }, + { + "epoch": 0.5311475409836065, + "grad_norm": 0.43181206158587115, + "learning_rate": 4.624888132198303e-06, + "loss": 2.3483, + "step": 1377 + }, + { + "epoch": 0.531533269045323, + "grad_norm": 0.4883659943209646, + "learning_rate": 4.618750606111657e-06, + "loss": 2.3484, + "step": 1378 + }, + { + "epoch": 0.5319189971070395, + "grad_norm": 0.4562404426597322, + "learning_rate": 4.612613657785066e-06, + "loss": 2.2604, + "step": 1379 + }, + { + "epoch": 0.532304725168756, + "grad_norm": 0.43283921590840496, + "learning_rate": 4.606477296518698e-06, + "loss": 2.3077, + "step": 1380 + }, + { + "epoch": 0.5326904532304725, + "grad_norm": 0.46474300575051714, + "learning_rate": 4.600341531611833e-06, + "loss": 2.3756, + "step": 1381 + }, + { + "epoch": 0.533076181292189, + "grad_norm": 0.4918375192419344, + "learning_rate": 4.594206372362845e-06, + "loss": 2.3844, + "step": 1382 + }, + { + "epoch": 0.5334619093539055, + "grad_norm": 0.43756529480102446, + "learning_rate": 4.588071828069191e-06, + "loss": 2.2807, + "step": 1383 + }, + { + "epoch": 0.533847637415622, + "grad_norm": 0.43556271744351255, + "learning_rate": 4.581937908027397e-06, + "loss": 2.2884, + "step": 1384 + }, + { + "epoch": 0.5342333654773385, + "grad_norm": 0.41233607827108854, + "learning_rate": 4.575804621533043e-06, + "loss": 2.3097, + "step": 1385 + }, + { + "epoch": 0.534619093539055, + "grad_norm": 0.4323643545291776, + "learning_rate": 4.5696719778807465e-06, + "loss": 2.3127, + "step": 1386 + }, + { + "epoch": 0.5350048216007715, + "grad_norm": 0.44083524101705623, + "learning_rate": 4.563539986364152e-06, + "loss": 2.3313, + "step": 1387 + }, + { + "epoch": 0.5353905496624879, + "grad_norm": 0.4144429426000985, + "learning_rate": 4.55740865627592e-06, + "loss": 2.3078, + "step": 1388 + }, + { + "epoch": 0.5357762777242044, + "grad_norm": 0.4984500916383888, + "learning_rate": 4.551277996907703e-06, + "loss": 2.2869, + "step": 1389 + }, + { + "epoch": 0.5361620057859209, + "grad_norm": 0.44955445867207805, + "learning_rate": 4.545148017550138e-06, + "loss": 2.3013, + "step": 1390 + }, + { + "epoch": 0.5365477338476374, + "grad_norm": 0.45229936484140115, + "learning_rate": 4.5390187274928325e-06, + "loss": 2.2827, + "step": 1391 + }, + { + "epoch": 0.5369334619093539, + "grad_norm": 0.41324968774465565, + "learning_rate": 4.532890136024351e-06, + "loss": 2.3624, + "step": 1392 + }, + { + "epoch": 0.5373191899710704, + "grad_norm": 0.47895598575081966, + "learning_rate": 4.5267622524321955e-06, + "loss": 2.2684, + "step": 1393 + }, + { + "epoch": 0.5377049180327869, + "grad_norm": 0.5188073056811957, + "learning_rate": 4.520635086002799e-06, + "loss": 2.3686, + "step": 1394 + }, + { + "epoch": 0.5380906460945034, + "grad_norm": 0.431259848225407, + "learning_rate": 4.514508646021506e-06, + "loss": 2.3401, + "step": 1395 + }, + { + "epoch": 0.5384763741562199, + "grad_norm": 0.47217102884984974, + "learning_rate": 
4.508382941772558e-06, + "loss": 2.3236, + "step": 1396 + }, + { + "epoch": 0.5388621022179364, + "grad_norm": 0.44966054172796366, + "learning_rate": 4.502257982539087e-06, + "loss": 2.3164, + "step": 1397 + }, + { + "epoch": 0.5392478302796528, + "grad_norm": 0.47812913336760227, + "learning_rate": 4.496133777603093e-06, + "loss": 2.3211, + "step": 1398 + }, + { + "epoch": 0.5396335583413693, + "grad_norm": 0.465887586248686, + "learning_rate": 4.49001033624543e-06, + "loss": 2.3873, + "step": 1399 + }, + { + "epoch": 0.5400192864030858, + "grad_norm": 0.4611561152316998, + "learning_rate": 4.483887667745798e-06, + "loss": 2.3814, + "step": 1400 + }, + { + "epoch": 0.5404050144648023, + "grad_norm": 0.43480217765151674, + "learning_rate": 4.477765781382728e-06, + "loss": 2.3513, + "step": 1401 + }, + { + "epoch": 0.5407907425265188, + "grad_norm": 0.443621190936497, + "learning_rate": 4.471644686433559e-06, + "loss": 2.3188, + "step": 1402 + }, + { + "epoch": 0.5411764705882353, + "grad_norm": 0.47335316164044927, + "learning_rate": 4.465524392174437e-06, + "loss": 2.3548, + "step": 1403 + }, + { + "epoch": 0.5415621986499518, + "grad_norm": 0.4568037576128477, + "learning_rate": 4.459404907880293e-06, + "loss": 2.2584, + "step": 1404 + }, + { + "epoch": 0.5419479267116682, + "grad_norm": 0.48574365511330586, + "learning_rate": 4.453286242824827e-06, + "loss": 2.271, + "step": 1405 + }, + { + "epoch": 0.5423336547733848, + "grad_norm": 0.41630044315682513, + "learning_rate": 4.447168406280503e-06, + "loss": 2.3783, + "step": 1406 + }, + { + "epoch": 0.5427193828351012, + "grad_norm": 0.4388643867308547, + "learning_rate": 4.441051407518525e-06, + "loss": 2.3014, + "step": 1407 + }, + { + "epoch": 0.5431051108968178, + "grad_norm": 0.46351987282241525, + "learning_rate": 4.434935255808831e-06, + "loss": 2.3795, + "step": 1408 + }, + { + "epoch": 0.5434908389585342, + "grad_norm": 0.4067052982054639, + "learning_rate": 4.428819960420072e-06, + "loss": 2.3166, + "step": 1409 + }, + { + "epoch": 0.5438765670202508, + "grad_norm": 0.4424432749709641, + "learning_rate": 4.422705530619604e-06, + "loss": 2.2886, + "step": 1410 + }, + { + "epoch": 0.5442622950819672, + "grad_norm": 0.440011691897373, + "learning_rate": 4.4165919756734695e-06, + "loss": 2.354, + "step": 1411 + }, + { + "epoch": 0.5446480231436837, + "grad_norm": 0.4651265995172307, + "learning_rate": 4.410479304846385e-06, + "loss": 2.3595, + "step": 1412 + }, + { + "epoch": 0.5450337512054002, + "grad_norm": 0.4127500619791685, + "learning_rate": 4.4043675274017284e-06, + "loss": 2.2397, + "step": 1413 + }, + { + "epoch": 0.5454194792671166, + "grad_norm": 0.4228985803007235, + "learning_rate": 4.3982566526015244e-06, + "loss": 2.3301, + "step": 1414 + }, + { + "epoch": 0.5458052073288332, + "grad_norm": 0.4257754524806136, + "learning_rate": 4.392146689706426e-06, + "loss": 2.2894, + "step": 1415 + }, + { + "epoch": 0.5461909353905496, + "grad_norm": 0.4464196651405697, + "learning_rate": 4.386037647975708e-06, + "loss": 2.3799, + "step": 1416 + }, + { + "epoch": 0.5465766634522662, + "grad_norm": 0.4621712141445011, + "learning_rate": 4.379929536667251e-06, + "loss": 2.3379, + "step": 1417 + }, + { + "epoch": 0.5469623915139826, + "grad_norm": 0.4509721789709156, + "learning_rate": 4.37382236503752e-06, + "loss": 2.371, + "step": 1418 + }, + { + "epoch": 0.5473481195756992, + "grad_norm": 0.4314830170510386, + "learning_rate": 4.3677161423415584e-06, + "loss": 2.3325, + "step": 1419 + }, + { + "epoch": 0.5477338476374156, + 
"grad_norm": 0.42464418592417985, + "learning_rate": 4.361610877832974e-06, + "loss": 2.3283, + "step": 1420 + }, + { + "epoch": 0.5481195756991322, + "grad_norm": 0.4094192326677035, + "learning_rate": 4.355506580763919e-06, + "loss": 2.311, + "step": 1421 + }, + { + "epoch": 0.5485053037608486, + "grad_norm": 0.4677564370459633, + "learning_rate": 4.3494032603850804e-06, + "loss": 2.2784, + "step": 1422 + }, + { + "epoch": 0.548891031822565, + "grad_norm": 0.4205875476635024, + "learning_rate": 4.343300925945667e-06, + "loss": 2.3176, + "step": 1423 + }, + { + "epoch": 0.5492767598842816, + "grad_norm": 0.46917697693113575, + "learning_rate": 4.337199586693389e-06, + "loss": 2.3322, + "step": 1424 + }, + { + "epoch": 0.549662487945998, + "grad_norm": 0.4301394790054798, + "learning_rate": 4.331099251874457e-06, + "loss": 2.3428, + "step": 1425 + }, + { + "epoch": 0.5500482160077146, + "grad_norm": 0.43302569982748224, + "learning_rate": 4.32499993073355e-06, + "loss": 2.3217, + "step": 1426 + }, + { + "epoch": 0.550433944069431, + "grad_norm": 0.4603363514517138, + "learning_rate": 4.3189016325138155e-06, + "loss": 2.2692, + "step": 1427 + }, + { + "epoch": 0.5508196721311476, + "grad_norm": 0.41660294234922207, + "learning_rate": 4.312804366456851e-06, + "loss": 2.3322, + "step": 1428 + }, + { + "epoch": 0.551205400192864, + "grad_norm": 0.43506495250256916, + "learning_rate": 4.306708141802687e-06, + "loss": 2.3301, + "step": 1429 + }, + { + "epoch": 0.5515911282545806, + "grad_norm": 0.4453736574375762, + "learning_rate": 4.30061296778978e-06, + "loss": 2.2675, + "step": 1430 + }, + { + "epoch": 0.551976856316297, + "grad_norm": 0.4904730112987836, + "learning_rate": 4.294518853654991e-06, + "loss": 2.3282, + "step": 1431 + }, + { + "epoch": 0.5523625843780136, + "grad_norm": 0.45455508963296065, + "learning_rate": 4.2884258086335755e-06, + "loss": 2.3528, + "step": 1432 + }, + { + "epoch": 0.55274831243973, + "grad_norm": 0.4180507969561658, + "learning_rate": 4.282333841959171e-06, + "loss": 2.2981, + "step": 1433 + }, + { + "epoch": 0.5531340405014464, + "grad_norm": 0.45184851358234884, + "learning_rate": 4.276242962863775e-06, + "loss": 2.3093, + "step": 1434 + }, + { + "epoch": 0.553519768563163, + "grad_norm": 0.4554037815490411, + "learning_rate": 4.270153180577746e-06, + "loss": 2.3091, + "step": 1435 + }, + { + "epoch": 0.5539054966248794, + "grad_norm": 0.4346303129201663, + "learning_rate": 4.2640645043297715e-06, + "loss": 2.3912, + "step": 1436 + }, + { + "epoch": 0.554291224686596, + "grad_norm": 0.45225269018483133, + "learning_rate": 4.25797694334687e-06, + "loss": 2.2834, + "step": 1437 + }, + { + "epoch": 0.5546769527483124, + "grad_norm": 0.4084486155678106, + "learning_rate": 4.251890506854365e-06, + "loss": 2.3305, + "step": 1438 + }, + { + "epoch": 0.555062680810029, + "grad_norm": 0.4575803527285325, + "learning_rate": 4.2458052040758746e-06, + "loss": 2.3479, + "step": 1439 + }, + { + "epoch": 0.5554484088717454, + "grad_norm": 0.44545457389858556, + "learning_rate": 4.239721044233306e-06, + "loss": 2.3287, + "step": 1440 + }, + { + "epoch": 0.555834136933462, + "grad_norm": 0.409063828267486, + "learning_rate": 4.2336380365468274e-06, + "loss": 2.2689, + "step": 1441 + }, + { + "epoch": 0.5562198649951784, + "grad_norm": 0.43152638314173347, + "learning_rate": 4.227556190234864e-06, + "loss": 2.3285, + "step": 1442 + }, + { + "epoch": 0.5566055930568949, + "grad_norm": 0.43376742895032044, + "learning_rate": 4.22147551451408e-06, + "loss": 2.2717, + "step": 
1443 + }, + { + "epoch": 0.5569913211186114, + "grad_norm": 0.42210336592990916, + "learning_rate": 4.215396018599369e-06, + "loss": 2.3696, + "step": 1444 + }, + { + "epoch": 0.5573770491803278, + "grad_norm": 0.4182398682850248, + "learning_rate": 4.209317711703829e-06, + "loss": 2.3391, + "step": 1445 + }, + { + "epoch": 0.5577627772420444, + "grad_norm": 0.4387139655900613, + "learning_rate": 4.203240603038768e-06, + "loss": 2.3068, + "step": 1446 + }, + { + "epoch": 0.5581485053037608, + "grad_norm": 0.4550086921214388, + "learning_rate": 4.197164701813665e-06, + "loss": 2.2963, + "step": 1447 + }, + { + "epoch": 0.5585342333654774, + "grad_norm": 0.44569448498905484, + "learning_rate": 4.191090017236177e-06, + "loss": 2.3024, + "step": 1448 + }, + { + "epoch": 0.5589199614271938, + "grad_norm": 0.4494128519533429, + "learning_rate": 4.185016558512117e-06, + "loss": 2.271, + "step": 1449 + }, + { + "epoch": 0.5593056894889104, + "grad_norm": 0.4208880799036616, + "learning_rate": 4.178944334845438e-06, + "loss": 2.3638, + "step": 1450 + }, + { + "epoch": 0.5596914175506268, + "grad_norm": 0.43348786281976237, + "learning_rate": 4.1728733554382204e-06, + "loss": 2.3346, + "step": 1451 + }, + { + "epoch": 0.5600771456123433, + "grad_norm": 0.4418399784800928, + "learning_rate": 4.166803629490664e-06, + "loss": 2.2924, + "step": 1452 + }, + { + "epoch": 0.5604628736740598, + "grad_norm": 0.44668162646159176, + "learning_rate": 4.160735166201063e-06, + "loss": 2.3616, + "step": 1453 + }, + { + "epoch": 0.5608486017357763, + "grad_norm": 0.4354460375268765, + "learning_rate": 4.1546679747658045e-06, + "loss": 2.3326, + "step": 1454 + }, + { + "epoch": 0.5612343297974928, + "grad_norm": 0.4142044774335118, + "learning_rate": 4.148602064379342e-06, + "loss": 2.3641, + "step": 1455 + }, + { + "epoch": 0.5616200578592092, + "grad_norm": 0.4640451050436532, + "learning_rate": 4.142537444234192e-06, + "loss": 2.324, + "step": 1456 + }, + { + "epoch": 0.5620057859209258, + "grad_norm": 0.43337984727739914, + "learning_rate": 4.136474123520913e-06, + "loss": 2.3159, + "step": 1457 + }, + { + "epoch": 0.5623915139826422, + "grad_norm": 0.42751653282762, + "learning_rate": 4.1304121114280946e-06, + "loss": 2.3692, + "step": 1458 + }, + { + "epoch": 0.5627772420443587, + "grad_norm": 0.4462906076451016, + "learning_rate": 4.1243514171423465e-06, + "loss": 2.3026, + "step": 1459 + }, + { + "epoch": 0.5631629701060752, + "grad_norm": 0.4535928576994934, + "learning_rate": 4.118292049848277e-06, + "loss": 2.2959, + "step": 1460 + }, + { + "epoch": 0.5635486981677917, + "grad_norm": 0.4306687463240508, + "learning_rate": 4.1122340187284845e-06, + "loss": 2.31, + "step": 1461 + }, + { + "epoch": 0.5639344262295082, + "grad_norm": 0.43039521461340857, + "learning_rate": 4.106177332963544e-06, + "loss": 2.281, + "step": 1462 + }, + { + "epoch": 0.5643201542912247, + "grad_norm": 0.4026555682467883, + "learning_rate": 4.100122001731993e-06, + "loss": 2.3083, + "step": 1463 + }, + { + "epoch": 0.5647058823529412, + "grad_norm": 0.429036499023987, + "learning_rate": 4.094068034210313e-06, + "loss": 2.3539, + "step": 1464 + }, + { + "epoch": 0.5650916104146577, + "grad_norm": 0.4221609577666498, + "learning_rate": 4.088015439572919e-06, + "loss": 2.379, + "step": 1465 + }, + { + "epoch": 0.5654773384763742, + "grad_norm": 0.4298721214257128, + "learning_rate": 4.081964226992149e-06, + "loss": 2.2893, + "step": 1466 + }, + { + "epoch": 0.5658630665380906, + "grad_norm": 0.4518357335488598, + "learning_rate": 
4.0759144056382426e-06, + "loss": 2.4006, + "step": 1467 + }, + { + "epoch": 0.5662487945998071, + "grad_norm": 0.4362484857989554, + "learning_rate": 4.069865984679332e-06, + "loss": 2.2697, + "step": 1468 + }, + { + "epoch": 0.5666345226615236, + "grad_norm": 0.47563962927912806, + "learning_rate": 4.06381897328143e-06, + "loss": 2.3475, + "step": 1469 + }, + { + "epoch": 0.5670202507232401, + "grad_norm": 0.4090665480564414, + "learning_rate": 4.057773380608411e-06, + "loss": 2.3633, + "step": 1470 + }, + { + "epoch": 0.5674059787849566, + "grad_norm": 0.4041220890931108, + "learning_rate": 4.051729215821995e-06, + "loss": 2.2962, + "step": 1471 + }, + { + "epoch": 0.5677917068466731, + "grad_norm": 0.43674579923533235, + "learning_rate": 4.045686488081748e-06, + "loss": 2.3277, + "step": 1472 + }, + { + "epoch": 0.5681774349083896, + "grad_norm": 0.44877559752623075, + "learning_rate": 4.039645206545053e-06, + "loss": 2.3456, + "step": 1473 + }, + { + "epoch": 0.5685631629701061, + "grad_norm": 0.433832854221165, + "learning_rate": 4.033605380367099e-06, + "loss": 2.3341, + "step": 1474 + }, + { + "epoch": 0.5689488910318226, + "grad_norm": 0.46945090750470925, + "learning_rate": 4.027567018700873e-06, + "loss": 2.3741, + "step": 1475 + }, + { + "epoch": 0.5693346190935391, + "grad_norm": 0.4276102055477122, + "learning_rate": 4.021530130697141e-06, + "loss": 2.3246, + "step": 1476 + }, + { + "epoch": 0.5697203471552555, + "grad_norm": 0.42626648975286313, + "learning_rate": 4.015494725504435e-06, + "loss": 2.2405, + "step": 1477 + }, + { + "epoch": 0.570106075216972, + "grad_norm": 0.44367775635689727, + "learning_rate": 4.009460812269045e-06, + "loss": 2.3604, + "step": 1478 + }, + { + "epoch": 0.5704918032786885, + "grad_norm": 0.4082484041938663, + "learning_rate": 4.003428400134992e-06, + "loss": 2.3743, + "step": 1479 + }, + { + "epoch": 0.570877531340405, + "grad_norm": 0.43933866170044744, + "learning_rate": 3.997397498244028e-06, + "loss": 2.3001, + "step": 1480 + }, + { + "epoch": 0.5712632594021215, + "grad_norm": 0.43375151507001325, + "learning_rate": 3.991368115735612e-06, + "loss": 2.3359, + "step": 1481 + }, + { + "epoch": 0.571648987463838, + "grad_norm": 0.43964106653478796, + "learning_rate": 3.985340261746909e-06, + "loss": 2.3816, + "step": 1482 + }, + { + "epoch": 0.5720347155255545, + "grad_norm": 0.4363234931572726, + "learning_rate": 3.979313945412758e-06, + "loss": 2.3066, + "step": 1483 + }, + { + "epoch": 0.572420443587271, + "grad_norm": 0.4252315319429752, + "learning_rate": 3.97328917586567e-06, + "loss": 2.2941, + "step": 1484 + }, + { + "epoch": 0.5728061716489875, + "grad_norm": 0.4611132439778421, + "learning_rate": 3.9672659622358175e-06, + "loss": 2.2804, + "step": 1485 + }, + { + "epoch": 0.573191899710704, + "grad_norm": 0.47487041414747433, + "learning_rate": 3.961244313651008e-06, + "loss": 2.3275, + "step": 1486 + }, + { + "epoch": 0.5735776277724205, + "grad_norm": 0.44979611706443356, + "learning_rate": 3.955224239236681e-06, + "loss": 2.3187, + "step": 1487 + }, + { + "epoch": 0.5739633558341369, + "grad_norm": 0.4524099895055111, + "learning_rate": 3.9492057481158905e-06, + "loss": 2.3436, + "step": 1488 + }, + { + "epoch": 0.5743490838958534, + "grad_norm": 0.40135274768366935, + "learning_rate": 3.943188849409289e-06, + "loss": 2.3494, + "step": 1489 + }, + { + "epoch": 0.5747348119575699, + "grad_norm": 0.43440989096021043, + "learning_rate": 3.937173552235117e-06, + "loss": 2.2697, + "step": 1490 + }, + { + "epoch": 0.5751205400192864, 
+ "grad_norm": 0.4704618668381495, + "learning_rate": 3.9311598657091895e-06, + "loss": 2.321, + "step": 1491 + }, + { + "epoch": 0.5755062680810029, + "grad_norm": 0.42035450557088433, + "learning_rate": 3.92514779894488e-06, + "loss": 2.3282, + "step": 1492 + }, + { + "epoch": 0.5758919961427194, + "grad_norm": 0.44099448828389753, + "learning_rate": 3.919137361053105e-06, + "loss": 2.3271, + "step": 1493 + }, + { + "epoch": 0.5762777242044359, + "grad_norm": 0.43312368312817023, + "learning_rate": 3.913128561142315e-06, + "loss": 2.2885, + "step": 1494 + }, + { + "epoch": 0.5766634522661523, + "grad_norm": 0.42701973541052424, + "learning_rate": 3.907121408318478e-06, + "loss": 2.3509, + "step": 1495 + }, + { + "epoch": 0.5770491803278689, + "grad_norm": 0.43036889157362906, + "learning_rate": 3.901115911685063e-06, + "loss": 2.3325, + "step": 1496 + }, + { + "epoch": 0.5774349083895853, + "grad_norm": 0.422898379339551, + "learning_rate": 3.895112080343033e-06, + "loss": 2.3412, + "step": 1497 + }, + { + "epoch": 0.5778206364513019, + "grad_norm": 0.42172624395411484, + "learning_rate": 3.889109923390827e-06, + "loss": 2.3041, + "step": 1498 + }, + { + "epoch": 0.5782063645130183, + "grad_norm": 0.413562412674589, + "learning_rate": 3.8831094499243425e-06, + "loss": 2.2498, + "step": 1499 + }, + { + "epoch": 0.5785920925747348, + "grad_norm": 0.439426523547599, + "learning_rate": 3.877110669036932e-06, + "loss": 2.295, + "step": 1500 + }, + { + "epoch": 0.5789778206364513, + "grad_norm": 0.45232269736121317, + "learning_rate": 3.8711135898193775e-06, + "loss": 2.2968, + "step": 1501 + }, + { + "epoch": 0.5793635486981678, + "grad_norm": 0.43144290461141577, + "learning_rate": 3.865118221359887e-06, + "loss": 2.2932, + "step": 1502 + }, + { + "epoch": 0.5797492767598843, + "grad_norm": 0.4824244549953793, + "learning_rate": 3.859124572744072e-06, + "loss": 2.3413, + "step": 1503 + }, + { + "epoch": 0.5801350048216007, + "grad_norm": 0.44213543325579097, + "learning_rate": 3.853132653054936e-06, + "loss": 2.4002, + "step": 1504 + }, + { + "epoch": 0.5805207328833173, + "grad_norm": 0.4425691507108305, + "learning_rate": 3.84714247137287e-06, + "loss": 2.3816, + "step": 1505 + }, + { + "epoch": 0.5809064609450337, + "grad_norm": 0.42064681797782016, + "learning_rate": 3.841154036775626e-06, + "loss": 2.3163, + "step": 1506 + }, + { + "epoch": 0.5812921890067503, + "grad_norm": 0.4354270633194001, + "learning_rate": 3.8351673583383055e-06, + "loss": 2.3095, + "step": 1507 + }, + { + "epoch": 0.5816779170684667, + "grad_norm": 0.45282359045318643, + "learning_rate": 3.829182445133356e-06, + "loss": 2.2697, + "step": 1508 + }, + { + "epoch": 0.5820636451301833, + "grad_norm": 0.4414981823819402, + "learning_rate": 3.823199306230543e-06, + "loss": 2.3051, + "step": 1509 + }, + { + "epoch": 0.5824493731918997, + "grad_norm": 0.458287159992059, + "learning_rate": 3.8172179506969495e-06, + "loss": 2.3139, + "step": 1510 + }, + { + "epoch": 0.5828351012536162, + "grad_norm": 0.49019278650383025, + "learning_rate": 3.811238387596951e-06, + "loss": 2.3708, + "step": 1511 + }, + { + "epoch": 0.5832208293153327, + "grad_norm": 0.42237821459993796, + "learning_rate": 3.8052606259922097e-06, + "loss": 2.3203, + "step": 1512 + }, + { + "epoch": 0.5836065573770491, + "grad_norm": 0.4739834488460313, + "learning_rate": 3.7992846749416536e-06, + "loss": 2.3406, + "step": 1513 + }, + { + "epoch": 0.5839922854387657, + "grad_norm": 0.4307938570749805, + "learning_rate": 3.7933105435014727e-06, + "loss": 
2.2845, + "step": 1514 + }, + { + "epoch": 0.5843780135004821, + "grad_norm": 0.45217825822154756, + "learning_rate": 3.787338240725095e-06, + "loss": 2.4072, + "step": 1515 + }, + { + "epoch": 0.5847637415621987, + "grad_norm": 0.44028592257351734, + "learning_rate": 3.7813677756631773e-06, + "loss": 2.3072, + "step": 1516 + }, + { + "epoch": 0.5851494696239151, + "grad_norm": 0.43745662531069157, + "learning_rate": 3.775399157363596e-06, + "loss": 2.3843, + "step": 1517 + }, + { + "epoch": 0.5855351976856317, + "grad_norm": 0.4136627009639538, + "learning_rate": 3.7694323948714223e-06, + "loss": 2.3225, + "step": 1518 + }, + { + "epoch": 0.5859209257473481, + "grad_norm": 0.4502576623862565, + "learning_rate": 3.7634674972289227e-06, + "loss": 2.2701, + "step": 1519 + }, + { + "epoch": 0.5863066538090647, + "grad_norm": 0.46431712852149626, + "learning_rate": 3.75750447347553e-06, + "loss": 2.2981, + "step": 1520 + }, + { + "epoch": 0.5866923818707811, + "grad_norm": 0.568350545022941, + "learning_rate": 3.7515433326478435e-06, + "loss": 2.291, + "step": 1521 + }, + { + "epoch": 0.5870781099324975, + "grad_norm": 0.45444458005389265, + "learning_rate": 3.745584083779604e-06, + "loss": 2.2699, + "step": 1522 + }, + { + "epoch": 0.5874638379942141, + "grad_norm": 0.4199450739154832, + "learning_rate": 3.7396267359016867e-06, + "loss": 2.3459, + "step": 1523 + }, + { + "epoch": 0.5878495660559305, + "grad_norm": 0.4202023743038374, + "learning_rate": 3.7336712980420897e-06, + "loss": 2.2784, + "step": 1524 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.46221305351333675, + "learning_rate": 3.727717779225912e-06, + "loss": 2.3683, + "step": 1525 + }, + { + "epoch": 0.5886210221793635, + "grad_norm": 0.4351128959783611, + "learning_rate": 3.721766188475344e-06, + "loss": 2.3124, + "step": 1526 + }, + { + "epoch": 0.5890067502410801, + "grad_norm": 0.4209813229158257, + "learning_rate": 3.71581653480966e-06, + "loss": 2.3296, + "step": 1527 + }, + { + "epoch": 0.5893924783027965, + "grad_norm": 0.42243311573698344, + "learning_rate": 3.7098688272451893e-06, + "loss": 2.3514, + "step": 1528 + }, + { + "epoch": 0.5897782063645131, + "grad_norm": 0.43099090382210853, + "learning_rate": 3.7039230747953236e-06, + "loss": 2.3592, + "step": 1529 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 0.4354140278443831, + "learning_rate": 3.6979792864704832e-06, + "loss": 2.2796, + "step": 1530 + }, + { + "epoch": 0.590549662487946, + "grad_norm": 0.4195720074531869, + "learning_rate": 3.692037471278115e-06, + "loss": 2.2625, + "step": 1531 + }, + { + "epoch": 0.5909353905496625, + "grad_norm": 0.4213217187229761, + "learning_rate": 3.6860976382226747e-06, + "loss": 2.2571, + "step": 1532 + }, + { + "epoch": 0.5913211186113789, + "grad_norm": 0.4359726710019312, + "learning_rate": 3.680159796305614e-06, + "loss": 2.3521, + "step": 1533 + }, + { + "epoch": 0.5917068466730955, + "grad_norm": 0.4327228331680235, + "learning_rate": 3.67422395452537e-06, + "loss": 2.4329, + "step": 1534 + }, + { + "epoch": 0.5920925747348119, + "grad_norm": 0.4576120431790627, + "learning_rate": 3.6682901218773446e-06, + "loss": 2.2853, + "step": 1535 + }, + { + "epoch": 0.5924783027965285, + "grad_norm": 0.4444798965167098, + "learning_rate": 3.662358307353897e-06, + "loss": 2.3421, + "step": 1536 + }, + { + "epoch": 0.5928640308582449, + "grad_norm": 0.4699482709499635, + "learning_rate": 3.6564285199443274e-06, + "loss": 2.3009, + "step": 1537 + }, + { + "epoch": 0.5932497589199615, + "grad_norm": 
0.4306699759632948, + "learning_rate": 3.650500768634868e-06, + "loss": 2.299, + "step": 1538 + }, + { + "epoch": 0.5936354869816779, + "grad_norm": 0.45327480781505264, + "learning_rate": 3.64457506240866e-06, + "loss": 2.3126, + "step": 1539 + }, + { + "epoch": 0.5940212150433944, + "grad_norm": 0.451769483636293, + "learning_rate": 3.638651410245746e-06, + "loss": 2.3094, + "step": 1540 + }, + { + "epoch": 0.5944069431051109, + "grad_norm": 0.4721999596669721, + "learning_rate": 3.63272982112306e-06, + "loss": 2.3099, + "step": 1541 + }, + { + "epoch": 0.5947926711668274, + "grad_norm": 0.3961455522079624, + "learning_rate": 3.6268103040144053e-06, + "loss": 2.3102, + "step": 1542 + }, + { + "epoch": 0.5951783992285439, + "grad_norm": 0.4156035408585732, + "learning_rate": 3.6208928678904463e-06, + "loss": 2.353, + "step": 1543 + }, + { + "epoch": 0.5955641272902603, + "grad_norm": 0.42120161369983333, + "learning_rate": 3.6149775217186954e-06, + "loss": 2.2909, + "step": 1544 + }, + { + "epoch": 0.5959498553519769, + "grad_norm": 0.41454445097440046, + "learning_rate": 3.6090642744634953e-06, + "loss": 2.284, + "step": 1545 + }, + { + "epoch": 0.5963355834136933, + "grad_norm": 0.3927789206783352, + "learning_rate": 3.6031531350860094e-06, + "loss": 2.3265, + "step": 1546 + }, + { + "epoch": 0.5967213114754099, + "grad_norm": 0.4091783102629233, + "learning_rate": 3.597244112544208e-06, + "loss": 2.2583, + "step": 1547 + }, + { + "epoch": 0.5971070395371263, + "grad_norm": 0.4148339546574294, + "learning_rate": 3.5913372157928515e-06, + "loss": 2.2539, + "step": 1548 + }, + { + "epoch": 0.5974927675988428, + "grad_norm": 0.41068301438559474, + "learning_rate": 3.585432453783479e-06, + "loss": 2.3277, + "step": 1549 + }, + { + "epoch": 0.5978784956605593, + "grad_norm": 0.46476593432998486, + "learning_rate": 3.5795298354643952e-06, + "loss": 2.3157, + "step": 1550 + }, + { + "epoch": 0.5982642237222758, + "grad_norm": 0.45193568556648994, + "learning_rate": 3.573629369780656e-06, + "loss": 2.3242, + "step": 1551 + }, + { + "epoch": 0.5986499517839923, + "grad_norm": 0.4066513892913827, + "learning_rate": 3.5677310656740537e-06, + "loss": 2.3037, + "step": 1552 + }, + { + "epoch": 0.5990356798457088, + "grad_norm": 0.39607445972972627, + "learning_rate": 3.561834932083108e-06, + "loss": 2.2809, + "step": 1553 + }, + { + "epoch": 0.5994214079074253, + "grad_norm": 0.4259688436253283, + "learning_rate": 3.5559409779430467e-06, + "loss": 2.3362, + "step": 1554 + }, + { + "epoch": 0.5998071359691417, + "grad_norm": 0.44233653847468657, + "learning_rate": 3.550049212185794e-06, + "loss": 2.3335, + "step": 1555 + }, + { + "epoch": 0.6001928640308583, + "grad_norm": 0.40783594510679305, + "learning_rate": 3.5441596437399596e-06, + "loss": 2.3427, + "step": 1556 + }, + { + "epoch": 0.6005785920925747, + "grad_norm": 0.5203177021711052, + "learning_rate": 3.5382722815308256e-06, + "loss": 2.3203, + "step": 1557 + }, + { + "epoch": 0.6009643201542912, + "grad_norm": 0.4106556534023462, + "learning_rate": 3.532387134480327e-06, + "loss": 2.2962, + "step": 1558 + }, + { + "epoch": 0.6013500482160077, + "grad_norm": 0.4222949937445898, + "learning_rate": 3.526504211507041e-06, + "loss": 2.2522, + "step": 1559 + }, + { + "epoch": 0.6017357762777242, + "grad_norm": 0.4389117767073239, + "learning_rate": 3.5206235215261785e-06, + "loss": 2.2762, + "step": 1560 + }, + { + "epoch": 0.6021215043394407, + "grad_norm": 0.41557848167566935, + "learning_rate": 3.5147450734495635e-06, + "loss": 2.2869, + 
"step": 1561 + }, + { + "epoch": 0.6025072324011572, + "grad_norm": 0.42102431393771766, + "learning_rate": 3.5088688761856215e-06, + "loss": 2.2785, + "step": 1562 + }, + { + "epoch": 0.6028929604628737, + "grad_norm": 0.42418048176304096, + "learning_rate": 3.5029949386393725e-06, + "loss": 2.307, + "step": 1563 + }, + { + "epoch": 0.6032786885245902, + "grad_norm": 0.4796721034987809, + "learning_rate": 3.4971232697124046e-06, + "loss": 2.3422, + "step": 1564 + }, + { + "epoch": 0.6036644165863067, + "grad_norm": 0.4742154173234965, + "learning_rate": 3.491253878302873e-06, + "loss": 2.3337, + "step": 1565 + }, + { + "epoch": 0.6040501446480231, + "grad_norm": 0.436248194948228, + "learning_rate": 3.485386773305479e-06, + "loss": 2.3014, + "step": 1566 + }, + { + "epoch": 0.6044358727097396, + "grad_norm": 0.42815065224421, + "learning_rate": 3.4795219636114642e-06, + "loss": 2.3166, + "step": 1567 + }, + { + "epoch": 0.6048216007714561, + "grad_norm": 0.42212387633519377, + "learning_rate": 3.4736594581085837e-06, + "loss": 2.3011, + "step": 1568 + }, + { + "epoch": 0.6052073288331726, + "grad_norm": 0.45334078792920063, + "learning_rate": 3.4677992656811054e-06, + "loss": 2.3444, + "step": 1569 + }, + { + "epoch": 0.6055930568948891, + "grad_norm": 0.4549394704936667, + "learning_rate": 3.4619413952097925e-06, + "loss": 2.4325, + "step": 1570 + }, + { + "epoch": 0.6059787849566056, + "grad_norm": 0.419176742644126, + "learning_rate": 3.4560858555718877e-06, + "loss": 2.2612, + "step": 1571 + }, + { + "epoch": 0.6063645130183221, + "grad_norm": 0.40484748565623707, + "learning_rate": 3.4502326556411e-06, + "loss": 2.3178, + "step": 1572 + }, + { + "epoch": 0.6067502410800386, + "grad_norm": 0.4314659069011491, + "learning_rate": 3.4443818042875974e-06, + "loss": 2.3171, + "step": 1573 + }, + { + "epoch": 0.607135969141755, + "grad_norm": 0.4904836199860373, + "learning_rate": 3.438533310377985e-06, + "loss": 2.2796, + "step": 1574 + }, + { + "epoch": 0.6075216972034716, + "grad_norm": 0.4249998138728461, + "learning_rate": 3.432687182775294e-06, + "loss": 2.2727, + "step": 1575 + }, + { + "epoch": 0.607907425265188, + "grad_norm": 0.4707558912746351, + "learning_rate": 3.4268434303389747e-06, + "loss": 2.2528, + "step": 1576 + }, + { + "epoch": 0.6082931533269045, + "grad_norm": 0.40487788077121967, + "learning_rate": 3.4210020619248762e-06, + "loss": 2.3378, + "step": 1577 + }, + { + "epoch": 0.608678881388621, + "grad_norm": 0.3928419411927058, + "learning_rate": 3.4151630863852315e-06, + "loss": 2.3446, + "step": 1578 + }, + { + "epoch": 0.6090646094503375, + "grad_norm": 0.4121647963429338, + "learning_rate": 3.4093265125686494e-06, + "loss": 2.3299, + "step": 1579 + }, + { + "epoch": 0.609450337512054, + "grad_norm": 0.4479085650162451, + "learning_rate": 3.403492349320101e-06, + "loss": 2.3532, + "step": 1580 + }, + { + "epoch": 0.6098360655737705, + "grad_norm": 0.4142417050318523, + "learning_rate": 3.3976606054809015e-06, + "loss": 2.3442, + "step": 1581 + }, + { + "epoch": 0.610221793635487, + "grad_norm": 0.42194842190876514, + "learning_rate": 3.391831289888701e-06, + "loss": 2.3086, + "step": 1582 + }, + { + "epoch": 0.6106075216972034, + "grad_norm": 0.42552892598640235, + "learning_rate": 3.38600441137747e-06, + "loss": 2.3675, + "step": 1583 + }, + { + "epoch": 0.61099324975892, + "grad_norm": 0.4451008561962209, + "learning_rate": 3.380179978777482e-06, + "loss": 2.3745, + "step": 1584 + }, + { + "epoch": 0.6113789778206364, + "grad_norm": 0.4307186268643373, + 
"learning_rate": 3.3743580009153122e-06, + "loss": 2.3006, + "step": 1585 + }, + { + "epoch": 0.611764705882353, + "grad_norm": 0.41560471224439033, + "learning_rate": 3.368538486613807e-06, + "loss": 2.3105, + "step": 1586 + }, + { + "epoch": 0.6121504339440694, + "grad_norm": 0.4508243452795475, + "learning_rate": 3.362721444692086e-06, + "loss": 2.3381, + "step": 1587 + }, + { + "epoch": 0.6125361620057859, + "grad_norm": 0.3995941055290941, + "learning_rate": 3.356906883965516e-06, + "loss": 2.3195, + "step": 1588 + }, + { + "epoch": 0.6129218900675024, + "grad_norm": 0.41518630090251907, + "learning_rate": 3.3510948132457087e-06, + "loss": 2.2516, + "step": 1589 + }, + { + "epoch": 0.6133076181292189, + "grad_norm": 0.40103288319382635, + "learning_rate": 3.3452852413405014e-06, + "loss": 2.2795, + "step": 1590 + }, + { + "epoch": 0.6136933461909354, + "grad_norm": 0.4185829392452625, + "learning_rate": 3.3394781770539406e-06, + "loss": 2.2732, + "step": 1591 + }, + { + "epoch": 0.6140790742526518, + "grad_norm": 0.4689714856626146, + "learning_rate": 3.33367362918628e-06, + "loss": 2.3501, + "step": 1592 + }, + { + "epoch": 0.6144648023143684, + "grad_norm": 0.4360872226256897, + "learning_rate": 3.32787160653395e-06, + "loss": 2.4018, + "step": 1593 + }, + { + "epoch": 0.6148505303760848, + "grad_norm": 0.447666636095158, + "learning_rate": 3.3220721178895658e-06, + "loss": 2.3115, + "step": 1594 + }, + { + "epoch": 0.6152362584378014, + "grad_norm": 0.4214603591205362, + "learning_rate": 3.316275172041893e-06, + "loss": 2.2318, + "step": 1595 + }, + { + "epoch": 0.6156219864995178, + "grad_norm": 0.41757506919655035, + "learning_rate": 3.3104807777758487e-06, + "loss": 2.3058, + "step": 1596 + }, + { + "epoch": 0.6160077145612344, + "grad_norm": 0.41339639782568854, + "learning_rate": 3.3046889438724805e-06, + "loss": 2.2834, + "step": 1597 + }, + { + "epoch": 0.6163934426229508, + "grad_norm": 0.4464343403324317, + "learning_rate": 3.2988996791089573e-06, + "loss": 2.2788, + "step": 1598 + }, + { + "epoch": 0.6167791706846673, + "grad_norm": 0.40466299123420946, + "learning_rate": 3.293112992258556e-06, + "loss": 2.2974, + "step": 1599 + }, + { + "epoch": 0.6171648987463838, + "grad_norm": 0.40325913870382823, + "learning_rate": 3.2873288920906436e-06, + "loss": 2.242, + "step": 1600 + }, + { + "epoch": 0.6175506268081002, + "grad_norm": 0.42192819850604385, + "learning_rate": 3.2815473873706696e-06, + "loss": 2.3584, + "step": 1601 + }, + { + "epoch": 0.6179363548698168, + "grad_norm": 0.42561536777423264, + "learning_rate": 3.275768486860149e-06, + "loss": 2.3795, + "step": 1602 + }, + { + "epoch": 0.6183220829315332, + "grad_norm": 0.42857073166383797, + "learning_rate": 3.2699921993166508e-06, + "loss": 2.3295, + "step": 1603 + }, + { + "epoch": 0.6187078109932498, + "grad_norm": 0.4066147316249197, + "learning_rate": 3.2642185334937853e-06, + "loss": 2.2482, + "step": 1604 + }, + { + "epoch": 0.6190935390549662, + "grad_norm": 0.44300196333196484, + "learning_rate": 3.2584474981411874e-06, + "loss": 2.3715, + "step": 1605 + }, + { + "epoch": 0.6194792671166828, + "grad_norm": 0.4148539593484898, + "learning_rate": 3.252679102004509e-06, + "loss": 2.299, + "step": 1606 + }, + { + "epoch": 0.6198649951783992, + "grad_norm": 0.43327244941421494, + "learning_rate": 3.2469133538253983e-06, + "loss": 2.3321, + "step": 1607 + }, + { + "epoch": 0.6202507232401158, + "grad_norm": 0.4449484874248776, + "learning_rate": 3.2411502623414925e-06, + "loss": 2.2933, + "step": 1608 + }, + { + 
"epoch": 0.6206364513018322, + "grad_norm": 0.4342425826361147, + "learning_rate": 3.2353898362864055e-06, + "loss": 2.3152, + "step": 1609 + }, + { + "epoch": 0.6210221793635486, + "grad_norm": 0.4821905887515583, + "learning_rate": 3.229632084389708e-06, + "loss": 2.3511, + "step": 1610 + }, + { + "epoch": 0.6214079074252652, + "grad_norm": 0.45075055098796424, + "learning_rate": 3.2238770153769173e-06, + "loss": 2.2534, + "step": 1611 + }, + { + "epoch": 0.6217936354869816, + "grad_norm": 0.43457470407406457, + "learning_rate": 3.2181246379694886e-06, + "loss": 2.3579, + "step": 1612 + }, + { + "epoch": 0.6221793635486982, + "grad_norm": 0.4398445075795343, + "learning_rate": 3.2123749608847998e-06, + "loss": 2.3447, + "step": 1613 + }, + { + "epoch": 0.6225650916104146, + "grad_norm": 0.4357463212834921, + "learning_rate": 3.206627992836131e-06, + "loss": 2.2671, + "step": 1614 + }, + { + "epoch": 0.6229508196721312, + "grad_norm": 0.4602686135319776, + "learning_rate": 3.200883742532659e-06, + "loss": 2.3401, + "step": 1615 + }, + { + "epoch": 0.6233365477338476, + "grad_norm": 0.4498777315680553, + "learning_rate": 3.1951422186794447e-06, + "loss": 2.3381, + "step": 1616 + }, + { + "epoch": 0.6237222757955642, + "grad_norm": 0.45243899257172177, + "learning_rate": 3.1894034299774125e-06, + "loss": 2.3527, + "step": 1617 + }, + { + "epoch": 0.6241080038572806, + "grad_norm": 0.4125076055492351, + "learning_rate": 3.183667385123346e-06, + "loss": 2.3594, + "step": 1618 + }, + { + "epoch": 0.6244937319189972, + "grad_norm": 0.41444805052369155, + "learning_rate": 3.1779340928098695e-06, + "loss": 2.2522, + "step": 1619 + }, + { + "epoch": 0.6248794599807136, + "grad_norm": 0.4402574577616683, + "learning_rate": 3.1722035617254333e-06, + "loss": 2.3001, + "step": 1620 + }, + { + "epoch": 0.6252651880424301, + "grad_norm": 0.39364272113708715, + "learning_rate": 3.1664758005543072e-06, + "loss": 2.2133, + "step": 1621 + }, + { + "epoch": 0.6256509161041466, + "grad_norm": 0.4447946352808919, + "learning_rate": 3.160750817976562e-06, + "loss": 2.3014, + "step": 1622 + }, + { + "epoch": 0.626036644165863, + "grad_norm": 0.41953913275683713, + "learning_rate": 3.1550286226680576e-06, + "loss": 2.3584, + "step": 1623 + }, + { + "epoch": 0.6264223722275796, + "grad_norm": 0.40785748215954465, + "learning_rate": 3.149309223300428e-06, + "loss": 2.3599, + "step": 1624 + }, + { + "epoch": 0.626808100289296, + "grad_norm": 0.4490579214152913, + "learning_rate": 3.1435926285410747e-06, + "loss": 2.4041, + "step": 1625 + }, + { + "epoch": 0.6271938283510126, + "grad_norm": 0.4127887227261846, + "learning_rate": 3.137878847053143e-06, + "loss": 2.3671, + "step": 1626 + }, + { + "epoch": 0.627579556412729, + "grad_norm": 0.4481331305889832, + "learning_rate": 3.1321678874955193e-06, + "loss": 2.3999, + "step": 1627 + }, + { + "epoch": 0.6279652844744456, + "grad_norm": 0.4150063152135012, + "learning_rate": 3.126459758522813e-06, + "loss": 2.2777, + "step": 1628 + }, + { + "epoch": 0.628351012536162, + "grad_norm": 0.449950999952686, + "learning_rate": 3.120754468785343e-06, + "loss": 2.2863, + "step": 1629 + }, + { + "epoch": 0.6287367405978785, + "grad_norm": 0.4104039989836355, + "learning_rate": 3.115052026929124e-06, + "loss": 2.274, + "step": 1630 + }, + { + "epoch": 0.629122468659595, + "grad_norm": 0.4253221247881812, + "learning_rate": 3.1093524415958576e-06, + "loss": 2.3231, + "step": 1631 + }, + { + "epoch": 0.6295081967213115, + "grad_norm": 0.441253477244187, + "learning_rate": 
3.103655721422917e-06, + "loss": 2.2865, + "step": 1632 + }, + { + "epoch": 0.629893924783028, + "grad_norm": 0.434903455606142, + "learning_rate": 3.097961875043331e-06, + "loss": 2.3141, + "step": 1633 + }, + { + "epoch": 0.6302796528447444, + "grad_norm": 0.402970101599088, + "learning_rate": 3.0922709110857727e-06, + "loss": 2.3473, + "step": 1634 + }, + { + "epoch": 0.630665380906461, + "grad_norm": 0.5255032149638728, + "learning_rate": 3.0865828381745515e-06, + "loss": 2.3338, + "step": 1635 + }, + { + "epoch": 0.6310511089681774, + "grad_norm": 0.5375959149946876, + "learning_rate": 3.080897664929592e-06, + "loss": 2.3365, + "step": 1636 + }, + { + "epoch": 0.631436837029894, + "grad_norm": 0.4437236135709563, + "learning_rate": 3.0752153999664225e-06, + "loss": 2.3017, + "step": 1637 + }, + { + "epoch": 0.6318225650916104, + "grad_norm": 0.4140603788140724, + "learning_rate": 3.06953605189617e-06, + "loss": 2.2803, + "step": 1638 + }, + { + "epoch": 0.6322082931533269, + "grad_norm": 0.40732890731601284, + "learning_rate": 3.0638596293255368e-06, + "loss": 2.3403, + "step": 1639 + }, + { + "epoch": 0.6325940212150434, + "grad_norm": 0.4480598498633505, + "learning_rate": 3.0581861408567907e-06, + "loss": 2.3276, + "step": 1640 + }, + { + "epoch": 0.6329797492767599, + "grad_norm": 0.4449957529966464, + "learning_rate": 3.052515595087759e-06, + "loss": 2.3904, + "step": 1641 + }, + { + "epoch": 0.6333654773384764, + "grad_norm": 0.44263014545792057, + "learning_rate": 3.0468480006118045e-06, + "loss": 2.3563, + "step": 1642 + }, + { + "epoch": 0.6337512054001929, + "grad_norm": 0.40308659294775584, + "learning_rate": 3.041183366017818e-06, + "loss": 2.3224, + "step": 1643 + }, + { + "epoch": 0.6341369334619094, + "grad_norm": 0.47183380223311655, + "learning_rate": 3.035521699890206e-06, + "loss": 2.3157, + "step": 1644 + }, + { + "epoch": 0.6345226615236258, + "grad_norm": 0.41576466733189876, + "learning_rate": 3.029863010808876e-06, + "loss": 2.3774, + "step": 1645 + }, + { + "epoch": 0.6349083895853423, + "grad_norm": 0.4361736521701304, + "learning_rate": 3.0242073073492238e-06, + "loss": 2.3852, + "step": 1646 + }, + { + "epoch": 0.6352941176470588, + "grad_norm": 0.43135315720350514, + "learning_rate": 3.0185545980821207e-06, + "loss": 2.3221, + "step": 1647 + }, + { + "epoch": 0.6356798457087753, + "grad_norm": 0.4385687708863605, + "learning_rate": 3.0129048915739013e-06, + "loss": 2.3825, + "step": 1648 + }, + { + "epoch": 0.6360655737704918, + "grad_norm": 0.4416969053883667, + "learning_rate": 3.007258196386347e-06, + "loss": 2.2408, + "step": 1649 + }, + { + "epoch": 0.6364513018322083, + "grad_norm": 0.4128777848509827, + "learning_rate": 3.00161452107668e-06, + "loss": 2.3332, + "step": 1650 + }, + { + "epoch": 0.6368370298939248, + "grad_norm": 0.453816183858808, + "learning_rate": 2.9959738741975426e-06, + "loss": 2.3545, + "step": 1651 + }, + { + "epoch": 0.6372227579556413, + "grad_norm": 0.40964967583845385, + "learning_rate": 2.9903362642969903e-06, + "loss": 2.2743, + "step": 1652 + }, + { + "epoch": 0.6376084860173578, + "grad_norm": 0.4417719523403045, + "learning_rate": 2.9847016999184746e-06, + "loss": 2.3381, + "step": 1653 + }, + { + "epoch": 0.6379942140790743, + "grad_norm": 0.41947075176552073, + "learning_rate": 2.97907018960083e-06, + "loss": 2.3772, + "step": 1654 + }, + { + "epoch": 0.6383799421407907, + "grad_norm": 0.4365891844676327, + "learning_rate": 2.9734417418782667e-06, + "loss": 2.3256, + "step": 1655 + }, + { + "epoch": 
0.6387656702025072, + "grad_norm": 0.4303482492434974, + "learning_rate": 2.967816365280351e-06, + "loss": 2.3813, + "step": 1656 + }, + { + "epoch": 0.6391513982642237, + "grad_norm": 0.4363287258527234, + "learning_rate": 2.962194068331996e-06, + "loss": 2.3038, + "step": 1657 + }, + { + "epoch": 0.6395371263259402, + "grad_norm": 0.432977695465348, + "learning_rate": 2.956574859553448e-06, + "loss": 2.3224, + "step": 1658 + }, + { + "epoch": 0.6399228543876567, + "grad_norm": 0.47593063120339796, + "learning_rate": 2.9509587474602707e-06, + "loss": 2.3044, + "step": 1659 + }, + { + "epoch": 0.6403085824493732, + "grad_norm": 0.44002954822773555, + "learning_rate": 2.94534574056334e-06, + "loss": 2.2886, + "step": 1660 + }, + { + "epoch": 0.6406943105110897, + "grad_norm": 0.40756469467426576, + "learning_rate": 2.9397358473688232e-06, + "loss": 2.318, + "step": 1661 + }, + { + "epoch": 0.6410800385728062, + "grad_norm": 0.44199679711135886, + "learning_rate": 2.934129076378168e-06, + "loss": 2.3477, + "step": 1662 + }, + { + "epoch": 0.6414657666345227, + "grad_norm": 0.4359941554828112, + "learning_rate": 2.9285254360880922e-06, + "loss": 2.2384, + "step": 1663 + }, + { + "epoch": 0.6418514946962391, + "grad_norm": 0.38720988764246606, + "learning_rate": 2.9229249349905686e-06, + "loss": 2.311, + "step": 1664 + }, + { + "epoch": 0.6422372227579557, + "grad_norm": 0.46557718087737565, + "learning_rate": 2.917327581572812e-06, + "loss": 2.3703, + "step": 1665 + }, + { + "epoch": 0.6426229508196721, + "grad_norm": 0.3984082593840511, + "learning_rate": 2.911733384317269e-06, + "loss": 2.3391, + "step": 1666 + }, + { + "epoch": 0.6430086788813886, + "grad_norm": 0.42206119674386167, + "learning_rate": 2.906142351701603e-06, + "loss": 2.3009, + "step": 1667 + }, + { + "epoch": 0.6433944069431051, + "grad_norm": 0.4217609862574355, + "learning_rate": 2.9005544921986774e-06, + "loss": 2.3729, + "step": 1668 + }, + { + "epoch": 0.6437801350048216, + "grad_norm": 0.39429987610744055, + "learning_rate": 2.8949698142765535e-06, + "loss": 2.268, + "step": 1669 + }, + { + "epoch": 0.6441658630665381, + "grad_norm": 0.4164481989825121, + "learning_rate": 2.889388326398468e-06, + "loss": 2.3098, + "step": 1670 + }, + { + "epoch": 0.6445515911282546, + "grad_norm": 0.40129608423507873, + "learning_rate": 2.8838100370228213e-06, + "loss": 2.3361, + "step": 1671 + }, + { + "epoch": 0.6449373191899711, + "grad_norm": 0.4089424621249286, + "learning_rate": 2.8782349546031673e-06, + "loss": 2.3635, + "step": 1672 + }, + { + "epoch": 0.6453230472516875, + "grad_norm": 0.4240294298016162, + "learning_rate": 2.8726630875882056e-06, + "loss": 2.2852, + "step": 1673 + }, + { + "epoch": 0.6457087753134041, + "grad_norm": 0.4290293267118307, + "learning_rate": 2.867094444421756e-06, + "loss": 2.2841, + "step": 1674 + }, + { + "epoch": 0.6460945033751205, + "grad_norm": 0.4097212990786284, + "learning_rate": 2.861529033542756e-06, + "loss": 2.3201, + "step": 1675 + }, + { + "epoch": 0.6464802314368371, + "grad_norm": 0.40823257837484095, + "learning_rate": 2.8559668633852433e-06, + "loss": 2.3387, + "step": 1676 + }, + { + "epoch": 0.6468659594985535, + "grad_norm": 0.4291351890757635, + "learning_rate": 2.8504079423783443e-06, + "loss": 2.3246, + "step": 1677 + }, + { + "epoch": 0.64725168756027, + "grad_norm": 0.4506378274791692, + "learning_rate": 2.844852278946264e-06, + "loss": 2.3413, + "step": 1678 + }, + { + "epoch": 0.6476374156219865, + "grad_norm": 0.4858719061199603, + "learning_rate": 
2.839299881508272e-06, + "loss": 2.3755, + "step": 1679 + }, + { + "epoch": 0.648023143683703, + "grad_norm": 0.4387548340054685, + "learning_rate": 2.8337507584786826e-06, + "loss": 2.2976, + "step": 1680 + }, + { + "epoch": 0.6484088717454195, + "grad_norm": 0.40231666835311036, + "learning_rate": 2.828204918266852e-06, + "loss": 2.3032, + "step": 1681 + }, + { + "epoch": 0.6487945998071359, + "grad_norm": 0.4303129513109538, + "learning_rate": 2.8226623692771605e-06, + "loss": 2.3042, + "step": 1682 + }, + { + "epoch": 0.6491803278688525, + "grad_norm": 0.44567862759054966, + "learning_rate": 2.817123119909001e-06, + "loss": 2.3626, + "step": 1683 + }, + { + "epoch": 0.6495660559305689, + "grad_norm": 0.41045229346005024, + "learning_rate": 2.811587178556764e-06, + "loss": 2.3123, + "step": 1684 + }, + { + "epoch": 0.6499517839922855, + "grad_norm": 0.42060750551913056, + "learning_rate": 2.8060545536098314e-06, + "loss": 2.3444, + "step": 1685 + }, + { + "epoch": 0.6503375120540019, + "grad_norm": 0.4603147009817396, + "learning_rate": 2.800525253452557e-06, + "loss": 2.3029, + "step": 1686 + }, + { + "epoch": 0.6507232401157185, + "grad_norm": 0.4452232135375676, + "learning_rate": 2.794999286464253e-06, + "loss": 2.344, + "step": 1687 + }, + { + "epoch": 0.6511089681774349, + "grad_norm": 0.4166776719874048, + "learning_rate": 2.789476661019186e-06, + "loss": 2.3088, + "step": 1688 + }, + { + "epoch": 0.6514946962391513, + "grad_norm": 0.4005437799726485, + "learning_rate": 2.7839573854865555e-06, + "loss": 2.3429, + "step": 1689 + }, + { + "epoch": 0.6518804243008679, + "grad_norm": 0.44743394893607225, + "learning_rate": 2.778441468230483e-06, + "loss": 2.3008, + "step": 1690 + }, + { + "epoch": 0.6522661523625843, + "grad_norm": 0.42389007114177785, + "learning_rate": 2.7729289176100026e-06, + "loss": 2.2756, + "step": 1691 + }, + { + "epoch": 0.6526518804243009, + "grad_norm": 0.4551381436945548, + "learning_rate": 2.7674197419790493e-06, + "loss": 2.3439, + "step": 1692 + }, + { + "epoch": 0.6530376084860173, + "grad_norm": 0.4186949915013729, + "learning_rate": 2.761913949686438e-06, + "loss": 2.3103, + "step": 1693 + }, + { + "epoch": 0.6534233365477339, + "grad_norm": 0.4509372293751877, + "learning_rate": 2.75641154907586e-06, + "loss": 2.2772, + "step": 1694 + }, + { + "epoch": 0.6538090646094503, + "grad_norm": 0.46075922985731843, + "learning_rate": 2.7509125484858657e-06, + "loss": 2.2995, + "step": 1695 + }, + { + "epoch": 0.6541947926711669, + "grad_norm": 0.4351647353450828, + "learning_rate": 2.7454169562498503e-06, + "loss": 2.3232, + "step": 1696 + }, + { + "epoch": 0.6545805207328833, + "grad_norm": 0.44116773382830016, + "learning_rate": 2.73992478069605e-06, + "loss": 2.3232, + "step": 1697 + }, + { + "epoch": 0.6549662487945999, + "grad_norm": 0.49166533573126286, + "learning_rate": 2.734436030147517e-06, + "loss": 2.3415, + "step": 1698 + }, + { + "epoch": 0.6553519768563163, + "grad_norm": 0.4109900880916673, + "learning_rate": 2.72895071292212e-06, + "loss": 2.2507, + "step": 1699 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 0.4523684680383627, + "learning_rate": 2.723468837332517e-06, + "loss": 2.3552, + "step": 1700 + }, + { + "epoch": 0.6561234329797493, + "grad_norm": 0.4515322104742252, + "learning_rate": 2.7179904116861557e-06, + "loss": 2.3195, + "step": 1701 + }, + { + "epoch": 0.6565091610414657, + "grad_norm": 0.524366845606669, + "learning_rate": 2.712515444285253e-06, + "loss": 2.3239, + "step": 1702 + }, + { + "epoch": 
0.6568948891031823, + "grad_norm": 0.4187010538839707, + "learning_rate": 2.707043943426786e-06, + "loss": 2.3485, + "step": 1703 + }, + { + "epoch": 0.6572806171648987, + "grad_norm": 0.43658345945129773, + "learning_rate": 2.7015759174024756e-06, + "loss": 2.3283, + "step": 1704 + }, + { + "epoch": 0.6576663452266153, + "grad_norm": 0.4146289618702742, + "learning_rate": 2.6961113744987854e-06, + "loss": 2.2828, + "step": 1705 + }, + { + "epoch": 0.6580520732883317, + "grad_norm": 0.43820230295987506, + "learning_rate": 2.6906503229968895e-06, + "loss": 2.2593, + "step": 1706 + }, + { + "epoch": 0.6584378013500483, + "grad_norm": 0.42122366510118364, + "learning_rate": 2.6851927711726807e-06, + "loss": 2.3358, + "step": 1707 + }, + { + "epoch": 0.6588235294117647, + "grad_norm": 0.4117770533764119, + "learning_rate": 2.6797387272967414e-06, + "loss": 2.3342, + "step": 1708 + }, + { + "epoch": 0.6592092574734812, + "grad_norm": 0.4716978041525038, + "learning_rate": 2.6742881996343405e-06, + "loss": 2.3389, + "step": 1709 + }, + { + "epoch": 0.6595949855351977, + "grad_norm": 0.4340611250787719, + "learning_rate": 2.668841196445416e-06, + "loss": 2.3282, + "step": 1710 + }, + { + "epoch": 0.6599807135969141, + "grad_norm": 0.4396181233847713, + "learning_rate": 2.6633977259845715e-06, + "loss": 2.2852, + "step": 1711 + }, + { + "epoch": 0.6603664416586307, + "grad_norm": 0.41139067151993575, + "learning_rate": 2.65795779650105e-06, + "loss": 2.3336, + "step": 1712 + }, + { + "epoch": 0.6607521697203471, + "grad_norm": 0.4140739732396573, + "learning_rate": 2.65252141623873e-06, + "loss": 2.3266, + "step": 1713 + }, + { + "epoch": 0.6611378977820637, + "grad_norm": 0.4433489124410752, + "learning_rate": 2.6470885934361136e-06, + "loss": 2.2896, + "step": 1714 + }, + { + "epoch": 0.6615236258437801, + "grad_norm": 0.4471073261840135, + "learning_rate": 2.6416593363263067e-06, + "loss": 2.32, + "step": 1715 + }, + { + "epoch": 0.6619093539054967, + "grad_norm": 0.4188281593563816, + "learning_rate": 2.63623365313702e-06, + "loss": 2.2587, + "step": 1716 + }, + { + "epoch": 0.6622950819672131, + "grad_norm": 0.4196333926973484, + "learning_rate": 2.6308115520905396e-06, + "loss": 2.3181, + "step": 1717 + }, + { + "epoch": 0.6626808100289296, + "grad_norm": 0.43507817332036064, + "learning_rate": 2.625393041403731e-06, + "loss": 2.3033, + "step": 1718 + }, + { + "epoch": 0.6630665380906461, + "grad_norm": 0.4013952383118774, + "learning_rate": 2.619978129288011e-06, + "loss": 2.3137, + "step": 1719 + }, + { + "epoch": 0.6634522661523626, + "grad_norm": 0.4352226375058755, + "learning_rate": 2.614566823949348e-06, + "loss": 2.3434, + "step": 1720 + }, + { + "epoch": 0.6638379942140791, + "grad_norm": 0.46138167395279756, + "learning_rate": 2.6091591335882426e-06, + "loss": 2.2565, + "step": 1721 + }, + { + "epoch": 0.6642237222757955, + "grad_norm": 0.4315324035523396, + "learning_rate": 2.603755066399718e-06, + "loss": 2.2905, + "step": 1722 + }, + { + "epoch": 0.6646094503375121, + "grad_norm": 0.4248660288503237, + "learning_rate": 2.598354630573303e-06, + "loss": 2.3645, + "step": 1723 + }, + { + "epoch": 0.6649951783992285, + "grad_norm": 0.4627762735762227, + "learning_rate": 2.592957834293033e-06, + "loss": 2.3146, + "step": 1724 + }, + { + "epoch": 0.665380906460945, + "grad_norm": 0.42172579815965766, + "learning_rate": 2.5875646857374147e-06, + "loss": 2.3384, + "step": 1725 + }, + { + "epoch": 0.6657666345226615, + "grad_norm": 0.43837623448336754, + "learning_rate": 
2.5821751930794404e-06, + "loss": 2.3123, + "step": 1726 + }, + { + "epoch": 0.666152362584378, + "grad_norm": 0.44426184885159165, + "learning_rate": 2.576789364486551e-06, + "loss": 2.3464, + "step": 1727 + }, + { + "epoch": 0.6665380906460945, + "grad_norm": 0.4087896213363994, + "learning_rate": 2.5714072081206407e-06, + "loss": 2.3194, + "step": 1728 + }, + { + "epoch": 0.666923818707811, + "grad_norm": 0.4309466855847297, + "learning_rate": 2.566028732138037e-06, + "loss": 2.3609, + "step": 1729 + }, + { + "epoch": 0.6673095467695275, + "grad_norm": 0.41812542408527814, + "learning_rate": 2.5606539446894875e-06, + "loss": 2.3093, + "step": 1730 + }, + { + "epoch": 0.667695274831244, + "grad_norm": 0.4197042581795082, + "learning_rate": 2.5552828539201563e-06, + "loss": 2.3305, + "step": 1731 + }, + { + "epoch": 0.6680810028929605, + "grad_norm": 0.44633494094061993, + "learning_rate": 2.5499154679696014e-06, + "loss": 2.3182, + "step": 1732 + }, + { + "epoch": 0.6684667309546769, + "grad_norm": 0.4093822342538184, + "learning_rate": 2.5445517949717645e-06, + "loss": 2.3515, + "step": 1733 + }, + { + "epoch": 0.6688524590163935, + "grad_norm": 0.4749740056226905, + "learning_rate": 2.5391918430549635e-06, + "loss": 2.3825, + "step": 1734 + }, + { + "epoch": 0.6692381870781099, + "grad_norm": 0.40362186487722646, + "learning_rate": 2.5338356203418784e-06, + "loss": 2.2753, + "step": 1735 + }, + { + "epoch": 0.6696239151398264, + "grad_norm": 0.40874498875645543, + "learning_rate": 2.528483134949535e-06, + "loss": 2.3529, + "step": 1736 + }, + { + "epoch": 0.6700096432015429, + "grad_norm": 0.45349066049912623, + "learning_rate": 2.523134394989294e-06, + "loss": 2.3471, + "step": 1737 + }, + { + "epoch": 0.6703953712632594, + "grad_norm": 0.43166929648623414, + "learning_rate": 2.517789408566846e-06, + "loss": 2.3572, + "step": 1738 + }, + { + "epoch": 0.6707810993249759, + "grad_norm": 0.4240149403537371, + "learning_rate": 2.5124481837821886e-06, + "loss": 2.3522, + "step": 1739 + }, + { + "epoch": 0.6711668273866924, + "grad_norm": 0.4235013301612557, + "learning_rate": 2.50711072872962e-06, + "loss": 2.2818, + "step": 1740 + }, + { + "epoch": 0.6715525554484089, + "grad_norm": 0.46681872751640835, + "learning_rate": 2.5017770514977252e-06, + "loss": 2.3037, + "step": 1741 + }, + { + "epoch": 0.6719382835101254, + "grad_norm": 0.4312151667630878, + "learning_rate": 2.4964471601693633e-06, + "loss": 2.3269, + "step": 1742 + }, + { + "epoch": 0.6723240115718419, + "grad_norm": 0.480321202635658, + "learning_rate": 2.4911210628216615e-06, + "loss": 2.3542, + "step": 1743 + }, + { + "epoch": 0.6727097396335583, + "grad_norm": 0.4161546691919782, + "learning_rate": 2.4857987675259887e-06, + "loss": 2.3684, + "step": 1744 + }, + { + "epoch": 0.6730954676952748, + "grad_norm": 0.4334867788444831, + "learning_rate": 2.480480282347961e-06, + "loss": 2.232, + "step": 1745 + }, + { + "epoch": 0.6734811957569913, + "grad_norm": 0.4205179198468628, + "learning_rate": 2.4751656153474147e-06, + "loss": 2.3038, + "step": 1746 + }, + { + "epoch": 0.6738669238187078, + "grad_norm": 0.4369725705928331, + "learning_rate": 2.4698547745784014e-06, + "loss": 2.3228, + "step": 1747 + }, + { + "epoch": 0.6742526518804243, + "grad_norm": 0.4527242369582889, + "learning_rate": 2.4645477680891734e-06, + "loss": 2.296, + "step": 1748 + }, + { + "epoch": 0.6746383799421408, + "grad_norm": 0.4206894542970612, + "learning_rate": 2.4592446039221718e-06, + "loss": 2.3163, + "step": 1749 + }, + { + "epoch": 
0.6750241080038573, + "grad_norm": 0.4421546566249675, + "learning_rate": 2.453945290114021e-06, + "loss": 2.3798, + "step": 1750 + }, + { + "epoch": 0.6754098360655738, + "grad_norm": 0.4452265180248534, + "learning_rate": 2.448649834695503e-06, + "loss": 2.3544, + "step": 1751 + }, + { + "epoch": 0.6757955641272902, + "grad_norm": 0.4306368748720817, + "learning_rate": 2.4433582456915556e-06, + "loss": 2.276, + "step": 1752 + }, + { + "epoch": 0.6761812921890068, + "grad_norm": 0.4123619858403055, + "learning_rate": 2.4380705311212557e-06, + "loss": 2.4109, + "step": 1753 + }, + { + "epoch": 0.6765670202507232, + "grad_norm": 0.39794428972608337, + "learning_rate": 2.432786698997813e-06, + "loss": 2.3193, + "step": 1754 + }, + { + "epoch": 0.6769527483124397, + "grad_norm": 0.41602795068016346, + "learning_rate": 2.427506757328549e-06, + "loss": 2.3405, + "step": 1755 + }, + { + "epoch": 0.6773384763741562, + "grad_norm": 0.44838899638141694, + "learning_rate": 2.422230714114891e-06, + "loss": 2.3606, + "step": 1756 + }, + { + "epoch": 0.6777242044358727, + "grad_norm": 0.4234780319399461, + "learning_rate": 2.416958577352361e-06, + "loss": 2.2062, + "step": 1757 + }, + { + "epoch": 0.6781099324975892, + "grad_norm": 0.43427324024035047, + "learning_rate": 2.411690355030556e-06, + "loss": 2.3539, + "step": 1758 + }, + { + "epoch": 0.6784956605593057, + "grad_norm": 0.4471318387959939, + "learning_rate": 2.4064260551331452e-06, + "loss": 2.292, + "step": 1759 + }, + { + "epoch": 0.6788813886210222, + "grad_norm": 0.4152680657270176, + "learning_rate": 2.4011656856378513e-06, + "loss": 2.3433, + "step": 1760 + }, + { + "epoch": 0.6792671166827386, + "grad_norm": 0.4221820618812638, + "learning_rate": 2.3959092545164408e-06, + "loss": 2.3259, + "step": 1761 + }, + { + "epoch": 0.6796528447444552, + "grad_norm": 0.4226470871183536, + "learning_rate": 2.3906567697347117e-06, + "loss": 2.3538, + "step": 1762 + }, + { + "epoch": 0.6800385728061716, + "grad_norm": 0.3946388601121077, + "learning_rate": 2.3854082392524836e-06, + "loss": 2.2842, + "step": 1763 + }, + { + "epoch": 0.6804243008678882, + "grad_norm": 0.4023774836632626, + "learning_rate": 2.3801636710235836e-06, + "loss": 2.2965, + "step": 1764 + }, + { + "epoch": 0.6808100289296046, + "grad_norm": 0.39928186380628516, + "learning_rate": 2.3749230729958322e-06, + "loss": 2.256, + "step": 1765 + }, + { + "epoch": 0.6811957569913211, + "grad_norm": 0.40275495361852215, + "learning_rate": 2.369686453111033e-06, + "loss": 2.3468, + "step": 1766 + }, + { + "epoch": 0.6815814850530376, + "grad_norm": 0.4120092812454993, + "learning_rate": 2.3644538193049626e-06, + "loss": 2.3906, + "step": 1767 + }, + { + "epoch": 0.6819672131147541, + "grad_norm": 0.42677358596189907, + "learning_rate": 2.3592251795073564e-06, + "loss": 2.2787, + "step": 1768 + }, + { + "epoch": 0.6823529411764706, + "grad_norm": 0.4296128371192793, + "learning_rate": 2.3540005416418943e-06, + "loss": 2.3418, + "step": 1769 + }, + { + "epoch": 0.682738669238187, + "grad_norm": 0.45496302018867923, + "learning_rate": 2.348779913626199e-06, + "loss": 2.3438, + "step": 1770 + }, + { + "epoch": 0.6831243972999036, + "grad_norm": 0.43775959209446863, + "learning_rate": 2.3435633033718096e-06, + "loss": 2.3501, + "step": 1771 + }, + { + "epoch": 0.68351012536162, + "grad_norm": 0.46247162736100794, + "learning_rate": 2.338350718784177e-06, + "loss": 2.3259, + "step": 1772 + }, + { + "epoch": 0.6838958534233366, + "grad_norm": 0.45384808041325736, + "learning_rate": 
2.333142167762657e-06, + "loss": 2.3733, + "step": 1773 + }, + { + "epoch": 0.684281581485053, + "grad_norm": 0.4430161348080049, + "learning_rate": 2.327937658200487e-06, + "loss": 2.2967, + "step": 1774 + }, + { + "epoch": 0.6846673095467696, + "grad_norm": 0.42186373207951944, + "learning_rate": 2.322737197984781e-06, + "loss": 2.3502, + "step": 1775 + }, + { + "epoch": 0.685053037608486, + "grad_norm": 0.4463067104849964, + "learning_rate": 2.3175407949965167e-06, + "loss": 2.2719, + "step": 1776 + }, + { + "epoch": 0.6854387656702025, + "grad_norm": 0.4328966014517113, + "learning_rate": 2.312348457110527e-06, + "loss": 2.2898, + "step": 1777 + }, + { + "epoch": 0.685824493731919, + "grad_norm": 0.4125922387553286, + "learning_rate": 2.3071601921954797e-06, + "loss": 2.3335, + "step": 1778 + }, + { + "epoch": 0.6862102217936354, + "grad_norm": 0.4264072177678742, + "learning_rate": 2.301976008113871e-06, + "loss": 2.3068, + "step": 1779 + }, + { + "epoch": 0.686595949855352, + "grad_norm": 0.4216162007809989, + "learning_rate": 2.296795912722014e-06, + "loss": 2.2289, + "step": 1780 + }, + { + "epoch": 0.6869816779170684, + "grad_norm": 0.4093559337331704, + "learning_rate": 2.291619913870024e-06, + "loss": 2.2638, + "step": 1781 + }, + { + "epoch": 0.687367405978785, + "grad_norm": 0.40958554700715705, + "learning_rate": 2.286448019401811e-06, + "loss": 2.3726, + "step": 1782 + }, + { + "epoch": 0.6877531340405014, + "grad_norm": 0.4060960866832944, + "learning_rate": 2.2812802371550653e-06, + "loss": 2.3531, + "step": 1783 + }, + { + "epoch": 0.688138862102218, + "grad_norm": 0.45165307711606734, + "learning_rate": 2.2761165749612417e-06, + "loss": 2.314, + "step": 1784 + }, + { + "epoch": 0.6885245901639344, + "grad_norm": 0.4020078234302099, + "learning_rate": 2.2709570406455543e-06, + "loss": 2.3234, + "step": 1785 + }, + { + "epoch": 0.688910318225651, + "grad_norm": 0.43220274620477667, + "learning_rate": 2.2658016420269596e-06, + "loss": 2.3538, + "step": 1786 + }, + { + "epoch": 0.6892960462873674, + "grad_norm": 0.4400411393426206, + "learning_rate": 2.2606503869181486e-06, + "loss": 2.2693, + "step": 1787 + }, + { + "epoch": 0.6896817743490838, + "grad_norm": 0.5031429505901274, + "learning_rate": 2.25550328312553e-06, + "loss": 2.3507, + "step": 1788 + }, + { + "epoch": 0.6900675024108004, + "grad_norm": 0.4292461378623028, + "learning_rate": 2.250360338449226e-06, + "loss": 2.3647, + "step": 1789 + }, + { + "epoch": 0.6904532304725168, + "grad_norm": 0.4382398684826801, + "learning_rate": 2.2452215606830524e-06, + "loss": 2.3459, + "step": 1790 + }, + { + "epoch": 0.6908389585342334, + "grad_norm": 0.417063718826344, + "learning_rate": 2.2400869576145135e-06, + "loss": 2.297, + "step": 1791 + }, + { + "epoch": 0.6912246865959498, + "grad_norm": 0.4262177150266158, + "learning_rate": 2.2349565370247837e-06, + "loss": 2.3026, + "step": 1792 + }, + { + "epoch": 0.6916104146576664, + "grad_norm": 0.4198489437872795, + "learning_rate": 2.2298303066887007e-06, + "loss": 2.3022, + "step": 1793 + }, + { + "epoch": 0.6919961427193828, + "grad_norm": 0.4382512325320904, + "learning_rate": 2.2247082743747517e-06, + "loss": 2.3303, + "step": 1794 + }, + { + "epoch": 0.6923818707810994, + "grad_norm": 0.3966081503331144, + "learning_rate": 2.2195904478450603e-06, + "loss": 2.2817, + "step": 1795 + }, + { + "epoch": 0.6927675988428158, + "grad_norm": 0.4501733901208891, + "learning_rate": 2.214476834855382e-06, + "loss": 2.3482, + "step": 1796 + }, + { + "epoch": 0.6931533269045324, + 
"grad_norm": 0.4034321531880344, + "learning_rate": 2.209367443155082e-06, + "loss": 2.3428, + "step": 1797 + }, + { + "epoch": 0.6935390549662488, + "grad_norm": 0.42956892349144943, + "learning_rate": 2.20426228048713e-06, + "loss": 2.3275, + "step": 1798 + }, + { + "epoch": 0.6939247830279652, + "grad_norm": 0.4554530017723556, + "learning_rate": 2.199161354588086e-06, + "loss": 2.3162, + "step": 1799 + }, + { + "epoch": 0.6943105110896818, + "grad_norm": 0.41340986165183063, + "learning_rate": 2.1940646731880887e-06, + "loss": 2.2701, + "step": 1800 + }, + { + "epoch": 0.6946962391513982, + "grad_norm": 0.47516419252309366, + "learning_rate": 2.188972244010849e-06, + "loss": 2.3452, + "step": 1801 + }, + { + "epoch": 0.6950819672131148, + "grad_norm": 0.4484595753194459, + "learning_rate": 2.183884074773628e-06, + "loss": 2.2771, + "step": 1802 + }, + { + "epoch": 0.6954676952748312, + "grad_norm": 0.4045747362266517, + "learning_rate": 2.178800173187237e-06, + "loss": 2.3176, + "step": 1803 + }, + { + "epoch": 0.6958534233365478, + "grad_norm": 0.42374833281973634, + "learning_rate": 2.173720546956015e-06, + "loss": 2.3732, + "step": 1804 + }, + { + "epoch": 0.6962391513982642, + "grad_norm": 0.43674250725646085, + "learning_rate": 2.1686452037778236e-06, + "loss": 2.3584, + "step": 1805 + }, + { + "epoch": 0.6966248794599808, + "grad_norm": 0.4525712743157628, + "learning_rate": 2.1635741513440346e-06, + "loss": 2.4265, + "step": 1806 + }, + { + "epoch": 0.6970106075216972, + "grad_norm": 0.4176078402911036, + "learning_rate": 2.1585073973395156e-06, + "loss": 2.2231, + "step": 1807 + }, + { + "epoch": 0.6973963355834137, + "grad_norm": 0.42853072370017875, + "learning_rate": 2.1534449494426203e-06, + "loss": 2.302, + "step": 1808 + }, + { + "epoch": 0.6977820636451302, + "grad_norm": 0.41296489653639573, + "learning_rate": 2.148386815325179e-06, + "loss": 2.3283, + "step": 1809 + }, + { + "epoch": 0.6981677917068466, + "grad_norm": 0.4514682175260553, + "learning_rate": 2.1433330026524855e-06, + "loss": 2.3262, + "step": 1810 + }, + { + "epoch": 0.6985535197685632, + "grad_norm": 0.45052354679809675, + "learning_rate": 2.138283519083281e-06, + "loss": 2.3511, + "step": 1811 + }, + { + "epoch": 0.6989392478302796, + "grad_norm": 0.4503982963335606, + "learning_rate": 2.1332383722697483e-06, + "loss": 2.2479, + "step": 1812 + }, + { + "epoch": 0.6993249758919962, + "grad_norm": 0.40719473821595903, + "learning_rate": 2.128197569857497e-06, + "loss": 2.3032, + "step": 1813 + }, + { + "epoch": 0.6997107039537126, + "grad_norm": 0.41991110805498033, + "learning_rate": 2.1231611194855523e-06, + "loss": 2.3421, + "step": 1814 + }, + { + "epoch": 0.7000964320154291, + "grad_norm": 0.44901149374013327, + "learning_rate": 2.118129028786349e-06, + "loss": 2.3204, + "step": 1815 + }, + { + "epoch": 0.7004821600771456, + "grad_norm": 0.41440766728142986, + "learning_rate": 2.1131013053857097e-06, + "loss": 2.3755, + "step": 1816 + }, + { + "epoch": 0.7008678881388621, + "grad_norm": 0.42158767575961104, + "learning_rate": 2.1080779569028413e-06, + "loss": 2.3608, + "step": 1817 + }, + { + "epoch": 0.7012536162005786, + "grad_norm": 0.4544717459877259, + "learning_rate": 2.103058990950318e-06, + "loss": 2.3146, + "step": 1818 + }, + { + "epoch": 0.7016393442622951, + "grad_norm": 0.424649340632615, + "learning_rate": 2.098044415134078e-06, + "loss": 2.33, + "step": 1819 + }, + { + "epoch": 0.7020250723240116, + "grad_norm": 0.40885864827974017, + "learning_rate": 2.0930342370534013e-06, + "loss": 
2.3, + "step": 1820 + }, + { + "epoch": 0.702410800385728, + "grad_norm": 0.4552603406642715, + "learning_rate": 2.0880284643009035e-06, + "loss": 2.3369, + "step": 1821 + }, + { + "epoch": 0.7027965284474446, + "grad_norm": 0.4513056650519724, + "learning_rate": 2.08302710446253e-06, + "loss": 2.3225, + "step": 1822 + }, + { + "epoch": 0.703182256509161, + "grad_norm": 0.4244786335106065, + "learning_rate": 2.078030165117533e-06, + "loss": 2.3048, + "step": 1823 + }, + { + "epoch": 0.7035679845708775, + "grad_norm": 0.43298567730858045, + "learning_rate": 2.073037653838466e-06, + "loss": 2.2702, + "step": 1824 + }, + { + "epoch": 0.703953712632594, + "grad_norm": 0.45316993438511183, + "learning_rate": 2.0680495781911745e-06, + "loss": 2.2087, + "step": 1825 + }, + { + "epoch": 0.7043394406943105, + "grad_norm": 0.44461260348149667, + "learning_rate": 2.0630659457347806e-06, + "loss": 2.3887, + "step": 1826 + }, + { + "epoch": 0.704725168756027, + "grad_norm": 0.4384648003703453, + "learning_rate": 2.0580867640216723e-06, + "loss": 2.2269, + "step": 1827 + }, + { + "epoch": 0.7051108968177435, + "grad_norm": 0.44502413743025215, + "learning_rate": 2.053112040597495e-06, + "loss": 2.3219, + "step": 1828 + }, + { + "epoch": 0.70549662487946, + "grad_norm": 0.43791680419882656, + "learning_rate": 2.048141783001138e-06, + "loss": 2.3794, + "step": 1829 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.4527043653356696, + "learning_rate": 2.0431759987647206e-06, + "loss": 2.3024, + "step": 1830 + }, + { + "epoch": 0.706268081002893, + "grad_norm": 0.43486023816632846, + "learning_rate": 2.038214695413584e-06, + "loss": 2.3632, + "step": 1831 + }, + { + "epoch": 0.7066538090646094, + "grad_norm": 0.4340772873446721, + "learning_rate": 2.0332578804662783e-06, + "loss": 2.3251, + "step": 1832 + }, + { + "epoch": 0.707039537126326, + "grad_norm": 0.42095115265847993, + "learning_rate": 2.028305561434553e-06, + "loss": 2.3547, + "step": 1833 + }, + { + "epoch": 0.7074252651880424, + "grad_norm": 0.4153578236352047, + "learning_rate": 2.0233577458233418e-06, + "loss": 2.3595, + "step": 1834 + }, + { + "epoch": 0.7078109932497589, + "grad_norm": 0.45286628297659587, + "learning_rate": 2.0184144411307584e-06, + "loss": 2.2989, + "step": 1835 + }, + { + "epoch": 0.7081967213114754, + "grad_norm": 0.415210757441709, + "learning_rate": 2.013475654848076e-06, + "loss": 2.2673, + "step": 1836 + }, + { + "epoch": 0.7085824493731919, + "grad_norm": 0.43402706304210137, + "learning_rate": 2.008541394459721e-06, + "loss": 2.3889, + "step": 1837 + }, + { + "epoch": 0.7089681774349084, + "grad_norm": 0.4501553541223613, + "learning_rate": 2.0036116674432653e-06, + "loss": 2.3073, + "step": 1838 + }, + { + "epoch": 0.7093539054966249, + "grad_norm": 0.4137364602460989, + "learning_rate": 1.998686481269406e-06, + "loss": 2.3535, + "step": 1839 + }, + { + "epoch": 0.7097396335583414, + "grad_norm": 0.43657698866139233, + "learning_rate": 1.99376584340196e-06, + "loss": 2.3303, + "step": 1840 + }, + { + "epoch": 0.7101253616200579, + "grad_norm": 0.42596895550642816, + "learning_rate": 1.98884976129785e-06, + "loss": 2.4018, + "step": 1841 + }, + { + "epoch": 0.7105110896817743, + "grad_norm": 0.4200682470564326, + "learning_rate": 1.983938242407101e-06, + "loss": 2.3129, + "step": 1842 + }, + { + "epoch": 0.7108968177434909, + "grad_norm": 0.4324219822945481, + "learning_rate": 1.979031294172817e-06, + "loss": 2.2912, + "step": 1843 + }, + { + "epoch": 0.7112825458052073, + "grad_norm": 0.41765298347102997, 
+ "learning_rate": 1.9741289240311757e-06, + "loss": 2.3259, + "step": 1844 + }, + { + "epoch": 0.7116682738669238, + "grad_norm": 0.42394530159768007, + "learning_rate": 1.9692311394114176e-06, + "loss": 2.3176, + "step": 1845 + }, + { + "epoch": 0.7120540019286403, + "grad_norm": 0.43116865679038874, + "learning_rate": 1.964337947735835e-06, + "loss": 2.337, + "step": 1846 + }, + { + "epoch": 0.7124397299903568, + "grad_norm": 0.4502038632023192, + "learning_rate": 1.9594493564197613e-06, + "loss": 2.2922, + "step": 1847 + }, + { + "epoch": 0.7128254580520733, + "grad_norm": 0.4447897770422271, + "learning_rate": 1.954565372871554e-06, + "loss": 2.3829, + "step": 1848 + }, + { + "epoch": 0.7132111861137898, + "grad_norm": 0.43691135598900194, + "learning_rate": 1.9496860044925935e-06, + "loss": 2.3216, + "step": 1849 + }, + { + "epoch": 0.7135969141755063, + "grad_norm": 0.4701764832621497, + "learning_rate": 1.9448112586772617e-06, + "loss": 2.3048, + "step": 1850 + }, + { + "epoch": 0.7139826422372227, + "grad_norm": 0.43593029585434484, + "learning_rate": 1.9399411428129354e-06, + "loss": 2.3111, + "step": 1851 + }, + { + "epoch": 0.7143683702989393, + "grad_norm": 0.4428201780077991, + "learning_rate": 1.935075664279978e-06, + "loss": 2.3719, + "step": 1852 + }, + { + "epoch": 0.7147540983606557, + "grad_norm": 0.43311839685942927, + "learning_rate": 1.930214830451721e-06, + "loss": 2.2691, + "step": 1853 + }, + { + "epoch": 0.7151398264223723, + "grad_norm": 0.42447069689000166, + "learning_rate": 1.925358648694463e-06, + "loss": 2.2861, + "step": 1854 + }, + { + "epoch": 0.7155255544840887, + "grad_norm": 0.40031901393760644, + "learning_rate": 1.920507126367448e-06, + "loss": 2.3643, + "step": 1855 + }, + { + "epoch": 0.7159112825458052, + "grad_norm": 0.3900072469818421, + "learning_rate": 1.9156602708228584e-06, + "loss": 2.3331, + "step": 1856 + }, + { + "epoch": 0.7162970106075217, + "grad_norm": 0.48597677530097333, + "learning_rate": 1.910818089405809e-06, + "loss": 2.3775, + "step": 1857 + }, + { + "epoch": 0.7166827386692382, + "grad_norm": 0.46915085296113956, + "learning_rate": 1.9059805894543288e-06, + "loss": 2.333, + "step": 1858 + }, + { + "epoch": 0.7170684667309547, + "grad_norm": 0.45610828678227044, + "learning_rate": 1.9011477782993503e-06, + "loss": 2.3094, + "step": 1859 + }, + { + "epoch": 0.7174541947926711, + "grad_norm": 0.4468466992999948, + "learning_rate": 1.8963196632647008e-06, + "loss": 2.3186, + "step": 1860 + }, + { + "epoch": 0.7178399228543877, + "grad_norm": 0.4498292884554748, + "learning_rate": 1.891496251667096e-06, + "loss": 2.3005, + "step": 1861 + }, + { + "epoch": 0.7182256509161041, + "grad_norm": 0.4089911948744338, + "learning_rate": 1.886677550816118e-06, + "loss": 2.2811, + "step": 1862 + }, + { + "epoch": 0.7186113789778207, + "grad_norm": 0.4000617709919224, + "learning_rate": 1.8818635680142127e-06, + "loss": 2.3181, + "step": 1863 + }, + { + "epoch": 0.7189971070395371, + "grad_norm": 0.4580902252341343, + "learning_rate": 1.8770543105566752e-06, + "loss": 2.2947, + "step": 1864 + }, + { + "epoch": 0.7193828351012537, + "grad_norm": 0.42253237694802087, + "learning_rate": 1.872249785731638e-06, + "loss": 2.3342, + "step": 1865 + }, + { + "epoch": 0.7197685631629701, + "grad_norm": 0.4496865269170129, + "learning_rate": 1.8674500008200675e-06, + "loss": 2.3673, + "step": 1866 + }, + { + "epoch": 0.7201542912246865, + "grad_norm": 0.4073635146420168, + "learning_rate": 1.8626549630957397e-06, + "loss": 2.3325, + "step": 1867 + }, + 
{ + "epoch": 0.7205400192864031, + "grad_norm": 0.3997870993505134, + "learning_rate": 1.8578646798252432e-06, + "loss": 2.2871, + "step": 1868 + }, + { + "epoch": 0.7209257473481195, + "grad_norm": 0.40751796677737057, + "learning_rate": 1.8530791582679558e-06, + "loss": 2.3511, + "step": 1869 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 0.44983615888224715, + "learning_rate": 1.8482984056760434e-06, + "loss": 2.3554, + "step": 1870 + }, + { + "epoch": 0.7216972034715525, + "grad_norm": 0.4192126012438455, + "learning_rate": 1.8435224292944415e-06, + "loss": 2.3417, + "step": 1871 + }, + { + "epoch": 0.7220829315332691, + "grad_norm": 0.4506643100816781, + "learning_rate": 1.8387512363608496e-06, + "loss": 2.3664, + "step": 1872 + }, + { + "epoch": 0.7224686595949855, + "grad_norm": 0.4165920944006413, + "learning_rate": 1.8339848341057165e-06, + "loss": 2.3338, + "step": 1873 + }, + { + "epoch": 0.7228543876567021, + "grad_norm": 0.4315717005524774, + "learning_rate": 1.8292232297522356e-06, + "loss": 2.3309, + "step": 1874 + }, + { + "epoch": 0.7232401157184185, + "grad_norm": 0.4104338792616358, + "learning_rate": 1.8244664305163217e-06, + "loss": 2.3175, + "step": 1875 + }, + { + "epoch": 0.7236258437801351, + "grad_norm": 0.4250811881361986, + "learning_rate": 1.8197144436066167e-06, + "loss": 2.3429, + "step": 1876 + }, + { + "epoch": 0.7240115718418515, + "grad_norm": 0.4319063440906805, + "learning_rate": 1.8149672762244625e-06, + "loss": 2.3144, + "step": 1877 + }, + { + "epoch": 0.7243972999035679, + "grad_norm": 0.4631219421778302, + "learning_rate": 1.8102249355639007e-06, + "loss": 2.3249, + "step": 1878 + }, + { + "epoch": 0.7247830279652845, + "grad_norm": 0.4096130626344659, + "learning_rate": 1.8054874288116564e-06, + "loss": 2.3437, + "step": 1879 + }, + { + "epoch": 0.7251687560270009, + "grad_norm": 0.43221946726888827, + "learning_rate": 1.8007547631471289e-06, + "loss": 2.3126, + "step": 1880 + }, + { + "epoch": 0.7255544840887175, + "grad_norm": 0.41203528881787477, + "learning_rate": 1.7960269457423867e-06, + "loss": 2.3676, + "step": 1881 + }, + { + "epoch": 0.7259402121504339, + "grad_norm": 0.456724826662013, + "learning_rate": 1.7913039837621448e-06, + "loss": 2.3162, + "step": 1882 + }, + { + "epoch": 0.7263259402121505, + "grad_norm": 0.4622864968839015, + "learning_rate": 1.7865858843637617e-06, + "loss": 2.3036, + "step": 1883 + }, + { + "epoch": 0.7267116682738669, + "grad_norm": 0.3977901985670626, + "learning_rate": 1.781872654697226e-06, + "loss": 2.3802, + "step": 1884 + }, + { + "epoch": 0.7270973963355835, + "grad_norm": 0.44693389963733726, + "learning_rate": 1.7771643019051516e-06, + "loss": 2.3529, + "step": 1885 + }, + { + "epoch": 0.7274831243972999, + "grad_norm": 0.42913602906379306, + "learning_rate": 1.772460833122755e-06, + "loss": 2.3534, + "step": 1886 + }, + { + "epoch": 0.7278688524590164, + "grad_norm": 0.42282407024026286, + "learning_rate": 1.7677622554778568e-06, + "loss": 2.3453, + "step": 1887 + }, + { + "epoch": 0.7282545805207329, + "grad_norm": 0.4246761734320408, + "learning_rate": 1.7630685760908623e-06, + "loss": 2.2622, + "step": 1888 + }, + { + "epoch": 0.7286403085824493, + "grad_norm": 0.4085341091618752, + "learning_rate": 1.7583798020747538e-06, + "loss": 2.3744, + "step": 1889 + }, + { + "epoch": 0.7290260366441659, + "grad_norm": 0.3963714532359338, + "learning_rate": 1.7536959405350806e-06, + "loss": 2.2303, + "step": 1890 + }, + { + "epoch": 0.7294117647058823, + "grad_norm": 0.4256967560310133, + 
"learning_rate": 1.7490169985699485e-06, + "loss": 2.2885, + "step": 1891 + }, + { + "epoch": 0.7297974927675989, + "grad_norm": 0.4493754498912692, + "learning_rate": 1.7443429832700038e-06, + "loss": 2.3275, + "step": 1892 + }, + { + "epoch": 0.7301832208293153, + "grad_norm": 0.4290343544231461, + "learning_rate": 1.7396739017184334e-06, + "loss": 2.3917, + "step": 1893 + }, + { + "epoch": 0.7305689488910319, + "grad_norm": 0.4209821488017083, + "learning_rate": 1.735009760990941e-06, + "loss": 2.3115, + "step": 1894 + }, + { + "epoch": 0.7309546769527483, + "grad_norm": 0.44112705296346094, + "learning_rate": 1.7303505681557486e-06, + "loss": 2.2876, + "step": 1895 + }, + { + "epoch": 0.7313404050144648, + "grad_norm": 0.43928075906138714, + "learning_rate": 1.7256963302735752e-06, + "loss": 2.3287, + "step": 1896 + }, + { + "epoch": 0.7317261330761813, + "grad_norm": 0.4355951809171198, + "learning_rate": 1.7210470543976326e-06, + "loss": 2.3864, + "step": 1897 + }, + { + "epoch": 0.7321118611378978, + "grad_norm": 0.4313611024308735, + "learning_rate": 1.7164027475736134e-06, + "loss": 2.3034, + "step": 1898 + }, + { + "epoch": 0.7324975891996143, + "grad_norm": 0.4353284629392325, + "learning_rate": 1.7117634168396774e-06, + "loss": 2.2431, + "step": 1899 + }, + { + "epoch": 0.7328833172613307, + "grad_norm": 0.4382032196577667, + "learning_rate": 1.7071290692264492e-06, + "loss": 2.3491, + "step": 1900 + }, + { + "epoch": 0.7332690453230473, + "grad_norm": 0.43415950786337915, + "learning_rate": 1.7024997117569964e-06, + "loss": 2.3355, + "step": 1901 + }, + { + "epoch": 0.7336547733847637, + "grad_norm": 0.4314052449468606, + "learning_rate": 1.6978753514468255e-06, + "loss": 2.3296, + "step": 1902 + }, + { + "epoch": 0.7340405014464803, + "grad_norm": 0.44774770885005954, + "learning_rate": 1.6932559953038702e-06, + "loss": 2.3172, + "step": 1903 + }, + { + "epoch": 0.7344262295081967, + "grad_norm": 0.4312655490717796, + "learning_rate": 1.6886416503284835e-06, + "loss": 2.3042, + "step": 1904 + }, + { + "epoch": 0.7348119575699132, + "grad_norm": 0.43758573603925244, + "learning_rate": 1.684032323513421e-06, + "loss": 2.3183, + "step": 1905 + }, + { + "epoch": 0.7351976856316297, + "grad_norm": 0.48022398401192334, + "learning_rate": 1.679428021843833e-06, + "loss": 2.2606, + "step": 1906 + }, + { + "epoch": 0.7355834136933462, + "grad_norm": 0.43618605575066427, + "learning_rate": 1.6748287522972583e-06, + "loss": 2.2778, + "step": 1907 + }, + { + "epoch": 0.7359691417550627, + "grad_norm": 0.4445544662179184, + "learning_rate": 1.6702345218436066e-06, + "loss": 2.3342, + "step": 1908 + }, + { + "epoch": 0.7363548698167792, + "grad_norm": 0.4383728743068401, + "learning_rate": 1.665645337445153e-06, + "loss": 2.3321, + "step": 1909 + }, + { + "epoch": 0.7367405978784957, + "grad_norm": 0.42465807370261877, + "learning_rate": 1.6610612060565235e-06, + "loss": 2.3491, + "step": 1910 + }, + { + "epoch": 0.7371263259402121, + "grad_norm": 0.42171228959818263, + "learning_rate": 1.6564821346246878e-06, + "loss": 2.2639, + "step": 1911 + }, + { + "epoch": 0.7375120540019287, + "grad_norm": 0.4240188509741186, + "learning_rate": 1.6519081300889472e-06, + "loss": 2.3131, + "step": 1912 + }, + { + "epoch": 0.7378977820636451, + "grad_norm": 0.47621727140695824, + "learning_rate": 1.6473391993809252e-06, + "loss": 2.3753, + "step": 1913 + }, + { + "epoch": 0.7382835101253616, + "grad_norm": 0.40631055589112214, + "learning_rate": 1.6427753494245585e-06, + "loss": 2.3181, + "step": 1914 
+ }, + { + "epoch": 0.7386692381870781, + "grad_norm": 0.42382359906353934, + "learning_rate": 1.638216587136079e-06, + "loss": 2.3913, + "step": 1915 + }, + { + "epoch": 0.7390549662487946, + "grad_norm": 0.41456911250815726, + "learning_rate": 1.6336629194240118e-06, + "loss": 2.3084, + "step": 1916 + }, + { + "epoch": 0.7394406943105111, + "grad_norm": 0.42685946694878363, + "learning_rate": 1.6291143531891601e-06, + "loss": 2.3688, + "step": 1917 + }, + { + "epoch": 0.7398264223722276, + "grad_norm": 0.41571596018624085, + "learning_rate": 1.6245708953245958e-06, + "loss": 2.3355, + "step": 1918 + }, + { + "epoch": 0.7402121504339441, + "grad_norm": 0.4126446757420263, + "learning_rate": 1.6200325527156536e-06, + "loss": 2.3458, + "step": 1919 + }, + { + "epoch": 0.7405978784956606, + "grad_norm": 0.4203400452330613, + "learning_rate": 1.6154993322399114e-06, + "loss": 2.3521, + "step": 1920 + }, + { + "epoch": 0.740983606557377, + "grad_norm": 0.4283850137050537, + "learning_rate": 1.6109712407671867e-06, + "loss": 2.3439, + "step": 1921 + }, + { + "epoch": 0.7413693346190935, + "grad_norm": 0.4319712256454236, + "learning_rate": 1.6064482851595225e-06, + "loss": 2.3353, + "step": 1922 + }, + { + "epoch": 0.74175506268081, + "grad_norm": 0.43119097151201774, + "learning_rate": 1.6019304722711836e-06, + "loss": 2.2801, + "step": 1923 + }, + { + "epoch": 0.7421407907425265, + "grad_norm": 0.45487209978859533, + "learning_rate": 1.5974178089486364e-06, + "loss": 2.332, + "step": 1924 + }, + { + "epoch": 0.742526518804243, + "grad_norm": 0.44591444583471357, + "learning_rate": 1.5929103020305441e-06, + "loss": 2.2509, + "step": 1925 + }, + { + "epoch": 0.7429122468659595, + "grad_norm": 0.4381066815679619, + "learning_rate": 1.588407958347759e-06, + "loss": 2.235, + "step": 1926 + }, + { + "epoch": 0.743297974927676, + "grad_norm": 0.4454021513173899, + "learning_rate": 1.583910784723306e-06, + "loss": 2.3541, + "step": 1927 + }, + { + "epoch": 0.7436837029893925, + "grad_norm": 0.47553223454478355, + "learning_rate": 1.5794187879723755e-06, + "loss": 2.3544, + "step": 1928 + }, + { + "epoch": 0.744069431051109, + "grad_norm": 0.41385399136871154, + "learning_rate": 1.5749319749023117e-06, + "loss": 2.3134, + "step": 1929 + }, + { + "epoch": 0.7444551591128254, + "grad_norm": 0.3985543677381964, + "learning_rate": 1.5704503523126057e-06, + "loss": 2.3468, + "step": 1930 + }, + { + "epoch": 0.744840887174542, + "grad_norm": 0.40023562887529873, + "learning_rate": 1.5659739269948798e-06, + "loss": 2.3317, + "step": 1931 + }, + { + "epoch": 0.7452266152362584, + "grad_norm": 0.43607247084987116, + "learning_rate": 1.561502705732883e-06, + "loss": 2.3172, + "step": 1932 + }, + { + "epoch": 0.7456123432979749, + "grad_norm": 0.4112898127350544, + "learning_rate": 1.557036695302478e-06, + "loss": 2.3366, + "step": 1933 + }, + { + "epoch": 0.7459980713596914, + "grad_norm": 0.4225869777300909, + "learning_rate": 1.552575902471628e-06, + "loss": 2.3598, + "step": 1934 + }, + { + "epoch": 0.7463837994214079, + "grad_norm": 0.3947844663663247, + "learning_rate": 1.5481203340003915e-06, + "loss": 2.3811, + "step": 1935 + }, + { + "epoch": 0.7467695274831244, + "grad_norm": 0.4323476933757536, + "learning_rate": 1.543669996640908e-06, + "loss": 2.355, + "step": 1936 + }, + { + "epoch": 0.7471552555448409, + "grad_norm": 0.4260625784537816, + "learning_rate": 1.5392248971373913e-06, + "loss": 2.4, + "step": 1937 + }, + { + "epoch": 0.7475409836065574, + "grad_norm": 0.44096850417415434, + 
"learning_rate": 1.534785042226115e-06, + "loss": 2.3021, + "step": 1938 + }, + { + "epoch": 0.7479267116682738, + "grad_norm": 0.44503728345754445, + "learning_rate": 1.5303504386354096e-06, + "loss": 2.3117, + "step": 1939 + }, + { + "epoch": 0.7483124397299904, + "grad_norm": 0.43620387877127675, + "learning_rate": 1.5259210930856423e-06, + "loss": 2.3713, + "step": 1940 + }, + { + "epoch": 0.7486981677917068, + "grad_norm": 0.4136199715831025, + "learning_rate": 1.5214970122892164e-06, + "loss": 2.1578, + "step": 1941 + }, + { + "epoch": 0.7490838958534234, + "grad_norm": 0.3970309676881285, + "learning_rate": 1.5170782029505543e-06, + "loss": 2.3259, + "step": 1942 + }, + { + "epoch": 0.7494696239151398, + "grad_norm": 0.4624348047529903, + "learning_rate": 1.5126646717660898e-06, + "loss": 2.2813, + "step": 1943 + }, + { + "epoch": 0.7498553519768563, + "grad_norm": 0.4169276477348904, + "learning_rate": 1.5082564254242583e-06, + "loss": 2.3011, + "step": 1944 + }, + { + "epoch": 0.7502410800385728, + "grad_norm": 0.4220108001492482, + "learning_rate": 1.5038534706054857e-06, + "loss": 2.3509, + "step": 1945 + }, + { + "epoch": 0.7506268081002893, + "grad_norm": 0.4311426982017881, + "learning_rate": 1.4994558139821818e-06, + "loss": 2.3794, + "step": 1946 + }, + { + "epoch": 0.7510125361620058, + "grad_norm": 0.4443090002313747, + "learning_rate": 1.495063462218725e-06, + "loss": 2.3786, + "step": 1947 + }, + { + "epoch": 0.7513982642237222, + "grad_norm": 0.42055047265269285, + "learning_rate": 1.4906764219714537e-06, + "loss": 2.3093, + "step": 1948 + }, + { + "epoch": 0.7517839922854388, + "grad_norm": 0.4271206972448656, + "learning_rate": 1.4862946998886591e-06, + "loss": 2.3313, + "step": 1949 + }, + { + "epoch": 0.7521697203471552, + "grad_norm": 0.4523623802186264, + "learning_rate": 1.4819183026105694e-06, + "loss": 2.3825, + "step": 1950 + }, + { + "epoch": 0.7525554484088718, + "grad_norm": 0.44065059445820953, + "learning_rate": 1.47754723676935e-06, + "loss": 2.3994, + "step": 1951 + }, + { + "epoch": 0.7529411764705882, + "grad_norm": 0.4418740367804237, + "learning_rate": 1.4731815089890795e-06, + "loss": 2.3543, + "step": 1952 + }, + { + "epoch": 0.7533269045323048, + "grad_norm": 0.4172566227457047, + "learning_rate": 1.4688211258857533e-06, + "loss": 2.3196, + "step": 1953 + }, + { + "epoch": 0.7537126325940212, + "grad_norm": 0.4550251205994799, + "learning_rate": 1.4644660940672628e-06, + "loss": 2.4147, + "step": 1954 + }, + { + "epoch": 0.7540983606557377, + "grad_norm": 0.4176140241551424, + "learning_rate": 1.4601164201333917e-06, + "loss": 2.2887, + "step": 1955 + }, + { + "epoch": 0.7544840887174542, + "grad_norm": 0.4414057227206095, + "learning_rate": 1.455772110675804e-06, + "loss": 2.342, + "step": 1956 + }, + { + "epoch": 0.7548698167791706, + "grad_norm": 0.41491981206533557, + "learning_rate": 1.4514331722780323e-06, + "loss": 2.3814, + "step": 1957 + }, + { + "epoch": 0.7552555448408872, + "grad_norm": 0.42835423357893915, + "learning_rate": 1.447099611515474e-06, + "loss": 2.3606, + "step": 1958 + }, + { + "epoch": 0.7556412729026036, + "grad_norm": 0.41793133773717467, + "learning_rate": 1.4427714349553718e-06, + "loss": 2.2872, + "step": 1959 + }, + { + "epoch": 0.7560270009643202, + "grad_norm": 0.4126520103044043, + "learning_rate": 1.438448649156815e-06, + "loss": 2.3769, + "step": 1960 + }, + { + "epoch": 0.7564127290260366, + "grad_norm": 0.42809154465845917, + "learning_rate": 1.434131260670718e-06, + "loss": 2.3048, + "step": 1961 + }, + { 
+ "epoch": 0.7567984570877532, + "grad_norm": 0.4159494516222891, + "learning_rate": 1.4298192760398183e-06, + "loss": 2.3678, + "step": 1962 + }, + { + "epoch": 0.7571841851494696, + "grad_norm": 0.42294985526580336, + "learning_rate": 1.4255127017986642e-06, + "loss": 2.3726, + "step": 1963 + }, + { + "epoch": 0.7575699132111862, + "grad_norm": 0.4547618222932602, + "learning_rate": 1.4212115444736024e-06, + "loss": 2.3227, + "step": 1964 + }, + { + "epoch": 0.7579556412729026, + "grad_norm": 0.4539738123807865, + "learning_rate": 1.4169158105827768e-06, + "loss": 2.3494, + "step": 1965 + }, + { + "epoch": 0.758341369334619, + "grad_norm": 0.4447504726542575, + "learning_rate": 1.412625506636106e-06, + "loss": 2.3716, + "step": 1966 + }, + { + "epoch": 0.7587270973963356, + "grad_norm": 0.4441122610676579, + "learning_rate": 1.4083406391352827e-06, + "loss": 2.3429, + "step": 1967 + }, + { + "epoch": 0.759112825458052, + "grad_norm": 0.4361873669315586, + "learning_rate": 1.4040612145737608e-06, + "loss": 2.3435, + "step": 1968 + }, + { + "epoch": 0.7594985535197686, + "grad_norm": 0.4501748064507914, + "learning_rate": 1.399787239436744e-06, + "loss": 2.3487, + "step": 1969 + }, + { + "epoch": 0.759884281581485, + "grad_norm": 0.3961270353598128, + "learning_rate": 1.3955187202011817e-06, + "loss": 2.2895, + "step": 1970 + }, + { + "epoch": 0.7602700096432016, + "grad_norm": 0.40673706251633657, + "learning_rate": 1.3912556633357504e-06, + "loss": 2.2987, + "step": 1971 + }, + { + "epoch": 0.760655737704918, + "grad_norm": 0.42338880344973645, + "learning_rate": 1.3869980753008537e-06, + "loss": 2.3168, + "step": 1972 + }, + { + "epoch": 0.7610414657666346, + "grad_norm": 0.40140883912164094, + "learning_rate": 1.382745962548604e-06, + "loss": 2.3646, + "step": 1973 + }, + { + "epoch": 0.761427193828351, + "grad_norm": 0.41867937012538364, + "learning_rate": 1.3784993315228167e-06, + "loss": 2.3299, + "step": 1974 + }, + { + "epoch": 0.7618129218900676, + "grad_norm": 0.4303118053950361, + "learning_rate": 1.3742581886590006e-06, + "loss": 2.3334, + "step": 1975 + }, + { + "epoch": 0.762198649951784, + "grad_norm": 0.3981641920452056, + "learning_rate": 1.370022540384347e-06, + "loss": 2.3107, + "step": 1976 + }, + { + "epoch": 0.7625843780135004, + "grad_norm": 0.4612849602908465, + "learning_rate": 1.3657923931177204e-06, + "loss": 2.2303, + "step": 1977 + }, + { + "epoch": 0.762970106075217, + "grad_norm": 0.4180792084943692, + "learning_rate": 1.3615677532696498e-06, + "loss": 2.3706, + "step": 1978 + }, + { + "epoch": 0.7633558341369334, + "grad_norm": 0.38250956972257466, + "learning_rate": 1.3573486272423192e-06, + "loss": 2.2321, + "step": 1979 + }, + { + "epoch": 0.76374156219865, + "grad_norm": 0.4027643412011808, + "learning_rate": 1.353135021429554e-06, + "loss": 2.3544, + "step": 1980 + }, + { + "epoch": 0.7641272902603664, + "grad_norm": 0.44615568380452053, + "learning_rate": 1.348926942216815e-06, + "loss": 2.2998, + "step": 1981 + }, + { + "epoch": 0.764513018322083, + "grad_norm": 0.4386786384305421, + "learning_rate": 1.3447243959811885e-06, + "loss": 2.3685, + "step": 1982 + }, + { + "epoch": 0.7648987463837994, + "grad_norm": 0.40691005347999887, + "learning_rate": 1.340527389091374e-06, + "loss": 2.3685, + "step": 1983 + }, + { + "epoch": 0.765284474445516, + "grad_norm": 0.444251477937014, + "learning_rate": 1.3363359279076776e-06, + "loss": 2.3357, + "step": 1984 + }, + { + "epoch": 0.7656702025072324, + "grad_norm": 0.4260239073946116, + "learning_rate": 
1.3321500187820042e-06, + "loss": 2.3069, + "step": 1985 + }, + { + "epoch": 0.7660559305689489, + "grad_norm": 0.40422071633890755, + "learning_rate": 1.3279696680578402e-06, + "loss": 2.3677, + "step": 1986 + }, + { + "epoch": 0.7664416586306654, + "grad_norm": 0.41280437028129924, + "learning_rate": 1.3237948820702495e-06, + "loss": 2.331, + "step": 1987 + }, + { + "epoch": 0.7668273866923818, + "grad_norm": 0.4082483834119175, + "learning_rate": 1.3196256671458663e-06, + "loss": 2.3281, + "step": 1988 + }, + { + "epoch": 0.7672131147540984, + "grad_norm": 0.41258302839639005, + "learning_rate": 1.3154620296028793e-06, + "loss": 2.2833, + "step": 1989 + }, + { + "epoch": 0.7675988428158148, + "grad_norm": 0.4344894821948446, + "learning_rate": 1.3113039757510253e-06, + "loss": 2.3197, + "step": 1990 + }, + { + "epoch": 0.7679845708775314, + "grad_norm": 0.42892943095676617, + "learning_rate": 1.307151511891578e-06, + "loss": 2.3273, + "step": 1991 + }, + { + "epoch": 0.7683702989392478, + "grad_norm": 0.44491385707895625, + "learning_rate": 1.3030046443173445e-06, + "loss": 2.3597, + "step": 1992 + }, + { + "epoch": 0.7687560270009643, + "grad_norm": 0.41228660786813054, + "learning_rate": 1.298863379312647e-06, + "loss": 2.3733, + "step": 1993 + }, + { + "epoch": 0.7691417550626808, + "grad_norm": 0.4383023013387142, + "learning_rate": 1.2947277231533178e-06, + "loss": 2.2713, + "step": 1994 + }, + { + "epoch": 0.7695274831243973, + "grad_norm": 0.4036438771963324, + "learning_rate": 1.2905976821066902e-06, + "loss": 2.295, + "step": 1995 + }, + { + "epoch": 0.7699132111861138, + "grad_norm": 0.45641979064711524, + "learning_rate": 1.2864732624315867e-06, + "loss": 2.3163, + "step": 1996 + }, + { + "epoch": 0.7702989392478303, + "grad_norm": 0.44286916639484614, + "learning_rate": 1.282354470378313e-06, + "loss": 2.3142, + "step": 1997 + }, + { + "epoch": 0.7706846673095468, + "grad_norm": 0.45764787143026897, + "learning_rate": 1.2782413121886483e-06, + "loss": 2.3925, + "step": 1998 + }, + { + "epoch": 0.7710703953712632, + "grad_norm": 0.41820084025876386, + "learning_rate": 1.2741337940958286e-06, + "loss": 2.3918, + "step": 1999 + }, + { + "epoch": 0.7714561234329798, + "grad_norm": 0.43730016656356097, + "learning_rate": 1.270031922324546e-06, + "loss": 2.2286, + "step": 2000 + }, + { + "epoch": 0.7718418514946962, + "grad_norm": 0.45294857793904747, + "learning_rate": 1.2659357030909352e-06, + "loss": 2.3372, + "step": 2001 + }, + { + "epoch": 0.7722275795564127, + "grad_norm": 0.44175153237351367, + "learning_rate": 1.2618451426025657e-06, + "loss": 2.3348, + "step": 2002 + }, + { + "epoch": 0.7726133076181292, + "grad_norm": 0.4199874401953492, + "learning_rate": 1.2577602470584287e-06, + "loss": 2.2427, + "step": 2003 + }, + { + "epoch": 0.7729990356798457, + "grad_norm": 0.42520445167328846, + "learning_rate": 1.2536810226489354e-06, + "loss": 2.3594, + "step": 2004 + }, + { + "epoch": 0.7733847637415622, + "grad_norm": 0.396497578004788, + "learning_rate": 1.249607475555899e-06, + "loss": 2.2683, + "step": 2005 + }, + { + "epoch": 0.7737704918032787, + "grad_norm": 0.4298494247154455, + "learning_rate": 1.2455396119525288e-06, + "loss": 2.2133, + "step": 2006 + }, + { + "epoch": 0.7741562198649952, + "grad_norm": 0.5036877150521499, + "learning_rate": 1.2414774380034245e-06, + "loss": 2.2628, + "step": 2007 + }, + { + "epoch": 0.7745419479267117, + "grad_norm": 0.4205072735039371, + "learning_rate": 1.237420959864561e-06, + "loss": 2.3491, + "step": 2008 + }, + { + "epoch": 
0.7749276759884282, + "grad_norm": 0.43258037370084457, + "learning_rate": 1.2333701836832812e-06, + "loss": 2.3784, + "step": 2009 + }, + { + "epoch": 0.7753134040501446, + "grad_norm": 0.41445328672967857, + "learning_rate": 1.229325115598286e-06, + "loss": 2.2929, + "step": 2010 + }, + { + "epoch": 0.7756991321118611, + "grad_norm": 0.42352117823562174, + "learning_rate": 1.2252857617396318e-06, + "loss": 2.3134, + "step": 2011 + }, + { + "epoch": 0.7760848601735776, + "grad_norm": 0.4106272439078515, + "learning_rate": 1.2212521282287093e-06, + "loss": 2.3272, + "step": 2012 + }, + { + "epoch": 0.7764705882352941, + "grad_norm": 0.40093498606113714, + "learning_rate": 1.217224221178242e-06, + "loss": 2.2491, + "step": 2013 + }, + { + "epoch": 0.7768563162970106, + "grad_norm": 0.43709998605681294, + "learning_rate": 1.2132020466922767e-06, + "loss": 2.3315, + "step": 2014 + }, + { + "epoch": 0.7772420443587271, + "grad_norm": 0.4356135695882619, + "learning_rate": 1.2091856108661703e-06, + "loss": 2.3562, + "step": 2015 + }, + { + "epoch": 0.7776277724204436, + "grad_norm": 0.47240650389883526, + "learning_rate": 1.2051749197865875e-06, + "loss": 2.3961, + "step": 2016 + }, + { + "epoch": 0.7780135004821601, + "grad_norm": 0.4060173420056584, + "learning_rate": 1.2011699795314813e-06, + "loss": 2.3648, + "step": 2017 + }, + { + "epoch": 0.7783992285438766, + "grad_norm": 0.41026562263462857, + "learning_rate": 1.1971707961700962e-06, + "loss": 2.2899, + "step": 2018 + }, + { + "epoch": 0.7787849566055931, + "grad_norm": 0.42500261896680547, + "learning_rate": 1.1931773757629472e-06, + "loss": 2.365, + "step": 2019 + }, + { + "epoch": 0.7791706846673095, + "grad_norm": 0.4110188581895219, + "learning_rate": 1.1891897243618184e-06, + "loss": 2.3169, + "step": 2020 + }, + { + "epoch": 0.779556412729026, + "grad_norm": 0.4412302630768088, + "learning_rate": 1.1852078480097502e-06, + "loss": 2.3269, + "step": 2021 + }, + { + "epoch": 0.7799421407907425, + "grad_norm": 0.42791139175603443, + "learning_rate": 1.1812317527410316e-06, + "loss": 2.4182, + "step": 2022 + }, + { + "epoch": 0.780327868852459, + "grad_norm": 0.43155330381976453, + "learning_rate": 1.1772614445811902e-06, + "loss": 2.3056, + "step": 2023 + }, + { + "epoch": 0.7807135969141755, + "grad_norm": 0.4248080667352216, + "learning_rate": 1.173296929546987e-06, + "loss": 2.3511, + "step": 2024 + }, + { + "epoch": 0.781099324975892, + "grad_norm": 0.47188007333530674, + "learning_rate": 1.1693382136463981e-06, + "loss": 2.3643, + "step": 2025 + }, + { + "epoch": 0.7814850530376085, + "grad_norm": 0.4554633115058257, + "learning_rate": 1.1653853028786177e-06, + "loss": 2.319, + "step": 2026 + }, + { + "epoch": 0.781870781099325, + "grad_norm": 0.41320717435656057, + "learning_rate": 1.161438203234037e-06, + "loss": 2.324, + "step": 2027 + }, + { + "epoch": 0.7822565091610415, + "grad_norm": 0.44992969789800136, + "learning_rate": 1.1574969206942443e-06, + "loss": 2.3228, + "step": 2028 + }, + { + "epoch": 0.7826422372227579, + "grad_norm": 0.42752178483527925, + "learning_rate": 1.15356146123201e-06, + "loss": 2.3232, + "step": 2029 + }, + { + "epoch": 0.7830279652844745, + "grad_norm": 0.42090346482240726, + "learning_rate": 1.149631830811283e-06, + "loss": 2.3241, + "step": 2030 + }, + { + "epoch": 0.7834136933461909, + "grad_norm": 0.40632873911064693, + "learning_rate": 1.145708035387177e-06, + "loss": 2.3086, + "step": 2031 + }, + { + "epoch": 0.7837994214079074, + "grad_norm": 0.43403847555527236, + "learning_rate": 
1.1417900809059623e-06, + "loss": 2.3284, + "step": 2032 + }, + { + "epoch": 0.7841851494696239, + "grad_norm": 0.45825087452891977, + "learning_rate": 1.1378779733050583e-06, + "loss": 2.2471, + "step": 2033 + }, + { + "epoch": 0.7845708775313404, + "grad_norm": 0.44605854512315274, + "learning_rate": 1.1339717185130228e-06, + "loss": 2.3399, + "step": 2034 + }, + { + "epoch": 0.7849566055930569, + "grad_norm": 0.445422337924896, + "learning_rate": 1.1300713224495485e-06, + "loss": 2.3489, + "step": 2035 + }, + { + "epoch": 0.7853423336547734, + "grad_norm": 0.42565645651135264, + "learning_rate": 1.1261767910254422e-06, + "loss": 2.2588, + "step": 2036 + }, + { + "epoch": 0.7857280617164899, + "grad_norm": 0.4181093795070767, + "learning_rate": 1.1222881301426314e-06, + "loss": 2.3327, + "step": 2037 + }, + { + "epoch": 0.7861137897782063, + "grad_norm": 0.42210516058715325, + "learning_rate": 1.1184053456941407e-06, + "loss": 2.2581, + "step": 2038 + }, + { + "epoch": 0.7864995178399229, + "grad_norm": 0.40726524748024123, + "learning_rate": 1.1145284435640918e-06, + "loss": 2.3303, + "step": 2039 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 0.42613476920779403, + "learning_rate": 1.1106574296276923e-06, + "loss": 2.3447, + "step": 2040 + }, + { + "epoch": 0.7872709739633559, + "grad_norm": 0.4229952696715132, + "learning_rate": 1.1067923097512256e-06, + "loss": 2.3759, + "step": 2041 + }, + { + "epoch": 0.7876567020250723, + "grad_norm": 0.3964771073110038, + "learning_rate": 1.102933089792042e-06, + "loss": 2.3604, + "step": 2042 + }, + { + "epoch": 0.7880424300867888, + "grad_norm": 0.39912612226497574, + "learning_rate": 1.0990797755985567e-06, + "loss": 2.2094, + "step": 2043 + }, + { + "epoch": 0.7884281581485053, + "grad_norm": 0.41715700699022745, + "learning_rate": 1.095232373010226e-06, + "loss": 2.3472, + "step": 2044 + }, + { + "epoch": 0.7888138862102217, + "grad_norm": 0.4661905801110712, + "learning_rate": 1.0913908878575568e-06, + "loss": 2.372, + "step": 2045 + }, + { + "epoch": 0.7891996142719383, + "grad_norm": 0.4475875561296712, + "learning_rate": 1.0875553259620825e-06, + "loss": 2.3541, + "step": 2046 + }, + { + "epoch": 0.7895853423336547, + "grad_norm": 0.4137122903983067, + "learning_rate": 1.0837256931363605e-06, + "loss": 2.3271, + "step": 2047 + }, + { + "epoch": 0.7899710703953713, + "grad_norm": 0.43475606513803694, + "learning_rate": 1.0799019951839656e-06, + "loss": 2.3564, + "step": 2048 + }, + { + "epoch": 0.7903567984570877, + "grad_norm": 0.42735929912744547, + "learning_rate": 1.0760842378994758e-06, + "loss": 2.2648, + "step": 2049 + }, + { + "epoch": 0.7907425265188043, + "grad_norm": 0.4054186470588228, + "learning_rate": 1.0722724270684698e-06, + "loss": 2.2956, + "step": 2050 + }, + { + "epoch": 0.7911282545805207, + "grad_norm": 0.4164187575257658, + "learning_rate": 1.068466568467512e-06, + "loss": 2.3812, + "step": 2051 + }, + { + "epoch": 0.7915139826422373, + "grad_norm": 0.4209567437960228, + "learning_rate": 1.0646666678641477e-06, + "loss": 2.3218, + "step": 2052 + }, + { + "epoch": 0.7918997107039537, + "grad_norm": 0.4200255182068855, + "learning_rate": 1.0608727310168921e-06, + "loss": 2.2764, + "step": 2053 + }, + { + "epoch": 0.7922854387656703, + "grad_norm": 0.4555452108402252, + "learning_rate": 1.0570847636752251e-06, + "loss": 2.297, + "step": 2054 + }, + { + "epoch": 0.7926711668273867, + "grad_norm": 0.43866423150867556, + "learning_rate": 1.0533027715795784e-06, + "loss": 2.3602, + "step": 2055 + }, + { + "epoch": 
0.7930568948891031, + "grad_norm": 0.4040195252185418, + "learning_rate": 1.0495267604613273e-06, + "loss": 2.3524, + "step": 2056 + }, + { + "epoch": 0.7934426229508197, + "grad_norm": 0.41119784078419164, + "learning_rate": 1.0457567360427872e-06, + "loss": 2.3815, + "step": 2057 + }, + { + "epoch": 0.7938283510125361, + "grad_norm": 0.4082097704919852, + "learning_rate": 1.041992704037198e-06, + "loss": 2.2881, + "step": 2058 + }, + { + "epoch": 0.7942140790742527, + "grad_norm": 0.44778947055140056, + "learning_rate": 1.0382346701487183e-06, + "loss": 2.4268, + "step": 2059 + }, + { + "epoch": 0.7945998071359691, + "grad_norm": 0.41670178757828186, + "learning_rate": 1.0344826400724185e-06, + "loss": 2.2601, + "step": 2060 + }, + { + "epoch": 0.7949855351976857, + "grad_norm": 0.4050164213793503, + "learning_rate": 1.030736619494268e-06, + "loss": 2.2754, + "step": 2061 + }, + { + "epoch": 0.7953712632594021, + "grad_norm": 0.41930209822290115, + "learning_rate": 1.0269966140911343e-06, + "loss": 2.3653, + "step": 2062 + }, + { + "epoch": 0.7957569913211187, + "grad_norm": 0.43621583234469957, + "learning_rate": 1.023262629530763e-06, + "loss": 2.2771, + "step": 2063 + }, + { + "epoch": 0.7961427193828351, + "grad_norm": 0.44674323122071313, + "learning_rate": 1.0195346714717813e-06, + "loss": 2.429, + "step": 2064 + }, + { + "epoch": 0.7965284474445516, + "grad_norm": 0.4344073799130207, + "learning_rate": 1.015812745563679e-06, + "loss": 2.2308, + "step": 2065 + }, + { + "epoch": 0.7969141755062681, + "grad_norm": 0.4339314134771839, + "learning_rate": 1.012096857446807e-06, + "loss": 2.3263, + "step": 2066 + }, + { + "epoch": 0.7972999035679845, + "grad_norm": 0.41138510874721523, + "learning_rate": 1.0083870127523659e-06, + "loss": 2.2738, + "step": 2067 + }, + { + "epoch": 0.7976856316297011, + "grad_norm": 0.44137290067576496, + "learning_rate": 1.0046832171023952e-06, + "loss": 2.4554, + "step": 2068 + }, + { + "epoch": 0.7980713596914175, + "grad_norm": 0.4232345697274198, + "learning_rate": 1.0009854761097736e-06, + "loss": 2.3486, + "step": 2069 + }, + { + "epoch": 0.7984570877531341, + "grad_norm": 0.42589530418445515, + "learning_rate": 9.972937953781985e-07, + "loss": 2.2941, + "step": 2070 + }, + { + "epoch": 0.7988428158148505, + "grad_norm": 0.4431955037608249, + "learning_rate": 9.936081805021859e-07, + "loss": 2.2909, + "step": 2071 + }, + { + "epoch": 0.799228543876567, + "grad_norm": 0.41161461847207664, + "learning_rate": 9.899286370670575e-07, + "loss": 2.3233, + "step": 2072 + }, + { + "epoch": 0.7996142719382835, + "grad_norm": 0.43207424360876473, + "learning_rate": 9.862551706489382e-07, + "loss": 2.2991, + "step": 2073 + }, + { + "epoch": 0.8, + "grad_norm": 0.4216834706882256, + "learning_rate": 9.825877868147393e-07, + "loss": 2.3383, + "step": 2074 + }, + { + "epoch": 0.8003857280617165, + "grad_norm": 0.3959572897055696, + "learning_rate": 9.789264911221546e-07, + "loss": 2.2559, + "step": 2075 + }, + { + "epoch": 0.800771456123433, + "grad_norm": 0.4385171900226881, + "learning_rate": 9.752712891196558e-07, + "loss": 2.3079, + "step": 2076 + }, + { + "epoch": 0.8011571841851495, + "grad_norm": 0.452752952242241, + "learning_rate": 9.716221863464764e-07, + "loss": 2.3363, + "step": 2077 + }, + { + "epoch": 0.8015429122468659, + "grad_norm": 0.4393729464245923, + "learning_rate": 9.679791883326067e-07, + "loss": 2.2671, + "step": 2078 + }, + { + "epoch": 0.8019286403085825, + "grad_norm": 0.4390517173709909, + "learning_rate": 9.643423005987868e-07, + 
"loss": 2.2943, + "step": 2079 + }, + { + "epoch": 0.8023143683702989, + "grad_norm": 0.45152023580011974, + "learning_rate": 9.607115286564972e-07, + "loss": 2.3131, + "step": 2080 + }, + { + "epoch": 0.8027000964320155, + "grad_norm": 0.41921933819791163, + "learning_rate": 9.570868780079485e-07, + "loss": 2.2806, + "step": 2081 + }, + { + "epoch": 0.8030858244937319, + "grad_norm": 0.4239070139144562, + "learning_rate": 9.534683541460771e-07, + "loss": 2.2803, + "step": 2082 + }, + { + "epoch": 0.8034715525554484, + "grad_norm": 0.42556425223869837, + "learning_rate": 9.498559625545362e-07, + "loss": 2.3287, + "step": 2083 + }, + { + "epoch": 0.8038572806171649, + "grad_norm": 0.42612821228367675, + "learning_rate": 9.46249708707681e-07, + "loss": 2.3217, + "step": 2084 + }, + { + "epoch": 0.8042430086788814, + "grad_norm": 0.4124648062571102, + "learning_rate": 9.426495980705685e-07, + "loss": 2.3432, + "step": 2085 + }, + { + "epoch": 0.8046287367405979, + "grad_norm": 0.39315266190910947, + "learning_rate": 9.39055636098945e-07, + "loss": 2.3182, + "step": 2086 + }, + { + "epoch": 0.8050144648023144, + "grad_norm": 0.40497942402672604, + "learning_rate": 9.354678282392399e-07, + "loss": 2.3232, + "step": 2087 + }, + { + "epoch": 0.8054001928640309, + "grad_norm": 0.43575039141039523, + "learning_rate": 9.318861799285539e-07, + "loss": 2.2737, + "step": 2088 + }, + { + "epoch": 0.8057859209257473, + "grad_norm": 0.4195391658604251, + "learning_rate": 9.283106965946581e-07, + "loss": 2.3175, + "step": 2089 + }, + { + "epoch": 0.8061716489874639, + "grad_norm": 0.42360172858140893, + "learning_rate": 9.247413836559765e-07, + "loss": 2.254, + "step": 2090 + }, + { + "epoch": 0.8065573770491803, + "grad_norm": 0.4252842232896849, + "learning_rate": 9.211782465215829e-07, + "loss": 2.3785, + "step": 2091 + }, + { + "epoch": 0.8069431051108968, + "grad_norm": 0.4354684092231028, + "learning_rate": 9.176212905911946e-07, + "loss": 2.3159, + "step": 2092 + }, + { + "epoch": 0.8073288331726133, + "grad_norm": 0.4120198398311047, + "learning_rate": 9.140705212551599e-07, + "loss": 2.3067, + "step": 2093 + }, + { + "epoch": 0.8077145612343298, + "grad_norm": 0.4062676518671219, + "learning_rate": 9.105259438944508e-07, + "loss": 2.3309, + "step": 2094 + }, + { + "epoch": 0.8081002892960463, + "grad_norm": 0.4218432803347716, + "learning_rate": 9.069875638806558e-07, + "loss": 2.3381, + "step": 2095 + }, + { + "epoch": 0.8084860173577628, + "grad_norm": 0.44741866515440026, + "learning_rate": 9.034553865759754e-07, + "loss": 2.2959, + "step": 2096 + }, + { + "epoch": 0.8088717454194793, + "grad_norm": 0.42302813421063773, + "learning_rate": 8.999294173332058e-07, + "loss": 2.258, + "step": 2097 + }, + { + "epoch": 0.8092574734811958, + "grad_norm": 0.40136791403407035, + "learning_rate": 8.964096614957374e-07, + "loss": 2.3537, + "step": 2098 + }, + { + "epoch": 0.8096432015429122, + "grad_norm": 0.41843971603799174, + "learning_rate": 8.928961243975437e-07, + "loss": 2.3205, + "step": 2099 + }, + { + "epoch": 0.8100289296046287, + "grad_norm": 0.41636229509726935, + "learning_rate": 8.893888113631732e-07, + "loss": 2.3583, + "step": 2100 + }, + { + "epoch": 0.8104146576663452, + "grad_norm": 0.41726692500260676, + "learning_rate": 8.858877277077455e-07, + "loss": 2.3512, + "step": 2101 + }, + { + "epoch": 0.8108003857280617, + "grad_norm": 0.41826271820553335, + "learning_rate": 8.823928787369379e-07, + "loss": 2.2571, + "step": 2102 + }, + { + "epoch": 0.8111861137897782, + "grad_norm": 
0.4342375756262929, + "learning_rate": 8.789042697469796e-07, + "loss": 2.3969, + "step": 2103 + }, + { + "epoch": 0.8115718418514947, + "grad_norm": 0.4768657159728018, + "learning_rate": 8.754219060246432e-07, + "loss": 2.2958, + "step": 2104 + }, + { + "epoch": 0.8119575699132112, + "grad_norm": 0.4123373053535353, + "learning_rate": 8.719457928472364e-07, + "loss": 2.264, + "step": 2105 + }, + { + "epoch": 0.8123432979749277, + "grad_norm": 0.44858485725351716, + "learning_rate": 8.684759354825962e-07, + "loss": 2.2995, + "step": 2106 + }, + { + "epoch": 0.8127290260366442, + "grad_norm": 0.41609733353522543, + "learning_rate": 8.650123391890763e-07, + "loss": 2.3769, + "step": 2107 + }, + { + "epoch": 0.8131147540983606, + "grad_norm": 0.40205999326820274, + "learning_rate": 8.615550092155478e-07, + "loss": 2.3102, + "step": 2108 + }, + { + "epoch": 0.8135004821600772, + "grad_norm": 0.4545386030204492, + "learning_rate": 8.581039508013788e-07, + "loss": 2.2828, + "step": 2109 + }, + { + "epoch": 0.8138862102217936, + "grad_norm": 0.40903450628581595, + "learning_rate": 8.546591691764388e-07, + "loss": 2.3142, + "step": 2110 + }, + { + "epoch": 0.8142719382835101, + "grad_norm": 0.43073974568345547, + "learning_rate": 8.512206695610825e-07, + "loss": 2.3096, + "step": 2111 + }, + { + "epoch": 0.8146576663452266, + "grad_norm": 0.4310644884162425, + "learning_rate": 8.477884571661449e-07, + "loss": 2.3036, + "step": 2112 + }, + { + "epoch": 0.8150433944069431, + "grad_norm": 0.43803695575052093, + "learning_rate": 8.443625371929326e-07, + "loss": 2.3054, + "step": 2113 + }, + { + "epoch": 0.8154291224686596, + "grad_norm": 0.41544065559345766, + "learning_rate": 8.40942914833216e-07, + "loss": 2.3236, + "step": 2114 + }, + { + "epoch": 0.8158148505303761, + "grad_norm": 0.42351460560781906, + "learning_rate": 8.375295952692258e-07, + "loss": 2.383, + "step": 2115 + }, + { + "epoch": 0.8162005785920926, + "grad_norm": 0.4622928943201081, + "learning_rate": 8.341225836736367e-07, + "loss": 2.3541, + "step": 2116 + }, + { + "epoch": 0.816586306653809, + "grad_norm": 0.3875682392366773, + "learning_rate": 8.30721885209565e-07, + "loss": 2.26, + "step": 2117 + }, + { + "epoch": 0.8169720347155256, + "grad_norm": 0.4227521615438232, + "learning_rate": 8.273275050305618e-07, + "loss": 2.2635, + "step": 2118 + }, + { + "epoch": 0.817357762777242, + "grad_norm": 0.42449931643301214, + "learning_rate": 8.239394482805996e-07, + "loss": 2.3466, + "step": 2119 + }, + { + "epoch": 0.8177434908389586, + "grad_norm": 0.4285021268387793, + "learning_rate": 8.20557720094074e-07, + "loss": 2.2994, + "step": 2120 + }, + { + "epoch": 0.818129218900675, + "grad_norm": 0.43324914742483134, + "learning_rate": 8.171823255957828e-07, + "loss": 2.3021, + "step": 2121 + }, + { + "epoch": 0.8185149469623915, + "grad_norm": 0.43250542249302915, + "learning_rate": 8.138132699009321e-07, + "loss": 2.3227, + "step": 2122 + }, + { + "epoch": 0.818900675024108, + "grad_norm": 0.4718675403886274, + "learning_rate": 8.104505581151184e-07, + "loss": 2.3697, + "step": 2123 + }, + { + "epoch": 0.8192864030858245, + "grad_norm": 0.4216607077105276, + "learning_rate": 8.070941953343242e-07, + "loss": 2.3251, + "step": 2124 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 0.40915512928375525, + "learning_rate": 8.037441866449114e-07, + "loss": 2.3603, + "step": 2125 + }, + { + "epoch": 0.8200578592092574, + "grad_norm": 0.46439779886388954, + "learning_rate": 8.004005371236128e-07, + "loss": 2.2923, + "step": 2126 + }, + { 
+ "epoch": 0.820443587270974, + "grad_norm": 0.4024444417990968, + "learning_rate": 7.970632518375232e-07, + "loss": 2.3659, + "step": 2127 + }, + { + "epoch": 0.8208293153326904, + "grad_norm": 0.44130633810097636, + "learning_rate": 7.937323358440935e-07, + "loss": 2.3072, + "step": 2128 + }, + { + "epoch": 0.821215043394407, + "grad_norm": 0.41030222383464565, + "learning_rate": 7.904077941911248e-07, + "loss": 2.2652, + "step": 2129 + }, + { + "epoch": 0.8216007714561234, + "grad_norm": 0.4239855735926553, + "learning_rate": 7.870896319167548e-07, + "loss": 2.2922, + "step": 2130 + }, + { + "epoch": 0.82198649951784, + "grad_norm": 0.4264318179022017, + "learning_rate": 7.83777854049454e-07, + "loss": 2.461, + "step": 2131 + }, + { + "epoch": 0.8223722275795564, + "grad_norm": 0.39931907861667765, + "learning_rate": 7.804724656080182e-07, + "loss": 2.2963, + "step": 2132 + }, + { + "epoch": 0.8227579556412729, + "grad_norm": 0.44069287415695574, + "learning_rate": 7.771734716015611e-07, + "loss": 2.3215, + "step": 2133 + }, + { + "epoch": 0.8231436837029894, + "grad_norm": 0.4764007174469988, + "learning_rate": 7.738808770295064e-07, + "loss": 2.3178, + "step": 2134 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.40896327860723114, + "learning_rate": 7.705946868815783e-07, + "loss": 2.3397, + "step": 2135 + }, + { + "epoch": 0.8239151398264224, + "grad_norm": 0.4133582523462277, + "learning_rate": 7.673149061377966e-07, + "loss": 2.295, + "step": 2136 + }, + { + "epoch": 0.8243008678881388, + "grad_norm": 0.44252932709212506, + "learning_rate": 7.64041539768467e-07, + "loss": 2.3135, + "step": 2137 + }, + { + "epoch": 0.8246865959498554, + "grad_norm": 0.4315479104904479, + "learning_rate": 7.607745927341764e-07, + "loss": 2.364, + "step": 2138 + }, + { + "epoch": 0.8250723240115718, + "grad_norm": 0.4230358525826942, + "learning_rate": 7.575140699857819e-07, + "loss": 2.2812, + "step": 2139 + }, + { + "epoch": 0.8254580520732884, + "grad_norm": 0.4407423704703889, + "learning_rate": 7.542599764644049e-07, + "loss": 2.2873, + "step": 2140 + }, + { + "epoch": 0.8258437801350048, + "grad_norm": 0.4315046975539552, + "learning_rate": 7.510123171014255e-07, + "loss": 2.2976, + "step": 2141 + }, + { + "epoch": 0.8262295081967214, + "grad_norm": 0.440416523656838, + "learning_rate": 7.477710968184726e-07, + "loss": 2.3353, + "step": 2142 + }, + { + "epoch": 0.8266152362584378, + "grad_norm": 0.4283525113622728, + "learning_rate": 7.445363205274153e-07, + "loss": 2.3893, + "step": 2143 + }, + { + "epoch": 0.8270009643201542, + "grad_norm": 0.42482838905355197, + "learning_rate": 7.413079931303591e-07, + "loss": 2.3279, + "step": 2144 + }, + { + "epoch": 0.8273866923818708, + "grad_norm": 0.42004246146651786, + "learning_rate": 7.380861195196359e-07, + "loss": 2.2928, + "step": 2145 + }, + { + "epoch": 0.8277724204435872, + "grad_norm": 0.4646102318061459, + "learning_rate": 7.348707045777959e-07, + "loss": 2.3576, + "step": 2146 + }, + { + "epoch": 0.8281581485053038, + "grad_norm": 0.43910211059318865, + "learning_rate": 7.316617531776049e-07, + "loss": 2.36, + "step": 2147 + }, + { + "epoch": 0.8285438765670202, + "grad_norm": 0.4473140935403787, + "learning_rate": 7.284592701820325e-07, + "loss": 2.2544, + "step": 2148 + }, + { + "epoch": 0.8289296046287368, + "grad_norm": 0.4477860585093107, + "learning_rate": 7.252632604442439e-07, + "loss": 2.2522, + "step": 2149 + }, + { + "epoch": 0.8293153326904532, + "grad_norm": 0.4139567407677305, + "learning_rate": 7.220737288075958e-07, 
+ "loss": 2.3147, + "step": 2150 + }, + { + "epoch": 0.8297010607521698, + "grad_norm": 0.4053269153531082, + "learning_rate": 7.188906801056277e-07, + "loss": 2.2956, + "step": 2151 + }, + { + "epoch": 0.8300867888138862, + "grad_norm": 0.4441494340410561, + "learning_rate": 7.157141191620548e-07, + "loss": 2.3576, + "step": 2152 + }, + { + "epoch": 0.8304725168756028, + "grad_norm": 0.40325041084116087, + "learning_rate": 7.125440507907583e-07, + "loss": 2.3164, + "step": 2153 + }, + { + "epoch": 0.8308582449373192, + "grad_norm": 0.40994622003056547, + "learning_rate": 7.093804797957849e-07, + "loss": 2.2081, + "step": 2154 + }, + { + "epoch": 0.8312439729990356, + "grad_norm": 0.4436215541711604, + "learning_rate": 7.062234109713318e-07, + "loss": 2.3287, + "step": 2155 + }, + { + "epoch": 0.8316297010607522, + "grad_norm": 0.41634391743990945, + "learning_rate": 7.030728491017408e-07, + "loss": 2.2857, + "step": 2156 + }, + { + "epoch": 0.8320154291224686, + "grad_norm": 0.40361544747904327, + "learning_rate": 6.999287989614972e-07, + "loss": 2.377, + "step": 2157 + }, + { + "epoch": 0.8324011571841852, + "grad_norm": 0.40753496942994805, + "learning_rate": 6.967912653152164e-07, + "loss": 2.2995, + "step": 2158 + }, + { + "epoch": 0.8327868852459016, + "grad_norm": 0.42439267943337, + "learning_rate": 6.936602529176367e-07, + "loss": 2.3756, + "step": 2159 + }, + { + "epoch": 0.8331726133076182, + "grad_norm": 0.4113830283805619, + "learning_rate": 6.905357665136142e-07, + "loss": 2.245, + "step": 2160 + }, + { + "epoch": 0.8335583413693346, + "grad_norm": 0.42612328379407305, + "learning_rate": 6.874178108381191e-07, + "loss": 2.3246, + "step": 2161 + }, + { + "epoch": 0.8339440694310511, + "grad_norm": 0.46288626517979153, + "learning_rate": 6.8430639061622e-07, + "loss": 2.2819, + "step": 2162 + }, + { + "epoch": 0.8343297974927676, + "grad_norm": 0.41650229120385046, + "learning_rate": 6.812015105630842e-07, + "loss": 2.412, + "step": 2163 + }, + { + "epoch": 0.8347155255544841, + "grad_norm": 0.4217207294993816, + "learning_rate": 6.781031753839662e-07, + "loss": 2.2941, + "step": 2164 + }, + { + "epoch": 0.8351012536162006, + "grad_norm": 0.4456538316970643, + "learning_rate": 6.750113897742017e-07, + "loss": 2.2193, + "step": 2165 + }, + { + "epoch": 0.835486981677917, + "grad_norm": 0.41758648776543356, + "learning_rate": 6.719261584192038e-07, + "loss": 2.2988, + "step": 2166 + }, + { + "epoch": 0.8358727097396336, + "grad_norm": 0.4186724447342165, + "learning_rate": 6.6884748599445e-07, + "loss": 2.3763, + "step": 2167 + }, + { + "epoch": 0.83625843780135, + "grad_norm": 0.4092927778532757, + "learning_rate": 6.657753771654812e-07, + "loss": 2.2815, + "step": 2168 + }, + { + "epoch": 0.8366441658630666, + "grad_norm": 0.41925614674502665, + "learning_rate": 6.627098365878886e-07, + "loss": 2.3579, + "step": 2169 + }, + { + "epoch": 0.837029893924783, + "grad_norm": 0.4447516307021346, + "learning_rate": 6.596508689073105e-07, + "loss": 2.3064, + "step": 2170 + }, + { + "epoch": 0.8374156219864995, + "grad_norm": 0.4357105661743486, + "learning_rate": 6.565984787594248e-07, + "loss": 2.2484, + "step": 2171 + }, + { + "epoch": 0.837801350048216, + "grad_norm": 0.4454408567281427, + "learning_rate": 6.535526707699408e-07, + "loss": 2.3461, + "step": 2172 + }, + { + "epoch": 0.8381870781099325, + "grad_norm": 0.421682585445223, + "learning_rate": 6.505134495545951e-07, + "loss": 2.364, + "step": 2173 + }, + { + "epoch": 0.838572806171649, + "grad_norm": 0.44971672903581555, + 
"learning_rate": 6.474808197191401e-07, + "loss": 2.3122, + "step": 2174 + }, + { + "epoch": 0.8389585342333655, + "grad_norm": 0.4430760452217771, + "learning_rate": 6.444547858593392e-07, + "loss": 2.3761, + "step": 2175 + }, + { + "epoch": 0.839344262295082, + "grad_norm": 0.4320460359114813, + "learning_rate": 6.414353525609628e-07, + "loss": 2.3304, + "step": 2176 + }, + { + "epoch": 0.8397299903567984, + "grad_norm": 0.40090412609242976, + "learning_rate": 6.384225243997765e-07, + "loss": 2.3369, + "step": 2177 + }, + { + "epoch": 0.840115718418515, + "grad_norm": 0.4307273849934172, + "learning_rate": 6.354163059415353e-07, + "loss": 2.3445, + "step": 2178 + }, + { + "epoch": 0.8405014464802314, + "grad_norm": 0.4314592802294269, + "learning_rate": 6.32416701741978e-07, + "loss": 2.2853, + "step": 2179 + }, + { + "epoch": 0.840887174541948, + "grad_norm": 0.40742818362166616, + "learning_rate": 6.294237163468231e-07, + "loss": 2.3268, + "step": 2180 + }, + { + "epoch": 0.8412729026036644, + "grad_norm": 0.46955868732988343, + "learning_rate": 6.264373542917551e-07, + "loss": 2.3619, + "step": 2181 + }, + { + "epoch": 0.8416586306653809, + "grad_norm": 0.42557126770219617, + "learning_rate": 6.234576201024223e-07, + "loss": 2.3603, + "step": 2182 + }, + { + "epoch": 0.8420443587270974, + "grad_norm": 0.42459882470639426, + "learning_rate": 6.204845182944292e-07, + "loss": 2.3542, + "step": 2183 + }, + { + "epoch": 0.8424300867888139, + "grad_norm": 0.4319083557644331, + "learning_rate": 6.175180533733277e-07, + "loss": 2.3457, + "step": 2184 + }, + { + "epoch": 0.8428158148505304, + "grad_norm": 0.40950629424784474, + "learning_rate": 6.145582298346153e-07, + "loss": 2.3043, + "step": 2185 + }, + { + "epoch": 0.8432015429122469, + "grad_norm": 0.40940256778256134, + "learning_rate": 6.116050521637218e-07, + "loss": 2.3052, + "step": 2186 + }, + { + "epoch": 0.8435872709739634, + "grad_norm": 0.46231089574854295, + "learning_rate": 6.086585248360072e-07, + "loss": 2.2459, + "step": 2187 + }, + { + "epoch": 0.8439729990356798, + "grad_norm": 0.43997580961979005, + "learning_rate": 6.057186523167529e-07, + "loss": 2.4005, + "step": 2188 + }, + { + "epoch": 0.8443587270973963, + "grad_norm": 0.4443737132363861, + "learning_rate": 6.027854390611548e-07, + "loss": 2.3286, + "step": 2189 + }, + { + "epoch": 0.8447444551591128, + "grad_norm": 0.45487195535732927, + "learning_rate": 5.998588895143181e-07, + "loss": 2.2987, + "step": 2190 + }, + { + "epoch": 0.8451301832208293, + "grad_norm": 0.4104730105003813, + "learning_rate": 5.96939008111248e-07, + "loss": 2.3644, + "step": 2191 + }, + { + "epoch": 0.8455159112825458, + "grad_norm": 0.3998158167331894, + "learning_rate": 5.940257992768456e-07, + "loss": 2.3193, + "step": 2192 + }, + { + "epoch": 0.8459016393442623, + "grad_norm": 0.44317647366113244, + "learning_rate": 5.911192674259015e-07, + "loss": 2.3473, + "step": 2193 + }, + { + "epoch": 0.8462873674059788, + "grad_norm": 0.40494619795811615, + "learning_rate": 5.882194169630845e-07, + "loss": 2.2956, + "step": 2194 + }, + { + "epoch": 0.8466730954676953, + "grad_norm": 0.42962309814899585, + "learning_rate": 5.853262522829417e-07, + "loss": 2.3451, + "step": 2195 + }, + { + "epoch": 0.8470588235294118, + "grad_norm": 0.4168087040989416, + "learning_rate": 5.824397777698859e-07, + "loss": 2.2744, + "step": 2196 + }, + { + "epoch": 0.8474445515911283, + "grad_norm": 0.4589074894558346, + "learning_rate": 5.795599977981914e-07, + "loss": 2.286, + "step": 2197 + }, + { + "epoch": 
0.8478302796528447, + "grad_norm": 0.4270457108660484, + "learning_rate": 5.766869167319893e-07, + "loss": 2.2972, + "step": 2198 + }, + { + "epoch": 0.8482160077145612, + "grad_norm": 0.41266450983386743, + "learning_rate": 5.738205389252555e-07, + "loss": 2.2554, + "step": 2199 + }, + { + "epoch": 0.8486017357762777, + "grad_norm": 0.4257329026347416, + "learning_rate": 5.709608687218116e-07, + "loss": 2.3224, + "step": 2200 + }, + { + "epoch": 0.8489874638379942, + "grad_norm": 0.44714776911423726, + "learning_rate": 5.681079104553122e-07, + "loss": 2.3076, + "step": 2201 + }, + { + "epoch": 0.8493731918997107, + "grad_norm": 0.4411737147369575, + "learning_rate": 5.652616684492396e-07, + "loss": 2.3045, + "step": 2202 + }, + { + "epoch": 0.8497589199614272, + "grad_norm": 0.3930964058758994, + "learning_rate": 5.624221470168978e-07, + "loss": 2.3267, + "step": 2203 + }, + { + "epoch": 0.8501446480231437, + "grad_norm": 0.42756363784629275, + "learning_rate": 5.595893504614097e-07, + "loss": 2.2043, + "step": 2204 + }, + { + "epoch": 0.8505303760848602, + "grad_norm": 0.43391519530192535, + "learning_rate": 5.567632830757025e-07, + "loss": 2.2901, + "step": 2205 + }, + { + "epoch": 0.8509161041465767, + "grad_norm": 0.4307181250385132, + "learning_rate": 5.539439491425097e-07, + "loss": 2.3412, + "step": 2206 + }, + { + "epoch": 0.8513018322082931, + "grad_norm": 0.4112928115331509, + "learning_rate": 5.511313529343581e-07, + "loss": 2.2968, + "step": 2207 + }, + { + "epoch": 0.8516875602700097, + "grad_norm": 0.4083314717986259, + "learning_rate": 5.483254987135644e-07, + "loss": 2.3146, + "step": 2208 + }, + { + "epoch": 0.8520732883317261, + "grad_norm": 0.42163950916500303, + "learning_rate": 5.455263907322283e-07, + "loss": 2.3131, + "step": 2209 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 0.43750119275808214, + "learning_rate": 5.427340332322267e-07, + "loss": 2.3545, + "step": 2210 + }, + { + "epoch": 0.8528447444551591, + "grad_norm": 0.4227491273975129, + "learning_rate": 5.39948430445204e-07, + "loss": 2.3194, + "step": 2211 + }, + { + "epoch": 0.8532304725168756, + "grad_norm": 0.47559103478413123, + "learning_rate": 5.371695865925736e-07, + "loss": 2.3153, + "step": 2212 + }, + { + "epoch": 0.8536162005785921, + "grad_norm": 0.45948833709331677, + "learning_rate": 5.343975058854994e-07, + "loss": 2.3079, + "step": 2213 + }, + { + "epoch": 0.8540019286403085, + "grad_norm": 0.4213448822063752, + "learning_rate": 5.316321925249024e-07, + "loss": 2.1981, + "step": 2214 + }, + { + "epoch": 0.8543876567020251, + "grad_norm": 0.42214244984468713, + "learning_rate": 5.288736507014436e-07, + "loss": 2.3274, + "step": 2215 + }, + { + "epoch": 0.8547733847637415, + "grad_norm": 0.41575191369539755, + "learning_rate": 5.261218845955246e-07, + "loss": 2.3627, + "step": 2216 + }, + { + "epoch": 0.8551591128254581, + "grad_norm": 0.45330059585872606, + "learning_rate": 5.23376898377278e-07, + "loss": 2.3306, + "step": 2217 + }, + { + "epoch": 0.8555448408871745, + "grad_norm": 0.4033291703126306, + "learning_rate": 5.206386962065601e-07, + "loss": 2.4107, + "step": 2218 + }, + { + "epoch": 0.8559305689488911, + "grad_norm": 0.4298766207997605, + "learning_rate": 5.179072822329512e-07, + "loss": 2.3519, + "step": 2219 + }, + { + "epoch": 0.8563162970106075, + "grad_norm": 0.45446292904760033, + "learning_rate": 5.151826605957394e-07, + "loss": 2.3882, + "step": 2220 + }, + { + "epoch": 0.856702025072324, + "grad_norm": 0.43566691330033935, + "learning_rate": 
5.124648354239225e-07, + "loss": 2.3187, + "step": 2221 + }, + { + "epoch": 0.8570877531340405, + "grad_norm": 0.4330203954906256, + "learning_rate": 5.097538108361966e-07, + "loss": 2.3087, + "step": 2222 + }, + { + "epoch": 0.857473481195757, + "grad_norm": 0.41939583997928825, + "learning_rate": 5.070495909409551e-07, + "loss": 2.2939, + "step": 2223 + }, + { + "epoch": 0.8578592092574735, + "grad_norm": 0.4736228920931165, + "learning_rate": 5.043521798362755e-07, + "loss": 2.2919, + "step": 2224 + }, + { + "epoch": 0.8582449373191899, + "grad_norm": 0.43631394319711947, + "learning_rate": 5.016615816099185e-07, + "loss": 2.2537, + "step": 2225 + }, + { + "epoch": 0.8586306653809065, + "grad_norm": 0.40377370202039525, + "learning_rate": 4.98977800339322e-07, + "loss": 2.3361, + "step": 2226 + }, + { + "epoch": 0.8590163934426229, + "grad_norm": 0.42579008936762563, + "learning_rate": 4.963008400915914e-07, + "loss": 2.2608, + "step": 2227 + }, + { + "epoch": 0.8594021215043395, + "grad_norm": 0.4133094110710469, + "learning_rate": 4.936307049234956e-07, + "loss": 2.291, + "step": 2228 + }, + { + "epoch": 0.8597878495660559, + "grad_norm": 0.40409474331202133, + "learning_rate": 4.9096739888146e-07, + "loss": 2.3848, + "step": 2229 + }, + { + "epoch": 0.8601735776277725, + "grad_norm": 0.4167750604678949, + "learning_rate": 4.883109260015617e-07, + "loss": 2.3461, + "step": 2230 + }, + { + "epoch": 0.8605593056894889, + "grad_norm": 0.4138540442699837, + "learning_rate": 4.85661290309522e-07, + "loss": 2.2979, + "step": 2231 + }, + { + "epoch": 0.8609450337512053, + "grad_norm": 0.4011426011687145, + "learning_rate": 4.830184958207007e-07, + "loss": 2.3469, + "step": 2232 + }, + { + "epoch": 0.8613307618129219, + "grad_norm": 0.47545862045215365, + "learning_rate": 4.80382546540093e-07, + "loss": 2.3112, + "step": 2233 + }, + { + "epoch": 0.8617164898746383, + "grad_norm": 0.41395417512831273, + "learning_rate": 4.777534464623162e-07, + "loss": 2.3335, + "step": 2234 + }, + { + "epoch": 0.8621022179363549, + "grad_norm": 0.43386901834187463, + "learning_rate": 4.7513119957161124e-07, + "loss": 2.3024, + "step": 2235 + }, + { + "epoch": 0.8624879459980713, + "grad_norm": 0.44148333849080224, + "learning_rate": 4.725158098418309e-07, + "loss": 2.3392, + "step": 2236 + }, + { + "epoch": 0.8628736740597879, + "grad_norm": 0.40902133683113945, + "learning_rate": 4.69907281236438e-07, + "loss": 2.3232, + "step": 2237 + }, + { + "epoch": 0.8632594021215043, + "grad_norm": 0.4303489752981223, + "learning_rate": 4.673056177084989e-07, + "loss": 2.3198, + "step": 2238 + }, + { + "epoch": 0.8636451301832209, + "grad_norm": 0.4156111488028808, + "learning_rate": 4.647108232006742e-07, + "loss": 2.3342, + "step": 2239 + }, + { + "epoch": 0.8640308582449373, + "grad_norm": 0.39961453276398634, + "learning_rate": 4.6212290164521554e-07, + "loss": 2.3079, + "step": 2240 + }, + { + "epoch": 0.8644165863066539, + "grad_norm": 0.4141830467341266, + "learning_rate": 4.595418569639581e-07, + "loss": 2.2747, + "step": 2241 + }, + { + "epoch": 0.8648023143683703, + "grad_norm": 0.42352865639027676, + "learning_rate": 4.5696769306831923e-07, + "loss": 2.2992, + "step": 2242 + }, + { + "epoch": 0.8651880424300867, + "grad_norm": 0.39859439958207843, + "learning_rate": 4.5440041385928444e-07, + "loss": 2.3091, + "step": 2243 + }, + { + "epoch": 0.8655737704918033, + "grad_norm": 0.4484657124226204, + "learning_rate": 4.5184002322740784e-07, + "loss": 2.2968, + "step": 2244 + }, + { + "epoch": 
0.8659594985535197, + "grad_norm": 0.44190808883333643, + "learning_rate": 4.492865250528056e-07, + "loss": 2.3755, + "step": 2245 + }, + { + "epoch": 0.8663452266152363, + "grad_norm": 0.5020943728416454, + "learning_rate": 4.4673992320514617e-07, + "loss": 2.3072, + "step": 2246 + }, + { + "epoch": 0.8667309546769527, + "grad_norm": 0.4294192462710892, + "learning_rate": 4.4420022154364917e-07, + "loss": 2.3103, + "step": 2247 + }, + { + "epoch": 0.8671166827386693, + "grad_norm": 0.4203332570620652, + "learning_rate": 4.4166742391707593e-07, + "loss": 2.3168, + "step": 2248 + }, + { + "epoch": 0.8675024108003857, + "grad_norm": 0.4479748088729882, + "learning_rate": 4.391415341637262e-07, + "loss": 2.323, + "step": 2249 + }, + { + "epoch": 0.8678881388621023, + "grad_norm": 0.3912067407465435, + "learning_rate": 4.366225561114296e-07, + "loss": 2.3569, + "step": 2250 + }, + { + "epoch": 0.8682738669238187, + "grad_norm": 0.38938757566727544, + "learning_rate": 4.341104935775442e-07, + "loss": 2.322, + "step": 2251 + }, + { + "epoch": 0.8686595949855352, + "grad_norm": 0.4240750439778621, + "learning_rate": 4.316053503689466e-07, + "loss": 2.2382, + "step": 2252 + }, + { + "epoch": 0.8690453230472517, + "grad_norm": 0.4023975911484655, + "learning_rate": 4.291071302820271e-07, + "loss": 2.3566, + "step": 2253 + }, + { + "epoch": 0.8694310511089681, + "grad_norm": 0.42667166936274664, + "learning_rate": 4.2661583710268573e-07, + "loss": 2.2845, + "step": 2254 + }, + { + "epoch": 0.8698167791706847, + "grad_norm": 0.4160071160989676, + "learning_rate": 4.24131474606323e-07, + "loss": 2.327, + "step": 2255 + }, + { + "epoch": 0.8702025072324011, + "grad_norm": 0.4209819473631007, + "learning_rate": 4.2165404655783836e-07, + "loss": 2.3523, + "step": 2256 + }, + { + "epoch": 0.8705882352941177, + "grad_norm": 0.4440843418215145, + "learning_rate": 4.1918355671162145e-07, + "loss": 2.3495, + "step": 2257 + }, + { + "epoch": 0.8709739633558341, + "grad_norm": 0.41164003016046363, + "learning_rate": 4.1672000881154917e-07, + "loss": 2.3301, + "step": 2258 + }, + { + "epoch": 0.8713596914175507, + "grad_norm": 0.4469948251850552, + "learning_rate": 4.1426340659097565e-07, + "loss": 2.3737, + "step": 2259 + }, + { + "epoch": 0.8717454194792671, + "grad_norm": 0.3987525236231152, + "learning_rate": 4.1181375377273237e-07, + "loss": 2.3774, + "step": 2260 + }, + { + "epoch": 0.8721311475409836, + "grad_norm": 0.43640589294749244, + "learning_rate": 4.09371054069117e-07, + "loss": 2.2801, + "step": 2261 + }, + { + "epoch": 0.8725168756027001, + "grad_norm": 0.4329967109483763, + "learning_rate": 4.069353111818913e-07, + "loss": 2.3759, + "step": 2262 + }, + { + "epoch": 0.8729026036644166, + "grad_norm": 0.4204653148417705, + "learning_rate": 4.0450652880227426e-07, + "loss": 2.2806, + "step": 2263 + }, + { + "epoch": 0.8732883317261331, + "grad_norm": 0.46488748061246604, + "learning_rate": 4.020847106109349e-07, + "loss": 2.2773, + "step": 2264 + }, + { + "epoch": 0.8736740597878495, + "grad_norm": 0.4126593478938596, + "learning_rate": 3.996698602779919e-07, + "loss": 2.4213, + "step": 2265 + }, + { + "epoch": 0.8740597878495661, + "grad_norm": 0.41842747786828716, + "learning_rate": 3.9726198146300185e-07, + "loss": 2.3888, + "step": 2266 + }, + { + "epoch": 0.8744455159112825, + "grad_norm": 0.40822133226916885, + "learning_rate": 3.948610778149581e-07, + "loss": 2.3844, + "step": 2267 + }, + { + "epoch": 0.874831243972999, + "grad_norm": 0.4237314541249901, + "learning_rate": 
3.9246715297228176e-07, + "loss": 2.306, + "step": 2268 + }, + { + "epoch": 0.8752169720347155, + "grad_norm": 0.42912593943031246, + "learning_rate": 3.9008021056281875e-07, + "loss": 2.295, + "step": 2269 + }, + { + "epoch": 0.875602700096432, + "grad_norm": 0.4105781495982416, + "learning_rate": 3.877002542038355e-07, + "loss": 2.2801, + "step": 2270 + }, + { + "epoch": 0.8759884281581485, + "grad_norm": 0.39721896942295326, + "learning_rate": 3.8532728750200755e-07, + "loss": 2.3438, + "step": 2271 + }, + { + "epoch": 0.876374156219865, + "grad_norm": 0.41985251769629356, + "learning_rate": 3.829613140534222e-07, + "loss": 2.3071, + "step": 2272 + }, + { + "epoch": 0.8767598842815815, + "grad_norm": 0.44265077565168853, + "learning_rate": 3.8060233744356634e-07, + "loss": 2.2983, + "step": 2273 + }, + { + "epoch": 0.877145612343298, + "grad_norm": 0.418111678423349, + "learning_rate": 3.782503612473243e-07, + "loss": 2.3021, + "step": 2274 + }, + { + "epoch": 0.8775313404050145, + "grad_norm": 0.4250751660834272, + "learning_rate": 3.75905389028971e-07, + "loss": 2.3847, + "step": 2275 + }, + { + "epoch": 0.877917068466731, + "grad_norm": 0.41628911378730005, + "learning_rate": 3.7356742434216775e-07, + "loss": 2.2959, + "step": 2276 + }, + { + "epoch": 0.8783027965284474, + "grad_norm": 0.4187293548369921, + "learning_rate": 3.7123647072995816e-07, + "loss": 2.3418, + "step": 2277 + }, + { + "epoch": 0.8786885245901639, + "grad_norm": 0.43655945789184647, + "learning_rate": 3.689125317247572e-07, + "loss": 2.3148, + "step": 2278 + }, + { + "epoch": 0.8790742526518804, + "grad_norm": 0.43266462241172693, + "learning_rate": 3.665956108483543e-07, + "loss": 2.2817, + "step": 2279 + }, + { + "epoch": 0.8794599807135969, + "grad_norm": 0.42782770811801585, + "learning_rate": 3.642857116118986e-07, + "loss": 2.3549, + "step": 2280 + }, + { + "epoch": 0.8798457087753134, + "grad_norm": 0.4321683221918526, + "learning_rate": 3.619828375159018e-07, + "loss": 2.3334, + "step": 2281 + }, + { + "epoch": 0.8802314368370299, + "grad_norm": 0.4396171200268804, + "learning_rate": 3.5968699205022827e-07, + "loss": 2.2895, + "step": 2282 + }, + { + "epoch": 0.8806171648987464, + "grad_norm": 0.4467943875881692, + "learning_rate": 3.573981786940889e-07, + "loss": 2.2695, + "step": 2283 + }, + { + "epoch": 0.8810028929604629, + "grad_norm": 0.47767710110174755, + "learning_rate": 3.5511640091604293e-07, + "loss": 2.3356, + "step": 2284 + }, + { + "epoch": 0.8813886210221794, + "grad_norm": 0.41802680830792355, + "learning_rate": 3.5284166217398276e-07, + "loss": 2.279, + "step": 2285 + }, + { + "epoch": 0.8817743490838958, + "grad_norm": 0.438723155619965, + "learning_rate": 3.505739659151358e-07, + "loss": 2.2767, + "step": 2286 + }, + { + "epoch": 0.8821600771456124, + "grad_norm": 0.45784513304782515, + "learning_rate": 3.4831331557605597e-07, + "loss": 2.3033, + "step": 2287 + }, + { + "epoch": 0.8825458052073288, + "grad_norm": 0.40870966872961656, + "learning_rate": 3.4605971458262e-07, + "loss": 2.3508, + "step": 2288 + }, + { + "epoch": 0.8829315332690453, + "grad_norm": 0.4424833444084585, + "learning_rate": 3.4381316635002324e-07, + "loss": 2.3019, + "step": 2289 + }, + { + "epoch": 0.8833172613307618, + "grad_norm": 0.4292800347530429, + "learning_rate": 3.4157367428276966e-07, + "loss": 2.2879, + "step": 2290 + }, + { + "epoch": 0.8837029893924783, + "grad_norm": 0.4461628943725614, + "learning_rate": 3.3934124177467386e-07, + "loss": 2.311, + "step": 2291 + }, + { + "epoch": 
0.8840887174541948, + "grad_norm": 0.41792428163644263, + "learning_rate": 3.371158722088497e-07, + "loss": 2.3708, + "step": 2292 + }, + { + "epoch": 0.8844744455159113, + "grad_norm": 0.45556965914486197, + "learning_rate": 3.3489756895770773e-07, + "loss": 2.3435, + "step": 2293 + }, + { + "epoch": 0.8848601735776278, + "grad_norm": 0.42326123853106545, + "learning_rate": 3.326863353829507e-07, + "loss": 2.2452, + "step": 2294 + }, + { + "epoch": 0.8852459016393442, + "grad_norm": 0.43115329010814496, + "learning_rate": 3.3048217483556743e-07, + "loss": 2.3017, + "step": 2295 + }, + { + "epoch": 0.8856316297010608, + "grad_norm": 0.4017502606904032, + "learning_rate": 3.2828509065582713e-07, + "loss": 2.2709, + "step": 2296 + }, + { + "epoch": 0.8860173577627772, + "grad_norm": 0.43269418962686174, + "learning_rate": 3.260950861732765e-07, + "loss": 2.2945, + "step": 2297 + }, + { + "epoch": 0.8864030858244938, + "grad_norm": 0.4442870353620615, + "learning_rate": 3.239121647067339e-07, + "loss": 2.3491, + "step": 2298 + }, + { + "epoch": 0.8867888138862102, + "grad_norm": 0.41736126954832276, + "learning_rate": 3.217363295642817e-07, + "loss": 2.3845, + "step": 2299 + }, + { + "epoch": 0.8871745419479267, + "grad_norm": 0.40746274890345213, + "learning_rate": 3.195675840432655e-07, + "loss": 2.3505, + "step": 2300 + }, + { + "epoch": 0.8875602700096432, + "grad_norm": 0.43921816246720496, + "learning_rate": 3.1740593143028463e-07, + "loss": 2.3695, + "step": 2301 + }, + { + "epoch": 0.8879459980713597, + "grad_norm": 0.4417907633038376, + "learning_rate": 3.1525137500119207e-07, + "loss": 2.3178, + "step": 2302 + }, + { + "epoch": 0.8883317261330762, + "grad_norm": 0.40236247950052306, + "learning_rate": 3.1310391802108433e-07, + "loss": 2.3592, + "step": 2303 + }, + { + "epoch": 0.8887174541947926, + "grad_norm": 0.4389765459305632, + "learning_rate": 3.109635637443026e-07, + "loss": 2.2578, + "step": 2304 + }, + { + "epoch": 0.8891031822565092, + "grad_norm": 0.44248147502648144, + "learning_rate": 3.0883031541442175e-07, + "loss": 2.2892, + "step": 2305 + }, + { + "epoch": 0.8894889103182256, + "grad_norm": 0.4212530551412743, + "learning_rate": 3.067041762642475e-07, + "loss": 2.275, + "step": 2306 + }, + { + "epoch": 0.8898746383799422, + "grad_norm": 0.39390096118261103, + "learning_rate": 3.045851495158147e-07, + "loss": 2.3076, + "step": 2307 + }, + { + "epoch": 0.8902603664416586, + "grad_norm": 0.44065450958834507, + "learning_rate": 3.02473238380378e-07, + "loss": 2.3761, + "step": 2308 + }, + { + "epoch": 0.8906460945033752, + "grad_norm": 0.39869216936050716, + "learning_rate": 3.0036844605840944e-07, + "loss": 2.3692, + "step": 2309 + }, + { + "epoch": 0.8910318225650916, + "grad_norm": 0.47639377826582363, + "learning_rate": 2.9827077573959083e-07, + "loss": 2.2737, + "step": 2310 + }, + { + "epoch": 0.891417550626808, + "grad_norm": 0.3949086111875954, + "learning_rate": 2.9618023060281443e-07, + "loss": 2.3125, + "step": 2311 + }, + { + "epoch": 0.8918032786885246, + "grad_norm": 0.43709075645674, + "learning_rate": 2.9409681381617315e-07, + "loss": 2.3073, + "step": 2312 + }, + { + "epoch": 0.892189006750241, + "grad_norm": 0.44877858308367297, + "learning_rate": 2.920205285369565e-07, + "loss": 2.3552, + "step": 2313 + }, + { + "epoch": 0.8925747348119576, + "grad_norm": 0.448956433956615, + "learning_rate": 2.899513779116475e-07, + "loss": 2.3559, + "step": 2314 + }, + { + "epoch": 0.892960462873674, + "grad_norm": 0.4040751591496477, + "learning_rate": 
2.878893650759168e-07, + "loss": 2.3402, + "step": 2315 + }, + { + "epoch": 0.8933461909353906, + "grad_norm": 0.42941973754766005, + "learning_rate": 2.858344931546181e-07, + "loss": 2.2931, + "step": 2316 + }, + { + "epoch": 0.893731918997107, + "grad_norm": 0.4093804426084204, + "learning_rate": 2.8378676526178484e-07, + "loss": 2.281, + "step": 2317 + }, + { + "epoch": 0.8941176470588236, + "grad_norm": 0.4056051720053737, + "learning_rate": 2.8174618450062254e-07, + "loss": 2.2842, + "step": 2318 + }, + { + "epoch": 0.89450337512054, + "grad_norm": 0.44303445355744625, + "learning_rate": 2.7971275396350526e-07, + "loss": 2.2731, + "step": 2319 + }, + { + "epoch": 0.8948891031822566, + "grad_norm": 0.4262011309842942, + "learning_rate": 2.776864767319731e-07, + "loss": 2.3174, + "step": 2320 + }, + { + "epoch": 0.895274831243973, + "grad_norm": 0.4343001065859361, + "learning_rate": 2.756673558767242e-07, + "loss": 2.2447, + "step": 2321 + }, + { + "epoch": 0.8956605593056894, + "grad_norm": 0.4042191076748588, + "learning_rate": 2.7365539445761204e-07, + "loss": 2.3615, + "step": 2322 + }, + { + "epoch": 0.896046287367406, + "grad_norm": 0.4324732347462319, + "learning_rate": 2.716505955236415e-07, + "loss": 2.3086, + "step": 2323 + }, + { + "epoch": 0.8964320154291224, + "grad_norm": 0.44877695307282456, + "learning_rate": 2.696529621129618e-07, + "loss": 2.3156, + "step": 2324 + }, + { + "epoch": 0.896817743490839, + "grad_norm": 0.4130849621531806, + "learning_rate": 2.6766249725286295e-07, + "loss": 2.3451, + "step": 2325 + }, + { + "epoch": 0.8972034715525554, + "grad_norm": 0.4215410840108579, + "learning_rate": 2.656792039597744e-07, + "loss": 2.3394, + "step": 2326 + }, + { + "epoch": 0.897589199614272, + "grad_norm": 0.4267289938196295, + "learning_rate": 2.637030852392536e-07, + "loss": 2.3131, + "step": 2327 + }, + { + "epoch": 0.8979749276759884, + "grad_norm": 0.43553479832965797, + "learning_rate": 2.617341440859883e-07, + "loss": 2.3712, + "step": 2328 + }, + { + "epoch": 0.898360655737705, + "grad_norm": 0.40774833683590433, + "learning_rate": 2.597723834837862e-07, + "loss": 2.4135, + "step": 2329 + }, + { + "epoch": 0.8987463837994214, + "grad_norm": 0.41728549772051965, + "learning_rate": 2.5781780640557753e-07, + "loss": 2.3074, + "step": 2330 + }, + { + "epoch": 0.899132111861138, + "grad_norm": 0.4429978263158119, + "learning_rate": 2.5587041581340235e-07, + "loss": 2.3504, + "step": 2331 + }, + { + "epoch": 0.8995178399228544, + "grad_norm": 0.3935302876461795, + "learning_rate": 2.539302146584116e-07, + "loss": 2.343, + "step": 2332 + }, + { + "epoch": 0.8999035679845708, + "grad_norm": 0.39486222802808546, + "learning_rate": 2.5199720588086117e-07, + "loss": 2.3279, + "step": 2333 + }, + { + "epoch": 0.9002892960462874, + "grad_norm": 0.39773603659851414, + "learning_rate": 2.5007139241010724e-07, + "loss": 2.2666, + "step": 2334 + }, + { + "epoch": 0.9006750241080038, + "grad_norm": 0.4314496361740582, + "learning_rate": 2.48152777164602e-07, + "loss": 2.3723, + "step": 2335 + }, + { + "epoch": 0.9010607521697204, + "grad_norm": 0.39779368566168694, + "learning_rate": 2.4624136305188895e-07, + "loss": 2.3041, + "step": 2336 + }, + { + "epoch": 0.9014464802314368, + "grad_norm": 0.43539426008635035, + "learning_rate": 2.443371529685995e-07, + "loss": 2.2757, + "step": 2337 + }, + { + "epoch": 0.9018322082931534, + "grad_norm": 0.41172077652364375, + "learning_rate": 2.424401498004464e-07, + "loss": 2.34, + "step": 2338 + }, + { + "epoch": 0.9022179363548698, + 
"grad_norm": 0.4311143648496032, + "learning_rate": 2.4055035642222225e-07, + "loss": 2.292, + "step": 2339 + }, + { + "epoch": 0.9026036644165863, + "grad_norm": 0.43199604546203846, + "learning_rate": 2.3866777569779234e-07, + "loss": 2.2811, + "step": 2340 + }, + { + "epoch": 0.9029893924783028, + "grad_norm": 0.3996744267613919, + "learning_rate": 2.367924104800917e-07, + "loss": 2.2995, + "step": 2341 + }, + { + "epoch": 0.9033751205400193, + "grad_norm": 0.467895433250459, + "learning_rate": 2.3492426361112153e-07, + "loss": 2.3085, + "step": 2342 + }, + { + "epoch": 0.9037608486017358, + "grad_norm": 0.42536228011701505, + "learning_rate": 2.3306333792194492e-07, + "loss": 2.3072, + "step": 2343 + }, + { + "epoch": 0.9041465766634522, + "grad_norm": 0.4360625771288912, + "learning_rate": 2.3120963623267822e-07, + "loss": 2.3197, + "step": 2344 + }, + { + "epoch": 0.9045323047251688, + "grad_norm": 0.4301196858620419, + "learning_rate": 2.2936316135249492e-07, + "loss": 2.2895, + "step": 2345 + }, + { + "epoch": 0.9049180327868852, + "grad_norm": 0.4154437321377232, + "learning_rate": 2.2752391607961388e-07, + "loss": 2.323, + "step": 2346 + }, + { + "epoch": 0.9053037608486018, + "grad_norm": 0.41515933007956946, + "learning_rate": 2.256919032012972e-07, + "loss": 2.2567, + "step": 2347 + }, + { + "epoch": 0.9056894889103182, + "grad_norm": 0.39822301614572697, + "learning_rate": 2.2386712549384848e-07, + "loss": 2.308, + "step": 2348 + }, + { + "epoch": 0.9060752169720347, + "grad_norm": 0.4183729867262524, + "learning_rate": 2.220495857226068e-07, + "loss": 2.3475, + "step": 2349 + }, + { + "epoch": 0.9064609450337512, + "grad_norm": 0.40238712744809574, + "learning_rate": 2.2023928664194229e-07, + "loss": 2.2622, + "step": 2350 + }, + { + "epoch": 0.9068466730954677, + "grad_norm": 0.3984804803845149, + "learning_rate": 2.1843623099525146e-07, + "loss": 2.2536, + "step": 2351 + }, + { + "epoch": 0.9072324011571842, + "grad_norm": 0.4396943702451173, + "learning_rate": 2.1664042151495424e-07, + "loss": 2.4127, + "step": 2352 + }, + { + "epoch": 0.9076181292189007, + "grad_norm": 0.4272040571822996, + "learning_rate": 2.1485186092248978e-07, + "loss": 2.2906, + "step": 2353 + }, + { + "epoch": 0.9080038572806172, + "grad_norm": 0.46163668035869504, + "learning_rate": 2.1307055192831272e-07, + "loss": 2.3236, + "step": 2354 + }, + { + "epoch": 0.9083895853423336, + "grad_norm": 0.41300006472431816, + "learning_rate": 2.112964972318865e-07, + "loss": 2.2955, + "step": 2355 + }, + { + "epoch": 0.9087753134040502, + "grad_norm": 0.40025540439999374, + "learning_rate": 2.095296995216828e-07, + "loss": 2.3027, + "step": 2356 + }, + { + "epoch": 0.9091610414657666, + "grad_norm": 0.4245241449879845, + "learning_rate": 2.0777016147517536e-07, + "loss": 2.3115, + "step": 2357 + }, + { + "epoch": 0.9095467695274831, + "grad_norm": 0.40282234698213815, + "learning_rate": 2.0601788575883518e-07, + "loss": 2.2349, + "step": 2358 + }, + { + "epoch": 0.9099324975891996, + "grad_norm": 0.46274752117782375, + "learning_rate": 2.042728750281292e-07, + "loss": 2.37, + "step": 2359 + }, + { + "epoch": 0.9103182256509161, + "grad_norm": 0.39501138729671303, + "learning_rate": 2.0253513192751374e-07, + "loss": 2.3541, + "step": 2360 + }, + { + "epoch": 0.9107039537126326, + "grad_norm": 0.4223437499567797, + "learning_rate": 2.0080465909043113e-07, + "loss": 2.352, + "step": 2361 + }, + { + "epoch": 0.9110896817743491, + "grad_norm": 0.4551690474658373, + "learning_rate": 1.990814591393081e-07, + 
"loss": 2.2622, + "step": 2362 + }, + { + "epoch": 0.9114754098360656, + "grad_norm": 0.4505583736302094, + "learning_rate": 1.973655346855474e-07, + "loss": 2.287, + "step": 2363 + }, + { + "epoch": 0.9118611378977821, + "grad_norm": 0.43738325452276416, + "learning_rate": 1.9565688832952846e-07, + "loss": 2.3309, + "step": 2364 + }, + { + "epoch": 0.9122468659594986, + "grad_norm": 0.4098843311547293, + "learning_rate": 1.939555226605988e-07, + "loss": 2.3374, + "step": 2365 + }, + { + "epoch": 0.912632594021215, + "grad_norm": 0.4324462884840351, + "learning_rate": 1.9226144025707382e-07, + "loss": 2.3175, + "step": 2366 + }, + { + "epoch": 0.9130183220829315, + "grad_norm": 0.4348134837758005, + "learning_rate": 1.9057464368623213e-07, + "loss": 2.3331, + "step": 2367 + }, + { + "epoch": 0.913404050144648, + "grad_norm": 0.40591563134425135, + "learning_rate": 1.8889513550430892e-07, + "loss": 2.2424, + "step": 2368 + }, + { + "epoch": 0.9137897782063645, + "grad_norm": 0.42326917689373966, + "learning_rate": 1.872229182564972e-07, + "loss": 2.3636, + "step": 2369 + }, + { + "epoch": 0.914175506268081, + "grad_norm": 0.4330510921583434, + "learning_rate": 1.855579944769387e-07, + "loss": 2.3307, + "step": 2370 + }, + { + "epoch": 0.9145612343297975, + "grad_norm": 0.47084888462293784, + "learning_rate": 1.8390036668872403e-07, + "loss": 2.3363, + "step": 2371 + }, + { + "epoch": 0.914946962391514, + "grad_norm": 0.42684759215732565, + "learning_rate": 1.8225003740388546e-07, + "loss": 2.3209, + "step": 2372 + }, + { + "epoch": 0.9153326904532305, + "grad_norm": 0.4514543622337268, + "learning_rate": 1.8060700912339635e-07, + "loss": 2.3375, + "step": 2373 + }, + { + "epoch": 0.915718418514947, + "grad_norm": 0.4186512484287575, + "learning_rate": 1.7897128433716493e-07, + "loss": 2.393, + "step": 2374 + }, + { + "epoch": 0.9161041465766635, + "grad_norm": 0.39577712124974357, + "learning_rate": 1.7734286552403114e-07, + "loss": 2.2997, + "step": 2375 + }, + { + "epoch": 0.9164898746383799, + "grad_norm": 0.4402007170416471, + "learning_rate": 1.7572175515176538e-07, + "loss": 2.4085, + "step": 2376 + }, + { + "epoch": 0.9168756027000964, + "grad_norm": 0.4066003985671325, + "learning_rate": 1.7410795567705973e-07, + "loss": 2.3095, + "step": 2377 + }, + { + "epoch": 0.9172613307618129, + "grad_norm": 0.4060197004309517, + "learning_rate": 1.725014695455285e-07, + "loss": 2.4112, + "step": 2378 + }, + { + "epoch": 0.9176470588235294, + "grad_norm": 0.4236957028335664, + "learning_rate": 1.7090229919170254e-07, + "loss": 2.3272, + "step": 2379 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 0.4587135748056596, + "learning_rate": 1.693104470390261e-07, + "loss": 2.3457, + "step": 2380 + }, + { + "epoch": 0.9184185149469624, + "grad_norm": 0.445833825057682, + "learning_rate": 1.6772591549985395e-07, + "loss": 2.3792, + "step": 2381 + }, + { + "epoch": 0.9188042430086789, + "grad_norm": 0.41916605257667877, + "learning_rate": 1.6614870697544638e-07, + "loss": 2.297, + "step": 2382 + }, + { + "epoch": 0.9191899710703954, + "grad_norm": 0.40163545031193365, + "learning_rate": 1.6457882385596647e-07, + "loss": 2.3092, + "step": 2383 + }, + { + "epoch": 0.9195756991321119, + "grad_norm": 0.4533615432359611, + "learning_rate": 1.6301626852047504e-07, + "loss": 2.3314, + "step": 2384 + }, + { + "epoch": 0.9199614271938283, + "grad_norm": 0.4179243811649184, + "learning_rate": 1.6146104333692902e-07, + "loss": 2.3185, + "step": 2385 + }, + { + "epoch": 0.9203471552555449, + "grad_norm": 
0.41413676747662054, + "learning_rate": 1.599131506621765e-07, + "loss": 2.3773, + "step": 2386 + }, + { + "epoch": 0.9207328833172613, + "grad_norm": 0.4375738973653733, + "learning_rate": 1.5837259284195383e-07, + "loss": 2.2568, + "step": 2387 + }, + { + "epoch": 0.9211186113789778, + "grad_norm": 0.4246892931477109, + "learning_rate": 1.5683937221088242e-07, + "loss": 2.3448, + "step": 2388 + }, + { + "epoch": 0.9215043394406943, + "grad_norm": 0.4298415692395747, + "learning_rate": 1.5531349109246364e-07, + "loss": 2.3201, + "step": 2389 + }, + { + "epoch": 0.9218900675024108, + "grad_norm": 0.4422975749981783, + "learning_rate": 1.5379495179907666e-07, + "loss": 2.3471, + "step": 2390 + }, + { + "epoch": 0.9222757955641273, + "grad_norm": 0.41086563016151534, + "learning_rate": 1.5228375663197404e-07, + "loss": 2.3569, + "step": 2391 + }, + { + "epoch": 0.9226615236258437, + "grad_norm": 0.4168003402699989, + "learning_rate": 1.5077990788127993e-07, + "loss": 2.3138, + "step": 2392 + }, + { + "epoch": 0.9230472516875603, + "grad_norm": 0.43143785619098457, + "learning_rate": 1.4928340782598526e-07, + "loss": 2.3009, + "step": 2393 + }, + { + "epoch": 0.9234329797492767, + "grad_norm": 0.4182799340654419, + "learning_rate": 1.477942587339426e-07, + "loss": 2.2615, + "step": 2394 + }, + { + "epoch": 0.9238187078109933, + "grad_norm": 0.4193441615282384, + "learning_rate": 1.4631246286186783e-07, + "loss": 2.2258, + "step": 2395 + }, + { + "epoch": 0.9242044358727097, + "grad_norm": 0.4320029682004851, + "learning_rate": 1.448380224553303e-07, + "loss": 2.3066, + "step": 2396 + }, + { + "epoch": 0.9245901639344263, + "grad_norm": 0.4049944879632156, + "learning_rate": 1.4337093974875427e-07, + "loss": 2.3107, + "step": 2397 + }, + { + "epoch": 0.9249758919961427, + "grad_norm": 0.41431123640753376, + "learning_rate": 1.41911216965413e-07, + "loss": 2.3093, + "step": 2398 + }, + { + "epoch": 0.9253616200578592, + "grad_norm": 0.40260499302202707, + "learning_rate": 1.4045885631742807e-07, + "loss": 2.3547, + "step": 2399 + }, + { + "epoch": 0.9257473481195757, + "grad_norm": 0.40863834105476954, + "learning_rate": 1.3901386000576112e-07, + "loss": 2.3927, + "step": 2400 + }, + { + "epoch": 0.9261330761812921, + "grad_norm": 0.41884773263249914, + "learning_rate": 1.375762302202166e-07, + "loss": 2.2981, + "step": 2401 + }, + { + "epoch": 0.9265188042430087, + "grad_norm": 0.42266939838517653, + "learning_rate": 1.3614596913943457e-07, + "loss": 2.3489, + "step": 2402 + }, + { + "epoch": 0.9269045323047251, + "grad_norm": 0.4097322608020759, + "learning_rate": 1.3472307893088733e-07, + "loss": 2.3568, + "step": 2403 + }, + { + "epoch": 0.9272902603664417, + "grad_norm": 0.4108441704482335, + "learning_rate": 1.3330756175087778e-07, + "loss": 2.249, + "step": 2404 + }, + { + "epoch": 0.9276759884281581, + "grad_norm": 0.42904792387415436, + "learning_rate": 1.3189941974453502e-07, + "loss": 2.324, + "step": 2405 + }, + { + "epoch": 0.9280617164898747, + "grad_norm": 0.41881412719687494, + "learning_rate": 1.3049865504581204e-07, + "loss": 2.3073, + "step": 2406 + }, + { + "epoch": 0.9284474445515911, + "grad_norm": 0.4447067217596798, + "learning_rate": 1.2910526977748084e-07, + "loss": 2.361, + "step": 2407 + }, + { + "epoch": 0.9288331726133077, + "grad_norm": 0.4361601708399571, + "learning_rate": 1.2771926605113283e-07, + "loss": 2.355, + "step": 2408 + }, + { + "epoch": 0.9292189006750241, + "grad_norm": 0.411206644148289, + "learning_rate": 1.2634064596717122e-07, + "loss": 2.3693, + 
"step": 2409 + }, + { + "epoch": 0.9296046287367405, + "grad_norm": 0.41399257441418463, + "learning_rate": 1.249694116148087e-07, + "loss": 2.3556, + "step": 2410 + }, + { + "epoch": 0.9299903567984571, + "grad_norm": 0.42099022046753215, + "learning_rate": 1.2360556507206912e-07, + "loss": 2.3021, + "step": 2411 + }, + { + "epoch": 0.9303760848601735, + "grad_norm": 0.4407611306603906, + "learning_rate": 1.2224910840577642e-07, + "loss": 2.3072, + "step": 2412 + }, + { + "epoch": 0.9307618129218901, + "grad_norm": 0.4307686712814946, + "learning_rate": 1.2090004367155795e-07, + "loss": 2.3553, + "step": 2413 + }, + { + "epoch": 0.9311475409836065, + "grad_norm": 0.4170762789838413, + "learning_rate": 1.1955837291383776e-07, + "loss": 2.2864, + "step": 2414 + }, + { + "epoch": 0.9315332690453231, + "grad_norm": 0.4208399466609305, + "learning_rate": 1.1822409816583724e-07, + "loss": 2.3465, + "step": 2415 + }, + { + "epoch": 0.9319189971070395, + "grad_norm": 0.41854091983297737, + "learning_rate": 1.1689722144956672e-07, + "loss": 2.3208, + "step": 2416 + }, + { + "epoch": 0.9323047251687561, + "grad_norm": 0.43824070200599957, + "learning_rate": 1.1557774477582662e-07, + "loss": 2.2214, + "step": 2417 + }, + { + "epoch": 0.9326904532304725, + "grad_norm": 0.39789756046365826, + "learning_rate": 1.1426567014420297e-07, + "loss": 2.2473, + "step": 2418 + }, + { + "epoch": 0.933076181292189, + "grad_norm": 0.4116644410048528, + "learning_rate": 1.129609995430636e-07, + "loss": 2.3905, + "step": 2419 + }, + { + "epoch": 0.9334619093539055, + "grad_norm": 0.42436502034051177, + "learning_rate": 1.1166373494955696e-07, + "loss": 2.3822, + "step": 2420 + }, + { + "epoch": 0.9338476374156219, + "grad_norm": 0.39280611047179315, + "learning_rate": 1.1037387832960933e-07, + "loss": 2.3411, + "step": 2421 + }, + { + "epoch": 0.9342333654773385, + "grad_norm": 0.4510042871704501, + "learning_rate": 1.0909143163791769e-07, + "loss": 2.2484, + "step": 2422 + }, + { + "epoch": 0.9346190935390549, + "grad_norm": 0.42283535322397575, + "learning_rate": 1.0781639681795187e-07, + "loss": 2.3367, + "step": 2423 + }, + { + "epoch": 0.9350048216007715, + "grad_norm": 0.41401166297073777, + "learning_rate": 1.06548775801949e-07, + "loss": 2.335, + "step": 2424 + }, + { + "epoch": 0.9353905496624879, + "grad_norm": 0.3991091673458918, + "learning_rate": 1.0528857051091079e-07, + "loss": 2.3217, + "step": 2425 + }, + { + "epoch": 0.9357762777242045, + "grad_norm": 0.4206636984941167, + "learning_rate": 1.0403578285460014e-07, + "loss": 2.345, + "step": 2426 + }, + { + "epoch": 0.9361620057859209, + "grad_norm": 0.40537932353771305, + "learning_rate": 1.0279041473154117e-07, + "loss": 2.3187, + "step": 2427 + }, + { + "epoch": 0.9365477338476375, + "grad_norm": 0.406306268082581, + "learning_rate": 1.0155246802901198e-07, + "loss": 2.3173, + "step": 2428 + }, + { + "epoch": 0.9369334619093539, + "grad_norm": 0.4400571600499307, + "learning_rate": 1.0032194462304523e-07, + "loss": 2.338, + "step": 2429 + }, + { + "epoch": 0.9373191899710704, + "grad_norm": 0.421442152372355, + "learning_rate": 9.909884637842371e-08, + "loss": 2.3363, + "step": 2430 + }, + { + "epoch": 0.9377049180327869, + "grad_norm": 0.4456464895379789, + "learning_rate": 9.788317514867751e-08, + "loss": 2.4074, + "step": 2431 + }, + { + "epoch": 0.9380906460945033, + "grad_norm": 0.44272456024096185, + "learning_rate": 9.667493277608187e-08, + "loss": 2.3709, + "step": 2432 + }, + { + "epoch": 0.9384763741562199, + "grad_norm": 
0.4287548757129891, + "learning_rate": 9.547412109165321e-08, + "loss": 2.3216, + "step": 2433 + }, + { + "epoch": 0.9388621022179363, + "grad_norm": 0.4120997550840361, + "learning_rate": 9.428074191514924e-08, + "loss": 2.2839, + "step": 2434 + }, + { + "epoch": 0.9392478302796529, + "grad_norm": 0.4163304656784519, + "learning_rate": 9.309479705506219e-08, + "loss": 2.3909, + "step": 2435 + }, + { + "epoch": 0.9396335583413693, + "grad_norm": 0.39798785466770226, + "learning_rate": 9.191628830861832e-08, + "loss": 2.347, + "step": 2436 + }, + { + "epoch": 0.9400192864030859, + "grad_norm": 0.43253527714333934, + "learning_rate": 9.074521746177567e-08, + "loss": 2.3688, + "step": 2437 + }, + { + "epoch": 0.9404050144648023, + "grad_norm": 0.4532266241093941, + "learning_rate": 8.95815862892202e-08, + "loss": 2.3301, + "step": 2438 + }, + { + "epoch": 0.9407907425265188, + "grad_norm": 0.4439214571057158, + "learning_rate": 8.842539655436355e-08, + "loss": 2.3264, + "step": 2439 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.43604973208520453, + "learning_rate": 8.727665000934027e-08, + "loss": 2.3637, + "step": 2440 + }, + { + "epoch": 0.9415621986499518, + "grad_norm": 0.4153258000523079, + "learning_rate": 8.61353483950056e-08, + "loss": 2.4173, + "step": 2441 + }, + { + "epoch": 0.9419479267116683, + "grad_norm": 0.46124634253916963, + "learning_rate": 8.500149344093156e-08, + "loss": 2.2817, + "step": 2442 + }, + { + "epoch": 0.9423336547733847, + "grad_norm": 0.4402039690213307, + "learning_rate": 8.387508686540591e-08, + "loss": 2.3774, + "step": 2443 + }, + { + "epoch": 0.9427193828351013, + "grad_norm": 0.4658228652575852, + "learning_rate": 8.275613037542873e-08, + "loss": 2.3657, + "step": 2444 + }, + { + "epoch": 0.9431051108968177, + "grad_norm": 0.41699101927893295, + "learning_rate": 8.164462566670972e-08, + "loss": 2.2583, + "step": 2445 + }, + { + "epoch": 0.9434908389585343, + "grad_norm": 0.4962189996072224, + "learning_rate": 8.054057442366592e-08, + "loss": 2.3248, + "step": 2446 + }, + { + "epoch": 0.9438765670202507, + "grad_norm": 0.41155828648085263, + "learning_rate": 7.944397831941952e-08, + "loss": 2.292, + "step": 2447 + }, + { + "epoch": 0.9442622950819672, + "grad_norm": 0.42135807540720943, + "learning_rate": 7.835483901579454e-08, + "loss": 2.3418, + "step": 2448 + }, + { + "epoch": 0.9446480231436837, + "grad_norm": 0.41326626703751895, + "learning_rate": 7.727315816331515e-08, + "loss": 2.3092, + "step": 2449 + }, + { + "epoch": 0.9450337512054002, + "grad_norm": 0.42688792733304276, + "learning_rate": 7.619893740120176e-08, + "loss": 2.3105, + "step": 2450 + }, + { + "epoch": 0.9454194792671167, + "grad_norm": 0.4504376421920826, + "learning_rate": 7.513217835737052e-08, + "loss": 2.3061, + "step": 2451 + }, + { + "epoch": 0.9458052073288332, + "grad_norm": 0.42387316716416107, + "learning_rate": 7.407288264842772e-08, + "loss": 2.3136, + "step": 2452 + }, + { + "epoch": 0.9461909353905497, + "grad_norm": 0.42731506298397026, + "learning_rate": 7.302105187967313e-08, + "loss": 2.3327, + "step": 2453 + }, + { + "epoch": 0.9465766634522661, + "grad_norm": 0.3897487344371067, + "learning_rate": 7.197668764509058e-08, + "loss": 2.3351, + "step": 2454 + }, + { + "epoch": 0.9469623915139826, + "grad_norm": 0.40963650978286614, + "learning_rate": 7.09397915273502e-08, + "loss": 2.3007, + "step": 2455 + }, + { + "epoch": 0.9473481195756991, + "grad_norm": 0.40256147687936505, + "learning_rate": 6.991036509780391e-08, + "loss": 2.3398, + "step": 2456 + 
}, + { + "epoch": 0.9477338476374156, + "grad_norm": 0.4131690467739318, + "learning_rate": 6.888840991648493e-08, + "loss": 2.3185, + "step": 2457 + }, + { + "epoch": 0.9481195756991321, + "grad_norm": 0.41776805663834055, + "learning_rate": 6.787392753210386e-08, + "loss": 2.3094, + "step": 2458 + }, + { + "epoch": 0.9485053037608486, + "grad_norm": 0.41453002864252014, + "learning_rate": 6.686691948204537e-08, + "loss": 2.2026, + "step": 2459 + }, + { + "epoch": 0.9488910318225651, + "grad_norm": 0.43591139239000615, + "learning_rate": 6.58673872923693e-08, + "loss": 2.3234, + "step": 2460 + }, + { + "epoch": 0.9492767598842816, + "grad_norm": 0.400008500109623, + "learning_rate": 6.487533247780508e-08, + "loss": 2.3007, + "step": 2461 + }, + { + "epoch": 0.9496624879459981, + "grad_norm": 0.4025190134433101, + "learning_rate": 6.38907565417507e-08, + "loss": 2.3597, + "step": 2462 + }, + { + "epoch": 0.9500482160077146, + "grad_norm": 0.40652255242436164, + "learning_rate": 6.291366097627095e-08, + "loss": 2.3401, + "step": 2463 + }, + { + "epoch": 0.950433944069431, + "grad_norm": 0.420449026168815, + "learning_rate": 6.194404726209358e-08, + "loss": 2.2636, + "step": 2464 + }, + { + "epoch": 0.9508196721311475, + "grad_norm": 0.44154819359621317, + "learning_rate": 6.098191686860877e-08, + "loss": 2.4091, + "step": 2465 + }, + { + "epoch": 0.951205400192864, + "grad_norm": 0.407960358547453, + "learning_rate": 6.002727125386631e-08, + "loss": 2.2806, + "step": 2466 + }, + { + "epoch": 0.9515911282545805, + "grad_norm": 0.40314387291041454, + "learning_rate": 5.908011186457341e-08, + "loss": 2.3228, + "step": 2467 + }, + { + "epoch": 0.951976856316297, + "grad_norm": 0.4091025733020704, + "learning_rate": 5.8140440136091326e-08, + "loss": 2.3326, + "step": 2468 + }, + { + "epoch": 0.9523625843780135, + "grad_norm": 0.4194135293166133, + "learning_rate": 5.720825749243541e-08, + "loss": 2.3769, + "step": 2469 + }, + { + "epoch": 0.95274831243973, + "grad_norm": 0.41061934123931143, + "learning_rate": 5.628356534627122e-08, + "loss": 2.3163, + "step": 2470 + }, + { + "epoch": 0.9531340405014465, + "grad_norm": 0.40610058709846136, + "learning_rate": 5.536636509891225e-08, + "loss": 2.3115, + "step": 2471 + }, + { + "epoch": 0.953519768563163, + "grad_norm": 0.41017568899600615, + "learning_rate": 5.445665814031942e-08, + "loss": 2.2737, + "step": 2472 + }, + { + "epoch": 0.9539054966248794, + "grad_norm": 0.42044391166451395, + "learning_rate": 5.355444584909886e-08, + "loss": 2.2323, + "step": 2473 + }, + { + "epoch": 0.954291224686596, + "grad_norm": 0.405416280132108, + "learning_rate": 5.265972959249632e-08, + "loss": 2.3236, + "step": 2474 + }, + { + "epoch": 0.9546769527483124, + "grad_norm": 0.4367778478379245, + "learning_rate": 5.1772510726399996e-08, + "loss": 2.3579, + "step": 2475 + }, + { + "epoch": 0.9550626808100289, + "grad_norm": 0.38365766889537234, + "learning_rate": 5.089279059533658e-08, + "loss": 2.3188, + "step": 2476 + }, + { + "epoch": 0.9554484088717454, + "grad_norm": 0.49310282815831163, + "learning_rate": 5.002057053246634e-08, + "loss": 2.3614, + "step": 2477 + }, + { + "epoch": 0.9558341369334619, + "grad_norm": 0.4340207617363217, + "learning_rate": 4.915585185958638e-08, + "loss": 2.304, + "step": 2478 + }, + { + "epoch": 0.9562198649951784, + "grad_norm": 0.41879047081622034, + "learning_rate": 4.829863588712402e-08, + "loss": 2.3394, + "step": 2479 + }, + { + "epoch": 0.9566055930568949, + "grad_norm": 0.4297196789279753, + "learning_rate": 
4.744892391413791e-08, + "loss": 2.3564, + "step": 2480 + }, + { + "epoch": 0.9569913211186114, + "grad_norm": 0.3926456841142491, + "learning_rate": 4.660671722831467e-08, + "loss": 2.3058, + "step": 2481 + }, + { + "epoch": 0.9573770491803278, + "grad_norm": 0.39642546104848503, + "learning_rate": 4.577201710596613e-08, + "loss": 2.3557, + "step": 2482 + }, + { + "epoch": 0.9577627772420444, + "grad_norm": 0.4438348675032907, + "learning_rate": 4.4944824812029886e-08, + "loss": 2.3176, + "step": 2483 + }, + { + "epoch": 0.9581485053037608, + "grad_norm": 0.4157969799084524, + "learning_rate": 4.412514160006376e-08, + "loss": 2.3298, + "step": 2484 + }, + { + "epoch": 0.9585342333654774, + "grad_norm": 0.41079706672034005, + "learning_rate": 4.33129687122491e-08, + "loss": 2.3701, + "step": 2485 + }, + { + "epoch": 0.9589199614271938, + "grad_norm": 0.40202699652356505, + "learning_rate": 4.25083073793825e-08, + "loss": 2.3547, + "step": 2486 + }, + { + "epoch": 0.9593056894889104, + "grad_norm": 0.40564148246556647, + "learning_rate": 4.1711158820879613e-08, + "loss": 2.3365, + "step": 2487 + }, + { + "epoch": 0.9596914175506268, + "grad_norm": 0.4127156328756135, + "learning_rate": 4.092152424477025e-08, + "loss": 2.274, + "step": 2488 + }, + { + "epoch": 0.9600771456123433, + "grad_norm": 0.44242938132635207, + "learning_rate": 4.013940484769718e-08, + "loss": 2.3907, + "step": 2489 + }, + { + "epoch": 0.9604628736740598, + "grad_norm": 0.41195702427983877, + "learning_rate": 3.936480181491342e-08, + "loss": 2.289, + "step": 2490 + }, + { + "epoch": 0.9608486017357762, + "grad_norm": 0.40946865056306514, + "learning_rate": 3.859771632028331e-08, + "loss": 2.3897, + "step": 2491 + }, + { + "epoch": 0.9612343297974928, + "grad_norm": 0.4244008294049646, + "learning_rate": 3.7838149526277514e-08, + "loss": 2.286, + "step": 2492 + }, + { + "epoch": 0.9616200578592092, + "grad_norm": 0.41393559740833624, + "learning_rate": 3.70861025839725e-08, + "loss": 2.3189, + "step": 2493 + }, + { + "epoch": 0.9620057859209258, + "grad_norm": 0.42191742412885425, + "learning_rate": 3.634157663304994e-08, + "loss": 2.3266, + "step": 2494 + }, + { + "epoch": 0.9623915139826422, + "grad_norm": 0.42793463279971766, + "learning_rate": 3.560457280179286e-08, + "loss": 2.3268, + "step": 2495 + }, + { + "epoch": 0.9627772420443588, + "grad_norm": 0.4345097407966401, + "learning_rate": 3.487509220708563e-08, + "loss": 2.327, + "step": 2496 + }, + { + "epoch": 0.9631629701060752, + "grad_norm": 0.40205889662274236, + "learning_rate": 3.415313595441116e-08, + "loss": 2.2177, + "step": 2497 + }, + { + "epoch": 0.9635486981677918, + "grad_norm": 0.42168734366980054, + "learning_rate": 3.343870513784875e-08, + "loss": 2.3378, + "step": 2498 + }, + { + "epoch": 0.9639344262295082, + "grad_norm": 0.4102487120890495, + "learning_rate": 3.2731800840076213e-08, + "loss": 2.3205, + "step": 2499 + }, + { + "epoch": 0.9643201542912246, + "grad_norm": 0.4299063950757636, + "learning_rate": 3.2032424132362736e-08, + "loss": 2.3536, + "step": 2500 + }, + { + "epoch": 0.9647058823529412, + "grad_norm": 0.40180559664214427, + "learning_rate": 3.134057607457108e-08, + "loss": 2.2852, + "step": 2501 + }, + { + "epoch": 0.9650916104146576, + "grad_norm": 0.4291300595198783, + "learning_rate": 3.065625771515424e-08, + "loss": 2.3647, + "step": 2502 + }, + { + "epoch": 0.9654773384763742, + "grad_norm": 0.4030551949343693, + "learning_rate": 2.9979470091154315e-08, + "loss": 2.3201, + "step": 2503 + }, + { + "epoch": 
0.9658630665380906, + "grad_norm": 0.4075491282522468, + "learning_rate": 2.9310214228202016e-08, + "loss": 2.2759, + "step": 2504 + }, + { + "epoch": 0.9662487945998072, + "grad_norm": 0.41618422691287965, + "learning_rate": 2.8648491140513267e-08, + "loss": 2.3569, + "step": 2505 + }, + { + "epoch": 0.9666345226615236, + "grad_norm": 0.4083096699215891, + "learning_rate": 2.799430183088925e-08, + "loss": 2.2653, + "step": 2506 + }, + { + "epoch": 0.9670202507232402, + "grad_norm": 0.40400059592556226, + "learning_rate": 2.734764729071304e-08, + "loss": 2.2471, + "step": 2507 + }, + { + "epoch": 0.9674059787849566, + "grad_norm": 0.41087315233608723, + "learning_rate": 2.6708528499950758e-08, + "loss": 2.2883, + "step": 2508 + }, + { + "epoch": 0.9677917068466731, + "grad_norm": 0.4186037693686628, + "learning_rate": 2.607694642714653e-08, + "loss": 2.2833, + "step": 2509 + }, + { + "epoch": 0.9681774349083896, + "grad_norm": 0.43098089391725436, + "learning_rate": 2.5452902029425297e-08, + "loss": 2.3275, + "step": 2510 + }, + { + "epoch": 0.968563162970106, + "grad_norm": 0.44530166646693153, + "learning_rate": 2.483639625248724e-08, + "loss": 2.2917, + "step": 2511 + }, + { + "epoch": 0.9689488910318226, + "grad_norm": 0.4331736474354493, + "learning_rate": 2.4227430030609455e-08, + "loss": 2.2545, + "step": 2512 + }, + { + "epoch": 0.969334619093539, + "grad_norm": 0.4609065384992006, + "learning_rate": 2.3626004286642634e-08, + "loss": 2.3039, + "step": 2513 + }, + { + "epoch": 0.9697203471552556, + "grad_norm": 0.4136615874241027, + "learning_rate": 2.3032119932010488e-08, + "loss": 2.384, + "step": 2514 + }, + { + "epoch": 0.970106075216972, + "grad_norm": 0.40736978504633947, + "learning_rate": 2.2445777866709208e-08, + "loss": 2.278, + "step": 2515 + }, + { + "epoch": 0.9704918032786886, + "grad_norm": 0.4603048542520577, + "learning_rate": 2.1866978979303567e-08, + "loss": 2.3386, + "step": 2516 + }, + { + "epoch": 0.970877531340405, + "grad_norm": 0.39110840096324556, + "learning_rate": 2.1295724146926933e-08, + "loss": 2.3721, + "step": 2517 + }, + { + "epoch": 0.9712632594021215, + "grad_norm": 0.4331393875353548, + "learning_rate": 2.073201423528237e-08, + "loss": 2.3596, + "step": 2518 + }, + { + "epoch": 0.971648987463838, + "grad_norm": 0.42332854514733886, + "learning_rate": 2.017585009863654e-08, + "loss": 2.3586, + "step": 2519 + }, + { + "epoch": 0.9720347155255545, + "grad_norm": 0.43115704440923985, + "learning_rate": 1.962723257982302e-08, + "loss": 2.3228, + "step": 2520 + }, + { + "epoch": 0.972420443587271, + "grad_norm": 0.449521995644178, + "learning_rate": 1.9086162510237316e-08, + "loss": 2.3004, + "step": 2521 + }, + { + "epoch": 0.9728061716489874, + "grad_norm": 0.40298981836234643, + "learning_rate": 1.8552640709837977e-08, + "loss": 2.3239, + "step": 2522 + }, + { + "epoch": 0.973191899710704, + "grad_norm": 0.4490677597059497, + "learning_rate": 1.8026667987144363e-08, + "loss": 2.3012, + "step": 2523 + }, + { + "epoch": 0.9735776277724204, + "grad_norm": 0.4054054120261914, + "learning_rate": 1.7508245139236658e-08, + "loss": 2.3121, + "step": 2524 + }, + { + "epoch": 0.973963355834137, + "grad_norm": 0.43090912391038244, + "learning_rate": 1.6997372951751967e-08, + "loss": 2.3489, + "step": 2525 + }, + { + "epoch": 0.9743490838958534, + "grad_norm": 0.4233189860834805, + "learning_rate": 1.6494052198886557e-08, + "loss": 2.3873, + "step": 2526 + }, + { + "epoch": 0.97473481195757, + "grad_norm": 0.4058108388557626, + "learning_rate": 
1.59982836433914e-08, + "loss": 2.3247, + "step": 2527 + }, + { + "epoch": 0.9751205400192864, + "grad_norm": 0.4258335088907587, + "learning_rate": 1.5510068036573288e-08, + "loss": 2.3609, + "step": 2528 + }, + { + "epoch": 0.9755062680810029, + "grad_norm": 0.4207761216385338, + "learning_rate": 1.5029406118293732e-08, + "loss": 2.3278, + "step": 2529 + }, + { + "epoch": 0.9758919961427194, + "grad_norm": 0.4232939417597577, + "learning_rate": 1.4556298616965614e-08, + "loss": 2.3444, + "step": 2530 + }, + { + "epoch": 0.9762777242044359, + "grad_norm": 0.44772513815545695, + "learning_rate": 1.4090746249554866e-08, + "loss": 2.3005, + "step": 2531 + }, + { + "epoch": 0.9766634522661524, + "grad_norm": 0.4501684978470362, + "learning_rate": 1.3632749721577132e-08, + "loss": 2.2306, + "step": 2532 + }, + { + "epoch": 0.9770491803278688, + "grad_norm": 0.44218898437816934, + "learning_rate": 1.318230972709833e-08, + "loss": 2.3319, + "step": 2533 + }, + { + "epoch": 0.9774349083895854, + "grad_norm": 0.4158586144659339, + "learning_rate": 1.2739426948732426e-08, + "loss": 2.3159, + "step": 2534 + }, + { + "epoch": 0.9778206364513018, + "grad_norm": 0.4196125452677132, + "learning_rate": 1.2304102057640877e-08, + "loss": 2.2893, + "step": 2535 + }, + { + "epoch": 0.9782063645130183, + "grad_norm": 0.40891874679918727, + "learning_rate": 1.1876335713532638e-08, + "loss": 2.2473, + "step": 2536 + }, + { + "epoch": 0.9785920925747348, + "grad_norm": 0.4406733934886187, + "learning_rate": 1.1456128564660273e-08, + "loss": 2.3896, + "step": 2537 + }, + { + "epoch": 0.9789778206364513, + "grad_norm": 0.42079275376767034, + "learning_rate": 1.1043481247823285e-08, + "loss": 2.2683, + "step": 2538 + }, + { + "epoch": 0.9793635486981678, + "grad_norm": 0.4008125060938278, + "learning_rate": 1.0638394388362006e-08, + "loss": 2.2474, + "step": 2539 + }, + { + "epoch": 0.9797492767598843, + "grad_norm": 0.4272944856472343, + "learning_rate": 1.024086860016149e-08, + "loss": 2.3548, + "step": 2540 + }, + { + "epoch": 0.9801350048216008, + "grad_norm": 0.40159087051803666, + "learning_rate": 9.850904485647072e-09, + "loss": 2.3237, + "step": 2541 + }, + { + "epoch": 0.9805207328833173, + "grad_norm": 0.4156694315192405, + "learning_rate": 9.468502635786026e-09, + "loss": 2.3672, + "step": 2542 + }, + { + "epoch": 0.9809064609450338, + "grad_norm": 0.422406372438295, + "learning_rate": 9.093663630084237e-09, + "loss": 2.3174, + "step": 2543 + }, + { + "epoch": 0.9812921890067502, + "grad_norm": 0.44066709303635726, + "learning_rate": 8.726388036587874e-09, + "loss": 2.3132, + "step": 2544 + }, + { + "epoch": 0.9816779170684667, + "grad_norm": 0.41356414626582866, + "learning_rate": 8.366676411880602e-09, + "loss": 2.2772, + "step": 2545 + }, + { + "epoch": 0.9820636451301832, + "grad_norm": 0.42263596386774394, + "learning_rate": 8.014529301082485e-09, + "loss": 2.2357, + "step": 2546 + }, + { + "epoch": 0.9824493731918997, + "grad_norm": 0.43398917077284627, + "learning_rate": 7.669947237851637e-09, + "loss": 2.3422, + "step": 2547 + }, + { + "epoch": 0.9828351012536162, + "grad_norm": 0.4001274687897164, + "learning_rate": 7.332930744380906e-09, + "loss": 2.3663, + "step": 2548 + }, + { + "epoch": 0.9832208293153327, + "grad_norm": 0.4070491519301213, + "learning_rate": 7.00348033139786e-09, + "loss": 2.3069, + "step": 2549 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 0.42079521174878126, + "learning_rate": 6.681596498164244e-09, + "loss": 2.2963, + "step": 2550 + }, + { + "epoch": 
0.9839922854387657, + "grad_norm": 0.43723928010333774, + "learning_rate": 6.367279732475418e-09, + "loss": 2.358, + "step": 2551 + }, + { + "epoch": 0.9843780135004822, + "grad_norm": 0.4108275482871391, + "learning_rate": 6.060530510659246e-09, + "loss": 2.326, + "step": 2552 + }, + { + "epoch": 0.9847637415621987, + "grad_norm": 0.4350703058011143, + "learning_rate": 5.761349297575547e-09, + "loss": 2.3581, + "step": 2553 + }, + { + "epoch": 0.9851494696239151, + "grad_norm": 0.43235359013039043, + "learning_rate": 5.469736546614979e-09, + "loss": 2.2757, + "step": 2554 + }, + { + "epoch": 0.9855351976856316, + "grad_norm": 0.40961132656186466, + "learning_rate": 5.185692699697931e-09, + "loss": 2.36, + "step": 2555 + }, + { + "epoch": 0.9859209257473481, + "grad_norm": 0.4430751567999117, + "learning_rate": 4.909218187276743e-09, + "loss": 2.3624, + "step": 2556 + }, + { + "epoch": 0.9863066538090646, + "grad_norm": 0.4289839532627191, + "learning_rate": 4.640313428330711e-09, + "loss": 2.2914, + "step": 2557 + }, + { + "epoch": 0.9866923818707811, + "grad_norm": 0.44036969688797795, + "learning_rate": 4.378978830368863e-09, + "loss": 2.2774, + "step": 2558 + }, + { + "epoch": 0.9870781099324976, + "grad_norm": 0.4143311437290899, + "learning_rate": 4.125214789427734e-09, + "loss": 2.2661, + "step": 2559 + }, + { + "epoch": 0.9874638379942141, + "grad_norm": 0.4240774884679542, + "learning_rate": 3.8790216900702615e-09, + "loss": 2.2581, + "step": 2560 + }, + { + "epoch": 0.9878495660559306, + "grad_norm": 0.41615959808469105, + "learning_rate": 3.6403999053885584e-09, + "loss": 2.3018, + "step": 2561 + }, + { + "epoch": 0.9882352941176471, + "grad_norm": 0.41624863195931333, + "learning_rate": 3.4093497969983625e-09, + "loss": 2.4007, + "step": 2562 + }, + { + "epoch": 0.9886210221793635, + "grad_norm": 0.4130284474102173, + "learning_rate": 3.1858717150412554e-09, + "loss": 2.3366, + "step": 2563 + }, + { + "epoch": 0.9890067502410801, + "grad_norm": 0.4268181991955994, + "learning_rate": 2.9699659981863306e-09, + "loss": 2.2902, + "step": 2564 + }, + { + "epoch": 0.9893924783027965, + "grad_norm": 0.42065001359224774, + "learning_rate": 2.761632973624084e-09, + "loss": 2.3251, + "step": 2565 + }, + { + "epoch": 0.989778206364513, + "grad_norm": 0.43868310422689855, + "learning_rate": 2.5608729570703037e-09, + "loss": 2.3727, + "step": 2566 + }, + { + "epoch": 0.9901639344262295, + "grad_norm": 0.40977965981613995, + "learning_rate": 2.367686252765511e-09, + "loss": 2.3281, + "step": 2567 + }, + { + "epoch": 0.990549662487946, + "grad_norm": 0.4121241906272853, + "learning_rate": 2.182073153471631e-09, + "loss": 2.2966, + "step": 2568 + }, + { + "epoch": 0.9909353905496625, + "grad_norm": 0.42911358142367184, + "learning_rate": 2.0040339404742147e-09, + "loss": 2.357, + "step": 2569 + }, + { + "epoch": 0.991321118611379, + "grad_norm": 0.43132519983787015, + "learning_rate": 1.8335688835802169e-09, + "loss": 2.3263, + "step": 2570 + }, + { + "epoch": 0.9917068466730955, + "grad_norm": 0.4363798226276988, + "learning_rate": 1.670678241119661e-09, + "loss": 2.3435, + "step": 2571 + }, + { + "epoch": 0.9920925747348119, + "grad_norm": 0.4289536892138021, + "learning_rate": 1.5153622599428652e-09, + "loss": 2.2977, + "step": 2572 + }, + { + "epoch": 0.9924783027965285, + "grad_norm": 0.4062667678567385, + "learning_rate": 1.3676211754215518e-09, + "loss": 2.2861, + "step": 2573 + }, + { + "epoch": 0.9928640308582449, + "grad_norm": 0.4275697437950462, + "learning_rate": 
1.227455211448847e-09, + "loss": 2.3045, + "step": 2574 + }, + { + "epoch": 0.9932497589199615, + "grad_norm": 0.41249327348223197, + "learning_rate": 1.0948645804370605e-09, + "loss": 2.3402, + "step": 2575 + }, + { + "epoch": 0.9936354869816779, + "grad_norm": 0.4091376478413197, + "learning_rate": 9.698494833199068e-10, + "loss": 2.3213, + "step": 2576 + }, + { + "epoch": 0.9940212150433944, + "grad_norm": 0.44573230043842443, + "learning_rate": 8.524101095491733e-10, + "loss": 2.2555, + "step": 2577 + }, + { + "epoch": 0.9944069431051109, + "grad_norm": 0.4352921106893979, + "learning_rate": 7.425466370974965e-10, + "loss": 2.3701, + "step": 2578 + }, + { + "epoch": 0.9947926711668273, + "grad_norm": 0.43617276647783876, + "learning_rate": 6.402592324561418e-10, + "loss": 2.3061, + "step": 2579 + }, + { + "epoch": 0.9951783992285439, + "grad_norm": 0.4551060450487974, + "learning_rate": 5.455480506355582e-10, + "loss": 2.3767, + "step": 2580 + }, + { + "epoch": 0.9955641272902603, + "grad_norm": 0.4117741397937645, + "learning_rate": 4.5841323516426784e-10, + "loss": 2.3894, + "step": 2581 + }, + { + "epoch": 0.9959498553519769, + "grad_norm": 0.39894957552658944, + "learning_rate": 3.7885491809053207e-10, + "loss": 2.2485, + "step": 2582 + }, + { + "epoch": 0.9963355834136933, + "grad_norm": 0.41464108147053547, + "learning_rate": 3.0687321997957543e-10, + "loss": 2.324, + "step": 2583 + }, + { + "epoch": 0.9967213114754099, + "grad_norm": 0.4423602065102726, + "learning_rate": 2.4246824991525085e-10, + "loss": 2.3225, + "step": 2584 + }, + { + "epoch": 0.9971070395371263, + "grad_norm": 0.4362643537518209, + "learning_rate": 1.8564010549948497e-10, + "loss": 2.2874, + "step": 2585 + }, + { + "epoch": 0.9974927675988429, + "grad_norm": 0.4346366544157702, + "learning_rate": 1.3638887285116753e-10, + "loss": 2.3311, + "step": 2586 + }, + { + "epoch": 0.9978784956605593, + "grad_norm": 0.4214231323688105, + "learning_rate": 9.471462660892716e-11, + "loss": 2.3397, + "step": 2587 + }, + { + "epoch": 0.9982642237222757, + "grad_norm": 0.4105430433409002, + "learning_rate": 6.061742992613529e-11, + "loss": 2.3279, + "step": 2588 + }, + { + "epoch": 0.9986499517839923, + "grad_norm": 0.42157880275509585, + "learning_rate": 3.4097334475902135e-11, + "loss": 2.3044, + "step": 2589 + }, + { + "epoch": 0.9990356798457087, + "grad_norm": 0.4295909376753097, + "learning_rate": 1.5154380447190976e-11, + "loss": 2.3718, + "step": 2590 + }, + { + "epoch": 0.9994214079074253, + "grad_norm": 0.42028982333761167, + "learning_rate": 3.788596547038559e-12, + "loss": 2.2899, + "step": 2591 + }, + { + "epoch": 0.9998071359691417, + "grad_norm": 0.392346457170918, + "learning_rate": 0.0, + "loss": 2.3139, + "step": 2592 + } + ], + "logging_steps": 1, + "max_steps": 2592, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1296, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.591619149639975e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}