diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15239 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9998560778331078, + "eval_steps": 500, + "global_step": 2171, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00046055093405486313, + "grad_norm": 10.224479026613773, + "learning_rate": 3.0303030303030305e-08, + "loss": 1.084, + "step": 1 + }, + { + "epoch": 0.0009211018681097263, + "grad_norm": 10.903636883871654, + "learning_rate": 6.060606060606061e-08, + "loss": 0.9824, + "step": 2 + }, + { + "epoch": 0.0013816528021645895, + "grad_norm": 11.344940351126398, + "learning_rate": 9.09090909090909e-08, + "loss": 1.0421, + "step": 3 + }, + { + "epoch": 0.0018422037362194525, + "grad_norm": 13.25793073962023, + "learning_rate": 1.2121212121212122e-07, + "loss": 0.9886, + "step": 4 + }, + { + "epoch": 0.002302754670274316, + "grad_norm": 9.70140482967039, + "learning_rate": 1.5151515151515152e-07, + "loss": 0.9634, + "step": 5 + }, + { + "epoch": 0.002763305604329179, + "grad_norm": 12.03838558755089, + "learning_rate": 1.818181818181818e-07, + "loss": 0.9483, + "step": 6 + }, + { + "epoch": 0.003223856538384042, + "grad_norm": 12.588391333344033, + "learning_rate": 2.121212121212121e-07, + "loss": 0.9984, + "step": 7 + }, + { + "epoch": 0.003684407472438905, + "grad_norm": 11.836562687027973, + "learning_rate": 2.4242424242424244e-07, + "loss": 1.0708, + "step": 8 + }, + { + "epoch": 0.004144958406493768, + "grad_norm": 12.706113691995517, + "learning_rate": 2.727272727272727e-07, + "loss": 1.0863, + "step": 9 + }, + { + "epoch": 0.004605509340548632, + "grad_norm": 11.407622369949724, + "learning_rate": 3.0303030303030305e-07, + "loss": 1.2087, + "step": 10 + }, + { + "epoch": 0.005066060274603494, + "grad_norm": 12.237310594074764, + "learning_rate": 3.333333333333333e-07, + "loss": 1.0967, + "step": 11 + }, + { + "epoch": 0.005526611208658358, + "grad_norm": 10.99654178909918, + "learning_rate": 3.636363636363636e-07, + "loss": 1.1732, + "step": 12 + }, + { + "epoch": 0.0059871621427132204, + "grad_norm": 11.042210513697741, + "learning_rate": 3.939393939393939e-07, + "loss": 1.1777, + "step": 13 + }, + { + "epoch": 0.006447713076768084, + "grad_norm": 11.496495867822293, + "learning_rate": 4.242424242424242e-07, + "loss": 1.0627, + "step": 14 + }, + { + "epoch": 0.006908264010822947, + "grad_norm": 11.882894204754962, + "learning_rate": 4.545454545454545e-07, + "loss": 0.9844, + "step": 15 + }, + { + "epoch": 0.00736881494487781, + "grad_norm": 11.794166405407287, + "learning_rate": 4.848484848484849e-07, + "loss": 1.0168, + "step": 16 + }, + { + "epoch": 0.007829365878932673, + "grad_norm": 10.01774185514084, + "learning_rate": 5.151515151515151e-07, + "loss": 1.2505, + "step": 17 + }, + { + "epoch": 0.008289916812987536, + "grad_norm": 9.567696400885666, + "learning_rate": 5.454545454545454e-07, + "loss": 1.1344, + "step": 18 + }, + { + "epoch": 0.0087504677470424, + "grad_norm": 11.67769227316676, + "learning_rate": 5.757575757575758e-07, + "loss": 0.9009, + "step": 19 + }, + { + "epoch": 0.009211018681097263, + "grad_norm": 9.530138294227273, + "learning_rate": 6.060606060606061e-07, + "loss": 1.0289, + "step": 20 + }, + { + "epoch": 0.009671569615152125, + "grad_norm": 8.071657974205257, + "learning_rate": 6.363636363636363e-07, + "loss": 1.2151, + "step": 21 + }, + { + "epoch": 0.010132120549206989, + "grad_norm": 11.50961336906488, + "learning_rate": 6.666666666666666e-07, + "loss": 0.8917, + "step": 22 + }, + { + "epoch": 0.010592671483261852, + "grad_norm": 9.46367673420139, + "learning_rate": 6.96969696969697e-07, + "loss": 1.1334, + "step": 23 + }, + { + "epoch": 0.011053222417316716, + "grad_norm": 7.980897491793901, + "learning_rate": 7.272727272727272e-07, + "loss": 0.968, + "step": 24 + }, + { + "epoch": 0.011513773351371579, + "grad_norm": 6.49691041072656, + "learning_rate": 7.575757575757575e-07, + "loss": 1.0277, + "step": 25 + }, + { + "epoch": 0.011974324285426441, + "grad_norm": 6.041354612689945, + "learning_rate": 7.878787878787878e-07, + "loss": 0.9477, + "step": 26 + }, + { + "epoch": 0.012434875219481304, + "grad_norm": 6.34623067352923, + "learning_rate": 8.181818181818182e-07, + "loss": 0.9635, + "step": 27 + }, + { + "epoch": 0.012895426153536168, + "grad_norm": 5.768779273607869, + "learning_rate": 8.484848484848484e-07, + "loss": 0.9567, + "step": 28 + }, + { + "epoch": 0.013355977087591031, + "grad_norm": 5.159941543576065, + "learning_rate": 8.787878787878787e-07, + "loss": 0.8742, + "step": 29 + }, + { + "epoch": 0.013816528021645893, + "grad_norm": 4.459623755013343, + "learning_rate": 9.09090909090909e-07, + "loss": 0.905, + "step": 30 + }, + { + "epoch": 0.014277078955700757, + "grad_norm": 4.578386981237651, + "learning_rate": 9.393939393939395e-07, + "loss": 0.9856, + "step": 31 + }, + { + "epoch": 0.01473762988975562, + "grad_norm": 4.599340605830885, + "learning_rate": 9.696969696969698e-07, + "loss": 1.0623, + "step": 32 + }, + { + "epoch": 0.015198180823810484, + "grad_norm": 4.715495998419273, + "learning_rate": 1e-06, + "loss": 0.9893, + "step": 33 + }, + { + "epoch": 0.015658731757865346, + "grad_norm": 3.9725156691925405, + "learning_rate": 1.0303030303030302e-06, + "loss": 0.9039, + "step": 34 + }, + { + "epoch": 0.01611928269192021, + "grad_norm": 4.01091093069899, + "learning_rate": 1.0606060606060606e-06, + "loss": 0.7272, + "step": 35 + }, + { + "epoch": 0.016579833625975073, + "grad_norm": 4.427996895865591, + "learning_rate": 1.0909090909090908e-06, + "loss": 0.8846, + "step": 36 + }, + { + "epoch": 0.017040384560029934, + "grad_norm": 3.545754336884005, + "learning_rate": 1.121212121212121e-06, + "loss": 0.829, + "step": 37 + }, + { + "epoch": 0.0175009354940848, + "grad_norm": 5.525053440753492, + "learning_rate": 1.1515151515151516e-06, + "loss": 0.6791, + "step": 38 + }, + { + "epoch": 0.01796148642813966, + "grad_norm": 3.383199059764453, + "learning_rate": 1.1818181818181818e-06, + "loss": 0.8036, + "step": 39 + }, + { + "epoch": 0.018422037362194527, + "grad_norm": 4.1135930428353, + "learning_rate": 1.2121212121212122e-06, + "loss": 0.89, + "step": 40 + }, + { + "epoch": 0.01888258829624939, + "grad_norm": 4.200860213314929, + "learning_rate": 1.2424242424242424e-06, + "loss": 0.8742, + "step": 41 + }, + { + "epoch": 0.01934313923030425, + "grad_norm": 4.621032204910162, + "learning_rate": 1.2727272727272726e-06, + "loss": 0.9061, + "step": 42 + }, + { + "epoch": 0.019803690164359115, + "grad_norm": 4.05937719343738, + "learning_rate": 1.303030303030303e-06, + "loss": 0.8869, + "step": 43 + }, + { + "epoch": 0.020264241098413977, + "grad_norm": 3.908268562451142, + "learning_rate": 1.3333333333333332e-06, + "loss": 0.8652, + "step": 44 + }, + { + "epoch": 0.020724792032468842, + "grad_norm": 3.8127642183113104, + "learning_rate": 1.3636363636363634e-06, + "loss": 0.8547, + "step": 45 + }, + { + "epoch": 0.021185342966523704, + "grad_norm": 3.7006255903086016, + "learning_rate": 1.393939393939394e-06, + "loss": 0.8816, + "step": 46 + }, + { + "epoch": 0.021645893900578566, + "grad_norm": 4.278643366823466, + "learning_rate": 1.4242424242424242e-06, + "loss": 0.9507, + "step": 47 + }, + { + "epoch": 0.02210644483463343, + "grad_norm": 5.121316020450439, + "learning_rate": 1.4545454545454544e-06, + "loss": 0.8144, + "step": 48 + }, + { + "epoch": 0.022566995768688293, + "grad_norm": 3.048465483366566, + "learning_rate": 1.4848484848484848e-06, + "loss": 0.654, + "step": 49 + }, + { + "epoch": 0.023027546702743158, + "grad_norm": 3.9812784832535133, + "learning_rate": 1.515151515151515e-06, + "loss": 0.6679, + "step": 50 + }, + { + "epoch": 0.02348809763679802, + "grad_norm": 3.787536459284705, + "learning_rate": 1.5454545454545454e-06, + "loss": 0.9465, + "step": 51 + }, + { + "epoch": 0.023948648570852882, + "grad_norm": 3.6018385428584123, + "learning_rate": 1.5757575757575756e-06, + "loss": 0.7828, + "step": 52 + }, + { + "epoch": 0.024409199504907747, + "grad_norm": 3.5109451677027614, + "learning_rate": 1.6060606060606058e-06, + "loss": 0.8131, + "step": 53 + }, + { + "epoch": 0.02486975043896261, + "grad_norm": 3.0290570716615903, + "learning_rate": 1.6363636363636365e-06, + "loss": 0.6437, + "step": 54 + }, + { + "epoch": 0.02533030137301747, + "grad_norm": 3.682837755403324, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.7215, + "step": 55 + }, + { + "epoch": 0.025790852307072336, + "grad_norm": 3.476439822446147, + "learning_rate": 1.6969696969696969e-06, + "loss": 0.7884, + "step": 56 + }, + { + "epoch": 0.026251403241127198, + "grad_norm": 3.9813216927132475, + "learning_rate": 1.7272727272727273e-06, + "loss": 0.7466, + "step": 57 + }, + { + "epoch": 0.026711954175182063, + "grad_norm": 3.6806123465375116, + "learning_rate": 1.7575757575757575e-06, + "loss": 0.7043, + "step": 58 + }, + { + "epoch": 0.027172505109236925, + "grad_norm": 3.1611663708183344, + "learning_rate": 1.7878787878787877e-06, + "loss": 0.7341, + "step": 59 + }, + { + "epoch": 0.027633056043291786, + "grad_norm": 3.444488689830893, + "learning_rate": 1.818181818181818e-06, + "loss": 0.8762, + "step": 60 + }, + { + "epoch": 0.02809360697734665, + "grad_norm": 3.549558394232681, + "learning_rate": 1.8484848484848483e-06, + "loss": 0.8806, + "step": 61 + }, + { + "epoch": 0.028554157911401513, + "grad_norm": 3.9711382750959747, + "learning_rate": 1.878787878787879e-06, + "loss": 0.8029, + "step": 62 + }, + { + "epoch": 0.02901470884545638, + "grad_norm": 3.5959357584947615, + "learning_rate": 1.909090909090909e-06, + "loss": 0.7292, + "step": 63 + }, + { + "epoch": 0.02947525977951124, + "grad_norm": 3.9415569403625024, + "learning_rate": 1.9393939393939395e-06, + "loss": 0.856, + "step": 64 + }, + { + "epoch": 0.029935810713566102, + "grad_norm": 3.406751239525263, + "learning_rate": 1.9696969696969695e-06, + "loss": 0.7986, + "step": 65 + }, + { + "epoch": 0.030396361647620967, + "grad_norm": 3.6931502231787183, + "learning_rate": 2e-06, + "loss": 0.6504, + "step": 66 + }, + { + "epoch": 0.03085691258167583, + "grad_norm": 3.940678039571864, + "learning_rate": 1.9999988863070544e-06, + "loss": 0.6272, + "step": 67 + }, + { + "epoch": 0.03131746351573069, + "grad_norm": 3.361818700666645, + "learning_rate": 1.999995545230698e-06, + "loss": 0.8018, + "step": 68 + }, + { + "epoch": 0.031778014449785556, + "grad_norm": 4.745930001970848, + "learning_rate": 1.9999899767783724e-06, + "loss": 0.7793, + "step": 69 + }, + { + "epoch": 0.03223856538384042, + "grad_norm": 4.566864649641224, + "learning_rate": 1.999982180962482e-06, + "loss": 0.7536, + "step": 70 + }, + { + "epoch": 0.03269911631789528, + "grad_norm": 3.4766274202922123, + "learning_rate": 1.999972157800389e-06, + "loss": 0.6682, + "step": 71 + }, + { + "epoch": 0.033159667251950145, + "grad_norm": 3.2407423987046564, + "learning_rate": 1.999959907314421e-06, + "loss": 0.5877, + "step": 72 + }, + { + "epoch": 0.03362021818600501, + "grad_norm": 3.854719893893145, + "learning_rate": 1.999945429531863e-06, + "loss": 0.8257, + "step": 73 + }, + { + "epoch": 0.03408076912005987, + "grad_norm": 3.384850197913963, + "learning_rate": 1.9999287244849633e-06, + "loss": 0.7341, + "step": 74 + }, + { + "epoch": 0.034541320054114734, + "grad_norm": 3.826326260538933, + "learning_rate": 1.9999097922109303e-06, + "loss": 0.7884, + "step": 75 + }, + { + "epoch": 0.0350018709881696, + "grad_norm": 3.5685218916656805, + "learning_rate": 1.9998886327519336e-06, + "loss": 0.7556, + "step": 76 + }, + { + "epoch": 0.035462421922224464, + "grad_norm": 3.314138350796672, + "learning_rate": 1.999865246155103e-06, + "loss": 0.7505, + "step": 77 + }, + { + "epoch": 0.03592297285627932, + "grad_norm": 3.218790293838154, + "learning_rate": 1.9998396324725305e-06, + "loss": 0.6178, + "step": 78 + }, + { + "epoch": 0.03638352379033419, + "grad_norm": 3.5441600733921055, + "learning_rate": 1.999811791761267e-06, + "loss": 0.711, + "step": 79 + }, + { + "epoch": 0.03684407472438905, + "grad_norm": 3.2797802004892773, + "learning_rate": 1.999781724083324e-06, + "loss": 0.6754, + "step": 80 + }, + { + "epoch": 0.03730462565844391, + "grad_norm": 3.514532124195952, + "learning_rate": 1.9997494295056746e-06, + "loss": 0.7056, + "step": 81 + }, + { + "epoch": 0.03776517659249878, + "grad_norm": 3.4677096105337433, + "learning_rate": 1.9997149081002514e-06, + "loss": 0.6394, + "step": 82 + }, + { + "epoch": 0.03822572752655364, + "grad_norm": 2.932267254260553, + "learning_rate": 1.9996781599439464e-06, + "loss": 0.7235, + "step": 83 + }, + { + "epoch": 0.0386862784606085, + "grad_norm": 3.096434205139558, + "learning_rate": 1.9996391851186118e-06, + "loss": 0.674, + "step": 84 + }, + { + "epoch": 0.039146829394663366, + "grad_norm": 3.858101988445306, + "learning_rate": 1.99959798371106e-06, + "loss": 0.6127, + "step": 85 + }, + { + "epoch": 0.03960738032871823, + "grad_norm": 3.361207098481145, + "learning_rate": 1.999554555813062e-06, + "loss": 0.6981, + "step": 86 + }, + { + "epoch": 0.04006793126277309, + "grad_norm": 3.4015641056770503, + "learning_rate": 1.9995089015213493e-06, + "loss": 0.8552, + "step": 87 + }, + { + "epoch": 0.040528482196827954, + "grad_norm": 3.1305710457612004, + "learning_rate": 1.999461020937611e-06, + "loss": 0.6014, + "step": 88 + }, + { + "epoch": 0.04098903313088282, + "grad_norm": 3.820781969106205, + "learning_rate": 1.999410914168495e-06, + "loss": 0.8311, + "step": 89 + }, + { + "epoch": 0.041449584064937685, + "grad_norm": 3.2079786714114906, + "learning_rate": 1.99935858132561e-06, + "loss": 0.682, + "step": 90 + }, + { + "epoch": 0.04191013499899254, + "grad_norm": 3.643489327427482, + "learning_rate": 1.99930402252552e-06, + "loss": 0.6605, + "step": 91 + }, + { + "epoch": 0.04237068593304741, + "grad_norm": 3.3738758807758766, + "learning_rate": 1.9992472378897497e-06, + "loss": 0.6143, + "step": 92 + }, + { + "epoch": 0.042831236867102274, + "grad_norm": 3.077610620836017, + "learning_rate": 1.9991882275447794e-06, + "loss": 0.6217, + "step": 93 + }, + { + "epoch": 0.04329178780115713, + "grad_norm": 3.162135055961725, + "learning_rate": 1.9991269916220485e-06, + "loss": 0.7265, + "step": 94 + }, + { + "epoch": 0.043752338735212, + "grad_norm": 2.9823928585506647, + "learning_rate": 1.999063530257952e-06, + "loss": 0.6289, + "step": 95 + }, + { + "epoch": 0.04421288966926686, + "grad_norm": 3.511247382327224, + "learning_rate": 1.998997843593845e-06, + "loss": 0.6145, + "step": 96 + }, + { + "epoch": 0.04467344060332172, + "grad_norm": 3.6517354310292265, + "learning_rate": 1.9989299317760345e-06, + "loss": 0.6997, + "step": 97 + }, + { + "epoch": 0.045133991537376586, + "grad_norm": 3.5728547634334746, + "learning_rate": 1.9988597949557883e-06, + "loss": 0.7399, + "step": 98 + }, + { + "epoch": 0.04559454247143145, + "grad_norm": 3.4027410732295653, + "learning_rate": 1.998787433289327e-06, + "loss": 0.695, + "step": 99 + }, + { + "epoch": 0.046055093405486316, + "grad_norm": 3.2197558124034984, + "learning_rate": 1.9987128469378284e-06, + "loss": 0.6144, + "step": 100 + }, + { + "epoch": 0.046515644339541175, + "grad_norm": 3.1601430739183245, + "learning_rate": 1.998636036067425e-06, + "loss": 0.6233, + "step": 101 + }, + { + "epoch": 0.04697619527359604, + "grad_norm": 3.0419954341335087, + "learning_rate": 1.9985570008492044e-06, + "loss": 0.5879, + "step": 102 + }, + { + "epoch": 0.047436746207650905, + "grad_norm": 3.4126431282945027, + "learning_rate": 1.9984757414592083e-06, + "loss": 0.7941, + "step": 103 + }, + { + "epoch": 0.047897297141705764, + "grad_norm": 2.973595724909931, + "learning_rate": 1.998392258078433e-06, + "loss": 0.6119, + "step": 104 + }, + { + "epoch": 0.04835784807576063, + "grad_norm": 4.503321929117909, + "learning_rate": 1.998306550892828e-06, + "loss": 0.6559, + "step": 105 + }, + { + "epoch": 0.048818399009815494, + "grad_norm": 3.615113444334991, + "learning_rate": 1.9982186200932964e-06, + "loss": 0.6638, + "step": 106 + }, + { + "epoch": 0.04927894994387035, + "grad_norm": 3.4410984720501094, + "learning_rate": 1.998128465875694e-06, + "loss": 0.6764, + "step": 107 + }, + { + "epoch": 0.04973950087792522, + "grad_norm": 3.1749314754075617, + "learning_rate": 1.9980360884408288e-06, + "loss": 0.701, + "step": 108 + }, + { + "epoch": 0.05020005181198008, + "grad_norm": 3.2473102824540083, + "learning_rate": 1.997941487994461e-06, + "loss": 0.6957, + "step": 109 + }, + { + "epoch": 0.05066060274603494, + "grad_norm": 3.6947631666546843, + "learning_rate": 1.9978446647473024e-06, + "loss": 0.7448, + "step": 110 + }, + { + "epoch": 0.051121153680089806, + "grad_norm": 3.261951870816807, + "learning_rate": 1.9977456189150163e-06, + "loss": 0.7727, + "step": 111 + }, + { + "epoch": 0.05158170461414467, + "grad_norm": 3.3464119040387925, + "learning_rate": 1.9976443507182152e-06, + "loss": 0.6483, + "step": 112 + }, + { + "epoch": 0.05204225554819954, + "grad_norm": 3.533060942648228, + "learning_rate": 1.997540860382463e-06, + "loss": 0.8126, + "step": 113 + }, + { + "epoch": 0.052502806482254395, + "grad_norm": 3.2761042459948877, + "learning_rate": 1.997435148138272e-06, + "loss": 0.5982, + "step": 114 + }, + { + "epoch": 0.05296335741630926, + "grad_norm": 3.1082981914368473, + "learning_rate": 1.9973272142211046e-06, + "loss": 0.6504, + "step": 115 + }, + { + "epoch": 0.053423908350364126, + "grad_norm": 3.3969089614282444, + "learning_rate": 1.997217058871371e-06, + "loss": 0.5928, + "step": 116 + }, + { + "epoch": 0.053884459284418984, + "grad_norm": 3.1930029822538173, + "learning_rate": 1.9971046823344304e-06, + "loss": 0.5868, + "step": 117 + }, + { + "epoch": 0.05434501021847385, + "grad_norm": 3.6974861112788457, + "learning_rate": 1.9969900848605877e-06, + "loss": 0.7963, + "step": 118 + }, + { + "epoch": 0.054805561152528715, + "grad_norm": 3.315165825861059, + "learning_rate": 1.9968732667050966e-06, + "loss": 0.6313, + "step": 119 + }, + { + "epoch": 0.05526611208658357, + "grad_norm": 3.539748098126486, + "learning_rate": 1.9967542281281557e-06, + "loss": 0.7429, + "step": 120 + }, + { + "epoch": 0.05572666302063844, + "grad_norm": 3.338078973425553, + "learning_rate": 1.9966329693949093e-06, + "loss": 0.7662, + "step": 121 + }, + { + "epoch": 0.0561872139546933, + "grad_norm": 3.472065054117364, + "learning_rate": 1.996509490775449e-06, + "loss": 0.6625, + "step": 122 + }, + { + "epoch": 0.05664776488874816, + "grad_norm": 3.1792157328292974, + "learning_rate": 1.996383792544808e-06, + "loss": 0.6792, + "step": 123 + }, + { + "epoch": 0.05710831582280303, + "grad_norm": 3.5210682757899145, + "learning_rate": 1.996255874982965e-06, + "loss": 0.6079, + "step": 124 + }, + { + "epoch": 0.05756886675685789, + "grad_norm": 3.2138238654727416, + "learning_rate": 1.996125738374842e-06, + "loss": 0.5784, + "step": 125 + }, + { + "epoch": 0.05802941769091276, + "grad_norm": 3.3633304567565196, + "learning_rate": 1.995993383010303e-06, + "loss": 0.6367, + "step": 126 + }, + { + "epoch": 0.058489968624967616, + "grad_norm": 3.781021254907743, + "learning_rate": 1.9958588091841553e-06, + "loss": 0.6771, + "step": 127 + }, + { + "epoch": 0.05895051955902248, + "grad_norm": 3.6201021154476516, + "learning_rate": 1.9957220171961465e-06, + "loss": 0.6707, + "step": 128 + }, + { + "epoch": 0.059411070493077346, + "grad_norm": 3.6881373018752885, + "learning_rate": 1.995583007350964e-06, + "loss": 0.7512, + "step": 129 + }, + { + "epoch": 0.059871621427132204, + "grad_norm": 3.3800092959402566, + "learning_rate": 1.9954417799582382e-06, + "loss": 0.5795, + "step": 130 + }, + { + "epoch": 0.06033217236118707, + "grad_norm": 3.6812949356112936, + "learning_rate": 1.9952983353325356e-06, + "loss": 0.59, + "step": 131 + }, + { + "epoch": 0.060792723295241935, + "grad_norm": 3.730872974671056, + "learning_rate": 1.9951526737933634e-06, + "loss": 0.6077, + "step": 132 + }, + { + "epoch": 0.06125327422929679, + "grad_norm": 3.5504209377161757, + "learning_rate": 1.9950047956651657e-06, + "loss": 0.606, + "step": 133 + }, + { + "epoch": 0.06171382516335166, + "grad_norm": 3.2843333588119727, + "learning_rate": 1.9948547012773246e-06, + "loss": 0.6067, + "step": 134 + }, + { + "epoch": 0.062174376097406524, + "grad_norm": 3.3876731358864403, + "learning_rate": 1.9947023909641574e-06, + "loss": 0.7097, + "step": 135 + }, + { + "epoch": 0.06263492703146138, + "grad_norm": 4.024239315384627, + "learning_rate": 1.994547865064919e-06, + "loss": 0.8154, + "step": 136 + }, + { + "epoch": 0.06309547796551625, + "grad_norm": 3.47895486959038, + "learning_rate": 1.9943911239237974e-06, + "loss": 0.5583, + "step": 137 + }, + { + "epoch": 0.06355602889957111, + "grad_norm": 3.409716745725618, + "learning_rate": 1.9942321678899163e-06, + "loss": 0.5774, + "step": 138 + }, + { + "epoch": 0.06401657983362598, + "grad_norm": 3.32970506539607, + "learning_rate": 1.9940709973173314e-06, + "loss": 0.6011, + "step": 139 + }, + { + "epoch": 0.06447713076768084, + "grad_norm": 3.1166430692289246, + "learning_rate": 1.993907612565032e-06, + "loss": 0.5851, + "step": 140 + }, + { + "epoch": 0.0649376817017357, + "grad_norm": 3.5411479372126426, + "learning_rate": 1.9937420139969395e-06, + "loss": 0.7496, + "step": 141 + }, + { + "epoch": 0.06539823263579056, + "grad_norm": 3.421729369489728, + "learning_rate": 1.993574201981905e-06, + "loss": 0.6842, + "step": 142 + }, + { + "epoch": 0.06585878356984542, + "grad_norm": 3.1449216041214205, + "learning_rate": 1.9934041768937114e-06, + "loss": 0.6461, + "step": 143 + }, + { + "epoch": 0.06631933450390029, + "grad_norm": 3.6494550010926328, + "learning_rate": 1.9932319391110695e-06, + "loss": 0.7231, + "step": 144 + }, + { + "epoch": 0.06677988543795516, + "grad_norm": 3.728480459329745, + "learning_rate": 1.99305748901762e-06, + "loss": 0.7552, + "step": 145 + }, + { + "epoch": 0.06724043637201002, + "grad_norm": 3.2822173129999137, + "learning_rate": 1.9928808270019296e-06, + "loss": 0.6228, + "step": 146 + }, + { + "epoch": 0.06770098730606489, + "grad_norm": 3.2867458531135196, + "learning_rate": 1.9927019534574937e-06, + "loss": 0.7294, + "step": 147 + }, + { + "epoch": 0.06816153824011974, + "grad_norm": 3.1013936687675714, + "learning_rate": 1.992520868782732e-06, + "loss": 0.5613, + "step": 148 + }, + { + "epoch": 0.0686220891741746, + "grad_norm": 3.159174332303596, + "learning_rate": 1.9923375733809905e-06, + "loss": 0.7149, + "step": 149 + }, + { + "epoch": 0.06908264010822947, + "grad_norm": 3.516091730035019, + "learning_rate": 1.992152067660539e-06, + "loss": 0.6315, + "step": 150 + }, + { + "epoch": 0.06954319104228433, + "grad_norm": 3.165146365156602, + "learning_rate": 1.9919643520345695e-06, + "loss": 0.5459, + "step": 151 + }, + { + "epoch": 0.0700037419763392, + "grad_norm": 4.0062232007227685, + "learning_rate": 1.991774426921198e-06, + "loss": 0.6454, + "step": 152 + }, + { + "epoch": 0.07046429291039406, + "grad_norm": 3.548301168807787, + "learning_rate": 1.99158229274346e-06, + "loss": 0.676, + "step": 153 + }, + { + "epoch": 0.07092484384444893, + "grad_norm": 3.4249899072112027, + "learning_rate": 1.9913879499293136e-06, + "loss": 0.6644, + "step": 154 + }, + { + "epoch": 0.07138539477850378, + "grad_norm": 3.707615974526274, + "learning_rate": 1.9911913989116345e-06, + "loss": 0.6739, + "step": 155 + }, + { + "epoch": 0.07184594571255865, + "grad_norm": 3.086896742538257, + "learning_rate": 1.990992640128218e-06, + "loss": 0.5981, + "step": 156 + }, + { + "epoch": 0.07230649664661351, + "grad_norm": 3.1043886733514983, + "learning_rate": 1.990791674021776e-06, + "loss": 0.5621, + "step": 157 + }, + { + "epoch": 0.07276704758066838, + "grad_norm": 2.7518305668279432, + "learning_rate": 1.9905885010399386e-06, + "loss": 0.5827, + "step": 158 + }, + { + "epoch": 0.07322759851472324, + "grad_norm": 3.1688929903728904, + "learning_rate": 1.9903831216352494e-06, + "loss": 0.5834, + "step": 159 + }, + { + "epoch": 0.0736881494487781, + "grad_norm": 3.3606584053832695, + "learning_rate": 1.9901755362651685e-06, + "loss": 0.6374, + "step": 160 + }, + { + "epoch": 0.07414870038283296, + "grad_norm": 3.8896133343206922, + "learning_rate": 1.9899657453920676e-06, + "loss": 0.7499, + "step": 161 + }, + { + "epoch": 0.07460925131688782, + "grad_norm": 3.18752926424928, + "learning_rate": 1.989753749483233e-06, + "loss": 0.686, + "step": 162 + }, + { + "epoch": 0.07506980225094269, + "grad_norm": 3.4277096993868557, + "learning_rate": 1.989539549010861e-06, + "loss": 0.6334, + "step": 163 + }, + { + "epoch": 0.07553035318499755, + "grad_norm": 3.1363256404631876, + "learning_rate": 1.9893231444520584e-06, + "loss": 0.7184, + "step": 164 + }, + { + "epoch": 0.07599090411905242, + "grad_norm": 3.773031496303488, + "learning_rate": 1.9891045362888413e-06, + "loss": 0.7071, + "step": 165 + }, + { + "epoch": 0.07645145505310728, + "grad_norm": 3.4894250417343957, + "learning_rate": 1.988883725008136e-06, + "loss": 0.8024, + "step": 166 + }, + { + "epoch": 0.07691200598716215, + "grad_norm": 3.5237596679440126, + "learning_rate": 1.9886607111017727e-06, + "loss": 0.5565, + "step": 167 + }, + { + "epoch": 0.077372556921217, + "grad_norm": 3.956330390896236, + "learning_rate": 1.988435495066491e-06, + "loss": 0.6834, + "step": 168 + }, + { + "epoch": 0.07783310785527187, + "grad_norm": 3.0581620476086027, + "learning_rate": 1.988208077403932e-06, + "loss": 0.5355, + "step": 169 + }, + { + "epoch": 0.07829365878932673, + "grad_norm": 3.359700225843598, + "learning_rate": 1.9879784586206446e-06, + "loss": 0.6266, + "step": 170 + }, + { + "epoch": 0.0787542097233816, + "grad_norm": 3.160066872012096, + "learning_rate": 1.987746639228077e-06, + "loss": 0.4693, + "step": 171 + }, + { + "epoch": 0.07921476065743646, + "grad_norm": 2.8930357386693037, + "learning_rate": 1.9875126197425812e-06, + "loss": 0.543, + "step": 172 + }, + { + "epoch": 0.07967531159149133, + "grad_norm": 3.2160039035976538, + "learning_rate": 1.987276400685409e-06, + "loss": 0.5285, + "step": 173 + }, + { + "epoch": 0.08013586252554618, + "grad_norm": 3.5154065293512837, + "learning_rate": 1.9870379825827105e-06, + "loss": 0.7303, + "step": 174 + }, + { + "epoch": 0.08059641345960104, + "grad_norm": 3.272929698816058, + "learning_rate": 1.9867973659655357e-06, + "loss": 0.596, + "step": 175 + }, + { + "epoch": 0.08105696439365591, + "grad_norm": 3.2230741681038837, + "learning_rate": 1.9865545513698304e-06, + "loss": 0.7758, + "step": 176 + }, + { + "epoch": 0.08151751532771077, + "grad_norm": 2.88861925930577, + "learning_rate": 1.9863095393364363e-06, + "loss": 0.5791, + "step": 177 + }, + { + "epoch": 0.08197806626176564, + "grad_norm": 3.6487253551379166, + "learning_rate": 1.9860623304110895e-06, + "loss": 0.8919, + "step": 178 + }, + { + "epoch": 0.0824386171958205, + "grad_norm": 3.3372588648412513, + "learning_rate": 1.9858129251444203e-06, + "loss": 0.6433, + "step": 179 + }, + { + "epoch": 0.08289916812987537, + "grad_norm": 3.069467629265819, + "learning_rate": 1.9855613240919496e-06, + "loss": 0.617, + "step": 180 + }, + { + "epoch": 0.08335971906393022, + "grad_norm": 3.489373026628576, + "learning_rate": 1.985307527814091e-06, + "loss": 0.768, + "step": 181 + }, + { + "epoch": 0.08382026999798509, + "grad_norm": 3.459352404519042, + "learning_rate": 1.9850515368761465e-06, + "loss": 0.7647, + "step": 182 + }, + { + "epoch": 0.08428082093203995, + "grad_norm": 3.4396136216484012, + "learning_rate": 1.9847933518483066e-06, + "loss": 0.6323, + "step": 183 + }, + { + "epoch": 0.08474137186609482, + "grad_norm": 3.369315529955375, + "learning_rate": 1.9845329733056488e-06, + "loss": 0.6724, + "step": 184 + }, + { + "epoch": 0.08520192280014968, + "grad_norm": 3.144962754808177, + "learning_rate": 1.9842704018281364e-06, + "loss": 0.6974, + "step": 185 + }, + { + "epoch": 0.08566247373420455, + "grad_norm": 3.3041054108917143, + "learning_rate": 1.984005638000618e-06, + "loss": 0.6231, + "step": 186 + }, + { + "epoch": 0.0861230246682594, + "grad_norm": 4.170198999419159, + "learning_rate": 1.983738682412824e-06, + "loss": 0.644, + "step": 187 + }, + { + "epoch": 0.08658357560231426, + "grad_norm": 3.22338168544018, + "learning_rate": 1.983469535659369e-06, + "loss": 0.5809, + "step": 188 + }, + { + "epoch": 0.08704412653636913, + "grad_norm": 3.87679312890683, + "learning_rate": 1.983198198339745e-06, + "loss": 0.6326, + "step": 189 + }, + { + "epoch": 0.087504677470424, + "grad_norm": 2.8932671924853026, + "learning_rate": 1.9829246710583258e-06, + "loss": 0.694, + "step": 190 + }, + { + "epoch": 0.08796522840447886, + "grad_norm": 3.0104693995609293, + "learning_rate": 1.982648954424362e-06, + "loss": 0.5702, + "step": 191 + }, + { + "epoch": 0.08842577933853372, + "grad_norm": 3.6696726695949597, + "learning_rate": 1.982371049051981e-06, + "loss": 0.6627, + "step": 192 + }, + { + "epoch": 0.08888633027258859, + "grad_norm": 3.483251606814318, + "learning_rate": 1.982090955560185e-06, + "loss": 0.6148, + "step": 193 + }, + { + "epoch": 0.08934688120664344, + "grad_norm": 3.059134143200745, + "learning_rate": 1.981808674572851e-06, + "loss": 0.641, + "step": 194 + }, + { + "epoch": 0.0898074321406983, + "grad_norm": 3.471348679245806, + "learning_rate": 1.9815242067187273e-06, + "loss": 0.7375, + "step": 195 + }, + { + "epoch": 0.09026798307475317, + "grad_norm": 3.2356019265585423, + "learning_rate": 1.9812375526314335e-06, + "loss": 0.6559, + "step": 196 + }, + { + "epoch": 0.09072853400880804, + "grad_norm": 3.5553418134929737, + "learning_rate": 1.9809487129494588e-06, + "loss": 0.6052, + "step": 197 + }, + { + "epoch": 0.0911890849428629, + "grad_norm": 3.5207928333701126, + "learning_rate": 1.9806576883161607e-06, + "loss": 0.7502, + "step": 198 + }, + { + "epoch": 0.09164963587691777, + "grad_norm": 3.6571981250699808, + "learning_rate": 1.9803644793797635e-06, + "loss": 0.6423, + "step": 199 + }, + { + "epoch": 0.09211018681097263, + "grad_norm": 3.3310862469732236, + "learning_rate": 1.9800690867933567e-06, + "loss": 0.6166, + "step": 200 + }, + { + "epoch": 0.09257073774502748, + "grad_norm": 3.599428672561491, + "learning_rate": 1.9797715112148933e-06, + "loss": 0.6751, + "step": 201 + }, + { + "epoch": 0.09303128867908235, + "grad_norm": 3.111228152420326, + "learning_rate": 1.979471753307189e-06, + "loss": 0.5873, + "step": 202 + }, + { + "epoch": 0.09349183961313721, + "grad_norm": 2.7796406722600433, + "learning_rate": 1.979169813737921e-06, + "loss": 0.5509, + "step": 203 + }, + { + "epoch": 0.09395239054719208, + "grad_norm": 3.3190228535283586, + "learning_rate": 1.9788656931796237e-06, + "loss": 0.5311, + "step": 204 + }, + { + "epoch": 0.09441294148124695, + "grad_norm": 3.4901850657341678, + "learning_rate": 1.9785593923096927e-06, + "loss": 0.6073, + "step": 205 + }, + { + "epoch": 0.09487349241530181, + "grad_norm": 3.8594657707637836, + "learning_rate": 1.978250911810377e-06, + "loss": 0.7903, + "step": 206 + }, + { + "epoch": 0.09533404334935666, + "grad_norm": 3.783029519442971, + "learning_rate": 1.9779402523687825e-06, + "loss": 0.7535, + "step": 207 + }, + { + "epoch": 0.09579459428341153, + "grad_norm": 3.541223829083605, + "learning_rate": 1.977627414676867e-06, + "loss": 0.6787, + "step": 208 + }, + { + "epoch": 0.09625514521746639, + "grad_norm": 3.300314419811215, + "learning_rate": 1.977312399431441e-06, + "loss": 0.688, + "step": 209 + }, + { + "epoch": 0.09671569615152126, + "grad_norm": 3.021536775472106, + "learning_rate": 1.9769952073341655e-06, + "loss": 0.5406, + "step": 210 + }, + { + "epoch": 0.09717624708557612, + "grad_norm": 3.1063359917852016, + "learning_rate": 1.976675839091549e-06, + "loss": 0.7334, + "step": 211 + }, + { + "epoch": 0.09763679801963099, + "grad_norm": 4.2418847264483555, + "learning_rate": 1.976354295414948e-06, + "loss": 0.7438, + "step": 212 + }, + { + "epoch": 0.09809734895368585, + "grad_norm": 3.2746417894451927, + "learning_rate": 1.9760305770205648e-06, + "loss": 0.6335, + "step": 213 + }, + { + "epoch": 0.0985578998877407, + "grad_norm": 3.217191492035501, + "learning_rate": 1.9757046846294446e-06, + "loss": 0.6167, + "step": 214 + }, + { + "epoch": 0.09901845082179557, + "grad_norm": 3.202923015353876, + "learning_rate": 1.975376618967476e-06, + "loss": 0.6576, + "step": 215 + }, + { + "epoch": 0.09947900175585044, + "grad_norm": 3.368536938836128, + "learning_rate": 1.975046380765387e-06, + "loss": 0.5377, + "step": 216 + }, + { + "epoch": 0.0999395526899053, + "grad_norm": 3.371798575684341, + "learning_rate": 1.9747139707587467e-06, + "loss": 0.5324, + "step": 217 + }, + { + "epoch": 0.10040010362396017, + "grad_norm": 2.7715577719122484, + "learning_rate": 1.9743793896879595e-06, + "loss": 0.6635, + "step": 218 + }, + { + "epoch": 0.10086065455801503, + "grad_norm": 2.829524830044529, + "learning_rate": 1.974042638298267e-06, + "loss": 0.5327, + "step": 219 + }, + { + "epoch": 0.10132120549206988, + "grad_norm": 2.891869373792989, + "learning_rate": 1.9737037173397446e-06, + "loss": 0.5856, + "step": 220 + }, + { + "epoch": 0.10178175642612475, + "grad_norm": 3.062890989327354, + "learning_rate": 1.9733626275672996e-06, + "loss": 0.7426, + "step": 221 + }, + { + "epoch": 0.10224230736017961, + "grad_norm": 3.116251286088085, + "learning_rate": 1.973019369740671e-06, + "loss": 0.5652, + "step": 222 + }, + { + "epoch": 0.10270285829423448, + "grad_norm": 3.4890492450754502, + "learning_rate": 1.972673944624426e-06, + "loss": 0.6855, + "step": 223 + }, + { + "epoch": 0.10316340922828934, + "grad_norm": 3.0888002182898444, + "learning_rate": 1.9723263529879598e-06, + "loss": 0.83, + "step": 224 + }, + { + "epoch": 0.10362396016234421, + "grad_norm": 3.3556762378140896, + "learning_rate": 1.9719765956054933e-06, + "loss": 0.6933, + "step": 225 + }, + { + "epoch": 0.10408451109639907, + "grad_norm": 3.195505438673579, + "learning_rate": 1.971624673256071e-06, + "loss": 0.5613, + "step": 226 + }, + { + "epoch": 0.10454506203045393, + "grad_norm": 2.6074576346797143, + "learning_rate": 1.9712705867235604e-06, + "loss": 0.4869, + "step": 227 + }, + { + "epoch": 0.10500561296450879, + "grad_norm": 3.421431138997975, + "learning_rate": 1.970914336796648e-06, + "loss": 0.5345, + "step": 228 + }, + { + "epoch": 0.10546616389856366, + "grad_norm": 3.558425101140546, + "learning_rate": 1.97055592426884e-06, + "loss": 0.6834, + "step": 229 + }, + { + "epoch": 0.10592671483261852, + "grad_norm": 3.662238091068311, + "learning_rate": 1.9701953499384593e-06, + "loss": 0.6788, + "step": 230 + }, + { + "epoch": 0.10638726576667339, + "grad_norm": 2.9575097174697387, + "learning_rate": 1.9698326146086445e-06, + "loss": 0.6197, + "step": 231 + }, + { + "epoch": 0.10684781670072825, + "grad_norm": 3.4475135509424293, + "learning_rate": 1.9694677190873467e-06, + "loss": 0.659, + "step": 232 + }, + { + "epoch": 0.1073083676347831, + "grad_norm": 3.108410036960475, + "learning_rate": 1.9691006641873296e-06, + "loss": 0.6831, + "step": 233 + }, + { + "epoch": 0.10776891856883797, + "grad_norm": 3.220407600024432, + "learning_rate": 1.968731450726166e-06, + "loss": 0.6389, + "step": 234 + }, + { + "epoch": 0.10822946950289283, + "grad_norm": 3.534132987080869, + "learning_rate": 1.9683600795262364e-06, + "loss": 0.6332, + "step": 235 + }, + { + "epoch": 0.1086900204369477, + "grad_norm": 3.5324339129255877, + "learning_rate": 1.9679865514147277e-06, + "loss": 0.7518, + "step": 236 + }, + { + "epoch": 0.10915057137100256, + "grad_norm": 3.0421407458400003, + "learning_rate": 1.9676108672236317e-06, + "loss": 0.5808, + "step": 237 + }, + { + "epoch": 0.10961112230505743, + "grad_norm": 3.5520947520869908, + "learning_rate": 1.9672330277897414e-06, + "loss": 0.5368, + "step": 238 + }, + { + "epoch": 0.1100716732391123, + "grad_norm": 3.451159362431226, + "learning_rate": 1.9668530339546514e-06, + "loss": 0.6296, + "step": 239 + }, + { + "epoch": 0.11053222417316715, + "grad_norm": 3.509910875733122, + "learning_rate": 1.966470886564755e-06, + "loss": 0.7006, + "step": 240 + }, + { + "epoch": 0.11099277510722201, + "grad_norm": 2.596656303513613, + "learning_rate": 1.9660865864712412e-06, + "loss": 0.4933, + "step": 241 + }, + { + "epoch": 0.11145332604127688, + "grad_norm": 3.4232091740027655, + "learning_rate": 1.965700134530095e-06, + "loss": 0.5211, + "step": 242 + }, + { + "epoch": 0.11191387697533174, + "grad_norm": 3.040611124293651, + "learning_rate": 1.9653115316020935e-06, + "loss": 0.6377, + "step": 243 + }, + { + "epoch": 0.1123744279093866, + "grad_norm": 3.4791503945160267, + "learning_rate": 1.9649207785528065e-06, + "loss": 0.6889, + "step": 244 + }, + { + "epoch": 0.11283497884344147, + "grad_norm": 3.0895910555628032, + "learning_rate": 1.96452787625259e-06, + "loss": 0.6833, + "step": 245 + }, + { + "epoch": 0.11329552977749632, + "grad_norm": 3.001038166095486, + "learning_rate": 1.9641328255765913e-06, + "loss": 0.5572, + "step": 246 + }, + { + "epoch": 0.11375608071155119, + "grad_norm": 3.7375507199090596, + "learning_rate": 1.963735627404739e-06, + "loss": 0.6776, + "step": 247 + }, + { + "epoch": 0.11421663164560605, + "grad_norm": 3.4861645782956723, + "learning_rate": 1.963336282621747e-06, + "loss": 0.6076, + "step": 248 + }, + { + "epoch": 0.11467718257966092, + "grad_norm": 3.9356402228698313, + "learning_rate": 1.962934792117111e-06, + "loss": 0.5698, + "step": 249 + }, + { + "epoch": 0.11513773351371578, + "grad_norm": 3.4680226962421496, + "learning_rate": 1.9625311567851045e-06, + "loss": 0.6332, + "step": 250 + }, + { + "epoch": 0.11559828444777065, + "grad_norm": 4.110638488505918, + "learning_rate": 1.9621253775247795e-06, + "loss": 0.7351, + "step": 251 + }, + { + "epoch": 0.11605883538182551, + "grad_norm": 3.736146712062541, + "learning_rate": 1.9617174552399633e-06, + "loss": 0.7494, + "step": 252 + }, + { + "epoch": 0.11651938631588037, + "grad_norm": 2.938611398017176, + "learning_rate": 1.961307390839255e-06, + "loss": 0.6439, + "step": 253 + }, + { + "epoch": 0.11697993724993523, + "grad_norm": 3.267375637056784, + "learning_rate": 1.960895185236028e-06, + "loss": 0.624, + "step": 254 + }, + { + "epoch": 0.1174404881839901, + "grad_norm": 3.394791456038148, + "learning_rate": 1.9604808393484217e-06, + "loss": 0.605, + "step": 255 + }, + { + "epoch": 0.11790103911804496, + "grad_norm": 3.1028647299114174, + "learning_rate": 1.960064354099345e-06, + "loss": 0.6428, + "step": 256 + }, + { + "epoch": 0.11836159005209983, + "grad_norm": 3.0414855513121357, + "learning_rate": 1.959645730416471e-06, + "loss": 0.4093, + "step": 257 + }, + { + "epoch": 0.11882214098615469, + "grad_norm": 3.4120022845796525, + "learning_rate": 1.959224969232237e-06, + "loss": 0.6576, + "step": 258 + }, + { + "epoch": 0.11928269192020956, + "grad_norm": 2.972667933535389, + "learning_rate": 1.9588020714838394e-06, + "loss": 0.5633, + "step": 259 + }, + { + "epoch": 0.11974324285426441, + "grad_norm": 3.2887632380841842, + "learning_rate": 1.9583770381132357e-06, + "loss": 0.6012, + "step": 260 + }, + { + "epoch": 0.12020379378831927, + "grad_norm": 3.5081309566331527, + "learning_rate": 1.9579498700671386e-06, + "loss": 0.6691, + "step": 261 + }, + { + "epoch": 0.12066434472237414, + "grad_norm": 3.3208918064597936, + "learning_rate": 1.9575205682970163e-06, + "loss": 0.6513, + "step": 262 + }, + { + "epoch": 0.121124895656429, + "grad_norm": 3.2712864716169943, + "learning_rate": 1.9570891337590895e-06, + "loss": 0.5478, + "step": 263 + }, + { + "epoch": 0.12158544659048387, + "grad_norm": 3.282868754227743, + "learning_rate": 1.956655567414329e-06, + "loss": 0.6925, + "step": 264 + }, + { + "epoch": 0.12204599752453874, + "grad_norm": 3.482367964725982, + "learning_rate": 1.9562198702284552e-06, + "loss": 0.7356, + "step": 265 + }, + { + "epoch": 0.12250654845859359, + "grad_norm": 3.084010149386528, + "learning_rate": 1.955782043171933e-06, + "loss": 0.5609, + "step": 266 + }, + { + "epoch": 0.12296709939264845, + "grad_norm": 3.0585298128518765, + "learning_rate": 1.9553420872199732e-06, + "loss": 0.6027, + "step": 267 + }, + { + "epoch": 0.12342765032670332, + "grad_norm": 3.386973557288646, + "learning_rate": 1.954900003352527e-06, + "loss": 0.5398, + "step": 268 + }, + { + "epoch": 0.12388820126075818, + "grad_norm": 3.2451705841602396, + "learning_rate": 1.954455792554285e-06, + "loss": 0.7089, + "step": 269 + }, + { + "epoch": 0.12434875219481305, + "grad_norm": 2.9403279279548, + "learning_rate": 1.9540094558146775e-06, + "loss": 0.4651, + "step": 270 + }, + { + "epoch": 0.12480930312886791, + "grad_norm": 3.422975954193148, + "learning_rate": 1.9535609941278677e-06, + "loss": 0.6637, + "step": 271 + }, + { + "epoch": 0.12526985406292276, + "grad_norm": 2.8483103524833906, + "learning_rate": 1.9531104084927526e-06, + "loss": 0.5443, + "step": 272 + }, + { + "epoch": 0.12573040499697763, + "grad_norm": 3.0046696813378477, + "learning_rate": 1.9526576999129613e-06, + "loss": 0.6132, + "step": 273 + }, + { + "epoch": 0.1261909559310325, + "grad_norm": 3.030506496717706, + "learning_rate": 1.9522028693968496e-06, + "loss": 0.6696, + "step": 274 + }, + { + "epoch": 0.12665150686508736, + "grad_norm": 3.4189036410010076, + "learning_rate": 1.951745917957501e-06, + "loss": 0.6398, + "step": 275 + }, + { + "epoch": 0.12711205779914223, + "grad_norm": 3.4281845642915854, + "learning_rate": 1.951286846612723e-06, + "loss": 0.613, + "step": 276 + }, + { + "epoch": 0.1275726087331971, + "grad_norm": 3.5114510461027573, + "learning_rate": 1.9508256563850437e-06, + "loss": 0.5248, + "step": 277 + }, + { + "epoch": 0.12803315966725196, + "grad_norm": 3.2175363479882853, + "learning_rate": 1.9503623483017125e-06, + "loss": 0.6153, + "step": 278 + }, + { + "epoch": 0.12849371060130682, + "grad_norm": 3.257507884907778, + "learning_rate": 1.949896923394695e-06, + "loss": 0.6326, + "step": 279 + }, + { + "epoch": 0.12895426153536169, + "grad_norm": 3.4234091810352894, + "learning_rate": 1.9494293827006724e-06, + "loss": 0.6533, + "step": 280 + }, + { + "epoch": 0.12941481246941655, + "grad_norm": 2.9859571949464803, + "learning_rate": 1.9489597272610374e-06, + "loss": 0.7496, + "step": 281 + }, + { + "epoch": 0.1298753634034714, + "grad_norm": 3.6355830400808125, + "learning_rate": 1.948487958121895e-06, + "loss": 0.6794, + "step": 282 + }, + { + "epoch": 0.13033591433752625, + "grad_norm": 3.2842113677863733, + "learning_rate": 1.9480140763340563e-06, + "loss": 0.6123, + "step": 283 + }, + { + "epoch": 0.13079646527158112, + "grad_norm": 3.094397720174347, + "learning_rate": 1.9475380829530394e-06, + "loss": 0.5419, + "step": 284 + }, + { + "epoch": 0.13125701620563598, + "grad_norm": 3.527829614510662, + "learning_rate": 1.947059979039065e-06, + "loss": 0.7257, + "step": 285 + }, + { + "epoch": 0.13171756713969085, + "grad_norm": 3.4106857915169475, + "learning_rate": 1.9465797656570544e-06, + "loss": 0.6017, + "step": 286 + }, + { + "epoch": 0.13217811807374572, + "grad_norm": 3.319543434532599, + "learning_rate": 1.946097443876629e-06, + "loss": 0.6015, + "step": 287 + }, + { + "epoch": 0.13263866900780058, + "grad_norm": 3.141575156484543, + "learning_rate": 1.9456130147721057e-06, + "loss": 0.6703, + "step": 288 + }, + { + "epoch": 0.13309921994185545, + "grad_norm": 2.8399129741558635, + "learning_rate": 1.9451264794224948e-06, + "loss": 0.4491, + "step": 289 + }, + { + "epoch": 0.1335597708759103, + "grad_norm": 3.0162883821205484, + "learning_rate": 1.944637838911498e-06, + "loss": 0.4653, + "step": 290 + }, + { + "epoch": 0.13402032180996518, + "grad_norm": 3.0158920954224593, + "learning_rate": 1.944147094327506e-06, + "loss": 0.5747, + "step": 291 + }, + { + "epoch": 0.13448087274402004, + "grad_norm": 2.8892050309237054, + "learning_rate": 1.9436542467635968e-06, + "loss": 0.4266, + "step": 292 + }, + { + "epoch": 0.1349414236780749, + "grad_norm": 3.643489464124624, + "learning_rate": 1.943159297317532e-06, + "loss": 0.6408, + "step": 293 + }, + { + "epoch": 0.13540197461212977, + "grad_norm": 3.1781209929973815, + "learning_rate": 1.9426622470917553e-06, + "loss": 0.588, + "step": 294 + }, + { + "epoch": 0.13586252554618464, + "grad_norm": 3.2919873800040245, + "learning_rate": 1.942163097193389e-06, + "loss": 0.6894, + "step": 295 + }, + { + "epoch": 0.13632307648023947, + "grad_norm": 2.858947646506684, + "learning_rate": 1.941661848734233e-06, + "loss": 0.6467, + "step": 296 + }, + { + "epoch": 0.13678362741429434, + "grad_norm": 3.246171280951807, + "learning_rate": 1.9411585028307604e-06, + "loss": 0.5943, + "step": 297 + }, + { + "epoch": 0.1372441783483492, + "grad_norm": 2.7358113053763953, + "learning_rate": 1.9406530606041173e-06, + "loss": 0.524, + "step": 298 + }, + { + "epoch": 0.13770472928240407, + "grad_norm": 3.136480105540605, + "learning_rate": 1.940145523180118e-06, + "loss": 0.6102, + "step": 299 + }, + { + "epoch": 0.13816528021645894, + "grad_norm": 3.155533214531188, + "learning_rate": 1.939635891689245e-06, + "loss": 0.5248, + "step": 300 + }, + { + "epoch": 0.1386258311505138, + "grad_norm": 3.647288084061138, + "learning_rate": 1.9391241672666437e-06, + "loss": 0.7047, + "step": 301 + }, + { + "epoch": 0.13908638208456867, + "grad_norm": 2.979265708987498, + "learning_rate": 1.938610351052122e-06, + "loss": 0.5515, + "step": 302 + }, + { + "epoch": 0.13954693301862353, + "grad_norm": 3.119300289126859, + "learning_rate": 1.938094444190147e-06, + "loss": 0.6248, + "step": 303 + }, + { + "epoch": 0.1400074839526784, + "grad_norm": 3.663446793597039, + "learning_rate": 1.937576447829842e-06, + "loss": 0.5661, + "step": 304 + }, + { + "epoch": 0.14046803488673326, + "grad_norm": 3.5292734921724236, + "learning_rate": 1.937056363124985e-06, + "loss": 0.5915, + "step": 305 + }, + { + "epoch": 0.14092858582078813, + "grad_norm": 3.037749619620941, + "learning_rate": 1.936534191234006e-06, + "loss": 0.676, + "step": 306 + }, + { + "epoch": 0.141389136754843, + "grad_norm": 3.387669461542685, + "learning_rate": 1.9360099333199825e-06, + "loss": 0.6448, + "step": 307 + }, + { + "epoch": 0.14184968768889786, + "grad_norm": 3.5739322009118166, + "learning_rate": 1.935483590550639e-06, + "loss": 0.7227, + "step": 308 + }, + { + "epoch": 0.1423102386229527, + "grad_norm": 3.1735367015087115, + "learning_rate": 1.9349551640983444e-06, + "loss": 0.5428, + "step": 309 + }, + { + "epoch": 0.14277078955700756, + "grad_norm": 3.048510928639603, + "learning_rate": 1.934424655140109e-06, + "loss": 0.5981, + "step": 310 + }, + { + "epoch": 0.14323134049106243, + "grad_norm": 3.586272313938004, + "learning_rate": 1.933892064857579e-06, + "loss": 0.5659, + "step": 311 + }, + { + "epoch": 0.1436918914251173, + "grad_norm": 3.3251317073628393, + "learning_rate": 1.933357394437041e-06, + "loss": 0.6554, + "step": 312 + }, + { + "epoch": 0.14415244235917216, + "grad_norm": 3.2915172640337915, + "learning_rate": 1.93282064506941e-06, + "loss": 0.5574, + "step": 313 + }, + { + "epoch": 0.14461299329322702, + "grad_norm": 3.1215547918258952, + "learning_rate": 1.9322818179502356e-06, + "loss": 0.5984, + "step": 314 + }, + { + "epoch": 0.1450735442272819, + "grad_norm": 3.1067732908414336, + "learning_rate": 1.931740914279693e-06, + "loss": 0.5418, + "step": 315 + }, + { + "epoch": 0.14553409516133675, + "grad_norm": 3.0231982850922363, + "learning_rate": 1.9311979352625832e-06, + "loss": 0.6471, + "step": 316 + }, + { + "epoch": 0.14599464609539162, + "grad_norm": 3.2286421391211326, + "learning_rate": 1.930652882108331e-06, + "loss": 0.667, + "step": 317 + }, + { + "epoch": 0.14645519702944648, + "grad_norm": 3.051273886454387, + "learning_rate": 1.930105756030979e-06, + "loss": 0.6372, + "step": 318 + }, + { + "epoch": 0.14691574796350135, + "grad_norm": 3.357978841575257, + "learning_rate": 1.929556558249189e-06, + "loss": 0.5968, + "step": 319 + }, + { + "epoch": 0.1473762988975562, + "grad_norm": 3.408657581264013, + "learning_rate": 1.9290052899862353e-06, + "loss": 0.6864, + "step": 320 + }, + { + "epoch": 0.14783684983161108, + "grad_norm": 2.965828703883621, + "learning_rate": 1.9284519524700063e-06, + "loss": 0.4714, + "step": 321 + }, + { + "epoch": 0.14829740076566592, + "grad_norm": 2.936230948042832, + "learning_rate": 1.9278965469329976e-06, + "loss": 0.5051, + "step": 322 + }, + { + "epoch": 0.14875795169972078, + "grad_norm": 2.9304142245949727, + "learning_rate": 1.9273390746123115e-06, + "loss": 0.5798, + "step": 323 + }, + { + "epoch": 0.14921850263377565, + "grad_norm": 2.6044770470699885, + "learning_rate": 1.926779536749654e-06, + "loss": 0.4478, + "step": 324 + }, + { + "epoch": 0.1496790535678305, + "grad_norm": 3.346102851664222, + "learning_rate": 1.9262179345913323e-06, + "loss": 0.6081, + "step": 325 + }, + { + "epoch": 0.15013960450188538, + "grad_norm": 3.599494713042751, + "learning_rate": 1.9256542693882503e-06, + "loss": 0.66, + "step": 326 + }, + { + "epoch": 0.15060015543594024, + "grad_norm": 3.3559078451878883, + "learning_rate": 1.925088542395909e-06, + "loss": 0.5762, + "step": 327 + }, + { + "epoch": 0.1510607063699951, + "grad_norm": 3.305256183172175, + "learning_rate": 1.9245207548743994e-06, + "loss": 0.5205, + "step": 328 + }, + { + "epoch": 0.15152125730404997, + "grad_norm": 3.189694287544651, + "learning_rate": 1.9239509080884043e-06, + "loss": 0.5483, + "step": 329 + }, + { + "epoch": 0.15198180823810484, + "grad_norm": 3.8482914338019802, + "learning_rate": 1.923379003307193e-06, + "loss": 0.5602, + "step": 330 + }, + { + "epoch": 0.1524423591721597, + "grad_norm": 2.804743136804608, + "learning_rate": 1.9228050418046165e-06, + "loss": 0.4328, + "step": 331 + }, + { + "epoch": 0.15290291010621457, + "grad_norm": 3.675849703736383, + "learning_rate": 1.92222902485911e-06, + "loss": 0.5611, + "step": 332 + }, + { + "epoch": 0.15336346104026943, + "grad_norm": 3.145151001423423, + "learning_rate": 1.921650953753685e-06, + "loss": 0.44, + "step": 333 + }, + { + "epoch": 0.1538240119743243, + "grad_norm": 3.580270585290245, + "learning_rate": 1.9210708297759284e-06, + "loss": 0.6488, + "step": 334 + }, + { + "epoch": 0.15428456290837914, + "grad_norm": 3.003304167462429, + "learning_rate": 1.9204886542180007e-06, + "loss": 0.5877, + "step": 335 + }, + { + "epoch": 0.154745113842434, + "grad_norm": 2.8636522642130275, + "learning_rate": 1.9199044283766315e-06, + "loss": 0.5932, + "step": 336 + }, + { + "epoch": 0.15520566477648887, + "grad_norm": 3.3796119616806743, + "learning_rate": 1.9193181535531177e-06, + "loss": 0.6361, + "step": 337 + }, + { + "epoch": 0.15566621571054373, + "grad_norm": 3.185129118132095, + "learning_rate": 1.9187298310533184e-06, + "loss": 0.5942, + "step": 338 + }, + { + "epoch": 0.1561267666445986, + "grad_norm": 3.337715563794658, + "learning_rate": 1.9181394621876556e-06, + "loss": 0.6108, + "step": 339 + }, + { + "epoch": 0.15658731757865346, + "grad_norm": 2.8640792419997907, + "learning_rate": 1.917547048271109e-06, + "loss": 0.633, + "step": 340 + }, + { + "epoch": 0.15704786851270833, + "grad_norm": 2.7926946129227943, + "learning_rate": 1.916952590623212e-06, + "loss": 0.4712, + "step": 341 + }, + { + "epoch": 0.1575084194467632, + "grad_norm": 3.4005962235241203, + "learning_rate": 1.9163560905680514e-06, + "loss": 0.5317, + "step": 342 + }, + { + "epoch": 0.15796897038081806, + "grad_norm": 3.1869189227339416, + "learning_rate": 1.9157575494342636e-06, + "loss": 0.5394, + "step": 343 + }, + { + "epoch": 0.15842952131487292, + "grad_norm": 3.4152466358703713, + "learning_rate": 1.91515696855503e-06, + "loss": 0.6375, + "step": 344 + }, + { + "epoch": 0.1588900722489278, + "grad_norm": 3.0926055262112615, + "learning_rate": 1.9145543492680763e-06, + "loss": 0.6227, + "step": 345 + }, + { + "epoch": 0.15935062318298265, + "grad_norm": 3.22498936596375, + "learning_rate": 1.9139496929156683e-06, + "loss": 0.6416, + "step": 346 + }, + { + "epoch": 0.15981117411703752, + "grad_norm": 2.9706888314628466, + "learning_rate": 1.913343000844609e-06, + "loss": 0.5891, + "step": 347 + }, + { + "epoch": 0.16027172505109236, + "grad_norm": 3.19859477403754, + "learning_rate": 1.9127342744062357e-06, + "loss": 0.5765, + "step": 348 + }, + { + "epoch": 0.16073227598514722, + "grad_norm": 3.115476706969415, + "learning_rate": 1.912123514956417e-06, + "loss": 0.5554, + "step": 349 + }, + { + "epoch": 0.1611928269192021, + "grad_norm": 2.961966146622805, + "learning_rate": 1.9115107238555497e-06, + "loss": 0.5382, + "step": 350 + }, + { + "epoch": 0.16165337785325695, + "grad_norm": 3.671673821035005, + "learning_rate": 1.9108959024685566e-06, + "loss": 0.6642, + "step": 351 + }, + { + "epoch": 0.16211392878731182, + "grad_norm": 3.3302938456428106, + "learning_rate": 1.9102790521648817e-06, + "loss": 0.5453, + "step": 352 + }, + { + "epoch": 0.16257447972136668, + "grad_norm": 3.4915603642383752, + "learning_rate": 1.909660174318489e-06, + "loss": 0.5795, + "step": 353 + }, + { + "epoch": 0.16303503065542155, + "grad_norm": 3.211148339907401, + "learning_rate": 1.909039270307858e-06, + "loss": 0.5538, + "step": 354 + }, + { + "epoch": 0.1634955815894764, + "grad_norm": 2.998522836200678, + "learning_rate": 1.9084163415159817e-06, + "loss": 0.6143, + "step": 355 + }, + { + "epoch": 0.16395613252353128, + "grad_norm": 3.088249307549935, + "learning_rate": 1.907791389330363e-06, + "loss": 0.5506, + "step": 356 + }, + { + "epoch": 0.16441668345758614, + "grad_norm": 3.0463921020716604, + "learning_rate": 1.9071644151430108e-06, + "loss": 0.5874, + "step": 357 + }, + { + "epoch": 0.164877234391641, + "grad_norm": 3.345306076824923, + "learning_rate": 1.9065354203504398e-06, + "loss": 0.7046, + "step": 358 + }, + { + "epoch": 0.16533778532569587, + "grad_norm": 3.228814822006655, + "learning_rate": 1.9059044063536633e-06, + "loss": 0.6248, + "step": 359 + }, + { + "epoch": 0.16579833625975074, + "grad_norm": 3.1464755508670637, + "learning_rate": 1.9052713745581931e-06, + "loss": 0.7453, + "step": 360 + }, + { + "epoch": 0.16625888719380558, + "grad_norm": 3.2839840109926697, + "learning_rate": 1.9046363263740358e-06, + "loss": 0.5185, + "step": 361 + }, + { + "epoch": 0.16671943812786044, + "grad_norm": 3.2693787327619046, + "learning_rate": 1.9039992632156881e-06, + "loss": 0.6118, + "step": 362 + }, + { + "epoch": 0.1671799890619153, + "grad_norm": 2.666213429284101, + "learning_rate": 1.9033601865021356e-06, + "loss": 0.6548, + "step": 363 + }, + { + "epoch": 0.16764053999597017, + "grad_norm": 3.814240106647521, + "learning_rate": 1.902719097656849e-06, + "loss": 0.6145, + "step": 364 + }, + { + "epoch": 0.16810109093002504, + "grad_norm": 3.5057882856050515, + "learning_rate": 1.9020759981077804e-06, + "loss": 0.5805, + "step": 365 + }, + { + "epoch": 0.1685616418640799, + "grad_norm": 3.039639804053503, + "learning_rate": 1.9014308892873608e-06, + "loss": 0.5629, + "step": 366 + }, + { + "epoch": 0.16902219279813477, + "grad_norm": 3.492100318826316, + "learning_rate": 1.9007837726324965e-06, + "loss": 0.5946, + "step": 367 + }, + { + "epoch": 0.16948274373218963, + "grad_norm": 3.5705400131693494, + "learning_rate": 1.9001346495845656e-06, + "loss": 0.6548, + "step": 368 + }, + { + "epoch": 0.1699432946662445, + "grad_norm": 4.12268491301927, + "learning_rate": 1.899483521589416e-06, + "loss": 0.7214, + "step": 369 + }, + { + "epoch": 0.17040384560029936, + "grad_norm": 3.4249008502834934, + "learning_rate": 1.8988303900973612e-06, + "loss": 0.5911, + "step": 370 + }, + { + "epoch": 0.17086439653435423, + "grad_norm": 3.656574957105641, + "learning_rate": 1.8981752565631767e-06, + "loss": 0.6093, + "step": 371 + }, + { + "epoch": 0.1713249474684091, + "grad_norm": 3.079763893362344, + "learning_rate": 1.8975181224460974e-06, + "loss": 0.6065, + "step": 372 + }, + { + "epoch": 0.17178549840246396, + "grad_norm": 3.2800007239600197, + "learning_rate": 1.8968589892098153e-06, + "loss": 0.4789, + "step": 373 + }, + { + "epoch": 0.1722460493365188, + "grad_norm": 3.2117733850824313, + "learning_rate": 1.8961978583224743e-06, + "loss": 0.5964, + "step": 374 + }, + { + "epoch": 0.17270660027057366, + "grad_norm": 3.4590302152792876, + "learning_rate": 1.8955347312566675e-06, + "loss": 0.6101, + "step": 375 + }, + { + "epoch": 0.17316715120462853, + "grad_norm": 4.060553328835282, + "learning_rate": 1.8948696094894352e-06, + "loss": 0.5648, + "step": 376 + }, + { + "epoch": 0.1736277021386834, + "grad_norm": 3.567361103730285, + "learning_rate": 1.8942024945022598e-06, + "loss": 0.7206, + "step": 377 + }, + { + "epoch": 0.17408825307273826, + "grad_norm": 3.11756724456868, + "learning_rate": 1.8935333877810646e-06, + "loss": 0.5549, + "step": 378 + }, + { + "epoch": 0.17454880400679312, + "grad_norm": 3.0493230161140357, + "learning_rate": 1.892862290816208e-06, + "loss": 0.5972, + "step": 379 + }, + { + "epoch": 0.175009354940848, + "grad_norm": 3.0362303456982156, + "learning_rate": 1.8921892051024816e-06, + "loss": 0.5893, + "step": 380 + }, + { + "epoch": 0.17546990587490285, + "grad_norm": 3.2964073384518082, + "learning_rate": 1.8915141321391078e-06, + "loss": 0.6622, + "step": 381 + }, + { + "epoch": 0.17593045680895772, + "grad_norm": 3.177252523691639, + "learning_rate": 1.8908370734297338e-06, + "loss": 0.7608, + "step": 382 + }, + { + "epoch": 0.17639100774301258, + "grad_norm": 2.9299509906121077, + "learning_rate": 1.8901580304824311e-06, + "loss": 0.4753, + "step": 383 + }, + { + "epoch": 0.17685155867706745, + "grad_norm": 3.66473331747743, + "learning_rate": 1.8894770048096903e-06, + "loss": 0.551, + "step": 384 + }, + { + "epoch": 0.17731210961112231, + "grad_norm": 2.857955898303891, + "learning_rate": 1.8887939979284182e-06, + "loss": 0.6346, + "step": 385 + }, + { + "epoch": 0.17777266054517718, + "grad_norm": 3.1162479244718666, + "learning_rate": 1.8881090113599352e-06, + "loss": 0.5842, + "step": 386 + }, + { + "epoch": 0.17823321147923202, + "grad_norm": 3.329339301935554, + "learning_rate": 1.88742204662997e-06, + "loss": 0.6312, + "step": 387 + }, + { + "epoch": 0.17869376241328688, + "grad_norm": 3.334334327924235, + "learning_rate": 1.8867331052686583e-06, + "loss": 0.625, + "step": 388 + }, + { + "epoch": 0.17915431334734175, + "grad_norm": 3.2174795958697033, + "learning_rate": 1.886042188810539e-06, + "loss": 0.5223, + "step": 389 + }, + { + "epoch": 0.1796148642813966, + "grad_norm": 3.185576064014635, + "learning_rate": 1.8853492987945487e-06, + "loss": 0.5531, + "step": 390 + }, + { + "epoch": 0.18007541521545148, + "grad_norm": 3.463451373745552, + "learning_rate": 1.8846544367640216e-06, + "loss": 0.6159, + "step": 391 + }, + { + "epoch": 0.18053596614950634, + "grad_norm": 3.2172789875643604, + "learning_rate": 1.8839576042666833e-06, + "loss": 0.545, + "step": 392 + }, + { + "epoch": 0.1809965170835612, + "grad_norm": 3.050074228085725, + "learning_rate": 1.883258802854649e-06, + "loss": 0.6598, + "step": 393 + }, + { + "epoch": 0.18145706801761607, + "grad_norm": 3.3861305570759974, + "learning_rate": 1.8825580340844187e-06, + "loss": 0.6044, + "step": 394 + }, + { + "epoch": 0.18191761895167094, + "grad_norm": 3.4136999849562772, + "learning_rate": 1.8818552995168748e-06, + "loss": 0.6544, + "step": 395 + }, + { + "epoch": 0.1823781698857258, + "grad_norm": 3.3360169966873547, + "learning_rate": 1.8811506007172788e-06, + "loss": 0.6911, + "step": 396 + }, + { + "epoch": 0.18283872081978067, + "grad_norm": 3.0761653456275875, + "learning_rate": 1.8804439392552664e-06, + "loss": 0.5872, + "step": 397 + }, + { + "epoch": 0.18329927175383554, + "grad_norm": 3.466796092903531, + "learning_rate": 1.8797353167048457e-06, + "loss": 0.545, + "step": 398 + }, + { + "epoch": 0.1837598226878904, + "grad_norm": 3.0584194265671636, + "learning_rate": 1.8790247346443927e-06, + "loss": 0.5631, + "step": 399 + }, + { + "epoch": 0.18422037362194527, + "grad_norm": 3.149641541487599, + "learning_rate": 1.8783121946566473e-06, + "loss": 0.6112, + "step": 400 + }, + { + "epoch": 0.1846809245560001, + "grad_norm": 3.555627278211036, + "learning_rate": 1.8775976983287114e-06, + "loss": 0.571, + "step": 401 + }, + { + "epoch": 0.18514147549005497, + "grad_norm": 3.1940216915097412, + "learning_rate": 1.876881247252044e-06, + "loss": 0.6587, + "step": 402 + }, + { + "epoch": 0.18560202642410983, + "grad_norm": 3.488393433022656, + "learning_rate": 1.8761628430224582e-06, + "loss": 0.6844, + "step": 403 + }, + { + "epoch": 0.1860625773581647, + "grad_norm": 3.1438026269472075, + "learning_rate": 1.875442487240117e-06, + "loss": 0.5829, + "step": 404 + }, + { + "epoch": 0.18652312829221956, + "grad_norm": 3.441265497700973, + "learning_rate": 1.8747201815095313e-06, + "loss": 0.5813, + "step": 405 + }, + { + "epoch": 0.18698367922627443, + "grad_norm": 3.125275659245325, + "learning_rate": 1.8739959274395547e-06, + "loss": 0.5402, + "step": 406 + }, + { + "epoch": 0.1874442301603293, + "grad_norm": 3.182859255843592, + "learning_rate": 1.87326972664338e-06, + "loss": 0.5338, + "step": 407 + }, + { + "epoch": 0.18790478109438416, + "grad_norm": 3.3508341677793867, + "learning_rate": 1.8725415807385368e-06, + "loss": 0.596, + "step": 408 + }, + { + "epoch": 0.18836533202843903, + "grad_norm": 3.5167971779519616, + "learning_rate": 1.871811491346887e-06, + "loss": 0.738, + "step": 409 + }, + { + "epoch": 0.1888258829624939, + "grad_norm": 3.2498092239498844, + "learning_rate": 1.8710794600946216e-06, + "loss": 0.5168, + "step": 410 + }, + { + "epoch": 0.18928643389654876, + "grad_norm": 3.260212416401363, + "learning_rate": 1.8703454886122565e-06, + "loss": 0.5984, + "step": 411 + }, + { + "epoch": 0.18974698483060362, + "grad_norm": 3.507545427256196, + "learning_rate": 1.8696095785346295e-06, + "loss": 0.5627, + "step": 412 + }, + { + "epoch": 0.1902075357646585, + "grad_norm": 3.3634449621572466, + "learning_rate": 1.8688717315008962e-06, + "loss": 0.541, + "step": 413 + }, + { + "epoch": 0.19066808669871332, + "grad_norm": 3.2112093920249105, + "learning_rate": 1.8681319491545269e-06, + "loss": 0.6084, + "step": 414 + }, + { + "epoch": 0.1911286376327682, + "grad_norm": 3.5404353894665532, + "learning_rate": 1.8673902331433022e-06, + "loss": 0.5919, + "step": 415 + }, + { + "epoch": 0.19158918856682305, + "grad_norm": 3.104412575312322, + "learning_rate": 1.86664658511931e-06, + "loss": 0.6024, + "step": 416 + }, + { + "epoch": 0.19204973950087792, + "grad_norm": 3.1299398856732945, + "learning_rate": 1.8659010067389414e-06, + "loss": 0.55, + "step": 417 + }, + { + "epoch": 0.19251029043493278, + "grad_norm": 3.289621978009311, + "learning_rate": 1.8651534996628869e-06, + "loss": 0.6006, + "step": 418 + }, + { + "epoch": 0.19297084136898765, + "grad_norm": 3.388227682628728, + "learning_rate": 1.8644040655561334e-06, + "loss": 0.6185, + "step": 419 + }, + { + "epoch": 0.19343139230304252, + "grad_norm": 3.2963874382804432, + "learning_rate": 1.8636527060879601e-06, + "loss": 0.5446, + "step": 420 + }, + { + "epoch": 0.19389194323709738, + "grad_norm": 2.951087777995391, + "learning_rate": 1.8628994229319338e-06, + "loss": 0.5546, + "step": 421 + }, + { + "epoch": 0.19435249417115225, + "grad_norm": 2.970629687314148, + "learning_rate": 1.8621442177659076e-06, + "loss": 0.7266, + "step": 422 + }, + { + "epoch": 0.1948130451052071, + "grad_norm": 3.3425017993739696, + "learning_rate": 1.8613870922720145e-06, + "loss": 0.6919, + "step": 423 + }, + { + "epoch": 0.19527359603926198, + "grad_norm": 3.138471966309496, + "learning_rate": 1.8606280481366649e-06, + "loss": 0.6644, + "step": 424 + }, + { + "epoch": 0.19573414697331684, + "grad_norm": 2.931479609326468, + "learning_rate": 1.8598670870505434e-06, + "loss": 0.5852, + "step": 425 + }, + { + "epoch": 0.1961946979073717, + "grad_norm": 3.1399142652071563, + "learning_rate": 1.8591042107086038e-06, + "loss": 0.6798, + "step": 426 + }, + { + "epoch": 0.19665524884142654, + "grad_norm": 3.31831713584121, + "learning_rate": 1.8583394208100658e-06, + "loss": 0.6821, + "step": 427 + }, + { + "epoch": 0.1971157997754814, + "grad_norm": 3.570916313275249, + "learning_rate": 1.857572719058412e-06, + "loss": 0.6067, + "step": 428 + }, + { + "epoch": 0.19757635070953627, + "grad_norm": 3.3374646506424983, + "learning_rate": 1.8568041071613832e-06, + "loss": 0.5824, + "step": 429 + }, + { + "epoch": 0.19803690164359114, + "grad_norm": 3.45107101342791, + "learning_rate": 1.8560335868309742e-06, + "loss": 0.6396, + "step": 430 + }, + { + "epoch": 0.198497452577646, + "grad_norm": 2.832158757011958, + "learning_rate": 1.8552611597834317e-06, + "loss": 0.4881, + "step": 431 + }, + { + "epoch": 0.19895800351170087, + "grad_norm": 3.526958026283652, + "learning_rate": 1.8544868277392482e-06, + "loss": 0.5955, + "step": 432 + }, + { + "epoch": 0.19941855444575574, + "grad_norm": 3.368802966721466, + "learning_rate": 1.8537105924231601e-06, + "loss": 0.568, + "step": 433 + }, + { + "epoch": 0.1998791053798106, + "grad_norm": 2.9552863120111, + "learning_rate": 1.8529324555641436e-06, + "loss": 0.6433, + "step": 434 + }, + { + "epoch": 0.20033965631386547, + "grad_norm": 2.6981629403620615, + "learning_rate": 1.8521524188954091e-06, + "loss": 0.5556, + "step": 435 + }, + { + "epoch": 0.20080020724792033, + "grad_norm": 2.817878440174924, + "learning_rate": 1.8513704841543995e-06, + "loss": 0.5547, + "step": 436 + }, + { + "epoch": 0.2012607581819752, + "grad_norm": 3.2229669697907553, + "learning_rate": 1.8505866530827855e-06, + "loss": 0.5965, + "step": 437 + }, + { + "epoch": 0.20172130911603006, + "grad_norm": 3.1101407413877538, + "learning_rate": 1.8498009274264605e-06, + "loss": 0.5596, + "step": 438 + }, + { + "epoch": 0.20218186005008493, + "grad_norm": 3.3912236595059975, + "learning_rate": 1.8490133089355398e-06, + "loss": 0.7065, + "step": 439 + }, + { + "epoch": 0.20264241098413976, + "grad_norm": 3.0636194517862965, + "learning_rate": 1.848223799364353e-06, + "loss": 0.6438, + "step": 440 + }, + { + "epoch": 0.20310296191819463, + "grad_norm": 3.0364026600968277, + "learning_rate": 1.8474324004714428e-06, + "loss": 0.6763, + "step": 441 + }, + { + "epoch": 0.2035635128522495, + "grad_norm": 3.401042051162378, + "learning_rate": 1.8466391140195601e-06, + "loss": 0.6028, + "step": 442 + }, + { + "epoch": 0.20402406378630436, + "grad_norm": 3.324755759973617, + "learning_rate": 1.8458439417756594e-06, + "loss": 0.75, + "step": 443 + }, + { + "epoch": 0.20448461472035923, + "grad_norm": 3.452759888109864, + "learning_rate": 1.8450468855108969e-06, + "loss": 0.7129, + "step": 444 + }, + { + "epoch": 0.2049451656544141, + "grad_norm": 3.567128370695001, + "learning_rate": 1.8442479470006239e-06, + "loss": 0.6197, + "step": 445 + }, + { + "epoch": 0.20540571658846896, + "grad_norm": 3.3653450254819237, + "learning_rate": 1.843447128024385e-06, + "loss": 0.663, + "step": 446 + }, + { + "epoch": 0.20586626752252382, + "grad_norm": 3.3553923088948117, + "learning_rate": 1.842644430365913e-06, + "loss": 0.5946, + "step": 447 + }, + { + "epoch": 0.2063268184565787, + "grad_norm": 3.0905391763237806, + "learning_rate": 1.8418398558131257e-06, + "loss": 0.5987, + "step": 448 + }, + { + "epoch": 0.20678736939063355, + "grad_norm": 2.8613755616545875, + "learning_rate": 1.8410334061581206e-06, + "loss": 0.5291, + "step": 449 + }, + { + "epoch": 0.20724792032468842, + "grad_norm": 3.510631784297277, + "learning_rate": 1.8402250831971723e-06, + "loss": 0.5732, + "step": 450 + }, + { + "epoch": 0.20770847125874328, + "grad_norm": 3.5041502693832203, + "learning_rate": 1.8394148887307285e-06, + "loss": 0.585, + "step": 451 + }, + { + "epoch": 0.20816902219279815, + "grad_norm": 3.1445431808415556, + "learning_rate": 1.8386028245634041e-06, + "loss": 0.5904, + "step": 452 + }, + { + "epoch": 0.20862957312685299, + "grad_norm": 3.257773233991117, + "learning_rate": 1.83778889250398e-06, + "loss": 0.6259, + "step": 453 + }, + { + "epoch": 0.20909012406090785, + "grad_norm": 2.7341574429872764, + "learning_rate": 1.836973094365397e-06, + "loss": 0.4938, + "step": 454 + }, + { + "epoch": 0.20955067499496272, + "grad_norm": 3.6461025551874497, + "learning_rate": 1.8361554319647522e-06, + "loss": 0.6574, + "step": 455 + }, + { + "epoch": 0.21001122592901758, + "grad_norm": 3.278582866514977, + "learning_rate": 1.8353359071232951e-06, + "loss": 0.5728, + "step": 456 + }, + { + "epoch": 0.21047177686307245, + "grad_norm": 3.9462317493517025, + "learning_rate": 1.8345145216664242e-06, + "loss": 0.755, + "step": 457 + }, + { + "epoch": 0.2109323277971273, + "grad_norm": 3.481064251792427, + "learning_rate": 1.8336912774236818e-06, + "loss": 0.6502, + "step": 458 + }, + { + "epoch": 0.21139287873118218, + "grad_norm": 3.248277112137531, + "learning_rate": 1.83286617622875e-06, + "loss": 0.5746, + "step": 459 + }, + { + "epoch": 0.21185342966523704, + "grad_norm": 3.3146346341451918, + "learning_rate": 1.8320392199194476e-06, + "loss": 0.605, + "step": 460 + }, + { + "epoch": 0.2123139805992919, + "grad_norm": 3.441495638998992, + "learning_rate": 1.8312104103377262e-06, + "loss": 0.6416, + "step": 461 + }, + { + "epoch": 0.21277453153334677, + "grad_norm": 2.9904476105113065, + "learning_rate": 1.8303797493296637e-06, + "loss": 0.6347, + "step": 462 + }, + { + "epoch": 0.21323508246740164, + "grad_norm": 3.63776364142898, + "learning_rate": 1.8295472387454636e-06, + "loss": 0.4796, + "step": 463 + }, + { + "epoch": 0.2136956334014565, + "grad_norm": 3.0013856019668763, + "learning_rate": 1.8287128804394474e-06, + "loss": 0.6319, + "step": 464 + }, + { + "epoch": 0.21415618433551137, + "grad_norm": 3.0880145601287783, + "learning_rate": 1.8278766762700534e-06, + "loss": 0.5423, + "step": 465 + }, + { + "epoch": 0.2146167352695662, + "grad_norm": 3.133296021110922, + "learning_rate": 1.8270386280998309e-06, + "loss": 0.5982, + "step": 466 + }, + { + "epoch": 0.21507728620362107, + "grad_norm": 3.4704322121668914, + "learning_rate": 1.8261987377954365e-06, + "loss": 0.6409, + "step": 467 + }, + { + "epoch": 0.21553783713767594, + "grad_norm": 3.0683920370476203, + "learning_rate": 1.8253570072276303e-06, + "loss": 0.6111, + "step": 468 + }, + { + "epoch": 0.2159983880717308, + "grad_norm": 3.2983986131430663, + "learning_rate": 1.8245134382712709e-06, + "loss": 0.6993, + "step": 469 + }, + { + "epoch": 0.21645893900578567, + "grad_norm": 3.2695193504256754, + "learning_rate": 1.8236680328053116e-06, + "loss": 0.5836, + "step": 470 + }, + { + "epoch": 0.21691948993984053, + "grad_norm": 3.1255673883544612, + "learning_rate": 1.822820792712797e-06, + "loss": 0.5577, + "step": 471 + }, + { + "epoch": 0.2173800408738954, + "grad_norm": 3.6062462823524735, + "learning_rate": 1.8219717198808578e-06, + "loss": 0.6389, + "step": 472 + }, + { + "epoch": 0.21784059180795026, + "grad_norm": 3.393965634932339, + "learning_rate": 1.8211208162007065e-06, + "loss": 0.5364, + "step": 473 + }, + { + "epoch": 0.21830114274200513, + "grad_norm": 3.244656999193985, + "learning_rate": 1.820268083567634e-06, + "loss": 0.525, + "step": 474 + }, + { + "epoch": 0.21876169367606, + "grad_norm": 3.2505550753334664, + "learning_rate": 1.819413523881005e-06, + "loss": 0.6237, + "step": 475 + }, + { + "epoch": 0.21922224461011486, + "grad_norm": 3.0423831974765894, + "learning_rate": 1.818557139044254e-06, + "loss": 0.5472, + "step": 476 + }, + { + "epoch": 0.21968279554416972, + "grad_norm": 2.9954662337432656, + "learning_rate": 1.8176989309648803e-06, + "loss": 0.5291, + "step": 477 + }, + { + "epoch": 0.2201433464782246, + "grad_norm": 2.918815185530408, + "learning_rate": 1.8168389015544444e-06, + "loss": 0.5379, + "step": 478 + }, + { + "epoch": 0.22060389741227943, + "grad_norm": 3.265440208483845, + "learning_rate": 1.8159770527285634e-06, + "loss": 0.5796, + "step": 479 + }, + { + "epoch": 0.2210644483463343, + "grad_norm": 3.3450099140811806, + "learning_rate": 1.815113386406908e-06, + "loss": 0.5663, + "step": 480 + }, + { + "epoch": 0.22152499928038916, + "grad_norm": 2.8483938675266547, + "learning_rate": 1.8142479045131953e-06, + "loss": 0.6591, + "step": 481 + }, + { + "epoch": 0.22198555021444402, + "grad_norm": 3.1336266713837744, + "learning_rate": 1.8133806089751884e-06, + "loss": 0.4984, + "step": 482 + }, + { + "epoch": 0.2224461011484989, + "grad_norm": 3.080985904456354, + "learning_rate": 1.8125115017246887e-06, + "loss": 0.4414, + "step": 483 + }, + { + "epoch": 0.22290665208255375, + "grad_norm": 3.371599831924596, + "learning_rate": 1.8116405846975335e-06, + "loss": 0.5807, + "step": 484 + }, + { + "epoch": 0.22336720301660862, + "grad_norm": 3.078985566820636, + "learning_rate": 1.8107678598335912e-06, + "loss": 0.6484, + "step": 485 + }, + { + "epoch": 0.22382775395066348, + "grad_norm": 2.9841159659031526, + "learning_rate": 1.8098933290767567e-06, + "loss": 0.569, + "step": 486 + }, + { + "epoch": 0.22428830488471835, + "grad_norm": 3.110058890834877, + "learning_rate": 1.8090169943749474e-06, + "loss": 0.6511, + "step": 487 + }, + { + "epoch": 0.2247488558187732, + "grad_norm": 3.153179145491399, + "learning_rate": 1.808138857680099e-06, + "loss": 0.4959, + "step": 488 + }, + { + "epoch": 0.22520940675282808, + "grad_norm": 3.004599141952458, + "learning_rate": 1.8072589209481607e-06, + "loss": 0.5327, + "step": 489 + }, + { + "epoch": 0.22566995768688294, + "grad_norm": 3.3051175185768273, + "learning_rate": 1.8063771861390915e-06, + "loss": 0.6466, + "step": 490 + }, + { + "epoch": 0.2261305086209378, + "grad_norm": 3.096130029682352, + "learning_rate": 1.8054936552168547e-06, + "loss": 0.4745, + "step": 491 + }, + { + "epoch": 0.22659105955499265, + "grad_norm": 3.756055051745282, + "learning_rate": 1.804608330149415e-06, + "loss": 0.6128, + "step": 492 + }, + { + "epoch": 0.2270516104890475, + "grad_norm": 3.342218840047437, + "learning_rate": 1.8037212129087322e-06, + "loss": 0.561, + "step": 493 + }, + { + "epoch": 0.22751216142310238, + "grad_norm": 3.244488624559804, + "learning_rate": 1.8028323054707592e-06, + "loss": 0.6411, + "step": 494 + }, + { + "epoch": 0.22797271235715724, + "grad_norm": 3.3116315535429055, + "learning_rate": 1.801941609815436e-06, + "loss": 0.6216, + "step": 495 + }, + { + "epoch": 0.2284332632912121, + "grad_norm": 3.3075500193801775, + "learning_rate": 1.8010491279266858e-06, + "loss": 0.6051, + "step": 496 + }, + { + "epoch": 0.22889381422526697, + "grad_norm": 3.0538650883239, + "learning_rate": 1.8001548617924095e-06, + "loss": 0.4452, + "step": 497 + }, + { + "epoch": 0.22935436515932184, + "grad_norm": 3.5424554842795835, + "learning_rate": 1.799258813404483e-06, + "loss": 0.6483, + "step": 498 + }, + { + "epoch": 0.2298149160933767, + "grad_norm": 3.3078686719248593, + "learning_rate": 1.7983609847587521e-06, + "loss": 0.585, + "step": 499 + }, + { + "epoch": 0.23027546702743157, + "grad_norm": 3.3972513361686185, + "learning_rate": 1.7974613778550278e-06, + "loss": 0.6655, + "step": 500 + }, + { + "epoch": 0.23073601796148643, + "grad_norm": 3.465463703248316, + "learning_rate": 1.7965599946970812e-06, + "loss": 0.6034, + "step": 501 + }, + { + "epoch": 0.2311965688955413, + "grad_norm": 3.325451562972118, + "learning_rate": 1.795656837292641e-06, + "loss": 0.6398, + "step": 502 + }, + { + "epoch": 0.23165711982959616, + "grad_norm": 3.5458965409990593, + "learning_rate": 1.7947519076533873e-06, + "loss": 0.5217, + "step": 503 + }, + { + "epoch": 0.23211767076365103, + "grad_norm": 3.3055864844064904, + "learning_rate": 1.793845207794947e-06, + "loss": 0.633, + "step": 504 + }, + { + "epoch": 0.23257822169770587, + "grad_norm": 3.353646102717693, + "learning_rate": 1.7929367397368913e-06, + "loss": 0.5797, + "step": 505 + }, + { + "epoch": 0.23303877263176073, + "grad_norm": 3.254837375875168, + "learning_rate": 1.7920265055027283e-06, + "loss": 0.5433, + "step": 506 + }, + { + "epoch": 0.2334993235658156, + "grad_norm": 3.5539641885768214, + "learning_rate": 1.7911145071199018e-06, + "loss": 0.6674, + "step": 507 + }, + { + "epoch": 0.23395987449987046, + "grad_norm": 3.4968687641517584, + "learning_rate": 1.7902007466197837e-06, + "loss": 0.5679, + "step": 508 + }, + { + "epoch": 0.23442042543392533, + "grad_norm": 3.316075449684041, + "learning_rate": 1.789285226037671e-06, + "loss": 0.6063, + "step": 509 + }, + { + "epoch": 0.2348809763679802, + "grad_norm": 3.4021632966156696, + "learning_rate": 1.788367947412782e-06, + "loss": 0.5539, + "step": 510 + }, + { + "epoch": 0.23534152730203506, + "grad_norm": 3.1412284614498103, + "learning_rate": 1.7874489127882493e-06, + "loss": 0.5244, + "step": 511 + }, + { + "epoch": 0.23580207823608992, + "grad_norm": 3.6411365029508143, + "learning_rate": 1.7865281242111182e-06, + "loss": 0.6874, + "step": 512 + }, + { + "epoch": 0.2362626291701448, + "grad_norm": 2.939080873360146, + "learning_rate": 1.7856055837323406e-06, + "loss": 0.6462, + "step": 513 + }, + { + "epoch": 0.23672318010419965, + "grad_norm": 3.5060804854877334, + "learning_rate": 1.784681293406769e-06, + "loss": 0.6256, + "step": 514 + }, + { + "epoch": 0.23718373103825452, + "grad_norm": 3.454373638780033, + "learning_rate": 1.7837552552931555e-06, + "loss": 0.5419, + "step": 515 + }, + { + "epoch": 0.23764428197230938, + "grad_norm": 2.981684278208563, + "learning_rate": 1.7828274714541443e-06, + "loss": 0.6395, + "step": 516 + }, + { + "epoch": 0.23810483290636425, + "grad_norm": 2.6776702286309413, + "learning_rate": 1.7818979439562677e-06, + "loss": 0.4575, + "step": 517 + }, + { + "epoch": 0.23856538384041912, + "grad_norm": 3.0713454976431342, + "learning_rate": 1.7809666748699424e-06, + "loss": 0.6557, + "step": 518 + }, + { + "epoch": 0.23902593477447395, + "grad_norm": 3.332095530691328, + "learning_rate": 1.7800336662694635e-06, + "loss": 0.5726, + "step": 519 + }, + { + "epoch": 0.23948648570852882, + "grad_norm": 3.3759244307287375, + "learning_rate": 1.7790989202330018e-06, + "loss": 0.5942, + "step": 520 + }, + { + "epoch": 0.23994703664258368, + "grad_norm": 3.3882812703718583, + "learning_rate": 1.7781624388425973e-06, + "loss": 0.6916, + "step": 521 + }, + { + "epoch": 0.24040758757663855, + "grad_norm": 3.4802417994319, + "learning_rate": 1.7772242241841552e-06, + "loss": 0.5549, + "step": 522 + }, + { + "epoch": 0.2408681385106934, + "grad_norm": 2.8321037056813925, + "learning_rate": 1.7762842783474419e-06, + "loss": 0.5675, + "step": 523 + }, + { + "epoch": 0.24132868944474828, + "grad_norm": 3.0286366892013024, + "learning_rate": 1.7753426034260794e-06, + "loss": 0.6203, + "step": 524 + }, + { + "epoch": 0.24178924037880314, + "grad_norm": 3.2252778331423255, + "learning_rate": 1.774399201517541e-06, + "loss": 0.6389, + "step": 525 + }, + { + "epoch": 0.242249791312858, + "grad_norm": 3.0971473717588087, + "learning_rate": 1.7734540747231469e-06, + "loss": 0.6017, + "step": 526 + }, + { + "epoch": 0.24271034224691287, + "grad_norm": 3.4994719951042104, + "learning_rate": 1.772507225148059e-06, + "loss": 0.4603, + "step": 527 + }, + { + "epoch": 0.24317089318096774, + "grad_norm": 2.713673764054212, + "learning_rate": 1.7715586549012768e-06, + "loss": 0.5765, + "step": 528 + }, + { + "epoch": 0.2436314441150226, + "grad_norm": 3.221284406276941, + "learning_rate": 1.7706083660956327e-06, + "loss": 0.6266, + "step": 529 + }, + { + "epoch": 0.24409199504907747, + "grad_norm": 3.306672434466321, + "learning_rate": 1.7696563608477862e-06, + "loss": 0.6451, + "step": 530 + }, + { + "epoch": 0.24455254598313234, + "grad_norm": 3.065434289834053, + "learning_rate": 1.76870264127822e-06, + "loss": 0.6552, + "step": 531 + }, + { + "epoch": 0.24501309691718717, + "grad_norm": 3.356746881862158, + "learning_rate": 1.7677472095112363e-06, + "loss": 0.4904, + "step": 532 + }, + { + "epoch": 0.24547364785124204, + "grad_norm": 3.253784280023295, + "learning_rate": 1.7667900676749498e-06, + "loss": 0.5361, + "step": 533 + }, + { + "epoch": 0.2459341987852969, + "grad_norm": 3.0192332980381344, + "learning_rate": 1.7658312179012854e-06, + "loss": 0.5099, + "step": 534 + }, + { + "epoch": 0.24639474971935177, + "grad_norm": 3.0809296396824384, + "learning_rate": 1.7648706623259706e-06, + "loss": 0.6658, + "step": 535 + }, + { + "epoch": 0.24685530065340663, + "grad_norm": 3.6830695514949077, + "learning_rate": 1.7639084030885338e-06, + "loss": 0.5973, + "step": 536 + }, + { + "epoch": 0.2473158515874615, + "grad_norm": 3.330551105763686, + "learning_rate": 1.7629444423322982e-06, + "loss": 0.7364, + "step": 537 + }, + { + "epoch": 0.24777640252151636, + "grad_norm": 2.9911270084180366, + "learning_rate": 1.7619787822043754e-06, + "loss": 0.462, + "step": 538 + }, + { + "epoch": 0.24823695345557123, + "grad_norm": 3.149958771441102, + "learning_rate": 1.7610114248556639e-06, + "loss": 0.611, + "step": 539 + }, + { + "epoch": 0.2486975043896261, + "grad_norm": 3.30756958264156, + "learning_rate": 1.7600423724408415e-06, + "loss": 0.5285, + "step": 540 + }, + { + "epoch": 0.24915805532368096, + "grad_norm": 3.3029379864351, + "learning_rate": 1.759071627118362e-06, + "loss": 0.5643, + "step": 541 + }, + { + "epoch": 0.24961860625773583, + "grad_norm": 3.454862601842869, + "learning_rate": 1.7580991910504494e-06, + "loss": 0.6229, + "step": 542 + }, + { + "epoch": 0.2500791571917907, + "grad_norm": 3.2773777585364616, + "learning_rate": 1.7571250664030947e-06, + "loss": 0.6066, + "step": 543 + }, + { + "epoch": 0.25053970812584553, + "grad_norm": 2.955729624984104, + "learning_rate": 1.7561492553460488e-06, + "loss": 0.5717, + "step": 544 + }, + { + "epoch": 0.2510002590599004, + "grad_norm": 3.103695274249137, + "learning_rate": 1.7551717600528203e-06, + "loss": 0.5664, + "step": 545 + }, + { + "epoch": 0.25146080999395526, + "grad_norm": 3.1965303115665034, + "learning_rate": 1.7541925827006678e-06, + "loss": 0.5587, + "step": 546 + }, + { + "epoch": 0.25192136092801015, + "grad_norm": 2.9103203193249545, + "learning_rate": 1.7532117254705972e-06, + "loss": 0.6035, + "step": 547 + }, + { + "epoch": 0.252381911862065, + "grad_norm": 2.9173381955892976, + "learning_rate": 1.7522291905473558e-06, + "loss": 0.4366, + "step": 548 + }, + { + "epoch": 0.2528424627961199, + "grad_norm": 3.222972546291656, + "learning_rate": 1.7512449801194286e-06, + "loss": 0.5781, + "step": 549 + }, + { + "epoch": 0.2533030137301747, + "grad_norm": 3.1103988583748663, + "learning_rate": 1.7502590963790316e-06, + "loss": 0.5862, + "step": 550 + }, + { + "epoch": 0.25376356466422956, + "grad_norm": 3.4396946239347908, + "learning_rate": 1.7492715415221087e-06, + "loss": 0.6496, + "step": 551 + }, + { + "epoch": 0.25422411559828445, + "grad_norm": 2.9802260714638398, + "learning_rate": 1.7482823177483252e-06, + "loss": 0.6773, + "step": 552 + }, + { + "epoch": 0.2546846665323393, + "grad_norm": 3.082234293267444, + "learning_rate": 1.7472914272610646e-06, + "loss": 0.5246, + "step": 553 + }, + { + "epoch": 0.2551452174663942, + "grad_norm": 3.3567221598735992, + "learning_rate": 1.7462988722674221e-06, + "loss": 0.6232, + "step": 554 + }, + { + "epoch": 0.255605768400449, + "grad_norm": 3.2626603235826264, + "learning_rate": 1.7453046549782009e-06, + "loss": 0.5204, + "step": 555 + }, + { + "epoch": 0.2560663193345039, + "grad_norm": 3.2339294619977434, + "learning_rate": 1.7443087776079064e-06, + "loss": 0.5647, + "step": 556 + }, + { + "epoch": 0.25652687026855875, + "grad_norm": 3.3723328031579722, + "learning_rate": 1.7433112423747418e-06, + "loss": 0.6539, + "step": 557 + }, + { + "epoch": 0.25698742120261364, + "grad_norm": 3.184022192143854, + "learning_rate": 1.742312051500603e-06, + "loss": 0.6068, + "step": 558 + }, + { + "epoch": 0.2574479721366685, + "grad_norm": 3.5355428533513695, + "learning_rate": 1.741311207211074e-06, + "loss": 0.6395, + "step": 559 + }, + { + "epoch": 0.25790852307072337, + "grad_norm": 3.303032162182857, + "learning_rate": 1.740308711735421e-06, + "loss": 0.5797, + "step": 560 + }, + { + "epoch": 0.2583690740047782, + "grad_norm": 3.2151549157821235, + "learning_rate": 1.7393045673065878e-06, + "loss": 0.5189, + "step": 561 + }, + { + "epoch": 0.2588296249388331, + "grad_norm": 3.2889581937560513, + "learning_rate": 1.7382987761611924e-06, + "loss": 0.5853, + "step": 562 + }, + { + "epoch": 0.25929017587288794, + "grad_norm": 3.589638793664337, + "learning_rate": 1.7372913405395191e-06, + "loss": 0.591, + "step": 563 + }, + { + "epoch": 0.2597507268069428, + "grad_norm": 3.1711968406048734, + "learning_rate": 1.7362822626855165e-06, + "loss": 0.5466, + "step": 564 + }, + { + "epoch": 0.26021127774099767, + "grad_norm": 2.816516792416395, + "learning_rate": 1.7352715448467895e-06, + "loss": 0.5831, + "step": 565 + }, + { + "epoch": 0.2606718286750525, + "grad_norm": 3.154750601104403, + "learning_rate": 1.7342591892745974e-06, + "loss": 0.6726, + "step": 566 + }, + { + "epoch": 0.2611323796091074, + "grad_norm": 3.3840134771829637, + "learning_rate": 1.7332451982238465e-06, + "loss": 0.5483, + "step": 567 + }, + { + "epoch": 0.26159293054316224, + "grad_norm": 3.1406500671773645, + "learning_rate": 1.732229573953086e-06, + "loss": 0.6034, + "step": 568 + }, + { + "epoch": 0.26205348147721713, + "grad_norm": 2.862334788278051, + "learning_rate": 1.7312123187245037e-06, + "loss": 0.6242, + "step": 569 + }, + { + "epoch": 0.26251403241127197, + "grad_norm": 2.8647709222697095, + "learning_rate": 1.7301934348039188e-06, + "loss": 0.5623, + "step": 570 + }, + { + "epoch": 0.26297458334532686, + "grad_norm": 2.8749482072040458, + "learning_rate": 1.7291729244607795e-06, + "loss": 0.568, + "step": 571 + }, + { + "epoch": 0.2634351342793817, + "grad_norm": 3.1262185261082935, + "learning_rate": 1.7281507899681556e-06, + "loss": 0.5939, + "step": 572 + }, + { + "epoch": 0.2638956852134366, + "grad_norm": 3.2312398719111606, + "learning_rate": 1.7271270336027358e-06, + "loss": 0.5729, + "step": 573 + }, + { + "epoch": 0.26435623614749143, + "grad_norm": 3.035067327482627, + "learning_rate": 1.7261016576448198e-06, + "loss": 0.5952, + "step": 574 + }, + { + "epoch": 0.2648167870815463, + "grad_norm": 3.386614804112364, + "learning_rate": 1.7250746643783158e-06, + "loss": 0.7062, + "step": 575 + }, + { + "epoch": 0.26527733801560116, + "grad_norm": 3.3687551890454315, + "learning_rate": 1.7240460560907343e-06, + "loss": 0.4614, + "step": 576 + }, + { + "epoch": 0.26573788894965605, + "grad_norm": 2.9333951618662675, + "learning_rate": 1.7230158350731832e-06, + "loss": 0.5462, + "step": 577 + }, + { + "epoch": 0.2661984398837109, + "grad_norm": 3.2883977513315714, + "learning_rate": 1.7219840036203613e-06, + "loss": 0.668, + "step": 578 + }, + { + "epoch": 0.26665899081776573, + "grad_norm": 3.319840681654177, + "learning_rate": 1.7209505640305562e-06, + "loss": 0.5502, + "step": 579 + }, + { + "epoch": 0.2671195417518206, + "grad_norm": 3.344576294088726, + "learning_rate": 1.7199155186056364e-06, + "loss": 0.7453, + "step": 580 + }, + { + "epoch": 0.26758009268587546, + "grad_norm": 3.3507561207198266, + "learning_rate": 1.7188788696510474e-06, + "loss": 0.5591, + "step": 581 + }, + { + "epoch": 0.26804064361993035, + "grad_norm": 3.6763295819772, + "learning_rate": 1.7178406194758067e-06, + "loss": 0.5894, + "step": 582 + }, + { + "epoch": 0.2685011945539852, + "grad_norm": 2.7982715860658836, + "learning_rate": 1.7168007703924978e-06, + "loss": 0.6436, + "step": 583 + }, + { + "epoch": 0.2689617454880401, + "grad_norm": 3.1119980909697724, + "learning_rate": 1.7157593247172664e-06, + "loss": 0.557, + "step": 584 + }, + { + "epoch": 0.2694222964220949, + "grad_norm": 3.69421493980708, + "learning_rate": 1.714716284769813e-06, + "loss": 0.6445, + "step": 585 + }, + { + "epoch": 0.2698828473561498, + "grad_norm": 3.786080820523942, + "learning_rate": 1.7136716528733912e-06, + "loss": 0.417, + "step": 586 + }, + { + "epoch": 0.27034339829020465, + "grad_norm": 4.270736109571739, + "learning_rate": 1.7126254313547985e-06, + "loss": 0.6734, + "step": 587 + }, + { + "epoch": 0.27080394922425954, + "grad_norm": 3.600772785847931, + "learning_rate": 1.7115776225443739e-06, + "loss": 0.8145, + "step": 588 + }, + { + "epoch": 0.2712645001583144, + "grad_norm": 2.8943620464394293, + "learning_rate": 1.7105282287759926e-06, + "loss": 0.528, + "step": 589 + }, + { + "epoch": 0.2717250510923693, + "grad_norm": 3.0063394751214094, + "learning_rate": 1.7094772523870589e-06, + "loss": 0.5868, + "step": 590 + }, + { + "epoch": 0.2721856020264241, + "grad_norm": 3.3735726594180613, + "learning_rate": 1.7084246957185032e-06, + "loss": 0.6339, + "step": 591 + }, + { + "epoch": 0.27264615296047895, + "grad_norm": 3.3141049773629283, + "learning_rate": 1.707370561114775e-06, + "loss": 0.6053, + "step": 592 + }, + { + "epoch": 0.27310670389453384, + "grad_norm": 2.80960222995646, + "learning_rate": 1.7063148509238393e-06, + "loss": 0.4687, + "step": 593 + }, + { + "epoch": 0.2735672548285887, + "grad_norm": 3.313745439020705, + "learning_rate": 1.70525756749717e-06, + "loss": 0.6093, + "step": 594 + }, + { + "epoch": 0.2740278057626436, + "grad_norm": 3.7937269183858318, + "learning_rate": 1.7041987131897445e-06, + "loss": 0.5746, + "step": 595 + }, + { + "epoch": 0.2744883566966984, + "grad_norm": 2.9125080429436365, + "learning_rate": 1.703138290360041e-06, + "loss": 0.525, + "step": 596 + }, + { + "epoch": 0.2749489076307533, + "grad_norm": 3.2588529493155285, + "learning_rate": 1.7020763013700296e-06, + "loss": 0.5566, + "step": 597 + }, + { + "epoch": 0.27540945856480814, + "grad_norm": 3.0202214315810396, + "learning_rate": 1.70101274858517e-06, + "loss": 0.5711, + "step": 598 + }, + { + "epoch": 0.27587000949886303, + "grad_norm": 3.3568365962069664, + "learning_rate": 1.6999476343744047e-06, + "loss": 0.5359, + "step": 599 + }, + { + "epoch": 0.27633056043291787, + "grad_norm": 3.0103484901671638, + "learning_rate": 1.6988809611101535e-06, + "loss": 0.6112, + "step": 600 + }, + { + "epoch": 0.27679111136697276, + "grad_norm": 3.4029649629281673, + "learning_rate": 1.69781273116831e-06, + "loss": 0.5336, + "step": 601 + }, + { + "epoch": 0.2772516623010276, + "grad_norm": 2.8767133982019377, + "learning_rate": 1.6967429469282345e-06, + "loss": 0.6359, + "step": 602 + }, + { + "epoch": 0.2777122132350825, + "grad_norm": 3.2835587382748024, + "learning_rate": 1.695671610772749e-06, + "loss": 0.5618, + "step": 603 + }, + { + "epoch": 0.27817276416913733, + "grad_norm": 3.5991470633371283, + "learning_rate": 1.694598725088133e-06, + "loss": 0.5286, + "step": 604 + }, + { + "epoch": 0.27863331510319217, + "grad_norm": 3.1120545309975958, + "learning_rate": 1.6935242922641159e-06, + "loss": 0.6178, + "step": 605 + }, + { + "epoch": 0.27909386603724706, + "grad_norm": 3.1244820282575674, + "learning_rate": 1.6924483146938754e-06, + "loss": 0.5822, + "step": 606 + }, + { + "epoch": 0.2795544169713019, + "grad_norm": 2.714150777824839, + "learning_rate": 1.6913707947740284e-06, + "loss": 0.5298, + "step": 607 + }, + { + "epoch": 0.2800149679053568, + "grad_norm": 3.3837453901982486, + "learning_rate": 1.690291734904627e-06, + "loss": 0.5653, + "step": 608 + }, + { + "epoch": 0.28047551883941163, + "grad_norm": 3.1917062613137785, + "learning_rate": 1.6892111374891547e-06, + "loss": 0.5825, + "step": 609 + }, + { + "epoch": 0.2809360697734665, + "grad_norm": 3.3940845730675226, + "learning_rate": 1.6881290049345185e-06, + "loss": 0.5283, + "step": 610 + }, + { + "epoch": 0.28139662070752136, + "grad_norm": 3.3197149723223127, + "learning_rate": 1.6870453396510453e-06, + "loss": 0.5643, + "step": 611 + }, + { + "epoch": 0.28185717164157625, + "grad_norm": 3.368272451697648, + "learning_rate": 1.6859601440524757e-06, + "loss": 0.5448, + "step": 612 + }, + { + "epoch": 0.2823177225756311, + "grad_norm": 3.66743262346132, + "learning_rate": 1.6848734205559593e-06, + "loss": 0.5931, + "step": 613 + }, + { + "epoch": 0.282778273509686, + "grad_norm": 3.399278333660865, + "learning_rate": 1.6837851715820488e-06, + "loss": 0.6151, + "step": 614 + }, + { + "epoch": 0.2832388244437408, + "grad_norm": 3.57422302010664, + "learning_rate": 1.6826953995546945e-06, + "loss": 0.5642, + "step": 615 + }, + { + "epoch": 0.2836993753777957, + "grad_norm": 3.0447865419845286, + "learning_rate": 1.6816041069012388e-06, + "loss": 0.5781, + "step": 616 + }, + { + "epoch": 0.28415992631185055, + "grad_norm": 3.072556146750414, + "learning_rate": 1.680511296052412e-06, + "loss": 0.5839, + "step": 617 + }, + { + "epoch": 0.2846204772459054, + "grad_norm": 3.7455049180528484, + "learning_rate": 1.6794169694423257e-06, + "loss": 0.6444, + "step": 618 + }, + { + "epoch": 0.2850810281799603, + "grad_norm": 3.0929934631543574, + "learning_rate": 1.6783211295084669e-06, + "loss": 0.6054, + "step": 619 + }, + { + "epoch": 0.2855415791140151, + "grad_norm": 2.7963129880952415, + "learning_rate": 1.677223778691695e-06, + "loss": 0.586, + "step": 620 + }, + { + "epoch": 0.28600213004807, + "grad_norm": 3.0409334266150263, + "learning_rate": 1.6761249194362328e-06, + "loss": 0.6534, + "step": 621 + }, + { + "epoch": 0.28646268098212485, + "grad_norm": 3.541465069260458, + "learning_rate": 1.6750245541896644e-06, + "loss": 0.7103, + "step": 622 + }, + { + "epoch": 0.28692323191617974, + "grad_norm": 3.2927125901227012, + "learning_rate": 1.6739226854029276e-06, + "loss": 0.493, + "step": 623 + }, + { + "epoch": 0.2873837828502346, + "grad_norm": 2.761658471122107, + "learning_rate": 1.6728193155303097e-06, + "loss": 0.537, + "step": 624 + }, + { + "epoch": 0.2878443337842895, + "grad_norm": 3.362612929819794, + "learning_rate": 1.6717144470294406e-06, + "loss": 0.6486, + "step": 625 + }, + { + "epoch": 0.2883048847183443, + "grad_norm": 3.2064609899615877, + "learning_rate": 1.6706080823612894e-06, + "loss": 0.5923, + "step": 626 + }, + { + "epoch": 0.2887654356523992, + "grad_norm": 2.9733446787329227, + "learning_rate": 1.6695002239901569e-06, + "loss": 0.6514, + "step": 627 + }, + { + "epoch": 0.28922598658645404, + "grad_norm": 3.272788188514796, + "learning_rate": 1.6683908743836711e-06, + "loss": 0.599, + "step": 628 + }, + { + "epoch": 0.28968653752050894, + "grad_norm": 3.073673873179751, + "learning_rate": 1.6672800360127823e-06, + "loss": 0.5548, + "step": 629 + }, + { + "epoch": 0.2901470884545638, + "grad_norm": 3.1359157628557353, + "learning_rate": 1.6661677113517553e-06, + "loss": 0.6053, + "step": 630 + }, + { + "epoch": 0.2906076393886186, + "grad_norm": 3.2988300051982056, + "learning_rate": 1.6650539028781667e-06, + "loss": 0.7002, + "step": 631 + }, + { + "epoch": 0.2910681903226735, + "grad_norm": 3.025853139733737, + "learning_rate": 1.663938613072898e-06, + "loss": 0.5597, + "step": 632 + }, + { + "epoch": 0.29152874125672834, + "grad_norm": 2.797667473613527, + "learning_rate": 1.6628218444201299e-06, + "loss": 0.579, + "step": 633 + }, + { + "epoch": 0.29198929219078323, + "grad_norm": 2.482842591113322, + "learning_rate": 1.6617035994073372e-06, + "loss": 0.4768, + "step": 634 + }, + { + "epoch": 0.29244984312483807, + "grad_norm": 3.2875064889505423, + "learning_rate": 1.6605838805252828e-06, + "loss": 0.6799, + "step": 635 + }, + { + "epoch": 0.29291039405889296, + "grad_norm": 3.160691937520528, + "learning_rate": 1.6594626902680126e-06, + "loss": 0.4986, + "step": 636 + }, + { + "epoch": 0.2933709449929478, + "grad_norm": 3.2855289808671384, + "learning_rate": 1.6583400311328505e-06, + "loss": 0.5377, + "step": 637 + }, + { + "epoch": 0.2938314959270027, + "grad_norm": 2.9638072454998725, + "learning_rate": 1.6572159056203915e-06, + "loss": 0.6512, + "step": 638 + }, + { + "epoch": 0.29429204686105753, + "grad_norm": 3.371604241327082, + "learning_rate": 1.6560903162344966e-06, + "loss": 0.5783, + "step": 639 + }, + { + "epoch": 0.2947525977951124, + "grad_norm": 3.628939173234319, + "learning_rate": 1.6549632654822875e-06, + "loss": 0.6427, + "step": 640 + }, + { + "epoch": 0.29521314872916726, + "grad_norm": 2.945921755545379, + "learning_rate": 1.6538347558741422e-06, + "loss": 0.4827, + "step": 641 + }, + { + "epoch": 0.29567369966322216, + "grad_norm": 2.9596989147777513, + "learning_rate": 1.652704789923686e-06, + "loss": 0.7239, + "step": 642 + }, + { + "epoch": 0.296134250597277, + "grad_norm": 3.597968564106779, + "learning_rate": 1.6515733701477896e-06, + "loss": 0.666, + "step": 643 + }, + { + "epoch": 0.29659480153133183, + "grad_norm": 3.1426064389176087, + "learning_rate": 1.6504404990665615e-06, + "loss": 0.5744, + "step": 644 + }, + { + "epoch": 0.2970553524653867, + "grad_norm": 3.120168720987635, + "learning_rate": 1.6493061792033424e-06, + "loss": 0.5737, + "step": 645 + }, + { + "epoch": 0.29751590339944156, + "grad_norm": 2.7520653399387904, + "learning_rate": 1.648170413084701e-06, + "loss": 0.6586, + "step": 646 + }, + { + "epoch": 0.29797645433349645, + "grad_norm": 2.7598009601994575, + "learning_rate": 1.6470332032404258e-06, + "loss": 0.5887, + "step": 647 + }, + { + "epoch": 0.2984370052675513, + "grad_norm": 2.9219392147630456, + "learning_rate": 1.6458945522035227e-06, + "loss": 0.4776, + "step": 648 + }, + { + "epoch": 0.2988975562016062, + "grad_norm": 2.918413331344107, + "learning_rate": 1.6447544625102068e-06, + "loss": 0.5586, + "step": 649 + }, + { + "epoch": 0.299358107135661, + "grad_norm": 3.0539033522988275, + "learning_rate": 1.6436129366998973e-06, + "loss": 0.6355, + "step": 650 + }, + { + "epoch": 0.2998186580697159, + "grad_norm": 3.0936916830976475, + "learning_rate": 1.6424699773152138e-06, + "loss": 0.6304, + "step": 651 + }, + { + "epoch": 0.30027920900377075, + "grad_norm": 3.3790034098133384, + "learning_rate": 1.6413255869019666e-06, + "loss": 0.6053, + "step": 652 + }, + { + "epoch": 0.30073975993782565, + "grad_norm": 3.244406880170614, + "learning_rate": 1.640179768009156e-06, + "loss": 0.5883, + "step": 653 + }, + { + "epoch": 0.3012003108718805, + "grad_norm": 3.145131450305741, + "learning_rate": 1.6390325231889616e-06, + "loss": 0.4706, + "step": 654 + }, + { + "epoch": 0.3016608618059354, + "grad_norm": 2.7110601957589404, + "learning_rate": 1.6378838549967415e-06, + "loss": 0.6071, + "step": 655 + }, + { + "epoch": 0.3021214127399902, + "grad_norm": 3.343566724286675, + "learning_rate": 1.6367337659910221e-06, + "loss": 0.6244, + "step": 656 + }, + { + "epoch": 0.30258196367404505, + "grad_norm": 3.209028090230382, + "learning_rate": 1.6355822587334959e-06, + "loss": 0.543, + "step": 657 + }, + { + "epoch": 0.30304251460809994, + "grad_norm": 3.5185896956330422, + "learning_rate": 1.6344293357890137e-06, + "loss": 0.5821, + "step": 658 + }, + { + "epoch": 0.3035030655421548, + "grad_norm": 2.8601115189735604, + "learning_rate": 1.6332749997255804e-06, + "loss": 0.6433, + "step": 659 + }, + { + "epoch": 0.3039636164762097, + "grad_norm": 3.476073663868734, + "learning_rate": 1.632119253114347e-06, + "loss": 0.4601, + "step": 660 + }, + { + "epoch": 0.3044241674102645, + "grad_norm": 3.2412899564729005, + "learning_rate": 1.6309620985296072e-06, + "loss": 0.7235, + "step": 661 + }, + { + "epoch": 0.3048847183443194, + "grad_norm": 2.713918449445168, + "learning_rate": 1.6298035385487918e-06, + "loss": 0.6167, + "step": 662 + }, + { + "epoch": 0.30534526927837424, + "grad_norm": 3.4033868200489623, + "learning_rate": 1.6286435757524602e-06, + "loss": 0.5718, + "step": 663 + }, + { + "epoch": 0.30580582021242914, + "grad_norm": 3.0202609458048038, + "learning_rate": 1.6274822127242974e-06, + "loss": 0.4536, + "step": 664 + }, + { + "epoch": 0.306266371146484, + "grad_norm": 3.001878947353794, + "learning_rate": 1.6263194520511064e-06, + "loss": 0.57, + "step": 665 + }, + { + "epoch": 0.30672692208053887, + "grad_norm": 2.998809917811107, + "learning_rate": 1.6251552963228048e-06, + "loss": 0.5362, + "step": 666 + }, + { + "epoch": 0.3071874730145937, + "grad_norm": 2.977451183975403, + "learning_rate": 1.6239897481324164e-06, + "loss": 0.4483, + "step": 667 + }, + { + "epoch": 0.3076480239486486, + "grad_norm": 3.0423908095090733, + "learning_rate": 1.6228228100760664e-06, + "loss": 0.5557, + "step": 668 + }, + { + "epoch": 0.30810857488270343, + "grad_norm": 3.043152280285196, + "learning_rate": 1.6216544847529764e-06, + "loss": 0.6267, + "step": 669 + }, + { + "epoch": 0.30856912581675827, + "grad_norm": 2.742916110544047, + "learning_rate": 1.620484774765458e-06, + "loss": 0.5141, + "step": 670 + }, + { + "epoch": 0.30902967675081316, + "grad_norm": 3.535942753468968, + "learning_rate": 1.6193136827189065e-06, + "loss": 0.7185, + "step": 671 + }, + { + "epoch": 0.309490227684868, + "grad_norm": 3.4294982638238163, + "learning_rate": 1.6181412112217957e-06, + "loss": 0.6178, + "step": 672 + }, + { + "epoch": 0.3099507786189229, + "grad_norm": 3.3208662012013104, + "learning_rate": 1.6169673628856722e-06, + "loss": 0.5777, + "step": 673 + }, + { + "epoch": 0.31041132955297773, + "grad_norm": 3.6764674298301214, + "learning_rate": 1.6157921403251492e-06, + "loss": 0.5752, + "step": 674 + }, + { + "epoch": 0.3108718804870326, + "grad_norm": 3.252440774762675, + "learning_rate": 1.6146155461579007e-06, + "loss": 0.7095, + "step": 675 + }, + { + "epoch": 0.31133243142108746, + "grad_norm": 3.2165238232059084, + "learning_rate": 1.6134375830046563e-06, + "loss": 0.6169, + "step": 676 + }, + { + "epoch": 0.31179298235514236, + "grad_norm": 3.1854273045665287, + "learning_rate": 1.6122582534891942e-06, + "loss": 0.6611, + "step": 677 + }, + { + "epoch": 0.3122535332891972, + "grad_norm": 3.4248301426471297, + "learning_rate": 1.6110775602383365e-06, + "loss": 0.6427, + "step": 678 + }, + { + "epoch": 0.3127140842232521, + "grad_norm": 3.113032011879882, + "learning_rate": 1.6098955058819423e-06, + "loss": 0.5213, + "step": 679 + }, + { + "epoch": 0.3131746351573069, + "grad_norm": 3.3490627264511175, + "learning_rate": 1.6087120930529036e-06, + "loss": 0.5633, + "step": 680 + }, + { + "epoch": 0.3136351860913618, + "grad_norm": 3.2500116220033375, + "learning_rate": 1.6075273243871367e-06, + "loss": 0.5387, + "step": 681 + }, + { + "epoch": 0.31409573702541665, + "grad_norm": 3.6728777135144837, + "learning_rate": 1.606341202523579e-06, + "loss": 0.5113, + "step": 682 + }, + { + "epoch": 0.3145562879594715, + "grad_norm": 2.683705848328992, + "learning_rate": 1.6051537301041812e-06, + "loss": 0.5355, + "step": 683 + }, + { + "epoch": 0.3150168388935264, + "grad_norm": 3.4297858809617363, + "learning_rate": 1.6039649097739032e-06, + "loss": 0.6449, + "step": 684 + }, + { + "epoch": 0.3154773898275812, + "grad_norm": 3.292570386362054, + "learning_rate": 1.602774744180706e-06, + "loss": 0.6552, + "step": 685 + }, + { + "epoch": 0.3159379407616361, + "grad_norm": 3.123629844208179, + "learning_rate": 1.601583235975548e-06, + "loss": 0.4843, + "step": 686 + }, + { + "epoch": 0.31639849169569095, + "grad_norm": 3.1402599959393473, + "learning_rate": 1.6003903878123782e-06, + "loss": 0.5134, + "step": 687 + }, + { + "epoch": 0.31685904262974585, + "grad_norm": 3.3270904677521873, + "learning_rate": 1.599196202348129e-06, + "loss": 0.5501, + "step": 688 + }, + { + "epoch": 0.3173195935638007, + "grad_norm": 3.5015088481238577, + "learning_rate": 1.5980006822427123e-06, + "loss": 0.5634, + "step": 689 + }, + { + "epoch": 0.3177801444978556, + "grad_norm": 3.08180457972679, + "learning_rate": 1.5968038301590133e-06, + "loss": 0.6374, + "step": 690 + }, + { + "epoch": 0.3182406954319104, + "grad_norm": 3.2445020121060715, + "learning_rate": 1.5956056487628827e-06, + "loss": 0.6407, + "step": 691 + }, + { + "epoch": 0.3187012463659653, + "grad_norm": 3.233197428352453, + "learning_rate": 1.5944061407231338e-06, + "loss": 0.706, + "step": 692 + }, + { + "epoch": 0.31916179730002014, + "grad_norm": 3.280658309577868, + "learning_rate": 1.593205308711533e-06, + "loss": 0.5266, + "step": 693 + }, + { + "epoch": 0.31962234823407504, + "grad_norm": 3.5140387469250736, + "learning_rate": 1.5920031554027969e-06, + "loss": 0.5574, + "step": 694 + }, + { + "epoch": 0.3200828991681299, + "grad_norm": 3.42036786677391, + "learning_rate": 1.590799683474585e-06, + "loss": 0.6599, + "step": 695 + }, + { + "epoch": 0.3205434501021847, + "grad_norm": 3.1846836901173075, + "learning_rate": 1.5895948956074933e-06, + "loss": 0.668, + "step": 696 + }, + { + "epoch": 0.3210040010362396, + "grad_norm": 3.3566033229272967, + "learning_rate": 1.5883887944850495e-06, + "loss": 0.6669, + "step": 697 + }, + { + "epoch": 0.32146455197029444, + "grad_norm": 3.2045770323594382, + "learning_rate": 1.5871813827937063e-06, + "loss": 0.5538, + "step": 698 + }, + { + "epoch": 0.32192510290434934, + "grad_norm": 2.9902993069464565, + "learning_rate": 1.5859726632228357e-06, + "loss": 0.5938, + "step": 699 + }, + { + "epoch": 0.3223856538384042, + "grad_norm": 3.0057175558399485, + "learning_rate": 1.5847626384647221e-06, + "loss": 0.5181, + "step": 700 + }, + { + "epoch": 0.32284620477245907, + "grad_norm": 3.5222478856728254, + "learning_rate": 1.583551311214558e-06, + "loss": 0.5791, + "step": 701 + }, + { + "epoch": 0.3233067557065139, + "grad_norm": 3.998127982109179, + "learning_rate": 1.5823386841704362e-06, + "loss": 0.4773, + "step": 702 + }, + { + "epoch": 0.3237673066405688, + "grad_norm": 2.995949094547311, + "learning_rate": 1.5811247600333456e-06, + "loss": 0.5852, + "step": 703 + }, + { + "epoch": 0.32422785757462363, + "grad_norm": 2.9705371769732425, + "learning_rate": 1.5799095415071628e-06, + "loss": 0.5957, + "step": 704 + }, + { + "epoch": 0.3246884085086785, + "grad_norm": 2.6316896525372537, + "learning_rate": 1.5786930312986495e-06, + "loss": 0.4241, + "step": 705 + }, + { + "epoch": 0.32514895944273337, + "grad_norm": 3.200708018638179, + "learning_rate": 1.5774752321174427e-06, + "loss": 0.5434, + "step": 706 + }, + { + "epoch": 0.32560951037678826, + "grad_norm": 3.452466534810513, + "learning_rate": 1.576256146676051e-06, + "loss": 0.5766, + "step": 707 + }, + { + "epoch": 0.3260700613108431, + "grad_norm": 2.8456835798060816, + "learning_rate": 1.575035777689849e-06, + "loss": 0.4522, + "step": 708 + }, + { + "epoch": 0.32653061224489793, + "grad_norm": 3.593887134081324, + "learning_rate": 1.5738141278770685e-06, + "loss": 0.628, + "step": 709 + }, + { + "epoch": 0.3269911631789528, + "grad_norm": 3.217566834694963, + "learning_rate": 1.5725911999587953e-06, + "loss": 0.594, + "step": 710 + }, + { + "epoch": 0.32745171411300766, + "grad_norm": 3.0506131303862234, + "learning_rate": 1.5713669966589618e-06, + "loss": 0.6114, + "step": 711 + }, + { + "epoch": 0.32791226504706256, + "grad_norm": 3.2672150522195254, + "learning_rate": 1.5701415207043414e-06, + "loss": 0.5329, + "step": 712 + }, + { + "epoch": 0.3283728159811174, + "grad_norm": 3.0741601299943957, + "learning_rate": 1.5689147748245412e-06, + "loss": 0.5823, + "step": 713 + }, + { + "epoch": 0.3288333669151723, + "grad_norm": 2.7741610627962974, + "learning_rate": 1.5676867617519986e-06, + "loss": 0.5868, + "step": 714 + }, + { + "epoch": 0.3292939178492271, + "grad_norm": 3.3153530179766766, + "learning_rate": 1.566457484221972e-06, + "loss": 0.5851, + "step": 715 + }, + { + "epoch": 0.329754468783282, + "grad_norm": 3.317026645580635, + "learning_rate": 1.5652269449725374e-06, + "loss": 0.6181, + "step": 716 + }, + { + "epoch": 0.33021501971733686, + "grad_norm": 3.404975554795291, + "learning_rate": 1.5639951467445798e-06, + "loss": 0.6734, + "step": 717 + }, + { + "epoch": 0.33067557065139175, + "grad_norm": 2.8122628566375933, + "learning_rate": 1.5627620922817895e-06, + "loss": 0.5022, + "step": 718 + }, + { + "epoch": 0.3311361215854466, + "grad_norm": 2.8689456040300954, + "learning_rate": 1.561527784330655e-06, + "loss": 0.4525, + "step": 719 + }, + { + "epoch": 0.3315966725195015, + "grad_norm": 3.1909349124276734, + "learning_rate": 1.5602922256404556e-06, + "loss": 0.6127, + "step": 720 + }, + { + "epoch": 0.3320572234535563, + "grad_norm": 3.0736929753426776, + "learning_rate": 1.559055418963258e-06, + "loss": 0.5143, + "step": 721 + }, + { + "epoch": 0.33251777438761115, + "grad_norm": 2.9374948837179518, + "learning_rate": 1.557817367053908e-06, + "loss": 0.4802, + "step": 722 + }, + { + "epoch": 0.33297832532166605, + "grad_norm": 2.8350541529140316, + "learning_rate": 1.5565780726700244e-06, + "loss": 0.4546, + "step": 723 + }, + { + "epoch": 0.3334388762557209, + "grad_norm": 2.9599261081932027, + "learning_rate": 1.5553375385719943e-06, + "loss": 0.5198, + "step": 724 + }, + { + "epoch": 0.3338994271897758, + "grad_norm": 3.2317683594863715, + "learning_rate": 1.5540957675229663e-06, + "loss": 0.5195, + "step": 725 + }, + { + "epoch": 0.3343599781238306, + "grad_norm": 3.296064700074342, + "learning_rate": 1.5528527622888428e-06, + "loss": 0.6044, + "step": 726 + }, + { + "epoch": 0.3348205290578855, + "grad_norm": 2.863524627128321, + "learning_rate": 1.5516085256382764e-06, + "loss": 0.6144, + "step": 727 + }, + { + "epoch": 0.33528107999194035, + "grad_norm": 3.0821132303925594, + "learning_rate": 1.5503630603426628e-06, + "loss": 0.576, + "step": 728 + }, + { + "epoch": 0.33574163092599524, + "grad_norm": 2.9688429474981026, + "learning_rate": 1.5491163691761334e-06, + "loss": 0.5294, + "step": 729 + }, + { + "epoch": 0.3362021818600501, + "grad_norm": 3.268176435044111, + "learning_rate": 1.5478684549155507e-06, + "loss": 0.523, + "step": 730 + }, + { + "epoch": 0.33666273279410497, + "grad_norm": 3.3700516164655374, + "learning_rate": 1.5466193203405015e-06, + "loss": 0.6837, + "step": 731 + }, + { + "epoch": 0.3371232837281598, + "grad_norm": 3.2319186451474646, + "learning_rate": 1.5453689682332898e-06, + "loss": 0.6385, + "step": 732 + }, + { + "epoch": 0.3375838346622147, + "grad_norm": 3.4936427230146987, + "learning_rate": 1.5441174013789326e-06, + "loss": 0.6258, + "step": 733 + }, + { + "epoch": 0.33804438559626954, + "grad_norm": 3.305223919072284, + "learning_rate": 1.5428646225651525e-06, + "loss": 0.5428, + "step": 734 + }, + { + "epoch": 0.3385049365303244, + "grad_norm": 2.9963569062928777, + "learning_rate": 1.5416106345823714e-06, + "loss": 0.5679, + "step": 735 + }, + { + "epoch": 0.33896548746437927, + "grad_norm": 3.3209457502261026, + "learning_rate": 1.540355440223704e-06, + "loss": 0.6125, + "step": 736 + }, + { + "epoch": 0.3394260383984341, + "grad_norm": 3.304460716728442, + "learning_rate": 1.5390990422849528e-06, + "loss": 0.5128, + "step": 737 + }, + { + "epoch": 0.339886589332489, + "grad_norm": 3.177691233283076, + "learning_rate": 1.5378414435646008e-06, + "loss": 0.6157, + "step": 738 + }, + { + "epoch": 0.34034714026654384, + "grad_norm": 3.2484901797414687, + "learning_rate": 1.5365826468638057e-06, + "loss": 0.4804, + "step": 739 + }, + { + "epoch": 0.34080769120059873, + "grad_norm": 2.97003236725624, + "learning_rate": 1.5353226549863933e-06, + "loss": 0.5174, + "step": 740 + }, + { + "epoch": 0.34126824213465357, + "grad_norm": 3.0081534150970985, + "learning_rate": 1.5340614707388516e-06, + "loss": 0.5972, + "step": 741 + }, + { + "epoch": 0.34172879306870846, + "grad_norm": 3.0537773804963044, + "learning_rate": 1.5327990969303256e-06, + "loss": 0.6268, + "step": 742 + }, + { + "epoch": 0.3421893440027633, + "grad_norm": 2.4616414555847537, + "learning_rate": 1.531535536372608e-06, + "loss": 0.5428, + "step": 743 + }, + { + "epoch": 0.3426498949368182, + "grad_norm": 3.459066350919476, + "learning_rate": 1.5302707918801354e-06, + "loss": 0.6306, + "step": 744 + }, + { + "epoch": 0.343110445870873, + "grad_norm": 3.352692171370741, + "learning_rate": 1.5290048662699828e-06, + "loss": 0.5567, + "step": 745 + }, + { + "epoch": 0.3435709968049279, + "grad_norm": 3.444376949244577, + "learning_rate": 1.5277377623618546e-06, + "loss": 0.587, + "step": 746 + }, + { + "epoch": 0.34403154773898276, + "grad_norm": 3.018488954664052, + "learning_rate": 1.5264694829780801e-06, + "loss": 0.6234, + "step": 747 + }, + { + "epoch": 0.3444920986730376, + "grad_norm": 4.059892549594848, + "learning_rate": 1.5252000309436071e-06, + "loss": 0.5284, + "step": 748 + }, + { + "epoch": 0.3449526496070925, + "grad_norm": 3.1529595103252848, + "learning_rate": 1.5239294090859948e-06, + "loss": 0.5514, + "step": 749 + }, + { + "epoch": 0.3454132005411473, + "grad_norm": 3.2294187071291938, + "learning_rate": 1.522657620235409e-06, + "loss": 0.5978, + "step": 750 + }, + { + "epoch": 0.3458737514752022, + "grad_norm": 3.1490886609776685, + "learning_rate": 1.5213846672246138e-06, + "loss": 0.596, + "step": 751 + }, + { + "epoch": 0.34633430240925706, + "grad_norm": 3.1168755025387362, + "learning_rate": 1.5201105528889666e-06, + "loss": 0.5168, + "step": 752 + }, + { + "epoch": 0.34679485334331195, + "grad_norm": 3.054962204132803, + "learning_rate": 1.5188352800664119e-06, + "loss": 0.5459, + "step": 753 + }, + { + "epoch": 0.3472554042773668, + "grad_norm": 3.575951938973164, + "learning_rate": 1.5175588515974748e-06, + "loss": 0.6461, + "step": 754 + }, + { + "epoch": 0.3477159552114217, + "grad_norm": 2.8785648294557085, + "learning_rate": 1.5162812703252537e-06, + "loss": 0.4715, + "step": 755 + }, + { + "epoch": 0.3481765061454765, + "grad_norm": 3.2534706235609585, + "learning_rate": 1.5150025390954152e-06, + "loss": 0.5632, + "step": 756 + }, + { + "epoch": 0.3486370570795314, + "grad_norm": 3.0206088523549974, + "learning_rate": 1.513722660756187e-06, + "loss": 0.4888, + "step": 757 + }, + { + "epoch": 0.34909760801358625, + "grad_norm": 2.7017314694335086, + "learning_rate": 1.5124416381583517e-06, + "loss": 0.4697, + "step": 758 + }, + { + "epoch": 0.34955815894764114, + "grad_norm": 3.3514313878368625, + "learning_rate": 1.5111594741552423e-06, + "loss": 0.5044, + "step": 759 + }, + { + "epoch": 0.350018709881696, + "grad_norm": 3.338106474247453, + "learning_rate": 1.5098761716027315e-06, + "loss": 0.5476, + "step": 760 + }, + { + "epoch": 0.3504792608157508, + "grad_norm": 3.3207826162715666, + "learning_rate": 1.5085917333592297e-06, + "loss": 0.6685, + "step": 761 + }, + { + "epoch": 0.3509398117498057, + "grad_norm": 2.882204620095301, + "learning_rate": 1.5073061622856765e-06, + "loss": 0.4214, + "step": 762 + }, + { + "epoch": 0.35140036268386055, + "grad_norm": 3.2702426268023856, + "learning_rate": 1.506019461245535e-06, + "loss": 0.4785, + "step": 763 + }, + { + "epoch": 0.35186091361791544, + "grad_norm": 3.393774716059176, + "learning_rate": 1.5047316331047846e-06, + "loss": 0.5249, + "step": 764 + }, + { + "epoch": 0.3523214645519703, + "grad_norm": 2.626311070223108, + "learning_rate": 1.5034426807319162e-06, + "loss": 0.5107, + "step": 765 + }, + { + "epoch": 0.35278201548602517, + "grad_norm": 3.0508858155806586, + "learning_rate": 1.5021526069979232e-06, + "loss": 0.6297, + "step": 766 + }, + { + "epoch": 0.35324256642008, + "grad_norm": 3.2942945723575336, + "learning_rate": 1.5008614147762982e-06, + "loss": 0.6742, + "step": 767 + }, + { + "epoch": 0.3537031173541349, + "grad_norm": 3.375703871690601, + "learning_rate": 1.4995691069430244e-06, + "loss": 0.6203, + "step": 768 + }, + { + "epoch": 0.35416366828818974, + "grad_norm": 3.11233555593855, + "learning_rate": 1.49827568637657e-06, + "loss": 0.5992, + "step": 769 + }, + { + "epoch": 0.35462421922224463, + "grad_norm": 2.8563952523571965, + "learning_rate": 1.4969811559578818e-06, + "loss": 0.4969, + "step": 770 + }, + { + "epoch": 0.35508477015629947, + "grad_norm": 3.2356844981316235, + "learning_rate": 1.4956855185703786e-06, + "loss": 0.4214, + "step": 771 + }, + { + "epoch": 0.35554532109035436, + "grad_norm": 2.8462493199493024, + "learning_rate": 1.4943887770999447e-06, + "loss": 0.5523, + "step": 772 + }, + { + "epoch": 0.3560058720244092, + "grad_norm": 3.5665113209504105, + "learning_rate": 1.493090934434924e-06, + "loss": 0.5731, + "step": 773 + }, + { + "epoch": 0.35646642295846404, + "grad_norm": 3.169353405800153, + "learning_rate": 1.4917919934661128e-06, + "loss": 0.5358, + "step": 774 + }, + { + "epoch": 0.35692697389251893, + "grad_norm": 3.121133974442328, + "learning_rate": 1.4904919570867539e-06, + "loss": 0.5471, + "step": 775 + }, + { + "epoch": 0.35738752482657377, + "grad_norm": 3.0928538679959243, + "learning_rate": 1.4891908281925298e-06, + "loss": 0.6502, + "step": 776 + }, + { + "epoch": 0.35784807576062866, + "grad_norm": 3.2886205307157823, + "learning_rate": 1.4878886096815569e-06, + "loss": 0.5099, + "step": 777 + }, + { + "epoch": 0.3583086266946835, + "grad_norm": 3.041026939521522, + "learning_rate": 1.486585304454378e-06, + "loss": 0.5957, + "step": 778 + }, + { + "epoch": 0.3587691776287384, + "grad_norm": 2.8217805539112732, + "learning_rate": 1.4852809154139576e-06, + "loss": 0.5313, + "step": 779 + }, + { + "epoch": 0.3592297285627932, + "grad_norm": 2.6440360065601145, + "learning_rate": 1.4839754454656723e-06, + "loss": 0.5048, + "step": 780 + }, + { + "epoch": 0.3596902794968481, + "grad_norm": 3.4469745235083393, + "learning_rate": 1.4826688975173084e-06, + "loss": 0.6932, + "step": 781 + }, + { + "epoch": 0.36015083043090296, + "grad_norm": 4.096492089448597, + "learning_rate": 1.481361274479052e-06, + "loss": 0.5383, + "step": 782 + }, + { + "epoch": 0.36061138136495785, + "grad_norm": 2.9081019095353606, + "learning_rate": 1.4800525792634838e-06, + "loss": 0.5196, + "step": 783 + }, + { + "epoch": 0.3610719322990127, + "grad_norm": 3.0363832332393073, + "learning_rate": 1.4787428147855737e-06, + "loss": 0.5606, + "step": 784 + }, + { + "epoch": 0.3615324832330676, + "grad_norm": 3.404973081364874, + "learning_rate": 1.4774319839626725e-06, + "loss": 0.6067, + "step": 785 + }, + { + "epoch": 0.3619930341671224, + "grad_norm": 2.9657594105232916, + "learning_rate": 1.476120089714506e-06, + "loss": 0.5427, + "step": 786 + }, + { + "epoch": 0.36245358510117726, + "grad_norm": 3.6192624322683256, + "learning_rate": 1.4748071349631693e-06, + "loss": 0.6706, + "step": 787 + }, + { + "epoch": 0.36291413603523215, + "grad_norm": 3.526460171368003, + "learning_rate": 1.4734931226331188e-06, + "loss": 0.5103, + "step": 788 + }, + { + "epoch": 0.363374686969287, + "grad_norm": 2.8430807766055577, + "learning_rate": 1.4721780556511674e-06, + "loss": 0.4935, + "step": 789 + }, + { + "epoch": 0.3638352379033419, + "grad_norm": 3.264151914199905, + "learning_rate": 1.4708619369464765e-06, + "loss": 0.6703, + "step": 790 + }, + { + "epoch": 0.3642957888373967, + "grad_norm": 3.5948426136873843, + "learning_rate": 1.469544769450551e-06, + "loss": 0.5244, + "step": 791 + }, + { + "epoch": 0.3647563397714516, + "grad_norm": 3.2156355781796018, + "learning_rate": 1.46822655609723e-06, + "loss": 0.5689, + "step": 792 + }, + { + "epoch": 0.36521689070550645, + "grad_norm": 3.4654337540005002, + "learning_rate": 1.4669072998226843e-06, + "loss": 0.5605, + "step": 793 + }, + { + "epoch": 0.36567744163956134, + "grad_norm": 3.0725623075599184, + "learning_rate": 1.4655870035654065e-06, + "loss": 0.6267, + "step": 794 + }, + { + "epoch": 0.3661379925736162, + "grad_norm": 3.029931817827877, + "learning_rate": 1.4642656702662058e-06, + "loss": 0.6285, + "step": 795 + }, + { + "epoch": 0.36659854350767107, + "grad_norm": 2.9315413167488558, + "learning_rate": 1.4629433028682013e-06, + "loss": 0.5949, + "step": 796 + }, + { + "epoch": 0.3670590944417259, + "grad_norm": 3.151885439468786, + "learning_rate": 1.4616199043168154e-06, + "loss": 0.6758, + "step": 797 + }, + { + "epoch": 0.3675196453757808, + "grad_norm": 3.0346115082411655, + "learning_rate": 1.4602954775597673e-06, + "loss": 0.5696, + "step": 798 + }, + { + "epoch": 0.36798019630983564, + "grad_norm": 3.2157712771710827, + "learning_rate": 1.458970025547067e-06, + "loss": 0.6722, + "step": 799 + }, + { + "epoch": 0.36844074724389053, + "grad_norm": 3.2513066677812748, + "learning_rate": 1.457643551231007e-06, + "loss": 0.6715, + "step": 800 + }, + { + "epoch": 0.36890129817794537, + "grad_norm": 3.5891769122485653, + "learning_rate": 1.456316057566158e-06, + "loss": 0.6068, + "step": 801 + }, + { + "epoch": 0.3693618491120002, + "grad_norm": 2.9833048436720655, + "learning_rate": 1.45498754750936e-06, + "loss": 0.5265, + "step": 802 + }, + { + "epoch": 0.3698224000460551, + "grad_norm": 3.0143882511843754, + "learning_rate": 1.453658024019718e-06, + "loss": 0.6277, + "step": 803 + }, + { + "epoch": 0.37028295098010994, + "grad_norm": 3.303125029561216, + "learning_rate": 1.4523274900585942e-06, + "loss": 0.5178, + "step": 804 + }, + { + "epoch": 0.37074350191416483, + "grad_norm": 2.96600210169327, + "learning_rate": 1.4509959485896004e-06, + "loss": 0.6883, + "step": 805 + }, + { + "epoch": 0.37120405284821967, + "grad_norm": 3.222533442523701, + "learning_rate": 1.4496634025785937e-06, + "loss": 0.5035, + "step": 806 + }, + { + "epoch": 0.37166460378227456, + "grad_norm": 3.3343786199048027, + "learning_rate": 1.4483298549936684e-06, + "loss": 0.5683, + "step": 807 + }, + { + "epoch": 0.3721251547163294, + "grad_norm": 3.194169003790268, + "learning_rate": 1.4469953088051497e-06, + "loss": 0.6087, + "step": 808 + }, + { + "epoch": 0.3725857056503843, + "grad_norm": 3.1623966174805083, + "learning_rate": 1.445659766985586e-06, + "loss": 0.5334, + "step": 809 + }, + { + "epoch": 0.37304625658443913, + "grad_norm": 2.9573679310798138, + "learning_rate": 1.4443232325097454e-06, + "loss": 0.4965, + "step": 810 + }, + { + "epoch": 0.373506807518494, + "grad_norm": 3.1354387245216766, + "learning_rate": 1.4429857083546053e-06, + "loss": 0.4966, + "step": 811 + }, + { + "epoch": 0.37396735845254886, + "grad_norm": 3.7718550261376045, + "learning_rate": 1.4416471974993487e-06, + "loss": 0.4793, + "step": 812 + }, + { + "epoch": 0.37442790938660375, + "grad_norm": 3.0917131083646194, + "learning_rate": 1.4403077029253553e-06, + "loss": 0.6511, + "step": 813 + }, + { + "epoch": 0.3748884603206586, + "grad_norm": 3.060066893764434, + "learning_rate": 1.4389672276161963e-06, + "loss": 0.4503, + "step": 814 + }, + { + "epoch": 0.3753490112547134, + "grad_norm": 3.0880275565306006, + "learning_rate": 1.4376257745576282e-06, + "loss": 0.5234, + "step": 815 + }, + { + "epoch": 0.3758095621887683, + "grad_norm": 3.3246972054582233, + "learning_rate": 1.4362833467375836e-06, + "loss": 0.5849, + "step": 816 + }, + { + "epoch": 0.37627011312282316, + "grad_norm": 3.5727318566464263, + "learning_rate": 1.4349399471461684e-06, + "loss": 0.6084, + "step": 817 + }, + { + "epoch": 0.37673066405687805, + "grad_norm": 3.688093227741573, + "learning_rate": 1.4335955787756513e-06, + "loss": 0.6864, + "step": 818 + }, + { + "epoch": 0.3771912149909329, + "grad_norm": 3.6380575782207702, + "learning_rate": 1.4322502446204592e-06, + "loss": 0.6678, + "step": 819 + }, + { + "epoch": 0.3776517659249878, + "grad_norm": 3.1559208653988633, + "learning_rate": 1.4309039476771706e-06, + "loss": 0.4839, + "step": 820 + }, + { + "epoch": 0.3781123168590426, + "grad_norm": 3.7163228600713745, + "learning_rate": 1.429556690944509e-06, + "loss": 0.6348, + "step": 821 + }, + { + "epoch": 0.3785728677930975, + "grad_norm": 2.9842451520879876, + "learning_rate": 1.4282084774233338e-06, + "loss": 0.4819, + "step": 822 + }, + { + "epoch": 0.37903341872715235, + "grad_norm": 3.069211559831523, + "learning_rate": 1.4268593101166378e-06, + "loss": 0.6175, + "step": 823 + }, + { + "epoch": 0.37949396966120724, + "grad_norm": 3.115770022996151, + "learning_rate": 1.4255091920295367e-06, + "loss": 0.6113, + "step": 824 + }, + { + "epoch": 0.3799545205952621, + "grad_norm": 3.2472503472001484, + "learning_rate": 1.4241581261692647e-06, + "loss": 0.6686, + "step": 825 + }, + { + "epoch": 0.380415071529317, + "grad_norm": 3.1742136266716314, + "learning_rate": 1.422806115545167e-06, + "loss": 0.6194, + "step": 826 + }, + { + "epoch": 0.3808756224633718, + "grad_norm": 2.7416191430113788, + "learning_rate": 1.4214531631686929e-06, + "loss": 0.575, + "step": 827 + }, + { + "epoch": 0.38133617339742665, + "grad_norm": 2.991715260469191, + "learning_rate": 1.4200992720533886e-06, + "loss": 0.6348, + "step": 828 + }, + { + "epoch": 0.38179672433148154, + "grad_norm": 3.3536018044430143, + "learning_rate": 1.4187444452148934e-06, + "loss": 0.5707, + "step": 829 + }, + { + "epoch": 0.3822572752655364, + "grad_norm": 2.9504178057835957, + "learning_rate": 1.4173886856709288e-06, + "loss": 0.5539, + "step": 830 + }, + { + "epoch": 0.38271782619959127, + "grad_norm": 3.4467632526859826, + "learning_rate": 1.416031996441294e-06, + "loss": 0.5473, + "step": 831 + }, + { + "epoch": 0.3831783771336461, + "grad_norm": 3.1022606765939877, + "learning_rate": 1.4146743805478605e-06, + "loss": 0.5181, + "step": 832 + }, + { + "epoch": 0.383638928067701, + "grad_norm": 2.8177854685524575, + "learning_rate": 1.413315841014562e-06, + "loss": 0.5749, + "step": 833 + }, + { + "epoch": 0.38409947900175584, + "grad_norm": 3.6263375137296614, + "learning_rate": 1.4119563808673905e-06, + "loss": 0.5913, + "step": 834 + }, + { + "epoch": 0.38456002993581073, + "grad_norm": 3.0015961938916806, + "learning_rate": 1.4105960031343889e-06, + "loss": 0.5724, + "step": 835 + }, + { + "epoch": 0.38502058086986557, + "grad_norm": 3.0678893744276357, + "learning_rate": 1.4092347108456424e-06, + "loss": 0.6605, + "step": 836 + }, + { + "epoch": 0.38548113180392046, + "grad_norm": 2.899485221765189, + "learning_rate": 1.4078725070332746e-06, + "loss": 0.3908, + "step": 837 + }, + { + "epoch": 0.3859416827379753, + "grad_norm": 3.085157334581907, + "learning_rate": 1.4065093947314396e-06, + "loss": 0.4573, + "step": 838 + }, + { + "epoch": 0.3864022336720302, + "grad_norm": 3.4352911831634887, + "learning_rate": 1.4051453769763143e-06, + "loss": 0.6433, + "step": 839 + }, + { + "epoch": 0.38686278460608503, + "grad_norm": 3.742434977201632, + "learning_rate": 1.4037804568060919e-06, + "loss": 0.5888, + "step": 840 + }, + { + "epoch": 0.38732333554013987, + "grad_norm": 2.9975870813637586, + "learning_rate": 1.402414637260977e-06, + "loss": 0.5201, + "step": 841 + }, + { + "epoch": 0.38778388647419476, + "grad_norm": 3.0225885866635367, + "learning_rate": 1.4010479213831762e-06, + "loss": 0.4794, + "step": 842 + }, + { + "epoch": 0.3882444374082496, + "grad_norm": 3.2296706627603857, + "learning_rate": 1.399680312216894e-06, + "loss": 0.5355, + "step": 843 + }, + { + "epoch": 0.3887049883423045, + "grad_norm": 3.0042503059260883, + "learning_rate": 1.3983118128083234e-06, + "loss": 0.5446, + "step": 844 + }, + { + "epoch": 0.38916553927635933, + "grad_norm": 3.5187839319359684, + "learning_rate": 1.3969424262056402e-06, + "loss": 0.598, + "step": 845 + }, + { + "epoch": 0.3896260902104142, + "grad_norm": 3.059136381267136, + "learning_rate": 1.3955721554589975e-06, + "loss": 0.621, + "step": 846 + }, + { + "epoch": 0.39008664114446906, + "grad_norm": 3.186481313726005, + "learning_rate": 1.3942010036205165e-06, + "loss": 0.5483, + "step": 847 + }, + { + "epoch": 0.39054719207852395, + "grad_norm": 3.3209014633626133, + "learning_rate": 1.392828973744282e-06, + "loss": 0.5836, + "step": 848 + }, + { + "epoch": 0.3910077430125788, + "grad_norm": 3.3185866705706184, + "learning_rate": 1.3914560688863336e-06, + "loss": 0.4829, + "step": 849 + }, + { + "epoch": 0.3914682939466337, + "grad_norm": 3.1927851100882862, + "learning_rate": 1.39008229210466e-06, + "loss": 0.5053, + "step": 850 + }, + { + "epoch": 0.3919288448806885, + "grad_norm": 3.3595068103015473, + "learning_rate": 1.3887076464591928e-06, + "loss": 0.5338, + "step": 851 + }, + { + "epoch": 0.3923893958147434, + "grad_norm": 2.7905012085301837, + "learning_rate": 1.3873321350117981e-06, + "loss": 0.4979, + "step": 852 + }, + { + "epoch": 0.39284994674879825, + "grad_norm": 3.090347369854456, + "learning_rate": 1.3859557608262705e-06, + "loss": 0.5797, + "step": 853 + }, + { + "epoch": 0.3933104976828531, + "grad_norm": 3.060994610172803, + "learning_rate": 1.384578526968326e-06, + "loss": 0.5184, + "step": 854 + }, + { + "epoch": 0.393771048616908, + "grad_norm": 3.6607469509314914, + "learning_rate": 1.3832004365055974e-06, + "loss": 0.5485, + "step": 855 + }, + { + "epoch": 0.3942315995509628, + "grad_norm": 2.7397062923925053, + "learning_rate": 1.3818214925076223e-06, + "loss": 0.4594, + "step": 856 + }, + { + "epoch": 0.3946921504850177, + "grad_norm": 3.0413655649505262, + "learning_rate": 1.380441698045842e-06, + "loss": 0.495, + "step": 857 + }, + { + "epoch": 0.39515270141907255, + "grad_norm": 3.4287955433322423, + "learning_rate": 1.3790610561935911e-06, + "loss": 0.645, + "step": 858 + }, + { + "epoch": 0.39561325235312744, + "grad_norm": 3.3357893051930816, + "learning_rate": 1.3776795700260915e-06, + "loss": 0.6037, + "step": 859 + }, + { + "epoch": 0.3960738032871823, + "grad_norm": 3.6164730160728413, + "learning_rate": 1.3762972426204461e-06, + "loss": 0.622, + "step": 860 + }, + { + "epoch": 0.3965343542212372, + "grad_norm": 3.386533311875332, + "learning_rate": 1.374914077055632e-06, + "loss": 0.6871, + "step": 861 + }, + { + "epoch": 0.396994905155292, + "grad_norm": 2.9410360813121352, + "learning_rate": 1.3735300764124916e-06, + "loss": 0.5102, + "step": 862 + }, + { + "epoch": 0.3974554560893469, + "grad_norm": 3.103887613925395, + "learning_rate": 1.3721452437737293e-06, + "loss": 0.5432, + "step": 863 + }, + { + "epoch": 0.39791600702340174, + "grad_norm": 3.517765292291276, + "learning_rate": 1.3707595822239015e-06, + "loss": 0.5159, + "step": 864 + }, + { + "epoch": 0.39837655795745663, + "grad_norm": 3.043417914703796, + "learning_rate": 1.3693730948494114e-06, + "loss": 0.5317, + "step": 865 + }, + { + "epoch": 0.39883710889151147, + "grad_norm": 3.338449707312485, + "learning_rate": 1.3679857847385009e-06, + "loss": 0.632, + "step": 866 + }, + { + "epoch": 0.3992976598255663, + "grad_norm": 3.253152412711078, + "learning_rate": 1.3665976549812452e-06, + "loss": 0.5134, + "step": 867 + }, + { + "epoch": 0.3997582107596212, + "grad_norm": 3.072410142653441, + "learning_rate": 1.365208708669545e-06, + "loss": 0.4643, + "step": 868 + }, + { + "epoch": 0.40021876169367604, + "grad_norm": 3.525403495130597, + "learning_rate": 1.36381894889712e-06, + "loss": 0.5517, + "step": 869 + }, + { + "epoch": 0.40067931262773093, + "grad_norm": 3.556527192054287, + "learning_rate": 1.362428378759501e-06, + "loss": 0.6111, + "step": 870 + }, + { + "epoch": 0.40113986356178577, + "grad_norm": 3.1449463321315236, + "learning_rate": 1.3610370013540247e-06, + "loss": 0.5445, + "step": 871 + }, + { + "epoch": 0.40160041449584066, + "grad_norm": 3.0202852656779577, + "learning_rate": 1.3596448197798253e-06, + "loss": 0.576, + "step": 872 + }, + { + "epoch": 0.4020609654298955, + "grad_norm": 3.08760829256673, + "learning_rate": 1.3582518371378282e-06, + "loss": 0.6271, + "step": 873 + }, + { + "epoch": 0.4025215163639504, + "grad_norm": 3.328960142744962, + "learning_rate": 1.3568580565307436e-06, + "loss": 0.5746, + "step": 874 + }, + { + "epoch": 0.40298206729800523, + "grad_norm": 3.0333254368255136, + "learning_rate": 1.355463481063059e-06, + "loss": 0.5229, + "step": 875 + }, + { + "epoch": 0.4034426182320601, + "grad_norm": 2.933573447144409, + "learning_rate": 1.3540681138410314e-06, + "loss": 0.5778, + "step": 876 + }, + { + "epoch": 0.40390316916611496, + "grad_norm": 2.882788964082942, + "learning_rate": 1.3526719579726829e-06, + "loss": 0.5073, + "step": 877 + }, + { + "epoch": 0.40436372010016985, + "grad_norm": 3.2365144628176568, + "learning_rate": 1.3512750165677906e-06, + "loss": 0.5211, + "step": 878 + }, + { + "epoch": 0.4048242710342247, + "grad_norm": 3.38193990207538, + "learning_rate": 1.3498772927378824e-06, + "loss": 0.606, + "step": 879 + }, + { + "epoch": 0.40528482196827953, + "grad_norm": 3.56597317135757, + "learning_rate": 1.348478789596229e-06, + "loss": 0.6701, + "step": 880 + }, + { + "epoch": 0.4057453729023344, + "grad_norm": 3.324345046149201, + "learning_rate": 1.3470795102578355e-06, + "loss": 0.6522, + "step": 881 + }, + { + "epoch": 0.40620592383638926, + "grad_norm": 3.340866287319074, + "learning_rate": 1.3456794578394382e-06, + "loss": 0.6136, + "step": 882 + }, + { + "epoch": 0.40666647477044415, + "grad_norm": 3.131575960928237, + "learning_rate": 1.3442786354594937e-06, + "loss": 0.5138, + "step": 883 + }, + { + "epoch": 0.407127025704499, + "grad_norm": 3.1323338349582306, + "learning_rate": 1.3428770462381739e-06, + "loss": 0.5023, + "step": 884 + }, + { + "epoch": 0.4075875766385539, + "grad_norm": 2.8727768076877664, + "learning_rate": 1.3414746932973583e-06, + "loss": 0.5462, + "step": 885 + }, + { + "epoch": 0.4080481275726087, + "grad_norm": 2.818312617541394, + "learning_rate": 1.340071579760629e-06, + "loss": 0.5003, + "step": 886 + }, + { + "epoch": 0.4085086785066636, + "grad_norm": 2.8944571031714674, + "learning_rate": 1.338667708753261e-06, + "loss": 0.4793, + "step": 887 + }, + { + "epoch": 0.40896922944071845, + "grad_norm": 3.0617046790599582, + "learning_rate": 1.3372630834022165e-06, + "loss": 0.5714, + "step": 888 + }, + { + "epoch": 0.40942978037477334, + "grad_norm": 3.3240252994477477, + "learning_rate": 1.3358577068361383e-06, + "loss": 0.7543, + "step": 889 + }, + { + "epoch": 0.4098903313088282, + "grad_norm": 3.387874616156958, + "learning_rate": 1.3344515821853427e-06, + "loss": 0.5331, + "step": 890 + }, + { + "epoch": 0.4103508822428831, + "grad_norm": 2.9538858616900296, + "learning_rate": 1.3330447125818114e-06, + "loss": 0.5126, + "step": 891 + }, + { + "epoch": 0.4108114331769379, + "grad_norm": 2.6905912340286045, + "learning_rate": 1.331637101159186e-06, + "loss": 0.6335, + "step": 892 + }, + { + "epoch": 0.41127198411099275, + "grad_norm": 2.8234464497330842, + "learning_rate": 1.3302287510527606e-06, + "loss": 0.485, + "step": 893 + }, + { + "epoch": 0.41173253504504764, + "grad_norm": 3.364736286801205, + "learning_rate": 1.3288196653994742e-06, + "loss": 0.6351, + "step": 894 + }, + { + "epoch": 0.4121930859791025, + "grad_norm": 3.148255912278157, + "learning_rate": 1.3274098473379041e-06, + "loss": 0.501, + "step": 895 + }, + { + "epoch": 0.4126536369131574, + "grad_norm": 2.9927281051396397, + "learning_rate": 1.3259993000082597e-06, + "loss": 0.6576, + "step": 896 + }, + { + "epoch": 0.4131141878472122, + "grad_norm": 2.6304206915731387, + "learning_rate": 1.3245880265523737e-06, + "loss": 0.5005, + "step": 897 + }, + { + "epoch": 0.4135747387812671, + "grad_norm": 3.033315201466835, + "learning_rate": 1.3231760301136968e-06, + "loss": 0.5032, + "step": 898 + }, + { + "epoch": 0.41403528971532194, + "grad_norm": 3.153932562017399, + "learning_rate": 1.32176331383729e-06, + "loss": 0.6438, + "step": 899 + }, + { + "epoch": 0.41449584064937683, + "grad_norm": 3.0569467140610564, + "learning_rate": 1.3203498808698177e-06, + "loss": 0.4582, + "step": 900 + }, + { + "epoch": 0.41495639158343167, + "grad_norm": 3.4798136677368525, + "learning_rate": 1.3189357343595405e-06, + "loss": 0.4739, + "step": 901 + }, + { + "epoch": 0.41541694251748656, + "grad_norm": 3.267066000558948, + "learning_rate": 1.317520877456308e-06, + "loss": 0.5333, + "step": 902 + }, + { + "epoch": 0.4158774934515414, + "grad_norm": 3.0973720845505057, + "learning_rate": 1.3161053133115534e-06, + "loss": 0.6643, + "step": 903 + }, + { + "epoch": 0.4163380443855963, + "grad_norm": 3.124046522546217, + "learning_rate": 1.3146890450782833e-06, + "loss": 0.6385, + "step": 904 + }, + { + "epoch": 0.41679859531965113, + "grad_norm": 3.2564321361920348, + "learning_rate": 1.3132720759110742e-06, + "loss": 0.5914, + "step": 905 + }, + { + "epoch": 0.41725914625370597, + "grad_norm": 3.5529506389797207, + "learning_rate": 1.3118544089660632e-06, + "loss": 0.5549, + "step": 906 + }, + { + "epoch": 0.41771969718776086, + "grad_norm": 3.1088968115091458, + "learning_rate": 1.3104360474009413e-06, + "loss": 0.5016, + "step": 907 + }, + { + "epoch": 0.4181802481218157, + "grad_norm": 2.961528786967664, + "learning_rate": 1.3090169943749473e-06, + "loss": 0.6016, + "step": 908 + }, + { + "epoch": 0.4186407990558706, + "grad_norm": 2.9590563505493557, + "learning_rate": 1.3075972530488601e-06, + "loss": 0.6245, + "step": 909 + }, + { + "epoch": 0.41910134998992543, + "grad_norm": 3.2817559385376502, + "learning_rate": 1.306176826584991e-06, + "loss": 0.5741, + "step": 910 + }, + { + "epoch": 0.4195619009239803, + "grad_norm": 3.1851733205126456, + "learning_rate": 1.3047557181471782e-06, + "loss": 0.6102, + "step": 911 + }, + { + "epoch": 0.42002245185803516, + "grad_norm": 3.3709380563412856, + "learning_rate": 1.3033339309007782e-06, + "loss": 0.6029, + "step": 912 + }, + { + "epoch": 0.42048300279209005, + "grad_norm": 2.978439007428691, + "learning_rate": 1.3019114680126607e-06, + "loss": 0.6095, + "step": 913 + }, + { + "epoch": 0.4209435537261449, + "grad_norm": 3.3731454043582545, + "learning_rate": 1.3004883326511986e-06, + "loss": 0.6838, + "step": 914 + }, + { + "epoch": 0.4214041046601998, + "grad_norm": 3.3633712629594723, + "learning_rate": 1.2990645279862637e-06, + "loss": 0.637, + "step": 915 + }, + { + "epoch": 0.4218646555942546, + "grad_norm": 3.3342371154036936, + "learning_rate": 1.2976400571892187e-06, + "loss": 0.5637, + "step": 916 + }, + { + "epoch": 0.4223252065283095, + "grad_norm": 3.2152650834374885, + "learning_rate": 1.2962149234329096e-06, + "loss": 0.5651, + "step": 917 + }, + { + "epoch": 0.42278575746236435, + "grad_norm": 3.728918503268583, + "learning_rate": 1.2947891298916597e-06, + "loss": 0.5591, + "step": 918 + }, + { + "epoch": 0.4232463083964192, + "grad_norm": 3.4867704670014814, + "learning_rate": 1.2933626797412601e-06, + "loss": 0.4446, + "step": 919 + }, + { + "epoch": 0.4237068593304741, + "grad_norm": 3.3645940418700717, + "learning_rate": 1.2919355761589673e-06, + "loss": 0.5977, + "step": 920 + }, + { + "epoch": 0.4241674102645289, + "grad_norm": 3.165145021423808, + "learning_rate": 1.2905078223234907e-06, + "loss": 0.6094, + "step": 921 + }, + { + "epoch": 0.4246279611985838, + "grad_norm": 3.1311304590118083, + "learning_rate": 1.2890794214149895e-06, + "loss": 0.5793, + "step": 922 + }, + { + "epoch": 0.42508851213263865, + "grad_norm": 2.8231203010209462, + "learning_rate": 1.2876503766150634e-06, + "loss": 0.5225, + "step": 923 + }, + { + "epoch": 0.42554906306669354, + "grad_norm": 3.268397607752156, + "learning_rate": 1.2862206911067467e-06, + "loss": 0.5686, + "step": 924 + }, + { + "epoch": 0.4260096140007484, + "grad_norm": 3.305099301303672, + "learning_rate": 1.2847903680745012e-06, + "loss": 0.5445, + "step": 925 + }, + { + "epoch": 0.4264701649348033, + "grad_norm": 3.3965795137188675, + "learning_rate": 1.2833594107042075e-06, + "loss": 0.6591, + "step": 926 + }, + { + "epoch": 0.4269307158688581, + "grad_norm": 3.483867364296341, + "learning_rate": 1.2819278221831604e-06, + "loss": 0.6056, + "step": 927 + }, + { + "epoch": 0.427391266802913, + "grad_norm": 3.4347389707625107, + "learning_rate": 1.2804956057000597e-06, + "loss": 0.5861, + "step": 928 + }, + { + "epoch": 0.42785181773696784, + "grad_norm": 3.1912886523840545, + "learning_rate": 1.2790627644450042e-06, + "loss": 0.5413, + "step": 929 + }, + { + "epoch": 0.42831236867102274, + "grad_norm": 3.277432951067786, + "learning_rate": 1.2776293016094848e-06, + "loss": 0.5728, + "step": 930 + }, + { + "epoch": 0.4287729196050776, + "grad_norm": 3.1216436794666897, + "learning_rate": 1.2761952203863758e-06, + "loss": 0.5291, + "step": 931 + }, + { + "epoch": 0.4292334705391324, + "grad_norm": 3.2701087953996315, + "learning_rate": 1.2747605239699293e-06, + "loss": 0.5777, + "step": 932 + }, + { + "epoch": 0.4296940214731873, + "grad_norm": 2.703298534665295, + "learning_rate": 1.2733252155557686e-06, + "loss": 0.4752, + "step": 933 + }, + { + "epoch": 0.43015457240724214, + "grad_norm": 3.5331916544052584, + "learning_rate": 1.2718892983408787e-06, + "loss": 0.5741, + "step": 934 + }, + { + "epoch": 0.43061512334129703, + "grad_norm": 3.2776132030927196, + "learning_rate": 1.270452775523602e-06, + "loss": 0.6765, + "step": 935 + }, + { + "epoch": 0.43107567427535187, + "grad_norm": 3.556572826939555, + "learning_rate": 1.2690156503036288e-06, + "loss": 0.5738, + "step": 936 + }, + { + "epoch": 0.43153622520940677, + "grad_norm": 2.6845624221473003, + "learning_rate": 1.2675779258819913e-06, + "loss": 0.437, + "step": 937 + }, + { + "epoch": 0.4319967761434616, + "grad_norm": 3.0987030708733965, + "learning_rate": 1.2661396054610568e-06, + "loss": 0.5322, + "step": 938 + }, + { + "epoch": 0.4324573270775165, + "grad_norm": 2.69969210409678, + "learning_rate": 1.2647006922445203e-06, + "loss": 0.563, + "step": 939 + }, + { + "epoch": 0.43291787801157133, + "grad_norm": 3.1443985858808112, + "learning_rate": 1.2632611894373963e-06, + "loss": 0.5224, + "step": 940 + }, + { + "epoch": 0.4333784289456262, + "grad_norm": 3.041880247975588, + "learning_rate": 1.2618211002460133e-06, + "loss": 0.6286, + "step": 941 + }, + { + "epoch": 0.43383897987968106, + "grad_norm": 3.1243004909346612, + "learning_rate": 1.2603804278780054e-06, + "loss": 0.4417, + "step": 942 + }, + { + "epoch": 0.43429953081373596, + "grad_norm": 3.0394710034585044, + "learning_rate": 1.2589391755423061e-06, + "loss": 0.3974, + "step": 943 + }, + { + "epoch": 0.4347600817477908, + "grad_norm": 2.9343561758404855, + "learning_rate": 1.2574973464491406e-06, + "loss": 0.5109, + "step": 944 + }, + { + "epoch": 0.43522063268184563, + "grad_norm": 3.5301907527590837, + "learning_rate": 1.2560549438100187e-06, + "loss": 0.6818, + "step": 945 + }, + { + "epoch": 0.4356811836159005, + "grad_norm": 3.098431035830371, + "learning_rate": 1.2546119708377273e-06, + "loss": 0.6054, + "step": 946 + }, + { + "epoch": 0.43614173454995536, + "grad_norm": 3.0568020174713344, + "learning_rate": 1.2531684307463243e-06, + "loss": 0.5611, + "step": 947 + }, + { + "epoch": 0.43660228548401026, + "grad_norm": 3.133249513429743, + "learning_rate": 1.2517243267511308e-06, + "loss": 0.5266, + "step": 948 + }, + { + "epoch": 0.4370628364180651, + "grad_norm": 3.5046506132656656, + "learning_rate": 1.2502796620687232e-06, + "loss": 0.7523, + "step": 949 + }, + { + "epoch": 0.43752338735212, + "grad_norm": 3.8415980756490913, + "learning_rate": 1.2488344399169275e-06, + "loss": 0.6561, + "step": 950 + }, + { + "epoch": 0.4379839382861748, + "grad_norm": 3.04800030433528, + "learning_rate": 1.2473886635148107e-06, + "loss": 0.3969, + "step": 951 + }, + { + "epoch": 0.4384444892202297, + "grad_norm": 2.9634130405670542, + "learning_rate": 1.2459423360826753e-06, + "loss": 0.5083, + "step": 952 + }, + { + "epoch": 0.43890504015428455, + "grad_norm": 3.2286597169487496, + "learning_rate": 1.2444954608420509e-06, + "loss": 0.5204, + "step": 953 + }, + { + "epoch": 0.43936559108833945, + "grad_norm": 3.072141673231512, + "learning_rate": 1.2430480410156859e-06, + "loss": 0.5266, + "step": 954 + }, + { + "epoch": 0.4398261420223943, + "grad_norm": 3.1772864308017503, + "learning_rate": 1.2416000798275434e-06, + "loss": 0.5762, + "step": 955 + }, + { + "epoch": 0.4402866929564492, + "grad_norm": 3.1044222947486766, + "learning_rate": 1.2401515805027923e-06, + "loss": 0.6046, + "step": 956 + }, + { + "epoch": 0.440747243890504, + "grad_norm": 2.9714963515506674, + "learning_rate": 1.2387025462677986e-06, + "loss": 0.5226, + "step": 957 + }, + { + "epoch": 0.44120779482455885, + "grad_norm": 3.6632065071396602, + "learning_rate": 1.2372529803501212e-06, + "loss": 0.6285, + "step": 958 + }, + { + "epoch": 0.44166834575861375, + "grad_norm": 3.3335254391441236, + "learning_rate": 1.2358028859785027e-06, + "loss": 0.6491, + "step": 959 + }, + { + "epoch": 0.4421288966926686, + "grad_norm": 3.2741227211236277, + "learning_rate": 1.234352266382863e-06, + "loss": 0.472, + "step": 960 + }, + { + "epoch": 0.4425894476267235, + "grad_norm": 2.734950400129683, + "learning_rate": 1.2329011247942913e-06, + "loss": 0.4678, + "step": 961 + }, + { + "epoch": 0.4430499985607783, + "grad_norm": 3.706836559033099, + "learning_rate": 1.2314494644450405e-06, + "loss": 0.572, + "step": 962 + }, + { + "epoch": 0.4435105494948332, + "grad_norm": 3.191997067089114, + "learning_rate": 1.2299972885685175e-06, + "loss": 0.526, + "step": 963 + }, + { + "epoch": 0.44397110042888804, + "grad_norm": 3.750204622573935, + "learning_rate": 1.2285446003992794e-06, + "loss": 0.6557, + "step": 964 + }, + { + "epoch": 0.44443165136294294, + "grad_norm": 3.3390534313214086, + "learning_rate": 1.2270914031730227e-06, + "loss": 0.6844, + "step": 965 + }, + { + "epoch": 0.4448922022969978, + "grad_norm": 3.4357388932326103, + "learning_rate": 1.2256377001265782e-06, + "loss": 0.589, + "step": 966 + }, + { + "epoch": 0.44535275323105267, + "grad_norm": 2.9239348458649497, + "learning_rate": 1.2241834944979043e-06, + "loss": 0.5562, + "step": 967 + }, + { + "epoch": 0.4458133041651075, + "grad_norm": 3.0247685003568656, + "learning_rate": 1.2227287895260774e-06, + "loss": 0.5027, + "step": 968 + }, + { + "epoch": 0.4462738550991624, + "grad_norm": 3.4607078630570274, + "learning_rate": 1.2212735884512873e-06, + "loss": 0.5647, + "step": 969 + }, + { + "epoch": 0.44673440603321724, + "grad_norm": 3.134258529274002, + "learning_rate": 1.2198178945148284e-06, + "loss": 0.6152, + "step": 970 + }, + { + "epoch": 0.4471949569672721, + "grad_norm": 3.2437674513922143, + "learning_rate": 1.2183617109590923e-06, + "loss": 0.4777, + "step": 971 + }, + { + "epoch": 0.44765550790132697, + "grad_norm": 2.9881905608566846, + "learning_rate": 1.2169050410275617e-06, + "loss": 0.5202, + "step": 972 + }, + { + "epoch": 0.4481160588353818, + "grad_norm": 3.186446601937435, + "learning_rate": 1.2154478879648034e-06, + "loss": 0.5035, + "step": 973 + }, + { + "epoch": 0.4485766097694367, + "grad_norm": 3.5679383322300495, + "learning_rate": 1.213990255016459e-06, + "loss": 0.6799, + "step": 974 + }, + { + "epoch": 0.44903716070349153, + "grad_norm": 3.3889885780961526, + "learning_rate": 1.2125321454292397e-06, + "loss": 0.638, + "step": 975 + }, + { + "epoch": 0.4494977116375464, + "grad_norm": 3.0589827390964848, + "learning_rate": 1.2110735624509184e-06, + "loss": 0.6329, + "step": 976 + }, + { + "epoch": 0.44995826257160126, + "grad_norm": 3.2178815155136067, + "learning_rate": 1.2096145093303215e-06, + "loss": 0.5451, + "step": 977 + }, + { + "epoch": 0.45041881350565616, + "grad_norm": 3.1650611316618398, + "learning_rate": 1.2081549893173244e-06, + "loss": 0.5188, + "step": 978 + }, + { + "epoch": 0.450879364439711, + "grad_norm": 2.7477870679696226, + "learning_rate": 1.206695005662841e-06, + "loss": 0.4523, + "step": 979 + }, + { + "epoch": 0.4513399153737659, + "grad_norm": 2.9102863855372747, + "learning_rate": 1.2052345616188177e-06, + "loss": 0.5282, + "step": 980 + }, + { + "epoch": 0.4518004663078207, + "grad_norm": 3.1966574178250116, + "learning_rate": 1.2037736604382277e-06, + "loss": 0.5015, + "step": 981 + }, + { + "epoch": 0.4522610172418756, + "grad_norm": 2.89336588474616, + "learning_rate": 1.2023123053750613e-06, + "loss": 0.5461, + "step": 982 + }, + { + "epoch": 0.45272156817593046, + "grad_norm": 3.832342329803752, + "learning_rate": 1.2008504996843206e-06, + "loss": 0.6394, + "step": 983 + }, + { + "epoch": 0.4531821191099853, + "grad_norm": 3.4482291994471086, + "learning_rate": 1.1993882466220102e-06, + "loss": 0.6259, + "step": 984 + }, + { + "epoch": 0.4536426700440402, + "grad_norm": 3.611548614335293, + "learning_rate": 1.1979255494451326e-06, + "loss": 0.6066, + "step": 985 + }, + { + "epoch": 0.454103220978095, + "grad_norm": 2.7453352434114713, + "learning_rate": 1.1964624114116784e-06, + "loss": 0.6955, + "step": 986 + }, + { + "epoch": 0.4545637719121499, + "grad_norm": 3.075608198924997, + "learning_rate": 1.194998835780621e-06, + "loss": 0.6155, + "step": 987 + }, + { + "epoch": 0.45502432284620475, + "grad_norm": 3.1080551036437654, + "learning_rate": 1.1935348258119083e-06, + "loss": 0.578, + "step": 988 + }, + { + "epoch": 0.45548487378025965, + "grad_norm": 3.2505311952038234, + "learning_rate": 1.1920703847664546e-06, + "loss": 0.535, + "step": 989 + }, + { + "epoch": 0.4559454247143145, + "grad_norm": 3.0544071466547336, + "learning_rate": 1.190605515906136e-06, + "loss": 0.4539, + "step": 990 + }, + { + "epoch": 0.4564059756483694, + "grad_norm": 2.9834516516156366, + "learning_rate": 1.1891402224937804e-06, + "loss": 0.5145, + "step": 991 + }, + { + "epoch": 0.4568665265824242, + "grad_norm": 3.2098618470017963, + "learning_rate": 1.1876745077931617e-06, + "loss": 0.4956, + "step": 992 + }, + { + "epoch": 0.4573270775164791, + "grad_norm": 3.00024716203184, + "learning_rate": 1.1862083750689923e-06, + "loss": 0.5351, + "step": 993 + }, + { + "epoch": 0.45778762845053395, + "grad_norm": 3.084747400551803, + "learning_rate": 1.1847418275869151e-06, + "loss": 0.5766, + "step": 994 + }, + { + "epoch": 0.45824817938458884, + "grad_norm": 3.04028326868723, + "learning_rate": 1.183274868613498e-06, + "loss": 0.6411, + "step": 995 + }, + { + "epoch": 0.4587087303186437, + "grad_norm": 2.5418869985884354, + "learning_rate": 1.181807501416224e-06, + "loss": 0.4326, + "step": 996 + }, + { + "epoch": 0.4591692812526985, + "grad_norm": 3.5975504622299534, + "learning_rate": 1.1803397292634867e-06, + "loss": 0.6207, + "step": 997 + }, + { + "epoch": 0.4596298321867534, + "grad_norm": 3.385303810357531, + "learning_rate": 1.1788715554245807e-06, + "loss": 0.6074, + "step": 998 + }, + { + "epoch": 0.46009038312080824, + "grad_norm": 3.0211329657757005, + "learning_rate": 1.1774029831696955e-06, + "loss": 0.6681, + "step": 999 + }, + { + "epoch": 0.46055093405486314, + "grad_norm": 3.055209977773643, + "learning_rate": 1.1759340157699088e-06, + "loss": 0.5646, + "step": 1000 + }, + { + "epoch": 0.461011484988918, + "grad_norm": 3.1014875109105753, + "learning_rate": 1.1744646564971777e-06, + "loss": 0.5598, + "step": 1001 + }, + { + "epoch": 0.46147203592297287, + "grad_norm": 3.256357038090698, + "learning_rate": 1.1729949086243319e-06, + "loss": 0.6722, + "step": 1002 + }, + { + "epoch": 0.4619325868570277, + "grad_norm": 2.917952076658023, + "learning_rate": 1.1715247754250673e-06, + "loss": 0.4466, + "step": 1003 + }, + { + "epoch": 0.4623931377910826, + "grad_norm": 2.7874434854895207, + "learning_rate": 1.1700542601739381e-06, + "loss": 0.5989, + "step": 1004 + }, + { + "epoch": 0.46285368872513744, + "grad_norm": 3.0402556453953546, + "learning_rate": 1.1685833661463488e-06, + "loss": 0.4839, + "step": 1005 + }, + { + "epoch": 0.46331423965919233, + "grad_norm": 3.544520931422804, + "learning_rate": 1.1671120966185484e-06, + "loss": 0.5576, + "step": 1006 + }, + { + "epoch": 0.46377479059324717, + "grad_norm": 3.3715770705932204, + "learning_rate": 1.1656404548676219e-06, + "loss": 0.595, + "step": 1007 + }, + { + "epoch": 0.46423534152730206, + "grad_norm": 3.07379805336765, + "learning_rate": 1.1641684441714828e-06, + "loss": 0.5866, + "step": 1008 + }, + { + "epoch": 0.4646958924613569, + "grad_norm": 3.2769110964863413, + "learning_rate": 1.1626960678088677e-06, + "loss": 0.4229, + "step": 1009 + }, + { + "epoch": 0.46515644339541173, + "grad_norm": 3.231165225029422, + "learning_rate": 1.1612233290593264e-06, + "loss": 0.4908, + "step": 1010 + }, + { + "epoch": 0.4656169943294666, + "grad_norm": 2.6717664603336164, + "learning_rate": 1.1597502312032168e-06, + "loss": 0.5189, + "step": 1011 + }, + { + "epoch": 0.46607754526352146, + "grad_norm": 3.5412588764816575, + "learning_rate": 1.158276777521696e-06, + "loss": 0.5227, + "step": 1012 + }, + { + "epoch": 0.46653809619757636, + "grad_norm": 3.367681094901886, + "learning_rate": 1.1568029712967137e-06, + "loss": 0.4772, + "step": 1013 + }, + { + "epoch": 0.4669986471316312, + "grad_norm": 2.9896273143544185, + "learning_rate": 1.1553288158110057e-06, + "loss": 0.6199, + "step": 1014 + }, + { + "epoch": 0.4674591980656861, + "grad_norm": 3.5056640416959346, + "learning_rate": 1.153854314348085e-06, + "loss": 0.507, + "step": 1015 + }, + { + "epoch": 0.4679197489997409, + "grad_norm": 2.8729584735763694, + "learning_rate": 1.152379470192235e-06, + "loss": 0.5681, + "step": 1016 + }, + { + "epoch": 0.4683802999337958, + "grad_norm": 3.1311855689183608, + "learning_rate": 1.1509042866285028e-06, + "loss": 0.5204, + "step": 1017 + }, + { + "epoch": 0.46884085086785066, + "grad_norm": 3.6303555087991493, + "learning_rate": 1.149428766942692e-06, + "loss": 0.5597, + "step": 1018 + }, + { + "epoch": 0.46930140180190555, + "grad_norm": 3.0256164571238795, + "learning_rate": 1.1479529144213537e-06, + "loss": 0.5227, + "step": 1019 + }, + { + "epoch": 0.4697619527359604, + "grad_norm": 2.7202389363979016, + "learning_rate": 1.1464767323517813e-06, + "loss": 0.3788, + "step": 1020 + }, + { + "epoch": 0.4702225036700153, + "grad_norm": 2.9201338750532893, + "learning_rate": 1.145000224022002e-06, + "loss": 0.4937, + "step": 1021 + }, + { + "epoch": 0.4706830546040701, + "grad_norm": 3.387209727218353, + "learning_rate": 1.143523392720769e-06, + "loss": 0.5268, + "step": 1022 + }, + { + "epoch": 0.47114360553812495, + "grad_norm": 2.8811446548900124, + "learning_rate": 1.1420462417375562e-06, + "loss": 0.4288, + "step": 1023 + }, + { + "epoch": 0.47160415647217985, + "grad_norm": 3.416259827843531, + "learning_rate": 1.140568774362549e-06, + "loss": 0.5689, + "step": 1024 + }, + { + "epoch": 0.4720647074062347, + "grad_norm": 3.1462397361747043, + "learning_rate": 1.1390909938866367e-06, + "loss": 0.5057, + "step": 1025 + }, + { + "epoch": 0.4725252583402896, + "grad_norm": 3.064526836533319, + "learning_rate": 1.137612903601407e-06, + "loss": 0.4729, + "step": 1026 + }, + { + "epoch": 0.4729858092743444, + "grad_norm": 3.451553189665636, + "learning_rate": 1.1361345067991375e-06, + "loss": 0.7637, + "step": 1027 + }, + { + "epoch": 0.4734463602083993, + "grad_norm": 2.8877618039147515, + "learning_rate": 1.134655806772788e-06, + "loss": 0.4614, + "step": 1028 + }, + { + "epoch": 0.47390691114245415, + "grad_norm": 3.253794710465501, + "learning_rate": 1.1331768068159946e-06, + "loss": 0.6915, + "step": 1029 + }, + { + "epoch": 0.47436746207650904, + "grad_norm": 3.6020230395836617, + "learning_rate": 1.1316975102230604e-06, + "loss": 0.5978, + "step": 1030 + }, + { + "epoch": 0.4748280130105639, + "grad_norm": 3.2619527720299444, + "learning_rate": 1.1302179202889505e-06, + "loss": 0.5066, + "step": 1031 + }, + { + "epoch": 0.47528856394461877, + "grad_norm": 3.2521154822577687, + "learning_rate": 1.1287380403092816e-06, + "loss": 0.5423, + "step": 1032 + }, + { + "epoch": 0.4757491148786736, + "grad_norm": 2.851595035378972, + "learning_rate": 1.127257873580318e-06, + "loss": 0.493, + "step": 1033 + }, + { + "epoch": 0.4762096658127285, + "grad_norm": 2.813869134619398, + "learning_rate": 1.1257774233989623e-06, + "loss": 0.4491, + "step": 1034 + }, + { + "epoch": 0.47667021674678334, + "grad_norm": 2.8702386818101977, + "learning_rate": 1.1242966930627484e-06, + "loss": 0.4677, + "step": 1035 + }, + { + "epoch": 0.47713076768083823, + "grad_norm": 2.9773792194983235, + "learning_rate": 1.1228156858698343e-06, + "loss": 0.4339, + "step": 1036 + }, + { + "epoch": 0.47759131861489307, + "grad_norm": 2.936870179141437, + "learning_rate": 1.1213344051189939e-06, + "loss": 0.5943, + "step": 1037 + }, + { + "epoch": 0.4780518695489479, + "grad_norm": 3.305259733776011, + "learning_rate": 1.1198528541096115e-06, + "loss": 0.4593, + "step": 1038 + }, + { + "epoch": 0.4785124204830028, + "grad_norm": 3.36680958646396, + "learning_rate": 1.1183710361416727e-06, + "loss": 0.7228, + "step": 1039 + }, + { + "epoch": 0.47897297141705764, + "grad_norm": 3.2573238811104486, + "learning_rate": 1.1168889545157582e-06, + "loss": 0.5007, + "step": 1040 + }, + { + "epoch": 0.47943352235111253, + "grad_norm": 2.9011246681530043, + "learning_rate": 1.1154066125330357e-06, + "loss": 0.5315, + "step": 1041 + }, + { + "epoch": 0.47989407328516737, + "grad_norm": 2.981185948614369, + "learning_rate": 1.1139240134952523e-06, + "loss": 0.5441, + "step": 1042 + }, + { + "epoch": 0.48035462421922226, + "grad_norm": 3.255149727332359, + "learning_rate": 1.1124411607047288e-06, + "loss": 0.5446, + "step": 1043 + }, + { + "epoch": 0.4808151751532771, + "grad_norm": 3.1280648749420132, + "learning_rate": 1.1109580574643503e-06, + "loss": 0.5637, + "step": 1044 + }, + { + "epoch": 0.481275726087332, + "grad_norm": 3.124825859884878, + "learning_rate": 1.10947470707756e-06, + "loss": 0.4994, + "step": 1045 + }, + { + "epoch": 0.4817362770213868, + "grad_norm": 3.66113588450651, + "learning_rate": 1.107991112848352e-06, + "loss": 0.5917, + "step": 1046 + }, + { + "epoch": 0.4821968279554417, + "grad_norm": 3.1021700535523298, + "learning_rate": 1.1065072780812625e-06, + "loss": 0.5356, + "step": 1047 + }, + { + "epoch": 0.48265737888949656, + "grad_norm": 3.3696723546575194, + "learning_rate": 1.1050232060813644e-06, + "loss": 0.5811, + "step": 1048 + }, + { + "epoch": 0.48311792982355145, + "grad_norm": 3.0818897055953434, + "learning_rate": 1.1035389001542595e-06, + "loss": 0.6459, + "step": 1049 + }, + { + "epoch": 0.4835784807576063, + "grad_norm": 3.024745793339092, + "learning_rate": 1.1020543636060683e-06, + "loss": 0.5107, + "step": 1050 + }, + { + "epoch": 0.4840390316916611, + "grad_norm": 3.2367103732823774, + "learning_rate": 1.100569599743428e-06, + "loss": 0.5783, + "step": 1051 + }, + { + "epoch": 0.484499582625716, + "grad_norm": 3.2724898116979797, + "learning_rate": 1.09908461187348e-06, + "loss": 0.6343, + "step": 1052 + }, + { + "epoch": 0.48496013355977086, + "grad_norm": 2.6886278432787836, + "learning_rate": 1.0975994033038656e-06, + "loss": 0.5134, + "step": 1053 + }, + { + "epoch": 0.48542068449382575, + "grad_norm": 3.5676405092063175, + "learning_rate": 1.0961139773427171e-06, + "loss": 0.6208, + "step": 1054 + }, + { + "epoch": 0.4858812354278806, + "grad_norm": 2.8882955010908598, + "learning_rate": 1.0946283372986516e-06, + "loss": 0.5888, + "step": 1055 + }, + { + "epoch": 0.4863417863619355, + "grad_norm": 3.5114513574479087, + "learning_rate": 1.0931424864807623e-06, + "loss": 0.57, + "step": 1056 + }, + { + "epoch": 0.4868023372959903, + "grad_norm": 3.0588259590216236, + "learning_rate": 1.0916564281986133e-06, + "loss": 0.5229, + "step": 1057 + }, + { + "epoch": 0.4872628882300452, + "grad_norm": 3.0749972740879348, + "learning_rate": 1.0901701657622291e-06, + "loss": 0.5475, + "step": 1058 + }, + { + "epoch": 0.48772343916410005, + "grad_norm": 3.6869592539698215, + "learning_rate": 1.0886837024820897e-06, + "loss": 0.6199, + "step": 1059 + }, + { + "epoch": 0.48818399009815494, + "grad_norm": 3.3785618023616797, + "learning_rate": 1.0871970416691227e-06, + "loss": 0.6568, + "step": 1060 + }, + { + "epoch": 0.4886445410322098, + "grad_norm": 3.2577480072609193, + "learning_rate": 1.085710186634695e-06, + "loss": 0.6063, + "step": 1061 + }, + { + "epoch": 0.48910509196626467, + "grad_norm": 2.930851386132042, + "learning_rate": 1.0842231406906076e-06, + "loss": 0.5489, + "step": 1062 + }, + { + "epoch": 0.4895656429003195, + "grad_norm": 3.056596065641448, + "learning_rate": 1.0827359071490845e-06, + "loss": 0.5766, + "step": 1063 + }, + { + "epoch": 0.49002619383437435, + "grad_norm": 3.3438308914593686, + "learning_rate": 1.0812484893227688e-06, + "loss": 0.5343, + "step": 1064 + }, + { + "epoch": 0.49048674476842924, + "grad_norm": 3.2170156199513684, + "learning_rate": 1.079760890524715e-06, + "loss": 0.4873, + "step": 1065 + }, + { + "epoch": 0.4909472957024841, + "grad_norm": 3.3949566731113276, + "learning_rate": 1.0782731140683784e-06, + "loss": 0.6256, + "step": 1066 + }, + { + "epoch": 0.49140784663653897, + "grad_norm": 3.8201701220980815, + "learning_rate": 1.0767851632676119e-06, + "loss": 0.5645, + "step": 1067 + }, + { + "epoch": 0.4918683975705938, + "grad_norm": 3.321842231119507, + "learning_rate": 1.0752970414366561e-06, + "loss": 0.5429, + "step": 1068 + }, + { + "epoch": 0.4923289485046487, + "grad_norm": 3.1614648927797493, + "learning_rate": 1.0738087518901326e-06, + "loss": 0.578, + "step": 1069 + }, + { + "epoch": 0.49278949943870354, + "grad_norm": 3.1744346557606957, + "learning_rate": 1.0723202979430364e-06, + "loss": 0.5857, + "step": 1070 + }, + { + "epoch": 0.49325005037275843, + "grad_norm": 3.0752420415036132, + "learning_rate": 1.0708316829107293e-06, + "loss": 0.4874, + "step": 1071 + }, + { + "epoch": 0.49371060130681327, + "grad_norm": 3.139920581664928, + "learning_rate": 1.0693429101089306e-06, + "loss": 0.5561, + "step": 1072 + }, + { + "epoch": 0.49417115224086816, + "grad_norm": 3.3471269540196786, + "learning_rate": 1.0678539828537123e-06, + "loss": 0.4969, + "step": 1073 + }, + { + "epoch": 0.494631703174923, + "grad_norm": 3.026522963165782, + "learning_rate": 1.06636490446149e-06, + "loss": 0.5516, + "step": 1074 + }, + { + "epoch": 0.4950922541089779, + "grad_norm": 2.803007358195874, + "learning_rate": 1.064875678249016e-06, + "loss": 0.5154, + "step": 1075 + }, + { + "epoch": 0.49555280504303273, + "grad_norm": 3.5445642741425165, + "learning_rate": 1.0633863075333712e-06, + "loss": 0.6616, + "step": 1076 + }, + { + "epoch": 0.49601335597708757, + "grad_norm": 3.3956547555358725, + "learning_rate": 1.0618967956319595e-06, + "loss": 0.6653, + "step": 1077 + }, + { + "epoch": 0.49647390691114246, + "grad_norm": 3.236674834459769, + "learning_rate": 1.0604071458624985e-06, + "loss": 0.5443, + "step": 1078 + }, + { + "epoch": 0.4969344578451973, + "grad_norm": 2.943577612886821, + "learning_rate": 1.058917361543013e-06, + "loss": 0.6144, + "step": 1079 + }, + { + "epoch": 0.4973950087792522, + "grad_norm": 3.789273872300937, + "learning_rate": 1.0574274459918279e-06, + "loss": 0.5742, + "step": 1080 + }, + { + "epoch": 0.497855559713307, + "grad_norm": 3.4217054874372423, + "learning_rate": 1.0559374025275595e-06, + "loss": 0.5328, + "step": 1081 + }, + { + "epoch": 0.4983161106473619, + "grad_norm": 3.1895741231305252, + "learning_rate": 1.0544472344691102e-06, + "loss": 0.6774, + "step": 1082 + }, + { + "epoch": 0.49877666158141676, + "grad_norm": 3.2720121537966347, + "learning_rate": 1.0529569451356586e-06, + "loss": 0.5828, + "step": 1083 + }, + { + "epoch": 0.49923721251547165, + "grad_norm": 3.404120717189837, + "learning_rate": 1.051466537846655e-06, + "loss": 0.6939, + "step": 1084 + }, + { + "epoch": 0.4996977634495265, + "grad_norm": 3.3671005760273323, + "learning_rate": 1.049976015921811e-06, + "loss": 0.6451, + "step": 1085 + }, + { + "epoch": 0.5001583143835814, + "grad_norm": 3.529477019557084, + "learning_rate": 1.048485382681094e-06, + "loss": 0.7398, + "step": 1086 + }, + { + "epoch": 0.5006188653176362, + "grad_norm": 2.985478155983292, + "learning_rate": 1.0469946414447196e-06, + "loss": 0.4825, + "step": 1087 + }, + { + "epoch": 0.5010794162516911, + "grad_norm": 3.257171883150608, + "learning_rate": 1.0455037955331447e-06, + "loss": 0.4787, + "step": 1088 + }, + { + "epoch": 0.5015399671857459, + "grad_norm": 2.9340259115329226, + "learning_rate": 1.0440128482670569e-06, + "loss": 0.5687, + "step": 1089 + }, + { + "epoch": 0.5020005181198008, + "grad_norm": 3.7349893867163773, + "learning_rate": 1.0425218029673718e-06, + "loss": 0.502, + "step": 1090 + }, + { + "epoch": 0.5024610690538557, + "grad_norm": 3.1463328721295714, + "learning_rate": 1.0410306629552231e-06, + "loss": 0.5196, + "step": 1091 + }, + { + "epoch": 0.5029216199879105, + "grad_norm": 3.2092568851256007, + "learning_rate": 1.0395394315519541e-06, + "loss": 0.5733, + "step": 1092 + }, + { + "epoch": 0.5033821709219654, + "grad_norm": 3.079164349910014, + "learning_rate": 1.0380481120791136e-06, + "loss": 0.5044, + "step": 1093 + }, + { + "epoch": 0.5038427218560203, + "grad_norm": 2.701558803955672, + "learning_rate": 1.036556707858445e-06, + "loss": 0.4957, + "step": 1094 + }, + { + "epoch": 0.5043032727900751, + "grad_norm": 3.1632512432778226, + "learning_rate": 1.0350652222118807e-06, + "loss": 0.5624, + "step": 1095 + }, + { + "epoch": 0.50476382372413, + "grad_norm": 2.872004805465866, + "learning_rate": 1.0335736584615356e-06, + "loss": 0.5077, + "step": 1096 + }, + { + "epoch": 0.5052243746581848, + "grad_norm": 3.038715190759613, + "learning_rate": 1.0320820199296974e-06, + "loss": 0.487, + "step": 1097 + }, + { + "epoch": 0.5056849255922398, + "grad_norm": 3.1237869179473594, + "learning_rate": 1.0305903099388202e-06, + "loss": 0.418, + "step": 1098 + }, + { + "epoch": 0.5061454765262946, + "grad_norm": 2.9307402726415757, + "learning_rate": 1.0290985318115184e-06, + "loss": 0.5496, + "step": 1099 + }, + { + "epoch": 0.5066060274603494, + "grad_norm": 3.3590679344076193, + "learning_rate": 1.0276066888705574e-06, + "loss": 0.6662, + "step": 1100 + }, + { + "epoch": 0.5070665783944043, + "grad_norm": 3.0656098416878033, + "learning_rate": 1.0261147844388472e-06, + "loss": 0.5917, + "step": 1101 + }, + { + "epoch": 0.5075271293284591, + "grad_norm": 3.082634007004262, + "learning_rate": 1.0246228218394346e-06, + "loss": 0.5372, + "step": 1102 + }, + { + "epoch": 0.5079876802625141, + "grad_norm": 3.5524616274647567, + "learning_rate": 1.023130804395496e-06, + "loss": 0.5913, + "step": 1103 + }, + { + "epoch": 0.5084482311965689, + "grad_norm": 2.9986863183328696, + "learning_rate": 1.0216387354303295e-06, + "loss": 0.4397, + "step": 1104 + }, + { + "epoch": 0.5089087821306237, + "grad_norm": 2.613344675102279, + "learning_rate": 1.0201466182673498e-06, + "loss": 0.4783, + "step": 1105 + }, + { + "epoch": 0.5093693330646786, + "grad_norm": 3.2217947984483, + "learning_rate": 1.0186544562300764e-06, + "loss": 0.5062, + "step": 1106 + }, + { + "epoch": 0.5098298839987335, + "grad_norm": 2.9440216394401904, + "learning_rate": 1.0171622526421304e-06, + "loss": 0.6022, + "step": 1107 + }, + { + "epoch": 0.5102904349327884, + "grad_norm": 2.7773735259088257, + "learning_rate": 1.0156700108272252e-06, + "loss": 0.6232, + "step": 1108 + }, + { + "epoch": 0.5107509858668432, + "grad_norm": 2.941806533194381, + "learning_rate": 1.0141777341091587e-06, + "loss": 0.4926, + "step": 1109 + }, + { + "epoch": 0.511211536800898, + "grad_norm": 3.548025228777574, + "learning_rate": 1.0126854258118074e-06, + "loss": 0.5843, + "step": 1110 + }, + { + "epoch": 0.511672087734953, + "grad_norm": 3.275863911485487, + "learning_rate": 1.011193089259118e-06, + "loss": 0.4883, + "step": 1111 + }, + { + "epoch": 0.5121326386690078, + "grad_norm": 3.0059327106969653, + "learning_rate": 1.009700727775099e-06, + "loss": 0.5342, + "step": 1112 + }, + { + "epoch": 0.5125931896030627, + "grad_norm": 3.0101617328590278, + "learning_rate": 1.008208344683816e-06, + "loss": 0.4742, + "step": 1113 + }, + { + "epoch": 0.5130537405371175, + "grad_norm": 3.752871827480606, + "learning_rate": 1.0067159433093815e-06, + "loss": 0.605, + "step": 1114 + }, + { + "epoch": 0.5135142914711723, + "grad_norm": 3.3469797895498843, + "learning_rate": 1.00522352697595e-06, + "loss": 0.681, + "step": 1115 + }, + { + "epoch": 0.5139748424052273, + "grad_norm": 2.6263493290364246, + "learning_rate": 1.003731099007708e-06, + "loss": 0.4897, + "step": 1116 + }, + { + "epoch": 0.5144353933392821, + "grad_norm": 2.9725521268454003, + "learning_rate": 1.002238662728869e-06, + "loss": 0.5346, + "step": 1117 + }, + { + "epoch": 0.514895944273337, + "grad_norm": 3.049153105281882, + "learning_rate": 1.000746221463664e-06, + "loss": 0.5959, + "step": 1118 + }, + { + "epoch": 0.5153564952073918, + "grad_norm": 2.7288089533635334, + "learning_rate": 9.992537785363361e-07, + "loss": 0.4807, + "step": 1119 + }, + { + "epoch": 0.5158170461414467, + "grad_norm": 3.0318291312947667, + "learning_rate": 9.977613372711308e-07, + "loss": 0.5651, + "step": 1120 + }, + { + "epoch": 0.5162775970755016, + "grad_norm": 2.6295894746203583, + "learning_rate": 9.962689009922918e-07, + "loss": 0.5055, + "step": 1121 + }, + { + "epoch": 0.5167381480095564, + "grad_norm": 3.1101382519689285, + "learning_rate": 9.947764730240501e-07, + "loss": 0.5315, + "step": 1122 + }, + { + "epoch": 0.5171986989436113, + "grad_norm": 3.019675975546636, + "learning_rate": 9.932840566906184e-07, + "loss": 0.5095, + "step": 1123 + }, + { + "epoch": 0.5176592498776662, + "grad_norm": 2.8010710502034897, + "learning_rate": 9.917916553161841e-07, + "loss": 0.4929, + "step": 1124 + }, + { + "epoch": 0.518119800811721, + "grad_norm": 3.0678462211009436, + "learning_rate": 9.90299272224901e-07, + "loss": 0.5167, + "step": 1125 + }, + { + "epoch": 0.5185803517457759, + "grad_norm": 3.134942537587258, + "learning_rate": 9.888069107408824e-07, + "loss": 0.5602, + "step": 1126 + }, + { + "epoch": 0.5190409026798307, + "grad_norm": 2.971903282768269, + "learning_rate": 9.873145741881927e-07, + "loss": 0.6232, + "step": 1127 + }, + { + "epoch": 0.5195014536138856, + "grad_norm": 3.402808011964365, + "learning_rate": 9.858222658908412e-07, + "loss": 0.6225, + "step": 1128 + }, + { + "epoch": 0.5199620045479405, + "grad_norm": 3.0789459137502644, + "learning_rate": 9.84329989172775e-07, + "loss": 0.6397, + "step": 1129 + }, + { + "epoch": 0.5204225554819953, + "grad_norm": 2.9406870784619428, + "learning_rate": 9.828377473578697e-07, + "loss": 0.5466, + "step": 1130 + }, + { + "epoch": 0.5208831064160502, + "grad_norm": 3.364937021577735, + "learning_rate": 9.813455437699237e-07, + "loss": 0.6092, + "step": 1131 + }, + { + "epoch": 0.521343657350105, + "grad_norm": 2.9557279281094724, + "learning_rate": 9.798533817326504e-07, + "loss": 0.5889, + "step": 1132 + }, + { + "epoch": 0.52180420828416, + "grad_norm": 3.7473302217626654, + "learning_rate": 9.783612645696702e-07, + "loss": 0.5108, + "step": 1133 + }, + { + "epoch": 0.5222647592182148, + "grad_norm": 2.973166241915579, + "learning_rate": 9.768691956045042e-07, + "loss": 0.5232, + "step": 1134 + }, + { + "epoch": 0.5227253101522696, + "grad_norm": 3.21594378369249, + "learning_rate": 9.753771781605657e-07, + "loss": 0.5568, + "step": 1135 + }, + { + "epoch": 0.5231858610863245, + "grad_norm": 3.144947147889769, + "learning_rate": 9.73885215561153e-07, + "loss": 0.5373, + "step": 1136 + }, + { + "epoch": 0.5236464120203794, + "grad_norm": 3.2253884546962137, + "learning_rate": 9.723933111294427e-07, + "loss": 0.5413, + "step": 1137 + }, + { + "epoch": 0.5241069629544343, + "grad_norm": 3.223035357344408, + "learning_rate": 9.709014681884815e-07, + "loss": 0.4688, + "step": 1138 + }, + { + "epoch": 0.5245675138884891, + "grad_norm": 3.4567627154136322, + "learning_rate": 9.6940969006118e-07, + "loss": 0.6319, + "step": 1139 + }, + { + "epoch": 0.5250280648225439, + "grad_norm": 3.2739054341505596, + "learning_rate": 9.67917980070303e-07, + "loss": 0.6505, + "step": 1140 + }, + { + "epoch": 0.5254886157565988, + "grad_norm": 3.0010401161709943, + "learning_rate": 9.664263415384643e-07, + "loss": 0.4819, + "step": 1141 + }, + { + "epoch": 0.5259491666906537, + "grad_norm": 3.213300971817815, + "learning_rate": 9.649347777881192e-07, + "loss": 0.4948, + "step": 1142 + }, + { + "epoch": 0.5264097176247086, + "grad_norm": 3.834117009480357, + "learning_rate": 9.634432921415554e-07, + "loss": 0.5202, + "step": 1143 + }, + { + "epoch": 0.5268702685587634, + "grad_norm": 3.0405631156566524, + "learning_rate": 9.619518879208865e-07, + "loss": 0.5498, + "step": 1144 + }, + { + "epoch": 0.5273308194928182, + "grad_norm": 2.736642170245626, + "learning_rate": 9.604605684480458e-07, + "loss": 0.5615, + "step": 1145 + }, + { + "epoch": 0.5277913704268732, + "grad_norm": 3.4155336627808577, + "learning_rate": 9.589693370447768e-07, + "loss": 0.6218, + "step": 1146 + }, + { + "epoch": 0.528251921360928, + "grad_norm": 3.197761407312923, + "learning_rate": 9.574781970326283e-07, + "loss": 0.5547, + "step": 1147 + }, + { + "epoch": 0.5287124722949829, + "grad_norm": 3.025078798504295, + "learning_rate": 9.559871517329434e-07, + "loss": 0.4866, + "step": 1148 + }, + { + "epoch": 0.5291730232290377, + "grad_norm": 3.2517856164626027, + "learning_rate": 9.544962044668555e-07, + "loss": 0.5777, + "step": 1149 + }, + { + "epoch": 0.5296335741630926, + "grad_norm": 2.996348096347719, + "learning_rate": 9.530053585552802e-07, + "loss": 0.5648, + "step": 1150 + }, + { + "epoch": 0.5300941250971475, + "grad_norm": 3.119156079432281, + "learning_rate": 9.515146173189057e-07, + "loss": 0.5534, + "step": 1151 + }, + { + "epoch": 0.5305546760312023, + "grad_norm": 2.984885291180427, + "learning_rate": 9.50023984078189e-07, + "loss": 0.4584, + "step": 1152 + }, + { + "epoch": 0.5310152269652572, + "grad_norm": 3.2174005544489765, + "learning_rate": 9.485334621533453e-07, + "loss": 0.4978, + "step": 1153 + }, + { + "epoch": 0.5314757778993121, + "grad_norm": 2.838123600964648, + "learning_rate": 9.470430548643411e-07, + "loss": 0.5271, + "step": 1154 + }, + { + "epoch": 0.531936328833367, + "grad_norm": 3.2298666663636766, + "learning_rate": 9.455527655308899e-07, + "loss": 0.5232, + "step": 1155 + }, + { + "epoch": 0.5323968797674218, + "grad_norm": 3.7741680239909963, + "learning_rate": 9.440625974724407e-07, + "loss": 0.6279, + "step": 1156 + }, + { + "epoch": 0.5328574307014766, + "grad_norm": 2.9599505842958873, + "learning_rate": 9.425725540081721e-07, + "loss": 0.4482, + "step": 1157 + }, + { + "epoch": 0.5333179816355315, + "grad_norm": 3.0963847317641533, + "learning_rate": 9.410826384569869e-07, + "loss": 0.5946, + "step": 1158 + }, + { + "epoch": 0.5337785325695864, + "grad_norm": 3.0756573597480825, + "learning_rate": 9.395928541375013e-07, + "loss": 0.4987, + "step": 1159 + }, + { + "epoch": 0.5342390835036412, + "grad_norm": 2.96523606047648, + "learning_rate": 9.381032043680405e-07, + "loss": 0.5864, + "step": 1160 + }, + { + "epoch": 0.5346996344376961, + "grad_norm": 2.8034523910128377, + "learning_rate": 9.366136924666288e-07, + "loss": 0.6241, + "step": 1161 + }, + { + "epoch": 0.5351601853717509, + "grad_norm": 3.0938529680087794, + "learning_rate": 9.351243217509842e-07, + "loss": 0.613, + "step": 1162 + }, + { + "epoch": 0.5356207363058059, + "grad_norm": 3.3153575829450728, + "learning_rate": 9.336350955385101e-07, + "loss": 0.5514, + "step": 1163 + }, + { + "epoch": 0.5360812872398607, + "grad_norm": 3.1688595006317124, + "learning_rate": 9.321460171462876e-07, + "loss": 0.5431, + "step": 1164 + }, + { + "epoch": 0.5365418381739155, + "grad_norm": 3.1725759909285123, + "learning_rate": 9.306570898910695e-07, + "loss": 0.4865, + "step": 1165 + }, + { + "epoch": 0.5370023891079704, + "grad_norm": 2.9853416997922406, + "learning_rate": 9.29168317089271e-07, + "loss": 0.5308, + "step": 1166 + }, + { + "epoch": 0.5374629400420253, + "grad_norm": 3.2282727253852053, + "learning_rate": 9.276797020569635e-07, + "loss": 0.5703, + "step": 1167 + }, + { + "epoch": 0.5379234909760802, + "grad_norm": 2.7961799624127734, + "learning_rate": 9.261912481098675e-07, + "loss": 0.4944, + "step": 1168 + }, + { + "epoch": 0.538384041910135, + "grad_norm": 3.3641211633805126, + "learning_rate": 9.24702958563344e-07, + "loss": 0.5362, + "step": 1169 + }, + { + "epoch": 0.5388445928441898, + "grad_norm": 3.101865055004353, + "learning_rate": 9.232148367323882e-07, + "loss": 0.6308, + "step": 1170 + }, + { + "epoch": 0.5393051437782447, + "grad_norm": 3.189825939308207, + "learning_rate": 9.217268859316218e-07, + "loss": 0.5148, + "step": 1171 + }, + { + "epoch": 0.5397656947122996, + "grad_norm": 3.82610831328444, + "learning_rate": 9.202391094752853e-07, + "loss": 0.5346, + "step": 1172 + }, + { + "epoch": 0.5402262456463545, + "grad_norm": 3.2076777255853672, + "learning_rate": 9.187515106772311e-07, + "loss": 0.5662, + "step": 1173 + }, + { + "epoch": 0.5406867965804093, + "grad_norm": 3.4891483553752343, + "learning_rate": 9.172640928509158e-07, + "loss": 0.5638, + "step": 1174 + }, + { + "epoch": 0.5411473475144641, + "grad_norm": 3.5063754666432865, + "learning_rate": 9.157768593093925e-07, + "loss": 0.4947, + "step": 1175 + }, + { + "epoch": 0.5416078984485191, + "grad_norm": 2.803505276485617, + "learning_rate": 9.142898133653047e-07, + "loss": 0.5989, + "step": 1176 + }, + { + "epoch": 0.5420684493825739, + "grad_norm": 3.4189599088321745, + "learning_rate": 9.128029583308773e-07, + "loss": 0.5396, + "step": 1177 + }, + { + "epoch": 0.5425290003166288, + "grad_norm": 3.4562786815564723, + "learning_rate": 9.113162975179104e-07, + "loss": 0.5468, + "step": 1178 + }, + { + "epoch": 0.5429895512506836, + "grad_norm": 3.1130591716501272, + "learning_rate": 9.098298342377711e-07, + "loss": 0.6484, + "step": 1179 + }, + { + "epoch": 0.5434501021847385, + "grad_norm": 3.218340063155536, + "learning_rate": 9.083435718013868e-07, + "loss": 0.5841, + "step": 1180 + }, + { + "epoch": 0.5439106531187934, + "grad_norm": 2.871969590049317, + "learning_rate": 9.068575135192376e-07, + "loss": 0.5347, + "step": 1181 + }, + { + "epoch": 0.5443712040528482, + "grad_norm": 3.1118275872712906, + "learning_rate": 9.053716627013487e-07, + "loss": 0.528, + "step": 1182 + }, + { + "epoch": 0.5448317549869031, + "grad_norm": 3.262925860586302, + "learning_rate": 9.038860226572831e-07, + "loss": 0.6184, + "step": 1183 + }, + { + "epoch": 0.5452923059209579, + "grad_norm": 3.379255799830584, + "learning_rate": 9.024005966961346e-07, + "loss": 0.6056, + "step": 1184 + }, + { + "epoch": 0.5457528568550128, + "grad_norm": 3.1526864635641103, + "learning_rate": 9.009153881265198e-07, + "loss": 0.6153, + "step": 1185 + }, + { + "epoch": 0.5462134077890677, + "grad_norm": 3.2354021477420547, + "learning_rate": 8.994304002565722e-07, + "loss": 0.6598, + "step": 1186 + }, + { + "epoch": 0.5466739587231225, + "grad_norm": 3.3399627148509077, + "learning_rate": 8.979456363939317e-07, + "loss": 0.5406, + "step": 1187 + }, + { + "epoch": 0.5471345096571774, + "grad_norm": 3.2227858379890106, + "learning_rate": 8.964610998457407e-07, + "loss": 0.4731, + "step": 1188 + }, + { + "epoch": 0.5475950605912323, + "grad_norm": 3.128161422108759, + "learning_rate": 8.949767939186356e-07, + "loss": 0.5487, + "step": 1189 + }, + { + "epoch": 0.5480556115252871, + "grad_norm": 3.2758864860806076, + "learning_rate": 8.934927219187373e-07, + "loss": 0.5796, + "step": 1190 + }, + { + "epoch": 0.548516162459342, + "grad_norm": 3.0508736285456903, + "learning_rate": 8.920088871516481e-07, + "loss": 0.5156, + "step": 1191 + }, + { + "epoch": 0.5489767133933968, + "grad_norm": 3.454863882802425, + "learning_rate": 8.905252929224402e-07, + "loss": 0.5471, + "step": 1192 + }, + { + "epoch": 0.5494372643274518, + "grad_norm": 3.241696417453311, + "learning_rate": 8.890419425356495e-07, + "loss": 0.6282, + "step": 1193 + }, + { + "epoch": 0.5498978152615066, + "grad_norm": 3.168159151579064, + "learning_rate": 8.875588392952712e-07, + "loss": 0.5145, + "step": 1194 + }, + { + "epoch": 0.5503583661955614, + "grad_norm": 3.496337240225602, + "learning_rate": 8.860759865047475e-07, + "loss": 0.5628, + "step": 1195 + }, + { + "epoch": 0.5508189171296163, + "grad_norm": 3.2304073520392165, + "learning_rate": 8.845933874669644e-07, + "loss": 0.4703, + "step": 1196 + }, + { + "epoch": 0.5512794680636711, + "grad_norm": 3.1834073949679214, + "learning_rate": 8.831110454842418e-07, + "loss": 0.5432, + "step": 1197 + }, + { + "epoch": 0.5517400189977261, + "grad_norm": 3.4899372677451823, + "learning_rate": 8.816289638583272e-07, + "loss": 0.6382, + "step": 1198 + }, + { + "epoch": 0.5522005699317809, + "grad_norm": 2.9457383672792754, + "learning_rate": 8.801471458903885e-07, + "loss": 0.4662, + "step": 1199 + }, + { + "epoch": 0.5526611208658357, + "grad_norm": 3.2935543447014104, + "learning_rate": 8.786655948810062e-07, + "loss": 0.4872, + "step": 1200 + }, + { + "epoch": 0.5531216717998906, + "grad_norm": 3.2241377818141057, + "learning_rate": 8.771843141301658e-07, + "loss": 0.568, + "step": 1201 + }, + { + "epoch": 0.5535822227339455, + "grad_norm": 3.1610954571795853, + "learning_rate": 8.757033069372514e-07, + "loss": 0.5381, + "step": 1202 + }, + { + "epoch": 0.5540427736680004, + "grad_norm": 2.8053997458619193, + "learning_rate": 8.742225766010375e-07, + "loss": 0.6084, + "step": 1203 + }, + { + "epoch": 0.5545033246020552, + "grad_norm": 3.2358176609289977, + "learning_rate": 8.727421264196819e-07, + "loss": 0.5904, + "step": 1204 + }, + { + "epoch": 0.55496387553611, + "grad_norm": 3.0926449818062127, + "learning_rate": 8.712619596907187e-07, + "loss": 0.5746, + "step": 1205 + }, + { + "epoch": 0.555424426470165, + "grad_norm": 2.941504999556666, + "learning_rate": 8.697820797110498e-07, + "loss": 0.5781, + "step": 1206 + }, + { + "epoch": 0.5558849774042198, + "grad_norm": 2.871854054187517, + "learning_rate": 8.683024897769395e-07, + "loss": 0.4819, + "step": 1207 + }, + { + "epoch": 0.5563455283382747, + "grad_norm": 3.357101022208657, + "learning_rate": 8.668231931840053e-07, + "loss": 0.5112, + "step": 1208 + }, + { + "epoch": 0.5568060792723295, + "grad_norm": 2.926617988170589, + "learning_rate": 8.653441932272118e-07, + "loss": 0.5429, + "step": 1209 + }, + { + "epoch": 0.5572666302063843, + "grad_norm": 2.8511154209063703, + "learning_rate": 8.638654932008626e-07, + "loss": 0.589, + "step": 1210 + }, + { + "epoch": 0.5577271811404393, + "grad_norm": 3.0744117053038984, + "learning_rate": 8.623870963985929e-07, + "loss": 0.4968, + "step": 1211 + }, + { + "epoch": 0.5581877320744941, + "grad_norm": 3.2017070331025663, + "learning_rate": 8.609090061133633e-07, + "loss": 0.483, + "step": 1212 + }, + { + "epoch": 0.558648283008549, + "grad_norm": 3.4242826485145024, + "learning_rate": 8.594312256374512e-07, + "loss": 0.6987, + "step": 1213 + }, + { + "epoch": 0.5591088339426038, + "grad_norm": 3.215000736037071, + "learning_rate": 8.579537582624437e-07, + "loss": 0.569, + "step": 1214 + }, + { + "epoch": 0.5595693848766587, + "grad_norm": 3.21779173610873, + "learning_rate": 8.564766072792311e-07, + "loss": 0.5391, + "step": 1215 + }, + { + "epoch": 0.5600299358107136, + "grad_norm": 3.131067041749219, + "learning_rate": 8.54999775977998e-07, + "loss": 0.5869, + "step": 1216 + }, + { + "epoch": 0.5604904867447684, + "grad_norm": 3.4662747526743347, + "learning_rate": 8.535232676482189e-07, + "loss": 0.5868, + "step": 1217 + }, + { + "epoch": 0.5609510376788233, + "grad_norm": 3.428017059923847, + "learning_rate": 8.520470855786466e-07, + "loss": 0.6251, + "step": 1218 + }, + { + "epoch": 0.5614115886128782, + "grad_norm": 3.3290755200586872, + "learning_rate": 8.505712330573079e-07, + "loss": 0.6237, + "step": 1219 + }, + { + "epoch": 0.561872139546933, + "grad_norm": 2.9893892056090734, + "learning_rate": 8.490957133714973e-07, + "loss": 0.4902, + "step": 1220 + }, + { + "epoch": 0.5623326904809879, + "grad_norm": 3.5120837728765664, + "learning_rate": 8.476205298077649e-07, + "loss": 0.6707, + "step": 1221 + }, + { + "epoch": 0.5627932414150427, + "grad_norm": 3.1658388926282517, + "learning_rate": 8.46145685651915e-07, + "loss": 0.514, + "step": 1222 + }, + { + "epoch": 0.5632537923490976, + "grad_norm": 3.117691401780372, + "learning_rate": 8.446711841889945e-07, + "loss": 0.6758, + "step": 1223 + }, + { + "epoch": 0.5637143432831525, + "grad_norm": 3.000516000189048, + "learning_rate": 8.431970287032861e-07, + "loss": 0.5931, + "step": 1224 + }, + { + "epoch": 0.5641748942172073, + "grad_norm": 2.796362177103071, + "learning_rate": 8.417232224783041e-07, + "loss": 0.475, + "step": 1225 + }, + { + "epoch": 0.5646354451512622, + "grad_norm": 3.1187898436905743, + "learning_rate": 8.402497687967836e-07, + "loss": 0.4477, + "step": 1226 + }, + { + "epoch": 0.565095996085317, + "grad_norm": 3.7014134238719962, + "learning_rate": 8.387766709406735e-07, + "loss": 0.5918, + "step": 1227 + }, + { + "epoch": 0.565556547019372, + "grad_norm": 3.8201360952855854, + "learning_rate": 8.373039321911323e-07, + "loss": 0.6437, + "step": 1228 + }, + { + "epoch": 0.5660170979534268, + "grad_norm": 3.4961060613658907, + "learning_rate": 8.358315558285169e-07, + "loss": 0.6191, + "step": 1229 + }, + { + "epoch": 0.5664776488874816, + "grad_norm": 3.374429880061983, + "learning_rate": 8.343595451323781e-07, + "loss": 0.5717, + "step": 1230 + }, + { + "epoch": 0.5669381998215365, + "grad_norm": 3.5990525265114863, + "learning_rate": 8.328879033814515e-07, + "loss": 0.7007, + "step": 1231 + }, + { + "epoch": 0.5673987507555914, + "grad_norm": 3.43846448225756, + "learning_rate": 8.31416633853651e-07, + "loss": 0.5106, + "step": 1232 + }, + { + "epoch": 0.5678593016896463, + "grad_norm": 3.240170428540241, + "learning_rate": 8.29945739826062e-07, + "loss": 0.6552, + "step": 1233 + }, + { + "epoch": 0.5683198526237011, + "grad_norm": 3.268819388844623, + "learning_rate": 8.284752245749327e-07, + "loss": 0.563, + "step": 1234 + }, + { + "epoch": 0.5687804035577559, + "grad_norm": 3.322362445609891, + "learning_rate": 8.270050913756683e-07, + "loss": 0.6392, + "step": 1235 + }, + { + "epoch": 0.5692409544918108, + "grad_norm": 3.4868505757465855, + "learning_rate": 8.255353435028226e-07, + "loss": 0.5105, + "step": 1236 + }, + { + "epoch": 0.5697015054258657, + "grad_norm": 3.7205750052168556, + "learning_rate": 8.240659842300912e-07, + "loss": 0.555, + "step": 1237 + }, + { + "epoch": 0.5701620563599206, + "grad_norm": 3.3997467433257222, + "learning_rate": 8.225970168303045e-07, + "loss": 0.5784, + "step": 1238 + }, + { + "epoch": 0.5706226072939754, + "grad_norm": 3.666481631082462, + "learning_rate": 8.211284445754197e-07, + "loss": 0.6415, + "step": 1239 + }, + { + "epoch": 0.5710831582280302, + "grad_norm": 3.030350707526125, + "learning_rate": 8.196602707365134e-07, + "loss": 0.5301, + "step": 1240 + }, + { + "epoch": 0.5715437091620852, + "grad_norm": 3.4386931144286104, + "learning_rate": 8.18192498583776e-07, + "loss": 0.4771, + "step": 1241 + }, + { + "epoch": 0.57200426009614, + "grad_norm": 3.6879705378930017, + "learning_rate": 8.16725131386502e-07, + "loss": 0.4923, + "step": 1242 + }, + { + "epoch": 0.5724648110301949, + "grad_norm": 3.258657524610071, + "learning_rate": 8.152581724130849e-07, + "loss": 0.4746, + "step": 1243 + }, + { + "epoch": 0.5729253619642497, + "grad_norm": 3.207585199421819, + "learning_rate": 8.13791624931008e-07, + "loss": 0.6232, + "step": 1244 + }, + { + "epoch": 0.5733859128983047, + "grad_norm": 3.1524462601648535, + "learning_rate": 8.123254922068383e-07, + "loss": 0.6256, + "step": 1245 + }, + { + "epoch": 0.5738464638323595, + "grad_norm": 2.829223353488783, + "learning_rate": 8.108597775062199e-07, + "loss": 0.5638, + "step": 1246 + }, + { + "epoch": 0.5743070147664143, + "grad_norm": 2.991108516511504, + "learning_rate": 8.093944840938638e-07, + "loss": 0.528, + "step": 1247 + }, + { + "epoch": 0.5747675657004692, + "grad_norm": 3.122195079526398, + "learning_rate": 8.079296152335454e-07, + "loss": 0.5076, + "step": 1248 + }, + { + "epoch": 0.575228116634524, + "grad_norm": 2.8771471658342227, + "learning_rate": 8.06465174188092e-07, + "loss": 0.4779, + "step": 1249 + }, + { + "epoch": 0.575688667568579, + "grad_norm": 2.9430839805871236, + "learning_rate": 8.050011642193787e-07, + "loss": 0.5621, + "step": 1250 + }, + { + "epoch": 0.5761492185026338, + "grad_norm": 3.2638073071160147, + "learning_rate": 8.035375885883217e-07, + "loss": 0.581, + "step": 1251 + }, + { + "epoch": 0.5766097694366886, + "grad_norm": 2.8037502705877397, + "learning_rate": 8.020744505548678e-07, + "loss": 0.4642, + "step": 1252 + }, + { + "epoch": 0.5770703203707435, + "grad_norm": 3.009407280040476, + "learning_rate": 8.006117533779897e-07, + "loss": 0.5578, + "step": 1253 + }, + { + "epoch": 0.5775308713047984, + "grad_norm": 2.7699074199330163, + "learning_rate": 7.991495003156799e-07, + "loss": 0.548, + "step": 1254 + }, + { + "epoch": 0.5779914222388532, + "grad_norm": 3.105688826369827, + "learning_rate": 7.976876946249385e-07, + "loss": 0.4663, + "step": 1255 + }, + { + "epoch": 0.5784519731729081, + "grad_norm": 3.111402743097655, + "learning_rate": 7.962263395617723e-07, + "loss": 0.6285, + "step": 1256 + }, + { + "epoch": 0.5789125241069629, + "grad_norm": 2.740304619765724, + "learning_rate": 7.947654383811826e-07, + "loss": 0.6369, + "step": 1257 + }, + { + "epoch": 0.5793730750410179, + "grad_norm": 3.321527884757525, + "learning_rate": 7.933049943371591e-07, + "loss": 0.5833, + "step": 1258 + }, + { + "epoch": 0.5798336259750727, + "grad_norm": 3.3544814320822325, + "learning_rate": 7.918450106826756e-07, + "loss": 0.4738, + "step": 1259 + }, + { + "epoch": 0.5802941769091275, + "grad_norm": 2.8103939697101104, + "learning_rate": 7.903854906696783e-07, + "loss": 0.4384, + "step": 1260 + }, + { + "epoch": 0.5807547278431824, + "grad_norm": 2.9663453333883867, + "learning_rate": 7.889264375490819e-07, + "loss": 0.5188, + "step": 1261 + }, + { + "epoch": 0.5812152787772372, + "grad_norm": 3.2684264262146296, + "learning_rate": 7.874678545707605e-07, + "loss": 0.4919, + "step": 1262 + }, + { + "epoch": 0.5816758297112922, + "grad_norm": 3.2619536382755183, + "learning_rate": 7.86009744983541e-07, + "loss": 0.5307, + "step": 1263 + }, + { + "epoch": 0.582136380645347, + "grad_norm": 3.4192324602148094, + "learning_rate": 7.845521120351967e-07, + "loss": 0.5289, + "step": 1264 + }, + { + "epoch": 0.5825969315794018, + "grad_norm": 3.359910314026855, + "learning_rate": 7.830949589724381e-07, + "loss": 0.6354, + "step": 1265 + }, + { + "epoch": 0.5830574825134567, + "grad_norm": 2.824340063484425, + "learning_rate": 7.816382890409079e-07, + "loss": 0.4714, + "step": 1266 + }, + { + "epoch": 0.5835180334475116, + "grad_norm": 3.0782272088831473, + "learning_rate": 7.80182105485172e-07, + "loss": 0.606, + "step": 1267 + }, + { + "epoch": 0.5839785843815665, + "grad_norm": 3.4682328165851817, + "learning_rate": 7.787264115487125e-07, + "loss": 0.6244, + "step": 1268 + }, + { + "epoch": 0.5844391353156213, + "grad_norm": 3.094844298682695, + "learning_rate": 7.772712104739225e-07, + "loss": 0.6137, + "step": 1269 + }, + { + "epoch": 0.5848996862496761, + "grad_norm": 3.1487313141421818, + "learning_rate": 7.758165055020959e-07, + "loss": 0.4985, + "step": 1270 + }, + { + "epoch": 0.5853602371837311, + "grad_norm": 3.0555631645161974, + "learning_rate": 7.743622998734216e-07, + "loss": 0.5003, + "step": 1271 + }, + { + "epoch": 0.5858207881177859, + "grad_norm": 2.89659469677659, + "learning_rate": 7.729085968269775e-07, + "loss": 0.5633, + "step": 1272 + }, + { + "epoch": 0.5862813390518408, + "grad_norm": 3.0731709449259954, + "learning_rate": 7.714553996007207e-07, + "loss": 0.6242, + "step": 1273 + }, + { + "epoch": 0.5867418899858956, + "grad_norm": 3.5726138177591227, + "learning_rate": 7.700027114314824e-07, + "loss": 0.5221, + "step": 1274 + }, + { + "epoch": 0.5872024409199504, + "grad_norm": 2.7589540594281448, + "learning_rate": 7.685505355549599e-07, + "loss": 0.5246, + "step": 1275 + }, + { + "epoch": 0.5876629918540054, + "grad_norm": 2.7404624681182486, + "learning_rate": 7.670988752057087e-07, + "loss": 0.4816, + "step": 1276 + }, + { + "epoch": 0.5881235427880602, + "grad_norm": 3.6053355624856906, + "learning_rate": 7.656477336171372e-07, + "loss": 0.5095, + "step": 1277 + }, + { + "epoch": 0.5885840937221151, + "grad_norm": 3.127209652870214, + "learning_rate": 7.64197114021497e-07, + "loss": 0.5685, + "step": 1278 + }, + { + "epoch": 0.5890446446561699, + "grad_norm": 3.3101979407630506, + "learning_rate": 7.627470196498788e-07, + "loss": 0.6193, + "step": 1279 + }, + { + "epoch": 0.5895051955902249, + "grad_norm": 3.0195380063665938, + "learning_rate": 7.612974537322015e-07, + "loss": 0.6205, + "step": 1280 + }, + { + "epoch": 0.5899657465242797, + "grad_norm": 3.130560991980397, + "learning_rate": 7.598484194972076e-07, + "loss": 0.5309, + "step": 1281 + }, + { + "epoch": 0.5904262974583345, + "grad_norm": 2.875522406498749, + "learning_rate": 7.583999201724565e-07, + "loss": 0.5542, + "step": 1282 + }, + { + "epoch": 0.5908868483923894, + "grad_norm": 3.0420530994023336, + "learning_rate": 7.569519589843144e-07, + "loss": 0.5288, + "step": 1283 + }, + { + "epoch": 0.5913473993264443, + "grad_norm": 2.9257495629106565, + "learning_rate": 7.555045391579492e-07, + "loss": 0.5173, + "step": 1284 + }, + { + "epoch": 0.5918079502604991, + "grad_norm": 3.0015108226030085, + "learning_rate": 7.540576639173247e-07, + "loss": 0.4839, + "step": 1285 + }, + { + "epoch": 0.592268501194554, + "grad_norm": 2.7862911331282447, + "learning_rate": 7.526113364851891e-07, + "loss": 0.5727, + "step": 1286 + }, + { + "epoch": 0.5927290521286088, + "grad_norm": 3.101011663748682, + "learning_rate": 7.511655600830727e-07, + "loss": 0.4828, + "step": 1287 + }, + { + "epoch": 0.5931896030626637, + "grad_norm": 2.8268391388305663, + "learning_rate": 7.497203379312771e-07, + "loss": 0.471, + "step": 1288 + }, + { + "epoch": 0.5936501539967186, + "grad_norm": 2.9519498437477503, + "learning_rate": 7.482756732488691e-07, + "loss": 0.5866, + "step": 1289 + }, + { + "epoch": 0.5941107049307734, + "grad_norm": 2.8204629128276255, + "learning_rate": 7.468315692536755e-07, + "loss": 0.5342, + "step": 1290 + }, + { + "epoch": 0.5945712558648283, + "grad_norm": 3.1769123313998913, + "learning_rate": 7.453880291622725e-07, + "loss": 0.4381, + "step": 1291 + }, + { + "epoch": 0.5950318067988831, + "grad_norm": 3.014532693904851, + "learning_rate": 7.439450561899813e-07, + "loss": 0.4928, + "step": 1292 + }, + { + "epoch": 0.5954923577329381, + "grad_norm": 3.0757999291111813, + "learning_rate": 7.425026535508593e-07, + "loss": 0.5666, + "step": 1293 + }, + { + "epoch": 0.5959529086669929, + "grad_norm": 3.3979586211632555, + "learning_rate": 7.410608244576937e-07, + "loss": 0.5643, + "step": 1294 + }, + { + "epoch": 0.5964134596010477, + "grad_norm": 3.259635269308452, + "learning_rate": 7.396195721219945e-07, + "loss": 0.5211, + "step": 1295 + }, + { + "epoch": 0.5968740105351026, + "grad_norm": 2.9795483102185014, + "learning_rate": 7.381788997539868e-07, + "loss": 0.5127, + "step": 1296 + }, + { + "epoch": 0.5973345614691575, + "grad_norm": 3.296329788058233, + "learning_rate": 7.367388105626036e-07, + "loss": 0.5219, + "step": 1297 + }, + { + "epoch": 0.5977951124032124, + "grad_norm": 3.2181878786157205, + "learning_rate": 7.352993077554798e-07, + "loss": 0.4597, + "step": 1298 + }, + { + "epoch": 0.5982556633372672, + "grad_norm": 2.952432694766975, + "learning_rate": 7.33860394538943e-07, + "loss": 0.5352, + "step": 1299 + }, + { + "epoch": 0.598716214271322, + "grad_norm": 2.9260033657087505, + "learning_rate": 7.324220741180088e-07, + "loss": 0.5421, + "step": 1300 + }, + { + "epoch": 0.5991767652053769, + "grad_norm": 3.211788941840772, + "learning_rate": 7.309843496963715e-07, + "loss": 0.5918, + "step": 1301 + }, + { + "epoch": 0.5996373161394318, + "grad_norm": 2.6095863519936033, + "learning_rate": 7.295472244763981e-07, + "loss": 0.4312, + "step": 1302 + }, + { + "epoch": 0.6000978670734867, + "grad_norm": 2.8768992985690374, + "learning_rate": 7.281107016591213e-07, + "loss": 0.4502, + "step": 1303 + }, + { + "epoch": 0.6005584180075415, + "grad_norm": 3.0788536260420947, + "learning_rate": 7.266747844442315e-07, + "loss": 0.6204, + "step": 1304 + }, + { + "epoch": 0.6010189689415963, + "grad_norm": 3.326645359641706, + "learning_rate": 7.252394760300707e-07, + "loss": 0.6226, + "step": 1305 + }, + { + "epoch": 0.6014795198756513, + "grad_norm": 3.211498413419631, + "learning_rate": 7.238047796136246e-07, + "loss": 0.5597, + "step": 1306 + }, + { + "epoch": 0.6019400708097061, + "grad_norm": 3.2233758236272863, + "learning_rate": 7.223706983905153e-07, + "loss": 0.5445, + "step": 1307 + }, + { + "epoch": 0.602400621743761, + "grad_norm": 3.47488384908334, + "learning_rate": 7.209372355549956e-07, + "loss": 0.5055, + "step": 1308 + }, + { + "epoch": 0.6028611726778158, + "grad_norm": 3.2688153234505664, + "learning_rate": 7.195043942999404e-07, + "loss": 0.6033, + "step": 1309 + }, + { + "epoch": 0.6033217236118708, + "grad_norm": 2.95887248920406, + "learning_rate": 7.180721778168397e-07, + "loss": 0.4693, + "step": 1310 + }, + { + "epoch": 0.6037822745459256, + "grad_norm": 3.132075461669666, + "learning_rate": 7.166405892957925e-07, + "loss": 0.602, + "step": 1311 + }, + { + "epoch": 0.6042428254799804, + "grad_norm": 3.177087477555439, + "learning_rate": 7.152096319254988e-07, + "loss": 0.5589, + "step": 1312 + }, + { + "epoch": 0.6047033764140353, + "grad_norm": 3.189488580952695, + "learning_rate": 7.137793088932533e-07, + "loss": 0.6641, + "step": 1313 + }, + { + "epoch": 0.6051639273480901, + "grad_norm": 2.9722614592308263, + "learning_rate": 7.123496233849367e-07, + "loss": 0.5488, + "step": 1314 + }, + { + "epoch": 0.605624478282145, + "grad_norm": 2.7544767985522443, + "learning_rate": 7.109205785850106e-07, + "loss": 0.4281, + "step": 1315 + }, + { + "epoch": 0.6060850292161999, + "grad_norm": 3.3634419751440388, + "learning_rate": 7.094921776765094e-07, + "loss": 0.509, + "step": 1316 + }, + { + "epoch": 0.6065455801502547, + "grad_norm": 2.9636798760844845, + "learning_rate": 7.080644238410325e-07, + "loss": 0.5577, + "step": 1317 + }, + { + "epoch": 0.6070061310843096, + "grad_norm": 3.4072070822619063, + "learning_rate": 7.066373202587397e-07, + "loss": 0.6024, + "step": 1318 + }, + { + "epoch": 0.6074666820183645, + "grad_norm": 3.107976483566616, + "learning_rate": 7.052108701083407e-07, + "loss": 0.5571, + "step": 1319 + }, + { + "epoch": 0.6079272329524193, + "grad_norm": 2.965152268025164, + "learning_rate": 7.0378507656709e-07, + "loss": 0.5274, + "step": 1320 + }, + { + "epoch": 0.6083877838864742, + "grad_norm": 2.910489708230388, + "learning_rate": 7.023599428107814e-07, + "loss": 0.4719, + "step": 1321 + }, + { + "epoch": 0.608848334820529, + "grad_norm": 3.1173838425844553, + "learning_rate": 7.009354720137364e-07, + "loss": 0.5094, + "step": 1322 + }, + { + "epoch": 0.609308885754584, + "grad_norm": 2.797005334303556, + "learning_rate": 6.995116673488014e-07, + "loss": 0.6097, + "step": 1323 + }, + { + "epoch": 0.6097694366886388, + "grad_norm": 3.0227555124071928, + "learning_rate": 6.980885319873397e-07, + "loss": 0.5937, + "step": 1324 + }, + { + "epoch": 0.6102299876226936, + "grad_norm": 2.777519608456516, + "learning_rate": 6.966660690992214e-07, + "loss": 0.4724, + "step": 1325 + }, + { + "epoch": 0.6106905385567485, + "grad_norm": 3.095622108896745, + "learning_rate": 6.952442818528219e-07, + "loss": 0.5058, + "step": 1326 + }, + { + "epoch": 0.6111510894908033, + "grad_norm": 3.334941539522611, + "learning_rate": 6.938231734150093e-07, + "loss": 0.6349, + "step": 1327 + }, + { + "epoch": 0.6116116404248583, + "grad_norm": 3.059763555645584, + "learning_rate": 6.9240274695114e-07, + "loss": 0.4978, + "step": 1328 + }, + { + "epoch": 0.6120721913589131, + "grad_norm": 3.115604189349727, + "learning_rate": 6.909830056250526e-07, + "loss": 0.6158, + "step": 1329 + }, + { + "epoch": 0.612532742292968, + "grad_norm": 3.5110850133149913, + "learning_rate": 6.895639525990586e-07, + "loss": 0.6059, + "step": 1330 + }, + { + "epoch": 0.6129932932270228, + "grad_norm": 3.1973315575132717, + "learning_rate": 6.881455910339369e-07, + "loss": 0.5804, + "step": 1331 + }, + { + "epoch": 0.6134538441610777, + "grad_norm": 3.2520702824243335, + "learning_rate": 6.867279240889259e-07, + "loss": 0.5976, + "step": 1332 + }, + { + "epoch": 0.6139143950951326, + "grad_norm": 3.447876365367868, + "learning_rate": 6.853109549217166e-07, + "loss": 0.6177, + "step": 1333 + }, + { + "epoch": 0.6143749460291874, + "grad_norm": 3.1913415774468676, + "learning_rate": 6.838946866884467e-07, + "loss": 0.6005, + "step": 1334 + }, + { + "epoch": 0.6148354969632422, + "grad_norm": 3.445057324348534, + "learning_rate": 6.824791225436918e-07, + "loss": 0.5425, + "step": 1335 + }, + { + "epoch": 0.6152960478972972, + "grad_norm": 3.7985479498424786, + "learning_rate": 6.810642656404596e-07, + "loss": 0.5883, + "step": 1336 + }, + { + "epoch": 0.615756598831352, + "grad_norm": 2.9721621801480067, + "learning_rate": 6.796501191301824e-07, + "loss": 0.5032, + "step": 1337 + }, + { + "epoch": 0.6162171497654069, + "grad_norm": 2.709696747780453, + "learning_rate": 6.782366861627101e-07, + "loss": 0.408, + "step": 1338 + }, + { + "epoch": 0.6166777006994617, + "grad_norm": 2.797272959212533, + "learning_rate": 6.768239698863033e-07, + "loss": 0.4895, + "step": 1339 + }, + { + "epoch": 0.6171382516335165, + "grad_norm": 2.687895200110118, + "learning_rate": 6.754119734476266e-07, + "loss": 0.4482, + "step": 1340 + }, + { + "epoch": 0.6175988025675715, + "grad_norm": 3.0721198321588887, + "learning_rate": 6.740006999917405e-07, + "loss": 0.6072, + "step": 1341 + }, + { + "epoch": 0.6180593535016263, + "grad_norm": 2.9971206089309823, + "learning_rate": 6.725901526620959e-07, + "loss": 0.596, + "step": 1342 + }, + { + "epoch": 0.6185199044356812, + "grad_norm": 2.9204887920199987, + "learning_rate": 6.711803346005258e-07, + "loss": 0.4545, + "step": 1343 + }, + { + "epoch": 0.618980455369736, + "grad_norm": 3.208202279508134, + "learning_rate": 6.697712489472395e-07, + "loss": 0.6018, + "step": 1344 + }, + { + "epoch": 0.619441006303791, + "grad_norm": 2.698911787184527, + "learning_rate": 6.68362898840814e-07, + "loss": 0.5294, + "step": 1345 + }, + { + "epoch": 0.6199015572378458, + "grad_norm": 2.628724441436368, + "learning_rate": 6.669552874181888e-07, + "loss": 0.4935, + "step": 1346 + }, + { + "epoch": 0.6203621081719006, + "grad_norm": 3.6333914395043996, + "learning_rate": 6.655484178146576e-07, + "loss": 0.5121, + "step": 1347 + }, + { + "epoch": 0.6208226591059555, + "grad_norm": 2.840757981463247, + "learning_rate": 6.641422931638614e-07, + "loss": 0.5378, + "step": 1348 + }, + { + "epoch": 0.6212832100400104, + "grad_norm": 3.119838224826943, + "learning_rate": 6.627369165977837e-07, + "loss": 0.5759, + "step": 1349 + }, + { + "epoch": 0.6217437609740653, + "grad_norm": 2.854001213576483, + "learning_rate": 6.613322912467392e-07, + "loss": 0.4695, + "step": 1350 + }, + { + "epoch": 0.6222043119081201, + "grad_norm": 3.0516655650363695, + "learning_rate": 6.599284202393708e-07, + "loss": 0.5992, + "step": 1351 + }, + { + "epoch": 0.6226648628421749, + "grad_norm": 3.1765489105582927, + "learning_rate": 6.585253067026417e-07, + "loss": 0.5748, + "step": 1352 + }, + { + "epoch": 0.6231254137762298, + "grad_norm": 3.012215519926939, + "learning_rate": 6.571229537618266e-07, + "loss": 0.5621, + "step": 1353 + }, + { + "epoch": 0.6235859647102847, + "grad_norm": 3.277543278020275, + "learning_rate": 6.557213645405064e-07, + "loss": 0.4244, + "step": 1354 + }, + { + "epoch": 0.6240465156443396, + "grad_norm": 3.070612553633971, + "learning_rate": 6.54320542160562e-07, + "loss": 0.6149, + "step": 1355 + }, + { + "epoch": 0.6245070665783944, + "grad_norm": 3.3615068454127046, + "learning_rate": 6.529204897421643e-07, + "loss": 0.5554, + "step": 1356 + }, + { + "epoch": 0.6249676175124492, + "grad_norm": 3.3561738697585266, + "learning_rate": 6.515212104037713e-07, + "loss": 0.5889, + "step": 1357 + }, + { + "epoch": 0.6254281684465042, + "grad_norm": 3.0933916171873483, + "learning_rate": 6.50122707262118e-07, + "loss": 0.4735, + "step": 1358 + }, + { + "epoch": 0.625888719380559, + "grad_norm": 3.5136171225226622, + "learning_rate": 6.487249834322095e-07, + "loss": 0.6822, + "step": 1359 + }, + { + "epoch": 0.6263492703146138, + "grad_norm": 3.133863907558626, + "learning_rate": 6.473280420273172e-07, + "loss": 0.5195, + "step": 1360 + }, + { + "epoch": 0.6268098212486687, + "grad_norm": 3.302281968664348, + "learning_rate": 6.459318861589685e-07, + "loss": 0.6776, + "step": 1361 + }, + { + "epoch": 0.6272703721827236, + "grad_norm": 3.1331202449009643, + "learning_rate": 6.445365189369411e-07, + "loss": 0.5329, + "step": 1362 + }, + { + "epoch": 0.6277309231167785, + "grad_norm": 3.2107222157004673, + "learning_rate": 6.431419434692563e-07, + "loss": 0.4996, + "step": 1363 + }, + { + "epoch": 0.6281914740508333, + "grad_norm": 3.3986716761760984, + "learning_rate": 6.417481628621717e-07, + "loss": 0.5303, + "step": 1364 + }, + { + "epoch": 0.6286520249848881, + "grad_norm": 3.066320791566163, + "learning_rate": 6.403551802201748e-07, + "loss": 0.5218, + "step": 1365 + }, + { + "epoch": 0.629112575918943, + "grad_norm": 3.2972142393202772, + "learning_rate": 6.389629986459755e-07, + "loss": 0.6122, + "step": 1366 + }, + { + "epoch": 0.6295731268529979, + "grad_norm": 3.1560358559235415, + "learning_rate": 6.375716212404989e-07, + "loss": 0.539, + "step": 1367 + }, + { + "epoch": 0.6300336777870528, + "grad_norm": 3.0343679900830036, + "learning_rate": 6.3618105110288e-07, + "loss": 0.4605, + "step": 1368 + }, + { + "epoch": 0.6304942287211076, + "grad_norm": 3.050511631671474, + "learning_rate": 6.347912913304548e-07, + "loss": 0.6179, + "step": 1369 + }, + { + "epoch": 0.6309547796551624, + "grad_norm": 3.2629876129159627, + "learning_rate": 6.334023450187549e-07, + "loss": 0.5378, + "step": 1370 + }, + { + "epoch": 0.6314153305892174, + "grad_norm": 3.0550921504844113, + "learning_rate": 6.320142152614993e-07, + "loss": 0.5586, + "step": 1371 + }, + { + "epoch": 0.6318758815232722, + "grad_norm": 2.87904745693612, + "learning_rate": 6.306269051505888e-07, + "loss": 0.5824, + "step": 1372 + }, + { + "epoch": 0.6323364324573271, + "grad_norm": 4.120605427481767, + "learning_rate": 6.292404177760986e-07, + "loss": 0.5787, + "step": 1373 + }, + { + "epoch": 0.6327969833913819, + "grad_norm": 3.281112660782104, + "learning_rate": 6.278547562262706e-07, + "loss": 0.5811, + "step": 1374 + }, + { + "epoch": 0.6332575343254369, + "grad_norm": 3.154859004430421, + "learning_rate": 6.264699235875084e-07, + "loss": 0.6182, + "step": 1375 + }, + { + "epoch": 0.6337180852594917, + "grad_norm": 3.0668232663377295, + "learning_rate": 6.250859229443684e-07, + "loss": 0.5067, + "step": 1376 + }, + { + "epoch": 0.6341786361935465, + "grad_norm": 3.060022701793419, + "learning_rate": 6.237027573795538e-07, + "loss": 0.5955, + "step": 1377 + }, + { + "epoch": 0.6346391871276014, + "grad_norm": 3.158763791575397, + "learning_rate": 6.223204299739087e-07, + "loss": 0.6123, + "step": 1378 + }, + { + "epoch": 0.6350997380616562, + "grad_norm": 3.075043672464968, + "learning_rate": 6.209389438064092e-07, + "loss": 0.5822, + "step": 1379 + }, + { + "epoch": 0.6355602889957112, + "grad_norm": 2.7672680741174247, + "learning_rate": 6.19558301954158e-07, + "loss": 0.5106, + "step": 1380 + }, + { + "epoch": 0.636020839929766, + "grad_norm": 2.9893686560154107, + "learning_rate": 6.181785074923777e-07, + "loss": 0.523, + "step": 1381 + }, + { + "epoch": 0.6364813908638208, + "grad_norm": 3.6442869591374722, + "learning_rate": 6.167995634944025e-07, + "loss": 0.6623, + "step": 1382 + }, + { + "epoch": 0.6369419417978757, + "grad_norm": 2.9356555531231527, + "learning_rate": 6.154214730316738e-07, + "loss": 0.4692, + "step": 1383 + }, + { + "epoch": 0.6374024927319306, + "grad_norm": 3.201063109690519, + "learning_rate": 6.1404423917373e-07, + "loss": 0.4439, + "step": 1384 + }, + { + "epoch": 0.6378630436659855, + "grad_norm": 3.0104867074963315, + "learning_rate": 6.126678649882019e-07, + "loss": 0.4768, + "step": 1385 + }, + { + "epoch": 0.6383235946000403, + "grad_norm": 3.327881299870761, + "learning_rate": 6.112923535408073e-07, + "loss": 0.4477, + "step": 1386 + }, + { + "epoch": 0.6387841455340951, + "grad_norm": 3.0269280110830863, + "learning_rate": 6.099177078953397e-07, + "loss": 0.5409, + "step": 1387 + }, + { + "epoch": 0.6392446964681501, + "grad_norm": 3.277885482165906, + "learning_rate": 6.085439311136664e-07, + "loss": 0.6291, + "step": 1388 + }, + { + "epoch": 0.6397052474022049, + "grad_norm": 3.180671239132894, + "learning_rate": 6.071710262557181e-07, + "loss": 0.5016, + "step": 1389 + }, + { + "epoch": 0.6401657983362598, + "grad_norm": 3.056915861723639, + "learning_rate": 6.057989963794832e-07, + "loss": 0.4639, + "step": 1390 + }, + { + "epoch": 0.6406263492703146, + "grad_norm": 2.977693194989121, + "learning_rate": 6.044278445410025e-07, + "loss": 0.5895, + "step": 1391 + }, + { + "epoch": 0.6410869002043694, + "grad_norm": 2.9786799583494266, + "learning_rate": 6.030575737943595e-07, + "loss": 0.4599, + "step": 1392 + }, + { + "epoch": 0.6415474511384244, + "grad_norm": 2.8694400733669227, + "learning_rate": 6.016881871916766e-07, + "loss": 0.6239, + "step": 1393 + }, + { + "epoch": 0.6420080020724792, + "grad_norm": 3.509458526598221, + "learning_rate": 6.003196877831059e-07, + "loss": 0.6965, + "step": 1394 + }, + { + "epoch": 0.642468553006534, + "grad_norm": 3.131981687735861, + "learning_rate": 5.989520786168235e-07, + "loss": 0.5088, + "step": 1395 + }, + { + "epoch": 0.6429291039405889, + "grad_norm": 3.532782457274475, + "learning_rate": 5.975853627390232e-07, + "loss": 0.5129, + "step": 1396 + }, + { + "epoch": 0.6433896548746438, + "grad_norm": 2.5625501669909307, + "learning_rate": 5.962195431939084e-07, + "loss": 0.4677, + "step": 1397 + }, + { + "epoch": 0.6438502058086987, + "grad_norm": 3.27442037003512, + "learning_rate": 5.94854623023686e-07, + "loss": 0.5125, + "step": 1398 + }, + { + "epoch": 0.6443107567427535, + "grad_norm": 3.2218724725996184, + "learning_rate": 5.934906052685603e-07, + "loss": 0.6094, + "step": 1399 + }, + { + "epoch": 0.6447713076768083, + "grad_norm": 3.5948100953063102, + "learning_rate": 5.921274929667251e-07, + "loss": 0.6102, + "step": 1400 + }, + { + "epoch": 0.6452318586108633, + "grad_norm": 3.1588508612291766, + "learning_rate": 5.907652891543576e-07, + "loss": 0.5734, + "step": 1401 + }, + { + "epoch": 0.6456924095449181, + "grad_norm": 3.700691639948302, + "learning_rate": 5.894039968656114e-07, + "loss": 0.5963, + "step": 1402 + }, + { + "epoch": 0.646152960478973, + "grad_norm": 3.2297651679426806, + "learning_rate": 5.880436191326092e-07, + "loss": 0.6396, + "step": 1403 + }, + { + "epoch": 0.6466135114130278, + "grad_norm": 3.241229586568738, + "learning_rate": 5.866841589854381e-07, + "loss": 0.5148, + "step": 1404 + }, + { + "epoch": 0.6470740623470826, + "grad_norm": 3.2127854047817186, + "learning_rate": 5.853256194521395e-07, + "loss": 0.5669, + "step": 1405 + }, + { + "epoch": 0.6475346132811376, + "grad_norm": 3.1150319104457234, + "learning_rate": 5.83968003558706e-07, + "loss": 0.5669, + "step": 1406 + }, + { + "epoch": 0.6479951642151924, + "grad_norm": 3.1285530927395095, + "learning_rate": 5.826113143290717e-07, + "loss": 0.5897, + "step": 1407 + }, + { + "epoch": 0.6484557151492473, + "grad_norm": 3.5323185152016388, + "learning_rate": 5.812555547851068e-07, + "loss": 0.5374, + "step": 1408 + }, + { + "epoch": 0.6489162660833021, + "grad_norm": 3.4748049321150383, + "learning_rate": 5.799007279466111e-07, + "loss": 0.5503, + "step": 1409 + }, + { + "epoch": 0.649376817017357, + "grad_norm": 3.404681472569695, + "learning_rate": 5.785468368313076e-07, + "loss": 0.5303, + "step": 1410 + }, + { + "epoch": 0.6498373679514119, + "grad_norm": 2.9485744210160107, + "learning_rate": 5.77193884454833e-07, + "loss": 0.5113, + "step": 1411 + }, + { + "epoch": 0.6502979188854667, + "grad_norm": 2.7092540440014807, + "learning_rate": 5.758418738307351e-07, + "loss": 0.5, + "step": 1412 + }, + { + "epoch": 0.6507584698195216, + "grad_norm": 2.853364404240714, + "learning_rate": 5.74490807970463e-07, + "loss": 0.5665, + "step": 1413 + }, + { + "epoch": 0.6512190207535765, + "grad_norm": 3.116516600142004, + "learning_rate": 5.731406898833623e-07, + "loss": 0.6661, + "step": 1414 + }, + { + "epoch": 0.6516795716876314, + "grad_norm": 2.8236292161466294, + "learning_rate": 5.717915225766661e-07, + "loss": 0.4524, + "step": 1415 + }, + { + "epoch": 0.6521401226216862, + "grad_norm": 3.0310468470234246, + "learning_rate": 5.704433090554911e-07, + "loss": 0.462, + "step": 1416 + }, + { + "epoch": 0.652600673555741, + "grad_norm": 3.24038506868089, + "learning_rate": 5.690960523228294e-07, + "loss": 0.6213, + "step": 1417 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 3.0890141559456157, + "learning_rate": 5.677497553795409e-07, + "loss": 0.4549, + "step": 1418 + }, + { + "epoch": 0.6535217754238508, + "grad_norm": 2.7880931209315842, + "learning_rate": 5.664044212243489e-07, + "loss": 0.4132, + "step": 1419 + }, + { + "epoch": 0.6539823263579057, + "grad_norm": 2.9558254074326427, + "learning_rate": 5.650600528538318e-07, + "loss": 0.494, + "step": 1420 + }, + { + "epoch": 0.6544428772919605, + "grad_norm": 2.914896993211941, + "learning_rate": 5.637166532624163e-07, + "loss": 0.5937, + "step": 1421 + }, + { + "epoch": 0.6549034282260153, + "grad_norm": 3.292628338829398, + "learning_rate": 5.623742254423718e-07, + "loss": 0.5269, + "step": 1422 + }, + { + "epoch": 0.6553639791600703, + "grad_norm": 3.2242310456562673, + "learning_rate": 5.610327723838037e-07, + "loss": 0.7023, + "step": 1423 + }, + { + "epoch": 0.6558245300941251, + "grad_norm": 3.240542534935448, + "learning_rate": 5.596922970746449e-07, + "loss": 0.5462, + "step": 1424 + }, + { + "epoch": 0.65628508102818, + "grad_norm": 2.949873633276392, + "learning_rate": 5.583528025006513e-07, + "loss": 0.4527, + "step": 1425 + }, + { + "epoch": 0.6567456319622348, + "grad_norm": 3.45439820707799, + "learning_rate": 5.570142916453944e-07, + "loss": 0.5727, + "step": 1426 + }, + { + "epoch": 0.6572061828962897, + "grad_norm": 3.0037709983691574, + "learning_rate": 5.556767674902548e-07, + "loss": 0.5799, + "step": 1427 + }, + { + "epoch": 0.6576667338303446, + "grad_norm": 2.846902934654531, + "learning_rate": 5.54340233014414e-07, + "loss": 0.5848, + "step": 1428 + }, + { + "epoch": 0.6581272847643994, + "grad_norm": 3.193107289231858, + "learning_rate": 5.530046911948505e-07, + "loss": 0.4999, + "step": 1429 + }, + { + "epoch": 0.6585878356984542, + "grad_norm": 3.396826464978981, + "learning_rate": 5.516701450063316e-07, + "loss": 0.4476, + "step": 1430 + }, + { + "epoch": 0.6590483866325091, + "grad_norm": 3.8083097104308505, + "learning_rate": 5.503365974214058e-07, + "loss": 0.6308, + "step": 1431 + }, + { + "epoch": 0.659508937566564, + "grad_norm": 2.9191556934998615, + "learning_rate": 5.490040514103995e-07, + "loss": 0.4147, + "step": 1432 + }, + { + "epoch": 0.6599694885006189, + "grad_norm": 2.8784243722308553, + "learning_rate": 5.476725099414062e-07, + "loss": 0.5174, + "step": 1433 + }, + { + "epoch": 0.6604300394346737, + "grad_norm": 3.350266406134782, + "learning_rate": 5.463419759802817e-07, + "loss": 0.6058, + "step": 1434 + }, + { + "epoch": 0.6608905903687285, + "grad_norm": 3.1483929420773555, + "learning_rate": 5.450124524906401e-07, + "loss": 0.518, + "step": 1435 + }, + { + "epoch": 0.6613511413027835, + "grad_norm": 3.4936502869433137, + "learning_rate": 5.436839424338425e-07, + "loss": 0.5507, + "step": 1436 + }, + { + "epoch": 0.6618116922368383, + "grad_norm": 3.1219573705478125, + "learning_rate": 5.423564487689929e-07, + "loss": 0.4988, + "step": 1437 + }, + { + "epoch": 0.6622722431708932, + "grad_norm": 3.582117403088983, + "learning_rate": 5.410299744529332e-07, + "loss": 0.5612, + "step": 1438 + }, + { + "epoch": 0.662732794104948, + "grad_norm": 3.5187886191292295, + "learning_rate": 5.397045224402326e-07, + "loss": 0.5867, + "step": 1439 + }, + { + "epoch": 0.663193345039003, + "grad_norm": 2.963457737411872, + "learning_rate": 5.383800956831846e-07, + "loss": 0.4929, + "step": 1440 + }, + { + "epoch": 0.6636538959730578, + "grad_norm": 3.4468724975782883, + "learning_rate": 5.370566971317989e-07, + "loss": 0.6108, + "step": 1441 + }, + { + "epoch": 0.6641144469071126, + "grad_norm": 3.1533494269546267, + "learning_rate": 5.357343297337943e-07, + "loss": 0.5902, + "step": 1442 + }, + { + "epoch": 0.6645749978411675, + "grad_norm": 3.3732802906601522, + "learning_rate": 5.344129964345934e-07, + "loss": 0.4952, + "step": 1443 + }, + { + "epoch": 0.6650355487752223, + "grad_norm": 2.9687293866646747, + "learning_rate": 5.330927001773154e-07, + "loss": 0.5435, + "step": 1444 + }, + { + "epoch": 0.6654960997092773, + "grad_norm": 3.480755832372405, + "learning_rate": 5.317734439027699e-07, + "loss": 0.564, + "step": 1445 + }, + { + "epoch": 0.6659566506433321, + "grad_norm": 2.9931832336561564, + "learning_rate": 5.304552305494492e-07, + "loss": 0.4201, + "step": 1446 + }, + { + "epoch": 0.6664172015773869, + "grad_norm": 2.85584265239967, + "learning_rate": 5.291380630535231e-07, + "loss": 0.5615, + "step": 1447 + }, + { + "epoch": 0.6668777525114418, + "grad_norm": 2.8281402193294056, + "learning_rate": 5.278219443488328e-07, + "loss": 0.5043, + "step": 1448 + }, + { + "epoch": 0.6673383034454967, + "grad_norm": 3.0403775949677923, + "learning_rate": 5.265068773668812e-07, + "loss": 0.5079, + "step": 1449 + }, + { + "epoch": 0.6677988543795516, + "grad_norm": 3.3174054936344897, + "learning_rate": 5.251928650368307e-07, + "loss": 0.5716, + "step": 1450 + }, + { + "epoch": 0.6682594053136064, + "grad_norm": 3.081361444525388, + "learning_rate": 5.238799102854941e-07, + "loss": 0.4981, + "step": 1451 + }, + { + "epoch": 0.6687199562476612, + "grad_norm": 3.1883940258418684, + "learning_rate": 5.225680160373275e-07, + "loss": 0.4924, + "step": 1452 + }, + { + "epoch": 0.6691805071817162, + "grad_norm": 3.311273891600756, + "learning_rate": 5.212571852144261e-07, + "loss": 0.5837, + "step": 1453 + }, + { + "epoch": 0.669641058115771, + "grad_norm": 3.138972108234792, + "learning_rate": 5.199474207365162e-07, + "loss": 0.5111, + "step": 1454 + }, + { + "epoch": 0.6701016090498259, + "grad_norm": 2.749902984155162, + "learning_rate": 5.186387255209481e-07, + "loss": 0.4912, + "step": 1455 + }, + { + "epoch": 0.6705621599838807, + "grad_norm": 3.201073779062438, + "learning_rate": 5.173311024826915e-07, + "loss": 0.5993, + "step": 1456 + }, + { + "epoch": 0.6710227109179355, + "grad_norm": 3.054277406726679, + "learning_rate": 5.160245545343274e-07, + "loss": 0.5429, + "step": 1457 + }, + { + "epoch": 0.6714832618519905, + "grad_norm": 3.262974854228882, + "learning_rate": 5.147190845860426e-07, + "loss": 0.67, + "step": 1458 + }, + { + "epoch": 0.6719438127860453, + "grad_norm": 3.2252159866634584, + "learning_rate": 5.134146955456218e-07, + "loss": 0.4783, + "step": 1459 + }, + { + "epoch": 0.6724043637201002, + "grad_norm": 3.3051119530232946, + "learning_rate": 5.121113903184431e-07, + "loss": 0.4966, + "step": 1460 + }, + { + "epoch": 0.672864914654155, + "grad_norm": 3.064671697196766, + "learning_rate": 5.108091718074705e-07, + "loss": 0.6525, + "step": 1461 + }, + { + "epoch": 0.6733254655882099, + "grad_norm": 2.857963094780648, + "learning_rate": 5.095080429132459e-07, + "loss": 0.5369, + "step": 1462 + }, + { + "epoch": 0.6737860165222648, + "grad_norm": 3.310426657500392, + "learning_rate": 5.082080065338872e-07, + "loss": 0.5112, + "step": 1463 + }, + { + "epoch": 0.6742465674563196, + "grad_norm": 3.225512724763038, + "learning_rate": 5.069090655650762e-07, + "loss": 0.581, + "step": 1464 + }, + { + "epoch": 0.6747071183903744, + "grad_norm": 3.2679785563098784, + "learning_rate": 5.05611222900055e-07, + "loss": 0.5522, + "step": 1465 + }, + { + "epoch": 0.6751676693244294, + "grad_norm": 3.446697533304695, + "learning_rate": 5.043144814296215e-07, + "loss": 0.5458, + "step": 1466 + }, + { + "epoch": 0.6756282202584842, + "grad_norm": 3.180139270812277, + "learning_rate": 5.030188440421185e-07, + "loss": 0.4763, + "step": 1467 + }, + { + "epoch": 0.6760887711925391, + "grad_norm": 3.448958257550766, + "learning_rate": 5.017243136234298e-07, + "loss": 0.5669, + "step": 1468 + }, + { + "epoch": 0.6765493221265939, + "grad_norm": 3.5331462163297243, + "learning_rate": 5.004308930569757e-07, + "loss": 0.6389, + "step": 1469 + }, + { + "epoch": 0.6770098730606487, + "grad_norm": 2.9967238157665124, + "learning_rate": 4.991385852237017e-07, + "loss": 0.5617, + "step": 1470 + }, + { + "epoch": 0.6774704239947037, + "grad_norm": 2.7688203871377235, + "learning_rate": 4.978473930020767e-07, + "loss": 0.4754, + "step": 1471 + }, + { + "epoch": 0.6779309749287585, + "grad_norm": 3.266647952459185, + "learning_rate": 4.965573192680841e-07, + "loss": 0.5109, + "step": 1472 + }, + { + "epoch": 0.6783915258628134, + "grad_norm": 2.825024520007373, + "learning_rate": 4.952683668952152e-07, + "loss": 0.4731, + "step": 1473 + }, + { + "epoch": 0.6788520767968682, + "grad_norm": 3.426822674267336, + "learning_rate": 4.939805387544649e-07, + "loss": 0.5494, + "step": 1474 + }, + { + "epoch": 0.6793126277309232, + "grad_norm": 3.4379075787612448, + "learning_rate": 4.926938377143232e-07, + "loss": 0.6234, + "step": 1475 + }, + { + "epoch": 0.679773178664978, + "grad_norm": 3.037669101666441, + "learning_rate": 4.914082666407704e-07, + "loss": 0.4952, + "step": 1476 + }, + { + "epoch": 0.6802337295990328, + "grad_norm": 3.1953172155925977, + "learning_rate": 4.901238283972685e-07, + "loss": 0.4285, + "step": 1477 + }, + { + "epoch": 0.6806942805330877, + "grad_norm": 3.2567976107387646, + "learning_rate": 4.888405258447576e-07, + "loss": 0.5178, + "step": 1478 + }, + { + "epoch": 0.6811548314671426, + "grad_norm": 3.0243656981433, + "learning_rate": 4.875583618416481e-07, + "loss": 0.5382, + "step": 1479 + }, + { + "epoch": 0.6816153824011975, + "grad_norm": 2.995186764899891, + "learning_rate": 4.862773392438131e-07, + "loss": 0.5426, + "step": 1480 + }, + { + "epoch": 0.6820759333352523, + "grad_norm": 3.6636504352304686, + "learning_rate": 4.849974609045848e-07, + "loss": 0.6404, + "step": 1481 + }, + { + "epoch": 0.6825364842693071, + "grad_norm": 2.692736667622777, + "learning_rate": 4.837187296747463e-07, + "loss": 0.5281, + "step": 1482 + }, + { + "epoch": 0.682997035203362, + "grad_norm": 2.7042483128746064, + "learning_rate": 4.82441148402525e-07, + "loss": 0.575, + "step": 1483 + }, + { + "epoch": 0.6834575861374169, + "grad_norm": 3.0352564028625952, + "learning_rate": 4.811647199335877e-07, + "loss": 0.5098, + "step": 1484 + }, + { + "epoch": 0.6839181370714718, + "grad_norm": 3.127898271916526, + "learning_rate": 4.798894471110336e-07, + "loss": 0.5613, + "step": 1485 + }, + { + "epoch": 0.6843786880055266, + "grad_norm": 2.950470950095703, + "learning_rate": 4.786153327753864e-07, + "loss": 0.5638, + "step": 1486 + }, + { + "epoch": 0.6848392389395814, + "grad_norm": 2.9327112511286777, + "learning_rate": 4.773423797645911e-07, + "loss": 0.5308, + "step": 1487 + }, + { + "epoch": 0.6852997898736364, + "grad_norm": 3.2221808265567917, + "learning_rate": 4.76070590914005e-07, + "loss": 0.5875, + "step": 1488 + }, + { + "epoch": 0.6857603408076912, + "grad_norm": 2.9032838510402383, + "learning_rate": 4.747999690563932e-07, + "loss": 0.4657, + "step": 1489 + }, + { + "epoch": 0.686220891741746, + "grad_norm": 3.253618175545866, + "learning_rate": 4.7353051702191994e-07, + "loss": 0.5498, + "step": 1490 + }, + { + "epoch": 0.6866814426758009, + "grad_norm": 3.3055447616805296, + "learning_rate": 4.7226223763814545e-07, + "loss": 0.5192, + "step": 1491 + }, + { + "epoch": 0.6871419936098558, + "grad_norm": 3.662312307552012, + "learning_rate": 4.709951337300174e-07, + "loss": 0.5508, + "step": 1492 + }, + { + "epoch": 0.6876025445439107, + "grad_norm": 3.0362093035408795, + "learning_rate": 4.697292081198646e-07, + "loss": 0.4648, + "step": 1493 + }, + { + "epoch": 0.6880630954779655, + "grad_norm": 3.1492972902080627, + "learning_rate": 4.684644636273922e-07, + "loss": 0.4502, + "step": 1494 + }, + { + "epoch": 0.6885236464120204, + "grad_norm": 2.9996769191548416, + "learning_rate": 4.6720090306967465e-07, + "loss": 0.6546, + "step": 1495 + }, + { + "epoch": 0.6889841973460752, + "grad_norm": 2.9045793925682104, + "learning_rate": 4.6593852926114784e-07, + "loss": 0.4884, + "step": 1496 + }, + { + "epoch": 0.6894447482801301, + "grad_norm": 3.1298670395653962, + "learning_rate": 4.646773450136067e-07, + "loss": 0.5642, + "step": 1497 + }, + { + "epoch": 0.689905299214185, + "grad_norm": 3.0495111305757163, + "learning_rate": 4.634173531361947e-07, + "loss": 0.5585, + "step": 1498 + }, + { + "epoch": 0.6903658501482398, + "grad_norm": 3.020709936806421, + "learning_rate": 4.6215855643539903e-07, + "loss": 0.4491, + "step": 1499 + }, + { + "epoch": 0.6908264010822946, + "grad_norm": 3.1842354060638467, + "learning_rate": 4.609009577150472e-07, + "loss": 0.5734, + "step": 1500 + }, + { + "epoch": 0.6912869520163496, + "grad_norm": 3.4495373317520914, + "learning_rate": 4.5964455977629593e-07, + "loss": 0.5147, + "step": 1501 + }, + { + "epoch": 0.6917475029504044, + "grad_norm": 2.8702835074826494, + "learning_rate": 4.583893654176285e-07, + "loss": 0.4863, + "step": 1502 + }, + { + "epoch": 0.6922080538844593, + "grad_norm": 2.929782205551764, + "learning_rate": 4.5713537743484754e-07, + "loss": 0.4934, + "step": 1503 + }, + { + "epoch": 0.6926686048185141, + "grad_norm": 3.0017796045087017, + "learning_rate": 4.5588259862106725e-07, + "loss": 0.5107, + "step": 1504 + }, + { + "epoch": 0.6931291557525691, + "grad_norm": 3.484258951507888, + "learning_rate": 4.5463103176671016e-07, + "loss": 0.6145, + "step": 1505 + }, + { + "epoch": 0.6935897066866239, + "grad_norm": 2.7555413280578143, + "learning_rate": 4.533806796594989e-07, + "loss": 0.5649, + "step": 1506 + }, + { + "epoch": 0.6940502576206787, + "grad_norm": 3.3509703674324425, + "learning_rate": 4.521315450844492e-07, + "loss": 0.5738, + "step": 1507 + }, + { + "epoch": 0.6945108085547336, + "grad_norm": 3.075909066062986, + "learning_rate": 4.508836308238664e-07, + "loss": 0.5215, + "step": 1508 + }, + { + "epoch": 0.6949713594887884, + "grad_norm": 3.116185512270017, + "learning_rate": 4.4963693965733686e-07, + "loss": 0.5707, + "step": 1509 + }, + { + "epoch": 0.6954319104228434, + "grad_norm": 3.301765717057608, + "learning_rate": 4.483914743617235e-07, + "loss": 0.6423, + "step": 1510 + }, + { + "epoch": 0.6958924613568982, + "grad_norm": 3.517582404076343, + "learning_rate": 4.471472377111574e-07, + "loss": 0.6392, + "step": 1511 + }, + { + "epoch": 0.696353012290953, + "grad_norm": 3.4096020565867193, + "learning_rate": 4.459042324770338e-07, + "loss": 0.59, + "step": 1512 + }, + { + "epoch": 0.6968135632250079, + "grad_norm": 3.2420031543237977, + "learning_rate": 4.446624614280058e-07, + "loss": 0.5858, + "step": 1513 + }, + { + "epoch": 0.6972741141590628, + "grad_norm": 3.3459346730805, + "learning_rate": 4.4342192732997565e-07, + "loss": 0.5173, + "step": 1514 + }, + { + "epoch": 0.6977346650931177, + "grad_norm": 3.2862422773543147, + "learning_rate": 4.4218263294609205e-07, + "loss": 0.5255, + "step": 1515 + }, + { + "epoch": 0.6981952160271725, + "grad_norm": 3.1669321223815614, + "learning_rate": 4.4094458103674204e-07, + "loss": 0.5333, + "step": 1516 + }, + { + "epoch": 0.6986557669612273, + "grad_norm": 2.9341763845722686, + "learning_rate": 4.397077743595444e-07, + "loss": 0.5475, + "step": 1517 + }, + { + "epoch": 0.6991163178952823, + "grad_norm": 3.298479093779986, + "learning_rate": 4.384722156693451e-07, + "loss": 0.6168, + "step": 1518 + }, + { + "epoch": 0.6995768688293371, + "grad_norm": 3.774693547188722, + "learning_rate": 4.3723790771821067e-07, + "loss": 0.4971, + "step": 1519 + }, + { + "epoch": 0.700037419763392, + "grad_norm": 3.2701266862488003, + "learning_rate": 4.3600485325542047e-07, + "loss": 0.5021, + "step": 1520 + }, + { + "epoch": 0.7004979706974468, + "grad_norm": 2.8544733630249457, + "learning_rate": 4.3477305502746275e-07, + "loss": 0.5578, + "step": 1521 + }, + { + "epoch": 0.7009585216315016, + "grad_norm": 2.859027758778164, + "learning_rate": 4.335425157780277e-07, + "loss": 0.5669, + "step": 1522 + }, + { + "epoch": 0.7014190725655566, + "grad_norm": 3.1340569464788226, + "learning_rate": 4.323132382480015e-07, + "loss": 0.4886, + "step": 1523 + }, + { + "epoch": 0.7018796234996114, + "grad_norm": 3.533977315795044, + "learning_rate": 4.3108522517545866e-07, + "loss": 0.6171, + "step": 1524 + }, + { + "epoch": 0.7023401744336663, + "grad_norm": 3.1550549229148186, + "learning_rate": 4.2985847929565865e-07, + "loss": 0.5375, + "step": 1525 + }, + { + "epoch": 0.7028007253677211, + "grad_norm": 2.7951674621480063, + "learning_rate": 4.2863300334103837e-07, + "loss": 0.4927, + "step": 1526 + }, + { + "epoch": 0.703261276301776, + "grad_norm": 3.404007775142097, + "learning_rate": 4.2740880004120474e-07, + "loss": 0.5759, + "step": 1527 + }, + { + "epoch": 0.7037218272358309, + "grad_norm": 3.0714696704935154, + "learning_rate": 4.2618587212293147e-07, + "loss": 0.5976, + "step": 1528 + }, + { + "epoch": 0.7041823781698857, + "grad_norm": 3.6804392641687826, + "learning_rate": 4.2496422231015115e-07, + "loss": 0.6249, + "step": 1529 + }, + { + "epoch": 0.7046429291039406, + "grad_norm": 3.2731509390374627, + "learning_rate": 4.237438533239488e-07, + "loss": 0.5356, + "step": 1530 + }, + { + "epoch": 0.7051034800379955, + "grad_norm": 3.6052417982765563, + "learning_rate": 4.2252476788255733e-07, + "loss": 0.5792, + "step": 1531 + }, + { + "epoch": 0.7055640309720503, + "grad_norm": 3.3733022802458446, + "learning_rate": 4.213069687013505e-07, + "loss": 0.6865, + "step": 1532 + }, + { + "epoch": 0.7060245819061052, + "grad_norm": 2.6172790916082618, + "learning_rate": 4.200904584928373e-07, + "loss": 0.4629, + "step": 1533 + }, + { + "epoch": 0.70648513284016, + "grad_norm": 3.3735377620802764, + "learning_rate": 4.1887523996665474e-07, + "loss": 0.6296, + "step": 1534 + }, + { + "epoch": 0.7069456837742149, + "grad_norm": 3.5155816418818584, + "learning_rate": 4.176613158295639e-07, + "loss": 0.6252, + "step": 1535 + }, + { + "epoch": 0.7074062347082698, + "grad_norm": 2.9319016244934795, + "learning_rate": 4.164486887854424e-07, + "loss": 0.6537, + "step": 1536 + }, + { + "epoch": 0.7078667856423246, + "grad_norm": 3.0752439651363703, + "learning_rate": 4.15237361535278e-07, + "loss": 0.6219, + "step": 1537 + }, + { + "epoch": 0.7083273365763795, + "grad_norm": 3.772619082368496, + "learning_rate": 4.140273367771643e-07, + "loss": 0.5692, + "step": 1538 + }, + { + "epoch": 0.7087878875104343, + "grad_norm": 3.0735890994766457, + "learning_rate": 4.1281861720629374e-07, + "loss": 0.5757, + "step": 1539 + }, + { + "epoch": 0.7092484384444893, + "grad_norm": 3.449289367509567, + "learning_rate": 4.1161120551495023e-07, + "loss": 0.6328, + "step": 1540 + }, + { + "epoch": 0.7097089893785441, + "grad_norm": 3.4333027638235, + "learning_rate": 4.1040510439250676e-07, + "loss": 0.453, + "step": 1541 + }, + { + "epoch": 0.7101695403125989, + "grad_norm": 3.4665469813554695, + "learning_rate": 4.092003165254154e-07, + "loss": 0.5686, + "step": 1542 + }, + { + "epoch": 0.7106300912466538, + "grad_norm": 2.979020490366814, + "learning_rate": 4.0799684459720295e-07, + "loss": 0.459, + "step": 1543 + }, + { + "epoch": 0.7110906421807087, + "grad_norm": 3.180076549332751, + "learning_rate": 4.067946912884672e-07, + "loss": 0.6623, + "step": 1544 + }, + { + "epoch": 0.7115511931147636, + "grad_norm": 3.345275079523344, + "learning_rate": 4.055938592768663e-07, + "loss": 0.5705, + "step": 1545 + }, + { + "epoch": 0.7120117440488184, + "grad_norm": 2.9898569750138875, + "learning_rate": 4.0439435123711707e-07, + "loss": 0.4446, + "step": 1546 + }, + { + "epoch": 0.7124722949828732, + "grad_norm": 2.892301015122262, + "learning_rate": 4.031961698409869e-07, + "loss": 0.4983, + "step": 1547 + }, + { + "epoch": 0.7129328459169281, + "grad_norm": 3.291501709727223, + "learning_rate": 4.0199931775728767e-07, + "loss": 0.6132, + "step": 1548 + }, + { + "epoch": 0.713393396850983, + "grad_norm": 3.1031247090774974, + "learning_rate": 4.008037976518711e-07, + "loss": 0.5285, + "step": 1549 + }, + { + "epoch": 0.7138539477850379, + "grad_norm": 3.5470619388484623, + "learning_rate": 3.996096121876221e-07, + "loss": 0.4594, + "step": 1550 + }, + { + "epoch": 0.7143144987190927, + "grad_norm": 3.095992564328872, + "learning_rate": 3.984167640244518e-07, + "loss": 0.5343, + "step": 1551 + }, + { + "epoch": 0.7147750496531475, + "grad_norm": 3.528528857914403, + "learning_rate": 3.972252558192938e-07, + "loss": 0.5564, + "step": 1552 + }, + { + "epoch": 0.7152356005872025, + "grad_norm": 3.0649641218559673, + "learning_rate": 3.960350902260967e-07, + "loss": 0.4632, + "step": 1553 + }, + { + "epoch": 0.7156961515212573, + "grad_norm": 3.4153270827868747, + "learning_rate": 3.948462698958188e-07, + "loss": 0.4675, + "step": 1554 + }, + { + "epoch": 0.7161567024553122, + "grad_norm": 3.1991732501247507, + "learning_rate": 3.9365879747642106e-07, + "loss": 0.5517, + "step": 1555 + }, + { + "epoch": 0.716617253389367, + "grad_norm": 2.9871782936902713, + "learning_rate": 3.924726756128631e-07, + "loss": 0.5189, + "step": 1556 + }, + { + "epoch": 0.7170778043234219, + "grad_norm": 2.9744051796884334, + "learning_rate": 3.912879069470966e-07, + "loss": 0.537, + "step": 1557 + }, + { + "epoch": 0.7175383552574768, + "grad_norm": 2.9363741803915433, + "learning_rate": 3.9010449411805747e-07, + "loss": 0.6124, + "step": 1558 + }, + { + "epoch": 0.7179989061915316, + "grad_norm": 2.9230129555443094, + "learning_rate": 3.889224397616635e-07, + "loss": 0.4577, + "step": 1559 + }, + { + "epoch": 0.7184594571255865, + "grad_norm": 2.9997803472056392, + "learning_rate": 3.8774174651080596e-07, + "loss": 0.5885, + "step": 1560 + }, + { + "epoch": 0.7189200080596413, + "grad_norm": 3.121077182785591, + "learning_rate": 3.865624169953439e-07, + "loss": 0.617, + "step": 1561 + }, + { + "epoch": 0.7193805589936962, + "grad_norm": 3.3670297246006378, + "learning_rate": 3.853844538420993e-07, + "loss": 0.5977, + "step": 1562 + }, + { + "epoch": 0.7198411099277511, + "grad_norm": 2.842166340649367, + "learning_rate": 3.8420785967485115e-07, + "loss": 0.5409, + "step": 1563 + }, + { + "epoch": 0.7203016608618059, + "grad_norm": 3.5406635193421656, + "learning_rate": 3.83032637114328e-07, + "loss": 0.6372, + "step": 1564 + }, + { + "epoch": 0.7207622117958608, + "grad_norm": 3.194617429657409, + "learning_rate": 3.8185878877820443e-07, + "loss": 0.5469, + "step": 1565 + }, + { + "epoch": 0.7212227627299157, + "grad_norm": 3.417157379310719, + "learning_rate": 3.806863172810936e-07, + "loss": 0.4731, + "step": 1566 + }, + { + "epoch": 0.7216833136639705, + "grad_norm": 3.231247160093252, + "learning_rate": 3.7951522523454214e-07, + "loss": 0.6959, + "step": 1567 + }, + { + "epoch": 0.7221438645980254, + "grad_norm": 3.358643739910038, + "learning_rate": 3.7834551524702364e-07, + "loss": 0.616, + "step": 1568 + }, + { + "epoch": 0.7226044155320802, + "grad_norm": 3.159395399638094, + "learning_rate": 3.7717718992393365e-07, + "loss": 0.539, + "step": 1569 + }, + { + "epoch": 0.7230649664661352, + "grad_norm": 3.288938398533548, + "learning_rate": 3.760102518675839e-07, + "loss": 0.6766, + "step": 1570 + }, + { + "epoch": 0.72352551740019, + "grad_norm": 3.1929425292220484, + "learning_rate": 3.748447036771949e-07, + "loss": 0.5312, + "step": 1571 + }, + { + "epoch": 0.7239860683342448, + "grad_norm": 3.1532440738029455, + "learning_rate": 3.736805479488936e-07, + "loss": 0.5694, + "step": 1572 + }, + { + "epoch": 0.7244466192682997, + "grad_norm": 3.1699423213216726, + "learning_rate": 3.7251778727570305e-07, + "loss": 0.5722, + "step": 1573 + }, + { + "epoch": 0.7249071702023545, + "grad_norm": 2.9111701929100198, + "learning_rate": 3.7135642424753967e-07, + "loss": 0.5001, + "step": 1574 + }, + { + "epoch": 0.7253677211364095, + "grad_norm": 3.1237883324779623, + "learning_rate": 3.701964614512082e-07, + "loss": 0.5708, + "step": 1575 + }, + { + "epoch": 0.7258282720704643, + "grad_norm": 2.7652944931913623, + "learning_rate": 3.690379014703928e-07, + "loss": 0.4368, + "step": 1576 + }, + { + "epoch": 0.7262888230045191, + "grad_norm": 3.209037928138226, + "learning_rate": 3.67880746885653e-07, + "loss": 0.6249, + "step": 1577 + }, + { + "epoch": 0.726749373938574, + "grad_norm": 3.0792001259828323, + "learning_rate": 3.667250002744199e-07, + "loss": 0.5269, + "step": 1578 + }, + { + "epoch": 0.7272099248726289, + "grad_norm": 3.3870656223423428, + "learning_rate": 3.6557066421098604e-07, + "loss": 0.4438, + "step": 1579 + }, + { + "epoch": 0.7276704758066838, + "grad_norm": 3.187784911852215, + "learning_rate": 3.644177412665039e-07, + "loss": 0.4357, + "step": 1580 + }, + { + "epoch": 0.7281310267407386, + "grad_norm": 3.3629689440356927, + "learning_rate": 3.6326623400897796e-07, + "loss": 0.5516, + "step": 1581 + }, + { + "epoch": 0.7285915776747934, + "grad_norm": 3.579677642223135, + "learning_rate": 3.621161450032586e-07, + "loss": 0.6288, + "step": 1582 + }, + { + "epoch": 0.7290521286088484, + "grad_norm": 2.918537780420392, + "learning_rate": 3.609674768110381e-07, + "loss": 0.5887, + "step": 1583 + }, + { + "epoch": 0.7295126795429032, + "grad_norm": 3.2728257749235903, + "learning_rate": 3.59820231990844e-07, + "loss": 0.6692, + "step": 1584 + }, + { + "epoch": 0.7299732304769581, + "grad_norm": 3.1155337342093614, + "learning_rate": 3.5867441309803325e-07, + "loss": 0.4757, + "step": 1585 + }, + { + "epoch": 0.7304337814110129, + "grad_norm": 3.017811596425641, + "learning_rate": 3.5753002268478625e-07, + "loss": 0.5903, + "step": 1586 + }, + { + "epoch": 0.7308943323450677, + "grad_norm": 3.5070818468579708, + "learning_rate": 3.5638706330010236e-07, + "loss": 0.5974, + "step": 1587 + }, + { + "epoch": 0.7313548832791227, + "grad_norm": 3.013090840123994, + "learning_rate": 3.552455374897935e-07, + "loss": 0.536, + "step": 1588 + }, + { + "epoch": 0.7318154342131775, + "grad_norm": 2.7740669709707517, + "learning_rate": 3.5410544779647735e-07, + "loss": 0.4466, + "step": 1589 + }, + { + "epoch": 0.7322759851472324, + "grad_norm": 3.437946984823077, + "learning_rate": 3.529667967595742e-07, + "loss": 0.5002, + "step": 1590 + }, + { + "epoch": 0.7327365360812872, + "grad_norm": 2.813883495540624, + "learning_rate": 3.518295869152994e-07, + "loss": 0.4955, + "step": 1591 + }, + { + "epoch": 0.7331970870153421, + "grad_norm": 2.8502290667648342, + "learning_rate": 3.5069382079665763e-07, + "loss": 0.4805, + "step": 1592 + }, + { + "epoch": 0.733657637949397, + "grad_norm": 3.234497100576854, + "learning_rate": 3.4955950093343857e-07, + "loss": 0.7048, + "step": 1593 + }, + { + "epoch": 0.7341181888834518, + "grad_norm": 3.131622303838718, + "learning_rate": 3.484266298522106e-07, + "loss": 0.5671, + "step": 1594 + }, + { + "epoch": 0.7345787398175067, + "grad_norm": 2.875003807206262, + "learning_rate": 3.472952100763141e-07, + "loss": 0.5951, + "step": 1595 + }, + { + "epoch": 0.7350392907515616, + "grad_norm": 2.967122454191587, + "learning_rate": 3.461652441258579e-07, + "loss": 0.5307, + "step": 1596 + }, + { + "epoch": 0.7354998416856164, + "grad_norm": 3.501420492607783, + "learning_rate": 3.450367345177122e-07, + "loss": 0.5531, + "step": 1597 + }, + { + "epoch": 0.7359603926196713, + "grad_norm": 3.0527813124759535, + "learning_rate": 3.4390968376550367e-07, + "loss": 0.5332, + "step": 1598 + }, + { + "epoch": 0.7364209435537261, + "grad_norm": 2.8619147841769266, + "learning_rate": 3.4278409437960865e-07, + "loss": 0.5634, + "step": 1599 + }, + { + "epoch": 0.7368814944877811, + "grad_norm": 3.1275902113895264, + "learning_rate": 3.4165996886714944e-07, + "loss": 0.5385, + "step": 1600 + }, + { + "epoch": 0.7373420454218359, + "grad_norm": 2.93151492063357, + "learning_rate": 3.405373097319875e-07, + "loss": 0.489, + "step": 1601 + }, + { + "epoch": 0.7378025963558907, + "grad_norm": 3.2966345212103354, + "learning_rate": 3.3941611947471703e-07, + "loss": 0.5367, + "step": 1602 + }, + { + "epoch": 0.7382631472899456, + "grad_norm": 2.7068239388271516, + "learning_rate": 3.3829640059266283e-07, + "loss": 0.5216, + "step": 1603 + }, + { + "epoch": 0.7387236982240004, + "grad_norm": 3.435870012306948, + "learning_rate": 3.3717815557987027e-07, + "loss": 0.5401, + "step": 1604 + }, + { + "epoch": 0.7391842491580554, + "grad_norm": 3.774078430992497, + "learning_rate": 3.360613869271016e-07, + "loss": 0.5301, + "step": 1605 + }, + { + "epoch": 0.7396448000921102, + "grad_norm": 2.974006922477261, + "learning_rate": 3.349460971218332e-07, + "loss": 0.5254, + "step": 1606 + }, + { + "epoch": 0.740105351026165, + "grad_norm": 3.1026779731207106, + "learning_rate": 3.3383228864824496e-07, + "loss": 0.4061, + "step": 1607 + }, + { + "epoch": 0.7405659019602199, + "grad_norm": 3.1384179646636836, + "learning_rate": 3.327199639872177e-07, + "loss": 0.5428, + "step": 1608 + }, + { + "epoch": 0.7410264528942748, + "grad_norm": 2.860698890001966, + "learning_rate": 3.316091256163288e-07, + "loss": 0.5852, + "step": 1609 + }, + { + "epoch": 0.7414870038283297, + "grad_norm": 2.8765636527594154, + "learning_rate": 3.3049977600984304e-07, + "loss": 0.4724, + "step": 1610 + }, + { + "epoch": 0.7419475547623845, + "grad_norm": 3.3500780381811746, + "learning_rate": 3.293919176387104e-07, + "loss": 0.6346, + "step": 1611 + }, + { + "epoch": 0.7424081056964393, + "grad_norm": 3.7026885508653193, + "learning_rate": 3.2828555297055946e-07, + "loss": 0.5841, + "step": 1612 + }, + { + "epoch": 0.7428686566304943, + "grad_norm": 3.0274792259951737, + "learning_rate": 3.271806844696905e-07, + "loss": 0.5425, + "step": 1613 + }, + { + "epoch": 0.7433292075645491, + "grad_norm": 3.746883296992905, + "learning_rate": 3.260773145970723e-07, + "loss": 0.6408, + "step": 1614 + }, + { + "epoch": 0.743789758498604, + "grad_norm": 3.357176161663444, + "learning_rate": 3.2497544581033555e-07, + "loss": 0.6193, + "step": 1615 + }, + { + "epoch": 0.7442503094326588, + "grad_norm": 3.59282723815335, + "learning_rate": 3.2387508056376724e-07, + "loss": 0.5914, + "step": 1616 + }, + { + "epoch": 0.7447108603667136, + "grad_norm": 3.3394681551065184, + "learning_rate": 3.2277622130830505e-07, + "loss": 0.5148, + "step": 1617 + }, + { + "epoch": 0.7451714113007686, + "grad_norm": 2.8985388546730535, + "learning_rate": 3.216788704915327e-07, + "loss": 0.5079, + "step": 1618 + }, + { + "epoch": 0.7456319622348234, + "grad_norm": 3.069882169066366, + "learning_rate": 3.2058303055767443e-07, + "loss": 0.4949, + "step": 1619 + }, + { + "epoch": 0.7460925131688783, + "grad_norm": 2.911216557138714, + "learning_rate": 3.19488703947588e-07, + "loss": 0.585, + "step": 1620 + }, + { + "epoch": 0.7465530641029331, + "grad_norm": 3.442946480558847, + "learning_rate": 3.1839589309876115e-07, + "loss": 0.544, + "step": 1621 + }, + { + "epoch": 0.747013615036988, + "grad_norm": 2.7674990139149847, + "learning_rate": 3.1730460044530573e-07, + "loss": 0.5363, + "step": 1622 + }, + { + "epoch": 0.7474741659710429, + "grad_norm": 3.229630271750196, + "learning_rate": 3.1621482841795124e-07, + "loss": 0.6189, + "step": 1623 + }, + { + "epoch": 0.7479347169050977, + "grad_norm": 3.1155759443087945, + "learning_rate": 3.151265794440404e-07, + "loss": 0.4911, + "step": 1624 + }, + { + "epoch": 0.7483952678391526, + "grad_norm": 3.4937874237026114, + "learning_rate": 3.140398559475244e-07, + "loss": 0.4647, + "step": 1625 + }, + { + "epoch": 0.7488558187732075, + "grad_norm": 3.2934599527181034, + "learning_rate": 3.129546603489548e-07, + "loss": 0.4893, + "step": 1626 + }, + { + "epoch": 0.7493163697072623, + "grad_norm": 3.0506555877752595, + "learning_rate": 3.1187099506548153e-07, + "loss": 0.5267, + "step": 1627 + }, + { + "epoch": 0.7497769206413172, + "grad_norm": 3.5830543718291015, + "learning_rate": 3.1078886251084525e-07, + "loss": 0.5325, + "step": 1628 + }, + { + "epoch": 0.750237471575372, + "grad_norm": 3.088278820265878, + "learning_rate": 3.0970826509537304e-07, + "loss": 0.5628, + "step": 1629 + }, + { + "epoch": 0.7506980225094269, + "grad_norm": 3.1594316317748294, + "learning_rate": 3.0862920522597167e-07, + "loss": 0.4871, + "step": 1630 + }, + { + "epoch": 0.7511585734434818, + "grad_norm": 2.8769280379230735, + "learning_rate": 3.075516853061244e-07, + "loss": 0.4936, + "step": 1631 + }, + { + "epoch": 0.7516191243775366, + "grad_norm": 3.4881331778008176, + "learning_rate": 3.0647570773588403e-07, + "loss": 0.5471, + "step": 1632 + }, + { + "epoch": 0.7520796753115915, + "grad_norm": 3.006655258177075, + "learning_rate": 3.0540127491186727e-07, + "loss": 0.4765, + "step": 1633 + }, + { + "epoch": 0.7525402262456463, + "grad_norm": 3.4773351804629296, + "learning_rate": 3.043283892272508e-07, + "loss": 0.5993, + "step": 1634 + }, + { + "epoch": 0.7530007771797013, + "grad_norm": 2.9508457470911256, + "learning_rate": 3.0325705307176564e-07, + "loss": 0.444, + "step": 1635 + }, + { + "epoch": 0.7534613281137561, + "grad_norm": 3.593179054371365, + "learning_rate": 3.0218726883168955e-07, + "loss": 0.5524, + "step": 1636 + }, + { + "epoch": 0.7539218790478109, + "grad_norm": 2.985359000550235, + "learning_rate": 3.011190388898464e-07, + "loss": 0.4384, + "step": 1637 + }, + { + "epoch": 0.7543824299818658, + "grad_norm": 3.0881433691468723, + "learning_rate": 3.0005236562559566e-07, + "loss": 0.5832, + "step": 1638 + }, + { + "epoch": 0.7548429809159207, + "grad_norm": 2.64976112453638, + "learning_rate": 2.989872514148298e-07, + "loss": 0.5254, + "step": 1639 + }, + { + "epoch": 0.7553035318499756, + "grad_norm": 3.178227072350045, + "learning_rate": 2.9792369862997046e-07, + "loss": 0.4896, + "step": 1640 + }, + { + "epoch": 0.7557640827840304, + "grad_norm": 3.1094527462428543, + "learning_rate": 2.9686170963995915e-07, + "loss": 0.5024, + "step": 1641 + }, + { + "epoch": 0.7562246337180852, + "grad_norm": 2.830372620158942, + "learning_rate": 2.958012868102553e-07, + "loss": 0.5466, + "step": 1642 + }, + { + "epoch": 0.7566851846521401, + "grad_norm": 3.5892656567533057, + "learning_rate": 2.9474243250283035e-07, + "loss": 0.5726, + "step": 1643 + }, + { + "epoch": 0.757145735586195, + "grad_norm": 3.1400502525766463, + "learning_rate": 2.936851490761606e-07, + "loss": 0.6686, + "step": 1644 + }, + { + "epoch": 0.7576062865202499, + "grad_norm": 3.611911685117865, + "learning_rate": 2.926294388852246e-07, + "loss": 0.531, + "step": 1645 + }, + { + "epoch": 0.7580668374543047, + "grad_norm": 2.9071023212444076, + "learning_rate": 2.9157530428149677e-07, + "loss": 0.446, + "step": 1646 + }, + { + "epoch": 0.7585273883883595, + "grad_norm": 3.573059215096367, + "learning_rate": 2.9052274761294094e-07, + "loss": 0.6017, + "step": 1647 + }, + { + "epoch": 0.7589879393224145, + "grad_norm": 2.926410495931059, + "learning_rate": 2.8947177122400737e-07, + "loss": 0.4499, + "step": 1648 + }, + { + "epoch": 0.7594484902564693, + "grad_norm": 2.9077271228177906, + "learning_rate": 2.8842237745562583e-07, + "loss": 0.4628, + "step": 1649 + }, + { + "epoch": 0.7599090411905242, + "grad_norm": 3.128424023474016, + "learning_rate": 2.873745686452017e-07, + "loss": 0.5119, + "step": 1650 + }, + { + "epoch": 0.760369592124579, + "grad_norm": 3.2061268862774464, + "learning_rate": 2.863283471266088e-07, + "loss": 0.5419, + "step": 1651 + }, + { + "epoch": 0.760830143058634, + "grad_norm": 3.176712368087586, + "learning_rate": 2.852837152301867e-07, + "loss": 0.575, + "step": 1652 + }, + { + "epoch": 0.7612906939926888, + "grad_norm": 3.002968313659636, + "learning_rate": 2.8424067528273374e-07, + "loss": 0.5412, + "step": 1653 + }, + { + "epoch": 0.7617512449267436, + "grad_norm": 3.3131721641797522, + "learning_rate": 2.83199229607502e-07, + "loss": 0.5517, + "step": 1654 + }, + { + "epoch": 0.7622117958607985, + "grad_norm": 3.0397564537569806, + "learning_rate": 2.821593805241932e-07, + "loss": 0.4976, + "step": 1655 + }, + { + "epoch": 0.7626723467948533, + "grad_norm": 3.3953414466743714, + "learning_rate": 2.811211303489527e-07, + "loss": 0.5132, + "step": 1656 + }, + { + "epoch": 0.7631328977289082, + "grad_norm": 3.2736894751575525, + "learning_rate": 2.8008448139436367e-07, + "loss": 0.5464, + "step": 1657 + }, + { + "epoch": 0.7635934486629631, + "grad_norm": 3.1651926004830915, + "learning_rate": 2.7904943596944373e-07, + "loss": 0.5585, + "step": 1658 + }, + { + "epoch": 0.7640539995970179, + "grad_norm": 3.036581859418946, + "learning_rate": 2.7801599637963893e-07, + "loss": 0.5441, + "step": 1659 + }, + { + "epoch": 0.7645145505310728, + "grad_norm": 2.625544113481615, + "learning_rate": 2.769841649268171e-07, + "loss": 0.5043, + "step": 1660 + }, + { + "epoch": 0.7649751014651277, + "grad_norm": 3.165210415457858, + "learning_rate": 2.759539439092655e-07, + "loss": 0.5323, + "step": 1661 + }, + { + "epoch": 0.7654356523991825, + "grad_norm": 2.968389660757054, + "learning_rate": 2.7492533562168407e-07, + "loss": 0.5542, + "step": 1662 + }, + { + "epoch": 0.7658962033332374, + "grad_norm": 3.28366116605155, + "learning_rate": 2.738983423551804e-07, + "loss": 0.4676, + "step": 1663 + }, + { + "epoch": 0.7663567542672922, + "grad_norm": 3.0790653400040715, + "learning_rate": 2.7287296639726443e-07, + "loss": 0.5762, + "step": 1664 + }, + { + "epoch": 0.7668173052013472, + "grad_norm": 2.8472233247954795, + "learning_rate": 2.7184921003184424e-07, + "loss": 0.5132, + "step": 1665 + }, + { + "epoch": 0.767277856135402, + "grad_norm": 3.173379604510871, + "learning_rate": 2.7082707553922067e-07, + "loss": 0.5762, + "step": 1666 + }, + { + "epoch": 0.7677384070694568, + "grad_norm": 3.3239774867727268, + "learning_rate": 2.698065651960809e-07, + "loss": 0.5437, + "step": 1667 + }, + { + "epoch": 0.7681989580035117, + "grad_norm": 3.004069248804067, + "learning_rate": 2.687876812754963e-07, + "loss": 0.5013, + "step": 1668 + }, + { + "epoch": 0.7686595089375665, + "grad_norm": 2.848925751252902, + "learning_rate": 2.67770426046914e-07, + "loss": 0.3852, + "step": 1669 + }, + { + "epoch": 0.7691200598716215, + "grad_norm": 3.47280346861256, + "learning_rate": 2.6675480177615326e-07, + "loss": 0.6775, + "step": 1670 + }, + { + "epoch": 0.7695806108056763, + "grad_norm": 3.2787012091514938, + "learning_rate": 2.6574081072540264e-07, + "loss": 0.575, + "step": 1671 + }, + { + "epoch": 0.7700411617397311, + "grad_norm": 2.7278525194039727, + "learning_rate": 2.647284551532104e-07, + "loss": 0.4094, + "step": 1672 + }, + { + "epoch": 0.770501712673786, + "grad_norm": 3.4300818176528063, + "learning_rate": 2.6371773731448357e-07, + "loss": 0.5578, + "step": 1673 + }, + { + "epoch": 0.7709622636078409, + "grad_norm": 2.968395762450879, + "learning_rate": 2.6270865946048084e-07, + "loss": 0.5684, + "step": 1674 + }, + { + "epoch": 0.7714228145418958, + "grad_norm": 2.8747901281611172, + "learning_rate": 2.617012238388077e-07, + "loss": 0.4909, + "step": 1675 + }, + { + "epoch": 0.7718833654759506, + "grad_norm": 3.1871301141813384, + "learning_rate": 2.60695432693412e-07, + "loss": 0.5484, + "step": 1676 + }, + { + "epoch": 0.7723439164100054, + "grad_norm": 3.2847684133576855, + "learning_rate": 2.596912882645792e-07, + "loss": 0.5747, + "step": 1677 + }, + { + "epoch": 0.7728044673440604, + "grad_norm": 2.6955957814746707, + "learning_rate": 2.5868879278892597e-07, + "loss": 0.4021, + "step": 1678 + }, + { + "epoch": 0.7732650182781152, + "grad_norm": 2.5964611847982786, + "learning_rate": 2.576879484993968e-07, + "loss": 0.5846, + "step": 1679 + }, + { + "epoch": 0.7737255692121701, + "grad_norm": 3.0346764723083868, + "learning_rate": 2.56688757625258e-07, + "loss": 0.4852, + "step": 1680 + }, + { + "epoch": 0.7741861201462249, + "grad_norm": 3.172623244649948, + "learning_rate": 2.5569122239209364e-07, + "loss": 0.5507, + "step": 1681 + }, + { + "epoch": 0.7746466710802797, + "grad_norm": 3.513693571783645, + "learning_rate": 2.54695345021799e-07, + "loss": 0.6611, + "step": 1682 + }, + { + "epoch": 0.7751072220143347, + "grad_norm": 2.9204689205282337, + "learning_rate": 2.537011277325777e-07, + "loss": 0.4998, + "step": 1683 + }, + { + "epoch": 0.7755677729483895, + "grad_norm": 2.8688540667247437, + "learning_rate": 2.527085727389354e-07, + "loss": 0.5582, + "step": 1684 + }, + { + "epoch": 0.7760283238824444, + "grad_norm": 3.254940468026647, + "learning_rate": 2.5171768225167465e-07, + "loss": 0.5264, + "step": 1685 + }, + { + "epoch": 0.7764888748164992, + "grad_norm": 3.319619529120737, + "learning_rate": 2.5072845847789126e-07, + "loss": 0.5844, + "step": 1686 + }, + { + "epoch": 0.7769494257505541, + "grad_norm": 2.711029642886499, + "learning_rate": 2.4974090362096843e-07, + "loss": 0.4466, + "step": 1687 + }, + { + "epoch": 0.777409976684609, + "grad_norm": 2.9892514572487947, + "learning_rate": 2.487550198805715e-07, + "loss": 0.5246, + "step": 1688 + }, + { + "epoch": 0.7778705276186638, + "grad_norm": 3.2361104141741213, + "learning_rate": 2.4777080945264416e-07, + "loss": 0.5495, + "step": 1689 + }, + { + "epoch": 0.7783310785527187, + "grad_norm": 2.778603000630699, + "learning_rate": 2.467882745294031e-07, + "loss": 0.43, + "step": 1690 + }, + { + "epoch": 0.7787916294867736, + "grad_norm": 2.9641786432879442, + "learning_rate": 2.458074172993324e-07, + "loss": 0.5534, + "step": 1691 + }, + { + "epoch": 0.7792521804208284, + "grad_norm": 3.0337502563475276, + "learning_rate": 2.4482823994717974e-07, + "loss": 0.556, + "step": 1692 + }, + { + "epoch": 0.7797127313548833, + "grad_norm": 3.0752220435659616, + "learning_rate": 2.4385074465395084e-07, + "loss": 0.536, + "step": 1693 + }, + { + "epoch": 0.7801732822889381, + "grad_norm": 3.1565462424883246, + "learning_rate": 2.4287493359690534e-07, + "loss": 0.4882, + "step": 1694 + }, + { + "epoch": 0.780633833222993, + "grad_norm": 3.0427946667754777, + "learning_rate": 2.4190080894955054e-07, + "loss": 0.4546, + "step": 1695 + }, + { + "epoch": 0.7810943841570479, + "grad_norm": 3.2264986971076155, + "learning_rate": 2.4092837288163805e-07, + "loss": 0.4948, + "step": 1696 + }, + { + "epoch": 0.7815549350911027, + "grad_norm": 3.35844053135339, + "learning_rate": 2.399576275591586e-07, + "loss": 0.5087, + "step": 1697 + }, + { + "epoch": 0.7820154860251576, + "grad_norm": 3.0112899189235116, + "learning_rate": 2.389885751443358e-07, + "loss": 0.5081, + "step": 1698 + }, + { + "epoch": 0.7824760369592124, + "grad_norm": 3.2352717237579722, + "learning_rate": 2.3802121779562446e-07, + "loss": 0.6003, + "step": 1699 + }, + { + "epoch": 0.7829365878932674, + "grad_norm": 2.671473345514583, + "learning_rate": 2.3705555766770203e-07, + "loss": 0.466, + "step": 1700 + }, + { + "epoch": 0.7833971388273222, + "grad_norm": 3.574193258639437, + "learning_rate": 2.3609159691146575e-07, + "loss": 0.5839, + "step": 1701 + }, + { + "epoch": 0.783857689761377, + "grad_norm": 3.051925522352705, + "learning_rate": 2.3512933767402942e-07, + "loss": 0.6309, + "step": 1702 + }, + { + "epoch": 0.7843182406954319, + "grad_norm": 3.5580346934856686, + "learning_rate": 2.3416878209871493e-07, + "loss": 0.5154, + "step": 1703 + }, + { + "epoch": 0.7847787916294868, + "grad_norm": 3.2002268979972905, + "learning_rate": 2.3320993232504993e-07, + "loss": 0.4721, + "step": 1704 + }, + { + "epoch": 0.7852393425635417, + "grad_norm": 2.9878864928285007, + "learning_rate": 2.3225279048876367e-07, + "loss": 0.5072, + "step": 1705 + }, + { + "epoch": 0.7856998934975965, + "grad_norm": 3.1381323142253685, + "learning_rate": 2.312973587217798e-07, + "loss": 0.6361, + "step": 1706 + }, + { + "epoch": 0.7861604444316513, + "grad_norm": 2.833292804782264, + "learning_rate": 2.3034363915221378e-07, + "loss": 0.4536, + "step": 1707 + }, + { + "epoch": 0.7866209953657062, + "grad_norm": 3.1215024593062015, + "learning_rate": 2.2939163390436732e-07, + "loss": 0.4223, + "step": 1708 + }, + { + "epoch": 0.7870815462997611, + "grad_norm": 3.311510890413361, + "learning_rate": 2.2844134509872292e-07, + "loss": 0.5819, + "step": 1709 + }, + { + "epoch": 0.787542097233816, + "grad_norm": 3.040556067928825, + "learning_rate": 2.2749277485194085e-07, + "loss": 0.5059, + "step": 1710 + }, + { + "epoch": 0.7880026481678708, + "grad_norm": 2.9822721827674985, + "learning_rate": 2.26545925276853e-07, + "loss": 0.4199, + "step": 1711 + }, + { + "epoch": 0.7884631991019256, + "grad_norm": 3.1686788844095637, + "learning_rate": 2.2560079848245905e-07, + "loss": 0.4875, + "step": 1712 + }, + { + "epoch": 0.7889237500359806, + "grad_norm": 3.3382646126754154, + "learning_rate": 2.2465739657392057e-07, + "loss": 0.5694, + "step": 1713 + }, + { + "epoch": 0.7893843009700354, + "grad_norm": 2.9076693356746084, + "learning_rate": 2.2371572165255792e-07, + "loss": 0.4706, + "step": 1714 + }, + { + "epoch": 0.7898448519040903, + "grad_norm": 3.925072525348147, + "learning_rate": 2.2277577581584473e-07, + "loss": 0.5798, + "step": 1715 + }, + { + "epoch": 0.7903054028381451, + "grad_norm": 3.484998071545751, + "learning_rate": 2.218375611574027e-07, + "loss": 0.5484, + "step": 1716 + }, + { + "epoch": 0.7907659537722, + "grad_norm": 3.301699935752327, + "learning_rate": 2.2090107976699802e-07, + "loss": 0.5115, + "step": 1717 + }, + { + "epoch": 0.7912265047062549, + "grad_norm": 3.042551513495106, + "learning_rate": 2.1996633373053653e-07, + "loss": 0.579, + "step": 1718 + }, + { + "epoch": 0.7916870556403097, + "grad_norm": 4.004445646052093, + "learning_rate": 2.190333251300578e-07, + "loss": 0.5797, + "step": 1719 + }, + { + "epoch": 0.7921476065743646, + "grad_norm": 3.0220642732410457, + "learning_rate": 2.1810205604373233e-07, + "loss": 0.5784, + "step": 1720 + }, + { + "epoch": 0.7926081575084194, + "grad_norm": 3.1050063154048204, + "learning_rate": 2.171725285458559e-07, + "loss": 0.4233, + "step": 1721 + }, + { + "epoch": 0.7930687084424743, + "grad_norm": 2.8115340027844966, + "learning_rate": 2.162447447068444e-07, + "loss": 0.5435, + "step": 1722 + }, + { + "epoch": 0.7935292593765292, + "grad_norm": 2.9604984251953526, + "learning_rate": 2.1531870659323082e-07, + "loss": 0.5504, + "step": 1723 + }, + { + "epoch": 0.793989810310584, + "grad_norm": 3.2405218813980166, + "learning_rate": 2.1439441626765943e-07, + "loss": 0.5433, + "step": 1724 + }, + { + "epoch": 0.7944503612446389, + "grad_norm": 3.1770006678350655, + "learning_rate": 2.1347187578888158e-07, + "loss": 0.6119, + "step": 1725 + }, + { + "epoch": 0.7949109121786938, + "grad_norm": 3.5617997893715945, + "learning_rate": 2.1255108721175065e-07, + "loss": 0.5371, + "step": 1726 + }, + { + "epoch": 0.7953714631127486, + "grad_norm": 3.6514613591140317, + "learning_rate": 2.1163205258721806e-07, + "loss": 0.6407, + "step": 1727 + }, + { + "epoch": 0.7958320140468035, + "grad_norm": 3.0047824295231127, + "learning_rate": 2.1071477396232894e-07, + "loss": 0.488, + "step": 1728 + }, + { + "epoch": 0.7962925649808583, + "grad_norm": 2.8092385249228293, + "learning_rate": 2.097992533802163e-07, + "loss": 0.5019, + "step": 1729 + }, + { + "epoch": 0.7967531159149133, + "grad_norm": 3.173996181660635, + "learning_rate": 2.0888549288009804e-07, + "loss": 0.6845, + "step": 1730 + }, + { + "epoch": 0.7972136668489681, + "grad_norm": 2.8459418296608145, + "learning_rate": 2.0797349449727163e-07, + "loss": 0.5215, + "step": 1731 + }, + { + "epoch": 0.7976742177830229, + "grad_norm": 2.867902063042974, + "learning_rate": 2.070632602631086e-07, + "loss": 0.5121, + "step": 1732 + }, + { + "epoch": 0.7981347687170778, + "grad_norm": 3.218011225763629, + "learning_rate": 2.0615479220505293e-07, + "loss": 0.5774, + "step": 1733 + }, + { + "epoch": 0.7985953196511326, + "grad_norm": 3.037535439229652, + "learning_rate": 2.05248092346613e-07, + "loss": 0.5455, + "step": 1734 + }, + { + "epoch": 0.7990558705851876, + "grad_norm": 3.44155141665967, + "learning_rate": 2.0434316270735875e-07, + "loss": 0.3846, + "step": 1735 + }, + { + "epoch": 0.7995164215192424, + "grad_norm": 3.251900855648779, + "learning_rate": 2.0344000530291872e-07, + "loss": 0.4974, + "step": 1736 + }, + { + "epoch": 0.7999769724532972, + "grad_norm": 3.3912040917359167, + "learning_rate": 2.025386221449722e-07, + "loss": 0.5486, + "step": 1737 + }, + { + "epoch": 0.8004375233873521, + "grad_norm": 3.289387489843882, + "learning_rate": 2.0163901524124771e-07, + "loss": 0.5502, + "step": 1738 + }, + { + "epoch": 0.800898074321407, + "grad_norm": 3.3531542710146884, + "learning_rate": 2.0074118659551697e-07, + "loss": 0.4939, + "step": 1739 + }, + { + "epoch": 0.8013586252554619, + "grad_norm": 3.0019863613387643, + "learning_rate": 1.9984513820759052e-07, + "loss": 0.4646, + "step": 1740 + }, + { + "epoch": 0.8018191761895167, + "grad_norm": 2.9999237180628726, + "learning_rate": 1.9895087207331417e-07, + "loss": 0.5051, + "step": 1741 + }, + { + "epoch": 0.8022797271235715, + "grad_norm": 3.440811419447699, + "learning_rate": 1.980583901845636e-07, + "loss": 0.5719, + "step": 1742 + }, + { + "epoch": 0.8027402780576265, + "grad_norm": 2.954781327967473, + "learning_rate": 1.9716769452924065e-07, + "loss": 0.4992, + "step": 1743 + }, + { + "epoch": 0.8032008289916813, + "grad_norm": 2.9449328433082433, + "learning_rate": 1.9627878709126778e-07, + "loss": 0.5873, + "step": 1744 + }, + { + "epoch": 0.8036613799257362, + "grad_norm": 2.9550948833942914, + "learning_rate": 1.9539166985058508e-07, + "loss": 0.5085, + "step": 1745 + }, + { + "epoch": 0.804121930859791, + "grad_norm": 3.0968335366037256, + "learning_rate": 1.945063447831452e-07, + "loss": 0.4837, + "step": 1746 + }, + { + "epoch": 0.8045824817938458, + "grad_norm": 2.933117571611147, + "learning_rate": 1.936228138609084e-07, + "loss": 0.5462, + "step": 1747 + }, + { + "epoch": 0.8050430327279008, + "grad_norm": 3.652807035479544, + "learning_rate": 1.92741079051839e-07, + "loss": 0.5419, + "step": 1748 + }, + { + "epoch": 0.8055035836619556, + "grad_norm": 2.906573757054981, + "learning_rate": 1.9186114231990104e-07, + "loss": 0.5128, + "step": 1749 + }, + { + "epoch": 0.8059641345960105, + "grad_norm": 2.947586128595587, + "learning_rate": 1.9098300562505264e-07, + "loss": 0.4349, + "step": 1750 + }, + { + "epoch": 0.8064246855300653, + "grad_norm": 3.0454260234507924, + "learning_rate": 1.901066709232434e-07, + "loss": 0.6054, + "step": 1751 + }, + { + "epoch": 0.8068852364641202, + "grad_norm": 3.411169435801757, + "learning_rate": 1.8923214016640898e-07, + "loss": 0.4776, + "step": 1752 + }, + { + "epoch": 0.8073457873981751, + "grad_norm": 3.2706182841611278, + "learning_rate": 1.8835941530246657e-07, + "loss": 0.5874, + "step": 1753 + }, + { + "epoch": 0.8078063383322299, + "grad_norm": 3.0129377224539513, + "learning_rate": 1.8748849827531133e-07, + "loss": 0.4817, + "step": 1754 + }, + { + "epoch": 0.8082668892662848, + "grad_norm": 3.185149212645243, + "learning_rate": 1.866193910248115e-07, + "loss": 0.4825, + "step": 1755 + }, + { + "epoch": 0.8087274402003397, + "grad_norm": 3.343736216544324, + "learning_rate": 1.857520954868047e-07, + "loss": 0.5353, + "step": 1756 + }, + { + "epoch": 0.8091879911343945, + "grad_norm": 2.7806290534872526, + "learning_rate": 1.848866135930922e-07, + "loss": 0.5085, + "step": 1757 + }, + { + "epoch": 0.8096485420684494, + "grad_norm": 3.0440711729049656, + "learning_rate": 1.8402294727143642e-07, + "loss": 0.4974, + "step": 1758 + }, + { + "epoch": 0.8101090930025042, + "grad_norm": 3.084951995273267, + "learning_rate": 1.831610984455557e-07, + "loss": 0.5661, + "step": 1759 + }, + { + "epoch": 0.8105696439365591, + "grad_norm": 3.0500734438430106, + "learning_rate": 1.8230106903511965e-07, + "loss": 0.4579, + "step": 1760 + }, + { + "epoch": 0.811030194870614, + "grad_norm": 3.2959846911138304, + "learning_rate": 1.814428609557458e-07, + "loss": 0.396, + "step": 1761 + }, + { + "epoch": 0.8114907458046688, + "grad_norm": 3.4228830494829583, + "learning_rate": 1.805864761189949e-07, + "loss": 0.6473, + "step": 1762 + }, + { + "epoch": 0.8119512967387237, + "grad_norm": 3.5275669816599247, + "learning_rate": 1.7973191643236574e-07, + "loss": 0.5014, + "step": 1763 + }, + { + "epoch": 0.8124118476727785, + "grad_norm": 2.655456203502326, + "learning_rate": 1.7887918379929356e-07, + "loss": 0.4284, + "step": 1764 + }, + { + "epoch": 0.8128723986068335, + "grad_norm": 3.119374485962208, + "learning_rate": 1.780282801191425e-07, + "loss": 0.5117, + "step": 1765 + }, + { + "epoch": 0.8133329495408883, + "grad_norm": 3.603722648362608, + "learning_rate": 1.771792072872028e-07, + "loss": 0.5251, + "step": 1766 + }, + { + "epoch": 0.8137935004749431, + "grad_norm": 3.636072681634815, + "learning_rate": 1.7633196719468846e-07, + "loss": 0.5528, + "step": 1767 + }, + { + "epoch": 0.814254051408998, + "grad_norm": 2.9215076629877563, + "learning_rate": 1.7548656172872922e-07, + "loss": 0.6344, + "step": 1768 + }, + { + "epoch": 0.8147146023430529, + "grad_norm": 3.093587532258315, + "learning_rate": 1.746429927723696e-07, + "loss": 0.5363, + "step": 1769 + }, + { + "epoch": 0.8151751532771078, + "grad_norm": 3.202770682033625, + "learning_rate": 1.738012622045635e-07, + "loss": 0.4939, + "step": 1770 + }, + { + "epoch": 0.8156357042111626, + "grad_norm": 3.0790051489322945, + "learning_rate": 1.7296137190016913e-07, + "loss": 0.5565, + "step": 1771 + }, + { + "epoch": 0.8160962551452174, + "grad_norm": 3.522068919897294, + "learning_rate": 1.7212332372994654e-07, + "loss": 0.6042, + "step": 1772 + }, + { + "epoch": 0.8165568060792723, + "grad_norm": 2.947521873551182, + "learning_rate": 1.7128711956055274e-07, + "loss": 0.5351, + "step": 1773 + }, + { + "epoch": 0.8170173570133272, + "grad_norm": 3.197903959240655, + "learning_rate": 1.7045276125453645e-07, + "loss": 0.5098, + "step": 1774 + }, + { + "epoch": 0.8174779079473821, + "grad_norm": 3.3416585809701833, + "learning_rate": 1.6962025067033604e-07, + "loss": 0.5499, + "step": 1775 + }, + { + "epoch": 0.8179384588814369, + "grad_norm": 2.6718382039429756, + "learning_rate": 1.6878958966227363e-07, + "loss": 0.4988, + "step": 1776 + }, + { + "epoch": 0.8183990098154917, + "grad_norm": 2.79487789143777, + "learning_rate": 1.6796078008055225e-07, + "loss": 0.4255, + "step": 1777 + }, + { + "epoch": 0.8188595607495467, + "grad_norm": 2.693027511901247, + "learning_rate": 1.671338237712502e-07, + "loss": 0.5501, + "step": 1778 + }, + { + "epoch": 0.8193201116836015, + "grad_norm": 2.834140007806073, + "learning_rate": 1.6630872257631834e-07, + "loss": 0.3992, + "step": 1779 + }, + { + "epoch": 0.8197806626176564, + "grad_norm": 2.9518201333896883, + "learning_rate": 1.6548547833357573e-07, + "loss": 0.4642, + "step": 1780 + }, + { + "epoch": 0.8202412135517112, + "grad_norm": 3.1159470952195636, + "learning_rate": 1.6466409287670468e-07, + "loss": 0.5313, + "step": 1781 + }, + { + "epoch": 0.8207017644857661, + "grad_norm": 3.257480140561922, + "learning_rate": 1.638445680352476e-07, + "loss": 0.5634, + "step": 1782 + }, + { + "epoch": 0.821162315419821, + "grad_norm": 3.423203304235, + "learning_rate": 1.6302690563460288e-07, + "loss": 0.5939, + "step": 1783 + }, + { + "epoch": 0.8216228663538758, + "grad_norm": 2.954374172187009, + "learning_rate": 1.6221110749601973e-07, + "loss": 0.3948, + "step": 1784 + }, + { + "epoch": 0.8220834172879307, + "grad_norm": 2.9485459890560994, + "learning_rate": 1.613971754365957e-07, + "loss": 0.483, + "step": 1785 + }, + { + "epoch": 0.8225439682219855, + "grad_norm": 3.052068636325435, + "learning_rate": 1.6058511126927176e-07, + "loss": 0.6306, + "step": 1786 + }, + { + "epoch": 0.8230045191560404, + "grad_norm": 3.276425368352071, + "learning_rate": 1.5977491680282762e-07, + "loss": 0.5366, + "step": 1787 + }, + { + "epoch": 0.8234650700900953, + "grad_norm": 3.322933684622075, + "learning_rate": 1.589665938418795e-07, + "loss": 0.5081, + "step": 1788 + }, + { + "epoch": 0.8239256210241501, + "grad_norm": 3.119742625969975, + "learning_rate": 1.581601441868743e-07, + "loss": 0.5368, + "step": 1789 + }, + { + "epoch": 0.824386171958205, + "grad_norm": 2.9653697872519036, + "learning_rate": 1.5735556963408693e-07, + "loss": 0.6565, + "step": 1790 + }, + { + "epoch": 0.8248467228922599, + "grad_norm": 3.373344219441705, + "learning_rate": 1.5655287197561495e-07, + "loss": 0.6218, + "step": 1791 + }, + { + "epoch": 0.8253072738263147, + "grad_norm": 3.1421457828079795, + "learning_rate": 1.5575205299937599e-07, + "loss": 0.5257, + "step": 1792 + }, + { + "epoch": 0.8257678247603696, + "grad_norm": 2.9754341538820426, + "learning_rate": 1.549531144891032e-07, + "loss": 0.5216, + "step": 1793 + }, + { + "epoch": 0.8262283756944244, + "grad_norm": 3.744224931739949, + "learning_rate": 1.5415605822434053e-07, + "loss": 0.5341, + "step": 1794 + }, + { + "epoch": 0.8266889266284794, + "grad_norm": 3.312132421784843, + "learning_rate": 1.5336088598043995e-07, + "loss": 0.4812, + "step": 1795 + }, + { + "epoch": 0.8271494775625342, + "grad_norm": 3.1823045221187805, + "learning_rate": 1.5256759952855737e-07, + "loss": 0.6088, + "step": 1796 + }, + { + "epoch": 0.827610028496589, + "grad_norm": 3.144738416475895, + "learning_rate": 1.5177620063564712e-07, + "loss": 0.626, + "step": 1797 + }, + { + "epoch": 0.8280705794306439, + "grad_norm": 3.09856360625291, + "learning_rate": 1.5098669106446026e-07, + "loss": 0.5281, + "step": 1798 + }, + { + "epoch": 0.8285311303646987, + "grad_norm": 2.8488512325058206, + "learning_rate": 1.5019907257353925e-07, + "loss": 0.4859, + "step": 1799 + }, + { + "epoch": 0.8289916812987537, + "grad_norm": 3.1008236444549166, + "learning_rate": 1.4941334691721474e-07, + "loss": 0.5706, + "step": 1800 + }, + { + "epoch": 0.8294522322328085, + "grad_norm": 3.5681942314685804, + "learning_rate": 1.4862951584560034e-07, + "loss": 0.5725, + "step": 1801 + }, + { + "epoch": 0.8299127831668633, + "grad_norm": 3.479007930611103, + "learning_rate": 1.4784758110459073e-07, + "loss": 0.455, + "step": 1802 + }, + { + "epoch": 0.8303733341009182, + "grad_norm": 3.6942391140034454, + "learning_rate": 1.4706754443585644e-07, + "loss": 0.5998, + "step": 1803 + }, + { + "epoch": 0.8308338850349731, + "grad_norm": 3.0399020070035414, + "learning_rate": 1.4628940757683972e-07, + "loss": 0.531, + "step": 1804 + }, + { + "epoch": 0.831294435969028, + "grad_norm": 3.3824973071742668, + "learning_rate": 1.4551317226075176e-07, + "loss": 0.632, + "step": 1805 + }, + { + "epoch": 0.8317549869030828, + "grad_norm": 3.0111947391524474, + "learning_rate": 1.4473884021656858e-07, + "loss": 0.4523, + "step": 1806 + }, + { + "epoch": 0.8322155378371376, + "grad_norm": 3.245242080771524, + "learning_rate": 1.4396641316902558e-07, + "loss": 0.5705, + "step": 1807 + }, + { + "epoch": 0.8326760887711926, + "grad_norm": 3.101099899015713, + "learning_rate": 1.431958928386169e-07, + "loss": 0.5403, + "step": 1808 + }, + { + "epoch": 0.8331366397052474, + "grad_norm": 3.4188190961039213, + "learning_rate": 1.4242728094158807e-07, + "loss": 0.5086, + "step": 1809 + }, + { + "epoch": 0.8335971906393023, + "grad_norm": 3.179675044847406, + "learning_rate": 1.41660579189934e-07, + "loss": 0.4884, + "step": 1810 + }, + { + "epoch": 0.8340577415733571, + "grad_norm": 3.225968066840041, + "learning_rate": 1.4089578929139635e-07, + "loss": 0.558, + "step": 1811 + }, + { + "epoch": 0.8345182925074119, + "grad_norm": 2.9209325285695433, + "learning_rate": 1.4013291294945652e-07, + "loss": 0.5264, + "step": 1812 + }, + { + "epoch": 0.8349788434414669, + "grad_norm": 3.1770764013810244, + "learning_rate": 1.3937195186333483e-07, + "loss": 0.4391, + "step": 1813 + }, + { + "epoch": 0.8354393943755217, + "grad_norm": 3.5252276429059015, + "learning_rate": 1.3861290772798552e-07, + "loss": 0.6889, + "step": 1814 + }, + { + "epoch": 0.8358999453095766, + "grad_norm": 3.334270364364526, + "learning_rate": 1.378557822340922e-07, + "loss": 0.5445, + "step": 1815 + }, + { + "epoch": 0.8363604962436314, + "grad_norm": 2.8274203446918955, + "learning_rate": 1.3710057706806588e-07, + "loss": 0.4528, + "step": 1816 + }, + { + "epoch": 0.8368210471776864, + "grad_norm": 2.970687258829064, + "learning_rate": 1.3634729391204003e-07, + "loss": 0.4576, + "step": 1817 + }, + { + "epoch": 0.8372815981117412, + "grad_norm": 3.0526123989623404, + "learning_rate": 1.355959344438665e-07, + "loss": 0.4776, + "step": 1818 + }, + { + "epoch": 0.837742149045796, + "grad_norm": 3.108639242698878, + "learning_rate": 1.3484650033711308e-07, + "loss": 0.505, + "step": 1819 + }, + { + "epoch": 0.8382026999798509, + "grad_norm": 3.568732239359027, + "learning_rate": 1.3409899326105856e-07, + "loss": 0.6024, + "step": 1820 + }, + { + "epoch": 0.8386632509139058, + "grad_norm": 3.6924487796854875, + "learning_rate": 1.3335341488068997e-07, + "loss": 0.5641, + "step": 1821 + }, + { + "epoch": 0.8391238018479606, + "grad_norm": 2.7392248909998833, + "learning_rate": 1.3260976685669767e-07, + "loss": 0.4318, + "step": 1822 + }, + { + "epoch": 0.8395843527820155, + "grad_norm": 3.0029706301333032, + "learning_rate": 1.3186805084547292e-07, + "loss": 0.5296, + "step": 1823 + }, + { + "epoch": 0.8400449037160703, + "grad_norm": 2.837543312679349, + "learning_rate": 1.3112826849910374e-07, + "loss": 0.6362, + "step": 1824 + }, + { + "epoch": 0.8405054546501252, + "grad_norm": 3.625903809155881, + "learning_rate": 1.303904214653705e-07, + "loss": 0.637, + "step": 1825 + }, + { + "epoch": 0.8409660055841801, + "grad_norm": 3.0363996469171393, + "learning_rate": 1.2965451138774342e-07, + "loss": 0.4819, + "step": 1826 + }, + { + "epoch": 0.841426556518235, + "grad_norm": 2.98767711754336, + "learning_rate": 1.2892053990537855e-07, + "loss": 0.416, + "step": 1827 + }, + { + "epoch": 0.8418871074522898, + "grad_norm": 3.368814353904181, + "learning_rate": 1.2818850865311304e-07, + "loss": 0.4946, + "step": 1828 + }, + { + "epoch": 0.8423476583863446, + "grad_norm": 2.866982470674382, + "learning_rate": 1.2745841926146328e-07, + "loss": 0.6318, + "step": 1829 + }, + { + "epoch": 0.8428082093203996, + "grad_norm": 3.1759522349648397, + "learning_rate": 1.2673027335662023e-07, + "loss": 0.5292, + "step": 1830 + }, + { + "epoch": 0.8432687602544544, + "grad_norm": 3.6119453846451184, + "learning_rate": 1.2600407256044543e-07, + "loss": 0.5958, + "step": 1831 + }, + { + "epoch": 0.8437293111885092, + "grad_norm": 3.0612740647541714, + "learning_rate": 1.2527981849046855e-07, + "loss": 0.53, + "step": 1832 + }, + { + "epoch": 0.8441898621225641, + "grad_norm": 3.0373834058484523, + "learning_rate": 1.245575127598828e-07, + "loss": 0.4711, + "step": 1833 + }, + { + "epoch": 0.844650413056619, + "grad_norm": 3.368821133890806, + "learning_rate": 1.2383715697754194e-07, + "loss": 0.5848, + "step": 1834 + }, + { + "epoch": 0.8451109639906739, + "grad_norm": 2.6277881136703476, + "learning_rate": 1.23118752747956e-07, + "loss": 0.4662, + "step": 1835 + }, + { + "epoch": 0.8455715149247287, + "grad_norm": 3.3152498659397733, + "learning_rate": 1.224023016712886e-07, + "loss": 0.5397, + "step": 1836 + }, + { + "epoch": 0.8460320658587835, + "grad_norm": 2.820968906592374, + "learning_rate": 1.2168780534335288e-07, + "loss": 0.5026, + "step": 1837 + }, + { + "epoch": 0.8464926167928384, + "grad_norm": 3.677672748447081, + "learning_rate": 1.2097526535560732e-07, + "loss": 0.6399, + "step": 1838 + }, + { + "epoch": 0.8469531677268933, + "grad_norm": 3.009389680786758, + "learning_rate": 1.2026468329515415e-07, + "loss": 0.4806, + "step": 1839 + }, + { + "epoch": 0.8474137186609482, + "grad_norm": 3.3016771762707444, + "learning_rate": 1.1955606074473368e-07, + "loss": 0.5922, + "step": 1840 + }, + { + "epoch": 0.847874269595003, + "grad_norm": 3.161579889786646, + "learning_rate": 1.1884939928272108e-07, + "loss": 0.587, + "step": 1841 + }, + { + "epoch": 0.8483348205290578, + "grad_norm": 3.490485543641311, + "learning_rate": 1.1814470048312508e-07, + "loss": 0.4875, + "step": 1842 + }, + { + "epoch": 0.8487953714631128, + "grad_norm": 3.106076702165179, + "learning_rate": 1.1744196591558153e-07, + "loss": 0.4083, + "step": 1843 + }, + { + "epoch": 0.8492559223971676, + "grad_norm": 3.138124379295683, + "learning_rate": 1.167411971453509e-07, + "loss": 0.5066, + "step": 1844 + }, + { + "epoch": 0.8497164733312225, + "grad_norm": 2.9058322118896642, + "learning_rate": 1.1604239573331653e-07, + "loss": 0.4712, + "step": 1845 + }, + { + "epoch": 0.8501770242652773, + "grad_norm": 3.2851181268981398, + "learning_rate": 1.1534556323597821e-07, + "loss": 0.5974, + "step": 1846 + }, + { + "epoch": 0.8506375751993323, + "grad_norm": 3.000290790230382, + "learning_rate": 1.1465070120545106e-07, + "loss": 0.4775, + "step": 1847 + }, + { + "epoch": 0.8510981261333871, + "grad_norm": 3.3447028114933377, + "learning_rate": 1.1395781118946124e-07, + "loss": 0.59, + "step": 1848 + }, + { + "epoch": 0.8515586770674419, + "grad_norm": 2.6080519147762393, + "learning_rate": 1.1326689473134166e-07, + "loss": 0.4667, + "step": 1849 + }, + { + "epoch": 0.8520192280014968, + "grad_norm": 2.9008718617685556, + "learning_rate": 1.1257795337003007e-07, + "loss": 0.5282, + "step": 1850 + }, + { + "epoch": 0.8524797789355516, + "grad_norm": 3.169497217996663, + "learning_rate": 1.1189098864006486e-07, + "loss": 0.5051, + "step": 1851 + }, + { + "epoch": 0.8529403298696066, + "grad_norm": 3.215162842301655, + "learning_rate": 1.112060020715817e-07, + "loss": 0.6173, + "step": 1852 + }, + { + "epoch": 0.8534008808036614, + "grad_norm": 3.42236331926524, + "learning_rate": 1.1052299519030961e-07, + "loss": 0.48, + "step": 1853 + }, + { + "epoch": 0.8538614317377162, + "grad_norm": 3.045553200483245, + "learning_rate": 1.0984196951756863e-07, + "loss": 0.5809, + "step": 1854 + }, + { + "epoch": 0.8543219826717711, + "grad_norm": 3.0183531859561543, + "learning_rate": 1.0916292657026616e-07, + "loss": 0.5366, + "step": 1855 + }, + { + "epoch": 0.854782533605826, + "grad_norm": 3.233752941923723, + "learning_rate": 1.084858678608922e-07, + "loss": 0.5023, + "step": 1856 + }, + { + "epoch": 0.8552430845398808, + "grad_norm": 3.7031466518660467, + "learning_rate": 1.078107948975181e-07, + "loss": 0.5871, + "step": 1857 + }, + { + "epoch": 0.8557036354739357, + "grad_norm": 3.231664196992226, + "learning_rate": 1.0713770918379206e-07, + "loss": 0.5641, + "step": 1858 + }, + { + "epoch": 0.8561641864079905, + "grad_norm": 2.8655131047972344, + "learning_rate": 1.0646661221893538e-07, + "loss": 0.5098, + "step": 1859 + }, + { + "epoch": 0.8566247373420455, + "grad_norm": 3.01957958338486, + "learning_rate": 1.0579750549773992e-07, + "loss": 0.5884, + "step": 1860 + }, + { + "epoch": 0.8570852882761003, + "grad_norm": 2.8146657597507017, + "learning_rate": 1.0513039051056504e-07, + "loss": 0.5192, + "step": 1861 + }, + { + "epoch": 0.8575458392101551, + "grad_norm": 3.579182844425862, + "learning_rate": 1.0446526874333262e-07, + "loss": 0.4323, + "step": 1862 + }, + { + "epoch": 0.85800639014421, + "grad_norm": 3.1443912809179, + "learning_rate": 1.0380214167752588e-07, + "loss": 0.5651, + "step": 1863 + }, + { + "epoch": 0.8584669410782648, + "grad_norm": 3.1366442156691607, + "learning_rate": 1.0314101079018456e-07, + "loss": 0.597, + "step": 1864 + }, + { + "epoch": 0.8589274920123198, + "grad_norm": 3.0110739023780178, + "learning_rate": 1.0248187755390247e-07, + "loss": 0.4957, + "step": 1865 + }, + { + "epoch": 0.8593880429463746, + "grad_norm": 3.1211140110824656, + "learning_rate": 1.0182474343682346e-07, + "loss": 0.5807, + "step": 1866 + }, + { + "epoch": 0.8598485938804294, + "grad_norm": 3.196850276682387, + "learning_rate": 1.0116960990263879e-07, + "loss": 0.432, + "step": 1867 + }, + { + "epoch": 0.8603091448144843, + "grad_norm": 3.0649211283399143, + "learning_rate": 1.0051647841058385e-07, + "loss": 0.4683, + "step": 1868 + }, + { + "epoch": 0.8607696957485392, + "grad_norm": 3.0704976261787276, + "learning_rate": 9.986535041543409e-08, + "loss": 0.5084, + "step": 1869 + }, + { + "epoch": 0.8612302466825941, + "grad_norm": 2.8430066136777223, + "learning_rate": 9.921622736750345e-08, + "loss": 0.4893, + "step": 1870 + }, + { + "epoch": 0.8616907976166489, + "grad_norm": 3.1871422194117915, + "learning_rate": 9.856911071263918e-08, + "loss": 0.5582, + "step": 1871 + }, + { + "epoch": 0.8621513485507037, + "grad_norm": 3.1695517066788335, + "learning_rate": 9.792400189221927e-08, + "loss": 0.5491, + "step": 1872 + }, + { + "epoch": 0.8626118994847587, + "grad_norm": 3.192178213736622, + "learning_rate": 9.72809023431509e-08, + "loss": 0.5684, + "step": 1873 + }, + { + "epoch": 0.8630724504188135, + "grad_norm": 3.5963603776479833, + "learning_rate": 9.663981349786443e-08, + "loss": 0.5006, + "step": 1874 + }, + { + "epoch": 0.8635330013528684, + "grad_norm": 2.9762857754260996, + "learning_rate": 9.600073678431186e-08, + "loss": 0.5183, + "step": 1875 + }, + { + "epoch": 0.8639935522869232, + "grad_norm": 3.512211825383684, + "learning_rate": 9.53636736259642e-08, + "loss": 0.5459, + "step": 1876 + }, + { + "epoch": 0.864454103220978, + "grad_norm": 2.959882910299774, + "learning_rate": 9.472862544180659e-08, + "loss": 0.538, + "step": 1877 + }, + { + "epoch": 0.864914654155033, + "grad_norm": 3.201929324826627, + "learning_rate": 9.409559364633646e-08, + "loss": 0.4949, + "step": 1878 + }, + { + "epoch": 0.8653752050890878, + "grad_norm": 3.2818565007848703, + "learning_rate": 9.346457964956023e-08, + "loss": 0.5904, + "step": 1879 + }, + { + "epoch": 0.8658357560231427, + "grad_norm": 3.1114338599754374, + "learning_rate": 9.283558485698894e-08, + "loss": 0.4755, + "step": 1880 + }, + { + "epoch": 0.8662963069571975, + "grad_norm": 2.923497203537618, + "learning_rate": 9.220861066963715e-08, + "loss": 0.4812, + "step": 1881 + }, + { + "epoch": 0.8667568578912525, + "grad_norm": 3.199024742719346, + "learning_rate": 9.158365848401817e-08, + "loss": 0.546, + "step": 1882 + }, + { + "epoch": 0.8672174088253073, + "grad_norm": 2.9843247776927257, + "learning_rate": 9.096072969214197e-08, + "loss": 0.543, + "step": 1883 + }, + { + "epoch": 0.8676779597593621, + "grad_norm": 3.0691692564049653, + "learning_rate": 9.0339825681511e-08, + "loss": 0.5088, + "step": 1884 + }, + { + "epoch": 0.868138510693417, + "grad_norm": 3.383333358394925, + "learning_rate": 8.972094783511807e-08, + "loss": 0.6416, + "step": 1885 + }, + { + "epoch": 0.8685990616274719, + "grad_norm": 3.0949849503575773, + "learning_rate": 8.910409753144344e-08, + "loss": 0.4999, + "step": 1886 + }, + { + "epoch": 0.8690596125615268, + "grad_norm": 3.273519298979524, + "learning_rate": 8.848927614445011e-08, + "loss": 0.5933, + "step": 1887 + }, + { + "epoch": 0.8695201634955816, + "grad_norm": 3.0038213358417183, + "learning_rate": 8.787648504358291e-08, + "loss": 0.4551, + "step": 1888 + }, + { + "epoch": 0.8699807144296364, + "grad_norm": 3.1769567170597615, + "learning_rate": 8.726572559376433e-08, + "loss": 0.6628, + "step": 1889 + }, + { + "epoch": 0.8704412653636913, + "grad_norm": 3.3453327234543067, + "learning_rate": 8.665699915539094e-08, + "loss": 0.656, + "step": 1890 + }, + { + "epoch": 0.8709018162977462, + "grad_norm": 3.0704691251867513, + "learning_rate": 8.605030708433147e-08, + "loss": 0.5198, + "step": 1891 + }, + { + "epoch": 0.871362367231801, + "grad_norm": 3.019643595841549, + "learning_rate": 8.544565073192367e-08, + "loss": 0.4624, + "step": 1892 + }, + { + "epoch": 0.8718229181658559, + "grad_norm": 3.08664418112486, + "learning_rate": 8.484303144497007e-08, + "loss": 0.49, + "step": 1893 + }, + { + "epoch": 0.8722834690999107, + "grad_norm": 3.262560025269264, + "learning_rate": 8.424245056573653e-08, + "loss": 0.5118, + "step": 1894 + }, + { + "epoch": 0.8727440200339657, + "grad_norm": 3.0452114320652086, + "learning_rate": 8.364390943194855e-08, + "loss": 0.5284, + "step": 1895 + }, + { + "epoch": 0.8732045709680205, + "grad_norm": 3.2005415403238366, + "learning_rate": 8.304740937678833e-08, + "loss": 0.6092, + "step": 1896 + }, + { + "epoch": 0.8736651219020753, + "grad_norm": 3.1426226729757794, + "learning_rate": 8.245295172889121e-08, + "loss": 0.4615, + "step": 1897 + }, + { + "epoch": 0.8741256728361302, + "grad_norm": 2.9393983445977403, + "learning_rate": 8.186053781234414e-08, + "loss": 0.4892, + "step": 1898 + }, + { + "epoch": 0.8745862237701851, + "grad_norm": 2.745952741647922, + "learning_rate": 8.12701689466816e-08, + "loss": 0.5493, + "step": 1899 + }, + { + "epoch": 0.87504677470424, + "grad_norm": 2.948466311290946, + "learning_rate": 8.068184644688248e-08, + "loss": 0.474, + "step": 1900 + }, + { + "epoch": 0.8755073256382948, + "grad_norm": 3.172245629853397, + "learning_rate": 8.009557162336822e-08, + "loss": 0.5008, + "step": 1901 + }, + { + "epoch": 0.8759678765723496, + "grad_norm": 3.2352168665011853, + "learning_rate": 7.951134578199925e-08, + "loss": 0.4891, + "step": 1902 + }, + { + "epoch": 0.8764284275064045, + "grad_norm": 3.2067263063797866, + "learning_rate": 7.892917022407153e-08, + "loss": 0.5522, + "step": 1903 + }, + { + "epoch": 0.8768889784404594, + "grad_norm": 3.389480208236002, + "learning_rate": 7.834904624631523e-08, + "loss": 0.6624, + "step": 1904 + }, + { + "epoch": 0.8773495293745143, + "grad_norm": 3.2052256238249903, + "learning_rate": 7.777097514089014e-08, + "loss": 0.6098, + "step": 1905 + }, + { + "epoch": 0.8778100803085691, + "grad_norm": 3.2221846053421337, + "learning_rate": 7.719495819538324e-08, + "loss": 0.5657, + "step": 1906 + }, + { + "epoch": 0.878270631242624, + "grad_norm": 3.018523302700844, + "learning_rate": 7.66209966928072e-08, + "loss": 0.4995, + "step": 1907 + }, + { + "epoch": 0.8787311821766789, + "grad_norm": 3.5183165482279417, + "learning_rate": 7.604909191159537e-08, + "loss": 0.5429, + "step": 1908 + }, + { + "epoch": 0.8791917331107337, + "grad_norm": 3.1284052208629416, + "learning_rate": 7.547924512560044e-08, + "loss": 0.6085, + "step": 1909 + }, + { + "epoch": 0.8796522840447886, + "grad_norm": 2.985628461533933, + "learning_rate": 7.491145760409134e-08, + "loss": 0.6259, + "step": 1910 + }, + { + "epoch": 0.8801128349788434, + "grad_norm": 3.340901063995122, + "learning_rate": 7.434573061174965e-08, + "loss": 0.6032, + "step": 1911 + }, + { + "epoch": 0.8805733859128984, + "grad_norm": 3.0741105448726027, + "learning_rate": 7.378206540866783e-08, + "loss": 0.5554, + "step": 1912 + }, + { + "epoch": 0.8810339368469532, + "grad_norm": 2.9332744072817922, + "learning_rate": 7.322046325034603e-08, + "loss": 0.4567, + "step": 1913 + }, + { + "epoch": 0.881494487781008, + "grad_norm": 2.8092291752801857, + "learning_rate": 7.266092538768853e-08, + "loss": 0.5092, + "step": 1914 + }, + { + "epoch": 0.8819550387150629, + "grad_norm": 3.438214393413268, + "learning_rate": 7.210345306700238e-08, + "loss": 0.563, + "step": 1915 + }, + { + "epoch": 0.8824155896491177, + "grad_norm": 3.2745821627508565, + "learning_rate": 7.154804752999344e-08, + "loss": 0.486, + "step": 1916 + }, + { + "epoch": 0.8828761405831727, + "grad_norm": 2.892400504571004, + "learning_rate": 7.099471001376434e-08, + "loss": 0.5935, + "step": 1917 + }, + { + "epoch": 0.8833366915172275, + "grad_norm": 3.2933601567721387, + "learning_rate": 7.044344175081107e-08, + "loss": 0.6022, + "step": 1918 + }, + { + "epoch": 0.8837972424512823, + "grad_norm": 3.4349342024607052, + "learning_rate": 6.989424396902078e-08, + "loss": 0.6374, + "step": 1919 + }, + { + "epoch": 0.8842577933853372, + "grad_norm": 3.1507821922545203, + "learning_rate": 6.934711789166902e-08, + "loss": 0.5632, + "step": 1920 + }, + { + "epoch": 0.8847183443193921, + "grad_norm": 3.089402889899754, + "learning_rate": 6.880206473741646e-08, + "loss": 0.5506, + "step": 1921 + }, + { + "epoch": 0.885178895253447, + "grad_norm": 3.181790361740718, + "learning_rate": 6.825908572030703e-08, + "loss": 0.6158, + "step": 1922 + }, + { + "epoch": 0.8856394461875018, + "grad_norm": 3.0742747552763077, + "learning_rate": 6.771818204976453e-08, + "loss": 0.5871, + "step": 1923 + }, + { + "epoch": 0.8860999971215566, + "grad_norm": 3.010007045848609, + "learning_rate": 6.71793549305899e-08, + "loss": 0.5538, + "step": 1924 + }, + { + "epoch": 0.8865605480556116, + "grad_norm": 3.3014591830340225, + "learning_rate": 6.66426055629593e-08, + "loss": 0.5739, + "step": 1925 + }, + { + "epoch": 0.8870210989896664, + "grad_norm": 2.8280186580117337, + "learning_rate": 6.610793514242074e-08, + "loss": 0.5761, + "step": 1926 + }, + { + "epoch": 0.8874816499237212, + "grad_norm": 2.8955154098471696, + "learning_rate": 6.557534485989135e-08, + "loss": 0.4836, + "step": 1927 + }, + { + "epoch": 0.8879422008577761, + "grad_norm": 3.2863599262582546, + "learning_rate": 6.504483590165533e-08, + "loss": 0.6659, + "step": 1928 + }, + { + "epoch": 0.8884027517918309, + "grad_norm": 2.7810076021075325, + "learning_rate": 6.451640944936087e-08, + "loss": 0.5125, + "step": 1929 + }, + { + "epoch": 0.8888633027258859, + "grad_norm": 2.829017960020371, + "learning_rate": 6.399006668001772e-08, + "loss": 0.4712, + "step": 1930 + }, + { + "epoch": 0.8893238536599407, + "grad_norm": 3.1200160087326485, + "learning_rate": 6.346580876599394e-08, + "loss": 0.5241, + "step": 1931 + }, + { + "epoch": 0.8897844045939955, + "grad_norm": 3.284501857717466, + "learning_rate": 6.294363687501459e-08, + "loss": 0.4956, + "step": 1932 + }, + { + "epoch": 0.8902449555280504, + "grad_norm": 3.0368507746622497, + "learning_rate": 6.242355217015793e-08, + "loss": 0.5335, + "step": 1933 + }, + { + "epoch": 0.8907055064621053, + "grad_norm": 2.707473642304596, + "learning_rate": 6.190555580985291e-08, + "loss": 0.4174, + "step": 1934 + }, + { + "epoch": 0.8911660573961602, + "grad_norm": 3.31398125775273, + "learning_rate": 6.138964894787802e-08, + "loss": 0.5595, + "step": 1935 + }, + { + "epoch": 0.891626608330215, + "grad_norm": 3.113081540988843, + "learning_rate": 6.08758327333564e-08, + "loss": 0.5791, + "step": 1936 + }, + { + "epoch": 0.8920871592642698, + "grad_norm": 2.8145565837317608, + "learning_rate": 6.036410831075489e-08, + "loss": 0.5162, + "step": 1937 + }, + { + "epoch": 0.8925477101983248, + "grad_norm": 3.196959966502691, + "learning_rate": 5.985447681988187e-08, + "loss": 0.656, + "step": 1938 + }, + { + "epoch": 0.8930082611323796, + "grad_norm": 3.2061347516327476, + "learning_rate": 5.934693939588276e-08, + "loss": 0.5659, + "step": 1939 + }, + { + "epoch": 0.8934688120664345, + "grad_norm": 2.7407334976039937, + "learning_rate": 5.884149716923947e-08, + "loss": 0.5263, + "step": 1940 + }, + { + "epoch": 0.8939293630004893, + "grad_norm": 3.0639659095757925, + "learning_rate": 5.833815126576713e-08, + "loss": 0.4792, + "step": 1941 + }, + { + "epoch": 0.8943899139345441, + "grad_norm": 3.705402334304555, + "learning_rate": 5.78369028066108e-08, + "loss": 0.5598, + "step": 1942 + }, + { + "epoch": 0.8948504648685991, + "grad_norm": 3.314179143806477, + "learning_rate": 5.7337752908244604e-08, + "loss": 0.6057, + "step": 1943 + }, + { + "epoch": 0.8953110158026539, + "grad_norm": 3.093901085439443, + "learning_rate": 5.684070268246799e-08, + "loss": 0.5189, + "step": 1944 + }, + { + "epoch": 0.8957715667367088, + "grad_norm": 2.9306316685724814, + "learning_rate": 5.634575323640334e-08, + "loss": 0.518, + "step": 1945 + }, + { + "epoch": 0.8962321176707636, + "grad_norm": 2.6131420700596744, + "learning_rate": 5.5852905672494235e-08, + "loss": 0.4514, + "step": 1946 + }, + { + "epoch": 0.8966926686048186, + "grad_norm": 3.4838526888557224, + "learning_rate": 5.5362161088502335e-08, + "loss": 0.6153, + "step": 1947 + }, + { + "epoch": 0.8971532195388734, + "grad_norm": 2.7410462665447732, + "learning_rate": 5.487352057750538e-08, + "loss": 0.5487, + "step": 1948 + }, + { + "epoch": 0.8976137704729282, + "grad_norm": 3.323249823556552, + "learning_rate": 5.438698522789409e-08, + "loss": 0.4776, + "step": 1949 + }, + { + "epoch": 0.8980743214069831, + "grad_norm": 3.3914954871956207, + "learning_rate": 5.390255612337058e-08, + "loss": 0.5644, + "step": 1950 + }, + { + "epoch": 0.898534872341038, + "grad_norm": 3.453540431554986, + "learning_rate": 5.3420234342945515e-08, + "loss": 0.5669, + "step": 1951 + }, + { + "epoch": 0.8989954232750929, + "grad_norm": 3.2848747343127696, + "learning_rate": 5.2940020960935416e-08, + "loss": 0.6555, + "step": 1952 + }, + { + "epoch": 0.8994559742091477, + "grad_norm": 3.335247069769459, + "learning_rate": 5.246191704696079e-08, + "loss": 0.5593, + "step": 1953 + }, + { + "epoch": 0.8999165251432025, + "grad_norm": 2.373713227217355, + "learning_rate": 5.1985923665943787e-08, + "loss": 0.3408, + "step": 1954 + }, + { + "epoch": 0.9003770760772574, + "grad_norm": 3.2307700903116334, + "learning_rate": 5.1512041878105095e-08, + "loss": 0.5363, + "step": 1955 + }, + { + "epoch": 0.9008376270113123, + "grad_norm": 3.2016297093263817, + "learning_rate": 5.104027273896239e-08, + "loss": 0.5384, + "step": 1956 + }, + { + "epoch": 0.9012981779453672, + "grad_norm": 3.20013985926728, + "learning_rate": 5.057061729932777e-08, + "loss": 0.4193, + "step": 1957 + }, + { + "epoch": 0.901758728879422, + "grad_norm": 2.7276925258485436, + "learning_rate": 5.0103076605304885e-08, + "loss": 0.5143, + "step": 1958 + }, + { + "epoch": 0.9022192798134768, + "grad_norm": 2.9954612745734903, + "learning_rate": 4.963765169828737e-08, + "loss": 0.4886, + "step": 1959 + }, + { + "epoch": 0.9026798307475318, + "grad_norm": 3.1663609715145102, + "learning_rate": 4.917434361495609e-08, + "loss": 0.529, + "step": 1960 + }, + { + "epoch": 0.9031403816815866, + "grad_norm": 3.1213827237745333, + "learning_rate": 4.871315338727711e-08, + "loss": 0.5155, + "step": 1961 + }, + { + "epoch": 0.9036009326156415, + "grad_norm": 2.7436550688979837, + "learning_rate": 4.825408204249881e-08, + "loss": 0.4606, + "step": 1962 + }, + { + "epoch": 0.9040614835496963, + "grad_norm": 3.249120899149753, + "learning_rate": 4.779713060315016e-08, + "loss": 0.5054, + "step": 1963 + }, + { + "epoch": 0.9045220344837512, + "grad_norm": 2.90283494064835, + "learning_rate": 4.734230008703877e-08, + "loss": 0.5579, + "step": 1964 + }, + { + "epoch": 0.9049825854178061, + "grad_norm": 3.4042002585801865, + "learning_rate": 4.688959150724703e-08, + "loss": 0.5775, + "step": 1965 + }, + { + "epoch": 0.9054431363518609, + "grad_norm": 3.1250557810610182, + "learning_rate": 4.6439005872132454e-08, + "loss": 0.6137, + "step": 1966 + }, + { + "epoch": 0.9059036872859157, + "grad_norm": 3.1195028338428226, + "learning_rate": 4.599054418532267e-08, + "loss": 0.561, + "step": 1967 + }, + { + "epoch": 0.9063642382199706, + "grad_norm": 2.8851519193897643, + "learning_rate": 4.554420744571463e-08, + "loss": 0.5235, + "step": 1968 + }, + { + "epoch": 0.9068247891540255, + "grad_norm": 3.6481537119175997, + "learning_rate": 4.5099996647473215e-08, + "loss": 0.4759, + "step": 1969 + }, + { + "epoch": 0.9072853400880804, + "grad_norm": 2.790296368178869, + "learning_rate": 4.465791278002684e-08, + "loss": 0.4618, + "step": 1970 + }, + { + "epoch": 0.9077458910221352, + "grad_norm": 3.556677402896051, + "learning_rate": 4.4217956828066614e-08, + "loss": 0.5706, + "step": 1971 + }, + { + "epoch": 0.90820644195619, + "grad_norm": 2.978589496267026, + "learning_rate": 4.3780129771544885e-08, + "loss": 0.538, + "step": 1972 + }, + { + "epoch": 0.908666992890245, + "grad_norm": 3.6725000383644386, + "learning_rate": 4.3344432585670886e-08, + "loss": 0.6891, + "step": 1973 + }, + { + "epoch": 0.9091275438242998, + "grad_norm": 3.2917964842156717, + "learning_rate": 4.291086624091067e-08, + "loss": 0.5737, + "step": 1974 + }, + { + "epoch": 0.9095880947583547, + "grad_norm": 3.0743766947515088, + "learning_rate": 4.2479431702983845e-08, + "loss": 0.5731, + "step": 1975 + }, + { + "epoch": 0.9100486456924095, + "grad_norm": 3.0506298198871993, + "learning_rate": 4.205012993286139e-08, + "loss": 0.4587, + "step": 1976 + }, + { + "epoch": 0.9105091966264645, + "grad_norm": 3.0219223404896907, + "learning_rate": 4.162296188676417e-08, + "loss": 0.527, + "step": 1977 + }, + { + "epoch": 0.9109697475605193, + "grad_norm": 2.79095841731694, + "learning_rate": 4.119792851616022e-08, + "loss": 0.4888, + "step": 1978 + }, + { + "epoch": 0.9114302984945741, + "grad_norm": 3.4172455118807283, + "learning_rate": 4.0775030767762895e-08, + "loss": 0.5758, + "step": 1979 + }, + { + "epoch": 0.911890849428629, + "grad_norm": 3.239863460054807, + "learning_rate": 4.035426958352861e-08, + "loss": 0.5547, + "step": 1980 + }, + { + "epoch": 0.9123514003626838, + "grad_norm": 2.8654349056505017, + "learning_rate": 3.99356459006549e-08, + "loss": 0.438, + "step": 1981 + }, + { + "epoch": 0.9128119512967388, + "grad_norm": 2.9491143905443065, + "learning_rate": 3.9519160651578456e-08, + "loss": 0.5215, + "step": 1982 + }, + { + "epoch": 0.9132725022307936, + "grad_norm": 3.590224655674636, + "learning_rate": 3.910481476397231e-08, + "loss": 0.5205, + "step": 1983 + }, + { + "epoch": 0.9137330531648484, + "grad_norm": 3.0897242283731816, + "learning_rate": 3.8692609160744796e-08, + "loss": 0.4683, + "step": 1984 + }, + { + "epoch": 0.9141936040989033, + "grad_norm": 2.6174259753352773, + "learning_rate": 3.8282544760037005e-08, + "loss": 0.5923, + "step": 1985 + }, + { + "epoch": 0.9146541550329582, + "grad_norm": 3.281785734259912, + "learning_rate": 3.787462247522033e-08, + "loss": 0.5454, + "step": 1986 + }, + { + "epoch": 0.915114705967013, + "grad_norm": 3.0957714777119003, + "learning_rate": 3.74688432148953e-08, + "loss": 0.4839, + "step": 1987 + }, + { + "epoch": 0.9155752569010679, + "grad_norm": 3.2146157875749735, + "learning_rate": 3.7065207882888915e-08, + "loss": 0.5671, + "step": 1988 + }, + { + "epoch": 0.9160358078351227, + "grad_norm": 3.2632112706777097, + "learning_rate": 3.666371737825269e-08, + "loss": 0.5403, + "step": 1989 + }, + { + "epoch": 0.9164963587691777, + "grad_norm": 2.588003078548097, + "learning_rate": 3.626437259526094e-08, + "loss": 0.4528, + "step": 1990 + }, + { + "epoch": 0.9169569097032325, + "grad_norm": 3.5583915478306025, + "learning_rate": 3.58671744234087e-08, + "loss": 0.5301, + "step": 1991 + }, + { + "epoch": 0.9174174606372874, + "grad_norm": 2.97658613622478, + "learning_rate": 3.54721237474096e-08, + "loss": 0.5364, + "step": 1992 + }, + { + "epoch": 0.9178780115713422, + "grad_norm": 3.005308580137038, + "learning_rate": 3.5079221447193665e-08, + "loss": 0.5161, + "step": 1993 + }, + { + "epoch": 0.918338562505397, + "grad_norm": 3.4338709756555335, + "learning_rate": 3.468846839790629e-08, + "loss": 0.5028, + "step": 1994 + }, + { + "epoch": 0.918799113439452, + "grad_norm": 3.022995712791708, + "learning_rate": 3.4299865469905156e-08, + "loss": 0.4699, + "step": 1995 + }, + { + "epoch": 0.9192596643735068, + "grad_norm": 3.0589636260878654, + "learning_rate": 3.391341352875887e-08, + "loss": 0.5397, + "step": 1996 + }, + { + "epoch": 0.9197202153075617, + "grad_norm": 3.029785621200751, + "learning_rate": 3.3529113435245e-08, + "loss": 0.5563, + "step": 1997 + }, + { + "epoch": 0.9201807662416165, + "grad_norm": 3.12537296154988, + "learning_rate": 3.314696604534839e-08, + "loss": 0.559, + "step": 1998 + }, + { + "epoch": 0.9206413171756714, + "grad_norm": 3.416953532596617, + "learning_rate": 3.276697221025848e-08, + "loss": 0.5028, + "step": 1999 + }, + { + "epoch": 0.9211018681097263, + "grad_norm": 3.7639797946349485, + "learning_rate": 3.238913277636846e-08, + "loss": 0.4885, + "step": 2000 + }, + { + "epoch": 0.9215624190437811, + "grad_norm": 3.355422997580863, + "learning_rate": 3.201344858527233e-08, + "loss": 0.5052, + "step": 2001 + }, + { + "epoch": 0.922022969977836, + "grad_norm": 3.1382941450648425, + "learning_rate": 3.163992047376374e-08, + "loss": 0.5372, + "step": 2002 + }, + { + "epoch": 0.9224835209118909, + "grad_norm": 3.4395472891340377, + "learning_rate": 3.126854927383416e-08, + "loss": 0.5008, + "step": 2003 + }, + { + "epoch": 0.9229440718459457, + "grad_norm": 3.1440121431768078, + "learning_rate": 3.089933581267024e-08, + "loss": 0.6216, + "step": 2004 + }, + { + "epoch": 0.9234046227800006, + "grad_norm": 3.519897694068012, + "learning_rate": 3.053228091265314e-08, + "loss": 0.5959, + "step": 2005 + }, + { + "epoch": 0.9238651737140554, + "grad_norm": 3.1572535401512756, + "learning_rate": 3.016738539135566e-08, + "loss": 0.4811, + "step": 2006 + }, + { + "epoch": 0.9243257246481102, + "grad_norm": 3.2199243277505616, + "learning_rate": 2.980465006154076e-08, + "loss": 0.5451, + "step": 2007 + }, + { + "epoch": 0.9247862755821652, + "grad_norm": 3.3241287834796096, + "learning_rate": 2.9444075731160256e-08, + "loss": 0.6162, + "step": 2008 + }, + { + "epoch": 0.92524682651622, + "grad_norm": 3.164226428742799, + "learning_rate": 2.908566320335215e-08, + "loss": 0.5484, + "step": 2009 + }, + { + "epoch": 0.9257073774502749, + "grad_norm": 3.4187086885851774, + "learning_rate": 2.872941327643963e-08, + "loss": 0.5215, + "step": 2010 + }, + { + "epoch": 0.9261679283843297, + "grad_norm": 3.33801664810882, + "learning_rate": 2.837532674392862e-08, + "loss": 0.4978, + "step": 2011 + }, + { + "epoch": 0.9266284793183847, + "grad_norm": 3.1540649821799573, + "learning_rate": 2.8023404394506345e-08, + "loss": 0.5169, + "step": 2012 + }, + { + "epoch": 0.9270890302524395, + "grad_norm": 3.332203820351019, + "learning_rate": 2.767364701204e-08, + "loss": 0.4781, + "step": 2013 + }, + { + "epoch": 0.9275495811864943, + "grad_norm": 3.0643310804507253, + "learning_rate": 2.7326055375573976e-08, + "loss": 0.5149, + "step": 2014 + }, + { + "epoch": 0.9280101321205492, + "grad_norm": 2.841374181617664, + "learning_rate": 2.6980630259329063e-08, + "loss": 0.5501, + "step": 2015 + }, + { + "epoch": 0.9284706830546041, + "grad_norm": 3.3194075100355835, + "learning_rate": 2.6637372432700476e-08, + "loss": 0.6163, + "step": 2016 + }, + { + "epoch": 0.928931233988659, + "grad_norm": 3.0237095704445633, + "learning_rate": 2.629628266025552e-08, + "loss": 0.5245, + "step": 2017 + }, + { + "epoch": 0.9293917849227138, + "grad_norm": 3.0048824665623566, + "learning_rate": 2.5957361701732904e-08, + "loss": 0.7035, + "step": 2018 + }, + { + "epoch": 0.9298523358567686, + "grad_norm": 2.850832918602004, + "learning_rate": 2.5620610312040436e-08, + "loss": 0.4749, + "step": 2019 + }, + { + "epoch": 0.9303128867908235, + "grad_norm": 2.889924325240167, + "learning_rate": 2.528602924125334e-08, + "loss": 0.4982, + "step": 2020 + }, + { + "epoch": 0.9307734377248784, + "grad_norm": 3.097179425048999, + "learning_rate": 2.495361923461281e-08, + "loss": 0.57, + "step": 2021 + }, + { + "epoch": 0.9312339886589333, + "grad_norm": 3.195072157476586, + "learning_rate": 2.462338103252415e-08, + "loss": 0.6917, + "step": 2022 + }, + { + "epoch": 0.9316945395929881, + "grad_norm": 2.964114672020939, + "learning_rate": 2.4295315370555402e-08, + "loss": 0.4675, + "step": 2023 + }, + { + "epoch": 0.9321550905270429, + "grad_norm": 2.698298386130625, + "learning_rate": 2.3969422979435162e-08, + "loss": 0.4485, + "step": 2024 + }, + { + "epoch": 0.9326156414610979, + "grad_norm": 3.05208848367821, + "learning_rate": 2.3645704585051775e-08, + "loss": 0.6052, + "step": 2025 + }, + { + "epoch": 0.9330761923951527, + "grad_norm": 3.626805603045707, + "learning_rate": 2.3324160908451017e-08, + "loss": 0.458, + "step": 2026 + }, + { + "epoch": 0.9335367433292076, + "grad_norm": 3.2241633697370413, + "learning_rate": 2.300479266583455e-08, + "loss": 0.5033, + "step": 2027 + }, + { + "epoch": 0.9339972942632624, + "grad_norm": 3.5953424649745735, + "learning_rate": 2.2687600568558785e-08, + "loss": 0.7304, + "step": 2028 + }, + { + "epoch": 0.9344578451973173, + "grad_norm": 3.8158227802153952, + "learning_rate": 2.237258532313302e-08, + "loss": 0.5633, + "step": 2029 + }, + { + "epoch": 0.9349183961313722, + "grad_norm": 3.040372697625964, + "learning_rate": 2.205974763121754e-08, + "loss": 0.5436, + "step": 2030 + }, + { + "epoch": 0.935378947065427, + "grad_norm": 2.9246000563345866, + "learning_rate": 2.1749088189622844e-08, + "loss": 0.5782, + "step": 2031 + }, + { + "epoch": 0.9358394979994819, + "grad_norm": 3.1558727218247244, + "learning_rate": 2.144060769030742e-08, + "loss": 0.5921, + "step": 2032 + }, + { + "epoch": 0.9363000489335367, + "grad_norm": 2.9122079524337754, + "learning_rate": 2.113430682037598e-08, + "loss": 0.5348, + "step": 2033 + }, + { + "epoch": 0.9367605998675916, + "grad_norm": 2.5503478146770244, + "learning_rate": 2.083018626207933e-08, + "loss": 0.4812, + "step": 2034 + }, + { + "epoch": 0.9372211508016465, + "grad_norm": 3.2463091586540958, + "learning_rate": 2.0528246692810835e-08, + "loss": 0.5133, + "step": 2035 + }, + { + "epoch": 0.9376817017357013, + "grad_norm": 2.7033666955582225, + "learning_rate": 2.0228488785106634e-08, + "loss": 0.5074, + "step": 2036 + }, + { + "epoch": 0.9381422526697561, + "grad_norm": 3.3458142554070736, + "learning_rate": 1.9930913206643306e-08, + "loss": 0.5898, + "step": 2037 + }, + { + "epoch": 0.9386028036038111, + "grad_norm": 3.0937605106073947, + "learning_rate": 1.9635520620236323e-08, + "loss": 0.4866, + "step": 2038 + }, + { + "epoch": 0.9390633545378659, + "grad_norm": 3.3805419118935545, + "learning_rate": 1.934231168383915e-08, + "loss": 0.6963, + "step": 2039 + }, + { + "epoch": 0.9395239054719208, + "grad_norm": 2.9416660248540536, + "learning_rate": 1.9051287050541263e-08, + "loss": 0.5596, + "step": 2040 + }, + { + "epoch": 0.9399844564059756, + "grad_norm": 2.5583360386888883, + "learning_rate": 1.876244736856658e-08, + "loss": 0.3893, + "step": 2041 + }, + { + "epoch": 0.9404450073400306, + "grad_norm": 3.602394844208884, + "learning_rate": 1.847579328127269e-08, + "loss": 0.6501, + "step": 2042 + }, + { + "epoch": 0.9409055582740854, + "grad_norm": 3.278596111429843, + "learning_rate": 1.819132542714874e-08, + "loss": 0.5125, + "step": 2043 + }, + { + "epoch": 0.9413661092081402, + "grad_norm": 3.2792307923445794, + "learning_rate": 1.790904443981478e-08, + "loss": 0.546, + "step": 2044 + }, + { + "epoch": 0.9418266601421951, + "grad_norm": 3.354526716736895, + "learning_rate": 1.7628950948018974e-08, + "loss": 0.5228, + "step": 2045 + }, + { + "epoch": 0.9422872110762499, + "grad_norm": 3.2620023435160395, + "learning_rate": 1.7351045575638044e-08, + "loss": 0.4842, + "step": 2046 + }, + { + "epoch": 0.9427477620103049, + "grad_norm": 3.509391862715867, + "learning_rate": 1.7075328941674295e-08, + "loss": 0.5415, + "step": 2047 + }, + { + "epoch": 0.9432083129443597, + "grad_norm": 2.9644652855209945, + "learning_rate": 1.680180166025513e-08, + "loss": 0.5368, + "step": 2048 + }, + { + "epoch": 0.9436688638784145, + "grad_norm": 3.103421227871461, + "learning_rate": 1.653046434063121e-08, + "loss": 0.6085, + "step": 2049 + }, + { + "epoch": 0.9441294148124694, + "grad_norm": 3.2771987160259877, + "learning_rate": 1.626131758717575e-08, + "loss": 0.5967, + "step": 2050 + }, + { + "epoch": 0.9445899657465243, + "grad_norm": 2.840695547956422, + "learning_rate": 1.59943619993822e-08, + "loss": 0.5215, + "step": 2051 + }, + { + "epoch": 0.9450505166805792, + "grad_norm": 3.5326470386345545, + "learning_rate": 1.572959817186359e-08, + "loss": 0.5683, + "step": 2052 + }, + { + "epoch": 0.945511067614634, + "grad_norm": 3.4768749960533127, + "learning_rate": 1.5467026694351404e-08, + "loss": 0.5466, + "step": 2053 + }, + { + "epoch": 0.9459716185486888, + "grad_norm": 3.3583093067528313, + "learning_rate": 1.5206648151693478e-08, + "loss": 0.553, + "step": 2054 + }, + { + "epoch": 0.9464321694827438, + "grad_norm": 3.009812369106128, + "learning_rate": 1.4948463123853337e-08, + "loss": 0.4992, + "step": 2055 + }, + { + "epoch": 0.9468927204167986, + "grad_norm": 2.9099681097218286, + "learning_rate": 1.4692472185908633e-08, + "loss": 0.4736, + "step": 2056 + }, + { + "epoch": 0.9473532713508535, + "grad_norm": 3.3355322307095108, + "learning_rate": 1.4438675908050036e-08, + "loss": 0.5518, + "step": 2057 + }, + { + "epoch": 0.9478138222849083, + "grad_norm": 2.5897747211717586, + "learning_rate": 1.4187074855579795e-08, + "loss": 0.3858, + "step": 2058 + }, + { + "epoch": 0.9482743732189632, + "grad_norm": 3.3391284256863103, + "learning_rate": 1.3937669588910406e-08, + "loss": 0.6045, + "step": 2059 + }, + { + "epoch": 0.9487349241530181, + "grad_norm": 3.0970613079517957, + "learning_rate": 1.3690460663563829e-08, + "loss": 0.5936, + "step": 2060 + }, + { + "epoch": 0.9491954750870729, + "grad_norm": 3.3727344553879384, + "learning_rate": 1.344544863016961e-08, + "loss": 0.5295, + "step": 2061 + }, + { + "epoch": 0.9496560260211278, + "grad_norm": 2.9033018743602312, + "learning_rate": 1.3202634034464199e-08, + "loss": 0.549, + "step": 2062 + }, + { + "epoch": 0.9501165769551826, + "grad_norm": 2.958202980684059, + "learning_rate": 1.2962017417289418e-08, + "loss": 0.5569, + "step": 2063 + }, + { + "epoch": 0.9505771278892375, + "grad_norm": 2.976503730921731, + "learning_rate": 1.2723599314591105e-08, + "loss": 0.5678, + "step": 2064 + }, + { + "epoch": 0.9510376788232924, + "grad_norm": 3.1304654491952872, + "learning_rate": 1.2487380257418578e-08, + "loss": 0.5418, + "step": 2065 + }, + { + "epoch": 0.9514982297573472, + "grad_norm": 3.25198019264722, + "learning_rate": 1.2253360771922739e-08, + "loss": 0.5825, + "step": 2066 + }, + { + "epoch": 0.951958780691402, + "grad_norm": 3.5969220214188997, + "learning_rate": 1.2021541379355404e-08, + "loss": 0.5422, + "step": 2067 + }, + { + "epoch": 0.952419331625457, + "grad_norm": 2.84240161611037, + "learning_rate": 1.1791922596067649e-08, + "loss": 0.5475, + "step": 2068 + }, + { + "epoch": 0.9528798825595118, + "grad_norm": 2.7264361506813537, + "learning_rate": 1.1564504933509244e-08, + "loss": 0.4494, + "step": 2069 + }, + { + "epoch": 0.9533404334935667, + "grad_norm": 2.9820660647231296, + "learning_rate": 1.1339288898227106e-08, + "loss": 0.5436, + "step": 2070 + }, + { + "epoch": 0.9538009844276215, + "grad_norm": 3.139054611664098, + "learning_rate": 1.1116274991864072e-08, + "loss": 0.4871, + "step": 2071 + }, + { + "epoch": 0.9542615353616765, + "grad_norm": 2.8848562462423377, + "learning_rate": 1.0895463711158349e-08, + "loss": 0.4775, + "step": 2072 + }, + { + "epoch": 0.9547220862957313, + "grad_norm": 2.940726909982554, + "learning_rate": 1.0676855547941844e-08, + "loss": 0.4509, + "step": 2073 + }, + { + "epoch": 0.9551826372297861, + "grad_norm": 3.011334549424482, + "learning_rate": 1.0460450989139169e-08, + "loss": 0.5029, + "step": 2074 + }, + { + "epoch": 0.955643188163841, + "grad_norm": 2.997675838063664, + "learning_rate": 1.0246250516766863e-08, + "loss": 0.5425, + "step": 2075 + }, + { + "epoch": 0.9561037390978958, + "grad_norm": 2.985898615725367, + "learning_rate": 1.0034254607932168e-08, + "loss": 0.5514, + "step": 2076 + }, + { + "epoch": 0.9565642900319508, + "grad_norm": 3.0381025030705233, + "learning_rate": 9.82446373483159e-09, + "loss": 0.4691, + "step": 2077 + }, + { + "epoch": 0.9570248409660056, + "grad_norm": 2.7713452807820937, + "learning_rate": 9.616878364750446e-09, + "loss": 0.4628, + "step": 2078 + }, + { + "epoch": 0.9574853919000604, + "grad_norm": 2.9033280044478853, + "learning_rate": 9.411498960061436e-09, + "loss": 0.4301, + "step": 2079 + }, + { + "epoch": 0.9579459428341153, + "grad_norm": 3.2313767447057917, + "learning_rate": 9.208325978223741e-09, + "loss": 0.5824, + "step": 2080 + }, + { + "epoch": 0.9584064937681702, + "grad_norm": 3.0409141727255915, + "learning_rate": 9.00735987178214e-09, + "loss": 0.5694, + "step": 2081 + }, + { + "epoch": 0.9588670447022251, + "grad_norm": 3.0094026886557526, + "learning_rate": 8.808601088365453e-09, + "loss": 0.5971, + "step": 2082 + }, + { + "epoch": 0.9593275956362799, + "grad_norm": 3.4474240649260555, + "learning_rate": 8.612050070686217e-09, + "loss": 0.5686, + "step": 2083 + }, + { + "epoch": 0.9597881465703347, + "grad_norm": 3.264238061980219, + "learning_rate": 8.417707256539675e-09, + "loss": 0.4905, + "step": 2084 + }, + { + "epoch": 0.9602486975043897, + "grad_norm": 3.445851440079108, + "learning_rate": 8.225573078802006e-09, + "loss": 0.536, + "step": 2085 + }, + { + "epoch": 0.9607092484384445, + "grad_norm": 3.2368359736420684, + "learning_rate": 8.035647965430215e-09, + "loss": 0.5559, + "step": 2086 + }, + { + "epoch": 0.9611697993724994, + "grad_norm": 2.7042062125888835, + "learning_rate": 7.847932339460906e-09, + "loss": 0.537, + "step": 2087 + }, + { + "epoch": 0.9616303503065542, + "grad_norm": 3.352414822995779, + "learning_rate": 7.662426619009178e-09, + "loss": 0.5649, + "step": 2088 + }, + { + "epoch": 0.962090901240609, + "grad_norm": 3.04805145690995, + "learning_rate": 7.479131217267732e-09, + "loss": 0.6832, + "step": 2089 + }, + { + "epoch": 0.962551452174664, + "grad_norm": 3.0121047224737474, + "learning_rate": 7.2980465425063196e-09, + "loss": 0.5337, + "step": 2090 + }, + { + "epoch": 0.9630120031087188, + "grad_norm": 3.4643442345702815, + "learning_rate": 7.119172998070411e-09, + "loss": 0.5652, + "step": 2091 + }, + { + "epoch": 0.9634725540427737, + "grad_norm": 3.1880017551075035, + "learning_rate": 6.9425109823803e-09, + "loss": 0.4508, + "step": 2092 + }, + { + "epoch": 0.9639331049768285, + "grad_norm": 3.5879986405315836, + "learning_rate": 6.768060888930449e-09, + "loss": 0.611, + "step": 2093 + }, + { + "epoch": 0.9643936559108834, + "grad_norm": 3.1074545754597493, + "learning_rate": 6.595823106288589e-09, + "loss": 0.6024, + "step": 2094 + }, + { + "epoch": 0.9648542068449383, + "grad_norm": 3.1449480649098573, + "learning_rate": 6.4257980180948415e-09, + "loss": 0.5402, + "step": 2095 + }, + { + "epoch": 0.9653147577789931, + "grad_norm": 3.240649850034883, + "learning_rate": 6.257986003060489e-09, + "loss": 0.5048, + "step": 2096 + }, + { + "epoch": 0.965775308713048, + "grad_norm": 2.958071414094661, + "learning_rate": 6.09238743496776e-09, + "loss": 0.5455, + "step": 2097 + }, + { + "epoch": 0.9662358596471029, + "grad_norm": 3.3522897979936244, + "learning_rate": 5.929002682668494e-09, + "loss": 0.5016, + "step": 2098 + }, + { + "epoch": 0.9666964105811577, + "grad_norm": 3.3646238303023464, + "learning_rate": 5.7678321100836925e-09, + "loss": 0.6399, + "step": 2099 + }, + { + "epoch": 0.9671569615152126, + "grad_norm": 3.146114846622207, + "learning_rate": 5.608876076202307e-09, + "loss": 0.4167, + "step": 2100 + }, + { + "epoch": 0.9676175124492674, + "grad_norm": 3.2240830087243104, + "learning_rate": 5.452134935080899e-09, + "loss": 0.519, + "step": 2101 + }, + { + "epoch": 0.9680780633833223, + "grad_norm": 2.9509149534120898, + "learning_rate": 5.29760903584231e-09, + "loss": 0.4709, + "step": 2102 + }, + { + "epoch": 0.9685386143173772, + "grad_norm": 2.9753664130176345, + "learning_rate": 5.145298722675439e-09, + "loss": 0.5777, + "step": 2103 + }, + { + "epoch": 0.968999165251432, + "grad_norm": 3.026540264362648, + "learning_rate": 4.9952043348342465e-09, + "loss": 0.4863, + "step": 2104 + }, + { + "epoch": 0.9694597161854869, + "grad_norm": 3.3658355981602197, + "learning_rate": 4.847326206636526e-09, + "loss": 0.5415, + "step": 2105 + }, + { + "epoch": 0.9699202671195417, + "grad_norm": 3.370998012399039, + "learning_rate": 4.701664667464245e-09, + "loss": 0.5083, + "step": 2106 + }, + { + "epoch": 0.9703808180535967, + "grad_norm": 3.0673456063667293, + "learning_rate": 4.5582200417617625e-09, + "loss": 0.5203, + "step": 2107 + }, + { + "epoch": 0.9708413689876515, + "grad_norm": 3.05103189149114, + "learning_rate": 4.416992649035612e-09, + "loss": 0.5176, + "step": 2108 + }, + { + "epoch": 0.9713019199217063, + "grad_norm": 2.88875246748082, + "learning_rate": 4.2779828038536085e-09, + "loss": 0.5542, + "step": 2109 + }, + { + "epoch": 0.9717624708557612, + "grad_norm": 3.03592366750373, + "learning_rate": 4.14119081584452e-09, + "loss": 0.5563, + "step": 2110 + }, + { + "epoch": 0.9722230217898161, + "grad_norm": 3.6405619640934184, + "learning_rate": 4.00661698969662e-09, + "loss": 0.5303, + "step": 2111 + }, + { + "epoch": 0.972683572723871, + "grad_norm": 2.8429713783566926, + "learning_rate": 3.874261625157915e-09, + "loss": 0.5358, + "step": 2112 + }, + { + "epoch": 0.9731441236579258, + "grad_norm": 2.7464922162048238, + "learning_rate": 3.744125017034916e-09, + "loss": 0.5143, + "step": 2113 + }, + { + "epoch": 0.9736046745919806, + "grad_norm": 2.9563946004018575, + "learning_rate": 3.6162074551919772e-09, + "loss": 0.4829, + "step": 2114 + }, + { + "epoch": 0.9740652255260355, + "grad_norm": 3.4014494942852282, + "learning_rate": 3.4905092245509637e-09, + "loss": 0.5424, + "step": 2115 + }, + { + "epoch": 0.9745257764600904, + "grad_norm": 3.4672435378031845, + "learning_rate": 3.3670306050902485e-09, + "loss": 0.5683, + "step": 2116 + }, + { + "epoch": 0.9749863273941453, + "grad_norm": 2.852268606152846, + "learning_rate": 3.2457718718443827e-09, + "loss": 0.5653, + "step": 2117 + }, + { + "epoch": 0.9754468783282001, + "grad_norm": 3.00608356499513, + "learning_rate": 3.1267332949033166e-09, + "loss": 0.5951, + "step": 2118 + }, + { + "epoch": 0.9759074292622549, + "grad_norm": 3.279145796994075, + "learning_rate": 3.009915139412067e-09, + "loss": 0.5203, + "step": 2119 + }, + { + "epoch": 0.9763679801963099, + "grad_norm": 3.034428779025666, + "learning_rate": 2.8953176655696075e-09, + "loss": 0.6216, + "step": 2120 + }, + { + "epoch": 0.9768285311303647, + "grad_norm": 3.400829045951133, + "learning_rate": 2.7829411286287572e-09, + "loss": 0.5613, + "step": 2121 + }, + { + "epoch": 0.9772890820644196, + "grad_norm": 3.1672048746314547, + "learning_rate": 2.6727857788954033e-09, + "loss": 0.5935, + "step": 2122 + }, + { + "epoch": 0.9777496329984744, + "grad_norm": 3.26875632935082, + "learning_rate": 2.5648518617280567e-09, + "loss": 0.5012, + "step": 2123 + }, + { + "epoch": 0.9782101839325293, + "grad_norm": 3.271789631236318, + "learning_rate": 2.459139617537187e-09, + "loss": 0.4801, + "step": 2124 + }, + { + "epoch": 0.9786707348665842, + "grad_norm": 3.2181552684453583, + "learning_rate": 2.3556492817847773e-09, + "loss": 0.5425, + "step": 2125 + }, + { + "epoch": 0.979131285800639, + "grad_norm": 3.0926183100897493, + "learning_rate": 2.2543810849836586e-09, + "loss": 0.5107, + "step": 2126 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 2.8517011860409887, + "learning_rate": 2.1553352526972878e-09, + "loss": 0.5229, + "step": 2127 + }, + { + "epoch": 0.9800523876687487, + "grad_norm": 2.8804770422411283, + "learning_rate": 2.0585120055389705e-09, + "loss": 0.4706, + "step": 2128 + }, + { + "epoch": 0.9805129386028036, + "grad_norm": 3.202711757414141, + "learning_rate": 1.963911559171416e-09, + "loss": 0.5939, + "step": 2129 + }, + { + "epoch": 0.9809734895368585, + "grad_norm": 3.2187761279662706, + "learning_rate": 1.8715341243061846e-09, + "loss": 0.5934, + "step": 2130 + }, + { + "epoch": 0.9814340404709133, + "grad_norm": 3.881243006981297, + "learning_rate": 1.7813799067035729e-09, + "loss": 0.5152, + "step": 2131 + }, + { + "epoch": 0.9818945914049682, + "grad_norm": 3.1208611268878124, + "learning_rate": 1.6934491071719515e-09, + "loss": 0.5928, + "step": 2132 + }, + { + "epoch": 0.9823551423390231, + "grad_norm": 3.6853248210429426, + "learning_rate": 1.6077419215668742e-09, + "loss": 0.5364, + "step": 2133 + }, + { + "epoch": 0.9828156932730779, + "grad_norm": 3.672076871339007, + "learning_rate": 1.5242585407915231e-09, + "loss": 0.4995, + "step": 2134 + }, + { + "epoch": 0.9832762442071328, + "grad_norm": 3.308203790005742, + "learning_rate": 1.4429991507954874e-09, + "loss": 0.5985, + "step": 2135 + }, + { + "epoch": 0.9837367951411876, + "grad_norm": 3.4864392711956063, + "learning_rate": 1.3639639325748741e-09, + "loss": 0.5583, + "step": 2136 + }, + { + "epoch": 0.9841973460752426, + "grad_norm": 2.865215830877929, + "learning_rate": 1.287153062171642e-09, + "loss": 0.5195, + "step": 2137 + }, + { + "epoch": 0.9846578970092974, + "grad_norm": 3.1310255182151754, + "learning_rate": 1.2125667106730464e-09, + "loss": 0.5164, + "step": 2138 + }, + { + "epoch": 0.9851184479433522, + "grad_norm": 3.112056106943904, + "learning_rate": 1.1402050442118616e-09, + "loss": 0.4333, + "step": 2139 + }, + { + "epoch": 0.9855789988774071, + "grad_norm": 3.038352446915218, + "learning_rate": 1.0700682239653814e-09, + "loss": 0.5689, + "step": 2140 + }, + { + "epoch": 0.9860395498114619, + "grad_norm": 3.0348900204468903, + "learning_rate": 1.002156406155419e-09, + "loss": 0.5682, + "step": 2141 + }, + { + "epoch": 0.9865001007455169, + "grad_norm": 3.115185553653833, + "learning_rate": 9.364697420476408e-10, + "loss": 0.4725, + "step": 2142 + }, + { + "epoch": 0.9869606516795717, + "grad_norm": 3.087394317229768, + "learning_rate": 8.730083779516784e-10, + "loss": 0.5686, + "step": 2143 + }, + { + "epoch": 0.9874212026136265, + "grad_norm": 2.8838196694727647, + "learning_rate": 8.117724552205718e-10, + "loss": 0.4791, + "step": 2144 + }, + { + "epoch": 0.9878817535476814, + "grad_norm": 2.9019511674207417, + "learning_rate": 7.527621102503268e-10, + "loss": 0.5693, + "step": 2145 + }, + { + "epoch": 0.9883423044817363, + "grad_norm": 3.153315732843879, + "learning_rate": 6.959774744796921e-10, + "loss": 0.6183, + "step": 2146 + }, + { + "epoch": 0.9888028554157912, + "grad_norm": 3.2841453445938162, + "learning_rate": 6.414186743899375e-10, + "loss": 0.4677, + "step": 2147 + }, + { + "epoch": 0.989263406349846, + "grad_norm": 3.205902530946899, + "learning_rate": 5.890858315046321e-10, + "loss": 0.5202, + "step": 2148 + }, + { + "epoch": 0.9897239572839008, + "grad_norm": 2.911194242328273, + "learning_rate": 5.389790623891999e-10, + "loss": 0.4913, + "step": 2149 + }, + { + "epoch": 0.9901845082179558, + "grad_norm": 3.087952168592988, + "learning_rate": 4.910984786506978e-10, + "loss": 0.5976, + "step": 2150 + }, + { + "epoch": 0.9906450591520106, + "grad_norm": 3.2125702072705162, + "learning_rate": 4.454441869377046e-10, + "loss": 0.5693, + "step": 2151 + }, + { + "epoch": 0.9911056100860655, + "grad_norm": 3.1911572214123365, + "learning_rate": 4.020162889399881e-10, + "loss": 0.5079, + "step": 2152 + }, + { + "epoch": 0.9915661610201203, + "grad_norm": 2.842869367824956, + "learning_rate": 3.6081488138817176e-10, + "loss": 0.5438, + "step": 2153 + }, + { + "epoch": 0.9920267119541751, + "grad_norm": 3.59761551281296, + "learning_rate": 3.2184005605373487e-10, + "loss": 0.6356, + "step": 2154 + }, + { + "epoch": 0.9924872628882301, + "grad_norm": 3.266624178638357, + "learning_rate": 2.850918997485685e-10, + "loss": 0.6384, + "step": 2155 + }, + { + "epoch": 0.9929478138222849, + "grad_norm": 2.847984241902026, + "learning_rate": 2.505704943251974e-10, + "loss": 0.4189, + "step": 2156 + }, + { + "epoch": 0.9934083647563398, + "grad_norm": 2.8940228158585684, + "learning_rate": 2.1827591667578083e-10, + "loss": 0.543, + "step": 2157 + }, + { + "epoch": 0.9938689156903946, + "grad_norm": 3.2268790504480798, + "learning_rate": 1.8820823873311187e-10, + "loss": 0.5865, + "step": 2158 + }, + { + "epoch": 0.9943294666244495, + "grad_norm": 2.816014447163771, + "learning_rate": 1.6036752746939608e-10, + "loss": 0.6104, + "step": 2159 + }, + { + "epoch": 0.9947900175585044, + "grad_norm": 2.834508001501944, + "learning_rate": 1.347538448966956e-10, + "loss": 0.5525, + "step": 2160 + }, + { + "epoch": 0.9952505684925592, + "grad_norm": 3.296408037769054, + "learning_rate": 1.113672480663741e-10, + "loss": 0.527, + "step": 2161 + }, + { + "epoch": 0.995711119426614, + "grad_norm": 3.1020643793075795, + "learning_rate": 9.020778906965176e-11, + "loss": 0.5439, + "step": 2162 + }, + { + "epoch": 0.996171670360669, + "grad_norm": 2.9887893240557477, + "learning_rate": 7.127551503671724e-11, + "loss": 0.5408, + "step": 2163 + }, + { + "epoch": 0.9966322212947238, + "grad_norm": 3.311987992840308, + "learning_rate": 5.4570468136949655e-11, + "loss": 0.5569, + "step": 2164 + }, + { + "epoch": 0.9970927722287787, + "grad_norm": 3.3309536253152436, + "learning_rate": 4.009268557902956e-11, + "loss": 0.5253, + "step": 2165 + }, + { + "epoch": 0.9975533231628335, + "grad_norm": 3.3046896055168546, + "learning_rate": 2.7842199610605965e-11, + "loss": 0.5117, + "step": 2166 + }, + { + "epoch": 0.9980138740968884, + "grad_norm": 3.1272281406984206, + "learning_rate": 1.7819037518185252e-11, + "loss": 0.5905, + "step": 2167 + }, + { + "epoch": 0.9984744250309433, + "grad_norm": 3.614597151505851, + "learning_rate": 1.0023221627242229e-11, + "loss": 0.4945, + "step": 2168 + }, + { + "epoch": 0.9989349759649981, + "grad_norm": 3.190348932550531, + "learning_rate": 4.454769301998063e-12, + "loss": 0.6752, + "step": 2169 + }, + { + "epoch": 0.999395526899053, + "grad_norm": 3.393120014197588, + "learning_rate": 1.1136929456423416e-12, + "loss": 0.5265, + "step": 2170 + }, + { + "epoch": 0.9998560778331078, + "grad_norm": 3.1321707930638376, + "learning_rate": 0.0, + "loss": 0.5797, + "step": 2171 + }, + { + "epoch": 0.9998560778331078, + "step": 2171, + "total_flos": 1478472491958272.0, + "train_loss": 0.577474653254122, + "train_runtime": 98261.8902, + "train_samples_per_second": 1.414, + "train_steps_per_second": 0.022 + } + ], + "logging_steps": 1.0, + "max_steps": 2171, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1478472491958272.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}