{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998981773750127, "eval_steps": 500, "global_step": 2455, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004072904999490887, "grad_norm": 0.7094523906707764, "learning_rate": 2.702702702702703e-05, "loss": 1.8961, "mean_token_accuracy": 0.5438283555209636, "step": 10 }, { "epoch": 0.008145809998981774, "grad_norm": 0.4965726137161255, "learning_rate": 5.405405405405406e-05, "loss": 2.0135, "mean_token_accuracy": 0.5206024497747421, "step": 20 }, { "epoch": 0.01221871499847266, "grad_norm": 0.5204955339431763, "learning_rate": 8.108108108108109e-05, "loss": 1.7338, "mean_token_accuracy": 0.5830004885792732, "step": 30 }, { "epoch": 0.01629161999796355, "grad_norm": 0.4678299129009247, "learning_rate": 0.00010810810810810812, "loss": 1.7561, "mean_token_accuracy": 0.5730855345726014, "step": 40 }, { "epoch": 0.020364524997454433, "grad_norm": 0.439376562833786, "learning_rate": 0.00013513513513513514, "loss": 1.7277, "mean_token_accuracy": 0.5785414174199104, "step": 50 }, { "epoch": 0.02443742999694532, "grad_norm": 0.5652154684066772, "learning_rate": 0.00016216216216216218, "loss": 1.5663, "mean_token_accuracy": 0.6036677993834019, "step": 60 }, { "epoch": 0.02851033499643621, "grad_norm": 0.5163070559501648, "learning_rate": 0.0001891891891891892, "loss": 1.8259, "mean_token_accuracy": 0.5530456639826298, "step": 70 }, { "epoch": 0.0325832399959271, "grad_norm": 0.41974571347236633, "learning_rate": 0.00019999686634492516, "loss": 1.6554, "mean_token_accuracy": 0.6073097884654999, "step": 80 }, { "epoch": 0.03665614499541798, "grad_norm": 0.5097134709358215, "learning_rate": 0.00019997771694180204, "loss": 1.7208, "mean_token_accuracy": 0.5835812106728554, "step": 90 }, { "epoch": 0.04072904999490887, "grad_norm": 0.3469955623149872, "learning_rate": 0.00019994116238472668, "loss": 1.7954, "mean_token_accuracy": 0.5794057920575142, "step": 100 }, { "epoch": 0.044801954994399755, "grad_norm": 0.5898286700248718, "learning_rate": 0.0001998872090374941, "loss": 1.8089, "mean_token_accuracy": 0.5614037178456783, "step": 110 }, { "epoch": 0.04887485999389064, "grad_norm": 0.3150334656238556, "learning_rate": 0.0001998158662928604, "loss": 1.5827, "mean_token_accuracy": 0.5801003761589527, "step": 120 }, { "epoch": 0.05294776499338153, "grad_norm": 0.3100312352180481, "learning_rate": 0.00019972714657090772, "loss": 1.6712, "mean_token_accuracy": 0.5900266923010349, "step": 130 }, { "epoch": 0.05702066999287242, "grad_norm": 0.30420345067977905, "learning_rate": 0.0001996210653168819, "loss": 1.646, "mean_token_accuracy": 0.5839473098516464, "step": 140 }, { "epoch": 0.0610935749923633, "grad_norm": 0.454593688249588, "learning_rate": 0.0001994976409985037, "loss": 1.7184, "mean_token_accuracy": 0.566600239276886, "step": 150 }, { "epoch": 0.0651664799918542, "grad_norm": 0.35688647627830505, "learning_rate": 0.0001993568951027537, "loss": 1.6766, "mean_token_accuracy": 0.5824202686548233, "step": 160 }, { "epoch": 0.06923938499134508, "grad_norm": 0.3199939727783203, "learning_rate": 0.0001991988521321317, "loss": 1.542, "mean_token_accuracy": 0.6095141984522343, "step": 170 }, { "epoch": 0.07331228999083596, "grad_norm": 0.5523242950439453, "learning_rate": 0.00019902353960039087, "loss": 1.7218, "mean_token_accuracy": 0.5745485998690129, "step": 180 }, { "epoch": 0.07738519499032685, "grad_norm": 0.3872721493244171, "learning_rate": 0.00019883098802774812, "loss": 1.7306, "mean_token_accuracy": 0.5514535710215569, "step": 190 }, { "epoch": 0.08145809998981773, "grad_norm": 0.2679811120033264, "learning_rate": 0.0001986212309355707, "loss": 1.6524, "mean_token_accuracy": 0.5822832569479942, "step": 200 }, { "epoch": 0.08553100498930863, "grad_norm": 0.5167363882064819, "learning_rate": 0.00019839430484054046, "loss": 1.6964, "mean_token_accuracy": 0.573430598527193, "step": 210 }, { "epoch": 0.08960390998879951, "grad_norm": 0.4363399147987366, "learning_rate": 0.0001981502492482967, "loss": 1.7296, "mean_token_accuracy": 0.5835007324814796, "step": 220 }, { "epoch": 0.0936768149882904, "grad_norm": 0.4052150845527649, "learning_rate": 0.00019788910664655848, "loss": 1.5772, "mean_token_accuracy": 0.5771500714123249, "step": 230 }, { "epoch": 0.09774971998778129, "grad_norm": 0.34224212169647217, "learning_rate": 0.0001976109224977281, "loss": 1.6263, "mean_token_accuracy": 0.5942870646715164, "step": 240 }, { "epoch": 0.10182262498727217, "grad_norm": 0.4852873980998993, "learning_rate": 0.00019731574523097647, "loss": 1.5731, "mean_token_accuracy": 0.5886094763875007, "step": 250 }, { "epoch": 0.10589552998676306, "grad_norm": 0.30241233110427856, "learning_rate": 0.00019700362623381197, "loss": 1.8311, "mean_token_accuracy": 0.5616028495132923, "step": 260 }, { "epoch": 0.10996843498625394, "grad_norm": 0.38147303462028503, "learning_rate": 0.00019667461984313448, "loss": 1.652, "mean_token_accuracy": 0.5836799181997776, "step": 270 }, { "epoch": 0.11404133998574484, "grad_norm": 0.3174324333667755, "learning_rate": 0.00019632878333577592, "loss": 1.6831, "mean_token_accuracy": 0.5850063987076283, "step": 280 }, { "epoch": 0.11811424498523572, "grad_norm": 0.350323349237442, "learning_rate": 0.00019596617691852863, "loss": 1.6644, "mean_token_accuracy": 0.5841067053377629, "step": 290 }, { "epoch": 0.1221871499847266, "grad_norm": 0.30346542596817017, "learning_rate": 0.0001955868637176643, "loss": 1.656, "mean_token_accuracy": 0.584677055478096, "step": 300 }, { "epoch": 0.1262600549842175, "grad_norm": 0.2639765739440918, "learning_rate": 0.00019519090976794406, "loss": 1.7454, "mean_token_accuracy": 0.5678185373544693, "step": 310 }, { "epoch": 0.1303329599837084, "grad_norm": 0.3039887547492981, "learning_rate": 0.00019477838400112254, "loss": 1.754, "mean_token_accuracy": 0.5744720883667469, "step": 320 }, { "epoch": 0.13440586498319926, "grad_norm": 0.35102295875549316, "learning_rate": 0.00019434935823394746, "loss": 1.6665, "mean_token_accuracy": 0.5876846723258495, "step": 330 }, { "epoch": 0.13847876998269015, "grad_norm": 0.3325759470462799, "learning_rate": 0.00019390390715565725, "loss": 1.6773, "mean_token_accuracy": 0.5869172632694244, "step": 340 }, { "epoch": 0.14255167498218105, "grad_norm": 0.37209993600845337, "learning_rate": 0.000193442108314978, "loss": 1.6328, "mean_token_accuracy": 0.5927142709493637, "step": 350 }, { "epoch": 0.14662457998167192, "grad_norm": 0.2964717149734497, "learning_rate": 0.00019296404210662331, "loss": 1.5659, "mean_token_accuracy": 0.60322862342, "step": 360 }, { "epoch": 0.1506974849811628, "grad_norm": 0.29879456758499146, "learning_rate": 0.00019246979175729822, "loss": 1.7083, "mean_token_accuracy": 0.5893984287977219, "step": 370 }, { "epoch": 0.1547703899806537, "grad_norm": 0.3726056218147278, "learning_rate": 0.00019195944331121015, "loss": 1.6854, "mean_token_accuracy": 0.5761750474572181, "step": 380 }, { "epoch": 0.1588432949801446, "grad_norm": 0.517816424369812, "learning_rate": 0.0001914330856150897, "loss": 1.7282, "mean_token_accuracy": 0.5854727104306221, "step": 390 }, { "epoch": 0.16291619997963547, "grad_norm": 0.25848233699798584, "learning_rate": 0.00019089081030272296, "loss": 1.5562, "mean_token_accuracy": 0.6038706839084625, "step": 400 }, { "epoch": 0.16698910497912636, "grad_norm": 0.3191607892513275, "learning_rate": 0.00019033271177899922, "loss": 1.6452, "mean_token_accuracy": 0.5812859788537026, "step": 410 }, { "epoch": 0.17106200997861726, "grad_norm": 0.3990655243396759, "learning_rate": 0.0001897588872034758, "loss": 1.626, "mean_token_accuracy": 0.569889971613884, "step": 420 }, { "epoch": 0.17513491497810812, "grad_norm": 0.346086323261261, "learning_rate": 0.00018916943647346375, "loss": 1.7451, "mean_token_accuracy": 0.578500047326088, "step": 430 }, { "epoch": 0.17920781997759902, "grad_norm": 0.36437422037124634, "learning_rate": 0.0001885644622066364, "loss": 1.846, "mean_token_accuracy": 0.5627885892987251, "step": 440 }, { "epoch": 0.18328072497708991, "grad_norm": 0.2968160808086395, "learning_rate": 0.00018794406972316482, "loss": 1.671, "mean_token_accuracy": 0.5769762165844441, "step": 450 }, { "epoch": 0.1873536299765808, "grad_norm": 0.2781198024749756, "learning_rate": 0.00018730836702738257, "loss": 1.4983, "mean_token_accuracy": 0.613883113116026, "step": 460 }, { "epoch": 0.19142653497607168, "grad_norm": 0.4645621180534363, "learning_rate": 0.0001866574647889831, "loss": 1.6776, "mean_token_accuracy": 0.5890260674059391, "step": 470 }, { "epoch": 0.19549943997556257, "grad_norm": 0.3920878767967224, "learning_rate": 0.00018599147632375332, "loss": 1.802, "mean_token_accuracy": 0.568213502317667, "step": 480 }, { "epoch": 0.19957234497505347, "grad_norm": 0.3473225235939026, "learning_rate": 0.00018531051757384633, "loss": 1.7161, "mean_token_accuracy": 0.5727271348237991, "step": 490 }, { "epoch": 0.20364524997454433, "grad_norm": 0.30091673135757446, "learning_rate": 0.00018461470708759712, "loss": 1.7042, "mean_token_accuracy": 0.5763454169034958, "step": 500 }, { "epoch": 0.20771815497403523, "grad_norm": 0.31175661087036133, "learning_rate": 0.00018390416599888435, "loss": 1.689, "mean_token_accuracy": 0.5796464458107948, "step": 510 }, { "epoch": 0.21179105997352612, "grad_norm": 0.3624255955219269, "learning_rate": 0.0001831790180060422, "loss": 1.5619, "mean_token_accuracy": 0.6015763126313687, "step": 520 }, { "epoch": 0.215863964973017, "grad_norm": 0.2667541205883026, "learning_rate": 0.00018243938935032561, "loss": 1.6877, "mean_token_accuracy": 0.5839527539908886, "step": 530 }, { "epoch": 0.2199368699725079, "grad_norm": 0.31019967794418335, "learning_rate": 0.00018168540879393296, "loss": 1.7831, "mean_token_accuracy": 0.5688935197889805, "step": 540 }, { "epoch": 0.22400977497199878, "grad_norm": 0.2726418673992157, "learning_rate": 0.0001809172075975897, "loss": 1.7288, "mean_token_accuracy": 0.5798229008913041, "step": 550 }, { "epoch": 0.22808267997148968, "grad_norm": 0.29514381289482117, "learning_rate": 0.00018013491949769734, "loss": 1.7188, "mean_token_accuracy": 0.5756009854376316, "step": 560 }, { "epoch": 0.23215558497098054, "grad_norm": 0.37964069843292236, "learning_rate": 0.00017933868068305104, "loss": 1.6244, "mean_token_accuracy": 0.5932842157781124, "step": 570 }, { "epoch": 0.23622848997047144, "grad_norm": 0.300620436668396, "learning_rate": 0.0001785286297711305, "loss": 1.5565, "mean_token_accuracy": 0.5965760670602321, "step": 580 }, { "epoch": 0.24030139496996233, "grad_norm": 0.5466737151145935, "learning_rate": 0.00017770490778396808, "loss": 1.6532, "mean_token_accuracy": 0.5821332208812237, "step": 590 }, { "epoch": 0.2443742999694532, "grad_norm": 0.3445660173892975, "learning_rate": 0.00017686765812359808, "loss": 1.7585, "mean_token_accuracy": 0.5790032669901848, "step": 600 }, { "epoch": 0.2484472049689441, "grad_norm": 0.3492606282234192, "learning_rate": 0.0001760170265470921, "loss": 1.6401, "mean_token_accuracy": 0.6002471588551999, "step": 610 }, { "epoch": 0.252520109968435, "grad_norm": 0.31294527649879456, "learning_rate": 0.00017515316114118375, "loss": 1.6915, "mean_token_accuracy": 0.5570432722568512, "step": 620 }, { "epoch": 0.25659301496792586, "grad_norm": 0.27257561683654785, "learning_rate": 0.00017427621229648853, "loss": 1.5666, "mean_token_accuracy": 0.6028999522328377, "step": 630 }, { "epoch": 0.2606659199674168, "grad_norm": 0.29818692803382874, "learning_rate": 0.00017338633268132212, "loss": 1.5926, "mean_token_accuracy": 0.5965964362025261, "step": 640 }, { "epoch": 0.26473882496690765, "grad_norm": 0.49210649728775024, "learning_rate": 0.0001724836772151223, "loss": 1.5925, "mean_token_accuracy": 0.5952631443738937, "step": 650 }, { "epoch": 0.2688117299663985, "grad_norm": 0.3807302713394165, "learning_rate": 0.00017156840304147902, "loss": 1.6237, "mean_token_accuracy": 0.5884141281247139, "step": 660 }, { "epoch": 0.27288463496588944, "grad_norm": 0.2621661126613617, "learning_rate": 0.00017064066950077722, "loss": 1.7356, "mean_token_accuracy": 0.5827617473900318, "step": 670 }, { "epoch": 0.2769575399653803, "grad_norm": 0.30957838892936707, "learning_rate": 0.00016970063810245716, "loss": 1.5585, "mean_token_accuracy": 0.5888052701950073, "step": 680 }, { "epoch": 0.2810304449648712, "grad_norm": 0.2501460611820221, "learning_rate": 0.00016874847249689722, "loss": 1.5913, "mean_token_accuracy": 0.5886548452079297, "step": 690 }, { "epoch": 0.2851033499643621, "grad_norm": 0.3207811415195465, "learning_rate": 0.00016778433844692397, "loss": 1.6791, "mean_token_accuracy": 0.5861249402165413, "step": 700 }, { "epoch": 0.28917625496385296, "grad_norm": 0.45466601848602295, "learning_rate": 0.0001668084037989544, "loss": 1.5153, "mean_token_accuracy": 0.5999807387590408, "step": 710 }, { "epoch": 0.29324915996334383, "grad_norm": 0.34910282492637634, "learning_rate": 0.00016582083845377552, "loss": 1.6821, "mean_token_accuracy": 0.5889992110431195, "step": 720 }, { "epoch": 0.29732206496283475, "grad_norm": 0.4916020929813385, "learning_rate": 0.00016482181433696643, "loss": 1.8462, "mean_token_accuracy": 0.5748938458040357, "step": 730 }, { "epoch": 0.3013949699623256, "grad_norm": 0.2545833885669708, "learning_rate": 0.00016381150536896736, "loss": 1.5756, "mean_token_accuracy": 0.6056667067110538, "step": 740 }, { "epoch": 0.3054678749618165, "grad_norm": 0.30347147583961487, "learning_rate": 0.0001627900874348022, "loss": 1.6003, "mean_token_accuracy": 0.5873342089354991, "step": 750 }, { "epoch": 0.3095407799613074, "grad_norm": 0.37371426820755005, "learning_rate": 0.0001617577383534584, "loss": 1.6576, "mean_token_accuracy": 0.5790071420371532, "step": 760 }, { "epoch": 0.3136136849607983, "grad_norm": 0.41969504952430725, "learning_rate": 0.00016071463784693045, "loss": 1.6181, "mean_token_accuracy": 0.5854876518249512, "step": 770 }, { "epoch": 0.3176865899602892, "grad_norm": 0.17495319247245789, "learning_rate": 0.00015966096750893197, "loss": 1.5142, "mean_token_accuracy": 0.6079291738569736, "step": 780 }, { "epoch": 0.32175949495978007, "grad_norm": 0.30013784766197205, "learning_rate": 0.00015859691077328215, "loss": 1.6583, "mean_token_accuracy": 0.581703095138073, "step": 790 }, { "epoch": 0.32583239995927094, "grad_norm": 0.3358050584793091, "learning_rate": 0.00015752265288197155, "loss": 1.6468, "mean_token_accuracy": 0.6049091577529907, "step": 800 }, { "epoch": 0.32990530495876186, "grad_norm": 0.3690403699874878, "learning_rate": 0.00015643838085291323, "loss": 1.8431, "mean_token_accuracy": 0.5602408707141876, "step": 810 }, { "epoch": 0.3339782099582527, "grad_norm": 0.34296655654907227, "learning_rate": 0.00015534428344738505, "loss": 1.7042, "mean_token_accuracy": 0.5799131192266941, "step": 820 }, { "epoch": 0.3380511149577436, "grad_norm": 0.2764555513858795, "learning_rate": 0.00015424055113716763, "loss": 1.5479, "mean_token_accuracy": 0.5909703068435193, "step": 830 }, { "epoch": 0.3421240199572345, "grad_norm": 0.26227012276649475, "learning_rate": 0.0001531273760713855, "loss": 1.5303, "mean_token_accuracy": 0.5910052061080933, "step": 840 }, { "epoch": 0.3461969249567254, "grad_norm": 0.3656936585903168, "learning_rate": 0.00015200495204305574, "loss": 1.5586, "mean_token_accuracy": 0.5943005800247192, "step": 850 }, { "epoch": 0.35026982995621625, "grad_norm": 0.29997819662094116, "learning_rate": 0.00015087347445535013, "loss": 1.8219, "mean_token_accuracy": 0.5533552631735802, "step": 860 }, { "epoch": 0.3543427349557072, "grad_norm": 0.290685772895813, "learning_rate": 0.00014973314028757787, "loss": 1.7261, "mean_token_accuracy": 0.5844682581722737, "step": 870 }, { "epoch": 0.35841563995519804, "grad_norm": 0.34553012251853943, "learning_rate": 0.00014858414806089295, "loss": 1.6982, "mean_token_accuracy": 0.5762835793197155, "step": 880 }, { "epoch": 0.3624885449546889, "grad_norm": 0.2141156941652298, "learning_rate": 0.0001474266978037338, "loss": 1.5318, "mean_token_accuracy": 0.6048024773597718, "step": 890 }, { "epoch": 0.36656144995417983, "grad_norm": 0.30456602573394775, "learning_rate": 0.00014626099101700018, "loss": 1.7901, "mean_token_accuracy": 0.5763920709490776, "step": 900 }, { "epoch": 0.3706343549536707, "grad_norm": 0.26921945810317993, "learning_rate": 0.00014508723063897376, "loss": 1.4936, "mean_token_accuracy": 0.6303243085741996, "step": 910 }, { "epoch": 0.3747072599531616, "grad_norm": 0.28455570340156555, "learning_rate": 0.00014390562100998868, "loss": 1.5804, "mean_token_accuracy": 0.6074232332408428, "step": 920 }, { "epoch": 0.3787801649526525, "grad_norm": 0.3388415575027466, "learning_rate": 0.00014271636783685777, "loss": 1.6731, "mean_token_accuracy": 0.5768752813339233, "step": 930 }, { "epoch": 0.38285306995214335, "grad_norm": 0.4311608076095581, "learning_rate": 0.00014151967815706091, "loss": 1.7237, "mean_token_accuracy": 0.5706497602164745, "step": 940 }, { "epoch": 0.3869259749516343, "grad_norm": 0.35940027236938477, "learning_rate": 0.00014031576030270202, "loss": 1.5355, "mean_token_accuracy": 0.5908183179795742, "step": 950 }, { "epoch": 0.39099887995112514, "grad_norm": 0.34071287512779236, "learning_rate": 0.00013910482386424023, "loss": 1.7247, "mean_token_accuracy": 0.5757749699056148, "step": 960 }, { "epoch": 0.395071784950616, "grad_norm": 0.413870245218277, "learning_rate": 0.00013788707965400236, "loss": 1.6796, "mean_token_accuracy": 0.592286454886198, "step": 970 }, { "epoch": 0.39914468995010693, "grad_norm": 0.2649496793746948, "learning_rate": 0.00013666273966948252, "loss": 1.5955, "mean_token_accuracy": 0.5936679825186729, "step": 980 }, { "epoch": 0.4032175949495978, "grad_norm": 0.3525199294090271, "learning_rate": 0.00013543201705643526, "loss": 1.647, "mean_token_accuracy": 0.5950982637703419, "step": 990 }, { "epoch": 0.40729049994908867, "grad_norm": 0.33436283469200134, "learning_rate": 0.00013419512607176914, "loss": 1.7161, "mean_token_accuracy": 0.574284989386797, "step": 1000 }, { "epoch": 0.4113634049485796, "grad_norm": 0.46867313981056213, "learning_rate": 0.00013295228204624648, "loss": 1.544, "mean_token_accuracy": 0.6102774910628795, "step": 1010 }, { "epoch": 0.41543630994807046, "grad_norm": 0.30373555421829224, "learning_rate": 0.00013170370134699653, "loss": 1.6287, "mean_token_accuracy": 0.5843084178864956, "step": 1020 }, { "epoch": 0.4195092149475613, "grad_norm": 0.2981624901294708, "learning_rate": 0.00013044960133984804, "loss": 1.6858, "mean_token_accuracy": 0.5856122255325318, "step": 1030 }, { "epoch": 0.42358211994705225, "grad_norm": 0.3545626997947693, "learning_rate": 0.00012919020035148776, "loss": 1.7392, "mean_token_accuracy": 0.5841099888086319, "step": 1040 }, { "epoch": 0.4276550249465431, "grad_norm": 0.2896677553653717, "learning_rate": 0.0001279257176314521, "loss": 1.5007, "mean_token_accuracy": 0.573243772238493, "step": 1050 }, { "epoch": 0.431727929946034, "grad_norm": 0.36384209990501404, "learning_rate": 0.00012665637331395785, "loss": 1.487, "mean_token_accuracy": 0.6025885075330735, "step": 1060 }, { "epoch": 0.4358008349455249, "grad_norm": 0.3681187033653259, "learning_rate": 0.00012538238837957882, "loss": 1.4913, "mean_token_accuracy": 0.5982382036745548, "step": 1070 }, { "epoch": 0.4398737399450158, "grad_norm": 0.2680988311767578, "learning_rate": 0.00012410398461677554, "loss": 1.6263, "mean_token_accuracy": 0.5956345148384571, "step": 1080 }, { "epoch": 0.4439466449445067, "grad_norm": 0.23174384236335754, "learning_rate": 0.00012282138458328358, "loss": 1.7378, "mean_token_accuracy": 0.590882021188736, "step": 1090 }, { "epoch": 0.44801954994399756, "grad_norm": 0.34088292717933655, "learning_rate": 0.00012153481156736892, "loss": 1.7385, "mean_token_accuracy": 0.5994494572281838, "step": 1100 }, { "epoch": 0.45209245494348843, "grad_norm": 0.24563632905483246, "learning_rate": 0.00012024448954895522, "loss": 1.5212, "mean_token_accuracy": 0.6165470741689205, "step": 1110 }, { "epoch": 0.45616535994297935, "grad_norm": 0.26980966329574585, "learning_rate": 0.00011895064316063127, "loss": 1.5254, "mean_token_accuracy": 0.5898841544985771, "step": 1120 }, { "epoch": 0.4602382649424702, "grad_norm": 0.32573202252388, "learning_rate": 0.00011765349764854461, "loss": 1.5704, "mean_token_accuracy": 0.6047514051198959, "step": 1130 }, { "epoch": 0.4643111699419611, "grad_norm": 0.3137454390525818, "learning_rate": 0.00011635327883318831, "loss": 1.5893, "mean_token_accuracy": 0.5792985640466213, "step": 1140 }, { "epoch": 0.468384074941452, "grad_norm": 0.368747353553772, "learning_rate": 0.00011505021307008785, "loss": 1.6388, "mean_token_accuracy": 0.5851111486554146, "step": 1150 }, { "epoch": 0.4724569799409429, "grad_norm": 0.325250506401062, "learning_rate": 0.00011374452721039477, "loss": 1.7192, "mean_token_accuracy": 0.5636343933641911, "step": 1160 }, { "epoch": 0.47652988494043375, "grad_norm": 0.32918378710746765, "learning_rate": 0.00011243644856139403, "loss": 1.6048, "mean_token_accuracy": 0.6072004094719887, "step": 1170 }, { "epoch": 0.48060278993992467, "grad_norm": 0.2892746031284332, "learning_rate": 0.00011112620484693223, "loss": 1.6785, "mean_token_accuracy": 0.5872686378657818, "step": 1180 }, { "epoch": 0.48467569493941554, "grad_norm": 0.2459000200033188, "learning_rate": 0.0001098140241677728, "loss": 1.5799, "mean_token_accuracy": 0.6077749952673912, "step": 1190 }, { "epoch": 0.4887485999389064, "grad_norm": 0.3696756660938263, "learning_rate": 0.00010850013496188606, "loss": 1.5966, "mean_token_accuracy": 0.5970290452241898, "step": 1200 }, { "epoch": 0.4928215049383973, "grad_norm": 0.27681517601013184, "learning_rate": 0.00010718476596468028, "loss": 1.7161, "mean_token_accuracy": 0.5730410292744637, "step": 1210 }, { "epoch": 0.4968944099378882, "grad_norm": 0.2720302641391754, "learning_rate": 0.00010586814616918113, "loss": 1.6991, "mean_token_accuracy": 0.5764113113284111, "step": 1220 }, { "epoch": 0.5009673149373791, "grad_norm": 0.34990179538726807, "learning_rate": 0.00010455050478616617, "loss": 1.7114, "mean_token_accuracy": 0.5776129819452762, "step": 1230 }, { "epoch": 0.50504021993687, "grad_norm": 0.33753877878189087, "learning_rate": 0.00010323207120426142, "loss": 1.8174, "mean_token_accuracy": 0.5551487416028976, "step": 1240 }, { "epoch": 0.5091131249363609, "grad_norm": 0.41568267345428467, "learning_rate": 0.00010191307495000712, "loss": 1.799, "mean_token_accuracy": 0.5767477229237556, "step": 1250 }, { "epoch": 0.5131860299358517, "grad_norm": 0.2747114300727844, "learning_rate": 0.00010059374564789932, "loss": 1.4763, "mean_token_accuracy": 0.6238099962472916, "step": 1260 }, { "epoch": 0.5172589349353426, "grad_norm": 0.2458280771970749, "learning_rate": 9.927431298041441e-05, "loss": 1.5262, "mean_token_accuracy": 0.6056429393589496, "step": 1270 }, { "epoch": 0.5213318399348336, "grad_norm": 0.2757134437561035, "learning_rate": 9.795500664802385e-05, "loss": 1.621, "mean_token_accuracy": 0.5842474676668644, "step": 1280 }, { "epoch": 0.5254047449343244, "grad_norm": 0.21551673114299774, "learning_rate": 9.663605632920518e-05, "loss": 1.659, "mean_token_accuracy": 0.5935076788067818, "step": 1290 }, { "epoch": 0.5294776499338153, "grad_norm": 0.5034237504005432, "learning_rate": 9.53176916404576e-05, "loss": 1.7666, "mean_token_accuracy": 0.5699214018881321, "step": 1300 }, { "epoch": 0.5335505549333062, "grad_norm": 0.26525890827178955, "learning_rate": 9.400014209632763e-05, "loss": 1.6026, "mean_token_accuracy": 0.5935329027473927, "step": 1310 }, { "epoch": 0.537623459932797, "grad_norm": 0.28077974915504456, "learning_rate": 9.268363706945312e-05, "loss": 1.7769, "mean_token_accuracy": 0.5664741955697536, "step": 1320 }, { "epoch": 0.5416963649322879, "grad_norm": 0.514976978302002, "learning_rate": 9.136840575063147e-05, "loss": 1.5157, "mean_token_accuracy": 0.6034789860248566, "step": 1330 }, { "epoch": 0.5457692699317789, "grad_norm": 0.318249374628067, "learning_rate": 9.005467710891987e-05, "loss": 1.8756, "mean_token_accuracy": 0.5630597174167633, "step": 1340 }, { "epoch": 0.5498421749312697, "grad_norm": 0.24940232932567596, "learning_rate": 8.874267985177394e-05, "loss": 1.5708, "mean_token_accuracy": 0.5888857699930667, "step": 1350 }, { "epoch": 0.5539150799307606, "grad_norm": 0.26299914717674255, "learning_rate": 8.743264238523199e-05, "loss": 1.6876, "mean_token_accuracy": 0.5782084472477436, "step": 1360 }, { "epoch": 0.5579879849302515, "grad_norm": 0.2588869333267212, "learning_rate": 8.612479277415174e-05, "loss": 1.6694, "mean_token_accuracy": 0.585976778715849, "step": 1370 }, { "epoch": 0.5620608899297423, "grad_norm": 0.2464841604232788, "learning_rate": 8.481935870250637e-05, "loss": 1.5838, "mean_token_accuracy": 0.605075704306364, "step": 1380 }, { "epoch": 0.5661337949292333, "grad_norm": 0.3231446146965027, "learning_rate": 8.351656743374709e-05, "loss": 1.6321, "mean_token_accuracy": 0.5716924026608468, "step": 1390 }, { "epoch": 0.5702066999287242, "grad_norm": 0.23010632395744324, "learning_rate": 8.22166457712386e-05, "loss": 1.5016, "mean_token_accuracy": 0.6048496462404728, "step": 1400 }, { "epoch": 0.5742796049282151, "grad_norm": 0.3723667860031128, "learning_rate": 8.091982001877493e-05, "loss": 1.5412, "mean_token_accuracy": 0.6111127749085427, "step": 1410 }, { "epoch": 0.5783525099277059, "grad_norm": 0.24990710616111755, "learning_rate": 7.962631594118208e-05, "loss": 1.7629, "mean_token_accuracy": 0.5585654892027379, "step": 1420 }, { "epoch": 0.5824254149271968, "grad_norm": 0.3681967556476593, "learning_rate": 7.833635872501462e-05, "loss": 1.6342, "mean_token_accuracy": 0.5907308183610439, "step": 1430 }, { "epoch": 0.5864983199266877, "grad_norm": 0.3382493555545807, "learning_rate": 7.705017293935281e-05, "loss": 1.5803, "mean_token_accuracy": 0.6061145611107349, "step": 1440 }, { "epoch": 0.5905712249261786, "grad_norm": 0.28145501017570496, "learning_rate": 7.576798249670725e-05, "loss": 1.8459, "mean_token_accuracy": 0.5457224696874619, "step": 1450 }, { "epoch": 0.5946441299256695, "grad_norm": 0.3189752697944641, "learning_rate": 7.449001061403809e-05, "loss": 1.5263, "mean_token_accuracy": 0.5937092356383801, "step": 1460 }, { "epoch": 0.5987170349251604, "grad_norm": 0.2588890492916107, "learning_rate": 7.321647977389479e-05, "loss": 1.5965, "mean_token_accuracy": 0.5941358201205731, "step": 1470 }, { "epoch": 0.6027899399246512, "grad_norm": 0.2777283191680908, "learning_rate": 7.194761168568445e-05, "loss": 1.5667, "mean_token_accuracy": 0.6003799811005592, "step": 1480 }, { "epoch": 0.6068628449241421, "grad_norm": 0.23376941680908203, "learning_rate": 7.068362724707392e-05, "loss": 1.4813, "mean_token_accuracy": 0.6078310683369637, "step": 1490 }, { "epoch": 0.610935749923633, "grad_norm": 0.2295948565006256, "learning_rate": 6.942474650553408e-05, "loss": 1.6786, "mean_token_accuracy": 0.5886344678699971, "step": 1500 }, { "epoch": 0.615008654923124, "grad_norm": 0.3243666887283325, "learning_rate": 6.817118862003132e-05, "loss": 1.6343, "mean_token_accuracy": 0.5855603873729706, "step": 1510 }, { "epoch": 0.6190815599226148, "grad_norm": 0.7187057733535767, "learning_rate": 6.692317182287432e-05, "loss": 1.8144, "mean_token_accuracy": 0.5671629451215268, "step": 1520 }, { "epoch": 0.6231544649221057, "grad_norm": 0.35659492015838623, "learning_rate": 6.568091338172195e-05, "loss": 1.6117, "mean_token_accuracy": 0.601442601531744, "step": 1530 }, { "epoch": 0.6272273699215966, "grad_norm": 0.3395217955112457, "learning_rate": 6.444462956175876e-05, "loss": 1.6222, "mean_token_accuracy": 0.5970501154661179, "step": 1540 }, { "epoch": 0.6313002749210874, "grad_norm": 0.26399192214012146, "learning_rate": 6.321453558804571e-05, "loss": 1.6048, "mean_token_accuracy": 0.5844796732068062, "step": 1550 }, { "epoch": 0.6353731799205784, "grad_norm": 0.2993052899837494, "learning_rate": 6.199084560805121e-05, "loss": 1.7073, "mean_token_accuracy": 0.5789771333336831, "step": 1560 }, { "epoch": 0.6394460849200693, "grad_norm": 0.2676873505115509, "learning_rate": 6.077377265437043e-05, "loss": 1.8152, "mean_token_accuracy": 0.5734024614095687, "step": 1570 }, { "epoch": 0.6435189899195601, "grad_norm": 0.293557733297348, "learning_rate": 5.956352860763809e-05, "loss": 1.7108, "mean_token_accuracy": 0.5808110930025577, "step": 1580 }, { "epoch": 0.647591894919051, "grad_norm": 0.23729322850704193, "learning_rate": 5.83603241596423e-05, "loss": 1.4793, "mean_token_accuracy": 0.6202867470681668, "step": 1590 }, { "epoch": 0.6516647999185419, "grad_norm": 0.30609002709388733, "learning_rate": 5.716436877664517e-05, "loss": 1.752, "mean_token_accuracy": 0.5730870619416237, "step": 1600 }, { "epoch": 0.6557377049180327, "grad_norm": 0.30717799067497253, "learning_rate": 5.5975870662916484e-05, "loss": 1.7172, "mean_token_accuracy": 0.5701417997479439, "step": 1610 }, { "epoch": 0.6598106099175237, "grad_norm": 0.44037064909935, "learning_rate": 5.4795036724487735e-05, "loss": 1.5377, "mean_token_accuracy": 0.6102925211191177, "step": 1620 }, { "epoch": 0.6638835149170146, "grad_norm": 0.24488377571105957, "learning_rate": 5.362207253313136e-05, "loss": 1.4547, "mean_token_accuracy": 0.6181615687906742, "step": 1630 }, { "epoch": 0.6679564199165055, "grad_norm": 0.2750435769557953, "learning_rate": 5.245718229057326e-05, "loss": 1.6086, "mean_token_accuracy": 0.5703060247004033, "step": 1640 }, { "epoch": 0.6720293249159963, "grad_norm": 0.2821342647075653, "learning_rate": 5.1300568792942535e-05, "loss": 1.6018, "mean_token_accuracy": 0.5989562854170799, "step": 1650 }, { "epoch": 0.6761022299154872, "grad_norm": 0.22521165013313293, "learning_rate": 5.015243339546731e-05, "loss": 1.7574, "mean_token_accuracy": 0.5801547184586525, "step": 1660 }, { "epoch": 0.6801751349149782, "grad_norm": 0.29259297251701355, "learning_rate": 4.90129759774202e-05, "loss": 1.7425, "mean_token_accuracy": 0.5723637498915195, "step": 1670 }, { "epoch": 0.684248039914469, "grad_norm": 0.2705146074295044, "learning_rate": 4.7882394907321674e-05, "loss": 1.6121, "mean_token_accuracy": 0.6098110035061837, "step": 1680 }, { "epoch": 0.6883209449139599, "grad_norm": 0.2677505910396576, "learning_rate": 4.676088700840575e-05, "loss": 1.6416, "mean_token_accuracy": 0.5757282719016075, "step": 1690 }, { "epoch": 0.6923938499134508, "grad_norm": 0.2644527554512024, "learning_rate": 4.564864752435509e-05, "loss": 1.6675, "mean_token_accuracy": 0.6154301188886165, "step": 1700 }, { "epoch": 0.6964667549129416, "grad_norm": 0.23048701882362366, "learning_rate": 4.454587008531097e-05, "loss": 1.6641, "mean_token_accuracy": 0.5855869121849537, "step": 1710 }, { "epoch": 0.7005396599124325, "grad_norm": 0.2789078652858734, "learning_rate": 4.345274667416399e-05, "loss": 1.6978, "mean_token_accuracy": 0.5762215368449688, "step": 1720 }, { "epoch": 0.7046125649119235, "grad_norm": 0.271881103515625, "learning_rate": 4.2369467593131926e-05, "loss": 1.681, "mean_token_accuracy": 0.5667479492723941, "step": 1730 }, { "epoch": 0.7086854699114143, "grad_norm": 0.24953240156173706, "learning_rate": 4.129622143062985e-05, "loss": 1.5405, "mean_token_accuracy": 0.6005463972687721, "step": 1740 }, { "epoch": 0.7127583749109052, "grad_norm": 0.3925758898258209, "learning_rate": 4.02331950284387e-05, "loss": 1.7217, "mean_token_accuracy": 0.5689709268510341, "step": 1750 }, { "epoch": 0.7168312799103961, "grad_norm": 0.2544846832752228, "learning_rate": 3.918057344917795e-05, "loss": 1.5948, "mean_token_accuracy": 0.5933421194553375, "step": 1760 }, { "epoch": 0.720904184909887, "grad_norm": 0.32760509848594666, "learning_rate": 3.813853994408793e-05, "loss": 1.6678, "mean_token_accuracy": 0.5856216661632061, "step": 1770 }, { "epoch": 0.7249770899093778, "grad_norm": 0.2847062647342682, "learning_rate": 3.7107275921127704e-05, "loss": 1.682, "mean_token_accuracy": 0.5889982558786869, "step": 1780 }, { "epoch": 0.7290499949088688, "grad_norm": 0.22774401307106018, "learning_rate": 3.60869609133936e-05, "loss": 1.7135, "mean_token_accuracy": 0.5773006275296211, "step": 1790 }, { "epoch": 0.7331228999083597, "grad_norm": 0.2606080174446106, "learning_rate": 3.507777254786425e-05, "loss": 1.4999, "mean_token_accuracy": 0.6269011601805687, "step": 1800 }, { "epoch": 0.7371958049078505, "grad_norm": 0.2962757647037506, "learning_rate": 3.407988651447738e-05, "loss": 1.6202, "mean_token_accuracy": 0.5973276488482953, "step": 1810 }, { "epoch": 0.7412687099073414, "grad_norm": 0.29107147455215454, "learning_rate": 3.3093476535544074e-05, "loss": 1.5502, "mean_token_accuracy": 0.6133273020386696, "step": 1820 }, { "epoch": 0.7453416149068323, "grad_norm": 0.20980948209762573, "learning_rate": 3.211871433550513e-05, "loss": 1.6333, "mean_token_accuracy": 0.6155988665297627, "step": 1830 }, { "epoch": 0.7494145199063232, "grad_norm": 0.24882718920707703, "learning_rate": 3.1155769611035825e-05, "loss": 1.4907, "mean_token_accuracy": 0.6201219961047173, "step": 1840 }, { "epoch": 0.7534874249058141, "grad_norm": 0.23715901374816895, "learning_rate": 3.0204810001503124e-05, "loss": 1.8018, "mean_token_accuracy": 0.5756942637264728, "step": 1850 }, { "epoch": 0.757560329905305, "grad_norm": 0.35216882824897766, "learning_rate": 2.9266001059781258e-05, "loss": 1.7305, "mean_token_accuracy": 0.5722471877932549, "step": 1860 }, { "epoch": 0.7616332349047958, "grad_norm": 0.2924104332923889, "learning_rate": 2.83395062234308e-05, "loss": 1.6642, "mean_token_accuracy": 0.58627370595932, "step": 1870 }, { "epoch": 0.7657061399042867, "grad_norm": 0.27772393822669983, "learning_rate": 2.742548678624548e-05, "loss": 1.8349, "mean_token_accuracy": 0.5614061944186688, "step": 1880 }, { "epoch": 0.7697790449037776, "grad_norm": 0.31574469804763794, "learning_rate": 2.6524101870172846e-05, "loss": 1.7883, "mean_token_accuracy": 0.561104378849268, "step": 1890 }, { "epoch": 0.7738519499032686, "grad_norm": 0.253779798746109, "learning_rate": 2.5635508397612262e-05, "loss": 1.6654, "mean_token_accuracy": 0.5888113439083099, "step": 1900 }, { "epoch": 0.7779248549027594, "grad_norm": 0.2504970133304596, "learning_rate": 2.4759861064096603e-05, "loss": 1.6478, "mean_token_accuracy": 0.5726306334137916, "step": 1910 }, { "epoch": 0.7819977599022503, "grad_norm": 0.23571030795574188, "learning_rate": 2.3897312311360955e-05, "loss": 1.5355, "mean_token_accuracy": 0.6026113323867321, "step": 1920 }, { "epoch": 0.7860706649017412, "grad_norm": 0.2395690232515335, "learning_rate": 2.3048012300804222e-05, "loss": 1.5565, "mean_token_accuracy": 0.5976604223251343, "step": 1930 }, { "epoch": 0.790143569901232, "grad_norm": 0.5269713997840881, "learning_rate": 2.221210888734736e-05, "loss": 1.636, "mean_token_accuracy": 0.5818449839949608, "step": 1940 }, { "epoch": 0.7942164749007229, "grad_norm": 0.4233987033367157, "learning_rate": 2.13897475936933e-05, "loss": 1.7844, "mean_token_accuracy": 0.5720866233110428, "step": 1950 }, { "epoch": 0.7982893799002139, "grad_norm": 0.2641923427581787, "learning_rate": 2.0581071584992818e-05, "loss": 1.5874, "mean_token_accuracy": 0.5966846913099288, "step": 1960 }, { "epoch": 0.8023622848997047, "grad_norm": 0.27280351519584656, "learning_rate": 1.9786221643920844e-05, "loss": 1.6279, "mean_token_accuracy": 0.5751761384308338, "step": 1970 }, { "epoch": 0.8064351898991956, "grad_norm": 0.3823714256286621, "learning_rate": 1.9005336146167686e-05, "loss": 1.6269, "mean_token_accuracy": 0.5963201723992825, "step": 1980 }, { "epoch": 0.8105080948986865, "grad_norm": 0.25173816084861755, "learning_rate": 1.8238551036349028e-05, "loss": 1.5308, "mean_token_accuracy": 0.6112879984080791, "step": 1990 }, { "epoch": 0.8145809998981773, "grad_norm": 0.21256780624389648, "learning_rate": 1.7485999804339348e-05, "loss": 1.5568, "mean_token_accuracy": 0.5963364981114865, "step": 2000 }, { "epoch": 0.8186539048976683, "grad_norm": 0.2510949969291687, "learning_rate": 1.6747813462032615e-05, "loss": 1.6787, "mean_token_accuracy": 0.58960345312953, "step": 2010 }, { "epoch": 0.8227268098971592, "grad_norm": 0.255790650844574, "learning_rate": 1.6024120520534326e-05, "loss": 1.6416, "mean_token_accuracy": 0.5875880800187587, "step": 2020 }, { "epoch": 0.82679971489665, "grad_norm": 0.307492196559906, "learning_rate": 1.5315046967789082e-05, "loss": 1.69, "mean_token_accuracy": 0.5625761769711971, "step": 2030 }, { "epoch": 0.8308726198961409, "grad_norm": 0.2648999094963074, "learning_rate": 1.4620716246647203e-05, "loss": 1.6092, "mean_token_accuracy": 0.6106476083397865, "step": 2040 }, { "epoch": 0.8349455248956318, "grad_norm": 0.2488166093826294, "learning_rate": 1.394124923337462e-05, "loss": 1.6848, "mean_token_accuracy": 0.5697021905332804, "step": 2050 }, { "epoch": 0.8390184298951227, "grad_norm": 0.2427694946527481, "learning_rate": 1.3276764216609294e-05, "loss": 1.5843, "mean_token_accuracy": 0.6084981314837933, "step": 2060 }, { "epoch": 0.8430913348946136, "grad_norm": 0.2833966910839081, "learning_rate": 1.2627376876768593e-05, "loss": 1.5443, "mean_token_accuracy": 0.6015144042670727, "step": 2070 }, { "epoch": 0.8471642398941045, "grad_norm": 0.4057978689670563, "learning_rate": 1.1993200265910131e-05, "loss": 1.6073, "mean_token_accuracy": 0.5917512811720371, "step": 2080 }, { "epoch": 0.8512371448935954, "grad_norm": 0.25613030791282654, "learning_rate": 1.1374344788050829e-05, "loss": 1.8038, "mean_token_accuracy": 0.5568435616791249, "step": 2090 }, { "epoch": 0.8553100498930862, "grad_norm": 0.30181950330734253, "learning_rate": 1.0770918179946388e-05, "loss": 1.5022, "mean_token_accuracy": 0.6081097513437271, "step": 2100 }, { "epoch": 0.8593829548925771, "grad_norm": 0.23373402655124664, "learning_rate": 1.0183025492335408e-05, "loss": 1.7432, "mean_token_accuracy": 0.5653887689113617, "step": 2110 }, { "epoch": 0.863455859892068, "grad_norm": 0.2826649248600006, "learning_rate": 9.610769071651193e-06, "loss": 1.6706, "mean_token_accuracy": 0.5875243842601776, "step": 2120 }, { "epoch": 0.867528764891559, "grad_norm": 0.3047688603401184, "learning_rate": 9.05424854220408e-06, "loss": 1.5901, "mean_token_accuracy": 0.6013362683355808, "step": 2130 }, { "epoch": 0.8716016698910498, "grad_norm": 0.3211512863636017, "learning_rate": 8.513560788837916e-06, "loss": 1.6414, "mean_token_accuracy": 0.5845984369516373, "step": 2140 }, { "epoch": 0.8756745748905407, "grad_norm": 0.22475050389766693, "learning_rate": 7.988799940063297e-06, "loss": 1.6038, "mean_token_accuracy": 0.5835995152592659, "step": 2150 }, { "epoch": 0.8797474798900315, "grad_norm": 0.2239948809146881, "learning_rate": 7.480057351670688e-06, "loss": 1.6661, "mean_token_accuracy": 0.5898953646421432, "step": 2160 }, { "epoch": 0.8838203848895224, "grad_norm": 0.3669275641441345, "learning_rate": 6.987421590826282e-06, "loss": 1.6066, "mean_token_accuracy": 0.5877827815711498, "step": 2170 }, { "epoch": 0.8878932898890134, "grad_norm": 0.30003634095191956, "learning_rate": 6.510978420653335e-06, "loss": 1.6816, "mean_token_accuracy": 0.5926426865160466, "step": 2180 }, { "epoch": 0.8919661948885043, "grad_norm": 0.2707299590110779, "learning_rate": 6.050810785301597e-06, "loss": 1.7702, "mean_token_accuracy": 0.561020129173994, "step": 2190 }, { "epoch": 0.8960390998879951, "grad_norm": 0.3029952347278595, "learning_rate": 5.606998795507578e-06, "loss": 1.5417, "mean_token_accuracy": 0.598423033952713, "step": 2200 }, { "epoch": 0.900112004887486, "grad_norm": 0.27840766310691833, "learning_rate": 5.1796197146479985e-06, "loss": 1.5119, "mean_token_accuracy": 0.6152562454342843, "step": 2210 }, { "epoch": 0.9041849098869769, "grad_norm": 0.28235796093940735, "learning_rate": 4.768747945288987e-06, "loss": 1.5287, "mean_token_accuracy": 0.61318289488554, "step": 2220 }, { "epoch": 0.9082578148864677, "grad_norm": 0.21450947225093842, "learning_rate": 4.37445501623337e-06, "loss": 1.5842, "mean_token_accuracy": 0.6025399126112461, "step": 2230 }, { "epoch": 0.9123307198859587, "grad_norm": 0.29954469203948975, "learning_rate": 3.996809570068127e-06, "loss": 1.5514, "mean_token_accuracy": 0.6040661752223968, "step": 2240 }, { "epoch": 0.9164036248854496, "grad_norm": 0.34261876344680786, "learning_rate": 3.635877351214445e-06, "loss": 1.5493, "mean_token_accuracy": 0.5996488876640796, "step": 2250 }, { "epoch": 0.9204765298849404, "grad_norm": 0.24511079490184784, "learning_rate": 3.291721194482189e-06, "loss": 1.5494, "mean_token_accuracy": 0.6054005287587643, "step": 2260 }, { "epoch": 0.9245494348844313, "grad_norm": 0.21510252356529236, "learning_rate": 2.9644010141310017e-06, "loss": 1.6294, "mean_token_accuracy": 0.5961603626608849, "step": 2270 }, { "epoch": 0.9286223398839222, "grad_norm": 0.23636655509471893, "learning_rate": 2.65397379343979e-06, "loss": 1.7332, "mean_token_accuracy": 0.5859133303165436, "step": 2280 }, { "epoch": 0.932695244883413, "grad_norm": 0.25582408905029297, "learning_rate": 2.3604935747865377e-06, "loss": 1.6691, "mean_token_accuracy": 0.5889919593930244, "step": 2290 }, { "epoch": 0.936768149882904, "grad_norm": 0.3853449523448944, "learning_rate": 2.0840114502400086e-06, "loss": 1.5358, "mean_token_accuracy": 0.5844359740614891, "step": 2300 }, { "epoch": 0.9408410548823949, "grad_norm": 0.2177136093378067, "learning_rate": 1.8245755526650753e-06, "loss": 1.6318, "mean_token_accuracy": 0.5915890723466873, "step": 2310 }, { "epoch": 0.9449139598818858, "grad_norm": 0.23138591647148132, "learning_rate": 1.5822310473433411e-06, "loss": 1.5595, "mean_token_accuracy": 0.5974130786955356, "step": 2320 }, { "epoch": 0.9489868648813766, "grad_norm": 0.2235519289970398, "learning_rate": 1.357020124110231e-06, "loss": 1.7522, "mean_token_accuracy": 0.5713608346879482, "step": 2330 }, { "epoch": 0.9530597698808675, "grad_norm": 0.37900933623313904, "learning_rate": 1.1489819900101784e-06, "loss": 1.5307, "mean_token_accuracy": 0.6045880667865277, "step": 2340 }, { "epoch": 0.9571326748803585, "grad_norm": 0.2911360561847687, "learning_rate": 9.581528624710734e-07, "loss": 1.5633, "mean_token_accuracy": 0.5826431967318058, "step": 2350 }, { "epoch": 0.9612055798798493, "grad_norm": 0.25369352102279663, "learning_rate": 7.845659629990842e-07, "loss": 1.6927, "mean_token_accuracy": 0.5901580177247524, "step": 2360 }, { "epoch": 0.9652784848793402, "grad_norm": 0.32107028365135193, "learning_rate": 6.282515113952281e-07, "loss": 1.815, "mean_token_accuracy": 0.56534923017025, "step": 2370 }, { "epoch": 0.9693513898788311, "grad_norm": 0.3105465769767761, "learning_rate": 4.892367204943016e-07, "loss": 1.5694, "mean_token_accuracy": 0.5809950686991214, "step": 2380 }, { "epoch": 0.9734242948783219, "grad_norm": 0.2689298689365387, "learning_rate": 3.6754579142741495e-07, "loss": 1.6555, "mean_token_accuracy": 0.591179046779871, "step": 2390 }, { "epoch": 0.9774971998778128, "grad_norm": 0.44850870966911316, "learning_rate": 2.6319990940885107e-07, "loss": 1.7315, "mean_token_accuracy": 0.5772897489368916, "step": 2400 }, { "epoch": 0.9815701048773038, "grad_norm": 0.25496381521224976, "learning_rate": 1.762172400478601e-07, "loss": 1.5847, "mean_token_accuracy": 0.5798953503370285, "step": 2410 }, { "epoch": 0.9856430098767947, "grad_norm": 0.2383822500705719, "learning_rate": 1.0661292618624474e-07, "loss": 1.54, "mean_token_accuracy": 0.6138455606997013, "step": 2420 }, { "epoch": 0.9897159148762855, "grad_norm": 0.2854715585708618, "learning_rate": 5.439908526212456e-08, "loss": 1.4109, "mean_token_accuracy": 0.6151122771203518, "step": 2430 }, { "epoch": 0.9937888198757764, "grad_norm": 0.297370046377182, "learning_rate": 1.9584807200423438e-08, "loss": 1.5128, "mean_token_accuracy": 0.6013165354728699, "step": 2440 }, { "epoch": 0.9978617248752673, "grad_norm": 0.2563394010066986, "learning_rate": 2.176152830357658e-09, "loss": 1.6287, "mean_token_accuracy": 0.5945099242031574, "step": 2450 } ], "logging_steps": 10, "max_steps": 2455, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.283473658609664e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }