{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 20, "global_step": 8786, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0022765430693491933, "grad_norm": 0.469247430562973, "learning_rate": 0.0002, "loss": 1.9469, "step": 20 }, { "epoch": 0.004553086138698387, "grad_norm": 0.6239348649978638, "learning_rate": 0.0002, "loss": 1.556, "step": 40 }, { "epoch": 0.006829629208047579, "grad_norm": 0.4587397277355194, "learning_rate": 0.0002, "loss": 1.4108, "step": 60 }, { "epoch": 0.009106172277396773, "grad_norm": 0.42919760942459106, "learning_rate": 0.0002, "loss": 1.3352, "step": 80 }, { "epoch": 0.011382715346745967, "grad_norm": 0.46492573618888855, "learning_rate": 0.0002, "loss": 1.3388, "step": 100 }, { "epoch": 0.013659258416095159, "grad_norm": 0.453070729970932, "learning_rate": 0.0002, "loss": 1.2295, "step": 120 }, { "epoch": 0.015935801485444354, "grad_norm": 0.4760678708553314, "learning_rate": 0.0002, "loss": 1.2493, "step": 140 }, { "epoch": 0.018212344554793546, "grad_norm": 0.4545675814151764, "learning_rate": 0.0002, "loss": 1.215, "step": 160 }, { "epoch": 0.020488887624142738, "grad_norm": 0.4772235155105591, "learning_rate": 0.0002, "loss": 1.2173, "step": 180 }, { "epoch": 0.022765430693491934, "grad_norm": 0.4403541088104248, "learning_rate": 0.0002, "loss": 1.1058, "step": 200 }, { "epoch": 0.025041973762841126, "grad_norm": 0.511401355266571, "learning_rate": 0.0002, "loss": 1.1049, "step": 220 }, { "epoch": 0.027318516832190318, "grad_norm": 0.3809013366699219, "learning_rate": 0.0002, "loss": 1.0498, "step": 240 }, { "epoch": 0.029595059901539513, "grad_norm": 0.3980010449886322, "learning_rate": 0.0002, "loss": 0.9842, "step": 260 }, { "epoch": 0.03187160297088871, "grad_norm": 0.5747793316841125, "learning_rate": 0.0002, "loss": 1.0988, "step": 280 }, { "epoch": 0.0341481460402379, "grad_norm": 0.46827971935272217, "learning_rate": 0.0002, "loss": 1.0367, "step": 300 }, { "epoch": 0.03642468910958709, "grad_norm": 0.4702209532260895, "learning_rate": 0.0002, "loss": 1.066, "step": 320 }, { "epoch": 0.038701232178936285, "grad_norm": 0.5084996223449707, "learning_rate": 0.0002, "loss": 1.0652, "step": 340 }, { "epoch": 0.040977775248285477, "grad_norm": 0.3944012522697449, "learning_rate": 0.0002, "loss": 0.9642, "step": 360 }, { "epoch": 0.04325431831763467, "grad_norm": 0.40287718176841736, "learning_rate": 0.0002, "loss": 0.9431, "step": 380 }, { "epoch": 0.04553086138698387, "grad_norm": 0.4629077613353729, "learning_rate": 0.0002, "loss": 0.9615, "step": 400 }, { "epoch": 0.04780740445633306, "grad_norm": 0.44827452301979065, "learning_rate": 0.0002, "loss": 0.9434, "step": 420 }, { "epoch": 0.05008394752568225, "grad_norm": 0.41644710302352905, "learning_rate": 0.0002, "loss": 0.9241, "step": 440 }, { "epoch": 0.05236049059503144, "grad_norm": 0.4760611057281494, "learning_rate": 0.0002, "loss": 0.8475, "step": 460 }, { "epoch": 0.054637033664380635, "grad_norm": 0.45987364649772644, "learning_rate": 0.0002, "loss": 0.898, "step": 480 }, { "epoch": 0.056913576733729834, "grad_norm": 0.4840068817138672, "learning_rate": 0.0002, "loss": 0.9611, "step": 500 }, { "epoch": 0.059190119803079026, "grad_norm": 0.40314286947250366, "learning_rate": 0.0002, "loss": 0.8884, "step": 520 }, { "epoch": 0.06146666287242822, "grad_norm": 0.5458106398582458, "learning_rate": 0.0002, "loss": 0.8939, "step": 540 }, { "epoch": 0.06374320594177742, "grad_norm": 0.5420896410942078, "learning_rate": 0.0002, "loss": 0.8265, "step": 560 }, { "epoch": 0.0660197490111266, "grad_norm": 0.5356529355049133, "learning_rate": 0.0002, "loss": 0.8432, "step": 580 }, { "epoch": 0.0682962920804758, "grad_norm": 0.5064826011657715, "learning_rate": 0.0002, "loss": 0.8272, "step": 600 }, { "epoch": 0.07057283514982499, "grad_norm": 0.4143005311489105, "learning_rate": 0.0002, "loss": 0.7854, "step": 620 }, { "epoch": 0.07284937821917419, "grad_norm": 0.3817225396633148, "learning_rate": 0.0002, "loss": 0.8219, "step": 640 }, { "epoch": 0.07512592128852338, "grad_norm": 0.5336936712265015, "learning_rate": 0.0002, "loss": 0.7977, "step": 660 }, { "epoch": 0.07740246435787257, "grad_norm": 0.5397001504898071, "learning_rate": 0.0002, "loss": 0.8117, "step": 680 }, { "epoch": 0.07967900742722177, "grad_norm": 0.4968530535697937, "learning_rate": 0.0002, "loss": 0.7527, "step": 700 }, { "epoch": 0.08195555049657095, "grad_norm": 0.4084935784339905, "learning_rate": 0.0002, "loss": 0.651, "step": 720 }, { "epoch": 0.08423209356592015, "grad_norm": 0.48406732082366943, "learning_rate": 0.0002, "loss": 0.7352, "step": 740 }, { "epoch": 0.08650863663526934, "grad_norm": 0.5246301293373108, "learning_rate": 0.0002, "loss": 0.7785, "step": 760 }, { "epoch": 0.08878517970461854, "grad_norm": 0.5729619264602661, "learning_rate": 0.0002, "loss": 0.7646, "step": 780 }, { "epoch": 0.09106172277396773, "grad_norm": 0.5675190687179565, "learning_rate": 0.0002, "loss": 0.7784, "step": 800 }, { "epoch": 0.09333826584331692, "grad_norm": 0.4682878255844116, "learning_rate": 0.0002, "loss": 0.7284, "step": 820 }, { "epoch": 0.09561480891266612, "grad_norm": 0.5388545393943787, "learning_rate": 0.0002, "loss": 0.6959, "step": 840 }, { "epoch": 0.0978913519820153, "grad_norm": 0.48806509375572205, "learning_rate": 0.0002, "loss": 0.7585, "step": 860 }, { "epoch": 0.1001678950513645, "grad_norm": 0.4149261713027954, "learning_rate": 0.0002, "loss": 0.6978, "step": 880 }, { "epoch": 0.1024444381207137, "grad_norm": 0.4971105754375458, "learning_rate": 0.0002, "loss": 0.7103, "step": 900 }, { "epoch": 0.10472098119006289, "grad_norm": 0.5066735744476318, "learning_rate": 0.0002, "loss": 0.6854, "step": 920 }, { "epoch": 0.10699752425941209, "grad_norm": 0.4922661781311035, "learning_rate": 0.0002, "loss": 0.6231, "step": 940 }, { "epoch": 0.10927406732876127, "grad_norm": 0.5949555039405823, "learning_rate": 0.0002, "loss": 0.6813, "step": 960 }, { "epoch": 0.11155061039811047, "grad_norm": 0.581446647644043, "learning_rate": 0.0002, "loss": 0.6174, "step": 980 }, { "epoch": 0.11382715346745967, "grad_norm": 0.6152529716491699, "learning_rate": 0.0002, "loss": 0.6405, "step": 1000 }, { "epoch": 0.11610369653680885, "grad_norm": 0.5986836552619934, "learning_rate": 0.0002, "loss": 0.5776, "step": 1020 }, { "epoch": 0.11838023960615805, "grad_norm": 0.4255094528198242, "learning_rate": 0.0002, "loss": 0.6576, "step": 1040 }, { "epoch": 0.12065678267550724, "grad_norm": 0.4563849866390228, "learning_rate": 0.0002, "loss": 0.6647, "step": 1060 }, { "epoch": 0.12293332574485644, "grad_norm": 0.593227744102478, "learning_rate": 0.0002, "loss": 0.6043, "step": 1080 }, { "epoch": 0.12520986881420562, "grad_norm": 0.47059598565101624, "learning_rate": 0.0002, "loss": 0.591, "step": 1100 }, { "epoch": 0.12748641188355483, "grad_norm": 0.5013225674629211, "learning_rate": 0.0002, "loss": 0.5947, "step": 1120 }, { "epoch": 0.12976295495290402, "grad_norm": 0.46772757172584534, "learning_rate": 0.0002, "loss": 0.6292, "step": 1140 }, { "epoch": 0.1320394980222532, "grad_norm": 0.5844313502311707, "learning_rate": 0.0002, "loss": 0.6128, "step": 1160 }, { "epoch": 0.1343160410916024, "grad_norm": 0.5295489430427551, "learning_rate": 0.0002, "loss": 0.6064, "step": 1180 }, { "epoch": 0.1365925841609516, "grad_norm": 0.4482004642486572, "learning_rate": 0.0002, "loss": 0.5899, "step": 1200 }, { "epoch": 0.1388691272303008, "grad_norm": 0.6281692981719971, "learning_rate": 0.0002, "loss": 0.6109, "step": 1220 }, { "epoch": 0.14114567029964997, "grad_norm": 0.4718242585659027, "learning_rate": 0.0002, "loss": 0.5857, "step": 1240 }, { "epoch": 0.14342221336899919, "grad_norm": 0.5219341516494751, "learning_rate": 0.0002, "loss": 0.5581, "step": 1260 }, { "epoch": 0.14569875643834837, "grad_norm": 0.47050580382347107, "learning_rate": 0.0002, "loss": 0.6368, "step": 1280 }, { "epoch": 0.14797529950769756, "grad_norm": 0.5425338745117188, "learning_rate": 0.0002, "loss": 0.5626, "step": 1300 }, { "epoch": 0.15025184257704677, "grad_norm": 0.4944934844970703, "learning_rate": 0.0002, "loss": 0.5337, "step": 1320 }, { "epoch": 0.15252838564639595, "grad_norm": 0.5921599864959717, "learning_rate": 0.0002, "loss": 0.5672, "step": 1340 }, { "epoch": 0.15480492871574514, "grad_norm": 0.4866751730442047, "learning_rate": 0.0002, "loss": 0.5305, "step": 1360 }, { "epoch": 0.15708147178509432, "grad_norm": 0.62166827917099, "learning_rate": 0.0002, "loss": 0.5737, "step": 1380 }, { "epoch": 0.15935801485444354, "grad_norm": 0.5006982684135437, "learning_rate": 0.0002, "loss": 0.5542, "step": 1400 }, { "epoch": 0.16163455792379272, "grad_norm": 0.6090095043182373, "learning_rate": 0.0002, "loss": 0.5215, "step": 1420 }, { "epoch": 0.1639111009931419, "grad_norm": 0.4260309636592865, "learning_rate": 0.0002, "loss": 0.5535, "step": 1440 }, { "epoch": 0.16618764406249112, "grad_norm": 0.48657718300819397, "learning_rate": 0.0002, "loss": 0.5441, "step": 1460 }, { "epoch": 0.1684641871318403, "grad_norm": 0.43275007605552673, "learning_rate": 0.0002, "loss": 0.5161, "step": 1480 }, { "epoch": 0.1707407302011895, "grad_norm": 0.4225006699562073, "learning_rate": 0.0002, "loss": 0.512, "step": 1500 }, { "epoch": 0.17301727327053867, "grad_norm": 0.5176346302032471, "learning_rate": 0.0002, "loss": 0.5384, "step": 1520 }, { "epoch": 0.1752938163398879, "grad_norm": 0.6492679715156555, "learning_rate": 0.0002, "loss": 0.4981, "step": 1540 }, { "epoch": 0.17757035940923707, "grad_norm": 0.5511758327484131, "learning_rate": 0.0002, "loss": 0.5289, "step": 1560 }, { "epoch": 0.17984690247858626, "grad_norm": 0.5211341977119446, "learning_rate": 0.0002, "loss": 0.5002, "step": 1580 }, { "epoch": 0.18212344554793547, "grad_norm": 0.5488260984420776, "learning_rate": 0.0002, "loss": 0.5178, "step": 1600 }, { "epoch": 0.18439998861728465, "grad_norm": 0.6779264211654663, "learning_rate": 0.0002, "loss": 0.5155, "step": 1620 }, { "epoch": 0.18667653168663384, "grad_norm": 0.502919614315033, "learning_rate": 0.0002, "loss": 0.4923, "step": 1640 }, { "epoch": 0.18895307475598305, "grad_norm": 0.4989205300807953, "learning_rate": 0.0002, "loss": 0.4825, "step": 1660 }, { "epoch": 0.19122961782533224, "grad_norm": 0.5155315399169922, "learning_rate": 0.0002, "loss": 0.4796, "step": 1680 }, { "epoch": 0.19350616089468142, "grad_norm": 0.5648865699768066, "learning_rate": 0.0002, "loss": 0.4985, "step": 1700 }, { "epoch": 0.1957827039640306, "grad_norm": 0.606176495552063, "learning_rate": 0.0002, "loss": 0.4819, "step": 1720 }, { "epoch": 0.19805924703337982, "grad_norm": 0.5440786480903625, "learning_rate": 0.0002, "loss": 0.5213, "step": 1740 }, { "epoch": 0.200335790102729, "grad_norm": 0.43152502179145813, "learning_rate": 0.0002, "loss": 0.4429, "step": 1760 }, { "epoch": 0.2026123331720782, "grad_norm": 0.5701313614845276, "learning_rate": 0.0002, "loss": 0.4486, "step": 1780 }, { "epoch": 0.2048888762414274, "grad_norm": 0.565666913986206, "learning_rate": 0.0002, "loss": 0.4561, "step": 1800 }, { "epoch": 0.2071654193107766, "grad_norm": 0.5725598931312561, "learning_rate": 0.0002, "loss": 0.4757, "step": 1820 }, { "epoch": 0.20944196238012577, "grad_norm": 0.4642520248889923, "learning_rate": 0.0002, "loss": 0.438, "step": 1840 }, { "epoch": 0.21171850544947496, "grad_norm": 0.6077229976654053, "learning_rate": 0.0002, "loss": 0.4295, "step": 1860 }, { "epoch": 0.21399504851882417, "grad_norm": 0.6314090490341187, "learning_rate": 0.0002, "loss": 0.449, "step": 1880 }, { "epoch": 0.21627159158817336, "grad_norm": 0.4416756331920624, "learning_rate": 0.0002, "loss": 0.4554, "step": 1900 }, { "epoch": 0.21854813465752254, "grad_norm": 0.5278882384300232, "learning_rate": 0.0002, "loss": 0.4554, "step": 1920 }, { "epoch": 0.22082467772687175, "grad_norm": 0.45619043707847595, "learning_rate": 0.0002, "loss": 0.4868, "step": 1940 }, { "epoch": 0.22310122079622094, "grad_norm": 0.5881581902503967, "learning_rate": 0.0002, "loss": 0.4672, "step": 1960 }, { "epoch": 0.22537776386557012, "grad_norm": 0.5379284024238586, "learning_rate": 0.0002, "loss": 0.4531, "step": 1980 }, { "epoch": 0.22765430693491934, "grad_norm": 0.5562624931335449, "learning_rate": 0.0002, "loss": 0.464, "step": 2000 }, { "epoch": 0.22993085000426852, "grad_norm": 0.554499626159668, "learning_rate": 0.0002, "loss": 0.446, "step": 2020 }, { "epoch": 0.2322073930736177, "grad_norm": 0.509219229221344, "learning_rate": 0.0002, "loss": 0.4417, "step": 2040 }, { "epoch": 0.2344839361429669, "grad_norm": 0.5206849575042725, "learning_rate": 0.0002, "loss": 0.4118, "step": 2060 }, { "epoch": 0.2367604792123161, "grad_norm": 0.548729658126831, "learning_rate": 0.0002, "loss": 0.4067, "step": 2080 }, { "epoch": 0.2390370222816653, "grad_norm": 0.4220084846019745, "learning_rate": 0.0002, "loss": 0.428, "step": 2100 }, { "epoch": 0.24131356535101448, "grad_norm": 0.5507292747497559, "learning_rate": 0.0002, "loss": 0.4176, "step": 2120 }, { "epoch": 0.2435901084203637, "grad_norm": 0.5605701208114624, "learning_rate": 0.0002, "loss": 0.4661, "step": 2140 }, { "epoch": 0.24586665148971287, "grad_norm": 0.43142881989479065, "learning_rate": 0.0002, "loss": 0.4197, "step": 2160 }, { "epoch": 0.24814319455906206, "grad_norm": 0.47790080308914185, "learning_rate": 0.0002, "loss": 0.4568, "step": 2180 }, { "epoch": 0.25041973762841124, "grad_norm": 0.6048968434333801, "learning_rate": 0.0002, "loss": 0.4199, "step": 2200 }, { "epoch": 0.25269628069776046, "grad_norm": 0.4925907850265503, "learning_rate": 0.0002, "loss": 0.4325, "step": 2220 }, { "epoch": 0.25497282376710967, "grad_norm": 0.5463051199913025, "learning_rate": 0.0002, "loss": 0.4549, "step": 2240 }, { "epoch": 0.2572493668364588, "grad_norm": 0.4631319046020508, "learning_rate": 0.0002, "loss": 0.3977, "step": 2260 }, { "epoch": 0.25952590990580804, "grad_norm": 0.4965234398841858, "learning_rate": 0.0002, "loss": 0.4285, "step": 2280 }, { "epoch": 0.2618024529751572, "grad_norm": 0.5436238646507263, "learning_rate": 0.0002, "loss": 0.4039, "step": 2300 }, { "epoch": 0.2640789960445064, "grad_norm": 0.5218191742897034, "learning_rate": 0.0002, "loss": 0.4092, "step": 2320 }, { "epoch": 0.2663555391138556, "grad_norm": 0.5417261719703674, "learning_rate": 0.0002, "loss": 0.3825, "step": 2340 }, { "epoch": 0.2686320821832048, "grad_norm": 0.6126281023025513, "learning_rate": 0.0002, "loss": 0.4391, "step": 2360 }, { "epoch": 0.270908625252554, "grad_norm": 0.4734433889389038, "learning_rate": 0.0002, "loss": 0.4151, "step": 2380 }, { "epoch": 0.2731851683219032, "grad_norm": 0.4501429796218872, "learning_rate": 0.0002, "loss": 0.4178, "step": 2400 }, { "epoch": 0.27546171139125236, "grad_norm": 0.5258509516716003, "learning_rate": 0.0002, "loss": 0.4007, "step": 2420 }, { "epoch": 0.2777382544606016, "grad_norm": 0.47874951362609863, "learning_rate": 0.0002, "loss": 0.4245, "step": 2440 }, { "epoch": 0.2800147975299508, "grad_norm": 0.528533399105072, "learning_rate": 0.0002, "loss": 0.3794, "step": 2460 }, { "epoch": 0.28229134059929994, "grad_norm": 0.46465063095092773, "learning_rate": 0.0002, "loss": 0.4019, "step": 2480 }, { "epoch": 0.28456788366864916, "grad_norm": 0.5217177867889404, "learning_rate": 0.0002, "loss": 0.4104, "step": 2500 }, { "epoch": 0.28684442673799837, "grad_norm": 0.510036289691925, "learning_rate": 0.0002, "loss": 0.389, "step": 2520 }, { "epoch": 0.2891209698073475, "grad_norm": 0.6968228220939636, "learning_rate": 0.0002, "loss": 0.4152, "step": 2540 }, { "epoch": 0.29139751287669674, "grad_norm": 0.4529867470264435, "learning_rate": 0.0002, "loss": 0.3987, "step": 2560 }, { "epoch": 0.29367405594604595, "grad_norm": 0.5680263638496399, "learning_rate": 0.0002, "loss": 0.3828, "step": 2580 }, { "epoch": 0.2959505990153951, "grad_norm": 0.4892405867576599, "learning_rate": 0.0002, "loss": 0.4006, "step": 2600 }, { "epoch": 0.2982271420847443, "grad_norm": 0.47588276863098145, "learning_rate": 0.0002, "loss": 0.4197, "step": 2620 }, { "epoch": 0.30050368515409354, "grad_norm": 0.5624070167541504, "learning_rate": 0.0002, "loss": 0.3997, "step": 2640 }, { "epoch": 0.3027802282234427, "grad_norm": 0.5434039831161499, "learning_rate": 0.0002, "loss": 0.3977, "step": 2660 }, { "epoch": 0.3050567712927919, "grad_norm": 0.5572277903556824, "learning_rate": 0.0002, "loss": 0.3966, "step": 2680 }, { "epoch": 0.30733331436214106, "grad_norm": 0.5533374547958374, "learning_rate": 0.0002, "loss": 0.3803, "step": 2700 }, { "epoch": 0.3096098574314903, "grad_norm": 0.40596967935562134, "learning_rate": 0.0002, "loss": 0.3682, "step": 2720 }, { "epoch": 0.3118864005008395, "grad_norm": 0.4737823009490967, "learning_rate": 0.0002, "loss": 0.3761, "step": 2740 }, { "epoch": 0.31416294357018865, "grad_norm": 0.4295174777507782, "learning_rate": 0.0002, "loss": 0.4035, "step": 2760 }, { "epoch": 0.31643948663953786, "grad_norm": 0.5348454713821411, "learning_rate": 0.0002, "loss": 0.404, "step": 2780 }, { "epoch": 0.31871602970888707, "grad_norm": 0.4819965362548828, "learning_rate": 0.0002, "loss": 0.3929, "step": 2800 }, { "epoch": 0.32099257277823623, "grad_norm": 0.5920088291168213, "learning_rate": 0.0002, "loss": 0.3798, "step": 2820 }, { "epoch": 0.32326911584758544, "grad_norm": 0.4936531186103821, "learning_rate": 0.0002, "loss": 0.3995, "step": 2840 }, { "epoch": 0.32554565891693465, "grad_norm": 0.5252315998077393, "learning_rate": 0.0002, "loss": 0.3842, "step": 2860 }, { "epoch": 0.3278222019862838, "grad_norm": 0.5818414688110352, "learning_rate": 0.0002, "loss": 0.3533, "step": 2880 }, { "epoch": 0.330098745055633, "grad_norm": 0.44053876399993896, "learning_rate": 0.0002, "loss": 0.3402, "step": 2900 }, { "epoch": 0.33237528812498224, "grad_norm": 0.5421345233917236, "learning_rate": 0.0002, "loss": 0.3542, "step": 2920 }, { "epoch": 0.3346518311943314, "grad_norm": 0.4642751216888428, "learning_rate": 0.0002, "loss": 0.3755, "step": 2940 }, { "epoch": 0.3369283742636806, "grad_norm": 0.5137833952903748, "learning_rate": 0.0002, "loss": 0.3602, "step": 2960 }, { "epoch": 0.3392049173330298, "grad_norm": 0.5032792687416077, "learning_rate": 0.0002, "loss": 0.3451, "step": 2980 }, { "epoch": 0.341481460402379, "grad_norm": 0.4932720363140106, "learning_rate": 0.0002, "loss": 0.384, "step": 3000 }, { "epoch": 0.3437580034717282, "grad_norm": 0.49986231327056885, "learning_rate": 0.0002, "loss": 0.3826, "step": 3020 }, { "epoch": 0.34603454654107735, "grad_norm": 0.6325618624687195, "learning_rate": 0.0002, "loss": 0.3582, "step": 3040 }, { "epoch": 0.34831108961042656, "grad_norm": 0.5402369499206543, "learning_rate": 0.0002, "loss": 0.3706, "step": 3060 }, { "epoch": 0.3505876326797758, "grad_norm": 0.4967012107372284, "learning_rate": 0.0002, "loss": 0.3456, "step": 3080 }, { "epoch": 0.35286417574912493, "grad_norm": 0.4491735100746155, "learning_rate": 0.0002, "loss": 0.347, "step": 3100 }, { "epoch": 0.35514071881847414, "grad_norm": 0.9062516093254089, "learning_rate": 0.0002, "loss": 0.3617, "step": 3120 }, { "epoch": 0.35741726188782336, "grad_norm": 0.5253359079360962, "learning_rate": 0.0002, "loss": 0.3512, "step": 3140 }, { "epoch": 0.3596938049571725, "grad_norm": 0.4836867153644562, "learning_rate": 0.0002, "loss": 0.3585, "step": 3160 }, { "epoch": 0.3619703480265217, "grad_norm": 0.49537473917007446, "learning_rate": 0.0002, "loss": 0.364, "step": 3180 }, { "epoch": 0.36424689109587094, "grad_norm": 0.6098095178604126, "learning_rate": 0.0002, "loss": 0.3455, "step": 3200 }, { "epoch": 0.3665234341652201, "grad_norm": 0.5926884412765503, "learning_rate": 0.0002, "loss": 0.3406, "step": 3220 }, { "epoch": 0.3687999772345693, "grad_norm": 0.5868669152259827, "learning_rate": 0.0002, "loss": 0.3643, "step": 3240 }, { "epoch": 0.3710765203039185, "grad_norm": 0.42670106887817383, "learning_rate": 0.0002, "loss": 0.344, "step": 3260 }, { "epoch": 0.3733530633732677, "grad_norm": 0.5992838740348816, "learning_rate": 0.0002, "loss": 0.3588, "step": 3280 }, { "epoch": 0.3756296064426169, "grad_norm": 0.4388341009616852, "learning_rate": 0.0002, "loss": 0.3375, "step": 3300 }, { "epoch": 0.3779061495119661, "grad_norm": 0.596488893032074, "learning_rate": 0.0002, "loss": 0.3425, "step": 3320 }, { "epoch": 0.38018269258131526, "grad_norm": 0.4572538137435913, "learning_rate": 0.0002, "loss": 0.3711, "step": 3340 }, { "epoch": 0.3824592356506645, "grad_norm": 0.5661656856536865, "learning_rate": 0.0002, "loss": 0.3415, "step": 3360 }, { "epoch": 0.38473577872001363, "grad_norm": 0.45082923769950867, "learning_rate": 0.0002, "loss": 0.3495, "step": 3380 }, { "epoch": 0.38701232178936285, "grad_norm": 0.4995211660861969, "learning_rate": 0.0002, "loss": 0.3311, "step": 3400 }, { "epoch": 0.38928886485871206, "grad_norm": 0.5004004240036011, "learning_rate": 0.0002, "loss": 0.3506, "step": 3420 }, { "epoch": 0.3915654079280612, "grad_norm": 0.5676460266113281, "learning_rate": 0.0002, "loss": 0.3383, "step": 3440 }, { "epoch": 0.39384195099741043, "grad_norm": 0.4805515706539154, "learning_rate": 0.0002, "loss": 0.3382, "step": 3460 }, { "epoch": 0.39611849406675964, "grad_norm": 0.47675764560699463, "learning_rate": 0.0002, "loss": 0.3021, "step": 3480 }, { "epoch": 0.3983950371361088, "grad_norm": 0.6285260915756226, "learning_rate": 0.0002, "loss": 0.3467, "step": 3500 }, { "epoch": 0.400671580205458, "grad_norm": 0.5657575130462646, "learning_rate": 0.0002, "loss": 0.3382, "step": 3520 }, { "epoch": 0.4029481232748072, "grad_norm": 0.6148316860198975, "learning_rate": 0.0002, "loss": 0.3396, "step": 3540 }, { "epoch": 0.4052246663441564, "grad_norm": 0.5819992423057556, "learning_rate": 0.0002, "loss": 0.3373, "step": 3560 }, { "epoch": 0.4075012094135056, "grad_norm": 0.6080338954925537, "learning_rate": 0.0002, "loss": 0.3463, "step": 3580 }, { "epoch": 0.4097777524828548, "grad_norm": 0.6103864312171936, "learning_rate": 0.0002, "loss": 0.3441, "step": 3600 }, { "epoch": 0.41205429555220396, "grad_norm": 0.5234800577163696, "learning_rate": 0.0002, "loss": 0.3272, "step": 3620 }, { "epoch": 0.4143308386215532, "grad_norm": 0.5393822193145752, "learning_rate": 0.0002, "loss": 0.3308, "step": 3640 }, { "epoch": 0.4166073816909024, "grad_norm": 0.4853431284427643, "learning_rate": 0.0002, "loss": 0.3152, "step": 3660 }, { "epoch": 0.41888392476025155, "grad_norm": 0.5507264733314514, "learning_rate": 0.0002, "loss": 0.3229, "step": 3680 }, { "epoch": 0.42116046782960076, "grad_norm": 0.44306129217147827, "learning_rate": 0.0002, "loss": 0.3389, "step": 3700 }, { "epoch": 0.4234370108989499, "grad_norm": 0.4574294984340668, "learning_rate": 0.0002, "loss": 0.3516, "step": 3720 }, { "epoch": 0.42571355396829913, "grad_norm": 0.5367994904518127, "learning_rate": 0.0002, "loss": 0.3576, "step": 3740 }, { "epoch": 0.42799009703764834, "grad_norm": 0.5044491291046143, "learning_rate": 0.0002, "loss": 0.3449, "step": 3760 }, { "epoch": 0.4302666401069975, "grad_norm": 0.41715556383132935, "learning_rate": 0.0002, "loss": 0.3128, "step": 3780 }, { "epoch": 0.4325431831763467, "grad_norm": 0.4355817437171936, "learning_rate": 0.0002, "loss": 0.3131, "step": 3800 }, { "epoch": 0.4348197262456959, "grad_norm": 0.5237382650375366, "learning_rate": 0.0002, "loss": 0.3281, "step": 3820 }, { "epoch": 0.4370962693150451, "grad_norm": 0.6210081577301025, "learning_rate": 0.0002, "loss": 0.3195, "step": 3840 }, { "epoch": 0.4393728123843943, "grad_norm": 0.5145352482795715, "learning_rate": 0.0002, "loss": 0.3107, "step": 3860 }, { "epoch": 0.4416493554537435, "grad_norm": 0.5554608106613159, "learning_rate": 0.0002, "loss": 0.3418, "step": 3880 }, { "epoch": 0.44392589852309267, "grad_norm": 0.4971628487110138, "learning_rate": 0.0002, "loss": 0.3293, "step": 3900 }, { "epoch": 0.4462024415924419, "grad_norm": 0.49732130765914917, "learning_rate": 0.0002, "loss": 0.3138, "step": 3920 }, { "epoch": 0.4484789846617911, "grad_norm": 0.5883257985115051, "learning_rate": 0.0002, "loss": 0.3357, "step": 3940 }, { "epoch": 0.45075552773114025, "grad_norm": 0.5349528193473816, "learning_rate": 0.0002, "loss": 0.3381, "step": 3960 }, { "epoch": 0.45303207080048946, "grad_norm": 0.5360047221183777, "learning_rate": 0.0002, "loss": 0.3116, "step": 3980 }, { "epoch": 0.4553086138698387, "grad_norm": 0.4889732003211975, "learning_rate": 0.0002, "loss": 0.3154, "step": 4000 }, { "epoch": 0.45758515693918783, "grad_norm": 0.4912421703338623, "learning_rate": 0.0002, "loss": 0.3054, "step": 4020 }, { "epoch": 0.45986170000853704, "grad_norm": 0.4449983835220337, "learning_rate": 0.0002, "loss": 0.3079, "step": 4040 }, { "epoch": 0.46213824307788626, "grad_norm": 0.4488675892353058, "learning_rate": 0.0002, "loss": 0.3027, "step": 4060 }, { "epoch": 0.4644147861472354, "grad_norm": 0.5412561893463135, "learning_rate": 0.0002, "loss": 0.2932, "step": 4080 }, { "epoch": 0.4666913292165846, "grad_norm": 0.41218650341033936, "learning_rate": 0.0002, "loss": 0.3087, "step": 4100 }, { "epoch": 0.4689678722859338, "grad_norm": 0.5233949422836304, "learning_rate": 0.0002, "loss": 0.3157, "step": 4120 }, { "epoch": 0.471244415355283, "grad_norm": 0.5676075220108032, "learning_rate": 0.0002, "loss": 0.3267, "step": 4140 }, { "epoch": 0.4735209584246322, "grad_norm": 0.5336834788322449, "learning_rate": 0.0002, "loss": 0.3185, "step": 4160 }, { "epoch": 0.47579750149398137, "grad_norm": 0.5505925416946411, "learning_rate": 0.0002, "loss": 0.3116, "step": 4180 }, { "epoch": 0.4780740445633306, "grad_norm": 0.5440223813056946, "learning_rate": 0.0002, "loss": 0.3234, "step": 4200 }, { "epoch": 0.4803505876326798, "grad_norm": 0.46334293484687805, "learning_rate": 0.0002, "loss": 0.3209, "step": 4220 }, { "epoch": 0.48262713070202895, "grad_norm": 0.452364444732666, "learning_rate": 0.0002, "loss": 0.3056, "step": 4240 }, { "epoch": 0.48490367377137816, "grad_norm": 0.5037956833839417, "learning_rate": 0.0002, "loss": 0.3141, "step": 4260 }, { "epoch": 0.4871802168407274, "grad_norm": 0.4308939278125763, "learning_rate": 0.0002, "loss": 0.2948, "step": 4280 }, { "epoch": 0.48945675991007653, "grad_norm": 0.45019960403442383, "learning_rate": 0.0002, "loss": 0.3142, "step": 4300 }, { "epoch": 0.49173330297942575, "grad_norm": 0.4351404011249542, "learning_rate": 0.0002, "loss": 0.31, "step": 4320 }, { "epoch": 0.49400984604877496, "grad_norm": 0.38306841254234314, "learning_rate": 0.0002, "loss": 0.2889, "step": 4340 }, { "epoch": 0.4962863891181241, "grad_norm": 0.545360803604126, "learning_rate": 0.0002, "loss": 0.311, "step": 4360 }, { "epoch": 0.49856293218747333, "grad_norm": 0.44942232966423035, "learning_rate": 0.0002, "loss": 0.2899, "step": 4380 }, { "epoch": 0.5008394752568225, "grad_norm": 0.46564239263534546, "learning_rate": 0.0002, "loss": 0.3013, "step": 4400 }, { "epoch": 0.5031160183261717, "grad_norm": 0.5398554801940918, "learning_rate": 0.0002, "loss": 0.3104, "step": 4420 }, { "epoch": 0.5053925613955209, "grad_norm": 0.47367504239082336, "learning_rate": 0.0002, "loss": 0.2945, "step": 4440 }, { "epoch": 0.5076691044648701, "grad_norm": 0.45659711956977844, "learning_rate": 0.0002, "loss": 0.304, "step": 4460 }, { "epoch": 0.5099456475342193, "grad_norm": 0.4942033290863037, "learning_rate": 0.0002, "loss": 0.2969, "step": 4480 }, { "epoch": 0.5122221906035684, "grad_norm": 0.46578243374824524, "learning_rate": 0.0002, "loss": 0.2935, "step": 4500 }, { "epoch": 0.5144987336729177, "grad_norm": 0.6523891687393188, "learning_rate": 0.0002, "loss": 0.2823, "step": 4520 }, { "epoch": 0.5167752767422669, "grad_norm": 0.4787238538265228, "learning_rate": 0.0002, "loss": 0.3148, "step": 4540 }, { "epoch": 0.5190518198116161, "grad_norm": 0.46825891733169556, "learning_rate": 0.0002, "loss": 0.3089, "step": 4560 }, { "epoch": 0.5213283628809653, "grad_norm": 0.46605536341667175, "learning_rate": 0.0002, "loss": 0.3012, "step": 4580 }, { "epoch": 0.5236049059503144, "grad_norm": 0.5826888680458069, "learning_rate": 0.0002, "loss": 0.3043, "step": 4600 }, { "epoch": 0.5258814490196636, "grad_norm": 0.48641151189804077, "learning_rate": 0.0002, "loss": 0.2952, "step": 4620 }, { "epoch": 0.5281579920890128, "grad_norm": 0.5396175384521484, "learning_rate": 0.0002, "loss": 0.2926, "step": 4640 }, { "epoch": 0.530434535158362, "grad_norm": 0.5584241151809692, "learning_rate": 0.0002, "loss": 0.3048, "step": 4660 }, { "epoch": 0.5327110782277112, "grad_norm": 0.5832685232162476, "learning_rate": 0.0002, "loss": 0.2948, "step": 4680 }, { "epoch": 0.5349876212970605, "grad_norm": 0.4676337242126465, "learning_rate": 0.0002, "loss": 0.3043, "step": 4700 }, { "epoch": 0.5372641643664096, "grad_norm": 0.4440428614616394, "learning_rate": 0.0002, "loss": 0.288, "step": 4720 }, { "epoch": 0.5395407074357588, "grad_norm": 0.49934279918670654, "learning_rate": 0.0002, "loss": 0.2882, "step": 4740 }, { "epoch": 0.541817250505108, "grad_norm": 0.5172054171562195, "learning_rate": 0.0002, "loss": 0.3225, "step": 4760 }, { "epoch": 0.5440937935744572, "grad_norm": 0.4527619183063507, "learning_rate": 0.0002, "loss": 0.2869, "step": 4780 }, { "epoch": 0.5463703366438064, "grad_norm": 0.548918604850769, "learning_rate": 0.0002, "loss": 0.3105, "step": 4800 }, { "epoch": 0.5486468797131556, "grad_norm": 0.48801419138908386, "learning_rate": 0.0002, "loss": 0.2835, "step": 4820 }, { "epoch": 0.5509234227825047, "grad_norm": 0.49810609221458435, "learning_rate": 0.0002, "loss": 0.3227, "step": 4840 }, { "epoch": 0.5531999658518539, "grad_norm": 0.49763086438179016, "learning_rate": 0.0002, "loss": 0.2786, "step": 4860 }, { "epoch": 0.5554765089212031, "grad_norm": 0.48815059661865234, "learning_rate": 0.0002, "loss": 0.2802, "step": 4880 }, { "epoch": 0.5577530519905524, "grad_norm": 0.3571115732192993, "learning_rate": 0.0002, "loss": 0.2796, "step": 4900 }, { "epoch": 0.5600295950599016, "grad_norm": 0.6448425650596619, "learning_rate": 0.0002, "loss": 0.2844, "step": 4920 }, { "epoch": 0.5623061381292508, "grad_norm": 0.49660468101501465, "learning_rate": 0.0002, "loss": 0.2892, "step": 4940 }, { "epoch": 0.5645826811985999, "grad_norm": 0.47702720761299133, "learning_rate": 0.0002, "loss": 0.3111, "step": 4960 }, { "epoch": 0.5668592242679491, "grad_norm": 0.5281921029090881, "learning_rate": 0.0002, "loss": 0.2908, "step": 4980 }, { "epoch": 0.5691357673372983, "grad_norm": 0.6427987813949585, "learning_rate": 0.0002, "loss": 0.2848, "step": 5000 }, { "epoch": 0.5714123104066475, "grad_norm": 0.5437233448028564, "learning_rate": 0.0002, "loss": 0.3023, "step": 5020 }, { "epoch": 0.5736888534759967, "grad_norm": 0.517444372177124, "learning_rate": 0.0002, "loss": 0.2876, "step": 5040 }, { "epoch": 0.5759653965453458, "grad_norm": 0.5197298526763916, "learning_rate": 0.0002, "loss": 0.304, "step": 5060 }, { "epoch": 0.578241939614695, "grad_norm": 0.3452152907848358, "learning_rate": 0.0002, "loss": 0.2794, "step": 5080 }, { "epoch": 0.5805184826840443, "grad_norm": 0.5630306601524353, "learning_rate": 0.0002, "loss": 0.2979, "step": 5100 }, { "epoch": 0.5827950257533935, "grad_norm": 0.5696737170219421, "learning_rate": 0.0002, "loss": 0.3035, "step": 5120 }, { "epoch": 0.5850715688227427, "grad_norm": 0.5024551153182983, "learning_rate": 0.0002, "loss": 0.2717, "step": 5140 }, { "epoch": 0.5873481118920919, "grad_norm": 0.4166383147239685, "learning_rate": 0.0002, "loss": 0.3065, "step": 5160 }, { "epoch": 0.589624654961441, "grad_norm": 0.36780408024787903, "learning_rate": 0.0002, "loss": 0.2864, "step": 5180 }, { "epoch": 0.5919011980307902, "grad_norm": 0.436526894569397, "learning_rate": 0.0002, "loss": 0.2764, "step": 5200 }, { "epoch": 0.5941777411001394, "grad_norm": 0.43115249276161194, "learning_rate": 0.0002, "loss": 0.2791, "step": 5220 }, { "epoch": 0.5964542841694886, "grad_norm": 0.359739750623703, "learning_rate": 0.0002, "loss": 0.3108, "step": 5240 }, { "epoch": 0.5987308272388379, "grad_norm": 0.4555259644985199, "learning_rate": 0.0002, "loss": 0.2623, "step": 5260 }, { "epoch": 0.6010073703081871, "grad_norm": 0.4587076008319855, "learning_rate": 0.0002, "loss": 0.293, "step": 5280 }, { "epoch": 0.6032839133775362, "grad_norm": 0.5236973166465759, "learning_rate": 0.0002, "loss": 0.2888, "step": 5300 }, { "epoch": 0.6055604564468854, "grad_norm": 0.46685513854026794, "learning_rate": 0.0002, "loss": 0.2731, "step": 5320 }, { "epoch": 0.6078369995162346, "grad_norm": 0.5701884627342224, "learning_rate": 0.0002, "loss": 0.28, "step": 5340 }, { "epoch": 0.6101135425855838, "grad_norm": 0.5002717971801758, "learning_rate": 0.0002, "loss": 0.2777, "step": 5360 }, { "epoch": 0.612390085654933, "grad_norm": 0.5896885395050049, "learning_rate": 0.0002, "loss": 0.3048, "step": 5380 }, { "epoch": 0.6146666287242821, "grad_norm": 0.49014943838119507, "learning_rate": 0.0002, "loss": 0.2642, "step": 5400 }, { "epoch": 0.6169431717936313, "grad_norm": 0.5924846529960632, "learning_rate": 0.0002, "loss": 0.2943, "step": 5420 }, { "epoch": 0.6192197148629806, "grad_norm": 0.49827829003334045, "learning_rate": 0.0002, "loss": 0.2879, "step": 5440 }, { "epoch": 0.6214962579323298, "grad_norm": 0.45312178134918213, "learning_rate": 0.0002, "loss": 0.2728, "step": 5460 }, { "epoch": 0.623772801001679, "grad_norm": 0.3595191538333893, "learning_rate": 0.0002, "loss": 0.2713, "step": 5480 }, { "epoch": 0.6260493440710282, "grad_norm": 0.6547619104385376, "learning_rate": 0.0002, "loss": 0.2855, "step": 5500 }, { "epoch": 0.6283258871403773, "grad_norm": 0.4659534692764282, "learning_rate": 0.0002, "loss": 0.2908, "step": 5520 }, { "epoch": 0.6306024302097265, "grad_norm": 0.4027460813522339, "learning_rate": 0.0002, "loss": 0.2651, "step": 5540 }, { "epoch": 0.6328789732790757, "grad_norm": 0.36129653453826904, "learning_rate": 0.0002, "loss": 0.2915, "step": 5560 }, { "epoch": 0.6351555163484249, "grad_norm": 0.5963912010192871, "learning_rate": 0.0002, "loss": 0.2968, "step": 5580 }, { "epoch": 0.6374320594177741, "grad_norm": 0.49669450521469116, "learning_rate": 0.0002, "loss": 0.2965, "step": 5600 }, { "epoch": 0.6397086024871234, "grad_norm": 0.5784302353858948, "learning_rate": 0.0002, "loss": 0.2626, "step": 5620 }, { "epoch": 0.6419851455564725, "grad_norm": 0.5651645660400391, "learning_rate": 0.0002, "loss": 0.2738, "step": 5640 }, { "epoch": 0.6442616886258217, "grad_norm": 0.45475292205810547, "learning_rate": 0.0002, "loss": 0.2653, "step": 5660 }, { "epoch": 0.6465382316951709, "grad_norm": 0.4691898822784424, "learning_rate": 0.0002, "loss": 0.2634, "step": 5680 }, { "epoch": 0.6488147747645201, "grad_norm": 0.4604431092739105, "learning_rate": 0.0002, "loss": 0.2838, "step": 5700 }, { "epoch": 0.6510913178338693, "grad_norm": 0.506804883480072, "learning_rate": 0.0002, "loss": 0.2657, "step": 5720 }, { "epoch": 0.6533678609032184, "grad_norm": 0.5051881670951843, "learning_rate": 0.0002, "loss": 0.2976, "step": 5740 }, { "epoch": 0.6556444039725676, "grad_norm": 0.4780672788619995, "learning_rate": 0.0002, "loss": 0.2828, "step": 5760 }, { "epoch": 0.6579209470419168, "grad_norm": 0.4695095121860504, "learning_rate": 0.0002, "loss": 0.2685, "step": 5780 }, { "epoch": 0.660197490111266, "grad_norm": 0.4259052276611328, "learning_rate": 0.0002, "loss": 0.2635, "step": 5800 }, { "epoch": 0.6624740331806153, "grad_norm": 0.5684182643890381, "learning_rate": 0.0002, "loss": 0.2879, "step": 5820 }, { "epoch": 0.6647505762499645, "grad_norm": 0.42193594574928284, "learning_rate": 0.0002, "loss": 0.2678, "step": 5840 }, { "epoch": 0.6670271193193136, "grad_norm": 0.5095034241676331, "learning_rate": 0.0002, "loss": 0.2677, "step": 5860 }, { "epoch": 0.6693036623886628, "grad_norm": 0.46626052260398865, "learning_rate": 0.0002, "loss": 0.2906, "step": 5880 }, { "epoch": 0.671580205458012, "grad_norm": 0.5086765289306641, "learning_rate": 0.0002, "loss": 0.2775, "step": 5900 }, { "epoch": 0.6738567485273612, "grad_norm": 0.44444966316223145, "learning_rate": 0.0002, "loss": 0.2764, "step": 5920 }, { "epoch": 0.6761332915967104, "grad_norm": 0.4477381706237793, "learning_rate": 0.0002, "loss": 0.2729, "step": 5940 }, { "epoch": 0.6784098346660596, "grad_norm": 0.46984028816223145, "learning_rate": 0.0002, "loss": 0.273, "step": 5960 }, { "epoch": 0.6806863777354087, "grad_norm": 0.417084276676178, "learning_rate": 0.0002, "loss": 0.2744, "step": 5980 }, { "epoch": 0.682962920804758, "grad_norm": 0.4144213795661926, "learning_rate": 0.0002, "loss": 0.2704, "step": 6000 }, { "epoch": 0.6852394638741072, "grad_norm": 0.5844799876213074, "learning_rate": 0.0002, "loss": 0.2635, "step": 6020 }, { "epoch": 0.6875160069434564, "grad_norm": 0.39512693881988525, "learning_rate": 0.0002, "loss": 0.2471, "step": 6040 }, { "epoch": 0.6897925500128056, "grad_norm": 0.5299990773200989, "learning_rate": 0.0002, "loss": 0.2648, "step": 6060 }, { "epoch": 0.6920690930821547, "grad_norm": 0.4980265498161316, "learning_rate": 0.0002, "loss": 0.2725, "step": 6080 }, { "epoch": 0.6943456361515039, "grad_norm": 0.4003869891166687, "learning_rate": 0.0002, "loss": 0.2768, "step": 6100 }, { "epoch": 0.6966221792208531, "grad_norm": 0.5103460550308228, "learning_rate": 0.0002, "loss": 0.2638, "step": 6120 }, { "epoch": 0.6988987222902023, "grad_norm": 0.737101137638092, "learning_rate": 0.0002, "loss": 0.2779, "step": 6140 }, { "epoch": 0.7011752653595515, "grad_norm": 0.4731826186180115, "learning_rate": 0.0002, "loss": 0.2691, "step": 6160 }, { "epoch": 0.7034518084289008, "grad_norm": 0.5234053730964661, "learning_rate": 0.0002, "loss": 0.2739, "step": 6180 }, { "epoch": 0.7057283514982499, "grad_norm": 0.5235525369644165, "learning_rate": 0.0002, "loss": 0.2754, "step": 6200 }, { "epoch": 0.7080048945675991, "grad_norm": 0.4453619122505188, "learning_rate": 0.0002, "loss": 0.2833, "step": 6220 }, { "epoch": 0.7102814376369483, "grad_norm": 0.4025666117668152, "learning_rate": 0.0002, "loss": 0.2713, "step": 6240 }, { "epoch": 0.7125579807062975, "grad_norm": 0.35240331292152405, "learning_rate": 0.0002, "loss": 0.2786, "step": 6260 }, { "epoch": 0.7148345237756467, "grad_norm": 0.4521905779838562, "learning_rate": 0.0002, "loss": 0.2639, "step": 6280 }, { "epoch": 0.7171110668449959, "grad_norm": 0.5230519771575928, "learning_rate": 0.0002, "loss": 0.2517, "step": 6300 }, { "epoch": 0.719387609914345, "grad_norm": 0.5415637493133545, "learning_rate": 0.0002, "loss": 0.2739, "step": 6320 }, { "epoch": 0.7216641529836942, "grad_norm": 0.4067966341972351, "learning_rate": 0.0002, "loss": 0.2751, "step": 6340 }, { "epoch": 0.7239406960530435, "grad_norm": 0.4670214354991913, "learning_rate": 0.0002, "loss": 0.2644, "step": 6360 }, { "epoch": 0.7262172391223927, "grad_norm": 0.5316203236579895, "learning_rate": 0.0002, "loss": 0.2746, "step": 6380 }, { "epoch": 0.7284937821917419, "grad_norm": 0.46312493085861206, "learning_rate": 0.0002, "loss": 0.2539, "step": 6400 }, { "epoch": 0.730770325261091, "grad_norm": 0.465279221534729, "learning_rate": 0.0002, "loss": 0.2742, "step": 6420 }, { "epoch": 0.7330468683304402, "grad_norm": 0.5096962451934814, "learning_rate": 0.0002, "loss": 0.2546, "step": 6440 }, { "epoch": 0.7353234113997894, "grad_norm": 0.4525590240955353, "learning_rate": 0.0002, "loss": 0.2694, "step": 6460 }, { "epoch": 0.7375999544691386, "grad_norm": 0.5033881664276123, "learning_rate": 0.0002, "loss": 0.2627, "step": 6480 }, { "epoch": 0.7398764975384878, "grad_norm": 0.44053900241851807, "learning_rate": 0.0002, "loss": 0.258, "step": 6500 }, { "epoch": 0.742153040607837, "grad_norm": 0.4677462875843048, "learning_rate": 0.0002, "loss": 0.2659, "step": 6520 }, { "epoch": 0.7444295836771861, "grad_norm": 0.5687553882598877, "learning_rate": 0.0002, "loss": 0.271, "step": 6540 }, { "epoch": 0.7467061267465354, "grad_norm": 0.4980468451976776, "learning_rate": 0.0002, "loss": 0.265, "step": 6560 }, { "epoch": 0.7489826698158846, "grad_norm": 0.5155619382858276, "learning_rate": 0.0002, "loss": 0.2491, "step": 6580 }, { "epoch": 0.7512592128852338, "grad_norm": 0.5364673733711243, "learning_rate": 0.0002, "loss": 0.2564, "step": 6600 }, { "epoch": 0.753535755954583, "grad_norm": 0.421838641166687, "learning_rate": 0.0002, "loss": 0.267, "step": 6620 }, { "epoch": 0.7558122990239322, "grad_norm": 0.46299833059310913, "learning_rate": 0.0002, "loss": 0.2461, "step": 6640 }, { "epoch": 0.7580888420932813, "grad_norm": 0.3832832872867584, "learning_rate": 0.0002, "loss": 0.265, "step": 6660 }, { "epoch": 0.7603653851626305, "grad_norm": 0.5560947060585022, "learning_rate": 0.0002, "loss": 0.253, "step": 6680 }, { "epoch": 0.7626419282319797, "grad_norm": 0.4832628667354584, "learning_rate": 0.0002, "loss": 0.2515, "step": 6700 }, { "epoch": 0.764918471301329, "grad_norm": 0.44354599714279175, "learning_rate": 0.0002, "loss": 0.2687, "step": 6720 }, { "epoch": 0.7671950143706782, "grad_norm": 0.3746070861816406, "learning_rate": 0.0002, "loss": 0.2481, "step": 6740 }, { "epoch": 0.7694715574400273, "grad_norm": 0.3048388659954071, "learning_rate": 0.0002, "loss": 0.269, "step": 6760 }, { "epoch": 0.7717481005093765, "grad_norm": 0.46471843123435974, "learning_rate": 0.0002, "loss": 0.2642, "step": 6780 }, { "epoch": 0.7740246435787257, "grad_norm": 0.44309428334236145, "learning_rate": 0.0002, "loss": 0.2565, "step": 6800 }, { "epoch": 0.7763011866480749, "grad_norm": 0.4174291789531708, "learning_rate": 0.0002, "loss": 0.262, "step": 6820 }, { "epoch": 0.7785777297174241, "grad_norm": 0.42592549324035645, "learning_rate": 0.0002, "loss": 0.2608, "step": 6840 }, { "epoch": 0.7808542727867733, "grad_norm": 0.4378054141998291, "learning_rate": 0.0002, "loss": 0.2765, "step": 6860 }, { "epoch": 0.7831308158561224, "grad_norm": 0.4560708701610565, "learning_rate": 0.0002, "loss": 0.2381, "step": 6880 }, { "epoch": 0.7854073589254716, "grad_norm": 0.4595545828342438, "learning_rate": 0.0002, "loss": 0.2561, "step": 6900 }, { "epoch": 0.7876839019948209, "grad_norm": 0.45213592052459717, "learning_rate": 0.0002, "loss": 0.2645, "step": 6920 }, { "epoch": 0.7899604450641701, "grad_norm": 0.4857342839241028, "learning_rate": 0.0002, "loss": 0.2687, "step": 6940 }, { "epoch": 0.7922369881335193, "grad_norm": 0.4939437508583069, "learning_rate": 0.0002, "loss": 0.2642, "step": 6960 }, { "epoch": 0.7945135312028685, "grad_norm": 0.46244382858276367, "learning_rate": 0.0002, "loss": 0.2536, "step": 6980 }, { "epoch": 0.7967900742722176, "grad_norm": 0.5876993536949158, "learning_rate": 0.0002, "loss": 0.2492, "step": 7000 }, { "epoch": 0.7990666173415668, "grad_norm": 0.5170072913169861, "learning_rate": 0.0002, "loss": 0.2548, "step": 7020 }, { "epoch": 0.801343160410916, "grad_norm": 0.394380658864975, "learning_rate": 0.0002, "loss": 0.2524, "step": 7040 }, { "epoch": 0.8036197034802652, "grad_norm": 0.4716455340385437, "learning_rate": 0.0002, "loss": 0.2573, "step": 7060 }, { "epoch": 0.8058962465496144, "grad_norm": 0.34525179862976074, "learning_rate": 0.0002, "loss": 0.246, "step": 7080 }, { "epoch": 0.8081727896189635, "grad_norm": 0.5030418038368225, "learning_rate": 0.0002, "loss": 0.2596, "step": 7100 }, { "epoch": 0.8104493326883128, "grad_norm": 0.5586132407188416, "learning_rate": 0.0002, "loss": 0.2568, "step": 7120 }, { "epoch": 0.812725875757662, "grad_norm": 0.47025129199028015, "learning_rate": 0.0002, "loss": 0.265, "step": 7140 }, { "epoch": 0.8150024188270112, "grad_norm": 0.5654832720756531, "learning_rate": 0.0002, "loss": 0.2468, "step": 7160 }, { "epoch": 0.8172789618963604, "grad_norm": 0.4701017141342163, "learning_rate": 0.0002, "loss": 0.2538, "step": 7180 }, { "epoch": 0.8195555049657096, "grad_norm": 0.47270438075065613, "learning_rate": 0.0002, "loss": 0.2529, "step": 7200 }, { "epoch": 0.8218320480350587, "grad_norm": 0.39433714747428894, "learning_rate": 0.0002, "loss": 0.2445, "step": 7220 }, { "epoch": 0.8241085911044079, "grad_norm": 0.4521467685699463, "learning_rate": 0.0002, "loss": 0.2556, "step": 7240 }, { "epoch": 0.8263851341737571, "grad_norm": 0.28483667969703674, "learning_rate": 0.0002, "loss": 0.2451, "step": 7260 }, { "epoch": 0.8286616772431064, "grad_norm": 0.4298310875892639, "learning_rate": 0.0002, "loss": 0.2599, "step": 7280 }, { "epoch": 0.8309382203124556, "grad_norm": 0.39677906036376953, "learning_rate": 0.0002, "loss": 0.2539, "step": 7300 }, { "epoch": 0.8332147633818048, "grad_norm": 0.5800175666809082, "learning_rate": 0.0002, "loss": 0.2463, "step": 7320 }, { "epoch": 0.8354913064511539, "grad_norm": 0.42742472887039185, "learning_rate": 0.0002, "loss": 0.2593, "step": 7340 }, { "epoch": 0.8377678495205031, "grad_norm": 0.5521807670593262, "learning_rate": 0.0002, "loss": 0.253, "step": 7360 }, { "epoch": 0.8400443925898523, "grad_norm": 0.5068047046661377, "learning_rate": 0.0002, "loss": 0.2503, "step": 7380 }, { "epoch": 0.8423209356592015, "grad_norm": 0.4325120151042938, "learning_rate": 0.0002, "loss": 0.2466, "step": 7400 }, { "epoch": 0.8445974787285507, "grad_norm": 0.5130394101142883, "learning_rate": 0.0002, "loss": 0.2521, "step": 7420 }, { "epoch": 0.8468740217978998, "grad_norm": 0.5091120600700378, "learning_rate": 0.0002, "loss": 0.2429, "step": 7440 }, { "epoch": 0.849150564867249, "grad_norm": 0.4635036289691925, "learning_rate": 0.0002, "loss": 0.235, "step": 7460 }, { "epoch": 0.8514271079365983, "grad_norm": 0.3827108144760132, "learning_rate": 0.0002, "loss": 0.2487, "step": 7480 }, { "epoch": 0.8537036510059475, "grad_norm": 0.3880899250507355, "learning_rate": 0.0002, "loss": 0.2469, "step": 7500 }, { "epoch": 0.8559801940752967, "grad_norm": 0.408933162689209, "learning_rate": 0.0002, "loss": 0.2499, "step": 7520 }, { "epoch": 0.8582567371446459, "grad_norm": 0.5049706101417542, "learning_rate": 0.0002, "loss": 0.2418, "step": 7540 }, { "epoch": 0.860533280213995, "grad_norm": 0.43551701307296753, "learning_rate": 0.0002, "loss": 0.2478, "step": 7560 }, { "epoch": 0.8628098232833442, "grad_norm": 0.5024411678314209, "learning_rate": 0.0002, "loss": 0.2538, "step": 7580 }, { "epoch": 0.8650863663526934, "grad_norm": 0.36361223459243774, "learning_rate": 0.0002, "loss": 0.2536, "step": 7600 }, { "epoch": 0.8673629094220426, "grad_norm": 0.4526277482509613, "learning_rate": 0.0002, "loss": 0.242, "step": 7620 }, { "epoch": 0.8696394524913919, "grad_norm": 0.5677676200866699, "learning_rate": 0.0002, "loss": 0.2572, "step": 7640 }, { "epoch": 0.8719159955607411, "grad_norm": 0.4915711283683777, "learning_rate": 0.0002, "loss": 0.2562, "step": 7660 }, { "epoch": 0.8741925386300902, "grad_norm": 0.36850452423095703, "learning_rate": 0.0002, "loss": 0.2523, "step": 7680 }, { "epoch": 0.8764690816994394, "grad_norm": 0.38313761353492737, "learning_rate": 0.0002, "loss": 0.2596, "step": 7700 }, { "epoch": 0.8787456247687886, "grad_norm": 0.5384640097618103, "learning_rate": 0.0002, "loss": 0.2455, "step": 7720 }, { "epoch": 0.8810221678381378, "grad_norm": 0.5308900475502014, "learning_rate": 0.0002, "loss": 0.2439, "step": 7740 }, { "epoch": 0.883298710907487, "grad_norm": 0.5488154292106628, "learning_rate": 0.0002, "loss": 0.2428, "step": 7760 }, { "epoch": 0.8855752539768362, "grad_norm": 0.5271242260932922, "learning_rate": 0.0002, "loss": 0.2372, "step": 7780 }, { "epoch": 0.8878517970461853, "grad_norm": 0.46171802282333374, "learning_rate": 0.0002, "loss": 0.2506, "step": 7800 }, { "epoch": 0.8901283401155345, "grad_norm": 0.45436665415763855, "learning_rate": 0.0002, "loss": 0.2414, "step": 7820 }, { "epoch": 0.8924048831848838, "grad_norm": 0.4920847415924072, "learning_rate": 0.0002, "loss": 0.2669, "step": 7840 }, { "epoch": 0.894681426254233, "grad_norm": 0.5913518071174622, "learning_rate": 0.0002, "loss": 0.2552, "step": 7860 }, { "epoch": 0.8969579693235822, "grad_norm": 0.6011972427368164, "learning_rate": 0.0002, "loss": 0.2533, "step": 7880 }, { "epoch": 0.8992345123929313, "grad_norm": 0.4650927186012268, "learning_rate": 0.0002, "loss": 0.2448, "step": 7900 }, { "epoch": 0.9015110554622805, "grad_norm": 0.5828790664672852, "learning_rate": 0.0002, "loss": 0.2381, "step": 7920 }, { "epoch": 0.9037875985316297, "grad_norm": 0.5178338885307312, "learning_rate": 0.0002, "loss": 0.2619, "step": 7940 }, { "epoch": 0.9060641416009789, "grad_norm": 0.5147708058357239, "learning_rate": 0.0002, "loss": 0.258, "step": 7960 }, { "epoch": 0.9083406846703281, "grad_norm": 0.45790836215019226, "learning_rate": 0.0002, "loss": 0.2474, "step": 7980 }, { "epoch": 0.9106172277396773, "grad_norm": 0.3837074935436249, "learning_rate": 0.0002, "loss": 0.2356, "step": 8000 }, { "epoch": 0.9128937708090265, "grad_norm": 0.4466090500354767, "learning_rate": 0.0002, "loss": 0.237, "step": 8020 }, { "epoch": 0.9151703138783757, "grad_norm": 0.5893344283103943, "learning_rate": 0.0002, "loss": 0.2399, "step": 8040 }, { "epoch": 0.9174468569477249, "grad_norm": 0.49547362327575684, "learning_rate": 0.0002, "loss": 0.2526, "step": 8060 }, { "epoch": 0.9197234000170741, "grad_norm": 0.47068551182746887, "learning_rate": 0.0002, "loss": 0.2631, "step": 8080 }, { "epoch": 0.9219999430864233, "grad_norm": 0.3512951135635376, "learning_rate": 0.0002, "loss": 0.2395, "step": 8100 }, { "epoch": 0.9242764861557725, "grad_norm": 0.3996793031692505, "learning_rate": 0.0002, "loss": 0.2424, "step": 8120 }, { "epoch": 0.9265530292251216, "grad_norm": 0.5782022476196289, "learning_rate": 0.0002, "loss": 0.2549, "step": 8140 }, { "epoch": 0.9288295722944708, "grad_norm": 0.450860857963562, "learning_rate": 0.0002, "loss": 0.2465, "step": 8160 }, { "epoch": 0.93110611536382, "grad_norm": 0.4679816663265228, "learning_rate": 0.0002, "loss": 0.2326, "step": 8180 }, { "epoch": 0.9333826584331693, "grad_norm": 0.5497337579727173, "learning_rate": 0.0002, "loss": 0.2457, "step": 8200 }, { "epoch": 0.9356592015025185, "grad_norm": 0.3775748312473297, "learning_rate": 0.0002, "loss": 0.2331, "step": 8220 }, { "epoch": 0.9379357445718676, "grad_norm": 0.5428327918052673, "learning_rate": 0.0002, "loss": 0.2399, "step": 8240 }, { "epoch": 0.9402122876412168, "grad_norm": 0.4089830219745636, "learning_rate": 0.0002, "loss": 0.246, "step": 8260 }, { "epoch": 0.942488830710566, "grad_norm": 0.5781340003013611, "learning_rate": 0.0002, "loss": 0.2451, "step": 8280 }, { "epoch": 0.9447653737799152, "grad_norm": 0.5869989395141602, "learning_rate": 0.0002, "loss": 0.2541, "step": 8300 }, { "epoch": 0.9470419168492644, "grad_norm": 0.47708019614219666, "learning_rate": 0.0002, "loss": 0.2559, "step": 8320 }, { "epoch": 0.9493184599186136, "grad_norm": 0.5445525050163269, "learning_rate": 0.0002, "loss": 0.2466, "step": 8340 }, { "epoch": 0.9515950029879627, "grad_norm": 0.480214387178421, "learning_rate": 0.0002, "loss": 0.236, "step": 8360 }, { "epoch": 0.953871546057312, "grad_norm": 0.5392053127288818, "learning_rate": 0.0002, "loss": 0.2383, "step": 8380 }, { "epoch": 0.9561480891266612, "grad_norm": 0.4515858292579651, "learning_rate": 0.0002, "loss": 0.238, "step": 8400 }, { "epoch": 0.9584246321960104, "grad_norm": 0.5461826324462891, "learning_rate": 0.0002, "loss": 0.2442, "step": 8420 }, { "epoch": 0.9607011752653596, "grad_norm": 0.44309332966804504, "learning_rate": 0.0002, "loss": 0.2622, "step": 8440 }, { "epoch": 0.9629777183347088, "grad_norm": 0.5409505367279053, "learning_rate": 0.0002, "loss": 0.2303, "step": 8460 }, { "epoch": 0.9652542614040579, "grad_norm": 0.3868342638015747, "learning_rate": 0.0002, "loss": 0.2624, "step": 8480 }, { "epoch": 0.9675308044734071, "grad_norm": 0.38888975977897644, "learning_rate": 0.0002, "loss": 0.246, "step": 8500 }, { "epoch": 0.9698073475427563, "grad_norm": 0.38946032524108887, "learning_rate": 0.0002, "loss": 0.2503, "step": 8520 }, { "epoch": 0.9720838906121055, "grad_norm": 0.42425817251205444, "learning_rate": 0.0002, "loss": 0.2556, "step": 8540 }, { "epoch": 0.9743604336814548, "grad_norm": 0.41515296697616577, "learning_rate": 0.0002, "loss": 0.2437, "step": 8560 }, { "epoch": 0.9766369767508039, "grad_norm": 0.4085826575756073, "learning_rate": 0.0002, "loss": 0.2293, "step": 8580 }, { "epoch": 0.9789135198201531, "grad_norm": 0.3404542803764343, "learning_rate": 0.0002, "loss": 0.242, "step": 8600 }, { "epoch": 0.9811900628895023, "grad_norm": 0.43266579508781433, "learning_rate": 0.0002, "loss": 0.2513, "step": 8620 }, { "epoch": 0.9834666059588515, "grad_norm": 0.42724549770355225, "learning_rate": 0.0002, "loss": 0.2384, "step": 8640 }, { "epoch": 0.9857431490282007, "grad_norm": 0.5089221596717834, "learning_rate": 0.0002, "loss": 0.2409, "step": 8660 }, { "epoch": 0.9880196920975499, "grad_norm": 0.519223690032959, "learning_rate": 0.0002, "loss": 0.2353, "step": 8680 }, { "epoch": 0.990296235166899, "grad_norm": 0.5701056122779846, "learning_rate": 0.0002, "loss": 0.2486, "step": 8700 }, { "epoch": 0.9925727782362482, "grad_norm": 0.4519595503807068, "learning_rate": 0.0002, "loss": 0.2374, "step": 8720 }, { "epoch": 0.9948493213055974, "grad_norm": 0.4883946180343628, "learning_rate": 0.0002, "loss": 0.2441, "step": 8740 }, { "epoch": 0.9971258643749467, "grad_norm": 0.6918900012969971, "learning_rate": 0.0002, "loss": 0.2403, "step": 8760 }, { "epoch": 0.9994024074442959, "grad_norm": 0.4810091555118561, "learning_rate": 0.0002, "loss": 0.2334, "step": 8780 }, { "epoch": 1.0, "eval_loss": 0.30941203236579895, "eval_runtime": 408.7196, "eval_samples_per_second": 7.083, "eval_steps_per_second": 0.886, "step": 8786 } ], "logging_steps": 20, "max_steps": 13000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 77, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.923169198364426e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }