|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 100, |
|
"global_step": 2310, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.021645021645021644, |
|
"grad_norm": 2.8125383853912354, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9514, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04329004329004329, |
|
"grad_norm": 1.2800142765045166, |
|
"learning_rate": 4e-05, |
|
"loss": 0.7706, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06493506493506493, |
|
"grad_norm": 1.0814083814620972, |
|
"learning_rate": 6e-05, |
|
"loss": 0.6133, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08658008658008658, |
|
"grad_norm": 0.7970395088195801, |
|
"learning_rate": 8e-05, |
|
"loss": 0.5192, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10822510822510822, |
|
"grad_norm": 0.6642207503318787, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4351, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.12987012987012986, |
|
"grad_norm": 0.5111215114593506, |
|
"learning_rate": 0.00012, |
|
"loss": 0.3603, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.15151515151515152, |
|
"grad_norm": 1.2057693004608154, |
|
"learning_rate": 0.00014, |
|
"loss": 0.3361, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.17316017316017315, |
|
"grad_norm": 0.4057958424091339, |
|
"learning_rate": 0.00016, |
|
"loss": 0.3053, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.19480519480519481, |
|
"grad_norm": 0.5448099374771118, |
|
"learning_rate": 0.00018, |
|
"loss": 0.3024, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.21645021645021645, |
|
"grad_norm": 0.6636802554130554, |
|
"learning_rate": 0.0002, |
|
"loss": 0.2595, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21645021645021645, |
|
"eval_loss": 0.1525065004825592, |
|
"eval_runtime": 43.7956, |
|
"eval_samples_per_second": 11.417, |
|
"eval_steps_per_second": 0.365, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.23809523809523808, |
|
"grad_norm": 0.6046785116195679, |
|
"learning_rate": 0.0001990950226244344, |
|
"loss": 0.2407, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2597402597402597, |
|
"grad_norm": 0.5757384896278381, |
|
"learning_rate": 0.0001981900452488688, |
|
"loss": 0.2223, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2813852813852814, |
|
"grad_norm": 0.5616349577903748, |
|
"learning_rate": 0.00019728506787330318, |
|
"loss": 0.224, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.30303030303030304, |
|
"grad_norm": 0.5419019460678101, |
|
"learning_rate": 0.00019638009049773755, |
|
"loss": 0.2306, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3246753246753247, |
|
"grad_norm": 0.49299636483192444, |
|
"learning_rate": 0.00019547511312217194, |
|
"loss": 0.2214, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3463203463203463, |
|
"grad_norm": 0.6301876306533813, |
|
"learning_rate": 0.00019457013574660634, |
|
"loss": 0.2296, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.36796536796536794, |
|
"grad_norm": 1.009000539779663, |
|
"learning_rate": 0.00019366515837104074, |
|
"loss": 0.2277, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.38961038961038963, |
|
"grad_norm": 0.5606774091720581, |
|
"learning_rate": 0.00019276018099547514, |
|
"loss": 0.2235, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.41125541125541126, |
|
"grad_norm": 0.47087350487709045, |
|
"learning_rate": 0.0001918552036199095, |
|
"loss": 0.2175, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4329004329004329, |
|
"grad_norm": 0.566985011100769, |
|
"learning_rate": 0.0001909502262443439, |
|
"loss": 0.2124, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4329004329004329, |
|
"eval_loss": 0.13294899463653564, |
|
"eval_runtime": 23.5705, |
|
"eval_samples_per_second": 21.213, |
|
"eval_steps_per_second": 0.679, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 0.7860066294670105, |
|
"learning_rate": 0.00019004524886877827, |
|
"loss": 0.2164, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.47619047619047616, |
|
"grad_norm": 0.47214949131011963, |
|
"learning_rate": 0.00018914027149321267, |
|
"loss": 0.2086, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.49783549783549785, |
|
"grad_norm": 0.5978089570999146, |
|
"learning_rate": 0.00018823529411764707, |
|
"loss": 0.2176, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5194805194805194, |
|
"grad_norm": 0.5042030811309814, |
|
"learning_rate": 0.00018733031674208147, |
|
"loss": 0.2003, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5411255411255411, |
|
"grad_norm": 0.5709463953971863, |
|
"learning_rate": 0.00018642533936651584, |
|
"loss": 0.1969, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5627705627705628, |
|
"grad_norm": 0.5876468420028687, |
|
"learning_rate": 0.00018552036199095024, |
|
"loss": 0.1928, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5844155844155844, |
|
"grad_norm": 0.5380046963691711, |
|
"learning_rate": 0.00018461538461538463, |
|
"loss": 0.1944, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6060606060606061, |
|
"grad_norm": 0.3912442624568939, |
|
"learning_rate": 0.000183710407239819, |
|
"loss": 0.1999, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6277056277056277, |
|
"grad_norm": 0.6305837631225586, |
|
"learning_rate": 0.0001828054298642534, |
|
"loss": 0.1976, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.6493506493506493, |
|
"grad_norm": 0.6630299091339111, |
|
"learning_rate": 0.0001819004524886878, |
|
"loss": 0.1881, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6493506493506493, |
|
"eval_loss": 0.11815983802080154, |
|
"eval_runtime": 23.4543, |
|
"eval_samples_per_second": 21.318, |
|
"eval_steps_per_second": 0.682, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.670995670995671, |
|
"grad_norm": 0.3871740996837616, |
|
"learning_rate": 0.00018099547511312217, |
|
"loss": 0.1918, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.6926406926406926, |
|
"grad_norm": 0.39743664860725403, |
|
"learning_rate": 0.00018009049773755657, |
|
"loss": 0.1875, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 0.580439031124115, |
|
"learning_rate": 0.00017918552036199096, |
|
"loss": 0.1881, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.7359307359307359, |
|
"grad_norm": 0.39449596405029297, |
|
"learning_rate": 0.00017828054298642536, |
|
"loss": 0.189, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.7575757575757576, |
|
"grad_norm": 0.5583558678627014, |
|
"learning_rate": 0.00017737556561085973, |
|
"loss": 0.1912, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.7792207792207793, |
|
"grad_norm": 0.5783946514129639, |
|
"learning_rate": 0.00017647058823529413, |
|
"loss": 0.1891, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8008658008658008, |
|
"grad_norm": 0.5039686560630798, |
|
"learning_rate": 0.0001755656108597285, |
|
"loss": 0.1859, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.8225108225108225, |
|
"grad_norm": 0.45908617973327637, |
|
"learning_rate": 0.0001746606334841629, |
|
"loss": 0.1868, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.8441558441558441, |
|
"grad_norm": 0.4211718738079071, |
|
"learning_rate": 0.0001737556561085973, |
|
"loss": 0.1961, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.8658008658008658, |
|
"grad_norm": 0.4308678209781647, |
|
"learning_rate": 0.0001728506787330317, |
|
"loss": 0.2057, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8658008658008658, |
|
"eval_loss": 0.12011528760194778, |
|
"eval_runtime": 21.7151, |
|
"eval_samples_per_second": 23.025, |
|
"eval_steps_per_second": 0.737, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8874458874458875, |
|
"grad_norm": 0.5541667342185974, |
|
"learning_rate": 0.0001719457013574661, |
|
"loss": 0.1946, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 0.782439112663269, |
|
"learning_rate": 0.00017104072398190046, |
|
"loss": 0.1746, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.9307359307359307, |
|
"grad_norm": 0.4267190396785736, |
|
"learning_rate": 0.00017013574660633486, |
|
"loss": 0.1905, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.9523809523809523, |
|
"grad_norm": 0.4720573127269745, |
|
"learning_rate": 0.00016923076923076923, |
|
"loss": 0.1925, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.974025974025974, |
|
"grad_norm": 0.49390169978141785, |
|
"learning_rate": 0.00016832579185520363, |
|
"loss": 0.1967, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.9956709956709957, |
|
"grad_norm": 0.4374224841594696, |
|
"learning_rate": 0.00016742081447963802, |
|
"loss": 0.1813, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.0173160173160174, |
|
"grad_norm": 0.4421156048774719, |
|
"learning_rate": 0.00016651583710407242, |
|
"loss": 0.1673, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.0389610389610389, |
|
"grad_norm": 0.35764458775520325, |
|
"learning_rate": 0.0001656108597285068, |
|
"loss": 0.1759, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.0606060606060606, |
|
"grad_norm": 0.36364632844924927, |
|
"learning_rate": 0.0001647058823529412, |
|
"loss": 0.1726, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.0822510822510822, |
|
"grad_norm": 0.37476664781570435, |
|
"learning_rate": 0.00016380090497737556, |
|
"loss": 0.1819, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0822510822510822, |
|
"eval_loss": 0.12120614945888519, |
|
"eval_runtime": 38.4695, |
|
"eval_samples_per_second": 12.997, |
|
"eval_steps_per_second": 0.416, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.103896103896104, |
|
"grad_norm": 0.4027290344238281, |
|
"learning_rate": 0.00016289592760180996, |
|
"loss": 0.1633, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.1255411255411256, |
|
"grad_norm": 0.3566421866416931, |
|
"learning_rate": 0.00016199095022624435, |
|
"loss": 0.1607, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.1471861471861473, |
|
"grad_norm": 0.4911403954029083, |
|
"learning_rate": 0.00016108597285067875, |
|
"loss": 0.1655, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.1688311688311688, |
|
"grad_norm": 0.3717828392982483, |
|
"learning_rate": 0.00016018099547511315, |
|
"loss": 0.1646, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.1904761904761905, |
|
"grad_norm": 0.4402911961078644, |
|
"learning_rate": 0.00015927601809954752, |
|
"loss": 0.1677, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.2121212121212122, |
|
"grad_norm": 0.4106089770793915, |
|
"learning_rate": 0.0001583710407239819, |
|
"loss": 0.1607, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.2337662337662338, |
|
"grad_norm": 0.4335547387599945, |
|
"learning_rate": 0.0001574660633484163, |
|
"loss": 0.1625, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.2554112554112553, |
|
"grad_norm": 0.3730684816837311, |
|
"learning_rate": 0.00015656108597285069, |
|
"loss": 0.1627, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.277056277056277, |
|
"grad_norm": 0.4024712145328522, |
|
"learning_rate": 0.00015565610859728508, |
|
"loss": 0.1575, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.2987012987012987, |
|
"grad_norm": 0.5081992745399475, |
|
"learning_rate": 0.00015475113122171948, |
|
"loss": 0.1654, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.2987012987012987, |
|
"eval_loss": 0.11448249965906143, |
|
"eval_runtime": 25.9879, |
|
"eval_samples_per_second": 19.24, |
|
"eval_steps_per_second": 0.616, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.3203463203463204, |
|
"grad_norm": 0.5350440144538879, |
|
"learning_rate": 0.00015384615384615385, |
|
"loss": 0.158, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.341991341991342, |
|
"grad_norm": 0.44623634219169617, |
|
"learning_rate": 0.00015294117647058822, |
|
"loss": 0.1628, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"grad_norm": 0.4370821714401245, |
|
"learning_rate": 0.00015203619909502262, |
|
"loss": 0.1563, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.3852813852813852, |
|
"grad_norm": 0.35637155175209045, |
|
"learning_rate": 0.00015113122171945702, |
|
"loss": 0.1706, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.406926406926407, |
|
"grad_norm": 0.3456045687198639, |
|
"learning_rate": 0.00015022624434389141, |
|
"loss": 0.1695, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 0.48406872153282166, |
|
"learning_rate": 0.0001493212669683258, |
|
"loss": 0.1561, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.4502164502164503, |
|
"grad_norm": 0.45908018946647644, |
|
"learning_rate": 0.00014841628959276018, |
|
"loss": 0.1584, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.4718614718614718, |
|
"grad_norm": 0.4222131073474884, |
|
"learning_rate": 0.00014751131221719458, |
|
"loss": 0.1524, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.4935064935064934, |
|
"grad_norm": 0.40552523732185364, |
|
"learning_rate": 0.00014660633484162895, |
|
"loss": 0.1664, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.5151515151515151, |
|
"grad_norm": 0.441180020570755, |
|
"learning_rate": 0.00014570135746606335, |
|
"loss": 0.1732, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5151515151515151, |
|
"eval_loss": 0.10767688602209091, |
|
"eval_runtime": 25.2267, |
|
"eval_samples_per_second": 19.82, |
|
"eval_steps_per_second": 0.634, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5367965367965368, |
|
"grad_norm": 0.39050132036209106, |
|
"learning_rate": 0.00014479638009049775, |
|
"loss": 0.1544, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.5584415584415585, |
|
"grad_norm": 0.37475523352622986, |
|
"learning_rate": 0.00014389140271493214, |
|
"loss": 0.1534, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.5800865800865802, |
|
"grad_norm": 0.3851136267185211, |
|
"learning_rate": 0.00014298642533936651, |
|
"loss": 0.1623, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.601731601731602, |
|
"grad_norm": 0.37124335765838623, |
|
"learning_rate": 0.0001420814479638009, |
|
"loss": 0.152, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.6233766233766234, |
|
"grad_norm": 0.3956158459186554, |
|
"learning_rate": 0.0001411764705882353, |
|
"loss": 0.1619, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.645021645021645, |
|
"grad_norm": 0.34193211793899536, |
|
"learning_rate": 0.00014027149321266968, |
|
"loss": 0.158, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.5510150194168091, |
|
"learning_rate": 0.00013936651583710408, |
|
"loss": 0.158, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.6883116883116882, |
|
"grad_norm": 0.6107664108276367, |
|
"learning_rate": 0.00013846153846153847, |
|
"loss": 0.1487, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.70995670995671, |
|
"grad_norm": 0.4111076295375824, |
|
"learning_rate": 0.00013755656108597284, |
|
"loss": 0.1588, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.7316017316017316, |
|
"grad_norm": 0.5250778198242188, |
|
"learning_rate": 0.00013665158371040724, |
|
"loss": 0.1653, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.7316017316017316, |
|
"eval_loss": 0.10399862378835678, |
|
"eval_runtime": 23.9812, |
|
"eval_samples_per_second": 20.85, |
|
"eval_steps_per_second": 0.667, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.7532467532467533, |
|
"grad_norm": 0.4260198771953583, |
|
"learning_rate": 0.00013574660633484164, |
|
"loss": 0.1482, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.774891774891775, |
|
"grad_norm": 0.28968125581741333, |
|
"learning_rate": 0.00013484162895927604, |
|
"loss": 0.1541, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.7965367965367967, |
|
"grad_norm": 0.29932093620300293, |
|
"learning_rate": 0.0001339366515837104, |
|
"loss": 0.1625, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 0.4358128011226654, |
|
"learning_rate": 0.0001330316742081448, |
|
"loss": 0.1658, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.8398268398268398, |
|
"grad_norm": 0.4161946773529053, |
|
"learning_rate": 0.00013212669683257918, |
|
"loss": 0.1472, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.8614718614718615, |
|
"grad_norm": 0.3558347225189209, |
|
"learning_rate": 0.00013122171945701357, |
|
"loss": 0.1671, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.883116883116883, |
|
"grad_norm": 0.41748765110969543, |
|
"learning_rate": 0.00013031674208144797, |
|
"loss": 0.1664, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.9047619047619047, |
|
"grad_norm": 0.5195390582084656, |
|
"learning_rate": 0.00012941176470588237, |
|
"loss": 0.1603, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.9264069264069263, |
|
"grad_norm": 0.344159334897995, |
|
"learning_rate": 0.00012850678733031677, |
|
"loss": 0.1589, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.948051948051948, |
|
"grad_norm": 0.4217064380645752, |
|
"learning_rate": 0.00012760180995475114, |
|
"loss": 0.1631, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.948051948051948, |
|
"eval_loss": 0.11024898290634155, |
|
"eval_runtime": 26.0192, |
|
"eval_samples_per_second": 19.217, |
|
"eval_steps_per_second": 0.615, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.9696969696969697, |
|
"grad_norm": 0.377990186214447, |
|
"learning_rate": 0.0001266968325791855, |
|
"loss": 0.162, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.9913419913419914, |
|
"grad_norm": 0.3964829742908478, |
|
"learning_rate": 0.0001257918552036199, |
|
"loss": 0.1565, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 2.012987012987013, |
|
"grad_norm": 0.31516608595848083, |
|
"learning_rate": 0.0001248868778280543, |
|
"loss": 0.1356, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 2.034632034632035, |
|
"grad_norm": 0.4001309275627136, |
|
"learning_rate": 0.0001239819004524887, |
|
"loss": 0.1398, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 2.0562770562770565, |
|
"grad_norm": 0.2704612612724304, |
|
"learning_rate": 0.0001230769230769231, |
|
"loss": 0.139, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.0779220779220777, |
|
"grad_norm": 0.37967827916145325, |
|
"learning_rate": 0.0001221719457013575, |
|
"loss": 0.1474, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 2.0995670995670994, |
|
"grad_norm": 0.3611961603164673, |
|
"learning_rate": 0.00012126696832579185, |
|
"loss": 0.1394, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 2.121212121212121, |
|
"grad_norm": 0.38165083527565, |
|
"learning_rate": 0.00012036199095022625, |
|
"loss": 0.1509, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 0.44539740681648254, |
|
"learning_rate": 0.00011945701357466063, |
|
"loss": 0.1335, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 2.1645021645021645, |
|
"grad_norm": 0.2796030044555664, |
|
"learning_rate": 0.00011855203619909503, |
|
"loss": 0.1354, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.1645021645021645, |
|
"eval_loss": 0.11003410071134567, |
|
"eval_runtime": 23.006, |
|
"eval_samples_per_second": 21.733, |
|
"eval_steps_per_second": 0.695, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.186147186147186, |
|
"grad_norm": 0.39886558055877686, |
|
"learning_rate": 0.00011764705882352942, |
|
"loss": 0.1365, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.207792207792208, |
|
"grad_norm": 0.27149999141693115, |
|
"learning_rate": 0.00011674208144796381, |
|
"loss": 0.1303, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.2294372294372296, |
|
"grad_norm": 0.4485577940940857, |
|
"learning_rate": 0.00011583710407239821, |
|
"loss": 0.136, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.2510822510822512, |
|
"grad_norm": 0.5114185214042664, |
|
"learning_rate": 0.00011493212669683258, |
|
"loss": 0.129, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.2727272727272725, |
|
"grad_norm": 0.41526681184768677, |
|
"learning_rate": 0.00011402714932126696, |
|
"loss": 0.1352, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.2943722943722946, |
|
"grad_norm": 0.4560807943344116, |
|
"learning_rate": 0.00011312217194570136, |
|
"loss": 0.1473, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.316017316017316, |
|
"grad_norm": 0.4193613529205322, |
|
"learning_rate": 0.00011221719457013576, |
|
"loss": 0.1427, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.3376623376623376, |
|
"grad_norm": 0.5285654664039612, |
|
"learning_rate": 0.00011131221719457014, |
|
"loss": 0.1395, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.3593073593073592, |
|
"grad_norm": 0.3672701120376587, |
|
"learning_rate": 0.00011040723981900454, |
|
"loss": 0.1303, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.380952380952381, |
|
"grad_norm": 0.4193953573703766, |
|
"learning_rate": 0.00010950226244343893, |
|
"loss": 0.1353, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.380952380952381, |
|
"eval_loss": 0.10145355015993118, |
|
"eval_runtime": 20.0537, |
|
"eval_samples_per_second": 24.933, |
|
"eval_steps_per_second": 0.798, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.4025974025974026, |
|
"grad_norm": 0.39170539379119873, |
|
"learning_rate": 0.0001085972850678733, |
|
"loss": 0.1344, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.4242424242424243, |
|
"grad_norm": 0.4148479104042053, |
|
"learning_rate": 0.0001076923076923077, |
|
"loss": 0.1463, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.445887445887446, |
|
"grad_norm": 0.3758697211742401, |
|
"learning_rate": 0.00010678733031674209, |
|
"loss": 0.1338, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.4675324675324677, |
|
"grad_norm": 0.3020533621311188, |
|
"learning_rate": 0.00010588235294117647, |
|
"loss": 0.1369, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.4891774891774894, |
|
"grad_norm": 0.3161577582359314, |
|
"learning_rate": 0.00010497737556561087, |
|
"loss": 0.1438, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.5108225108225106, |
|
"grad_norm": 0.39273881912231445, |
|
"learning_rate": 0.00010407239819004526, |
|
"loss": 0.1404, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.5324675324675323, |
|
"grad_norm": 0.3449176251888275, |
|
"learning_rate": 0.00010316742081447965, |
|
"loss": 0.1346, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.554112554112554, |
|
"grad_norm": 0.261294960975647, |
|
"learning_rate": 0.00010226244343891402, |
|
"loss": 0.1244, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.5757575757575757, |
|
"grad_norm": 0.5024470686912537, |
|
"learning_rate": 0.00010135746606334842, |
|
"loss": 0.1385, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.5974025974025974, |
|
"grad_norm": 0.4657529294490814, |
|
"learning_rate": 0.0001004524886877828, |
|
"loss": 0.1458, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.5974025974025974, |
|
"eval_loss": 0.09968920797109604, |
|
"eval_runtime": 22.2909, |
|
"eval_samples_per_second": 22.431, |
|
"eval_steps_per_second": 0.718, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.619047619047619, |
|
"grad_norm": 0.28800147771835327, |
|
"learning_rate": 9.95475113122172e-05, |
|
"loss": 0.1366, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.6406926406926408, |
|
"grad_norm": 0.24663890898227692, |
|
"learning_rate": 9.864253393665159e-05, |
|
"loss": 0.1315, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.6623376623376624, |
|
"grad_norm": 0.3126956522464752, |
|
"learning_rate": 9.773755656108597e-05, |
|
"loss": 0.1305, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.683982683982684, |
|
"grad_norm": 0.34753990173339844, |
|
"learning_rate": 9.683257918552037e-05, |
|
"loss": 0.1326, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.7056277056277054, |
|
"grad_norm": 0.4045655429363251, |
|
"learning_rate": 9.592760180995475e-05, |
|
"loss": 0.147, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"grad_norm": 0.369632750749588, |
|
"learning_rate": 9.502262443438914e-05, |
|
"loss": 0.1339, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.7489177489177488, |
|
"grad_norm": 0.3891206383705139, |
|
"learning_rate": 9.411764705882353e-05, |
|
"loss": 0.1326, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.7705627705627704, |
|
"grad_norm": 0.4095381498336792, |
|
"learning_rate": 9.321266968325792e-05, |
|
"loss": 0.1305, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.792207792207792, |
|
"grad_norm": 0.39312949776649475, |
|
"learning_rate": 9.230769230769232e-05, |
|
"loss": 0.1348, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.813852813852814, |
|
"grad_norm": 0.4769635498523712, |
|
"learning_rate": 9.14027149321267e-05, |
|
"loss": 0.1292, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.813852813852814, |
|
"eval_loss": 0.09773257374763489, |
|
"eval_runtime": 21.4232, |
|
"eval_samples_per_second": 23.339, |
|
"eval_steps_per_second": 0.747, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.8354978354978355, |
|
"grad_norm": 0.4419485628604889, |
|
"learning_rate": 9.049773755656108e-05, |
|
"loss": 0.1332, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 0.273708313703537, |
|
"learning_rate": 8.959276018099548e-05, |
|
"loss": 0.1351, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.878787878787879, |
|
"grad_norm": 0.41094347834587097, |
|
"learning_rate": 8.868778280542987e-05, |
|
"loss": 0.1436, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.9004329004329006, |
|
"grad_norm": 0.3220170736312866, |
|
"learning_rate": 8.778280542986425e-05, |
|
"loss": 0.1372, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.9220779220779223, |
|
"grad_norm": 0.3632793128490448, |
|
"learning_rate": 8.687782805429865e-05, |
|
"loss": 0.1421, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.9437229437229435, |
|
"grad_norm": 0.33830907940864563, |
|
"learning_rate": 8.597285067873304e-05, |
|
"loss": 0.1452, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.965367965367965, |
|
"grad_norm": 0.3863072097301483, |
|
"learning_rate": 8.506787330316743e-05, |
|
"loss": 0.1424, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.987012987012987, |
|
"grad_norm": 0.5766463279724121, |
|
"learning_rate": 8.416289592760181e-05, |
|
"loss": 0.1424, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 3.0086580086580086, |
|
"grad_norm": 0.23465487360954285, |
|
"learning_rate": 8.325791855203621e-05, |
|
"loss": 0.1241, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 3.0303030303030303, |
|
"grad_norm": 0.32335883378982544, |
|
"learning_rate": 8.23529411764706e-05, |
|
"loss": 0.1181, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.0303030303030303, |
|
"eval_loss": 0.10378438234329224, |
|
"eval_runtime": 24.145, |
|
"eval_samples_per_second": 20.708, |
|
"eval_steps_per_second": 0.663, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.051948051948052, |
|
"grad_norm": 0.3104758560657501, |
|
"learning_rate": 8.144796380090498e-05, |
|
"loss": 0.1227, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 3.0735930735930737, |
|
"grad_norm": 0.27100279927253723, |
|
"learning_rate": 8.054298642533938e-05, |
|
"loss": 0.1164, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 3.0952380952380953, |
|
"grad_norm": 0.3317118287086487, |
|
"learning_rate": 7.963800904977376e-05, |
|
"loss": 0.1233, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 3.116883116883117, |
|
"grad_norm": 0.41682133078575134, |
|
"learning_rate": 7.873303167420814e-05, |
|
"loss": 0.1213, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 3.1385281385281387, |
|
"grad_norm": 0.31365829706192017, |
|
"learning_rate": 7.782805429864254e-05, |
|
"loss": 0.1183, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 3.16017316017316, |
|
"grad_norm": 0.3687366247177124, |
|
"learning_rate": 7.692307692307693e-05, |
|
"loss": 0.119, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 3.1818181818181817, |
|
"grad_norm": 0.4626697599887848, |
|
"learning_rate": 7.601809954751131e-05, |
|
"loss": 0.1244, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 3.2034632034632033, |
|
"grad_norm": 0.46320992708206177, |
|
"learning_rate": 7.511312217194571e-05, |
|
"loss": 0.1214, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 3.225108225108225, |
|
"grad_norm": 0.3054867386817932, |
|
"learning_rate": 7.420814479638009e-05, |
|
"loss": 0.1276, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 3.2467532467532467, |
|
"grad_norm": 0.26975077390670776, |
|
"learning_rate": 7.330316742081448e-05, |
|
"loss": 0.1235, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.2467532467532467, |
|
"eval_loss": 0.10016042739152908, |
|
"eval_runtime": 23.6078, |
|
"eval_samples_per_second": 21.179, |
|
"eval_steps_per_second": 0.678, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.2683982683982684, |
|
"grad_norm": 0.33022555708885193, |
|
"learning_rate": 7.239819004524887e-05, |
|
"loss": 0.12, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 3.29004329004329, |
|
"grad_norm": 0.4828197956085205, |
|
"learning_rate": 7.149321266968326e-05, |
|
"loss": 0.1299, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 3.311688311688312, |
|
"grad_norm": 0.36965712904930115, |
|
"learning_rate": 7.058823529411765e-05, |
|
"loss": 0.1217, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 0.4338422417640686, |
|
"learning_rate": 6.968325791855204e-05, |
|
"loss": 0.1276, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 3.354978354978355, |
|
"grad_norm": 0.40615785121917725, |
|
"learning_rate": 6.877828054298642e-05, |
|
"loss": 0.1214, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 3.3766233766233764, |
|
"grad_norm": 0.2703055739402771, |
|
"learning_rate": 6.787330316742082e-05, |
|
"loss": 0.1179, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 3.398268398268398, |
|
"grad_norm": 0.469108521938324, |
|
"learning_rate": 6.69683257918552e-05, |
|
"loss": 0.1189, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 3.41991341991342, |
|
"grad_norm": 0.2688772976398468, |
|
"learning_rate": 6.606334841628959e-05, |
|
"loss": 0.1174, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 3.4415584415584415, |
|
"grad_norm": 0.2989065647125244, |
|
"learning_rate": 6.515837104072399e-05, |
|
"loss": 0.111, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 3.463203463203463, |
|
"grad_norm": 0.397502601146698, |
|
"learning_rate": 6.425339366515838e-05, |
|
"loss": 0.1189, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.463203463203463, |
|
"eval_loss": 0.10202713310718536, |
|
"eval_runtime": 25.0286, |
|
"eval_samples_per_second": 19.977, |
|
"eval_steps_per_second": 0.639, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.484848484848485, |
|
"grad_norm": 0.3283541202545166, |
|
"learning_rate": 6.334841628959275e-05, |
|
"loss": 0.1216, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 3.5064935064935066, |
|
"grad_norm": 0.3664211928844452, |
|
"learning_rate": 6.244343891402715e-05, |
|
"loss": 0.1171, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 3.5281385281385282, |
|
"grad_norm": 0.33489325642585754, |
|
"learning_rate": 6.153846153846155e-05, |
|
"loss": 0.1222, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 3.54978354978355, |
|
"grad_norm": 0.27272143959999084, |
|
"learning_rate": 6.0633484162895926e-05, |
|
"loss": 0.1089, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 3.571428571428571, |
|
"grad_norm": 0.3384559750556946, |
|
"learning_rate": 5.972850678733032e-05, |
|
"loss": 0.1092, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 3.5930735930735933, |
|
"grad_norm": 0.4882173240184784, |
|
"learning_rate": 5.882352941176471e-05, |
|
"loss": 0.1326, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 3.6147186147186146, |
|
"grad_norm": 0.33967503905296326, |
|
"learning_rate": 5.7918552036199105e-05, |
|
"loss": 0.119, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 3.6363636363636362, |
|
"grad_norm": 0.30732062458992004, |
|
"learning_rate": 5.701357466063348e-05, |
|
"loss": 0.1115, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 3.658008658008658, |
|
"grad_norm": 0.46696531772613525, |
|
"learning_rate": 5.610859728506788e-05, |
|
"loss": 0.1246, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 3.6796536796536796, |
|
"grad_norm": 0.4071387052536011, |
|
"learning_rate": 5.520361990950227e-05, |
|
"loss": 0.1305, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.6796536796536796, |
|
"eval_loss": 0.09984961152076721, |
|
"eval_runtime": 22.2364, |
|
"eval_samples_per_second": 22.486, |
|
"eval_steps_per_second": 0.72, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.7012987012987013, |
|
"grad_norm": 0.34078463912010193, |
|
"learning_rate": 5.429864253393665e-05, |
|
"loss": 0.1227, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 3.722943722943723, |
|
"grad_norm": 0.46931177377700806, |
|
"learning_rate": 5.3393665158371045e-05, |
|
"loss": 0.1222, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 3.7445887445887447, |
|
"grad_norm": 0.28127625584602356, |
|
"learning_rate": 5.2488687782805436e-05, |
|
"loss": 0.1186, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 3.7662337662337664, |
|
"grad_norm": 0.3582036793231964, |
|
"learning_rate": 5.158371040723983e-05, |
|
"loss": 0.1124, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 3.787878787878788, |
|
"grad_norm": 0.31167203187942505, |
|
"learning_rate": 5.067873303167421e-05, |
|
"loss": 0.1179, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.8095238095238093, |
|
"grad_norm": 0.391791969537735, |
|
"learning_rate": 4.97737556561086e-05, |
|
"loss": 0.127, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 3.8311688311688314, |
|
"grad_norm": 0.3683635890483856, |
|
"learning_rate": 4.8868778280542986e-05, |
|
"loss": 0.1129, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 3.8528138528138527, |
|
"grad_norm": 0.3955051302909851, |
|
"learning_rate": 4.7963800904977377e-05, |
|
"loss": 0.125, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 3.8744588744588744, |
|
"grad_norm": 0.40321120619773865, |
|
"learning_rate": 4.705882352941177e-05, |
|
"loss": 0.1272, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 3.896103896103896, |
|
"grad_norm": 0.31911927461624146, |
|
"learning_rate": 4.615384615384616e-05, |
|
"loss": 0.1072, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.896103896103896, |
|
"eval_loss": 0.10081314295530319, |
|
"eval_runtime": 21.9055, |
|
"eval_samples_per_second": 22.825, |
|
"eval_steps_per_second": 0.73, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.9177489177489178, |
|
"grad_norm": 0.4627375900745392, |
|
"learning_rate": 4.524886877828054e-05, |
|
"loss": 0.1238, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 3.9393939393939394, |
|
"grad_norm": 0.43323320150375366, |
|
"learning_rate": 4.434389140271493e-05, |
|
"loss": 0.1202, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 3.961038961038961, |
|
"grad_norm": 0.4214964807033539, |
|
"learning_rate": 4.3438914027149324e-05, |
|
"loss": 0.1186, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 3.982683982683983, |
|
"grad_norm": 0.302083283662796, |
|
"learning_rate": 4.2533936651583714e-05, |
|
"loss": 0.1226, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 4.004329004329004, |
|
"grad_norm": 0.39333033561706543, |
|
"learning_rate": 4.1628959276018105e-05, |
|
"loss": 0.1249, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 4.025974025974026, |
|
"grad_norm": 0.34783726930618286, |
|
"learning_rate": 4.072398190045249e-05, |
|
"loss": 0.1135, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 4.0476190476190474, |
|
"grad_norm": 0.3140253722667694, |
|
"learning_rate": 3.981900452488688e-05, |
|
"loss": 0.1095, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 4.06926406926407, |
|
"grad_norm": 0.3263933062553406, |
|
"learning_rate": 3.891402714932127e-05, |
|
"loss": 0.1136, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 4.090909090909091, |
|
"grad_norm": 0.4250706434249878, |
|
"learning_rate": 3.8009049773755655e-05, |
|
"loss": 0.1056, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 4.112554112554113, |
|
"grad_norm": 0.2250766158103943, |
|
"learning_rate": 3.7104072398190046e-05, |
|
"loss": 0.1079, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.112554112554113, |
|
"eval_loss": 0.10422435402870178, |
|
"eval_runtime": 22.3048, |
|
"eval_samples_per_second": 22.417, |
|
"eval_steps_per_second": 0.717, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.134199134199134, |
|
"grad_norm": 0.35859718918800354, |
|
"learning_rate": 3.6199095022624436e-05, |
|
"loss": 0.1048, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 4.1558441558441555, |
|
"grad_norm": 0.2876388728618622, |
|
"learning_rate": 3.529411764705883e-05, |
|
"loss": 0.1109, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 4.177489177489178, |
|
"grad_norm": 0.2952074408531189, |
|
"learning_rate": 3.438914027149321e-05, |
|
"loss": 0.1118, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 4.199134199134199, |
|
"grad_norm": 0.38420605659484863, |
|
"learning_rate": 3.34841628959276e-05, |
|
"loss": 0.1113, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 4.220779220779221, |
|
"grad_norm": 0.35064896941185, |
|
"learning_rate": 3.257918552036199e-05, |
|
"loss": 0.108, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 4.242424242424242, |
|
"grad_norm": 0.3417244553565979, |
|
"learning_rate": 3.167420814479638e-05, |
|
"loss": 0.1067, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 4.264069264069264, |
|
"grad_norm": 0.2506988048553467, |
|
"learning_rate": 3.0769230769230774e-05, |
|
"loss": 0.116, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 4.285714285714286, |
|
"grad_norm": 0.40218839049339294, |
|
"learning_rate": 2.986425339366516e-05, |
|
"loss": 0.111, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 4.307359307359308, |
|
"grad_norm": 0.33777332305908203, |
|
"learning_rate": 2.8959276018099553e-05, |
|
"loss": 0.1081, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 4.329004329004329, |
|
"grad_norm": 0.2849682569503784, |
|
"learning_rate": 2.805429864253394e-05, |
|
"loss": 0.1043, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.329004329004329, |
|
"eval_loss": 0.10182041674852371, |
|
"eval_runtime": 32.5102, |
|
"eval_samples_per_second": 15.38, |
|
"eval_steps_per_second": 0.492, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.35064935064935, |
|
"grad_norm": 0.32567378878593445, |
|
"learning_rate": 2.7149321266968324e-05, |
|
"loss": 0.1065, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 4.372294372294372, |
|
"grad_norm": 0.35031840205192566, |
|
"learning_rate": 2.6244343891402718e-05, |
|
"loss": 0.1133, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 4.393939393939394, |
|
"grad_norm": 0.3233943581581116, |
|
"learning_rate": 2.5339366515837106e-05, |
|
"loss": 0.1034, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 4.415584415584416, |
|
"grad_norm": 0.326164573431015, |
|
"learning_rate": 2.4434389140271493e-05, |
|
"loss": 0.1053, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 4.437229437229437, |
|
"grad_norm": 0.3046077489852905, |
|
"learning_rate": 2.3529411764705884e-05, |
|
"loss": 0.1034, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 4.458874458874459, |
|
"grad_norm": 0.3930221498012543, |
|
"learning_rate": 2.262443438914027e-05, |
|
"loss": 0.1132, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 4.48051948051948, |
|
"grad_norm": 0.4203428626060486, |
|
"learning_rate": 2.1719457013574662e-05, |
|
"loss": 0.1123, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 4.5021645021645025, |
|
"grad_norm": 0.23356233537197113, |
|
"learning_rate": 2.0814479638009053e-05, |
|
"loss": 0.1158, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 4.523809523809524, |
|
"grad_norm": 0.3921778202056885, |
|
"learning_rate": 1.990950226244344e-05, |
|
"loss": 0.1017, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 4.545454545454545, |
|
"grad_norm": 0.41227224469184875, |
|
"learning_rate": 1.9004524886877827e-05, |
|
"loss": 0.1095, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.545454545454545, |
|
"eval_loss": 0.10190469026565552, |
|
"eval_runtime": 22.5368, |
|
"eval_samples_per_second": 22.186, |
|
"eval_steps_per_second": 0.71, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.567099567099567, |
|
"grad_norm": 0.46910232305526733, |
|
"learning_rate": 1.8099547511312218e-05, |
|
"loss": 0.1041, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 4.588744588744589, |
|
"grad_norm": 0.2650741934776306, |
|
"learning_rate": 1.7194570135746606e-05, |
|
"loss": 0.1096, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 4.6103896103896105, |
|
"grad_norm": 0.30446478724479675, |
|
"learning_rate": 1.6289592760180996e-05, |
|
"loss": 0.11, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 4.632034632034632, |
|
"grad_norm": 0.30164214968681335, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 0.1009, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 4.653679653679654, |
|
"grad_norm": 0.33579009771347046, |
|
"learning_rate": 1.4479638009049776e-05, |
|
"loss": 0.116, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 4.675324675324675, |
|
"grad_norm": 0.2805331349372864, |
|
"learning_rate": 1.3574660633484162e-05, |
|
"loss": 0.1154, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 4.696969696969697, |
|
"grad_norm": 0.2765107750892639, |
|
"learning_rate": 1.2669683257918553e-05, |
|
"loss": 0.1124, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 4.7186147186147185, |
|
"grad_norm": 0.39715567231178284, |
|
"learning_rate": 1.1764705882352942e-05, |
|
"loss": 0.1078, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 4.740259740259741, |
|
"grad_norm": 0.4515060782432556, |
|
"learning_rate": 1.0859728506787331e-05, |
|
"loss": 0.1135, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 4.761904761904762, |
|
"grad_norm": 0.3021312654018402, |
|
"learning_rate": 9.95475113122172e-06, |
|
"loss": 0.1069, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.761904761904762, |
|
"eval_loss": 0.10030877590179443, |
|
"eval_runtime": 22.0777, |
|
"eval_samples_per_second": 22.647, |
|
"eval_steps_per_second": 0.725, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.783549783549784, |
|
"grad_norm": 0.41326892375946045, |
|
"learning_rate": 9.049773755656109e-06, |
|
"loss": 0.1131, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 4.805194805194805, |
|
"grad_norm": 0.3018375039100647, |
|
"learning_rate": 8.144796380090498e-06, |
|
"loss": 0.1064, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 4.8268398268398265, |
|
"grad_norm": 0.3863151967525482, |
|
"learning_rate": 7.239819004524888e-06, |
|
"loss": 0.1187, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 4.848484848484849, |
|
"grad_norm": 0.47006744146347046, |
|
"learning_rate": 6.334841628959276e-06, |
|
"loss": 0.1073, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 4.87012987012987, |
|
"grad_norm": 0.36669015884399414, |
|
"learning_rate": 5.4298642533936655e-06, |
|
"loss": 0.1098, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 4.891774891774892, |
|
"grad_norm": 0.3454284965991974, |
|
"learning_rate": 4.5248868778280546e-06, |
|
"loss": 0.0987, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 4.913419913419913, |
|
"grad_norm": 0.45213282108306885, |
|
"learning_rate": 3.619909502262444e-06, |
|
"loss": 0.1094, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 4.935064935064935, |
|
"grad_norm": 0.3391339182853699, |
|
"learning_rate": 2.7149321266968327e-06, |
|
"loss": 0.1011, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 4.956709956709957, |
|
"grad_norm": 0.3675156235694885, |
|
"learning_rate": 1.809954751131222e-06, |
|
"loss": 0.1143, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 4.978354978354979, |
|
"grad_norm": 0.3254551589488983, |
|
"learning_rate": 9.04977375565611e-07, |
|
"loss": 0.1123, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.978354978354979, |
|
"eval_loss": 0.09990464895963669, |
|
"eval_runtime": 21.2785, |
|
"eval_samples_per_second": 23.498, |
|
"eval_steps_per_second": 0.752, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.34439313411712646, |
|
"learning_rate": 0.0, |
|
"loss": 0.1115, |
|
"step": 2310 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2310, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.35957960819826e+18, |
|
"train_batch_size": 256, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|