|
{ |
|
"best_metric": 0.10513312369585037, |
|
"best_model_checkpoint": "/users/u2023000898/train_moe/pretrain_data_chunk/60000_3/checkpoint-1800", |
|
"epoch": 2.9985985985985986, |
|
"eval_steps": 100, |
|
"global_step": 1872, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016016016016016016, |
|
"grad_norm": 13.409518938793026, |
|
"learning_rate": 5.319148936170213e-07, |
|
"loss": 1.2929, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03203203203203203, |
|
"grad_norm": 10.369912201219414, |
|
"learning_rate": 1.0638297872340427e-06, |
|
"loss": 1.2223, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.04804804804804805, |
|
"grad_norm": 5.44261186688239, |
|
"learning_rate": 1.595744680851064e-06, |
|
"loss": 0.943, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.06406406406406406, |
|
"grad_norm": 3.605857199785854, |
|
"learning_rate": 2.1276595744680853e-06, |
|
"loss": 0.7396, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08008008008008008, |
|
"grad_norm": 1.9924246604906095, |
|
"learning_rate": 2.6595744680851065e-06, |
|
"loss": 0.6245, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0960960960960961, |
|
"grad_norm": 2.3307973177641075, |
|
"learning_rate": 3.191489361702128e-06, |
|
"loss": 0.5384, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11211211211211211, |
|
"grad_norm": 2.417305413795213, |
|
"learning_rate": 3.723404255319149e-06, |
|
"loss": 0.4779, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.12812812812812813, |
|
"grad_norm": 2.4395863243528044, |
|
"learning_rate": 4.255319148936171e-06, |
|
"loss": 0.4384, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.14414414414414414, |
|
"grad_norm": 2.967607994777358, |
|
"learning_rate": 4.787234042553192e-06, |
|
"loss": 0.4194, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16016016016016016, |
|
"grad_norm": 1.8226498622217497, |
|
"learning_rate": 5.319148936170213e-06, |
|
"loss": 0.3881, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.16016016016016016, |
|
"eval_loss": 0.38211190700531006, |
|
"eval_runtime": 5.3031, |
|
"eval_samples_per_second": 11.314, |
|
"eval_steps_per_second": 5.657, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.17617617617617617, |
|
"grad_norm": 2.0417129648638404, |
|
"learning_rate": 5.851063829787235e-06, |
|
"loss": 0.3735, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.1921921921921922, |
|
"grad_norm": 1.8910151101905892, |
|
"learning_rate": 6.382978723404256e-06, |
|
"loss": 0.3597, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2082082082082082, |
|
"grad_norm": 2.356033958506011, |
|
"learning_rate": 6.914893617021278e-06, |
|
"loss": 0.3451, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.22422422422422422, |
|
"grad_norm": 1.6182433590940526, |
|
"learning_rate": 7.446808510638298e-06, |
|
"loss": 0.3299, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.24024024024024024, |
|
"grad_norm": 1.5854757979381888, |
|
"learning_rate": 7.97872340425532e-06, |
|
"loss": 0.3224, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.25625625625625625, |
|
"grad_norm": 1.6564206751789106, |
|
"learning_rate": 8.510638297872341e-06, |
|
"loss": 0.3077, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.2722722722722723, |
|
"grad_norm": 1.647017991315564, |
|
"learning_rate": 9.042553191489362e-06, |
|
"loss": 0.3017, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.2882882882882883, |
|
"grad_norm": 2.0446337917200434, |
|
"learning_rate": 9.574468085106385e-06, |
|
"loss": 0.2859, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.30430430430430433, |
|
"grad_norm": 1.434904669145637, |
|
"learning_rate": 9.999965197129365e-06, |
|
"loss": 0.2877, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3203203203203203, |
|
"grad_norm": 2.9106697114797573, |
|
"learning_rate": 9.998747147528375e-06, |
|
"loss": 0.2773, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3203203203203203, |
|
"eval_loss": 0.2963399589061737, |
|
"eval_runtime": 4.7335, |
|
"eval_samples_per_second": 12.676, |
|
"eval_steps_per_second": 6.338, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.33633633633633636, |
|
"grad_norm": 2.310160659638816, |
|
"learning_rate": 9.995789438861128e-06, |
|
"loss": 0.2793, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.35235235235235235, |
|
"grad_norm": 1.6140139693791007, |
|
"learning_rate": 9.991093100466482e-06, |
|
"loss": 0.2678, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.3683683683683684, |
|
"grad_norm": 1.6727704090132902, |
|
"learning_rate": 9.98465976675951e-06, |
|
"loss": 0.2633, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.3843843843843844, |
|
"grad_norm": 1.6386702854417348, |
|
"learning_rate": 9.976491676662679e-06, |
|
"loss": 0.2556, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4004004004004004, |
|
"grad_norm": 1.5318495172198847, |
|
"learning_rate": 9.966591672826674e-06, |
|
"loss": 0.2501, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.4164164164164164, |
|
"grad_norm": 1.4131958267494484, |
|
"learning_rate": 9.95496320064109e-06, |
|
"loss": 0.239, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.43243243243243246, |
|
"grad_norm": 2.116850571157825, |
|
"learning_rate": 9.941610307035385e-06, |
|
"loss": 0.2352, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.44844844844844844, |
|
"grad_norm": 1.4833918481789057, |
|
"learning_rate": 9.926537639070457e-06, |
|
"loss": 0.2343, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.4644644644644645, |
|
"grad_norm": 1.4499258247835383, |
|
"learning_rate": 9.90975044232139e-06, |
|
"loss": 0.2246, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.4804804804804805, |
|
"grad_norm": 1.5412749043910834, |
|
"learning_rate": 9.891254559051886e-06, |
|
"loss": 0.2209, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4804804804804805, |
|
"eval_loss": 0.22802673280239105, |
|
"eval_runtime": 4.7226, |
|
"eval_samples_per_second": 12.705, |
|
"eval_steps_per_second": 6.352, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4964964964964965, |
|
"grad_norm": 1.6016396794420027, |
|
"learning_rate": 9.871056426181052e-06, |
|
"loss": 0.2206, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5125125125125125, |
|
"grad_norm": 1.4910134315726407, |
|
"learning_rate": 9.849163073043223e-06, |
|
"loss": 0.2229, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5285285285285285, |
|
"grad_norm": 1.5947649628687839, |
|
"learning_rate": 9.82558211894163e-06, |
|
"loss": 0.206, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.5445445445445446, |
|
"grad_norm": 2.281846286082721, |
|
"learning_rate": 9.800321770496726e-06, |
|
"loss": 0.2106, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.5605605605605606, |
|
"grad_norm": 1.628823704341528, |
|
"learning_rate": 9.773390818790136e-06, |
|
"loss": 0.2049, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5765765765765766, |
|
"grad_norm": 1.8232097235612017, |
|
"learning_rate": 9.744798636305189e-06, |
|
"loss": 0.2045, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 1.880889712421178, |
|
"learning_rate": 9.714555173665112e-06, |
|
"loss": 0.1948, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6086086086086087, |
|
"grad_norm": 1.8941186642260008, |
|
"learning_rate": 9.68267095617003e-06, |
|
"loss": 0.1947, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.6246246246246246, |
|
"grad_norm": 1.6000201393992286, |
|
"learning_rate": 9.649157080133962e-06, |
|
"loss": 0.1923, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6406406406406406, |
|
"grad_norm": 3.414988222682504, |
|
"learning_rate": 9.614025209023084e-06, |
|
"loss": 0.1945, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6406406406406406, |
|
"eval_loss": 0.19540859758853912, |
|
"eval_runtime": 4.7637, |
|
"eval_samples_per_second": 12.595, |
|
"eval_steps_per_second": 6.298, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.6566566566566566, |
|
"grad_norm": 1.9420429807772246, |
|
"learning_rate": 9.577287569396632e-06, |
|
"loss": 0.1894, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.6726726726726727, |
|
"grad_norm": 1.3642644226107268, |
|
"learning_rate": 9.538956946651816e-06, |
|
"loss": 0.1782, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.6886886886886887, |
|
"grad_norm": 1.6342771748206477, |
|
"learning_rate": 9.499046680574267e-06, |
|
"loss": 0.1723, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.7047047047047047, |
|
"grad_norm": 1.487958949075534, |
|
"learning_rate": 9.457570660695542e-06, |
|
"loss": 0.1785, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.7207207207207207, |
|
"grad_norm": 1.7299656441999525, |
|
"learning_rate": 9.41454332145928e-06, |
|
"loss": 0.1767, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7367367367367368, |
|
"grad_norm": 1.4998181975998246, |
|
"learning_rate": 9.369979637197774e-06, |
|
"loss": 0.1738, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.7527527527527528, |
|
"grad_norm": 1.5911390545676716, |
|
"learning_rate": 9.323895116920591e-06, |
|
"loss": 0.1688, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.7687687687687688, |
|
"grad_norm": 2.394239823663753, |
|
"learning_rate": 9.27630579891716e-06, |
|
"loss": 0.1628, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.7847847847847848, |
|
"grad_norm": 1.542509136620149, |
|
"learning_rate": 9.227228245175127e-06, |
|
"loss": 0.1676, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.8008008008008008, |
|
"grad_norm": 1.5343832382043492, |
|
"learning_rate": 9.176679535616477e-06, |
|
"loss": 0.1592, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.8008008008008008, |
|
"eval_loss": 0.1710364669561386, |
|
"eval_runtime": 4.7555, |
|
"eval_samples_per_second": 12.617, |
|
"eval_steps_per_second": 6.309, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.8168168168168168, |
|
"grad_norm": 1.8010583431481462, |
|
"learning_rate": 9.124677262153405e-06, |
|
"loss": 0.1586, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.8328328328328328, |
|
"grad_norm": 1.6686707098067743, |
|
"learning_rate": 9.071239522565978e-06, |
|
"loss": 0.1556, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.8488488488488488, |
|
"grad_norm": 1.7504894762611782, |
|
"learning_rate": 9.016384914203771e-06, |
|
"loss": 0.1592, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.8648648648648649, |
|
"grad_norm": 1.4989615763117103, |
|
"learning_rate": 8.960132527513642e-06, |
|
"loss": 0.1616, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.8808808808808809, |
|
"grad_norm": 1.5542001700716142, |
|
"learning_rate": 8.902501939395887e-06, |
|
"loss": 0.155, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8968968968968969, |
|
"grad_norm": 2.502100279988363, |
|
"learning_rate": 8.8435132063911e-06, |
|
"loss": 0.1514, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.9129129129129129, |
|
"grad_norm": 1.5418563195158612, |
|
"learning_rate": 8.783186857700137e-06, |
|
"loss": 0.1455, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.928928928928929, |
|
"grad_norm": 1.6683319786348245, |
|
"learning_rate": 8.721543888039534e-06, |
|
"loss": 0.1417, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.944944944944945, |
|
"grad_norm": 1.762643457697612, |
|
"learning_rate": 8.658605750334972e-06, |
|
"loss": 0.155, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.960960960960961, |
|
"grad_norm": 1.9599503664864883, |
|
"learning_rate": 8.594394348255239e-06, |
|
"loss": 0.1393, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.960960960960961, |
|
"eval_loss": 0.1448938250541687, |
|
"eval_runtime": 4.5683, |
|
"eval_samples_per_second": 13.134, |
|
"eval_steps_per_second": 6.567, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9769769769769769, |
|
"grad_norm": 1.9697402213513955, |
|
"learning_rate": 8.528932028589337e-06, |
|
"loss": 0.142, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.992992992992993, |
|
"grad_norm": 1.6768033303014018, |
|
"learning_rate": 8.462241573469378e-06, |
|
"loss": 0.1426, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.0092092092092093, |
|
"grad_norm": 1.5267241545230772, |
|
"learning_rate": 8.394346192441967e-06, |
|
"loss": 0.1415, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.0252252252252252, |
|
"grad_norm": 1.476716477059175, |
|
"learning_rate": 8.325269514390835e-06, |
|
"loss": 0.1176, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.0412412412412413, |
|
"grad_norm": 1.4540927119806955, |
|
"learning_rate": 8.255035579313545e-06, |
|
"loss": 0.1223, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.0572572572572572, |
|
"grad_norm": 1.5802819169958848, |
|
"learning_rate": 8.183668829955111e-06, |
|
"loss": 0.1117, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.0732732732732733, |
|
"grad_norm": 2.1509571510914203, |
|
"learning_rate": 8.111194103301461e-06, |
|
"loss": 0.1176, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.0892892892892894, |
|
"grad_norm": 1.4577026982979966, |
|
"learning_rate": 8.037636621935686e-06, |
|
"loss": 0.1193, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.1053053053053052, |
|
"grad_norm": 1.5430361342124643, |
|
"learning_rate": 7.96302198526011e-06, |
|
"loss": 0.1182, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.1213213213213213, |
|
"grad_norm": 1.5919162208902233, |
|
"learning_rate": 7.887376160587214e-06, |
|
"loss": 0.1138, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.1213213213213213, |
|
"eval_loss": 0.15331269800662994, |
|
"eval_runtime": 4.5367, |
|
"eval_samples_per_second": 13.225, |
|
"eval_steps_per_second": 6.613, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.1373373373373372, |
|
"grad_norm": 1.2773733225900368, |
|
"learning_rate": 7.810725474102504e-06, |
|
"loss": 0.1116, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.1533533533533533, |
|
"grad_norm": 1.3037345062586103, |
|
"learning_rate": 7.733096601702508e-06, |
|
"loss": 0.1123, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.1693693693693694, |
|
"grad_norm": 1.5584266021091908, |
|
"learning_rate": 7.654516559711053e-06, |
|
"loss": 0.1142, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.1853853853853853, |
|
"grad_norm": 1.5547636549777273, |
|
"learning_rate": 7.575012695477076e-06, |
|
"loss": 0.1095, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.2014014014014014, |
|
"grad_norm": 1.473482889988994, |
|
"learning_rate": 7.494612677857218e-06, |
|
"loss": 0.1091, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.2174174174174175, |
|
"grad_norm": 1.3570410291065769, |
|
"learning_rate": 7.413344487586542e-06, |
|
"loss": 0.1099, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.2334334334334334, |
|
"grad_norm": 1.5362278958719333, |
|
"learning_rate": 7.331236407540704e-06, |
|
"loss": 0.1137, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.2494494494494495, |
|
"grad_norm": 1.9191542465617197, |
|
"learning_rate": 7.248317012892969e-06, |
|
"loss": 0.1128, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.2654654654654656, |
|
"grad_norm": 1.3514066304735823, |
|
"learning_rate": 7.164615161169518e-06, |
|
"loss": 0.1087, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.2814814814814814, |
|
"grad_norm": 1.6846669149108675, |
|
"learning_rate": 7.080159982206471e-06, |
|
"loss": 0.1096, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.2814814814814814, |
|
"eval_loss": 0.140634223818779, |
|
"eval_runtime": 4.5862, |
|
"eval_samples_per_second": 13.083, |
|
"eval_steps_per_second": 6.541, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.2974974974974975, |
|
"grad_norm": 1.1919650111365638, |
|
"learning_rate": 6.994980868012151e-06, |
|
"loss": 0.1109, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.3135135135135134, |
|
"grad_norm": 1.6072997465464531, |
|
"learning_rate": 6.909107462538113e-06, |
|
"loss": 0.1104, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.3295295295295295, |
|
"grad_norm": 1.5714907447670126, |
|
"learning_rate": 6.822569651362475e-06, |
|
"loss": 0.1091, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.3455455455455456, |
|
"grad_norm": 1.3715345334587152, |
|
"learning_rate": 6.735397551289179e-06, |
|
"loss": 0.1072, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.3615615615615615, |
|
"grad_norm": 1.5656149539968518, |
|
"learning_rate": 6.647621499866762e-06, |
|
"loss": 0.1065, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.3775775775775776, |
|
"grad_norm": 1.747219292587951, |
|
"learning_rate": 6.5592720448303174e-06, |
|
"loss": 0.1049, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.3935935935935935, |
|
"grad_norm": 1.4629586813910707, |
|
"learning_rate": 6.470379933470296e-06, |
|
"loss": 0.1018, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.4096096096096096, |
|
"grad_norm": 2.030872539804225, |
|
"learning_rate": 6.380976101931879e-06, |
|
"loss": 0.1015, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.4256256256256257, |
|
"grad_norm": 1.229739648882705, |
|
"learning_rate": 6.291091664448589e-06, |
|
"loss": 0.1076, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.4416416416416418, |
|
"grad_norm": 1.584198326139697, |
|
"learning_rate": 6.200757902513962e-06, |
|
"loss": 0.1028, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.4416416416416418, |
|
"eval_loss": 0.12726753950119019, |
|
"eval_runtime": 4.6398, |
|
"eval_samples_per_second": 12.932, |
|
"eval_steps_per_second": 6.466, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.4576576576576576, |
|
"grad_norm": 1.053882152208791, |
|
"learning_rate": 6.11000625399499e-06, |
|
"loss": 0.1058, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.4736736736736737, |
|
"grad_norm": 1.5483751760051925, |
|
"learning_rate": 6.0188683021911394e-06, |
|
"loss": 0.1008, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.4896896896896896, |
|
"grad_norm": 1.5020771306479124, |
|
"learning_rate": 5.927375764842766e-06, |
|
"loss": 0.0986, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.5057057057057057, |
|
"grad_norm": 1.2818645291771074, |
|
"learning_rate": 5.835560483092743e-06, |
|
"loss": 0.1045, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.5217217217217218, |
|
"grad_norm": 1.5262879682098887, |
|
"learning_rate": 5.743454410405126e-06, |
|
"loss": 0.1008, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.5377377377377377, |
|
"grad_norm": 1.9445218478299358, |
|
"learning_rate": 5.651089601444752e-06, |
|
"loss": 0.0975, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.5537537537537538, |
|
"grad_norm": 1.153429430002744, |
|
"learning_rate": 5.558498200921597e-06, |
|
"loss": 0.103, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.5697697697697697, |
|
"grad_norm": 1.7142266460197206, |
|
"learning_rate": 5.465712432403812e-06, |
|
"loss": 0.1009, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.5857857857857858, |
|
"grad_norm": 1.0420472106372876, |
|
"learning_rate": 5.372764587103309e-06, |
|
"loss": 0.1026, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.6018018018018019, |
|
"grad_norm": 1.0933198718882033, |
|
"learning_rate": 5.279687012637798e-06, |
|
"loss": 0.0998, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.6018018018018019, |
|
"eval_loss": 0.12056411057710648, |
|
"eval_runtime": 4.6363, |
|
"eval_samples_per_second": 12.941, |
|
"eval_steps_per_second": 6.471, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.617817817817818, |
|
"grad_norm": 1.5249725941966104, |
|
"learning_rate": 5.186512101773206e-06, |
|
"loss": 0.0987, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.6338338338338338, |
|
"grad_norm": 1.1551410324839504, |
|
"learning_rate": 5.093272281150383e-06, |
|
"loss": 0.0967, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.6498498498498497, |
|
"grad_norm": 1.3764150156451525, |
|
"learning_rate": 5e-06, |
|
"loss": 0.102, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.6658658658658658, |
|
"grad_norm": 1.4315332194265298, |
|
"learning_rate": 4.906727718849619e-06, |
|
"loss": 0.0918, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.681881881881882, |
|
"grad_norm": 1.5641425623387613, |
|
"learning_rate": 4.813487898226794e-06, |
|
"loss": 0.0972, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.697897897897898, |
|
"grad_norm": 1.364206236390582, |
|
"learning_rate": 4.720312987362204e-06, |
|
"loss": 0.0963, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.713913913913914, |
|
"grad_norm": 1.2084858559030025, |
|
"learning_rate": 4.6272354128966924e-06, |
|
"loss": 0.0947, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.7299299299299298, |
|
"grad_norm": 1.369095339213906, |
|
"learning_rate": 4.534287567596189e-06, |
|
"loss": 0.1, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.7459459459459459, |
|
"grad_norm": 1.7200163939268696, |
|
"learning_rate": 4.441501799078405e-06, |
|
"loss": 0.0939, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.761961961961962, |
|
"grad_norm": 1.0601289469307587, |
|
"learning_rate": 4.348910398555249e-06, |
|
"loss": 0.0952, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.761961961961962, |
|
"eval_loss": 0.12079311162233353, |
|
"eval_runtime": 4.5849, |
|
"eval_samples_per_second": 13.087, |
|
"eval_steps_per_second": 6.543, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.777977977977978, |
|
"grad_norm": 1.3096671495162804, |
|
"learning_rate": 4.2565455895948745e-06, |
|
"loss": 0.095, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.793993993993994, |
|
"grad_norm": 1.6474183248144982, |
|
"learning_rate": 4.164439516907258e-06, |
|
"loss": 0.096, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.81001001001001, |
|
"grad_norm": 1.2638496884017785, |
|
"learning_rate": 4.072624235157234e-06, |
|
"loss": 0.0932, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.826026026026026, |
|
"grad_norm": 1.1666611117167804, |
|
"learning_rate": 3.981131697808862e-06, |
|
"loss": 0.0961, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.842042042042042, |
|
"grad_norm": 1.4101778997279006, |
|
"learning_rate": 3.889993746005011e-06, |
|
"loss": 0.0938, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.8580580580580581, |
|
"grad_norm": 1.4720670252418966, |
|
"learning_rate": 3.799242097486038e-06, |
|
"loss": 0.0958, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.8740740740740742, |
|
"grad_norm": 1.9229886608107283, |
|
"learning_rate": 3.708908335551412e-06, |
|
"loss": 0.0972, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.89009009009009, |
|
"grad_norm": 1.5097045011947579, |
|
"learning_rate": 3.6190238980681235e-06, |
|
"loss": 0.0931, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.906106106106106, |
|
"grad_norm": 1.3032785079127815, |
|
"learning_rate": 3.529620066529704e-06, |
|
"loss": 0.0925, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.922122122122122, |
|
"grad_norm": 1.158235321697393, |
|
"learning_rate": 3.4407279551696846e-06, |
|
"loss": 0.092, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.922122122122122, |
|
"eval_loss": 0.12118110805749893, |
|
"eval_runtime": 4.5355, |
|
"eval_samples_per_second": 13.229, |
|
"eval_steps_per_second": 6.614, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.9381381381381382, |
|
"grad_norm": 1.3273343135981375, |
|
"learning_rate": 3.352378500133239e-06, |
|
"loss": 0.0912, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.9541541541541543, |
|
"grad_norm": 1.2638328817270799, |
|
"learning_rate": 3.264602448710822e-06, |
|
"loss": 0.092, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.9701701701701702, |
|
"grad_norm": 1.0061664501466365, |
|
"learning_rate": 3.177430348637527e-06, |
|
"loss": 0.0913, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.986186186186186, |
|
"grad_norm": 1.3814485492365947, |
|
"learning_rate": 3.090892537461889e-06, |
|
"loss": 0.0874, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.0024024024024025, |
|
"grad_norm": 1.4890736474425244, |
|
"learning_rate": 3.00501913198785e-06, |
|
"loss": 0.0869, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.0184184184184186, |
|
"grad_norm": 1.143647977026878, |
|
"learning_rate": 2.9198400177935303e-06, |
|
"loss": 0.0629, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.0344344344344343, |
|
"grad_norm": 1.0231219995939023, |
|
"learning_rate": 2.835384838830481e-06, |
|
"loss": 0.0638, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.0504504504504504, |
|
"grad_norm": 1.09838986733996, |
|
"learning_rate": 2.7516829871070295e-06, |
|
"loss": 0.0588, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.0664664664664665, |
|
"grad_norm": 1.1570777805297492, |
|
"learning_rate": 2.668763592459297e-06, |
|
"loss": 0.0595, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.0824824824824826, |
|
"grad_norm": 0.9848777368332091, |
|
"learning_rate": 2.586655512413458e-06, |
|
"loss": 0.061, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.0824824824824826, |
|
"eval_loss": 0.1126304417848587, |
|
"eval_runtime": 4.4139, |
|
"eval_samples_per_second": 13.594, |
|
"eval_steps_per_second": 6.797, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.0984984984984987, |
|
"grad_norm": 1.0676656917105798, |
|
"learning_rate": 2.505387322142782e-06, |
|
"loss": 0.0612, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.1145145145145143, |
|
"grad_norm": 1.05569010042999, |
|
"learning_rate": 2.4249873045229244e-06, |
|
"loss": 0.0607, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.1305305305305304, |
|
"grad_norm": 0.9996457001160142, |
|
"learning_rate": 2.345483440288947e-06, |
|
"loss": 0.06, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.1465465465465465, |
|
"grad_norm": 1.07628545223274, |
|
"learning_rate": 2.2669033982974946e-06, |
|
"loss": 0.0588, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.1625625625625626, |
|
"grad_norm": 1.07905549545333, |
|
"learning_rate": 2.189274525897498e-06, |
|
"loss": 0.0636, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.1785785785785787, |
|
"grad_norm": 1.2549844294011818, |
|
"learning_rate": 2.1126238394127868e-06, |
|
"loss": 0.0607, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.1945945945945944, |
|
"grad_norm": 1.0059915885008073, |
|
"learning_rate": 2.03697801473989e-06, |
|
"loss": 0.062, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.2106106106106105, |
|
"grad_norm": 1.274382465169598, |
|
"learning_rate": 1.962363378064316e-06, |
|
"loss": 0.0633, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.2266266266266266, |
|
"grad_norm": 1.281532287957869, |
|
"learning_rate": 1.8888058966985407e-06, |
|
"loss": 0.0633, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.2426426426426427, |
|
"grad_norm": 1.1922439699801886, |
|
"learning_rate": 1.8163311700448899e-06, |
|
"loss": 0.0624, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.2426426426426427, |
|
"eval_loss": 0.1149037629365921, |
|
"eval_runtime": 4.4069, |
|
"eval_samples_per_second": 13.615, |
|
"eval_steps_per_second": 6.808, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.258658658658659, |
|
"grad_norm": 1.7050676523518535, |
|
"learning_rate": 1.7449644206864564e-06, |
|
"loss": 0.0623, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.2746746746746744, |
|
"grad_norm": 0.9435102102997306, |
|
"learning_rate": 1.6747304856091662e-06, |
|
"loss": 0.059, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.2906906906906905, |
|
"grad_norm": 1.0831267008587788, |
|
"learning_rate": 1.6056538075580342e-06, |
|
"loss": 0.0627, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.3067067067067066, |
|
"grad_norm": 1.08268476394283, |
|
"learning_rate": 1.5377584265306222e-06, |
|
"loss": 0.0593, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.3227227227227227, |
|
"grad_norm": 0.9770773292942597, |
|
"learning_rate": 1.4710679714106635e-06, |
|
"loss": 0.0583, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.338738738738739, |
|
"grad_norm": 1.3751312648199294, |
|
"learning_rate": 1.4056056517447637e-06, |
|
"loss": 0.0626, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.354754754754755, |
|
"grad_norm": 1.071282141200762, |
|
"learning_rate": 1.3413942496650301e-06, |
|
"loss": 0.0598, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.3707707707707706, |
|
"grad_norm": 0.9441682147431711, |
|
"learning_rate": 1.2784561119604683e-06, |
|
"loss": 0.0609, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.3867867867867867, |
|
"grad_norm": 1.2531258809500883, |
|
"learning_rate": 1.2168131422998653e-06, |
|
"loss": 0.0618, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.402802802802803, |
|
"grad_norm": 0.988649947120785, |
|
"learning_rate": 1.156486793608899e-06, |
|
"loss": 0.0607, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.402802802802803, |
|
"eval_loss": 0.10994044691324234, |
|
"eval_runtime": 4.4237, |
|
"eval_samples_per_second": 13.563, |
|
"eval_steps_per_second": 6.782, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.418818818818819, |
|
"grad_norm": 1.1998096160223135, |
|
"learning_rate": 1.0974980606041152e-06, |
|
"loss": 0.0594, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.434834834834835, |
|
"grad_norm": 0.9677720356738567, |
|
"learning_rate": 1.0398674724863584e-06, |
|
"loss": 0.059, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.450850850850851, |
|
"grad_norm": 1.3141425510127478, |
|
"learning_rate": 9.836150857962296e-07, |
|
"loss": 0.0596, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.4668668668668667, |
|
"grad_norm": 0.8985622286196838, |
|
"learning_rate": 9.287604774340236e-07, |
|
"loss": 0.0581, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.482882882882883, |
|
"grad_norm": 1.2915272829435755, |
|
"learning_rate": 8.753227378465956e-07, |
|
"loss": 0.0589, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.498898898898899, |
|
"grad_norm": 1.051170086009324, |
|
"learning_rate": 8.233204643835235e-07, |
|
"loss": 0.0585, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.514914914914915, |
|
"grad_norm": 1.1697265988828673, |
|
"learning_rate": 7.72771754824877e-07, |
|
"loss": 0.0627, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.530930930930931, |
|
"grad_norm": 1.045582836173671, |
|
"learning_rate": 7.23694201082843e-07, |
|
"loss": 0.0594, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.546946946946947, |
|
"grad_norm": 1.0460247904446258, |
|
"learning_rate": 6.761048830794098e-07, |
|
"loss": 0.0583, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.562962962962963, |
|
"grad_norm": 1.0766438599871688, |
|
"learning_rate": 6.300203628022272e-07, |
|
"loss": 0.0584, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.562962962962963, |
|
"eval_loss": 0.1075584813952446, |
|
"eval_runtime": 4.7311, |
|
"eval_samples_per_second": 12.682, |
|
"eval_steps_per_second": 6.341, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.578978978978979, |
|
"grad_norm": 1.0644137083005472, |
|
"learning_rate": 5.854566785407212e-07, |
|
"loss": 0.0615, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.594994994994995, |
|
"grad_norm": 0.8862800692246755, |
|
"learning_rate": 5.42429339304461e-07, |
|
"loss": 0.0589, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.611011011011011, |
|
"grad_norm": 1.0632023267290576, |
|
"learning_rate": 5.009533194257332e-07, |
|
"loss": 0.0576, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.627027027027027, |
|
"grad_norm": 1.0612601719189565, |
|
"learning_rate": 4.6104305334818577e-07, |
|
"loss": 0.0581, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.643043043043043, |
|
"grad_norm": 1.1363089897778307, |
|
"learning_rate": 4.2271243060336976e-07, |
|
"loss": 0.0565, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.659059059059059, |
|
"grad_norm": 1.0581328451419771, |
|
"learning_rate": 3.8597479097691626e-07, |
|
"loss": 0.06, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.675075075075075, |
|
"grad_norm": 1.0292294546952114, |
|
"learning_rate": 3.508429198660379e-07, |
|
"loss": 0.0592, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.6910910910910912, |
|
"grad_norm": 1.1041696576832245, |
|
"learning_rate": 3.1732904382996975e-07, |
|
"loss": 0.0531, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.707107107107107, |
|
"grad_norm": 1.1017298827099373, |
|
"learning_rate": 2.854448263348891e-07, |
|
"loss": 0.0592, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.723123123123123, |
|
"grad_norm": 1.2373658193384414, |
|
"learning_rate": 2.5520136369481194e-07, |
|
"loss": 0.0574, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.723123123123123, |
|
"eval_loss": 0.10619153082370758, |
|
"eval_runtime": 4.4379, |
|
"eval_samples_per_second": 13.52, |
|
"eval_steps_per_second": 6.76, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.739139139139139, |
|
"grad_norm": 1.0150510884990294, |
|
"learning_rate": 2.266091812098642e-07, |
|
"loss": 0.0581, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.755155155155155, |
|
"grad_norm": 1.4492779833818328, |
|
"learning_rate": 1.9967822950327453e-07, |
|
"loss": 0.0609, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.7711711711711713, |
|
"grad_norm": 1.104287468688097, |
|
"learning_rate": 1.7441788105837133e-07, |
|
"loss": 0.0593, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.787187187187187, |
|
"grad_norm": 1.0784218287885254, |
|
"learning_rate": 1.508369269567783e-07, |
|
"loss": 0.0606, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.803203203203203, |
|
"grad_norm": 1.2207885496699904, |
|
"learning_rate": 1.2894357381894984e-07, |
|
"loss": 0.0564, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.819219219219219, |
|
"grad_norm": 0.8638978123535029, |
|
"learning_rate": 1.0874544094811424e-07, |
|
"loss": 0.0586, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.8352352352352352, |
|
"grad_norm": 1.1498833578746668, |
|
"learning_rate": 9.024955767861054e-08, |
|
"loss": 0.0629, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.8512512512512513, |
|
"grad_norm": 0.9945976550163285, |
|
"learning_rate": 7.346236092954318e-08, |
|
"loss": 0.0562, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.867267267267267, |
|
"grad_norm": 0.9949658942902132, |
|
"learning_rate": 5.838969296461605e-08, |
|
"loss": 0.0583, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.8832832832832835, |
|
"grad_norm": 1.0428607480343814, |
|
"learning_rate": 4.50367993589107e-08, |
|
"loss": 0.0571, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.8832832832832835, |
|
"eval_loss": 0.10513312369585037, |
|
"eval_runtime": 4.4044, |
|
"eval_samples_per_second": 13.623, |
|
"eval_steps_per_second": 6.811, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.899299299299299, |
|
"grad_norm": 1.0170732585763345, |
|
"learning_rate": 3.340832717332765e-08, |
|
"loss": 0.0605, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.9153153153153153, |
|
"grad_norm": 0.9518652403398978, |
|
"learning_rate": 2.3508323337321225e-08, |
|
"loss": 0.0567, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.9313313313313314, |
|
"grad_norm": 1.0033269558751705, |
|
"learning_rate": 1.534023324049061e-08, |
|
"loss": 0.0564, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.9473473473473475, |
|
"grad_norm": 1.1693211443264353, |
|
"learning_rate": 8.906899533517866e-09, |
|
"loss": 0.0589, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.9633633633633636, |
|
"grad_norm": 1.2335037547499492, |
|
"learning_rate": 4.210561138873193e-09, |
|
"loss": 0.0576, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.9793793793793792, |
|
"grad_norm": 1.1025808173861562, |
|
"learning_rate": 1.2528524716259872e-09, |
|
"loss": 0.0548, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.9953953953953953, |
|
"grad_norm": 0.9062323167927434, |
|
"learning_rate": 3.480287063706289e-11, |
|
"loss": 0.0587, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.9985985985985986, |
|
"step": 1872, |
|
"total_flos": 222868335149056.0, |
|
"train_loss": 0.15247767985376537, |
|
"train_runtime": 21883.2415, |
|
"train_samples_per_second": 8.217, |
|
"train_steps_per_second": 0.086 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1872, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 222868335149056.0, |
|
"train_batch_size": 3, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|