Meta-chunker-1.5B-60K / trainer_state.json
{
"best_metric": 0.10513312369585037,
"best_model_checkpoint": "/users/u2023000898/train_moe/pretrain_data_chunk/60000_3/checkpoint-1800",
"epoch": 2.9985985985985986,
"eval_steps": 100,
"global_step": 1872,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016016016016016016,
"grad_norm": 13.409518938793026,
"learning_rate": 5.319148936170213e-07,
"loss": 1.2929,
"step": 10
},
{
"epoch": 0.03203203203203203,
"grad_norm": 10.369912201219414,
"learning_rate": 1.0638297872340427e-06,
"loss": 1.2223,
"step": 20
},
{
"epoch": 0.04804804804804805,
"grad_norm": 5.44261186688239,
"learning_rate": 1.595744680851064e-06,
"loss": 0.943,
"step": 30
},
{
"epoch": 0.06406406406406406,
"grad_norm": 3.605857199785854,
"learning_rate": 2.1276595744680853e-06,
"loss": 0.7396,
"step": 40
},
{
"epoch": 0.08008008008008008,
"grad_norm": 1.9924246604906095,
"learning_rate": 2.6595744680851065e-06,
"loss": 0.6245,
"step": 50
},
{
"epoch": 0.0960960960960961,
"grad_norm": 2.3307973177641075,
"learning_rate": 3.191489361702128e-06,
"loss": 0.5384,
"step": 60
},
{
"epoch": 0.11211211211211211,
"grad_norm": 2.417305413795213,
"learning_rate": 3.723404255319149e-06,
"loss": 0.4779,
"step": 70
},
{
"epoch": 0.12812812812812813,
"grad_norm": 2.4395863243528044,
"learning_rate": 4.255319148936171e-06,
"loss": 0.4384,
"step": 80
},
{
"epoch": 0.14414414414414414,
"grad_norm": 2.967607994777358,
"learning_rate": 4.787234042553192e-06,
"loss": 0.4194,
"step": 90
},
{
"epoch": 0.16016016016016016,
"grad_norm": 1.8226498622217497,
"learning_rate": 5.319148936170213e-06,
"loss": 0.3881,
"step": 100
},
{
"epoch": 0.16016016016016016,
"eval_loss": 0.38211190700531006,
"eval_runtime": 5.3031,
"eval_samples_per_second": 11.314,
"eval_steps_per_second": 5.657,
"step": 100
},
{
"epoch": 0.17617617617617617,
"grad_norm": 2.0417129648638404,
"learning_rate": 5.851063829787235e-06,
"loss": 0.3735,
"step": 110
},
{
"epoch": 0.1921921921921922,
"grad_norm": 1.8910151101905892,
"learning_rate": 6.382978723404256e-06,
"loss": 0.3597,
"step": 120
},
{
"epoch": 0.2082082082082082,
"grad_norm": 2.356033958506011,
"learning_rate": 6.914893617021278e-06,
"loss": 0.3451,
"step": 130
},
{
"epoch": 0.22422422422422422,
"grad_norm": 1.6182433590940526,
"learning_rate": 7.446808510638298e-06,
"loss": 0.3299,
"step": 140
},
{
"epoch": 0.24024024024024024,
"grad_norm": 1.5854757979381888,
"learning_rate": 7.97872340425532e-06,
"loss": 0.3224,
"step": 150
},
{
"epoch": 0.25625625625625625,
"grad_norm": 1.6564206751789106,
"learning_rate": 8.510638297872341e-06,
"loss": 0.3077,
"step": 160
},
{
"epoch": 0.2722722722722723,
"grad_norm": 1.647017991315564,
"learning_rate": 9.042553191489362e-06,
"loss": 0.3017,
"step": 170
},
{
"epoch": 0.2882882882882883,
"grad_norm": 2.0446337917200434,
"learning_rate": 9.574468085106385e-06,
"loss": 0.2859,
"step": 180
},
{
"epoch": 0.30430430430430433,
"grad_norm": 1.434904669145637,
"learning_rate": 9.999965197129365e-06,
"loss": 0.2877,
"step": 190
},
{
"epoch": 0.3203203203203203,
"grad_norm": 2.9106697114797573,
"learning_rate": 9.998747147528375e-06,
"loss": 0.2773,
"step": 200
},
{
"epoch": 0.3203203203203203,
"eval_loss": 0.2963399589061737,
"eval_runtime": 4.7335,
"eval_samples_per_second": 12.676,
"eval_steps_per_second": 6.338,
"step": 200
},
{
"epoch": 0.33633633633633636,
"grad_norm": 2.310160659638816,
"learning_rate": 9.995789438861128e-06,
"loss": 0.2793,
"step": 210
},
{
"epoch": 0.35235235235235235,
"grad_norm": 1.6140139693791007,
"learning_rate": 9.991093100466482e-06,
"loss": 0.2678,
"step": 220
},
{
"epoch": 0.3683683683683684,
"grad_norm": 1.6727704090132902,
"learning_rate": 9.98465976675951e-06,
"loss": 0.2633,
"step": 230
},
{
"epoch": 0.3843843843843844,
"grad_norm": 1.6386702854417348,
"learning_rate": 9.976491676662679e-06,
"loss": 0.2556,
"step": 240
},
{
"epoch": 0.4004004004004004,
"grad_norm": 1.5318495172198847,
"learning_rate": 9.966591672826674e-06,
"loss": 0.2501,
"step": 250
},
{
"epoch": 0.4164164164164164,
"grad_norm": 1.4131958267494484,
"learning_rate": 9.95496320064109e-06,
"loss": 0.239,
"step": 260
},
{
"epoch": 0.43243243243243246,
"grad_norm": 2.116850571157825,
"learning_rate": 9.941610307035385e-06,
"loss": 0.2352,
"step": 270
},
{
"epoch": 0.44844844844844844,
"grad_norm": 1.4833918481789057,
"learning_rate": 9.926537639070457e-06,
"loss": 0.2343,
"step": 280
},
{
"epoch": 0.4644644644644645,
"grad_norm": 1.4499258247835383,
"learning_rate": 9.90975044232139e-06,
"loss": 0.2246,
"step": 290
},
{
"epoch": 0.4804804804804805,
"grad_norm": 1.5412749043910834,
"learning_rate": 9.891254559051886e-06,
"loss": 0.2209,
"step": 300
},
{
"epoch": 0.4804804804804805,
"eval_loss": 0.22802673280239105,
"eval_runtime": 4.7226,
"eval_samples_per_second": 12.705,
"eval_steps_per_second": 6.352,
"step": 300
},
{
"epoch": 0.4964964964964965,
"grad_norm": 1.6016396794420027,
"learning_rate": 9.871056426181052e-06,
"loss": 0.2206,
"step": 310
},
{
"epoch": 0.5125125125125125,
"grad_norm": 1.4910134315726407,
"learning_rate": 9.849163073043223e-06,
"loss": 0.2229,
"step": 320
},
{
"epoch": 0.5285285285285285,
"grad_norm": 1.5947649628687839,
"learning_rate": 9.82558211894163e-06,
"loss": 0.206,
"step": 330
},
{
"epoch": 0.5445445445445446,
"grad_norm": 2.281846286082721,
"learning_rate": 9.800321770496726e-06,
"loss": 0.2106,
"step": 340
},
{
"epoch": 0.5605605605605606,
"grad_norm": 1.628823704341528,
"learning_rate": 9.773390818790136e-06,
"loss": 0.2049,
"step": 350
},
{
"epoch": 0.5765765765765766,
"grad_norm": 1.8232097235612017,
"learning_rate": 9.744798636305189e-06,
"loss": 0.2045,
"step": 360
},
{
"epoch": 0.5925925925925926,
"grad_norm": 1.880889712421178,
"learning_rate": 9.714555173665112e-06,
"loss": 0.1948,
"step": 370
},
{
"epoch": 0.6086086086086087,
"grad_norm": 1.8941186642260008,
"learning_rate": 9.68267095617003e-06,
"loss": 0.1947,
"step": 380
},
{
"epoch": 0.6246246246246246,
"grad_norm": 1.6000201393992286,
"learning_rate": 9.649157080133962e-06,
"loss": 0.1923,
"step": 390
},
{
"epoch": 0.6406406406406406,
"grad_norm": 3.414988222682504,
"learning_rate": 9.614025209023084e-06,
"loss": 0.1945,
"step": 400
},
{
"epoch": 0.6406406406406406,
"eval_loss": 0.19540859758853912,
"eval_runtime": 4.7637,
"eval_samples_per_second": 12.595,
"eval_steps_per_second": 6.298,
"step": 400
},
{
"epoch": 0.6566566566566566,
"grad_norm": 1.9420429807772246,
"learning_rate": 9.577287569396632e-06,
"loss": 0.1894,
"step": 410
},
{
"epoch": 0.6726726726726727,
"grad_norm": 1.3642644226107268,
"learning_rate": 9.538956946651816e-06,
"loss": 0.1782,
"step": 420
},
{
"epoch": 0.6886886886886887,
"grad_norm": 1.6342771748206477,
"learning_rate": 9.499046680574267e-06,
"loss": 0.1723,
"step": 430
},
{
"epoch": 0.7047047047047047,
"grad_norm": 1.487958949075534,
"learning_rate": 9.457570660695542e-06,
"loss": 0.1785,
"step": 440
},
{
"epoch": 0.7207207207207207,
"grad_norm": 1.7299656441999525,
"learning_rate": 9.41454332145928e-06,
"loss": 0.1767,
"step": 450
},
{
"epoch": 0.7367367367367368,
"grad_norm": 1.4998181975998246,
"learning_rate": 9.369979637197774e-06,
"loss": 0.1738,
"step": 460
},
{
"epoch": 0.7527527527527528,
"grad_norm": 1.5911390545676716,
"learning_rate": 9.323895116920591e-06,
"loss": 0.1688,
"step": 470
},
{
"epoch": 0.7687687687687688,
"grad_norm": 2.394239823663753,
"learning_rate": 9.27630579891716e-06,
"loss": 0.1628,
"step": 480
},
{
"epoch": 0.7847847847847848,
"grad_norm": 1.542509136620149,
"learning_rate": 9.227228245175127e-06,
"loss": 0.1676,
"step": 490
},
{
"epoch": 0.8008008008008008,
"grad_norm": 1.5343832382043492,
"learning_rate": 9.176679535616477e-06,
"loss": 0.1592,
"step": 500
},
{
"epoch": 0.8008008008008008,
"eval_loss": 0.1710364669561386,
"eval_runtime": 4.7555,
"eval_samples_per_second": 12.617,
"eval_steps_per_second": 6.309,
"step": 500
},
{
"epoch": 0.8168168168168168,
"grad_norm": 1.8010583431481462,
"learning_rate": 9.124677262153405e-06,
"loss": 0.1586,
"step": 510
},
{
"epoch": 0.8328328328328328,
"grad_norm": 1.6686707098067743,
"learning_rate": 9.071239522565978e-06,
"loss": 0.1556,
"step": 520
},
{
"epoch": 0.8488488488488488,
"grad_norm": 1.7504894762611782,
"learning_rate": 9.016384914203771e-06,
"loss": 0.1592,
"step": 530
},
{
"epoch": 0.8648648648648649,
"grad_norm": 1.4989615763117103,
"learning_rate": 8.960132527513642e-06,
"loss": 0.1616,
"step": 540
},
{
"epoch": 0.8808808808808809,
"grad_norm": 1.5542001700716142,
"learning_rate": 8.902501939395887e-06,
"loss": 0.155,
"step": 550
},
{
"epoch": 0.8968968968968969,
"grad_norm": 2.502100279988363,
"learning_rate": 8.8435132063911e-06,
"loss": 0.1514,
"step": 560
},
{
"epoch": 0.9129129129129129,
"grad_norm": 1.5418563195158612,
"learning_rate": 8.783186857700137e-06,
"loss": 0.1455,
"step": 570
},
{
"epoch": 0.928928928928929,
"grad_norm": 1.6683319786348245,
"learning_rate": 8.721543888039534e-06,
"loss": 0.1417,
"step": 580
},
{
"epoch": 0.944944944944945,
"grad_norm": 1.762643457697612,
"learning_rate": 8.658605750334972e-06,
"loss": 0.155,
"step": 590
},
{
"epoch": 0.960960960960961,
"grad_norm": 1.9599503664864883,
"learning_rate": 8.594394348255239e-06,
"loss": 0.1393,
"step": 600
},
{
"epoch": 0.960960960960961,
"eval_loss": 0.1448938250541687,
"eval_runtime": 4.5683,
"eval_samples_per_second": 13.134,
"eval_steps_per_second": 6.567,
"step": 600
},
{
"epoch": 0.9769769769769769,
"grad_norm": 1.9697402213513955,
"learning_rate": 8.528932028589337e-06,
"loss": 0.142,
"step": 610
},
{
"epoch": 0.992992992992993,
"grad_norm": 1.6768033303014018,
"learning_rate": 8.462241573469378e-06,
"loss": 0.1426,
"step": 620
},
{
"epoch": 1.0092092092092093,
"grad_norm": 1.5267241545230772,
"learning_rate": 8.394346192441967e-06,
"loss": 0.1415,
"step": 630
},
{
"epoch": 1.0252252252252252,
"grad_norm": 1.476716477059175,
"learning_rate": 8.325269514390835e-06,
"loss": 0.1176,
"step": 640
},
{
"epoch": 1.0412412412412413,
"grad_norm": 1.4540927119806955,
"learning_rate": 8.255035579313545e-06,
"loss": 0.1223,
"step": 650
},
{
"epoch": 1.0572572572572572,
"grad_norm": 1.5802819169958848,
"learning_rate": 8.183668829955111e-06,
"loss": 0.1117,
"step": 660
},
{
"epoch": 1.0732732732732733,
"grad_norm": 2.1509571510914203,
"learning_rate": 8.111194103301461e-06,
"loss": 0.1176,
"step": 670
},
{
"epoch": 1.0892892892892894,
"grad_norm": 1.4577026982979966,
"learning_rate": 8.037636621935686e-06,
"loss": 0.1193,
"step": 680
},
{
"epoch": 1.1053053053053052,
"grad_norm": 1.5430361342124643,
"learning_rate": 7.96302198526011e-06,
"loss": 0.1182,
"step": 690
},
{
"epoch": 1.1213213213213213,
"grad_norm": 1.5919162208902233,
"learning_rate": 7.887376160587214e-06,
"loss": 0.1138,
"step": 700
},
{
"epoch": 1.1213213213213213,
"eval_loss": 0.15331269800662994,
"eval_runtime": 4.5367,
"eval_samples_per_second": 13.225,
"eval_steps_per_second": 6.613,
"step": 700
},
{
"epoch": 1.1373373373373372,
"grad_norm": 1.2773733225900368,
"learning_rate": 7.810725474102504e-06,
"loss": 0.1116,
"step": 710
},
{
"epoch": 1.1533533533533533,
"grad_norm": 1.3037345062586103,
"learning_rate": 7.733096601702508e-06,
"loss": 0.1123,
"step": 720
},
{
"epoch": 1.1693693693693694,
"grad_norm": 1.5584266021091908,
"learning_rate": 7.654516559711053e-06,
"loss": 0.1142,
"step": 730
},
{
"epoch": 1.1853853853853853,
"grad_norm": 1.5547636549777273,
"learning_rate": 7.575012695477076e-06,
"loss": 0.1095,
"step": 740
},
{
"epoch": 1.2014014014014014,
"grad_norm": 1.473482889988994,
"learning_rate": 7.494612677857218e-06,
"loss": 0.1091,
"step": 750
},
{
"epoch": 1.2174174174174175,
"grad_norm": 1.3570410291065769,
"learning_rate": 7.413344487586542e-06,
"loss": 0.1099,
"step": 760
},
{
"epoch": 1.2334334334334334,
"grad_norm": 1.5362278958719333,
"learning_rate": 7.331236407540704e-06,
"loss": 0.1137,
"step": 770
},
{
"epoch": 1.2494494494494495,
"grad_norm": 1.9191542465617197,
"learning_rate": 7.248317012892969e-06,
"loss": 0.1128,
"step": 780
},
{
"epoch": 1.2654654654654656,
"grad_norm": 1.3514066304735823,
"learning_rate": 7.164615161169518e-06,
"loss": 0.1087,
"step": 790
},
{
"epoch": 1.2814814814814814,
"grad_norm": 1.6846669149108675,
"learning_rate": 7.080159982206471e-06,
"loss": 0.1096,
"step": 800
},
{
"epoch": 1.2814814814814814,
"eval_loss": 0.140634223818779,
"eval_runtime": 4.5862,
"eval_samples_per_second": 13.083,
"eval_steps_per_second": 6.541,
"step": 800
},
{
"epoch": 1.2974974974974975,
"grad_norm": 1.1919650111365638,
"learning_rate": 6.994980868012151e-06,
"loss": 0.1109,
"step": 810
},
{
"epoch": 1.3135135135135134,
"grad_norm": 1.6072997465464531,
"learning_rate": 6.909107462538113e-06,
"loss": 0.1104,
"step": 820
},
{
"epoch": 1.3295295295295295,
"grad_norm": 1.5714907447670126,
"learning_rate": 6.822569651362475e-06,
"loss": 0.1091,
"step": 830
},
{
"epoch": 1.3455455455455456,
"grad_norm": 1.3715345334587152,
"learning_rate": 6.735397551289179e-06,
"loss": 0.1072,
"step": 840
},
{
"epoch": 1.3615615615615615,
"grad_norm": 1.5656149539968518,
"learning_rate": 6.647621499866762e-06,
"loss": 0.1065,
"step": 850
},
{
"epoch": 1.3775775775775776,
"grad_norm": 1.747219292587951,
"learning_rate": 6.5592720448303174e-06,
"loss": 0.1049,
"step": 860
},
{
"epoch": 1.3935935935935935,
"grad_norm": 1.4629586813910707,
"learning_rate": 6.470379933470296e-06,
"loss": 0.1018,
"step": 870
},
{
"epoch": 1.4096096096096096,
"grad_norm": 2.030872539804225,
"learning_rate": 6.380976101931879e-06,
"loss": 0.1015,
"step": 880
},
{
"epoch": 1.4256256256256257,
"grad_norm": 1.229739648882705,
"learning_rate": 6.291091664448589e-06,
"loss": 0.1076,
"step": 890
},
{
"epoch": 1.4416416416416418,
"grad_norm": 1.584198326139697,
"learning_rate": 6.200757902513962e-06,
"loss": 0.1028,
"step": 900
},
{
"epoch": 1.4416416416416418,
"eval_loss": 0.12726753950119019,
"eval_runtime": 4.6398,
"eval_samples_per_second": 12.932,
"eval_steps_per_second": 6.466,
"step": 900
},
{
"epoch": 1.4576576576576576,
"grad_norm": 1.053882152208791,
"learning_rate": 6.11000625399499e-06,
"loss": 0.1058,
"step": 910
},
{
"epoch": 1.4736736736736737,
"grad_norm": 1.5483751760051925,
"learning_rate": 6.0188683021911394e-06,
"loss": 0.1008,
"step": 920
},
{
"epoch": 1.4896896896896896,
"grad_norm": 1.5020771306479124,
"learning_rate": 5.927375764842766e-06,
"loss": 0.0986,
"step": 930
},
{
"epoch": 1.5057057057057057,
"grad_norm": 1.2818645291771074,
"learning_rate": 5.835560483092743e-06,
"loss": 0.1045,
"step": 940
},
{
"epoch": 1.5217217217217218,
"grad_norm": 1.5262879682098887,
"learning_rate": 5.743454410405126e-06,
"loss": 0.1008,
"step": 950
},
{
"epoch": 1.5377377377377377,
"grad_norm": 1.9445218478299358,
"learning_rate": 5.651089601444752e-06,
"loss": 0.0975,
"step": 960
},
{
"epoch": 1.5537537537537538,
"grad_norm": 1.153429430002744,
"learning_rate": 5.558498200921597e-06,
"loss": 0.103,
"step": 970
},
{
"epoch": 1.5697697697697697,
"grad_norm": 1.7142266460197206,
"learning_rate": 5.465712432403812e-06,
"loss": 0.1009,
"step": 980
},
{
"epoch": 1.5857857857857858,
"grad_norm": 1.0420472106372876,
"learning_rate": 5.372764587103309e-06,
"loss": 0.1026,
"step": 990
},
{
"epoch": 1.6018018018018019,
"grad_norm": 1.0933198718882033,
"learning_rate": 5.279687012637798e-06,
"loss": 0.0998,
"step": 1000
},
{
"epoch": 1.6018018018018019,
"eval_loss": 0.12056411057710648,
"eval_runtime": 4.6363,
"eval_samples_per_second": 12.941,
"eval_steps_per_second": 6.471,
"step": 1000
},
{
"epoch": 1.617817817817818,
"grad_norm": 1.5249725941966104,
"learning_rate": 5.186512101773206e-06,
"loss": 0.0987,
"step": 1010
},
{
"epoch": 1.6338338338338338,
"grad_norm": 1.1551410324839504,
"learning_rate": 5.093272281150383e-06,
"loss": 0.0967,
"step": 1020
},
{
"epoch": 1.6498498498498497,
"grad_norm": 1.3764150156451525,
"learning_rate": 5e-06,
"loss": 0.102,
"step": 1030
},
{
"epoch": 1.6658658658658658,
"grad_norm": 1.4315332194265298,
"learning_rate": 4.906727718849619e-06,
"loss": 0.0918,
"step": 1040
},
{
"epoch": 1.681881881881882,
"grad_norm": 1.5641425623387613,
"learning_rate": 4.813487898226794e-06,
"loss": 0.0972,
"step": 1050
},
{
"epoch": 1.697897897897898,
"grad_norm": 1.364206236390582,
"learning_rate": 4.720312987362204e-06,
"loss": 0.0963,
"step": 1060
},
{
"epoch": 1.713913913913914,
"grad_norm": 1.2084858559030025,
"learning_rate": 4.6272354128966924e-06,
"loss": 0.0947,
"step": 1070
},
{
"epoch": 1.7299299299299298,
"grad_norm": 1.369095339213906,
"learning_rate": 4.534287567596189e-06,
"loss": 0.1,
"step": 1080
},
{
"epoch": 1.7459459459459459,
"grad_norm": 1.7200163939268696,
"learning_rate": 4.441501799078405e-06,
"loss": 0.0939,
"step": 1090
},
{
"epoch": 1.761961961961962,
"grad_norm": 1.0601289469307587,
"learning_rate": 4.348910398555249e-06,
"loss": 0.0952,
"step": 1100
},
{
"epoch": 1.761961961961962,
"eval_loss": 0.12079311162233353,
"eval_runtime": 4.5849,
"eval_samples_per_second": 13.087,
"eval_steps_per_second": 6.543,
"step": 1100
},
{
"epoch": 1.777977977977978,
"grad_norm": 1.3096671495162804,
"learning_rate": 4.2565455895948745e-06,
"loss": 0.095,
"step": 1110
},
{
"epoch": 1.793993993993994,
"grad_norm": 1.6474183248144982,
"learning_rate": 4.164439516907258e-06,
"loss": 0.096,
"step": 1120
},
{
"epoch": 1.81001001001001,
"grad_norm": 1.2638496884017785,
"learning_rate": 4.072624235157234e-06,
"loss": 0.0932,
"step": 1130
},
{
"epoch": 1.826026026026026,
"grad_norm": 1.1666611117167804,
"learning_rate": 3.981131697808862e-06,
"loss": 0.0961,
"step": 1140
},
{
"epoch": 1.842042042042042,
"grad_norm": 1.4101778997279006,
"learning_rate": 3.889993746005011e-06,
"loss": 0.0938,
"step": 1150
},
{
"epoch": 1.8580580580580581,
"grad_norm": 1.4720670252418966,
"learning_rate": 3.799242097486038e-06,
"loss": 0.0958,
"step": 1160
},
{
"epoch": 1.8740740740740742,
"grad_norm": 1.9229886608107283,
"learning_rate": 3.708908335551412e-06,
"loss": 0.0972,
"step": 1170
},
{
"epoch": 1.89009009009009,
"grad_norm": 1.5097045011947579,
"learning_rate": 3.6190238980681235e-06,
"loss": 0.0931,
"step": 1180
},
{
"epoch": 1.906106106106106,
"grad_norm": 1.3032785079127815,
"learning_rate": 3.529620066529704e-06,
"loss": 0.0925,
"step": 1190
},
{
"epoch": 1.922122122122122,
"grad_norm": 1.158235321697393,
"learning_rate": 3.4407279551696846e-06,
"loss": 0.092,
"step": 1200
},
{
"epoch": 1.922122122122122,
"eval_loss": 0.12118110805749893,
"eval_runtime": 4.5355,
"eval_samples_per_second": 13.229,
"eval_steps_per_second": 6.614,
"step": 1200
},
{
"epoch": 1.9381381381381382,
"grad_norm": 1.3273343135981375,
"learning_rate": 3.352378500133239e-06,
"loss": 0.0912,
"step": 1210
},
{
"epoch": 1.9541541541541543,
"grad_norm": 1.2638328817270799,
"learning_rate": 3.264602448710822e-06,
"loss": 0.092,
"step": 1220
},
{
"epoch": 1.9701701701701702,
"grad_norm": 1.0061664501466365,
"learning_rate": 3.177430348637527e-06,
"loss": 0.0913,
"step": 1230
},
{
"epoch": 1.986186186186186,
"grad_norm": 1.3814485492365947,
"learning_rate": 3.090892537461889e-06,
"loss": 0.0874,
"step": 1240
},
{
"epoch": 2.0024024024024025,
"grad_norm": 1.4890736474425244,
"learning_rate": 3.00501913198785e-06,
"loss": 0.0869,
"step": 1250
},
{
"epoch": 2.0184184184184186,
"grad_norm": 1.143647977026878,
"learning_rate": 2.9198400177935303e-06,
"loss": 0.0629,
"step": 1260
},
{
"epoch": 2.0344344344344343,
"grad_norm": 1.0231219995939023,
"learning_rate": 2.835384838830481e-06,
"loss": 0.0638,
"step": 1270
},
{
"epoch": 2.0504504504504504,
"grad_norm": 1.09838986733996,
"learning_rate": 2.7516829871070295e-06,
"loss": 0.0588,
"step": 1280
},
{
"epoch": 2.0664664664664665,
"grad_norm": 1.1570777805297492,
"learning_rate": 2.668763592459297e-06,
"loss": 0.0595,
"step": 1290
},
{
"epoch": 2.0824824824824826,
"grad_norm": 0.9848777368332091,
"learning_rate": 2.586655512413458e-06,
"loss": 0.061,
"step": 1300
},
{
"epoch": 2.0824824824824826,
"eval_loss": 0.1126304417848587,
"eval_runtime": 4.4139,
"eval_samples_per_second": 13.594,
"eval_steps_per_second": 6.797,
"step": 1300
},
{
"epoch": 2.0984984984984987,
"grad_norm": 1.0676656917105798,
"learning_rate": 2.505387322142782e-06,
"loss": 0.0612,
"step": 1310
},
{
"epoch": 2.1145145145145143,
"grad_norm": 1.05569010042999,
"learning_rate": 2.4249873045229244e-06,
"loss": 0.0607,
"step": 1320
},
{
"epoch": 2.1305305305305304,
"grad_norm": 0.9996457001160142,
"learning_rate": 2.345483440288947e-06,
"loss": 0.06,
"step": 1330
},
{
"epoch": 2.1465465465465465,
"grad_norm": 1.07628545223274,
"learning_rate": 2.2669033982974946e-06,
"loss": 0.0588,
"step": 1340
},
{
"epoch": 2.1625625625625626,
"grad_norm": 1.07905549545333,
"learning_rate": 2.189274525897498e-06,
"loss": 0.0636,
"step": 1350
},
{
"epoch": 2.1785785785785787,
"grad_norm": 1.2549844294011818,
"learning_rate": 2.1126238394127868e-06,
"loss": 0.0607,
"step": 1360
},
{
"epoch": 2.1945945945945944,
"grad_norm": 1.0059915885008073,
"learning_rate": 2.03697801473989e-06,
"loss": 0.062,
"step": 1370
},
{
"epoch": 2.2106106106106105,
"grad_norm": 1.274382465169598,
"learning_rate": 1.962363378064316e-06,
"loss": 0.0633,
"step": 1380
},
{
"epoch": 2.2266266266266266,
"grad_norm": 1.281532287957869,
"learning_rate": 1.8888058966985407e-06,
"loss": 0.0633,
"step": 1390
},
{
"epoch": 2.2426426426426427,
"grad_norm": 1.1922439699801886,
"learning_rate": 1.8163311700448899e-06,
"loss": 0.0624,
"step": 1400
},
{
"epoch": 2.2426426426426427,
"eval_loss": 0.1149037629365921,
"eval_runtime": 4.4069,
"eval_samples_per_second": 13.615,
"eval_steps_per_second": 6.808,
"step": 1400
},
{
"epoch": 2.258658658658659,
"grad_norm": 1.7050676523518535,
"learning_rate": 1.7449644206864564e-06,
"loss": 0.0623,
"step": 1410
},
{
"epoch": 2.2746746746746744,
"grad_norm": 0.9435102102997306,
"learning_rate": 1.6747304856091662e-06,
"loss": 0.059,
"step": 1420
},
{
"epoch": 2.2906906906906905,
"grad_norm": 1.0831267008587788,
"learning_rate": 1.6056538075580342e-06,
"loss": 0.0627,
"step": 1430
},
{
"epoch": 2.3067067067067066,
"grad_norm": 1.08268476394283,
"learning_rate": 1.5377584265306222e-06,
"loss": 0.0593,
"step": 1440
},
{
"epoch": 2.3227227227227227,
"grad_norm": 0.9770773292942597,
"learning_rate": 1.4710679714106635e-06,
"loss": 0.0583,
"step": 1450
},
{
"epoch": 2.338738738738739,
"grad_norm": 1.3751312648199294,
"learning_rate": 1.4056056517447637e-06,
"loss": 0.0626,
"step": 1460
},
{
"epoch": 2.354754754754755,
"grad_norm": 1.071282141200762,
"learning_rate": 1.3413942496650301e-06,
"loss": 0.0598,
"step": 1470
},
{
"epoch": 2.3707707707707706,
"grad_norm": 0.9441682147431711,
"learning_rate": 1.2784561119604683e-06,
"loss": 0.0609,
"step": 1480
},
{
"epoch": 2.3867867867867867,
"grad_norm": 1.2531258809500883,
"learning_rate": 1.2168131422998653e-06,
"loss": 0.0618,
"step": 1490
},
{
"epoch": 2.402802802802803,
"grad_norm": 0.988649947120785,
"learning_rate": 1.156486793608899e-06,
"loss": 0.0607,
"step": 1500
},
{
"epoch": 2.402802802802803,
"eval_loss": 0.10994044691324234,
"eval_runtime": 4.4237,
"eval_samples_per_second": 13.563,
"eval_steps_per_second": 6.782,
"step": 1500
},
{
"epoch": 2.418818818818819,
"grad_norm": 1.1998096160223135,
"learning_rate": 1.0974980606041152e-06,
"loss": 0.0594,
"step": 1510
},
{
"epoch": 2.434834834834835,
"grad_norm": 0.9677720356738567,
"learning_rate": 1.0398674724863584e-06,
"loss": 0.059,
"step": 1520
},
{
"epoch": 2.450850850850851,
"grad_norm": 1.3141425510127478,
"learning_rate": 9.836150857962296e-07,
"loss": 0.0596,
"step": 1530
},
{
"epoch": 2.4668668668668667,
"grad_norm": 0.8985622286196838,
"learning_rate": 9.287604774340236e-07,
"loss": 0.0581,
"step": 1540
},
{
"epoch": 2.482882882882883,
"grad_norm": 1.2915272829435755,
"learning_rate": 8.753227378465956e-07,
"loss": 0.0589,
"step": 1550
},
{
"epoch": 2.498898898898899,
"grad_norm": 1.051170086009324,
"learning_rate": 8.233204643835235e-07,
"loss": 0.0585,
"step": 1560
},
{
"epoch": 2.514914914914915,
"grad_norm": 1.1697265988828673,
"learning_rate": 7.72771754824877e-07,
"loss": 0.0627,
"step": 1570
},
{
"epoch": 2.530930930930931,
"grad_norm": 1.045582836173671,
"learning_rate": 7.23694201082843e-07,
"loss": 0.0594,
"step": 1580
},
{
"epoch": 2.546946946946947,
"grad_norm": 1.0460247904446258,
"learning_rate": 6.761048830794098e-07,
"loss": 0.0583,
"step": 1590
},
{
"epoch": 2.562962962962963,
"grad_norm": 1.0766438599871688,
"learning_rate": 6.300203628022272e-07,
"loss": 0.0584,
"step": 1600
},
{
"epoch": 2.562962962962963,
"eval_loss": 0.1075584813952446,
"eval_runtime": 4.7311,
"eval_samples_per_second": 12.682,
"eval_steps_per_second": 6.341,
"step": 1600
},
{
"epoch": 2.578978978978979,
"grad_norm": 1.0644137083005472,
"learning_rate": 5.854566785407212e-07,
"loss": 0.0615,
"step": 1610
},
{
"epoch": 2.594994994994995,
"grad_norm": 0.8862800692246755,
"learning_rate": 5.42429339304461e-07,
"loss": 0.0589,
"step": 1620
},
{
"epoch": 2.611011011011011,
"grad_norm": 1.0632023267290576,
"learning_rate": 5.009533194257332e-07,
"loss": 0.0576,
"step": 1630
},
{
"epoch": 2.627027027027027,
"grad_norm": 1.0612601719189565,
"learning_rate": 4.6104305334818577e-07,
"loss": 0.0581,
"step": 1640
},
{
"epoch": 2.643043043043043,
"grad_norm": 1.1363089897778307,
"learning_rate": 4.2271243060336976e-07,
"loss": 0.0565,
"step": 1650
},
{
"epoch": 2.659059059059059,
"grad_norm": 1.0581328451419771,
"learning_rate": 3.8597479097691626e-07,
"loss": 0.06,
"step": 1660
},
{
"epoch": 2.675075075075075,
"grad_norm": 1.0292294546952114,
"learning_rate": 3.508429198660379e-07,
"loss": 0.0592,
"step": 1670
},
{
"epoch": 2.6910910910910912,
"grad_norm": 1.1041696576832245,
"learning_rate": 3.1732904382996975e-07,
"loss": 0.0531,
"step": 1680
},
{
"epoch": 2.707107107107107,
"grad_norm": 1.1017298827099373,
"learning_rate": 2.854448263348891e-07,
"loss": 0.0592,
"step": 1690
},
{
"epoch": 2.723123123123123,
"grad_norm": 1.2373658193384414,
"learning_rate": 2.5520136369481194e-07,
"loss": 0.0574,
"step": 1700
},
{
"epoch": 2.723123123123123,
"eval_loss": 0.10619153082370758,
"eval_runtime": 4.4379,
"eval_samples_per_second": 13.52,
"eval_steps_per_second": 6.76,
"step": 1700
},
{
"epoch": 2.739139139139139,
"grad_norm": 1.0150510884990294,
"learning_rate": 2.266091812098642e-07,
"loss": 0.0581,
"step": 1710
},
{
"epoch": 2.755155155155155,
"grad_norm": 1.4492779833818328,
"learning_rate": 1.9967822950327453e-07,
"loss": 0.0609,
"step": 1720
},
{
"epoch": 2.7711711711711713,
"grad_norm": 1.104287468688097,
"learning_rate": 1.7441788105837133e-07,
"loss": 0.0593,
"step": 1730
},
{
"epoch": 2.787187187187187,
"grad_norm": 1.0784218287885254,
"learning_rate": 1.508369269567783e-07,
"loss": 0.0606,
"step": 1740
},
{
"epoch": 2.803203203203203,
"grad_norm": 1.2207885496699904,
"learning_rate": 1.2894357381894984e-07,
"loss": 0.0564,
"step": 1750
},
{
"epoch": 2.819219219219219,
"grad_norm": 0.8638978123535029,
"learning_rate": 1.0874544094811424e-07,
"loss": 0.0586,
"step": 1760
},
{
"epoch": 2.8352352352352352,
"grad_norm": 1.1498833578746668,
"learning_rate": 9.024955767861054e-08,
"loss": 0.0629,
"step": 1770
},
{
"epoch": 2.8512512512512513,
"grad_norm": 0.9945976550163285,
"learning_rate": 7.346236092954318e-08,
"loss": 0.0562,
"step": 1780
},
{
"epoch": 2.867267267267267,
"grad_norm": 0.9949658942902132,
"learning_rate": 5.838969296461605e-08,
"loss": 0.0583,
"step": 1790
},
{
"epoch": 2.8832832832832835,
"grad_norm": 1.0428607480343814,
"learning_rate": 4.50367993589107e-08,
"loss": 0.0571,
"step": 1800
},
{
"epoch": 2.8832832832832835,
"eval_loss": 0.10513312369585037,
"eval_runtime": 4.4044,
"eval_samples_per_second": 13.623,
"eval_steps_per_second": 6.811,
"step": 1800
},
{
"epoch": 2.899299299299299,
"grad_norm": 1.0170732585763345,
"learning_rate": 3.340832717332765e-08,
"loss": 0.0605,
"step": 1810
},
{
"epoch": 2.9153153153153153,
"grad_norm": 0.9518652403398978,
"learning_rate": 2.3508323337321225e-08,
"loss": 0.0567,
"step": 1820
},
{
"epoch": 2.9313313313313314,
"grad_norm": 1.0033269558751705,
"learning_rate": 1.534023324049061e-08,
"loss": 0.0564,
"step": 1830
},
{
"epoch": 2.9473473473473475,
"grad_norm": 1.1693211443264353,
"learning_rate": 8.906899533517866e-09,
"loss": 0.0589,
"step": 1840
},
{
"epoch": 2.9633633633633636,
"grad_norm": 1.2335037547499492,
"learning_rate": 4.210561138873193e-09,
"loss": 0.0576,
"step": 1850
},
{
"epoch": 2.9793793793793792,
"grad_norm": 1.1025808173861562,
"learning_rate": 1.2528524716259872e-09,
"loss": 0.0548,
"step": 1860
},
{
"epoch": 2.9953953953953953,
"grad_norm": 0.9062323167927434,
"learning_rate": 3.480287063706289e-11,
"loss": 0.0587,
"step": 1870
},
{
"epoch": 2.9985985985985986,
"step": 1872,
"total_flos": 222868335149056.0,
"train_loss": 0.15247767985376537,
"train_runtime": 21883.2415,
"train_samples_per_second": 8.217,
"train_steps_per_second": 0.086
}
],
"logging_steps": 10,
"max_steps": 1872,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 222868335149056.0,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}