rft-finetune-llama-3.1-8b-math / trainer_state.json
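A minimal sketch (not part of the uploaded file) of how one might inspect this trainer state locally to trace the train/eval loss curves. The local path "trainer_state.json" is an assumption; the keys used ("log_history", "step", "loss", "eval_loss", "best_metric", "best_model_checkpoint") are taken from the JSON below.

import json

# Assumed local copy of the file shown below.
with open("trainer_state.json") as f:
    state = json.load(f)

# Training-loss entries carry "loss"; evaluation entries carry "eval_loss".
train_points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_points = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"best eval loss: {state['best_metric']:.4f} at {state['best_model_checkpoint']}")
for step, loss in eval_points:
    print(f"eval @ step {step:>5}: loss = {loss:.4f}")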
{
"best_metric": 0.16189107298851013,
"best_model_checkpoint": "checkpoints/rft-finetune-llama-3.1-8b-math50k/math50k/finetune-llama-3.1-8b-math50k-step-1/checkpoint-2421",
"epoch": 0.999721059972106,
"eval_steps": 269,
"global_step": 2688,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0018596001859600185,
"grad_norm": 23.25,
"learning_rate": 1.8587360594795542e-07,
"loss": 0.5644,
"step": 5
},
{
"epoch": 0.003719200371920037,
"grad_norm": 24.625,
"learning_rate": 3.7174721189591085e-07,
"loss": 0.5677,
"step": 10
},
{
"epoch": 0.005578800557880056,
"grad_norm": 20.625,
"learning_rate": 5.576208178438662e-07,
"loss": 0.5478,
"step": 15
},
{
"epoch": 0.007438400743840074,
"grad_norm": 18.125,
"learning_rate": 7.434944237918217e-07,
"loss": 0.5364,
"step": 20
},
{
"epoch": 0.009298000929800094,
"grad_norm": 17.875,
"learning_rate": 9.29368029739777e-07,
"loss": 0.4842,
"step": 25
},
{
"epoch": 0.011157601115760111,
"grad_norm": 14.3125,
"learning_rate": 1.1152416356877324e-06,
"loss": 0.4142,
"step": 30
},
{
"epoch": 0.01301720130172013,
"grad_norm": 14.75,
"learning_rate": 1.3011152416356879e-06,
"loss": 0.3353,
"step": 35
},
{
"epoch": 0.014876801487680148,
"grad_norm": 8.4375,
"learning_rate": 1.4869888475836434e-06,
"loss": 0.2517,
"step": 40
},
{
"epoch": 0.016736401673640166,
"grad_norm": 5.875,
"learning_rate": 1.6728624535315987e-06,
"loss": 0.1879,
"step": 45
},
{
"epoch": 0.018596001859600187,
"grad_norm": 3.5625,
"learning_rate": 1.858736059479554e-06,
"loss": 0.1624,
"step": 50
},
{
"epoch": 0.020455602045560205,
"grad_norm": 2.984375,
"learning_rate": 2.0446096654275095e-06,
"loss": 0.1526,
"step": 55
},
{
"epoch": 0.022315202231520222,
"grad_norm": 3.328125,
"learning_rate": 2.2304832713754648e-06,
"loss": 0.1651,
"step": 60
},
{
"epoch": 0.02417480241748024,
"grad_norm": 3.046875,
"learning_rate": 2.41635687732342e-06,
"loss": 0.1557,
"step": 65
},
{
"epoch": 0.02603440260344026,
"grad_norm": 2.890625,
"learning_rate": 2.6022304832713758e-06,
"loss": 0.1586,
"step": 70
},
{
"epoch": 0.02789400278940028,
"grad_norm": 2.546875,
"learning_rate": 2.788104089219331e-06,
"loss": 0.1491,
"step": 75
},
{
"epoch": 0.029753602975360297,
"grad_norm": 2.671875,
"learning_rate": 2.973977695167287e-06,
"loss": 0.1569,
"step": 80
},
{
"epoch": 0.03161320316132032,
"grad_norm": 2.640625,
"learning_rate": 3.159851301115242e-06,
"loss": 0.1491,
"step": 85
},
{
"epoch": 0.03347280334728033,
"grad_norm": 2.234375,
"learning_rate": 3.3457249070631974e-06,
"loss": 0.1459,
"step": 90
},
{
"epoch": 0.03533240353324035,
"grad_norm": 2.453125,
"learning_rate": 3.531598513011153e-06,
"loss": 0.1548,
"step": 95
},
{
"epoch": 0.037192003719200374,
"grad_norm": 2.265625,
"learning_rate": 3.717472118959108e-06,
"loss": 0.1529,
"step": 100
},
{
"epoch": 0.03905160390516039,
"grad_norm": 2.5625,
"learning_rate": 3.903345724907064e-06,
"loss": 0.1543,
"step": 105
},
{
"epoch": 0.04091120409112041,
"grad_norm": 2.453125,
"learning_rate": 4.089219330855019e-06,
"loss": 0.1521,
"step": 110
},
{
"epoch": 0.04277080427708043,
"grad_norm": 2.5,
"learning_rate": 4.275092936802974e-06,
"loss": 0.153,
"step": 115
},
{
"epoch": 0.044630404463040445,
"grad_norm": 2.515625,
"learning_rate": 4.4609665427509296e-06,
"loss": 0.1423,
"step": 120
},
{
"epoch": 0.046490004649000466,
"grad_norm": 2.609375,
"learning_rate": 4.646840148698885e-06,
"loss": 0.1479,
"step": 125
},
{
"epoch": 0.04834960483496048,
"grad_norm": 2.328125,
"learning_rate": 4.83271375464684e-06,
"loss": 0.1501,
"step": 130
},
{
"epoch": 0.0502092050209205,
"grad_norm": 2.796875,
"learning_rate": 5.0185873605947954e-06,
"loss": 0.1611,
"step": 135
},
{
"epoch": 0.05206880520688052,
"grad_norm": 2.484375,
"learning_rate": 5.2044609665427516e-06,
"loss": 0.1506,
"step": 140
},
{
"epoch": 0.05392840539284054,
"grad_norm": 2.578125,
"learning_rate": 5.390334572490706e-06,
"loss": 0.1532,
"step": 145
},
{
"epoch": 0.05578800557880056,
"grad_norm": 2.5625,
"learning_rate": 5.576208178438662e-06,
"loss": 0.1523,
"step": 150
},
{
"epoch": 0.05764760576476058,
"grad_norm": 2.859375,
"learning_rate": 5.7620817843866174e-06,
"loss": 0.1584,
"step": 155
},
{
"epoch": 0.05950720595072059,
"grad_norm": 2.59375,
"learning_rate": 5.947955390334574e-06,
"loss": 0.1504,
"step": 160
},
{
"epoch": 0.061366806136680614,
"grad_norm": 2.609375,
"learning_rate": 6.133828996282528e-06,
"loss": 0.1609,
"step": 165
},
{
"epoch": 0.06322640632264064,
"grad_norm": 2.28125,
"learning_rate": 6.319702602230484e-06,
"loss": 0.1546,
"step": 170
},
{
"epoch": 0.06508600650860065,
"grad_norm": 2.421875,
"learning_rate": 6.5055762081784395e-06,
"loss": 0.1492,
"step": 175
},
{
"epoch": 0.06694560669456066,
"grad_norm": 3.34375,
"learning_rate": 6.691449814126395e-06,
"loss": 0.1427,
"step": 180
},
{
"epoch": 0.06880520688052069,
"grad_norm": 2.625,
"learning_rate": 6.87732342007435e-06,
"loss": 0.1569,
"step": 185
},
{
"epoch": 0.0706648070664807,
"grad_norm": 2.21875,
"learning_rate": 7.063197026022306e-06,
"loss": 0.1523,
"step": 190
},
{
"epoch": 0.07252440725244072,
"grad_norm": 3.046875,
"learning_rate": 7.249070631970261e-06,
"loss": 0.1609,
"step": 195
},
{
"epoch": 0.07438400743840075,
"grad_norm": 2.359375,
"learning_rate": 7.434944237918216e-06,
"loss": 0.1586,
"step": 200
},
{
"epoch": 0.07624360762436076,
"grad_norm": 2.609375,
"learning_rate": 7.620817843866172e-06,
"loss": 0.1536,
"step": 205
},
{
"epoch": 0.07810320781032078,
"grad_norm": 2.671875,
"learning_rate": 7.806691449814127e-06,
"loss": 0.1632,
"step": 210
},
{
"epoch": 0.0799628079962808,
"grad_norm": 2.4375,
"learning_rate": 7.992565055762083e-06,
"loss": 0.1558,
"step": 215
},
{
"epoch": 0.08182240818224082,
"grad_norm": 2.4375,
"learning_rate": 8.178438661710038e-06,
"loss": 0.1534,
"step": 220
},
{
"epoch": 0.08368200836820083,
"grad_norm": 2.78125,
"learning_rate": 8.364312267657993e-06,
"loss": 0.1576,
"step": 225
},
{
"epoch": 0.08554160855416086,
"grad_norm": 2.40625,
"learning_rate": 8.550185873605949e-06,
"loss": 0.1643,
"step": 230
},
{
"epoch": 0.08740120874012088,
"grad_norm": 2.609375,
"learning_rate": 8.736059479553904e-06,
"loss": 0.1599,
"step": 235
},
{
"epoch": 0.08926080892608089,
"grad_norm": 2.5625,
"learning_rate": 8.921933085501859e-06,
"loss": 0.1573,
"step": 240
},
{
"epoch": 0.09112040911204092,
"grad_norm": 2.828125,
"learning_rate": 9.107806691449816e-06,
"loss": 0.1614,
"step": 245
},
{
"epoch": 0.09298000929800093,
"grad_norm": 2.296875,
"learning_rate": 9.29368029739777e-06,
"loss": 0.152,
"step": 250
},
{
"epoch": 0.09483960948396095,
"grad_norm": 2.625,
"learning_rate": 9.479553903345727e-06,
"loss": 0.1531,
"step": 255
},
{
"epoch": 0.09669920966992096,
"grad_norm": 2.4375,
"learning_rate": 9.66542750929368e-06,
"loss": 0.1592,
"step": 260
},
{
"epoch": 0.09855880985588099,
"grad_norm": 2.609375,
"learning_rate": 9.851301115241636e-06,
"loss": 0.158,
"step": 265
},
{
"epoch": 0.100046490004649,
"eval_loss": 0.182636097073555,
"eval_runtime": 33.5333,
"eval_samples_per_second": 306.71,
"eval_steps_per_second": 9.602,
"step": 269
},
{
"epoch": 0.100418410041841,
"grad_norm": 2.71875,
"learning_rate": 9.99586606035552e-06,
"loss": 0.1626,
"step": 270
},
{
"epoch": 0.10227801022780102,
"grad_norm": 2.609375,
"learning_rate": 9.975196362133114e-06,
"loss": 0.1612,
"step": 275
},
{
"epoch": 0.10413761041376104,
"grad_norm": 2.875,
"learning_rate": 9.954526663910708e-06,
"loss": 0.1575,
"step": 280
},
{
"epoch": 0.10599721059972106,
"grad_norm": 2.421875,
"learning_rate": 9.933856965688302e-06,
"loss": 0.1661,
"step": 285
},
{
"epoch": 0.10785681078568107,
"grad_norm": 2.46875,
"learning_rate": 9.913187267465896e-06,
"loss": 0.1663,
"step": 290
},
{
"epoch": 0.1097164109716411,
"grad_norm": 2.578125,
"learning_rate": 9.89251756924349e-06,
"loss": 0.1742,
"step": 295
},
{
"epoch": 0.11157601115760112,
"grad_norm": 2.578125,
"learning_rate": 9.871847871021084e-06,
"loss": 0.1564,
"step": 300
},
{
"epoch": 0.11343561134356113,
"grad_norm": 4.125,
"learning_rate": 9.851178172798678e-06,
"loss": 0.1633,
"step": 305
},
{
"epoch": 0.11529521152952116,
"grad_norm": 2.765625,
"learning_rate": 9.830508474576272e-06,
"loss": 0.1665,
"step": 310
},
{
"epoch": 0.11715481171548117,
"grad_norm": 2.921875,
"learning_rate": 9.809838776353866e-06,
"loss": 0.1739,
"step": 315
},
{
"epoch": 0.11901441190144119,
"grad_norm": 2.296875,
"learning_rate": 9.78916907813146e-06,
"loss": 0.1621,
"step": 320
},
{
"epoch": 0.12087401208740121,
"grad_norm": 2.5625,
"learning_rate": 9.768499379909055e-06,
"loss": 0.167,
"step": 325
},
{
"epoch": 0.12273361227336123,
"grad_norm": 2.703125,
"learning_rate": 9.747829681686649e-06,
"loss": 0.167,
"step": 330
},
{
"epoch": 0.12459321245932124,
"grad_norm": 2.75,
"learning_rate": 9.727159983464243e-06,
"loss": 0.1601,
"step": 335
},
{
"epoch": 0.12645281264528127,
"grad_norm": 2.703125,
"learning_rate": 9.706490285241837e-06,
"loss": 0.1705,
"step": 340
},
{
"epoch": 0.12831241283124128,
"grad_norm": 2.59375,
"learning_rate": 9.685820587019429e-06,
"loss": 0.1575,
"step": 345
},
{
"epoch": 0.1301720130172013,
"grad_norm": 2.40625,
"learning_rate": 9.665150888797025e-06,
"loss": 0.161,
"step": 350
},
{
"epoch": 0.1320316132031613,
"grad_norm": 2.3125,
"learning_rate": 9.644481190574619e-06,
"loss": 0.1734,
"step": 355
},
{
"epoch": 0.13389121338912133,
"grad_norm": 2.40625,
"learning_rate": 9.623811492352211e-06,
"loss": 0.1635,
"step": 360
},
{
"epoch": 0.13575081357508137,
"grad_norm": 2.53125,
"learning_rate": 9.603141794129807e-06,
"loss": 0.1536,
"step": 365
},
{
"epoch": 0.13761041376104138,
"grad_norm": 2.5625,
"learning_rate": 9.5824720959074e-06,
"loss": 0.1595,
"step": 370
},
{
"epoch": 0.1394700139470014,
"grad_norm": 2.96875,
"learning_rate": 9.561802397684995e-06,
"loss": 0.1638,
"step": 375
},
{
"epoch": 0.1413296141329614,
"grad_norm": 2.46875,
"learning_rate": 9.54113269946259e-06,
"loss": 0.1649,
"step": 380
},
{
"epoch": 0.14318921431892143,
"grad_norm": 2.65625,
"learning_rate": 9.520463001240182e-06,
"loss": 0.1559,
"step": 385
},
{
"epoch": 0.14504881450488144,
"grad_norm": 2.84375,
"learning_rate": 9.499793303017778e-06,
"loss": 0.1628,
"step": 390
},
{
"epoch": 0.14690841469084148,
"grad_norm": 2.484375,
"learning_rate": 9.47912360479537e-06,
"loss": 0.1653,
"step": 395
},
{
"epoch": 0.1487680148768015,
"grad_norm": 2.78125,
"learning_rate": 9.458453906572966e-06,
"loss": 0.1589,
"step": 400
},
{
"epoch": 0.1506276150627615,
"grad_norm": 2.265625,
"learning_rate": 9.437784208350558e-06,
"loss": 0.1548,
"step": 405
},
{
"epoch": 0.15248721524872152,
"grad_norm": 2.34375,
"learning_rate": 9.417114510128152e-06,
"loss": 0.1567,
"step": 410
},
{
"epoch": 0.15434681543468154,
"grad_norm": 3.0625,
"learning_rate": 9.396444811905748e-06,
"loss": 0.1611,
"step": 415
},
{
"epoch": 0.15620641562064155,
"grad_norm": 2.65625,
"learning_rate": 9.37577511368334e-06,
"loss": 0.1647,
"step": 420
},
{
"epoch": 0.15806601580660157,
"grad_norm": 2.796875,
"learning_rate": 9.355105415460936e-06,
"loss": 0.1594,
"step": 425
},
{
"epoch": 0.1599256159925616,
"grad_norm": 2.34375,
"learning_rate": 9.334435717238529e-06,
"loss": 0.1607,
"step": 430
},
{
"epoch": 0.16178521617852162,
"grad_norm": 2.421875,
"learning_rate": 9.313766019016123e-06,
"loss": 0.1591,
"step": 435
},
{
"epoch": 0.16364481636448164,
"grad_norm": 2.359375,
"learning_rate": 9.293096320793717e-06,
"loss": 0.1596,
"step": 440
},
{
"epoch": 0.16550441655044165,
"grad_norm": 2.53125,
"learning_rate": 9.27242662257131e-06,
"loss": 0.1562,
"step": 445
},
{
"epoch": 0.16736401673640167,
"grad_norm": 2.359375,
"learning_rate": 9.251756924348905e-06,
"loss": 0.1604,
"step": 450
},
{
"epoch": 0.16922361692236168,
"grad_norm": 2.578125,
"learning_rate": 9.231087226126499e-06,
"loss": 0.1637,
"step": 455
},
{
"epoch": 0.17108321710832172,
"grad_norm": 2.734375,
"learning_rate": 9.210417527904093e-06,
"loss": 0.1543,
"step": 460
},
{
"epoch": 0.17294281729428174,
"grad_norm": 2.375,
"learning_rate": 9.189747829681687e-06,
"loss": 0.1573,
"step": 465
},
{
"epoch": 0.17480241748024175,
"grad_norm": 2.34375,
"learning_rate": 9.169078131459281e-06,
"loss": 0.1511,
"step": 470
},
{
"epoch": 0.17666201766620176,
"grad_norm": 2.3125,
"learning_rate": 9.148408433236875e-06,
"loss": 0.152,
"step": 475
},
{
"epoch": 0.17852161785216178,
"grad_norm": 2.90625,
"learning_rate": 9.12773873501447e-06,
"loss": 0.1561,
"step": 480
},
{
"epoch": 0.1803812180381218,
"grad_norm": 2.375,
"learning_rate": 9.107069036792063e-06,
"loss": 0.1557,
"step": 485
},
{
"epoch": 0.18224081822408184,
"grad_norm": 2.234375,
"learning_rate": 9.086399338569657e-06,
"loss": 0.159,
"step": 490
},
{
"epoch": 0.18410041841004185,
"grad_norm": 2.359375,
"learning_rate": 9.065729640347252e-06,
"loss": 0.1493,
"step": 495
},
{
"epoch": 0.18596001859600186,
"grad_norm": 2.765625,
"learning_rate": 9.045059942124846e-06,
"loss": 0.1604,
"step": 500
},
{
"epoch": 0.18781961878196188,
"grad_norm": 2.28125,
"learning_rate": 9.02439024390244e-06,
"loss": 0.159,
"step": 505
},
{
"epoch": 0.1896792189679219,
"grad_norm": 2.625,
"learning_rate": 9.003720545680034e-06,
"loss": 0.1663,
"step": 510
},
{
"epoch": 0.1915388191538819,
"grad_norm": 2.609375,
"learning_rate": 8.983050847457628e-06,
"loss": 0.1559,
"step": 515
},
{
"epoch": 0.19339841933984192,
"grad_norm": 2.421875,
"learning_rate": 8.962381149235222e-06,
"loss": 0.1528,
"step": 520
},
{
"epoch": 0.19525801952580196,
"grad_norm": 2.234375,
"learning_rate": 8.941711451012816e-06,
"loss": 0.1544,
"step": 525
},
{
"epoch": 0.19711761971176198,
"grad_norm": 2.609375,
"learning_rate": 8.92104175279041e-06,
"loss": 0.1559,
"step": 530
},
{
"epoch": 0.198977219897722,
"grad_norm": 2.640625,
"learning_rate": 8.900372054568004e-06,
"loss": 0.1648,
"step": 535
},
{
"epoch": 0.200092980009298,
"eval_loss": 0.17836953699588776,
"eval_runtime": 33.5274,
"eval_samples_per_second": 306.764,
"eval_steps_per_second": 9.604,
"step": 538
},
{
"epoch": 0.200836820083682,
"grad_norm": 2.3125,
"learning_rate": 8.879702356345598e-06,
"loss": 0.168,
"step": 540
},
{
"epoch": 0.20269642026964202,
"grad_norm": 2.109375,
"learning_rate": 8.859032658123192e-06,
"loss": 0.1605,
"step": 545
},
{
"epoch": 0.20455602045560203,
"grad_norm": 2.796875,
"learning_rate": 8.838362959900786e-06,
"loss": 0.1626,
"step": 550
},
{
"epoch": 0.20641562064156208,
"grad_norm": 2.203125,
"learning_rate": 8.81769326167838e-06,
"loss": 0.1535,
"step": 555
},
{
"epoch": 0.2082752208275221,
"grad_norm": 2.546875,
"learning_rate": 8.797023563455975e-06,
"loss": 0.1566,
"step": 560
},
{
"epoch": 0.2101348210134821,
"grad_norm": 2.4375,
"learning_rate": 8.776353865233567e-06,
"loss": 0.1587,
"step": 565
},
{
"epoch": 0.21199442119944212,
"grad_norm": 2.640625,
"learning_rate": 8.755684167011163e-06,
"loss": 0.1609,
"step": 570
},
{
"epoch": 0.21385402138540213,
"grad_norm": 2.828125,
"learning_rate": 8.735014468788757e-06,
"loss": 0.1522,
"step": 575
},
{
"epoch": 0.21571362157136215,
"grad_norm": 2.28125,
"learning_rate": 8.714344770566351e-06,
"loss": 0.1631,
"step": 580
},
{
"epoch": 0.2175732217573222,
"grad_norm": 2.28125,
"learning_rate": 8.693675072343945e-06,
"loss": 0.1662,
"step": 585
},
{
"epoch": 0.2194328219432822,
"grad_norm": 2.328125,
"learning_rate": 8.673005374121537e-06,
"loss": 0.154,
"step": 590
},
{
"epoch": 0.22129242212924222,
"grad_norm": 2.171875,
"learning_rate": 8.652335675899133e-06,
"loss": 0.1576,
"step": 595
},
{
"epoch": 0.22315202231520223,
"grad_norm": 2.53125,
"learning_rate": 8.631665977676727e-06,
"loss": 0.1599,
"step": 600
},
{
"epoch": 0.22501162250116225,
"grad_norm": 2.046875,
"learning_rate": 8.610996279454321e-06,
"loss": 0.1563,
"step": 605
},
{
"epoch": 0.22687122268712226,
"grad_norm": 2.453125,
"learning_rate": 8.590326581231915e-06,
"loss": 0.1585,
"step": 610
},
{
"epoch": 0.22873082287308227,
"grad_norm": 2.40625,
"learning_rate": 8.569656883009508e-06,
"loss": 0.1566,
"step": 615
},
{
"epoch": 0.23059042305904232,
"grad_norm": 2.328125,
"learning_rate": 8.548987184787104e-06,
"loss": 0.16,
"step": 620
},
{
"epoch": 0.23245002324500233,
"grad_norm": 2.109375,
"learning_rate": 8.528317486564696e-06,
"loss": 0.1503,
"step": 625
},
{
"epoch": 0.23430962343096234,
"grad_norm": 2.375,
"learning_rate": 8.507647788342292e-06,
"loss": 0.158,
"step": 630
},
{
"epoch": 0.23616922361692236,
"grad_norm": 2.609375,
"learning_rate": 8.486978090119886e-06,
"loss": 0.1615,
"step": 635
},
{
"epoch": 0.23802882380288237,
"grad_norm": 2.296875,
"learning_rate": 8.466308391897478e-06,
"loss": 0.1569,
"step": 640
},
{
"epoch": 0.2398884239888424,
"grad_norm": 2.953125,
"learning_rate": 8.445638693675074e-06,
"loss": 0.1536,
"step": 645
},
{
"epoch": 0.24174802417480243,
"grad_norm": 2.4375,
"learning_rate": 8.424968995452666e-06,
"loss": 0.1525,
"step": 650
},
{
"epoch": 0.24360762436076244,
"grad_norm": 2.359375,
"learning_rate": 8.40429929723026e-06,
"loss": 0.1461,
"step": 655
},
{
"epoch": 0.24546722454672246,
"grad_norm": 2.546875,
"learning_rate": 8.383629599007855e-06,
"loss": 0.1585,
"step": 660
},
{
"epoch": 0.24732682473268247,
"grad_norm": 2.390625,
"learning_rate": 8.362959900785449e-06,
"loss": 0.1494,
"step": 665
},
{
"epoch": 0.24918642491864249,
"grad_norm": 2.53125,
"learning_rate": 8.342290202563044e-06,
"loss": 0.1589,
"step": 670
},
{
"epoch": 0.2510460251046025,
"grad_norm": 2.5,
"learning_rate": 8.321620504340637e-06,
"loss": 0.1554,
"step": 675
},
{
"epoch": 0.25290562529056254,
"grad_norm": 2.515625,
"learning_rate": 8.30095080611823e-06,
"loss": 0.1539,
"step": 680
},
{
"epoch": 0.2547652254765225,
"grad_norm": 2.296875,
"learning_rate": 8.280281107895825e-06,
"loss": 0.1518,
"step": 685
},
{
"epoch": 0.25662482566248257,
"grad_norm": 2.578125,
"learning_rate": 8.259611409673419e-06,
"loss": 0.1545,
"step": 690
},
{
"epoch": 0.2584844258484426,
"grad_norm": 2.515625,
"learning_rate": 8.238941711451015e-06,
"loss": 0.1535,
"step": 695
},
{
"epoch": 0.2603440260344026,
"grad_norm": 2.484375,
"learning_rate": 8.218272013228607e-06,
"loss": 0.1541,
"step": 700
},
{
"epoch": 0.26220362622036264,
"grad_norm": 2.3125,
"learning_rate": 8.197602315006201e-06,
"loss": 0.1518,
"step": 705
},
{
"epoch": 0.2640632264063226,
"grad_norm": 2.328125,
"learning_rate": 8.176932616783795e-06,
"loss": 0.1532,
"step": 710
},
{
"epoch": 0.26592282659228267,
"grad_norm": 2.3125,
"learning_rate": 8.15626291856139e-06,
"loss": 0.1554,
"step": 715
},
{
"epoch": 0.26778242677824265,
"grad_norm": 2.09375,
"learning_rate": 8.135593220338983e-06,
"loss": 0.1485,
"step": 720
},
{
"epoch": 0.2696420269642027,
"grad_norm": 2.234375,
"learning_rate": 8.114923522116578e-06,
"loss": 0.1564,
"step": 725
},
{
"epoch": 0.27150162715016274,
"grad_norm": 2.171875,
"learning_rate": 8.094253823894172e-06,
"loss": 0.1523,
"step": 730
},
{
"epoch": 0.2733612273361227,
"grad_norm": 2.28125,
"learning_rate": 8.073584125671766e-06,
"loss": 0.1565,
"step": 735
},
{
"epoch": 0.27522082752208277,
"grad_norm": 2.171875,
"learning_rate": 8.05291442744936e-06,
"loss": 0.1482,
"step": 740
},
{
"epoch": 0.27708042770804275,
"grad_norm": 2.296875,
"learning_rate": 8.032244729226954e-06,
"loss": 0.155,
"step": 745
},
{
"epoch": 0.2789400278940028,
"grad_norm": 2.390625,
"learning_rate": 8.011575031004548e-06,
"loss": 0.1449,
"step": 750
},
{
"epoch": 0.2807996280799628,
"grad_norm": 2.109375,
"learning_rate": 7.990905332782142e-06,
"loss": 0.151,
"step": 755
},
{
"epoch": 0.2826592282659228,
"grad_norm": 2.5,
"learning_rate": 7.970235634559736e-06,
"loss": 0.1576,
"step": 760
},
{
"epoch": 0.28451882845188287,
"grad_norm": 2.296875,
"learning_rate": 7.94956593633733e-06,
"loss": 0.1625,
"step": 765
},
{
"epoch": 0.28637842863784285,
"grad_norm": 2.5,
"learning_rate": 7.928896238114924e-06,
"loss": 0.1558,
"step": 770
},
{
"epoch": 0.2882380288238029,
"grad_norm": 2.453125,
"learning_rate": 7.908226539892518e-06,
"loss": 0.1512,
"step": 775
},
{
"epoch": 0.2900976290097629,
"grad_norm": 2.171875,
"learning_rate": 7.887556841670112e-06,
"loss": 0.1568,
"step": 780
},
{
"epoch": 0.2919572291957229,
"grad_norm": 2.34375,
"learning_rate": 7.866887143447707e-06,
"loss": 0.1561,
"step": 785
},
{
"epoch": 0.29381682938168296,
"grad_norm": 2.40625,
"learning_rate": 7.8462174452253e-06,
"loss": 0.1592,
"step": 790
},
{
"epoch": 0.29567642956764295,
"grad_norm": 2.1875,
"learning_rate": 7.825547747002895e-06,
"loss": 0.1481,
"step": 795
},
{
"epoch": 0.297536029753603,
"grad_norm": 2.015625,
"learning_rate": 7.804878048780489e-06,
"loss": 0.1478,
"step": 800
},
{
"epoch": 0.299395629939563,
"grad_norm": 2.125,
"learning_rate": 7.784208350558083e-06,
"loss": 0.1562,
"step": 805
},
{
"epoch": 0.300139470013947,
"eval_loss": 0.17311781644821167,
"eval_runtime": 33.5454,
"eval_samples_per_second": 306.599,
"eval_steps_per_second": 9.599,
"step": 807
},
{
"epoch": 0.301255230125523,
"grad_norm": 2.234375,
"learning_rate": 7.763538652335677e-06,
"loss": 0.1632,
"step": 810
},
{
"epoch": 0.303114830311483,
"grad_norm": 2.25,
"learning_rate": 7.742868954113271e-06,
"loss": 0.1533,
"step": 815
},
{
"epoch": 0.30497443049744305,
"grad_norm": 2.3125,
"learning_rate": 7.722199255890865e-06,
"loss": 0.1567,
"step": 820
},
{
"epoch": 0.3068340306834031,
"grad_norm": 2.46875,
"learning_rate": 7.70152955766846e-06,
"loss": 0.1494,
"step": 825
},
{
"epoch": 0.3086936308693631,
"grad_norm": 2.375,
"learning_rate": 7.680859859446053e-06,
"loss": 0.1539,
"step": 830
},
{
"epoch": 0.3105532310553231,
"grad_norm": 2.453125,
"learning_rate": 7.660190161223646e-06,
"loss": 0.152,
"step": 835
},
{
"epoch": 0.3124128312412831,
"grad_norm": 2.234375,
"learning_rate": 7.639520463001241e-06,
"loss": 0.1491,
"step": 840
},
{
"epoch": 0.31427243142724315,
"grad_norm": 2.4375,
"learning_rate": 7.618850764778835e-06,
"loss": 0.1475,
"step": 845
},
{
"epoch": 0.31613203161320313,
"grad_norm": 2.546875,
"learning_rate": 7.59818106655643e-06,
"loss": 0.1527,
"step": 850
},
{
"epoch": 0.3179916317991632,
"grad_norm": 2.453125,
"learning_rate": 7.577511368334023e-06,
"loss": 0.1527,
"step": 855
},
{
"epoch": 0.3198512319851232,
"grad_norm": 2.09375,
"learning_rate": 7.556841670111617e-06,
"loss": 0.1491,
"step": 860
},
{
"epoch": 0.3217108321710832,
"grad_norm": 2.578125,
"learning_rate": 7.536171971889211e-06,
"loss": 0.1509,
"step": 865
},
{
"epoch": 0.32357043235704325,
"grad_norm": 2.390625,
"learning_rate": 7.515502273666805e-06,
"loss": 0.1501,
"step": 870
},
{
"epoch": 0.32543003254300323,
"grad_norm": 2.46875,
"learning_rate": 7.4948325754444e-06,
"loss": 0.1509,
"step": 875
},
{
"epoch": 0.3272896327289633,
"grad_norm": 1.9296875,
"learning_rate": 7.474162877221993e-06,
"loss": 0.1449,
"step": 880
},
{
"epoch": 0.3291492329149233,
"grad_norm": 2.125,
"learning_rate": 7.4534931789995864e-06,
"loss": 0.148,
"step": 885
},
{
"epoch": 0.3310088331008833,
"grad_norm": 1.8984375,
"learning_rate": 7.432823480777181e-06,
"loss": 0.1459,
"step": 890
},
{
"epoch": 0.33286843328684335,
"grad_norm": 2.28125,
"learning_rate": 7.4121537825547755e-06,
"loss": 0.1604,
"step": 895
},
{
"epoch": 0.33472803347280333,
"grad_norm": 2.171875,
"learning_rate": 7.3914840843323695e-06,
"loss": 0.1491,
"step": 900
},
{
"epoch": 0.3365876336587634,
"grad_norm": 1.8828125,
"learning_rate": 7.370814386109964e-06,
"loss": 0.1453,
"step": 905
},
{
"epoch": 0.33844723384472336,
"grad_norm": 2.203125,
"learning_rate": 7.350144687887557e-06,
"loss": 0.1485,
"step": 910
},
{
"epoch": 0.3403068340306834,
"grad_norm": 2.921875,
"learning_rate": 7.329474989665152e-06,
"loss": 0.1503,
"step": 915
},
{
"epoch": 0.34216643421664344,
"grad_norm": 2.21875,
"learning_rate": 7.308805291442745e-06,
"loss": 0.1553,
"step": 920
},
{
"epoch": 0.34402603440260343,
"grad_norm": 2.1875,
"learning_rate": 7.288135593220339e-06,
"loss": 0.1485,
"step": 925
},
{
"epoch": 0.3458856345885635,
"grad_norm": 2.125,
"learning_rate": 7.267465894997934e-06,
"loss": 0.1616,
"step": 930
},
{
"epoch": 0.34774523477452346,
"grad_norm": 2.09375,
"learning_rate": 7.246796196775527e-06,
"loss": 0.1444,
"step": 935
},
{
"epoch": 0.3496048349604835,
"grad_norm": 2.03125,
"learning_rate": 7.226126498553122e-06,
"loss": 0.1489,
"step": 940
},
{
"epoch": 0.3514644351464435,
"grad_norm": 2.421875,
"learning_rate": 7.205456800330715e-06,
"loss": 0.1459,
"step": 945
},
{
"epoch": 0.35332403533240353,
"grad_norm": 2.21875,
"learning_rate": 7.1847871021083095e-06,
"loss": 0.1472,
"step": 950
},
{
"epoch": 0.35518363551836357,
"grad_norm": 2.546875,
"learning_rate": 7.164117403885904e-06,
"loss": 0.1488,
"step": 955
},
{
"epoch": 0.35704323570432356,
"grad_norm": 2.1875,
"learning_rate": 7.143447705663498e-06,
"loss": 0.1418,
"step": 960
},
{
"epoch": 0.3589028358902836,
"grad_norm": 2.359375,
"learning_rate": 7.122778007441093e-06,
"loss": 0.1523,
"step": 965
},
{
"epoch": 0.3607624360762436,
"grad_norm": 1.9375,
"learning_rate": 7.102108309218686e-06,
"loss": 0.1517,
"step": 970
},
{
"epoch": 0.36262203626220363,
"grad_norm": 2.234375,
"learning_rate": 7.08143861099628e-06,
"loss": 0.1486,
"step": 975
},
{
"epoch": 0.36448163644816367,
"grad_norm": 2.46875,
"learning_rate": 7.060768912773874e-06,
"loss": 0.1517,
"step": 980
},
{
"epoch": 0.36634123663412366,
"grad_norm": 2.25,
"learning_rate": 7.040099214551468e-06,
"loss": 0.1514,
"step": 985
},
{
"epoch": 0.3682008368200837,
"grad_norm": 2.171875,
"learning_rate": 7.019429516329063e-06,
"loss": 0.1442,
"step": 990
},
{
"epoch": 0.3700604370060437,
"grad_norm": 2.234375,
"learning_rate": 6.998759818106656e-06,
"loss": 0.1558,
"step": 995
},
{
"epoch": 0.3719200371920037,
"grad_norm": 2.015625,
"learning_rate": 6.97809011988425e-06,
"loss": 0.1462,
"step": 1000
},
{
"epoch": 0.3737796373779637,
"grad_norm": 2.234375,
"learning_rate": 6.957420421661844e-06,
"loss": 0.1455,
"step": 1005
},
{
"epoch": 0.37563923756392376,
"grad_norm": 1.9765625,
"learning_rate": 6.9367507234394385e-06,
"loss": 0.1477,
"step": 1010
},
{
"epoch": 0.3774988377498838,
"grad_norm": 2.5,
"learning_rate": 6.9160810252170325e-06,
"loss": 0.1542,
"step": 1015
},
{
"epoch": 0.3793584379358438,
"grad_norm": 2.046875,
"learning_rate": 6.895411326994627e-06,
"loss": 0.1518,
"step": 1020
},
{
"epoch": 0.3812180381218038,
"grad_norm": 2.21875,
"learning_rate": 6.87474162877222e-06,
"loss": 0.1554,
"step": 1025
},
{
"epoch": 0.3830776383077638,
"grad_norm": 2.21875,
"learning_rate": 6.854071930549815e-06,
"loss": 0.1465,
"step": 1030
},
{
"epoch": 0.38493723849372385,
"grad_norm": 2.5,
"learning_rate": 6.833402232327409e-06,
"loss": 0.1462,
"step": 1035
},
{
"epoch": 0.38679683867968384,
"grad_norm": 2.90625,
"learning_rate": 6.812732534105002e-06,
"loss": 0.1509,
"step": 1040
},
{
"epoch": 0.3886564388656439,
"grad_norm": 2.046875,
"learning_rate": 6.792062835882597e-06,
"loss": 0.1497,
"step": 1045
},
{
"epoch": 0.3905160390516039,
"grad_norm": 2.375,
"learning_rate": 6.77139313766019e-06,
"loss": 0.1496,
"step": 1050
},
{
"epoch": 0.3923756392375639,
"grad_norm": 2.328125,
"learning_rate": 6.750723439437785e-06,
"loss": 0.1439,
"step": 1055
},
{
"epoch": 0.39423523942352395,
"grad_norm": 2.1875,
"learning_rate": 6.730053741215378e-06,
"loss": 0.1433,
"step": 1060
},
{
"epoch": 0.39609483960948394,
"grad_norm": 2.015625,
"learning_rate": 6.7093840429929725e-06,
"loss": 0.1542,
"step": 1065
},
{
"epoch": 0.397954439795444,
"grad_norm": 2.125,
"learning_rate": 6.6887143447705674e-06,
"loss": 0.1373,
"step": 1070
},
{
"epoch": 0.399814039981404,
"grad_norm": 2.328125,
"learning_rate": 6.668044646548161e-06,
"loss": 0.1475,
"step": 1075
},
{
"epoch": 0.400185960018596,
"eval_loss": 0.16861507296562195,
"eval_runtime": 33.5378,
"eval_samples_per_second": 306.669,
"eval_steps_per_second": 9.601,
"step": 1076
},
{
"epoch": 0.401673640167364,
"grad_norm": 2.3125,
"learning_rate": 6.647374948325756e-06,
"loss": 0.1518,
"step": 1080
},
{
"epoch": 0.40353324035332405,
"grad_norm": 2.125,
"learning_rate": 6.626705250103349e-06,
"loss": 0.1466,
"step": 1085
},
{
"epoch": 0.40539284053928404,
"grad_norm": 2.328125,
"learning_rate": 6.606035551880943e-06,
"loss": 0.1472,
"step": 1090
},
{
"epoch": 0.4072524407252441,
"grad_norm": 2.46875,
"learning_rate": 6.585365853658538e-06,
"loss": 0.1469,
"step": 1095
},
{
"epoch": 0.40911204091120407,
"grad_norm": 2.015625,
"learning_rate": 6.564696155436131e-06,
"loss": 0.1413,
"step": 1100
},
{
"epoch": 0.4109716410971641,
"grad_norm": 2.078125,
"learning_rate": 6.544026457213726e-06,
"loss": 0.151,
"step": 1105
},
{
"epoch": 0.41283124128312415,
"grad_norm": 2.1875,
"learning_rate": 6.523356758991319e-06,
"loss": 0.1492,
"step": 1110
},
{
"epoch": 0.41469084146908414,
"grad_norm": 2.640625,
"learning_rate": 6.502687060768913e-06,
"loss": 0.1415,
"step": 1115
},
{
"epoch": 0.4165504416550442,
"grad_norm": 2.40625,
"learning_rate": 6.482017362546507e-06,
"loss": 0.1425,
"step": 1120
},
{
"epoch": 0.41841004184100417,
"grad_norm": 2.25,
"learning_rate": 6.4613476643241015e-06,
"loss": 0.1482,
"step": 1125
},
{
"epoch": 0.4202696420269642,
"grad_norm": 2.140625,
"learning_rate": 6.440677966101695e-06,
"loss": 0.1435,
"step": 1130
},
{
"epoch": 0.4221292422129242,
"grad_norm": 2.1875,
"learning_rate": 6.42000826787929e-06,
"loss": 0.1448,
"step": 1135
},
{
"epoch": 0.42398884239888424,
"grad_norm": 2.015625,
"learning_rate": 6.399338569656883e-06,
"loss": 0.1452,
"step": 1140
},
{
"epoch": 0.4258484425848443,
"grad_norm": 2.234375,
"learning_rate": 6.378668871434478e-06,
"loss": 0.1498,
"step": 1145
},
{
"epoch": 0.42770804277080426,
"grad_norm": 2.03125,
"learning_rate": 6.357999173212072e-06,
"loss": 0.146,
"step": 1150
},
{
"epoch": 0.4295676429567643,
"grad_norm": 2.234375,
"learning_rate": 6.337329474989665e-06,
"loss": 0.1426,
"step": 1155
},
{
"epoch": 0.4314272431427243,
"grad_norm": 2.046875,
"learning_rate": 6.31665977676726e-06,
"loss": 0.1387,
"step": 1160
},
{
"epoch": 0.43328684332868433,
"grad_norm": 2.171875,
"learning_rate": 6.295990078544853e-06,
"loss": 0.1495,
"step": 1165
},
{
"epoch": 0.4351464435146444,
"grad_norm": 2.15625,
"learning_rate": 6.275320380322448e-06,
"loss": 0.1367,
"step": 1170
},
{
"epoch": 0.43700604370060436,
"grad_norm": 2.171875,
"learning_rate": 6.254650682100042e-06,
"loss": 0.1469,
"step": 1175
},
{
"epoch": 0.4388656438865644,
"grad_norm": 2.328125,
"learning_rate": 6.2339809838776355e-06,
"loss": 0.1466,
"step": 1180
},
{
"epoch": 0.4407252440725244,
"grad_norm": 2.09375,
"learning_rate": 6.2133112856552304e-06,
"loss": 0.1382,
"step": 1185
},
{
"epoch": 0.44258484425848443,
"grad_norm": 2.078125,
"learning_rate": 6.192641587432824e-06,
"loss": 0.146,
"step": 1190
},
{
"epoch": 0.4444444444444444,
"grad_norm": 2.28125,
"learning_rate": 6.171971889210419e-06,
"loss": 0.1453,
"step": 1195
},
{
"epoch": 0.44630404463040446,
"grad_norm": 2.28125,
"learning_rate": 6.151302190988012e-06,
"loss": 0.1559,
"step": 1200
},
{
"epoch": 0.4481636448163645,
"grad_norm": 2.203125,
"learning_rate": 6.130632492765606e-06,
"loss": 0.1461,
"step": 1205
},
{
"epoch": 0.4500232450023245,
"grad_norm": 2.6875,
"learning_rate": 6.109962794543201e-06,
"loss": 0.1512,
"step": 1210
},
{
"epoch": 0.45188284518828453,
"grad_norm": 2.03125,
"learning_rate": 6.089293096320794e-06,
"loss": 0.1455,
"step": 1215
},
{
"epoch": 0.4537424453742445,
"grad_norm": 2.203125,
"learning_rate": 6.068623398098388e-06,
"loss": 0.1458,
"step": 1220
},
{
"epoch": 0.45560204556020456,
"grad_norm": 2.078125,
"learning_rate": 6.047953699875982e-06,
"loss": 0.1452,
"step": 1225
},
{
"epoch": 0.45746164574616455,
"grad_norm": 2.109375,
"learning_rate": 6.027284001653576e-06,
"loss": 0.1427,
"step": 1230
},
{
"epoch": 0.4593212459321246,
"grad_norm": 2.1875,
"learning_rate": 6.00661430343117e-06,
"loss": 0.1433,
"step": 1235
},
{
"epoch": 0.46118084611808463,
"grad_norm": 2.28125,
"learning_rate": 5.9859446052087645e-06,
"loss": 0.143,
"step": 1240
},
{
"epoch": 0.4630404463040446,
"grad_norm": 2.40625,
"learning_rate": 5.965274906986358e-06,
"loss": 0.1357,
"step": 1245
},
{
"epoch": 0.46490004649000466,
"grad_norm": 2.046875,
"learning_rate": 5.944605208763953e-06,
"loss": 0.1469,
"step": 1250
},
{
"epoch": 0.46675964667596465,
"grad_norm": 2.09375,
"learning_rate": 5.923935510541547e-06,
"loss": 0.1466,
"step": 1255
},
{
"epoch": 0.4686192468619247,
"grad_norm": 2.25,
"learning_rate": 5.903265812319141e-06,
"loss": 0.1451,
"step": 1260
},
{
"epoch": 0.47047884704788473,
"grad_norm": 2.125,
"learning_rate": 5.882596114096735e-06,
"loss": 0.1475,
"step": 1265
},
{
"epoch": 0.4723384472338447,
"grad_norm": 1.8515625,
"learning_rate": 5.861926415874328e-06,
"loss": 0.1412,
"step": 1270
},
{
"epoch": 0.47419804741980476,
"grad_norm": 2.078125,
"learning_rate": 5.841256717651923e-06,
"loss": 0.1461,
"step": 1275
},
{
"epoch": 0.47605764760576474,
"grad_norm": 2.0625,
"learning_rate": 5.820587019429516e-06,
"loss": 0.1437,
"step": 1280
},
{
"epoch": 0.4779172477917248,
"grad_norm": 2.625,
"learning_rate": 5.799917321207111e-06,
"loss": 0.145,
"step": 1285
},
{
"epoch": 0.4797768479776848,
"grad_norm": 2.359375,
"learning_rate": 5.779247622984705e-06,
"loss": 0.1407,
"step": 1290
},
{
"epoch": 0.4816364481636448,
"grad_norm": 2.171875,
"learning_rate": 5.7585779247622985e-06,
"loss": 0.1455,
"step": 1295
},
{
"epoch": 0.48349604834960486,
"grad_norm": 2.015625,
"learning_rate": 5.7379082265398934e-06,
"loss": 0.1437,
"step": 1300
},
{
"epoch": 0.48535564853556484,
"grad_norm": 2.171875,
"learning_rate": 5.717238528317487e-06,
"loss": 0.1423,
"step": 1305
},
{
"epoch": 0.4872152487215249,
"grad_norm": 2.265625,
"learning_rate": 5.696568830095081e-06,
"loss": 0.1538,
"step": 1310
},
{
"epoch": 0.48907484890748487,
"grad_norm": 2.125,
"learning_rate": 5.675899131872676e-06,
"loss": 0.1477,
"step": 1315
},
{
"epoch": 0.4909344490934449,
"grad_norm": 2.21875,
"learning_rate": 5.655229433650269e-06,
"loss": 0.1467,
"step": 1320
},
{
"epoch": 0.49279404927940496,
"grad_norm": 2.140625,
"learning_rate": 5.634559735427864e-06,
"loss": 0.1357,
"step": 1325
},
{
"epoch": 0.49465364946536494,
"grad_norm": 2.328125,
"learning_rate": 5.613890037205457e-06,
"loss": 0.1483,
"step": 1330
},
{
"epoch": 0.496513249651325,
"grad_norm": 2.359375,
"learning_rate": 5.593220338983051e-06,
"loss": 0.1485,
"step": 1335
},
{
"epoch": 0.49837284983728497,
"grad_norm": 2.078125,
"learning_rate": 5.572550640760645e-06,
"loss": 0.1472,
"step": 1340
},
{
"epoch": 0.500232450023245,
"grad_norm": 2.46875,
"learning_rate": 5.551880942538239e-06,
"loss": 0.1515,
"step": 1345
},
{
"epoch": 0.500232450023245,
"eval_loss": 0.16567149758338928,
"eval_runtime": 33.5148,
"eval_samples_per_second": 306.88,
"eval_steps_per_second": 9.608,
"step": 1345
},
{
"epoch": 0.502092050209205,
"grad_norm": 2.25,
"learning_rate": 5.531211244315834e-06,
"loss": 0.1434,
"step": 1350
},
{
"epoch": 0.503951650395165,
"grad_norm": 2.515625,
"learning_rate": 5.5105415460934275e-06,
"loss": 0.1489,
"step": 1355
},
{
"epoch": 0.5058112505811251,
"grad_norm": 2.125,
"learning_rate": 5.489871847871021e-06,
"loss": 0.1423,
"step": 1360
},
{
"epoch": 0.5076708507670851,
"grad_norm": 1.984375,
"learning_rate": 5.469202149648616e-06,
"loss": 0.1532,
"step": 1365
},
{
"epoch": 0.509530450953045,
"grad_norm": 2.328125,
"learning_rate": 5.44853245142621e-06,
"loss": 0.1474,
"step": 1370
},
{
"epoch": 0.5113900511390052,
"grad_norm": 2.125,
"learning_rate": 5.427862753203804e-06,
"loss": 0.141,
"step": 1375
},
{
"epoch": 0.5132496513249651,
"grad_norm": 2.21875,
"learning_rate": 5.407193054981398e-06,
"loss": 0.1432,
"step": 1380
},
{
"epoch": 0.5151092515109251,
"grad_norm": 1.984375,
"learning_rate": 5.386523356758991e-06,
"loss": 0.1511,
"step": 1385
},
{
"epoch": 0.5169688516968852,
"grad_norm": 2.265625,
"learning_rate": 5.365853658536586e-06,
"loss": 0.1479,
"step": 1390
},
{
"epoch": 0.5188284518828452,
"grad_norm": 2.078125,
"learning_rate": 5.34518396031418e-06,
"loss": 0.1377,
"step": 1395
},
{
"epoch": 0.5206880520688052,
"grad_norm": 1.9609375,
"learning_rate": 5.324514262091773e-06,
"loss": 0.1413,
"step": 1400
},
{
"epoch": 0.5225476522547652,
"grad_norm": 2.25,
"learning_rate": 5.303844563869368e-06,
"loss": 0.1357,
"step": 1405
},
{
"epoch": 0.5244072524407253,
"grad_norm": 1.96875,
"learning_rate": 5.2831748656469615e-06,
"loss": 0.1454,
"step": 1410
},
{
"epoch": 0.5262668526266853,
"grad_norm": 2.125,
"learning_rate": 5.2625051674245564e-06,
"loss": 0.1455,
"step": 1415
},
{
"epoch": 0.5281264528126453,
"grad_norm": 2.203125,
"learning_rate": 5.24183546920215e-06,
"loss": 0.1477,
"step": 1420
},
{
"epoch": 0.5299860529986054,
"grad_norm": 1.890625,
"learning_rate": 5.221165770979744e-06,
"loss": 0.1383,
"step": 1425
},
{
"epoch": 0.5318456531845653,
"grad_norm": 2.21875,
"learning_rate": 5.200496072757339e-06,
"loss": 0.1435,
"step": 1430
},
{
"epoch": 0.5337052533705253,
"grad_norm": 2.328125,
"learning_rate": 5.179826374534932e-06,
"loss": 0.1492,
"step": 1435
},
{
"epoch": 0.5355648535564853,
"grad_norm": 2.078125,
"learning_rate": 5.159156676312527e-06,
"loss": 0.1387,
"step": 1440
},
{
"epoch": 0.5374244537424454,
"grad_norm": 2.578125,
"learning_rate": 5.13848697809012e-06,
"loss": 0.1475,
"step": 1445
},
{
"epoch": 0.5392840539284054,
"grad_norm": 2.296875,
"learning_rate": 5.117817279867714e-06,
"loss": 0.1459,
"step": 1450
},
{
"epoch": 0.5411436541143654,
"grad_norm": 2.453125,
"learning_rate": 5.097147581645308e-06,
"loss": 0.1536,
"step": 1455
},
{
"epoch": 0.5430032543003255,
"grad_norm": 2.0625,
"learning_rate": 5.076477883422902e-06,
"loss": 0.1467,
"step": 1460
},
{
"epoch": 0.5448628544862855,
"grad_norm": 2.34375,
"learning_rate": 5.055808185200497e-06,
"loss": 0.1452,
"step": 1465
},
{
"epoch": 0.5467224546722455,
"grad_norm": 2.25,
"learning_rate": 5.0351384869780905e-06,
"loss": 0.1467,
"step": 1470
},
{
"epoch": 0.5485820548582054,
"grad_norm": 2.25,
"learning_rate": 5.0144687887556846e-06,
"loss": 0.1422,
"step": 1475
},
{
"epoch": 0.5504416550441655,
"grad_norm": 2.046875,
"learning_rate": 4.993799090533279e-06,
"loss": 0.1348,
"step": 1480
},
{
"epoch": 0.5523012552301255,
"grad_norm": 2.359375,
"learning_rate": 4.973129392310873e-06,
"loss": 0.1468,
"step": 1485
},
{
"epoch": 0.5541608554160855,
"grad_norm": 2.046875,
"learning_rate": 4.952459694088467e-06,
"loss": 0.1431,
"step": 1490
},
{
"epoch": 0.5560204556020456,
"grad_norm": 2.234375,
"learning_rate": 4.931789995866061e-06,
"loss": 0.1513,
"step": 1495
},
{
"epoch": 0.5578800557880056,
"grad_norm": 2.03125,
"learning_rate": 4.911120297643655e-06,
"loss": 0.1388,
"step": 1500
},
{
"epoch": 0.5597396559739656,
"grad_norm": 2.078125,
"learning_rate": 4.890450599421249e-06,
"loss": 0.1392,
"step": 1505
},
{
"epoch": 0.5615992561599256,
"grad_norm": 1.9609375,
"learning_rate": 4.869780901198843e-06,
"loss": 0.1417,
"step": 1510
},
{
"epoch": 0.5634588563458857,
"grad_norm": 2.234375,
"learning_rate": 4.849111202976437e-06,
"loss": 0.1361,
"step": 1515
},
{
"epoch": 0.5653184565318456,
"grad_norm": 2.25,
"learning_rate": 4.828441504754031e-06,
"loss": 0.1434,
"step": 1520
},
{
"epoch": 0.5671780567178056,
"grad_norm": 2.390625,
"learning_rate": 4.8077718065316245e-06,
"loss": 0.1496,
"step": 1525
},
{
"epoch": 0.5690376569037657,
"grad_norm": 2.046875,
"learning_rate": 4.787102108309219e-06,
"loss": 0.1463,
"step": 1530
},
{
"epoch": 0.5708972570897257,
"grad_norm": 2.046875,
"learning_rate": 4.7664324100868135e-06,
"loss": 0.1323,
"step": 1535
},
{
"epoch": 0.5727568572756857,
"grad_norm": 1.9609375,
"learning_rate": 4.745762711864408e-06,
"loss": 0.1437,
"step": 1540
},
{
"epoch": 0.5746164574616457,
"grad_norm": 2.25,
"learning_rate": 4.725093013642002e-06,
"loss": 0.1543,
"step": 1545
},
{
"epoch": 0.5764760576476058,
"grad_norm": 2.171875,
"learning_rate": 4.704423315419595e-06,
"loss": 0.1409,
"step": 1550
},
{
"epoch": 0.5783356578335658,
"grad_norm": 1.9375,
"learning_rate": 4.683753617197189e-06,
"loss": 0.1439,
"step": 1555
},
{
"epoch": 0.5801952580195258,
"grad_norm": 2.015625,
"learning_rate": 4.663083918974783e-06,
"loss": 0.1395,
"step": 1560
},
{
"epoch": 0.5820548582054859,
"grad_norm": 2.0625,
"learning_rate": 4.642414220752377e-06,
"loss": 0.1416,
"step": 1565
},
{
"epoch": 0.5839144583914458,
"grad_norm": 2.734375,
"learning_rate": 4.621744522529971e-06,
"loss": 0.1385,
"step": 1570
},
{
"epoch": 0.5857740585774058,
"grad_norm": 2.1875,
"learning_rate": 4.601074824307565e-06,
"loss": 0.1382,
"step": 1575
},
{
"epoch": 0.5876336587633659,
"grad_norm": 2.109375,
"learning_rate": 4.580405126085159e-06,
"loss": 0.1378,
"step": 1580
},
{
"epoch": 0.5894932589493259,
"grad_norm": 2.078125,
"learning_rate": 4.5597354278627535e-06,
"loss": 0.1369,
"step": 1585
},
{
"epoch": 0.5913528591352859,
"grad_norm": 1.9921875,
"learning_rate": 4.5390657296403476e-06,
"loss": 0.145,
"step": 1590
},
{
"epoch": 0.5932124593212459,
"grad_norm": 2.078125,
"learning_rate": 4.518396031417942e-06,
"loss": 0.1376,
"step": 1595
},
{
"epoch": 0.595072059507206,
"grad_norm": 2.46875,
"learning_rate": 4.497726333195536e-06,
"loss": 0.1398,
"step": 1600
},
{
"epoch": 0.596931659693166,
"grad_norm": 2.0,
"learning_rate": 4.47705663497313e-06,
"loss": 0.144,
"step": 1605
},
{
"epoch": 0.598791259879126,
"grad_norm": 2.078125,
"learning_rate": 4.456386936750724e-06,
"loss": 0.1344,
"step": 1610
},
{
"epoch": 0.600278940027894,
"eval_loss": 0.16357110440731049,
"eval_runtime": 33.5259,
"eval_samples_per_second": 306.778,
"eval_steps_per_second": 9.605,
"step": 1614
},
{
"epoch": 0.6006508600650861,
"grad_norm": 2.140625,
"learning_rate": 4.435717238528318e-06,
"loss": 0.1411,
"step": 1615
},
{
"epoch": 0.602510460251046,
"grad_norm": 2.15625,
"learning_rate": 4.415047540305912e-06,
"loss": 0.1454,
"step": 1620
},
{
"epoch": 0.604370060437006,
"grad_norm": 1.984375,
"learning_rate": 4.394377842083506e-06,
"loss": 0.1457,
"step": 1625
},
{
"epoch": 0.606229660622966,
"grad_norm": 2.78125,
"learning_rate": 4.3737081438611e-06,
"loss": 0.1404,
"step": 1630
},
{
"epoch": 0.6080892608089261,
"grad_norm": 1.9375,
"learning_rate": 4.353038445638694e-06,
"loss": 0.1401,
"step": 1635
},
{
"epoch": 0.6099488609948861,
"grad_norm": 1.9921875,
"learning_rate": 4.3323687474162875e-06,
"loss": 0.1373,
"step": 1640
},
{
"epoch": 0.6118084611808461,
"grad_norm": 2.078125,
"learning_rate": 4.3116990491938824e-06,
"loss": 0.1409,
"step": 1645
},
{
"epoch": 0.6136680613668062,
"grad_norm": 1.984375,
"learning_rate": 4.2910293509714765e-06,
"loss": 0.1416,
"step": 1650
},
{
"epoch": 0.6155276615527662,
"grad_norm": 2.140625,
"learning_rate": 4.270359652749071e-06,
"loss": 0.1458,
"step": 1655
},
{
"epoch": 0.6173872617387262,
"grad_norm": 2.25,
"learning_rate": 4.249689954526664e-06,
"loss": 0.1459,
"step": 1660
},
{
"epoch": 0.6192468619246861,
"grad_norm": 1.7734375,
"learning_rate": 4.229020256304258e-06,
"loss": 0.1383,
"step": 1665
},
{
"epoch": 0.6211064621106462,
"grad_norm": 2.109375,
"learning_rate": 4.208350558081852e-06,
"loss": 0.1451,
"step": 1670
},
{
"epoch": 0.6229660622966062,
"grad_norm": 2.03125,
"learning_rate": 4.187680859859447e-06,
"loss": 0.1385,
"step": 1675
},
{
"epoch": 0.6248256624825662,
"grad_norm": 2.3125,
"learning_rate": 4.167011161637041e-06,
"loss": 0.141,
"step": 1680
},
{
"epoch": 0.6266852626685263,
"grad_norm": 2.265625,
"learning_rate": 4.146341463414634e-06,
"loss": 0.1428,
"step": 1685
},
{
"epoch": 0.6285448628544863,
"grad_norm": 2.03125,
"learning_rate": 4.125671765192228e-06,
"loss": 0.1425,
"step": 1690
},
{
"epoch": 0.6304044630404463,
"grad_norm": 1.890625,
"learning_rate": 4.105002066969822e-06,
"loss": 0.1369,
"step": 1695
},
{
"epoch": 0.6322640632264063,
"grad_norm": 2.0625,
"learning_rate": 4.0843323687474165e-06,
"loss": 0.1351,
"step": 1700
},
{
"epoch": 0.6341236634123664,
"grad_norm": 2.09375,
"learning_rate": 4.0636626705250106e-06,
"loss": 0.1393,
"step": 1705
},
{
"epoch": 0.6359832635983264,
"grad_norm": 2.203125,
"learning_rate": 4.042992972302605e-06,
"loss": 0.137,
"step": 1710
},
{
"epoch": 0.6378428637842863,
"grad_norm": 2.265625,
"learning_rate": 4.022323274080199e-06,
"loss": 0.1409,
"step": 1715
},
{
"epoch": 0.6397024639702464,
"grad_norm": 1.9453125,
"learning_rate": 4.001653575857793e-06,
"loss": 0.1459,
"step": 1720
},
{
"epoch": 0.6415620641562064,
"grad_norm": 1.8359375,
"learning_rate": 3.980983877635387e-06,
"loss": 0.1385,
"step": 1725
},
{
"epoch": 0.6434216643421664,
"grad_norm": 2.015625,
"learning_rate": 3.960314179412981e-06,
"loss": 0.153,
"step": 1730
},
{
"epoch": 0.6452812645281265,
"grad_norm": 2.1875,
"learning_rate": 3.939644481190575e-06,
"loss": 0.1397,
"step": 1735
},
{
"epoch": 0.6471408647140865,
"grad_norm": 2.078125,
"learning_rate": 3.918974782968169e-06,
"loss": 0.1405,
"step": 1740
},
{
"epoch": 0.6490004649000465,
"grad_norm": 1.8984375,
"learning_rate": 3.898305084745763e-06,
"loss": 0.1391,
"step": 1745
},
{
"epoch": 0.6508600650860065,
"grad_norm": 2.21875,
"learning_rate": 3.8776353865233564e-06,
"loss": 0.1409,
"step": 1750
},
{
"epoch": 0.6527196652719666,
"grad_norm": 2.21875,
"learning_rate": 3.856965688300951e-06,
"loss": 0.1402,
"step": 1755
},
{
"epoch": 0.6545792654579266,
"grad_norm": 2.21875,
"learning_rate": 3.8362959900785454e-06,
"loss": 0.1502,
"step": 1760
},
{
"epoch": 0.6564388656438865,
"grad_norm": 2.109375,
"learning_rate": 3.8156262918561395e-06,
"loss": 0.1408,
"step": 1765
},
{
"epoch": 0.6582984658298466,
"grad_norm": 2.25,
"learning_rate": 3.7949565936337336e-06,
"loss": 0.1445,
"step": 1770
},
{
"epoch": 0.6601580660158066,
"grad_norm": 1.890625,
"learning_rate": 3.7742868954113273e-06,
"loss": 0.1417,
"step": 1775
},
{
"epoch": 0.6620176662017666,
"grad_norm": 2.09375,
"learning_rate": 3.7536171971889213e-06,
"loss": 0.1402,
"step": 1780
},
{
"epoch": 0.6638772663877266,
"grad_norm": 2.03125,
"learning_rate": 3.7329474989665154e-06,
"loss": 0.1428,
"step": 1785
},
{
"epoch": 0.6657368665736867,
"grad_norm": 2.015625,
"learning_rate": 3.7122778007441095e-06,
"loss": 0.1408,
"step": 1790
},
{
"epoch": 0.6675964667596467,
"grad_norm": 1.921875,
"learning_rate": 3.6916081025217036e-06,
"loss": 0.1428,
"step": 1795
},
{
"epoch": 0.6694560669456067,
"grad_norm": 2.515625,
"learning_rate": 3.6709384042992972e-06,
"loss": 0.1463,
"step": 1800
},
{
"epoch": 0.6713156671315668,
"grad_norm": 2.109375,
"learning_rate": 3.6502687060768917e-06,
"loss": 0.1365,
"step": 1805
},
{
"epoch": 0.6731752673175267,
"grad_norm": 2.015625,
"learning_rate": 3.629599007854486e-06,
"loss": 0.1362,
"step": 1810
},
{
"epoch": 0.6750348675034867,
"grad_norm": 2.3125,
"learning_rate": 3.60892930963208e-06,
"loss": 0.1378,
"step": 1815
},
{
"epoch": 0.6768944676894467,
"grad_norm": 1.96875,
"learning_rate": 3.5882596114096736e-06,
"loss": 0.1323,
"step": 1820
},
{
"epoch": 0.6787540678754068,
"grad_norm": 2.015625,
"learning_rate": 3.5675899131872676e-06,
"loss": 0.1406,
"step": 1825
},
{
"epoch": 0.6806136680613668,
"grad_norm": 2.15625,
"learning_rate": 3.5469202149648617e-06,
"loss": 0.1416,
"step": 1830
},
{
"epoch": 0.6824732682473268,
"grad_norm": 1.953125,
"learning_rate": 3.526250516742456e-06,
"loss": 0.1347,
"step": 1835
},
{
"epoch": 0.6843328684332869,
"grad_norm": 2.21875,
"learning_rate": 3.5055808185200503e-06,
"loss": 0.148,
"step": 1840
},
{
"epoch": 0.6861924686192469,
"grad_norm": 2.015625,
"learning_rate": 3.484911120297644e-06,
"loss": 0.1443,
"step": 1845
},
{
"epoch": 0.6880520688052069,
"grad_norm": 1.953125,
"learning_rate": 3.464241422075238e-06,
"loss": 0.138,
"step": 1850
},
{
"epoch": 0.6899116689911668,
"grad_norm": 2.015625,
"learning_rate": 3.443571723852832e-06,
"loss": 0.1466,
"step": 1855
},
{
"epoch": 0.691771269177127,
"grad_norm": 2.34375,
"learning_rate": 3.422902025630426e-06,
"loss": 0.1403,
"step": 1860
},
{
"epoch": 0.6936308693630869,
"grad_norm": 2.03125,
"learning_rate": 3.40223232740802e-06,
"loss": 0.1361,
"step": 1865
},
{
"epoch": 0.6954904695490469,
"grad_norm": 2.265625,
"learning_rate": 3.381562629185614e-06,
"loss": 0.1393,
"step": 1870
},
{
"epoch": 0.697350069735007,
"grad_norm": 2.09375,
"learning_rate": 3.360892930963208e-06,
"loss": 0.1352,
"step": 1875
},
{
"epoch": 0.699209669920967,
"grad_norm": 2.34375,
"learning_rate": 3.3402232327408025e-06,
"loss": 0.1387,
"step": 1880
},
{
"epoch": 0.700325430032543,
"eval_loss": 0.16304655373096466,
"eval_runtime": 33.5474,
"eval_samples_per_second": 306.581,
"eval_steps_per_second": 9.598,
"step": 1883
},
{
"epoch": 0.701069270106927,
"grad_norm": 2.015625,
"learning_rate": 3.3195535345183966e-06,
"loss": 0.1373,
"step": 1885
},
{
"epoch": 0.702928870292887,
"grad_norm": 2.046875,
"learning_rate": 3.2988838362959903e-06,
"loss": 0.1392,
"step": 1890
},
{
"epoch": 0.7047884704788471,
"grad_norm": 1.9375,
"learning_rate": 3.2782141380735843e-06,
"loss": 0.1453,
"step": 1895
},
{
"epoch": 0.7066480706648071,
"grad_norm": 1.8359375,
"learning_rate": 3.2575444398511784e-06,
"loss": 0.141,
"step": 1900
},
{
"epoch": 0.708507670850767,
"grad_norm": 2.015625,
"learning_rate": 3.2368747416287725e-06,
"loss": 0.1346,
"step": 1905
},
{
"epoch": 0.7103672710367271,
"grad_norm": 2.046875,
"learning_rate": 3.216205043406366e-06,
"loss": 0.1375,
"step": 1910
},
{
"epoch": 0.7122268712226871,
"grad_norm": 1.9609375,
"learning_rate": 3.1955353451839607e-06,
"loss": 0.1433,
"step": 1915
},
{
"epoch": 0.7140864714086471,
"grad_norm": 2.375,
"learning_rate": 3.1748656469615547e-06,
"loss": 0.1487,
"step": 1920
},
{
"epoch": 0.7159460715946072,
"grad_norm": 2.25,
"learning_rate": 3.154195948739149e-06,
"loss": 0.1439,
"step": 1925
},
{
"epoch": 0.7178056717805672,
"grad_norm": 2.140625,
"learning_rate": 3.133526250516743e-06,
"loss": 0.1362,
"step": 1930
},
{
"epoch": 0.7196652719665272,
"grad_norm": 2.15625,
"learning_rate": 3.1128565522943366e-06,
"loss": 0.1373,
"step": 1935
},
{
"epoch": 0.7215248721524872,
"grad_norm": 2.453125,
"learning_rate": 3.0921868540719306e-06,
"loss": 0.1476,
"step": 1940
},
{
"epoch": 0.7233844723384473,
"grad_norm": 2.15625,
"learning_rate": 3.0715171558495247e-06,
"loss": 0.1363,
"step": 1945
},
{
"epoch": 0.7252440725244073,
"grad_norm": 2.265625,
"learning_rate": 3.0508474576271192e-06,
"loss": 0.1338,
"step": 1950
},
{
"epoch": 0.7271036727103672,
"grad_norm": 2.140625,
"learning_rate": 3.030177759404713e-06,
"loss": 0.1338,
"step": 1955
},
{
"epoch": 0.7289632728963273,
"grad_norm": 2.25,
"learning_rate": 3.009508061182307e-06,
"loss": 0.1391,
"step": 1960
},
{
"epoch": 0.7308228730822873,
"grad_norm": 2.140625,
"learning_rate": 2.988838362959901e-06,
"loss": 0.146,
"step": 1965
},
{
"epoch": 0.7326824732682473,
"grad_norm": 2.140625,
"learning_rate": 2.968168664737495e-06,
"loss": 0.138,
"step": 1970
},
{
"epoch": 0.7345420734542073,
"grad_norm": 2.078125,
"learning_rate": 2.947498966515089e-06,
"loss": 0.1395,
"step": 1975
},
{
"epoch": 0.7364016736401674,
"grad_norm": 2.125,
"learning_rate": 2.926829268292683e-06,
"loss": 0.1409,
"step": 1980
},
{
"epoch": 0.7382612738261274,
"grad_norm": 2.265625,
"learning_rate": 2.906159570070277e-06,
"loss": 0.1404,
"step": 1985
},
{
"epoch": 0.7401208740120874,
"grad_norm": 2.046875,
"learning_rate": 2.8854898718478715e-06,
"loss": 0.1371,
"step": 1990
},
{
"epoch": 0.7419804741980475,
"grad_norm": 2.234375,
"learning_rate": 2.8648201736254655e-06,
"loss": 0.1385,
"step": 1995
},
{
"epoch": 0.7438400743840075,
"grad_norm": 2.265625,
"learning_rate": 2.844150475403059e-06,
"loss": 0.1393,
"step": 2000
},
{
"epoch": 0.7456996745699674,
"grad_norm": 1.8515625,
"learning_rate": 2.8234807771806533e-06,
"loss": 0.1329,
"step": 2005
},
{
"epoch": 0.7475592747559274,
"grad_norm": 1.9765625,
"learning_rate": 2.8028110789582473e-06,
"loss": 0.1418,
"step": 2010
},
{
"epoch": 0.7494188749418875,
"grad_norm": 2.046875,
"learning_rate": 2.7821413807358414e-06,
"loss": 0.1366,
"step": 2015
},
{
"epoch": 0.7512784751278475,
"grad_norm": 2.140625,
"learning_rate": 2.761471682513436e-06,
"loss": 0.1378,
"step": 2020
},
{
"epoch": 0.7531380753138075,
"grad_norm": 2.078125,
"learning_rate": 2.7408019842910296e-06,
"loss": 0.1392,
"step": 2025
},
{
"epoch": 0.7549976754997676,
"grad_norm": 2.34375,
"learning_rate": 2.7201322860686237e-06,
"loss": 0.141,
"step": 2030
},
{
"epoch": 0.7568572756857276,
"grad_norm": 2.46875,
"learning_rate": 2.6994625878462178e-06,
"loss": 0.1391,
"step": 2035
},
{
"epoch": 0.7587168758716876,
"grad_norm": 2.140625,
"learning_rate": 2.678792889623812e-06,
"loss": 0.1408,
"step": 2040
},
{
"epoch": 0.7605764760576476,
"grad_norm": 2.140625,
"learning_rate": 2.6581231914014055e-06,
"loss": 0.1382,
"step": 2045
},
{
"epoch": 0.7624360762436077,
"grad_norm": 2.171875,
"learning_rate": 2.6374534931789996e-06,
"loss": 0.1424,
"step": 2050
},
{
"epoch": 0.7642956764295676,
"grad_norm": 2.0,
"learning_rate": 2.6167837949565936e-06,
"loss": 0.1297,
"step": 2055
},
{
"epoch": 0.7661552766155276,
"grad_norm": 1.921875,
"learning_rate": 2.596114096734188e-06,
"loss": 0.1398,
"step": 2060
},
{
"epoch": 0.7680148768014877,
"grad_norm": 2.5,
"learning_rate": 2.5754443985117822e-06,
"loss": 0.1418,
"step": 2065
},
{
"epoch": 0.7698744769874477,
"grad_norm": 2.15625,
"learning_rate": 2.554774700289376e-06,
"loss": 0.137,
"step": 2070
},
{
"epoch": 0.7717340771734077,
"grad_norm": 1.9296875,
"learning_rate": 2.53410500206697e-06,
"loss": 0.1409,
"step": 2075
},
{
"epoch": 0.7735936773593677,
"grad_norm": 2.015625,
"learning_rate": 2.513435303844564e-06,
"loss": 0.1345,
"step": 2080
},
{
"epoch": 0.7754532775453278,
"grad_norm": 1.8828125,
"learning_rate": 2.492765605622158e-06,
"loss": 0.1445,
"step": 2085
},
{
"epoch": 0.7773128777312878,
"grad_norm": 2.34375,
"learning_rate": 2.4720959073997522e-06,
"loss": 0.1414,
"step": 2090
},
{
"epoch": 0.7791724779172478,
"grad_norm": 2.015625,
"learning_rate": 2.4514262091773463e-06,
"loss": 0.1373,
"step": 2095
},
{
"epoch": 0.7810320781032078,
"grad_norm": 2.265625,
"learning_rate": 2.4307565109549404e-06,
"loss": 0.1367,
"step": 2100
},
{
"epoch": 0.7828916782891678,
"grad_norm": 2.109375,
"learning_rate": 2.4100868127325345e-06,
"loss": 0.1332,
"step": 2105
},
{
"epoch": 0.7847512784751278,
"grad_norm": 2.125,
"learning_rate": 2.389417114510128e-06,
"loss": 0.1368,
"step": 2110
},
{
"epoch": 0.7866108786610879,
"grad_norm": 1.984375,
"learning_rate": 2.3687474162877226e-06,
"loss": 0.1439,
"step": 2115
},
{
"epoch": 0.7884704788470479,
"grad_norm": 2.21875,
"learning_rate": 2.3480777180653163e-06,
"loss": 0.1423,
"step": 2120
},
{
"epoch": 0.7903300790330079,
"grad_norm": 2.09375,
"learning_rate": 2.3274080198429104e-06,
"loss": 0.1384,
"step": 2125
},
{
"epoch": 0.7921896792189679,
"grad_norm": 2.34375,
"learning_rate": 2.3067383216205044e-06,
"loss": 0.138,
"step": 2130
},
{
"epoch": 0.794049279404928,
"grad_norm": 2.03125,
"learning_rate": 2.2860686233980985e-06,
"loss": 0.1368,
"step": 2135
},
{
"epoch": 0.795908879590888,
"grad_norm": 2.0625,
"learning_rate": 2.2653989251756926e-06,
"loss": 0.1315,
"step": 2140
},
{
"epoch": 0.797768479776848,
"grad_norm": 2.34375,
"learning_rate": 2.2447292269532867e-06,
"loss": 0.1433,
"step": 2145
},
{
"epoch": 0.799628079962808,
"grad_norm": 2.390625,
"learning_rate": 2.2240595287308808e-06,
"loss": 0.1403,
"step": 2150
},
{
"epoch": 0.800371920037192,
"eval_loss": 0.16228820383548737,
"eval_runtime": 33.5429,
"eval_samples_per_second": 306.622,
"eval_steps_per_second": 9.6,
"step": 2152
},
{
"epoch": 0.801487680148768,
"grad_norm": 2.171875,
"learning_rate": 2.203389830508475e-06,
"loss": 0.138,
"step": 2155
},
{
"epoch": 0.803347280334728,
"grad_norm": 2.234375,
"learning_rate": 2.182720132286069e-06,
"loss": 0.1403,
"step": 2160
},
{
"epoch": 0.805206880520688,
"grad_norm": 1.984375,
"learning_rate": 2.1620504340636626e-06,
"loss": 0.1398,
"step": 2165
},
{
"epoch": 0.8070664807066481,
"grad_norm": 2.078125,
"learning_rate": 2.141380735841257e-06,
"loss": 0.1459,
"step": 2170
},
{
"epoch": 0.8089260808926081,
"grad_norm": 2.171875,
"learning_rate": 2.1207110376188507e-06,
"loss": 0.1374,
"step": 2175
},
{
"epoch": 0.8107856810785681,
"grad_norm": 1.9453125,
"learning_rate": 2.100041339396445e-06,
"loss": 0.138,
"step": 2180
},
{
"epoch": 0.8126452812645282,
"grad_norm": 2.265625,
"learning_rate": 2.079371641174039e-06,
"loss": 0.1389,
"step": 2185
},
{
"epoch": 0.8145048814504882,
"grad_norm": 1.890625,
"learning_rate": 2.058701942951633e-06,
"loss": 0.1393,
"step": 2190
},
{
"epoch": 0.8163644816364481,
"grad_norm": 2.203125,
"learning_rate": 2.038032244729227e-06,
"loss": 0.1341,
"step": 2195
},
{
"epoch": 0.8182240818224081,
"grad_norm": 2.140625,
"learning_rate": 2.017362546506821e-06,
"loss": 0.1413,
"step": 2200
},
{
"epoch": 0.8200836820083682,
"grad_norm": 2.21875,
"learning_rate": 1.9966928482844152e-06,
"loss": 0.1401,
"step": 2205
},
{
"epoch": 0.8219432821943282,
"grad_norm": 2.296875,
"learning_rate": 1.9760231500620093e-06,
"loss": 0.1442,
"step": 2210
},
{
"epoch": 0.8238028823802882,
"grad_norm": 2.140625,
"learning_rate": 1.9553534518396034e-06,
"loss": 0.144,
"step": 2215
},
{
"epoch": 0.8256624825662483,
"grad_norm": 2.125,
"learning_rate": 1.934683753617197e-06,
"loss": 0.1353,
"step": 2220
},
{
"epoch": 0.8275220827522083,
"grad_norm": 2.0,
"learning_rate": 1.9140140553947915e-06,
"loss": 0.1394,
"step": 2225
},
{
"epoch": 0.8293816829381683,
"grad_norm": 1.9765625,
"learning_rate": 1.8933443571723856e-06,
"loss": 0.1385,
"step": 2230
},
{
"epoch": 0.8312412831241283,
"grad_norm": 2.28125,
"learning_rate": 1.8726746589499795e-06,
"loss": 0.1451,
"step": 2235
},
{
"epoch": 0.8331008833100884,
"grad_norm": 2.015625,
"learning_rate": 1.8520049607275736e-06,
"loss": 0.1366,
"step": 2240
},
{
"epoch": 0.8349604834960483,
"grad_norm": 2.203125,
"learning_rate": 1.8313352625051674e-06,
"loss": 0.1348,
"step": 2245
},
{
"epoch": 0.8368200836820083,
"grad_norm": 2.09375,
"learning_rate": 1.8106655642827617e-06,
"loss": 0.1365,
"step": 2250
},
{
"epoch": 0.8386796838679684,
"grad_norm": 2.09375,
"learning_rate": 1.7899958660603556e-06,
"loss": 0.1366,
"step": 2255
},
{
"epoch": 0.8405392840539284,
"grad_norm": 2.09375,
"learning_rate": 1.7693261678379497e-06,
"loss": 0.1444,
"step": 2260
},
{
"epoch": 0.8423988842398884,
"grad_norm": 2.375,
"learning_rate": 1.7486564696155435e-06,
"loss": 0.144,
"step": 2265
},
{
"epoch": 0.8442584844258484,
"grad_norm": 2.09375,
"learning_rate": 1.7279867713931378e-06,
"loss": 0.1403,
"step": 2270
},
{
"epoch": 0.8461180846118085,
"grad_norm": 2.0,
"learning_rate": 1.707317073170732e-06,
"loss": 0.1427,
"step": 2275
},
{
"epoch": 0.8479776847977685,
"grad_norm": 1.9921875,
"learning_rate": 1.6866473749483258e-06,
"loss": 0.1367,
"step": 2280
},
{
"epoch": 0.8498372849837285,
"grad_norm": 2.140625,
"learning_rate": 1.66597767672592e-06,
"loss": 0.1474,
"step": 2285
},
{
"epoch": 0.8516968851696886,
"grad_norm": 2.125,
"learning_rate": 1.645307978503514e-06,
"loss": 0.1476,
"step": 2290
},
{
"epoch": 0.8535564853556485,
"grad_norm": 2.140625,
"learning_rate": 1.624638280281108e-06,
"loss": 0.1423,
"step": 2295
},
{
"epoch": 0.8554160855416085,
"grad_norm": 2.296875,
"learning_rate": 1.603968582058702e-06,
"loss": 0.137,
"step": 2300
},
{
"epoch": 0.8572756857275686,
"grad_norm": 2.046875,
"learning_rate": 1.5832988838362962e-06,
"loss": 0.1332,
"step": 2305
},
{
"epoch": 0.8591352859135286,
"grad_norm": 2.5,
"learning_rate": 1.56262918561389e-06,
"loss": 0.141,
"step": 2310
},
{
"epoch": 0.8609948860994886,
"grad_norm": 2.390625,
"learning_rate": 1.5419594873914841e-06,
"loss": 0.1375,
"step": 2315
},
{
"epoch": 0.8628544862854486,
"grad_norm": 2.046875,
"learning_rate": 1.5212897891690784e-06,
"loss": 0.1413,
"step": 2320
},
{
"epoch": 0.8647140864714087,
"grad_norm": 1.8828125,
"learning_rate": 1.5006200909466723e-06,
"loss": 0.1455,
"step": 2325
},
{
"epoch": 0.8665736866573687,
"grad_norm": 2.234375,
"learning_rate": 1.4799503927242664e-06,
"loss": 0.1442,
"step": 2330
},
{
"epoch": 0.8684332868433287,
"grad_norm": 2.109375,
"learning_rate": 1.4592806945018602e-06,
"loss": 0.1326,
"step": 2335
},
{
"epoch": 0.8702928870292888,
"grad_norm": 2.265625,
"learning_rate": 1.4386109962794545e-06,
"loss": 0.1418,
"step": 2340
},
{
"epoch": 0.8721524872152487,
"grad_norm": 2.125,
"learning_rate": 1.4179412980570484e-06,
"loss": 0.1398,
"step": 2345
},
{
"epoch": 0.8740120874012087,
"grad_norm": 2.0,
"learning_rate": 1.3972715998346425e-06,
"loss": 0.139,
"step": 2350
},
{
"epoch": 0.8758716875871687,
"grad_norm": 2.671875,
"learning_rate": 1.3766019016122364e-06,
"loss": 0.1506,
"step": 2355
},
{
"epoch": 0.8777312877731288,
"grad_norm": 2.09375,
"learning_rate": 1.3559322033898307e-06,
"loss": 0.1396,
"step": 2360
},
{
"epoch": 0.8795908879590888,
"grad_norm": 1.953125,
"learning_rate": 1.3352625051674247e-06,
"loss": 0.1293,
"step": 2365
},
{
"epoch": 0.8814504881450488,
"grad_norm": 2.03125,
"learning_rate": 1.3145928069450186e-06,
"loss": 0.1399,
"step": 2370
},
{
"epoch": 0.8833100883310089,
"grad_norm": 2.046875,
"learning_rate": 1.2939231087226129e-06,
"loss": 0.1365,
"step": 2375
},
{
"epoch": 0.8851696885169689,
"grad_norm": 2.0625,
"learning_rate": 1.2732534105002068e-06,
"loss": 0.1397,
"step": 2380
},
{
"epoch": 0.8870292887029289,
"grad_norm": 2.015625,
"learning_rate": 1.2525837122778008e-06,
"loss": 0.1437,
"step": 2385
},
{
"epoch": 0.8888888888888888,
"grad_norm": 1.984375,
"learning_rate": 1.231914014055395e-06,
"loss": 0.1418,
"step": 2390
},
{
"epoch": 0.8907484890748489,
"grad_norm": 2.03125,
"learning_rate": 1.211244315832989e-06,
"loss": 0.1384,
"step": 2395
},
{
"epoch": 0.8926080892608089,
"grad_norm": 2.359375,
"learning_rate": 1.190574617610583e-06,
"loss": 0.1445,
"step": 2400
},
{
"epoch": 0.8944676894467689,
"grad_norm": 2.140625,
"learning_rate": 1.169904919388177e-06,
"loss": 0.1358,
"step": 2405
},
{
"epoch": 0.896327289632729,
"grad_norm": 2.171875,
"learning_rate": 1.149235221165771e-06,
"loss": 0.1532,
"step": 2410
},
{
"epoch": 0.898186889818689,
"grad_norm": 1.96875,
"learning_rate": 1.1285655229433651e-06,
"loss": 0.1416,
"step": 2415
},
{
"epoch": 0.900046490004649,
"grad_norm": 1.9140625,
"learning_rate": 1.1078958247209592e-06,
"loss": 0.1383,
"step": 2420
},
{
"epoch": 0.900418410041841,
"eval_loss": 0.16189107298851013,
"eval_runtime": 33.532,
"eval_samples_per_second": 306.722,
"eval_steps_per_second": 9.603,
"step": 2421
},
{
"epoch": 0.901906090190609,
"grad_norm": 2.234375,
"learning_rate": 1.0872261264985533e-06,
"loss": 0.1319,
"step": 2425
},
{
"epoch": 0.9037656903765691,
"grad_norm": 2.28125,
"learning_rate": 1.0665564282761474e-06,
"loss": 0.1448,
"step": 2430
},
{
"epoch": 0.905625290562529,
"grad_norm": 2.265625,
"learning_rate": 1.0458867300537414e-06,
"loss": 0.1479,
"step": 2435
},
{
"epoch": 0.907484890748489,
"grad_norm": 2.15625,
"learning_rate": 1.0252170318313353e-06,
"loss": 0.1396,
"step": 2440
},
{
"epoch": 0.9093444909344491,
"grad_norm": 2.203125,
"learning_rate": 1.0045473336089294e-06,
"loss": 0.1494,
"step": 2445
},
{
"epoch": 0.9112040911204091,
"grad_norm": 2.0625,
"learning_rate": 9.838776353865235e-07,
"loss": 0.1362,
"step": 2450
},
{
"epoch": 0.9130636913063691,
"grad_norm": 2.0625,
"learning_rate": 9.632079371641175e-07,
"loss": 0.1371,
"step": 2455
},
{
"epoch": 0.9149232914923291,
"grad_norm": 2.1875,
"learning_rate": 9.425382389417115e-07,
"loss": 0.139,
"step": 2460
},
{
"epoch": 0.9167828916782892,
"grad_norm": 2.03125,
"learning_rate": 9.218685407193055e-07,
"loss": 0.1339,
"step": 2465
},
{
"epoch": 0.9186424918642492,
"grad_norm": 2.21875,
"learning_rate": 9.011988424968997e-07,
"loss": 0.1427,
"step": 2470
},
{
"epoch": 0.9205020920502092,
"grad_norm": 1.9140625,
"learning_rate": 8.805291442744937e-07,
"loss": 0.1302,
"step": 2475
},
{
"epoch": 0.9223616922361693,
"grad_norm": 1.9296875,
"learning_rate": 8.598594460520877e-07,
"loss": 0.1385,
"step": 2480
},
{
"epoch": 0.9242212924221292,
"grad_norm": 2.3125,
"learning_rate": 8.391897478296818e-07,
"loss": 0.1396,
"step": 2485
},
{
"epoch": 0.9260808926080892,
"grad_norm": 2.34375,
"learning_rate": 8.185200496072758e-07,
"loss": 0.1447,
"step": 2490
},
{
"epoch": 0.9279404927940493,
"grad_norm": 2.109375,
"learning_rate": 7.978503513848699e-07,
"loss": 0.1362,
"step": 2495
},
{
"epoch": 0.9298000929800093,
"grad_norm": 2.03125,
"learning_rate": 7.771806531624638e-07,
"loss": 0.1418,
"step": 2500
},
{
"epoch": 0.9316596931659693,
"grad_norm": 1.96875,
"learning_rate": 7.565109549400579e-07,
"loss": 0.1372,
"step": 2505
},
{
"epoch": 0.9335192933519293,
"grad_norm": 1.8984375,
"learning_rate": 7.358412567176519e-07,
"loss": 0.1421,
"step": 2510
},
{
"epoch": 0.9353788935378894,
"grad_norm": 2.171875,
"learning_rate": 7.151715584952461e-07,
"loss": 0.1426,
"step": 2515
},
{
"epoch": 0.9372384937238494,
"grad_norm": 2.125,
"learning_rate": 6.945018602728401e-07,
"loss": 0.1361,
"step": 2520
},
{
"epoch": 0.9390980939098094,
"grad_norm": 2.140625,
"learning_rate": 6.738321620504341e-07,
"loss": 0.1367,
"step": 2525
},
{
"epoch": 0.9409576940957695,
"grad_norm": 2.015625,
"learning_rate": 6.531624638280282e-07,
"loss": 0.1408,
"step": 2530
},
{
"epoch": 0.9428172942817294,
"grad_norm": 2.265625,
"learning_rate": 6.324927656056222e-07,
"loss": 0.141,
"step": 2535
},
{
"epoch": 0.9446768944676894,
"grad_norm": 2.359375,
"learning_rate": 6.118230673832163e-07,
"loss": 0.1346,
"step": 2540
},
{
"epoch": 0.9465364946536494,
"grad_norm": 2.09375,
"learning_rate": 5.911533691608104e-07,
"loss": 0.14,
"step": 2545
},
{
"epoch": 0.9483960948396095,
"grad_norm": 1.8984375,
"learning_rate": 5.704836709384043e-07,
"loss": 0.1347,
"step": 2550
},
{
"epoch": 0.9502556950255695,
"grad_norm": 1.96875,
"learning_rate": 5.498139727159984e-07,
"loss": 0.1354,
"step": 2555
},
{
"epoch": 0.9521152952115295,
"grad_norm": 2.203125,
"learning_rate": 5.291442744935924e-07,
"loss": 0.1394,
"step": 2560
},
{
"epoch": 0.9539748953974896,
"grad_norm": 1.9453125,
"learning_rate": 5.084745762711865e-07,
"loss": 0.1392,
"step": 2565
},
{
"epoch": 0.9558344955834496,
"grad_norm": 1.9140625,
"learning_rate": 4.878048780487805e-07,
"loss": 0.1394,
"step": 2570
},
{
"epoch": 0.9576940957694096,
"grad_norm": 2.015625,
"learning_rate": 4.671351798263746e-07,
"loss": 0.1401,
"step": 2575
},
{
"epoch": 0.9595536959553695,
"grad_norm": 1.8984375,
"learning_rate": 4.464654816039686e-07,
"loss": 0.1324,
"step": 2580
},
{
"epoch": 0.9614132961413296,
"grad_norm": 2.0,
"learning_rate": 4.2579578338156263e-07,
"loss": 0.1408,
"step": 2585
},
{
"epoch": 0.9632728963272896,
"grad_norm": 2.078125,
"learning_rate": 4.051260851591567e-07,
"loss": 0.1336,
"step": 2590
},
{
"epoch": 0.9651324965132496,
"grad_norm": 2.234375,
"learning_rate": 3.8445638693675074e-07,
"loss": 0.14,
"step": 2595
},
{
"epoch": 0.9669920966992097,
"grad_norm": 2.296875,
"learning_rate": 3.6378668871434477e-07,
"loss": 0.1442,
"step": 2600
},
{
"epoch": 0.9688516968851697,
"grad_norm": 2.140625,
"learning_rate": 3.431169904919388e-07,
"loss": 0.1428,
"step": 2605
},
{
"epoch": 0.9707112970711297,
"grad_norm": 2.0625,
"learning_rate": 3.2244729226953293e-07,
"loss": 0.1405,
"step": 2610
},
{
"epoch": 0.9725708972570897,
"grad_norm": 2.0625,
"learning_rate": 3.0177759404712695e-07,
"loss": 0.1325,
"step": 2615
},
{
"epoch": 0.9744304974430498,
"grad_norm": 1.921875,
"learning_rate": 2.81107895824721e-07,
"loss": 0.139,
"step": 2620
},
{
"epoch": 0.9762900976290098,
"grad_norm": 2.171875,
"learning_rate": 2.6043819760231506e-07,
"loss": 0.139,
"step": 2625
},
{
"epoch": 0.9781496978149697,
"grad_norm": 2.03125,
"learning_rate": 2.397684993799091e-07,
"loss": 0.1393,
"step": 2630
},
{
"epoch": 0.9800092980009298,
"grad_norm": 2.421875,
"learning_rate": 2.1909880115750314e-07,
"loss": 0.1484,
"step": 2635
},
{
"epoch": 0.9818688981868898,
"grad_norm": 2.09375,
"learning_rate": 1.9842910293509717e-07,
"loss": 0.1426,
"step": 2640
},
{
"epoch": 0.9837284983728498,
"grad_norm": 2.46875,
"learning_rate": 1.777594047126912e-07,
"loss": 0.1441,
"step": 2645
},
{
"epoch": 0.9855880985588099,
"grad_norm": 2.140625,
"learning_rate": 1.5708970649028525e-07,
"loss": 0.1423,
"step": 2650
},
{
"epoch": 0.9874476987447699,
"grad_norm": 2.03125,
"learning_rate": 1.364200082678793e-07,
"loss": 0.139,
"step": 2655
},
{
"epoch": 0.9893072989307299,
"grad_norm": 2.0,
"learning_rate": 1.1575031004547335e-07,
"loss": 0.1355,
"step": 2660
},
{
"epoch": 0.9911668991166899,
"grad_norm": 2.0625,
"learning_rate": 9.50806118230674e-08,
"loss": 0.1443,
"step": 2665
},
{
"epoch": 0.99302649930265,
"grad_norm": 2.015625,
"learning_rate": 7.441091360066144e-08,
"loss": 0.1404,
"step": 2670
},
{
"epoch": 0.99488609948861,
"grad_norm": 2.28125,
"learning_rate": 5.3741215378255483e-08,
"loss": 0.1501,
"step": 2675
},
{
"epoch": 0.9967456996745699,
"grad_norm": 2.078125,
"learning_rate": 3.3071517155849524e-08,
"loss": 0.1454,
"step": 2680
},
{
"epoch": 0.99860529986053,
"grad_norm": 1.890625,
"learning_rate": 1.2401818933443573e-08,
"loss": 0.1473,
"step": 2685
},
{
"epoch": 0.999721059972106,
"step": 2688,
"total_flos": 2.4410094732092375e+18,
"train_loss": 0.15163021730924292,
"train_runtime": 3273.2323,
"train_samples_per_second": 52.569,
"train_steps_per_second": 0.821
}
],
"logging_steps": 5,
"max_steps": 2688,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 269,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4410094732092375e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}