|
{ |
|
"best_metric": 0.16189107298851013, |
|
"best_model_checkpoint": "checkpoints/rft-finetune-llama-3.1-8b-math50k/math50k/finetune-llama-3.1-8b-math50k-step-1/checkpoint-2421", |
|
"epoch": 0.999721059972106, |
|
"eval_steps": 269, |
|
"global_step": 2688, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0018596001859600185, |
|
"grad_norm": 23.25, |
|
"learning_rate": 1.8587360594795542e-07, |
|
"loss": 0.5644, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.003719200371920037, |
|
"grad_norm": 24.625, |
|
"learning_rate": 3.7174721189591085e-07, |
|
"loss": 0.5677, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.005578800557880056, |
|
"grad_norm": 20.625, |
|
"learning_rate": 5.576208178438662e-07, |
|
"loss": 0.5478, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.007438400743840074, |
|
"grad_norm": 18.125, |
|
"learning_rate": 7.434944237918217e-07, |
|
"loss": 0.5364, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.009298000929800094, |
|
"grad_norm": 17.875, |
|
"learning_rate": 9.29368029739777e-07, |
|
"loss": 0.4842, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.011157601115760111, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 1.1152416356877324e-06, |
|
"loss": 0.4142, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01301720130172013, |
|
"grad_norm": 14.75, |
|
"learning_rate": 1.3011152416356879e-06, |
|
"loss": 0.3353, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.014876801487680148, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 1.4869888475836434e-06, |
|
"loss": 0.2517, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.016736401673640166, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.6728624535315987e-06, |
|
"loss": 0.1879, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.018596001859600187, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.858736059479554e-06, |
|
"loss": 0.1624, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.020455602045560205, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 2.0446096654275095e-06, |
|
"loss": 0.1526, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.022315202231520222, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 2.2304832713754648e-06, |
|
"loss": 0.1651, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.02417480241748024, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 2.41635687732342e-06, |
|
"loss": 0.1557, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.02603440260344026, |
|
"grad_norm": 2.890625, |
|
"learning_rate": 2.6022304832713758e-06, |
|
"loss": 0.1586, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02789400278940028, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 2.788104089219331e-06, |
|
"loss": 0.1491, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.029753602975360297, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 2.973977695167287e-06, |
|
"loss": 0.1569, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.03161320316132032, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 3.159851301115242e-06, |
|
"loss": 0.1491, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.03347280334728033, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 3.3457249070631974e-06, |
|
"loss": 0.1459, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.03533240353324035, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 3.531598513011153e-06, |
|
"loss": 0.1548, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.037192003719200374, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 3.717472118959108e-06, |
|
"loss": 0.1529, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.03905160390516039, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 3.903345724907064e-06, |
|
"loss": 0.1543, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.04091120409112041, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 4.089219330855019e-06, |
|
"loss": 0.1521, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.04277080427708043, |
|
"grad_norm": 2.5, |
|
"learning_rate": 4.275092936802974e-06, |
|
"loss": 0.153, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.044630404463040445, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 4.4609665427509296e-06, |
|
"loss": 0.1423, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.046490004649000466, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 4.646840148698885e-06, |
|
"loss": 0.1479, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.04834960483496048, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 4.83271375464684e-06, |
|
"loss": 0.1501, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0502092050209205, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 5.0185873605947954e-06, |
|
"loss": 0.1611, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.05206880520688052, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 5.2044609665427516e-06, |
|
"loss": 0.1506, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.05392840539284054, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 5.390334572490706e-06, |
|
"loss": 0.1532, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.05578800557880056, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 5.576208178438662e-06, |
|
"loss": 0.1523, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.05764760576476058, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 5.7620817843866174e-06, |
|
"loss": 0.1584, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.05950720595072059, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 5.947955390334574e-06, |
|
"loss": 0.1504, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.061366806136680614, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 6.133828996282528e-06, |
|
"loss": 0.1609, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.06322640632264064, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 6.319702602230484e-06, |
|
"loss": 0.1546, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.06508600650860065, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 6.5055762081784395e-06, |
|
"loss": 0.1492, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.06694560669456066, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 6.691449814126395e-06, |
|
"loss": 0.1427, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.06880520688052069, |
|
"grad_norm": 2.625, |
|
"learning_rate": 6.87732342007435e-06, |
|
"loss": 0.1569, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.0706648070664807, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 7.063197026022306e-06, |
|
"loss": 0.1523, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.07252440725244072, |
|
"grad_norm": 3.046875, |
|
"learning_rate": 7.249070631970261e-06, |
|
"loss": 0.1609, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.07438400743840075, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 7.434944237918216e-06, |
|
"loss": 0.1586, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07624360762436076, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 7.620817843866172e-06, |
|
"loss": 0.1536, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.07810320781032078, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 7.806691449814127e-06, |
|
"loss": 0.1632, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.0799628079962808, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 7.992565055762083e-06, |
|
"loss": 0.1558, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.08182240818224082, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 8.178438661710038e-06, |
|
"loss": 0.1534, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.08368200836820083, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 8.364312267657993e-06, |
|
"loss": 0.1576, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.08554160855416086, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 8.550185873605949e-06, |
|
"loss": 0.1643, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.08740120874012088, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 8.736059479553904e-06, |
|
"loss": 0.1599, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.08926080892608089, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 8.921933085501859e-06, |
|
"loss": 0.1573, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.09112040911204092, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 9.107806691449816e-06, |
|
"loss": 0.1614, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.09298000929800093, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 9.29368029739777e-06, |
|
"loss": 0.152, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.09483960948396095, |
|
"grad_norm": 2.625, |
|
"learning_rate": 9.479553903345727e-06, |
|
"loss": 0.1531, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.09669920966992096, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 9.66542750929368e-06, |
|
"loss": 0.1592, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.09855880985588099, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 9.851301115241636e-06, |
|
"loss": 0.158, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.100046490004649, |
|
"eval_loss": 0.182636097073555, |
|
"eval_runtime": 33.5333, |
|
"eval_samples_per_second": 306.71, |
|
"eval_steps_per_second": 9.602, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.100418410041841, |
|
"grad_norm": 2.71875, |
|
"learning_rate": 9.99586606035552e-06, |
|
"loss": 0.1626, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.10227801022780102, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 9.975196362133114e-06, |
|
"loss": 0.1612, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.10413761041376104, |
|
"grad_norm": 2.875, |
|
"learning_rate": 9.954526663910708e-06, |
|
"loss": 0.1575, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.10599721059972106, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 9.933856965688302e-06, |
|
"loss": 0.1661, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.10785681078568107, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 9.913187267465896e-06, |
|
"loss": 0.1663, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.1097164109716411, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 9.89251756924349e-06, |
|
"loss": 0.1742, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.11157601115760112, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 9.871847871021084e-06, |
|
"loss": 0.1564, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.11343561134356113, |
|
"grad_norm": 4.125, |
|
"learning_rate": 9.851178172798678e-06, |
|
"loss": 0.1633, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.11529521152952116, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 9.830508474576272e-06, |
|
"loss": 0.1665, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.11715481171548117, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 9.809838776353866e-06, |
|
"loss": 0.1739, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.11901441190144119, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 9.78916907813146e-06, |
|
"loss": 0.1621, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.12087401208740121, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 9.768499379909055e-06, |
|
"loss": 0.167, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.12273361227336123, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 9.747829681686649e-06, |
|
"loss": 0.167, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.12459321245932124, |
|
"grad_norm": 2.75, |
|
"learning_rate": 9.727159983464243e-06, |
|
"loss": 0.1601, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.12645281264528127, |
|
"grad_norm": 2.703125, |
|
"learning_rate": 9.706490285241837e-06, |
|
"loss": 0.1705, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.12831241283124128, |
|
"grad_norm": 2.59375, |
|
"learning_rate": 9.685820587019429e-06, |
|
"loss": 0.1575, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.1301720130172013, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 9.665150888797025e-06, |
|
"loss": 0.161, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1320316132031613, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 9.644481190574619e-06, |
|
"loss": 0.1734, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.13389121338912133, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 9.623811492352211e-06, |
|
"loss": 0.1635, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.13575081357508137, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 9.603141794129807e-06, |
|
"loss": 0.1536, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.13761041376104138, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 9.5824720959074e-06, |
|
"loss": 0.1595, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.1394700139470014, |
|
"grad_norm": 2.96875, |
|
"learning_rate": 9.561802397684995e-06, |
|
"loss": 0.1638, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.1413296141329614, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 9.54113269946259e-06, |
|
"loss": 0.1649, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.14318921431892143, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 9.520463001240182e-06, |
|
"loss": 0.1559, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.14504881450488144, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 9.499793303017778e-06, |
|
"loss": 0.1628, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.14690841469084148, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 9.47912360479537e-06, |
|
"loss": 0.1653, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.1487680148768015, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 9.458453906572966e-06, |
|
"loss": 0.1589, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.1506276150627615, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 9.437784208350558e-06, |
|
"loss": 0.1548, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.15248721524872152, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 9.417114510128152e-06, |
|
"loss": 0.1567, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.15434681543468154, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 9.396444811905748e-06, |
|
"loss": 0.1611, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.15620641562064155, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 9.37577511368334e-06, |
|
"loss": 0.1647, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.15806601580660157, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 9.355105415460936e-06, |
|
"loss": 0.1594, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.1599256159925616, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 9.334435717238529e-06, |
|
"loss": 0.1607, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.16178521617852162, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 9.313766019016123e-06, |
|
"loss": 0.1591, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.16364481636448164, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 9.293096320793717e-06, |
|
"loss": 0.1596, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.16550441655044165, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 9.27242662257131e-06, |
|
"loss": 0.1562, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.16736401673640167, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 9.251756924348905e-06, |
|
"loss": 0.1604, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.16922361692236168, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 9.231087226126499e-06, |
|
"loss": 0.1637, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.17108321710832172, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 9.210417527904093e-06, |
|
"loss": 0.1543, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.17294281729428174, |
|
"grad_norm": 2.375, |
|
"learning_rate": 9.189747829681687e-06, |
|
"loss": 0.1573, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.17480241748024175, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 9.169078131459281e-06, |
|
"loss": 0.1511, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.17666201766620176, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 9.148408433236875e-06, |
|
"loss": 0.152, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.17852161785216178, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 9.12773873501447e-06, |
|
"loss": 0.1561, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.1803812180381218, |
|
"grad_norm": 2.375, |
|
"learning_rate": 9.107069036792063e-06, |
|
"loss": 0.1557, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.18224081822408184, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 9.086399338569657e-06, |
|
"loss": 0.159, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.18410041841004185, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 9.065729640347252e-06, |
|
"loss": 0.1493, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.18596001859600186, |
|
"grad_norm": 2.765625, |
|
"learning_rate": 9.045059942124846e-06, |
|
"loss": 0.1604, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.18781961878196188, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 9.02439024390244e-06, |
|
"loss": 0.159, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.1896792189679219, |
|
"grad_norm": 2.625, |
|
"learning_rate": 9.003720545680034e-06, |
|
"loss": 0.1663, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.1915388191538819, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 8.983050847457628e-06, |
|
"loss": 0.1559, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.19339841933984192, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 8.962381149235222e-06, |
|
"loss": 0.1528, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.19525801952580196, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 8.941711451012816e-06, |
|
"loss": 0.1544, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.19711761971176198, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 8.92104175279041e-06, |
|
"loss": 0.1559, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.198977219897722, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 8.900372054568004e-06, |
|
"loss": 0.1648, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.200092980009298, |
|
"eval_loss": 0.17836953699588776, |
|
"eval_runtime": 33.5274, |
|
"eval_samples_per_second": 306.764, |
|
"eval_steps_per_second": 9.604, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 0.200836820083682, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 8.879702356345598e-06, |
|
"loss": 0.168, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.20269642026964202, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 8.859032658123192e-06, |
|
"loss": 0.1605, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.20455602045560203, |
|
"grad_norm": 2.796875, |
|
"learning_rate": 8.838362959900786e-06, |
|
"loss": 0.1626, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.20641562064156208, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 8.81769326167838e-06, |
|
"loss": 0.1535, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.2082752208275221, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 8.797023563455975e-06, |
|
"loss": 0.1566, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.2101348210134821, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 8.776353865233567e-06, |
|
"loss": 0.1587, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.21199442119944212, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 8.755684167011163e-06, |
|
"loss": 0.1609, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.21385402138540213, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 8.735014468788757e-06, |
|
"loss": 0.1522, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.21571362157136215, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 8.714344770566351e-06, |
|
"loss": 0.1631, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.2175732217573222, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 8.693675072343945e-06, |
|
"loss": 0.1662, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.2194328219432822, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 8.673005374121537e-06, |
|
"loss": 0.154, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.22129242212924222, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 8.652335675899133e-06, |
|
"loss": 0.1576, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.22315202231520223, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 8.631665977676727e-06, |
|
"loss": 0.1599, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.22501162250116225, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 8.610996279454321e-06, |
|
"loss": 0.1563, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.22687122268712226, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 8.590326581231915e-06, |
|
"loss": 0.1585, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.22873082287308227, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 8.569656883009508e-06, |
|
"loss": 0.1566, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.23059042305904232, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 8.548987184787104e-06, |
|
"loss": 0.16, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.23245002324500233, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 8.528317486564696e-06, |
|
"loss": 0.1503, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.23430962343096234, |
|
"grad_norm": 2.375, |
|
"learning_rate": 8.507647788342292e-06, |
|
"loss": 0.158, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.23616922361692236, |
|
"grad_norm": 2.609375, |
|
"learning_rate": 8.486978090119886e-06, |
|
"loss": 0.1615, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.23802882380288237, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 8.466308391897478e-06, |
|
"loss": 0.1569, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.2398884239888424, |
|
"grad_norm": 2.953125, |
|
"learning_rate": 8.445638693675074e-06, |
|
"loss": 0.1536, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.24174802417480243, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 8.424968995452666e-06, |
|
"loss": 0.1525, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.24360762436076244, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 8.40429929723026e-06, |
|
"loss": 0.1461, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.24546722454672246, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 8.383629599007855e-06, |
|
"loss": 0.1585, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.24732682473268247, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 8.362959900785449e-06, |
|
"loss": 0.1494, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.24918642491864249, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 8.342290202563044e-06, |
|
"loss": 0.1589, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.2510460251046025, |
|
"grad_norm": 2.5, |
|
"learning_rate": 8.321620504340637e-06, |
|
"loss": 0.1554, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.25290562529056254, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 8.30095080611823e-06, |
|
"loss": 0.1539, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.2547652254765225, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 8.280281107895825e-06, |
|
"loss": 0.1518, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.25662482566248257, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 8.259611409673419e-06, |
|
"loss": 0.1545, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.2584844258484426, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 8.238941711451015e-06, |
|
"loss": 0.1535, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.2603440260344026, |
|
"grad_norm": 2.484375, |
|
"learning_rate": 8.218272013228607e-06, |
|
"loss": 0.1541, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.26220362622036264, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 8.197602315006201e-06, |
|
"loss": 0.1518, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.2640632264063226, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 8.176932616783795e-06, |
|
"loss": 0.1532, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.26592282659228267, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 8.15626291856139e-06, |
|
"loss": 0.1554, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.26778242677824265, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 8.135593220338983e-06, |
|
"loss": 0.1485, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.2696420269642027, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 8.114923522116578e-06, |
|
"loss": 0.1564, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.27150162715016274, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 8.094253823894172e-06, |
|
"loss": 0.1523, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.2733612273361227, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 8.073584125671766e-06, |
|
"loss": 0.1565, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.27522082752208277, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 8.05291442744936e-06, |
|
"loss": 0.1482, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.27708042770804275, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 8.032244729226954e-06, |
|
"loss": 0.155, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.2789400278940028, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 8.011575031004548e-06, |
|
"loss": 0.1449, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.2807996280799628, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 7.990905332782142e-06, |
|
"loss": 0.151, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.2826592282659228, |
|
"grad_norm": 2.5, |
|
"learning_rate": 7.970235634559736e-06, |
|
"loss": 0.1576, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.28451882845188287, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 7.94956593633733e-06, |
|
"loss": 0.1625, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.28637842863784285, |
|
"grad_norm": 2.5, |
|
"learning_rate": 7.928896238114924e-06, |
|
"loss": 0.1558, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.2882380288238029, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 7.908226539892518e-06, |
|
"loss": 0.1512, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.2900976290097629, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 7.887556841670112e-06, |
|
"loss": 0.1568, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.2919572291957229, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 7.866887143447707e-06, |
|
"loss": 0.1561, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.29381682938168296, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 7.8462174452253e-06, |
|
"loss": 0.1592, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.29567642956764295, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 7.825547747002895e-06, |
|
"loss": 0.1481, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.297536029753603, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 7.804878048780489e-06, |
|
"loss": 0.1478, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.299395629939563, |
|
"grad_norm": 2.125, |
|
"learning_rate": 7.784208350558083e-06, |
|
"loss": 0.1562, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.300139470013947, |
|
"eval_loss": 0.17311781644821167, |
|
"eval_runtime": 33.5454, |
|
"eval_samples_per_second": 306.599, |
|
"eval_steps_per_second": 9.599, |
|
"step": 807 |
|
}, |
|
{ |
|
"epoch": 0.301255230125523, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 7.763538652335677e-06, |
|
"loss": 0.1632, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.303114830311483, |
|
"grad_norm": 2.25, |
|
"learning_rate": 7.742868954113271e-06, |
|
"loss": 0.1533, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.30497443049744305, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 7.722199255890865e-06, |
|
"loss": 0.1567, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.3068340306834031, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 7.70152955766846e-06, |
|
"loss": 0.1494, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.3086936308693631, |
|
"grad_norm": 2.375, |
|
"learning_rate": 7.680859859446053e-06, |
|
"loss": 0.1539, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.3105532310553231, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 7.660190161223646e-06, |
|
"loss": 0.152, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.3124128312412831, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 7.639520463001241e-06, |
|
"loss": 0.1491, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.31427243142724315, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 7.618850764778835e-06, |
|
"loss": 0.1475, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.31613203161320313, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 7.59818106655643e-06, |
|
"loss": 0.1527, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.3179916317991632, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 7.577511368334023e-06, |
|
"loss": 0.1527, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.3198512319851232, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 7.556841670111617e-06, |
|
"loss": 0.1491, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.3217108321710832, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 7.536171971889211e-06, |
|
"loss": 0.1509, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.32357043235704325, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 7.515502273666805e-06, |
|
"loss": 0.1501, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.32543003254300323, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 7.4948325754444e-06, |
|
"loss": 0.1509, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.3272896327289633, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 7.474162877221993e-06, |
|
"loss": 0.1449, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.3291492329149233, |
|
"grad_norm": 2.125, |
|
"learning_rate": 7.4534931789995864e-06, |
|
"loss": 0.148, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.3310088331008833, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 7.432823480777181e-06, |
|
"loss": 0.1459, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.33286843328684335, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 7.4121537825547755e-06, |
|
"loss": 0.1604, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.33472803347280333, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 7.3914840843323695e-06, |
|
"loss": 0.1491, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.3365876336587634, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 7.370814386109964e-06, |
|
"loss": 0.1453, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.33844723384472336, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 7.350144687887557e-06, |
|
"loss": 0.1485, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.3403068340306834, |
|
"grad_norm": 2.921875, |
|
"learning_rate": 7.329474989665152e-06, |
|
"loss": 0.1503, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.34216643421664344, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 7.308805291442745e-06, |
|
"loss": 0.1553, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.34402603440260343, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 7.288135593220339e-06, |
|
"loss": 0.1485, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.3458856345885635, |
|
"grad_norm": 2.125, |
|
"learning_rate": 7.267465894997934e-06, |
|
"loss": 0.1616, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.34774523477452346, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 7.246796196775527e-06, |
|
"loss": 0.1444, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.3496048349604835, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 7.226126498553122e-06, |
|
"loss": 0.1489, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.3514644351464435, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 7.205456800330715e-06, |
|
"loss": 0.1459, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.35332403533240353, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 7.1847871021083095e-06, |
|
"loss": 0.1472, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.35518363551836357, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 7.164117403885904e-06, |
|
"loss": 0.1488, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.35704323570432356, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 7.143447705663498e-06, |
|
"loss": 0.1418, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.3589028358902836, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 7.122778007441093e-06, |
|
"loss": 0.1523, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.3607624360762436, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 7.102108309218686e-06, |
|
"loss": 0.1517, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.36262203626220363, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 7.08143861099628e-06, |
|
"loss": 0.1486, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.36448163644816367, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 7.060768912773874e-06, |
|
"loss": 0.1517, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.36634123663412366, |
|
"grad_norm": 2.25, |
|
"learning_rate": 7.040099214551468e-06, |
|
"loss": 0.1514, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.3682008368200837, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 7.019429516329063e-06, |
|
"loss": 0.1442, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.3700604370060437, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 6.998759818106656e-06, |
|
"loss": 0.1558, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.3719200371920037, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 6.97809011988425e-06, |
|
"loss": 0.1462, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3737796373779637, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 6.957420421661844e-06, |
|
"loss": 0.1455, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.37563923756392376, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 6.9367507234394385e-06, |
|
"loss": 0.1477, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.3774988377498838, |
|
"grad_norm": 2.5, |
|
"learning_rate": 6.9160810252170325e-06, |
|
"loss": 0.1542, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.3793584379358438, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 6.895411326994627e-06, |
|
"loss": 0.1518, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.3812180381218038, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 6.87474162877222e-06, |
|
"loss": 0.1554, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.3830776383077638, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 6.854071930549815e-06, |
|
"loss": 0.1465, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.38493723849372385, |
|
"grad_norm": 2.5, |
|
"learning_rate": 6.833402232327409e-06, |
|
"loss": 0.1462, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.38679683867968384, |
|
"grad_norm": 2.90625, |
|
"learning_rate": 6.812732534105002e-06, |
|
"loss": 0.1509, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.3886564388656439, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 6.792062835882597e-06, |
|
"loss": 0.1497, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.3905160390516039, |
|
"grad_norm": 2.375, |
|
"learning_rate": 6.77139313766019e-06, |
|
"loss": 0.1496, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.3923756392375639, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 6.750723439437785e-06, |
|
"loss": 0.1439, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.39423523942352395, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 6.730053741215378e-06, |
|
"loss": 0.1433, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.39609483960948394, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 6.7093840429929725e-06, |
|
"loss": 0.1542, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.397954439795444, |
|
"grad_norm": 2.125, |
|
"learning_rate": 6.6887143447705674e-06, |
|
"loss": 0.1373, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.399814039981404, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 6.668044646548161e-06, |
|
"loss": 0.1475, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.400185960018596, |
|
"eval_loss": 0.16861507296562195, |
|
"eval_runtime": 33.5378, |
|
"eval_samples_per_second": 306.669, |
|
"eval_steps_per_second": 9.601, |
|
"step": 1076 |
|
}, |
|
{ |
|
"epoch": 0.401673640167364, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 6.647374948325756e-06, |
|
"loss": 0.1518, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.40353324035332405, |
|
"grad_norm": 2.125, |
|
"learning_rate": 6.626705250103349e-06, |
|
"loss": 0.1466, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.40539284053928404, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 6.606035551880943e-06, |
|
"loss": 0.1472, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.4072524407252441, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 6.585365853658538e-06, |
|
"loss": 0.1469, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.40911204091120407, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 6.564696155436131e-06, |
|
"loss": 0.1413, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.4109716410971641, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 6.544026457213726e-06, |
|
"loss": 0.151, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.41283124128312415, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 6.523356758991319e-06, |
|
"loss": 0.1492, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.41469084146908414, |
|
"grad_norm": 2.640625, |
|
"learning_rate": 6.502687060768913e-06, |
|
"loss": 0.1415, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.4165504416550442, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 6.482017362546507e-06, |
|
"loss": 0.1425, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.41841004184100417, |
|
"grad_norm": 2.25, |
|
"learning_rate": 6.4613476643241015e-06, |
|
"loss": 0.1482, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.4202696420269642, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 6.440677966101695e-06, |
|
"loss": 0.1435, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.4221292422129242, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 6.42000826787929e-06, |
|
"loss": 0.1448, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.42398884239888424, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 6.399338569656883e-06, |
|
"loss": 0.1452, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.4258484425848443, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 6.378668871434478e-06, |
|
"loss": 0.1498, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.42770804277080426, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 6.357999173212072e-06, |
|
"loss": 0.146, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.4295676429567643, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 6.337329474989665e-06, |
|
"loss": 0.1426, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.4314272431427243, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 6.31665977676726e-06, |
|
"loss": 0.1387, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.43328684332868433, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 6.295990078544853e-06, |
|
"loss": 0.1495, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.4351464435146444, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 6.275320380322448e-06, |
|
"loss": 0.1367, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.43700604370060436, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 6.254650682100042e-06, |
|
"loss": 0.1469, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.4388656438865644, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 6.2339809838776355e-06, |
|
"loss": 0.1466, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.4407252440725244, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 6.2133112856552304e-06, |
|
"loss": 0.1382, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.44258484425848443, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 6.192641587432824e-06, |
|
"loss": 0.146, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 6.171971889210419e-06, |
|
"loss": 0.1453, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.44630404463040446, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 6.151302190988012e-06, |
|
"loss": 0.1559, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.4481636448163645, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 6.130632492765606e-06, |
|
"loss": 0.1461, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.4500232450023245, |
|
"grad_norm": 2.6875, |
|
"learning_rate": 6.109962794543201e-06, |
|
"loss": 0.1512, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.45188284518828453, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 6.089293096320794e-06, |
|
"loss": 0.1455, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.4537424453742445, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 6.068623398098388e-06, |
|
"loss": 0.1458, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.45560204556020456, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 6.047953699875982e-06, |
|
"loss": 0.1452, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.45746164574616455, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 6.027284001653576e-06, |
|
"loss": 0.1427, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.4593212459321246, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 6.00661430343117e-06, |
|
"loss": 0.1433, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.46118084611808463, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 5.9859446052087645e-06, |
|
"loss": 0.143, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.4630404463040446, |
|
"grad_norm": 2.40625, |
|
"learning_rate": 5.965274906986358e-06, |
|
"loss": 0.1357, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.46490004649000466, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 5.944605208763953e-06, |
|
"loss": 0.1469, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.46675964667596465, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 5.923935510541547e-06, |
|
"loss": 0.1466, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.4686192468619247, |
|
"grad_norm": 2.25, |
|
"learning_rate": 5.903265812319141e-06, |
|
"loss": 0.1451, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.47047884704788473, |
|
"grad_norm": 2.125, |
|
"learning_rate": 5.882596114096735e-06, |
|
"loss": 0.1475, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.4723384472338447, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 5.861926415874328e-06, |
|
"loss": 0.1412, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.47419804741980476, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 5.841256717651923e-06, |
|
"loss": 0.1461, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.47605764760576474, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 5.820587019429516e-06, |
|
"loss": 0.1437, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.4779172477917248, |
|
"grad_norm": 2.625, |
|
"learning_rate": 5.799917321207111e-06, |
|
"loss": 0.145, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.4797768479776848, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 5.779247622984705e-06, |
|
"loss": 0.1407, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.4816364481636448, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 5.7585779247622985e-06, |
|
"loss": 0.1455, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.48349604834960486, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 5.7379082265398934e-06, |
|
"loss": 0.1437, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.48535564853556484, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 5.717238528317487e-06, |
|
"loss": 0.1423, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.4872152487215249, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 5.696568830095081e-06, |
|
"loss": 0.1538, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.48907484890748487, |
|
"grad_norm": 2.125, |
|
"learning_rate": 5.675899131872676e-06, |
|
"loss": 0.1477, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.4909344490934449, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 5.655229433650269e-06, |
|
"loss": 0.1467, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.49279404927940496, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 5.634559735427864e-06, |
|
"loss": 0.1357, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.49465364946536494, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 5.613890037205457e-06, |
|
"loss": 0.1483, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.496513249651325, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 5.593220338983051e-06, |
|
"loss": 0.1485, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.49837284983728497, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 5.572550640760645e-06, |
|
"loss": 0.1472, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.500232450023245, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 5.551880942538239e-06, |
|
"loss": 0.1515, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.500232450023245, |
|
"eval_loss": 0.16567149758338928, |
|
"eval_runtime": 33.5148, |
|
"eval_samples_per_second": 306.88, |
|
"eval_steps_per_second": 9.608, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.502092050209205, |
|
"grad_norm": 2.25, |
|
"learning_rate": 5.531211244315834e-06, |
|
"loss": 0.1434, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.503951650395165, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 5.5105415460934275e-06, |
|
"loss": 0.1489, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.5058112505811251, |
|
"grad_norm": 2.125, |
|
"learning_rate": 5.489871847871021e-06, |
|
"loss": 0.1423, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.5076708507670851, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 5.469202149648616e-06, |
|
"loss": 0.1532, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.509530450953045, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 5.44853245142621e-06, |
|
"loss": 0.1474, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.5113900511390052, |
|
"grad_norm": 2.125, |
|
"learning_rate": 5.427862753203804e-06, |
|
"loss": 0.141, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.5132496513249651, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 5.407193054981398e-06, |
|
"loss": 0.1432, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.5151092515109251, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 5.386523356758991e-06, |
|
"loss": 0.1511, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.5169688516968852, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 5.365853658536586e-06, |
|
"loss": 0.1479, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.5188284518828452, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 5.34518396031418e-06, |
|
"loss": 0.1377, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.5206880520688052, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 5.324514262091773e-06, |
|
"loss": 0.1413, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.5225476522547652, |
|
"grad_norm": 2.25, |
|
"learning_rate": 5.303844563869368e-06, |
|
"loss": 0.1357, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.5244072524407253, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 5.2831748656469615e-06, |
|
"loss": 0.1454, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.5262668526266853, |
|
"grad_norm": 2.125, |
|
"learning_rate": 5.2625051674245564e-06, |
|
"loss": 0.1455, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.5281264528126453, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 5.24183546920215e-06, |
|
"loss": 0.1477, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.5299860529986054, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 5.221165770979744e-06, |
|
"loss": 0.1383, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.5318456531845653, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 5.200496072757339e-06, |
|
"loss": 0.1435, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.5337052533705253, |
|
"grad_norm": 2.328125, |
|
"learning_rate": 5.179826374534932e-06, |
|
"loss": 0.1492, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.5355648535564853, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 5.159156676312527e-06, |
|
"loss": 0.1387, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.5374244537424454, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 5.13848697809012e-06, |
|
"loss": 0.1475, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.5392840539284054, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 5.117817279867714e-06, |
|
"loss": 0.1459, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.5411436541143654, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 5.097147581645308e-06, |
|
"loss": 0.1536, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.5430032543003255, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 5.076477883422902e-06, |
|
"loss": 0.1467, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.5448628544862855, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 5.055808185200497e-06, |
|
"loss": 0.1452, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.5467224546722455, |
|
"grad_norm": 2.25, |
|
"learning_rate": 5.0351384869780905e-06, |
|
"loss": 0.1467, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.5485820548582054, |
|
"grad_norm": 2.25, |
|
"learning_rate": 5.0144687887556846e-06, |
|
"loss": 0.1422, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.5504416550441655, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.993799090533279e-06, |
|
"loss": 0.1348, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.5523012552301255, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 4.973129392310873e-06, |
|
"loss": 0.1468, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.5541608554160855, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.952459694088467e-06, |
|
"loss": 0.1431, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.5560204556020456, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 4.931789995866061e-06, |
|
"loss": 0.1513, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.5578800557880056, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.911120297643655e-06, |
|
"loss": 0.1388, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.5597396559739656, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 4.890450599421249e-06, |
|
"loss": 0.1392, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.5615992561599256, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 4.869780901198843e-06, |
|
"loss": 0.1417, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.5634588563458857, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 4.849111202976437e-06, |
|
"loss": 0.1361, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.5653184565318456, |
|
"grad_norm": 2.25, |
|
"learning_rate": 4.828441504754031e-06, |
|
"loss": 0.1434, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.5671780567178056, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 4.8077718065316245e-06, |
|
"loss": 0.1496, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.5690376569037657, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.787102108309219e-06, |
|
"loss": 0.1463, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.5708972570897257, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 4.7664324100868135e-06, |
|
"loss": 0.1323, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.5727568572756857, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 4.745762711864408e-06, |
|
"loss": 0.1437, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.5746164574616457, |
|
"grad_norm": 2.25, |
|
"learning_rate": 4.725093013642002e-06, |
|
"loss": 0.1543, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.5764760576476058, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 4.704423315419595e-06, |
|
"loss": 0.1409, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.5783356578335658, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 4.683753617197189e-06, |
|
"loss": 0.1439, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.5801952580195258, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 4.663083918974783e-06, |
|
"loss": 0.1395, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.5820548582054859, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 4.642414220752377e-06, |
|
"loss": 0.1416, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.5839144583914458, |
|
"grad_norm": 2.734375, |
|
"learning_rate": 4.621744522529971e-06, |
|
"loss": 0.1385, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.5857740585774058, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 4.601074824307565e-06, |
|
"loss": 0.1382, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.5876336587633659, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.580405126085159e-06, |
|
"loss": 0.1378, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.5894932589493259, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 4.5597354278627535e-06, |
|
"loss": 0.1369, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.5913528591352859, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 4.5390657296403476e-06, |
|
"loss": 0.145, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.5932124593212459, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 4.518396031417942e-06, |
|
"loss": 0.1376, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.595072059507206, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 4.497726333195536e-06, |
|
"loss": 0.1398, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.596931659693166, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.47705663497313e-06, |
|
"loss": 0.144, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.598791259879126, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 4.456386936750724e-06, |
|
"loss": 0.1344, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.600278940027894, |
|
"eval_loss": 0.16357110440731049, |
|
"eval_runtime": 33.5259, |
|
"eval_samples_per_second": 306.778, |
|
"eval_steps_per_second": 9.605, |
|
"step": 1614 |
|
}, |
|
{ |
|
"epoch": 0.6006508600650861, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 4.435717238528318e-06, |
|
"loss": 0.1411, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.602510460251046, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 4.415047540305912e-06, |
|
"loss": 0.1454, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.604370060437006, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 4.394377842083506e-06, |
|
"loss": 0.1457, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.606229660622966, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 4.3737081438611e-06, |
|
"loss": 0.1404, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.6080892608089261, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 4.353038445638694e-06, |
|
"loss": 0.1401, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.6099488609948861, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 4.3323687474162875e-06, |
|
"loss": 0.1373, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.6118084611808461, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 4.3116990491938824e-06, |
|
"loss": 0.1409, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.6136680613668062, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 4.2910293509714765e-06, |
|
"loss": 0.1416, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.6155276615527662, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 4.270359652749071e-06, |
|
"loss": 0.1458, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.6173872617387262, |
|
"grad_norm": 2.25, |
|
"learning_rate": 4.249689954526664e-06, |
|
"loss": 0.1459, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.6192468619246861, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 4.229020256304258e-06, |
|
"loss": 0.1383, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.6211064621106462, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 4.208350558081852e-06, |
|
"loss": 0.1451, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.6229660622966062, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.187680859859447e-06, |
|
"loss": 0.1385, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.6248256624825662, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 4.167011161637041e-06, |
|
"loss": 0.141, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.6266852626685263, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 4.146341463414634e-06, |
|
"loss": 0.1428, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.6285448628544863, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 4.125671765192228e-06, |
|
"loss": 0.1425, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.6304044630404463, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 4.105002066969822e-06, |
|
"loss": 0.1369, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.6322640632264063, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 4.0843323687474165e-06, |
|
"loss": 0.1351, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.6341236634123664, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 4.0636626705250106e-06, |
|
"loss": 0.1393, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 0.6359832635983264, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 4.042992972302605e-06, |
|
"loss": 0.137, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.6378428637842863, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 4.022323274080199e-06, |
|
"loss": 0.1409, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 0.6397024639702464, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 4.001653575857793e-06, |
|
"loss": 0.1459, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.6415620641562064, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 3.980983877635387e-06, |
|
"loss": 0.1385, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 0.6434216643421664, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 3.960314179412981e-06, |
|
"loss": 0.153, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.6452812645281265, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 3.939644481190575e-06, |
|
"loss": 0.1397, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 0.6471408647140865, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 3.918974782968169e-06, |
|
"loss": 0.1405, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.6490004649000465, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 3.898305084745763e-06, |
|
"loss": 0.1391, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 0.6508600650860065, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 3.8776353865233564e-06, |
|
"loss": 0.1409, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.6527196652719666, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 3.856965688300951e-06, |
|
"loss": 0.1402, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 0.6545792654579266, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 3.8362959900785454e-06, |
|
"loss": 0.1502, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.6564388656438865, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 3.8156262918561395e-06, |
|
"loss": 0.1408, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 0.6582984658298466, |
|
"grad_norm": 2.25, |
|
"learning_rate": 3.7949565936337336e-06, |
|
"loss": 0.1445, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.6601580660158066, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 3.7742868954113273e-06, |
|
"loss": 0.1417, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 0.6620176662017666, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 3.7536171971889213e-06, |
|
"loss": 0.1402, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.6638772663877266, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 3.7329474989665154e-06, |
|
"loss": 0.1428, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.6657368665736867, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 3.7122778007441095e-06, |
|
"loss": 0.1408, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.6675964667596467, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 3.6916081025217036e-06, |
|
"loss": 0.1428, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.6694560669456067, |
|
"grad_norm": 2.515625, |
|
"learning_rate": 3.6709384042992972e-06, |
|
"loss": 0.1463, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.6713156671315668, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 3.6502687060768917e-06, |
|
"loss": 0.1365, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.6731752673175267, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 3.629599007854486e-06, |
|
"loss": 0.1362, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.6750348675034867, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 3.60892930963208e-06, |
|
"loss": 0.1378, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.6768944676894467, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 3.5882596114096736e-06, |
|
"loss": 0.1323, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.6787540678754068, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 3.5675899131872676e-06, |
|
"loss": 0.1406, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.6806136680613668, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 3.5469202149648617e-06, |
|
"loss": 0.1416, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.6824732682473268, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 3.526250516742456e-06, |
|
"loss": 0.1347, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.6843328684332869, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 3.5055808185200503e-06, |
|
"loss": 0.148, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.6861924686192469, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 3.484911120297644e-06, |
|
"loss": 0.1443, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.6880520688052069, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 3.464241422075238e-06, |
|
"loss": 0.138, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.6899116689911668, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 3.443571723852832e-06, |
|
"loss": 0.1466, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.691771269177127, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 3.422902025630426e-06, |
|
"loss": 0.1403, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.6936308693630869, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 3.40223232740802e-06, |
|
"loss": 0.1361, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.6954904695490469, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 3.381562629185614e-06, |
|
"loss": 0.1393, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.697350069735007, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 3.360892930963208e-06, |
|
"loss": 0.1352, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.699209669920967, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 3.3402232327408025e-06, |
|
"loss": 0.1387, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.700325430032543, |
|
"eval_loss": 0.16304655373096466, |
|
"eval_runtime": 33.5474, |
|
"eval_samples_per_second": 306.581, |
|
"eval_steps_per_second": 9.598, |
|
"step": 1883 |
|
}, |
|
{ |
|
"epoch": 0.701069270106927, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 3.3195535345183966e-06, |
|
"loss": 0.1373, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.702928870292887, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 3.2988838362959903e-06, |
|
"loss": 0.1392, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.7047884704788471, |
|
"grad_norm": 1.9375, |
|
"learning_rate": 3.2782141380735843e-06, |
|
"loss": 0.1453, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.7066480706648071, |
|
"grad_norm": 1.8359375, |
|
"learning_rate": 3.2575444398511784e-06, |
|
"loss": 0.141, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.708507670850767, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 3.2368747416287725e-06, |
|
"loss": 0.1346, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.7103672710367271, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 3.216205043406366e-06, |
|
"loss": 0.1375, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.7122268712226871, |
|
"grad_norm": 1.9609375, |
|
"learning_rate": 3.1955353451839607e-06, |
|
"loss": 0.1433, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.7140864714086471, |
|
"grad_norm": 2.375, |
|
"learning_rate": 3.1748656469615547e-06, |
|
"loss": 0.1487, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.7159460715946072, |
|
"grad_norm": 2.25, |
|
"learning_rate": 3.154195948739149e-06, |
|
"loss": 0.1439, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.7178056717805672, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 3.133526250516743e-06, |
|
"loss": 0.1362, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.7196652719665272, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 3.1128565522943366e-06, |
|
"loss": 0.1373, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.7215248721524872, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 3.0921868540719306e-06, |
|
"loss": 0.1476, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.7233844723384473, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 3.0715171558495247e-06, |
|
"loss": 0.1363, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.7252440725244073, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 3.0508474576271192e-06, |
|
"loss": 0.1338, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.7271036727103672, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 3.030177759404713e-06, |
|
"loss": 0.1338, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.7289632728963273, |
|
"grad_norm": 2.25, |
|
"learning_rate": 3.009508061182307e-06, |
|
"loss": 0.1391, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.7308228730822873, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.988838362959901e-06, |
|
"loss": 0.146, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.7326824732682473, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.968168664737495e-06, |
|
"loss": 0.138, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.7345420734542073, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.947498966515089e-06, |
|
"loss": 0.1395, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.7364016736401674, |
|
"grad_norm": 2.125, |
|
"learning_rate": 2.926829268292683e-06, |
|
"loss": 0.1409, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.7382612738261274, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 2.906159570070277e-06, |
|
"loss": 0.1404, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.7401208740120874, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 2.8854898718478715e-06, |
|
"loss": 0.1371, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.7419804741980475, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 2.8648201736254655e-06, |
|
"loss": 0.1385, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.7438400743840075, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 2.844150475403059e-06, |
|
"loss": 0.1393, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.7456996745699674, |
|
"grad_norm": 1.8515625, |
|
"learning_rate": 2.8234807771806533e-06, |
|
"loss": 0.1329, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.7475592747559274, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 2.8028110789582473e-06, |
|
"loss": 0.1418, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.7494188749418875, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 2.7821413807358414e-06, |
|
"loss": 0.1366, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.7512784751278475, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.761471682513436e-06, |
|
"loss": 0.1378, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.7531380753138075, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.7408019842910296e-06, |
|
"loss": 0.1392, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.7549976754997676, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 2.7201322860686237e-06, |
|
"loss": 0.141, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.7568572756857276, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 2.6994625878462178e-06, |
|
"loss": 0.1391, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.7587168758716876, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.678792889623812e-06, |
|
"loss": 0.1408, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.7605764760576476, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.6581231914014055e-06, |
|
"loss": 0.1382, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.7624360762436077, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 2.6374534931789996e-06, |
|
"loss": 0.1424, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.7642956764295676, |
|
"grad_norm": 2.0, |
|
"learning_rate": 2.6167837949565936e-06, |
|
"loss": 0.1297, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.7661552766155276, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 2.596114096734188e-06, |
|
"loss": 0.1398, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.7680148768014877, |
|
"grad_norm": 2.5, |
|
"learning_rate": 2.5754443985117822e-06, |
|
"loss": 0.1418, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.7698744769874477, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 2.554774700289376e-06, |
|
"loss": 0.137, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.7717340771734077, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 2.53410500206697e-06, |
|
"loss": 0.1409, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.7735936773593677, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 2.513435303844564e-06, |
|
"loss": 0.1345, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.7754532775453278, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 2.492765605622158e-06, |
|
"loss": 0.1445, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.7773128777312878, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 2.4720959073997522e-06, |
|
"loss": 0.1414, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.7791724779172478, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 2.4514262091773463e-06, |
|
"loss": 0.1373, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.7810320781032078, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 2.4307565109549404e-06, |
|
"loss": 0.1367, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.7828916782891678, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 2.4100868127325345e-06, |
|
"loss": 0.1332, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.7847512784751278, |
|
"grad_norm": 2.125, |
|
"learning_rate": 2.389417114510128e-06, |
|
"loss": 0.1368, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.7866108786610879, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 2.3687474162877226e-06, |
|
"loss": 0.1439, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.7884704788470479, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 2.3480777180653163e-06, |
|
"loss": 0.1423, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.7903300790330079, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 2.3274080198429104e-06, |
|
"loss": 0.1384, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.7921896792189679, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 2.3067383216205044e-06, |
|
"loss": 0.138, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.794049279404928, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 2.2860686233980985e-06, |
|
"loss": 0.1368, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.795908879590888, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 2.2653989251756926e-06, |
|
"loss": 0.1315, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.797768479776848, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 2.2447292269532867e-06, |
|
"loss": 0.1433, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.799628079962808, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 2.2240595287308808e-06, |
|
"loss": 0.1403, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.800371920037192, |
|
"eval_loss": 0.16228820383548737, |
|
"eval_runtime": 33.5429, |
|
"eval_samples_per_second": 306.622, |
|
"eval_steps_per_second": 9.6, |
|
"step": 2152 |
|
}, |
|
{ |
|
"epoch": 0.801487680148768, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 2.203389830508475e-06, |
|
"loss": 0.138, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.803347280334728, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 2.182720132286069e-06, |
|
"loss": 0.1403, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.805206880520688, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 2.1620504340636626e-06, |
|
"loss": 0.1398, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.8070664807066481, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 2.141380735841257e-06, |
|
"loss": 0.1459, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.8089260808926081, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 2.1207110376188507e-06, |
|
"loss": 0.1374, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.8107856810785681, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 2.100041339396445e-06, |
|
"loss": 0.138, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.8126452812645282, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 2.079371641174039e-06, |
|
"loss": 0.1389, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.8145048814504882, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 2.058701942951633e-06, |
|
"loss": 0.1393, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.8163644816364481, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 2.038032244729227e-06, |
|
"loss": 0.1341, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.8182240818224081, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 2.017362546506821e-06, |
|
"loss": 0.1413, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.8200836820083682, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 1.9966928482844152e-06, |
|
"loss": 0.1401, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.8219432821943282, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.9760231500620093e-06, |
|
"loss": 0.1442, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.8238028823802882, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.9553534518396034e-06, |
|
"loss": 0.144, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.8256624825662483, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.934683753617197e-06, |
|
"loss": 0.1353, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.8275220827522083, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.9140140553947915e-06, |
|
"loss": 0.1394, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.8293816829381683, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 1.8933443571723856e-06, |
|
"loss": 0.1385, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.8312412831241283, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.8726746589499795e-06, |
|
"loss": 0.1451, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.8331008833100884, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.8520049607275736e-06, |
|
"loss": 0.1366, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.8349604834960483, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.8313352625051674e-06, |
|
"loss": 0.1348, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.8368200836820083, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.8106655642827617e-06, |
|
"loss": 0.1365, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.8386796838679684, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.7899958660603556e-06, |
|
"loss": 0.1366, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.8405392840539284, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.7693261678379497e-06, |
|
"loss": 0.1444, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.8423988842398884, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.7486564696155435e-06, |
|
"loss": 0.144, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.8442584844258484, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.7279867713931378e-06, |
|
"loss": 0.1403, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.8461180846118085, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.707317073170732e-06, |
|
"loss": 0.1427, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.8479776847977685, |
|
"grad_norm": 1.9921875, |
|
"learning_rate": 1.6866473749483258e-06, |
|
"loss": 0.1367, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.8498372849837285, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.66597767672592e-06, |
|
"loss": 0.1474, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.8516968851696886, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.645307978503514e-06, |
|
"loss": 0.1476, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.8535564853556485, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.624638280281108e-06, |
|
"loss": 0.1423, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.8554160855416085, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 1.603968582058702e-06, |
|
"loss": 0.137, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.8572756857275686, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.5832988838362962e-06, |
|
"loss": 0.1332, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.8591352859135286, |
|
"grad_norm": 2.5, |
|
"learning_rate": 1.56262918561389e-06, |
|
"loss": 0.141, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.8609948860994886, |
|
"grad_norm": 2.390625, |
|
"learning_rate": 1.5419594873914841e-06, |
|
"loss": 0.1375, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.8628544862854486, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.5212897891690784e-06, |
|
"loss": 0.1413, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.8647140864714087, |
|
"grad_norm": 1.8828125, |
|
"learning_rate": 1.5006200909466723e-06, |
|
"loss": 0.1455, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.8665736866573687, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.4799503927242664e-06, |
|
"loss": 0.1442, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.8684332868433287, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 1.4592806945018602e-06, |
|
"loss": 0.1326, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.8702928870292888, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 1.4386109962794545e-06, |
|
"loss": 0.1418, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.8721524872152487, |
|
"grad_norm": 2.125, |
|
"learning_rate": 1.4179412980570484e-06, |
|
"loss": 0.1398, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.8740120874012087, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.3972715998346425e-06, |
|
"loss": 0.139, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.8758716875871687, |
|
"grad_norm": 2.671875, |
|
"learning_rate": 1.3766019016122364e-06, |
|
"loss": 0.1506, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.8777312877731288, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.3559322033898307e-06, |
|
"loss": 0.1396, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.8795908879590888, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 1.3352625051674247e-06, |
|
"loss": 0.1293, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.8814504881450488, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.3145928069450186e-06, |
|
"loss": 0.1399, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.8833100883310089, |
|
"grad_norm": 2.046875, |
|
"learning_rate": 1.2939231087226129e-06, |
|
"loss": 0.1365, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.8851696885169689, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 1.2732534105002068e-06, |
|
"loss": 0.1397, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.8870292887029289, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 1.2525837122778008e-06, |
|
"loss": 0.1437, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 1.984375, |
|
"learning_rate": 1.231914014055395e-06, |
|
"loss": 0.1418, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.8907484890748489, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.211244315832989e-06, |
|
"loss": 0.1384, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.8926080892608089, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 1.190574617610583e-06, |
|
"loss": 0.1445, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.8944676894467689, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.169904919388177e-06, |
|
"loss": 0.1358, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.896327289632729, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 1.149235221165771e-06, |
|
"loss": 0.1532, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.898186889818689, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 1.1285655229433651e-06, |
|
"loss": 0.1416, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.900046490004649, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 1.1078958247209592e-06, |
|
"loss": 0.1383, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.900418410041841, |
|
"eval_loss": 0.16189107298851013, |
|
"eval_runtime": 33.532, |
|
"eval_samples_per_second": 306.722, |
|
"eval_steps_per_second": 9.603, |
|
"step": 2421 |
|
}, |
|
{ |
|
"epoch": 0.901906090190609, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 1.0872261264985533e-06, |
|
"loss": 0.1319, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.9037656903765691, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 1.0665564282761474e-06, |
|
"loss": 0.1448, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.905625290562529, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 1.0458867300537414e-06, |
|
"loss": 0.1479, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.907484890748489, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 1.0252170318313353e-06, |
|
"loss": 0.1396, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.9093444909344491, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 1.0045473336089294e-06, |
|
"loss": 0.1494, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.9112040911204091, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 9.838776353865235e-07, |
|
"loss": 0.1362, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.9130636913063691, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 9.632079371641175e-07, |
|
"loss": 0.1371, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.9149232914923291, |
|
"grad_norm": 2.1875, |
|
"learning_rate": 9.425382389417115e-07, |
|
"loss": 0.139, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.9167828916782892, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 9.218685407193055e-07, |
|
"loss": 0.1339, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.9186424918642492, |
|
"grad_norm": 2.21875, |
|
"learning_rate": 9.011988424968997e-07, |
|
"loss": 0.1427, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.9205020920502092, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 8.805291442744937e-07, |
|
"loss": 0.1302, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.9223616922361693, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 8.598594460520877e-07, |
|
"loss": 0.1385, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.9242212924221292, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 8.391897478296818e-07, |
|
"loss": 0.1396, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.9260808926080892, |
|
"grad_norm": 2.34375, |
|
"learning_rate": 8.185200496072758e-07, |
|
"loss": 0.1447, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.9279404927940493, |
|
"grad_norm": 2.109375, |
|
"learning_rate": 7.978503513848699e-07, |
|
"loss": 0.1362, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.9298000929800093, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 7.771806531624638e-07, |
|
"loss": 0.1418, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.9316596931659693, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 7.565109549400579e-07, |
|
"loss": 0.1372, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.9335192933519293, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 7.358412567176519e-07, |
|
"loss": 0.1421, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.9353788935378894, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 7.151715584952461e-07, |
|
"loss": 0.1426, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.9372384937238494, |
|
"grad_norm": 2.125, |
|
"learning_rate": 6.945018602728401e-07, |
|
"loss": 0.1361, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.9390980939098094, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 6.738321620504341e-07, |
|
"loss": 0.1367, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.9409576940957695, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 6.531624638280282e-07, |
|
"loss": 0.1408, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.9428172942817294, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 6.324927656056222e-07, |
|
"loss": 0.141, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.9446768944676894, |
|
"grad_norm": 2.359375, |
|
"learning_rate": 6.118230673832163e-07, |
|
"loss": 0.1346, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.9465364946536494, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 5.911533691608104e-07, |
|
"loss": 0.14, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.9483960948396095, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 5.704836709384043e-07, |
|
"loss": 0.1347, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.9502556950255695, |
|
"grad_norm": 1.96875, |
|
"learning_rate": 5.498139727159984e-07, |
|
"loss": 0.1354, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.9521152952115295, |
|
"grad_norm": 2.203125, |
|
"learning_rate": 5.291442744935924e-07, |
|
"loss": 0.1394, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.9539748953974896, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 5.084745762711865e-07, |
|
"loss": 0.1392, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.9558344955834496, |
|
"grad_norm": 1.9140625, |
|
"learning_rate": 4.878048780487805e-07, |
|
"loss": 0.1394, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.9576940957694096, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 4.671351798263746e-07, |
|
"loss": 0.1401, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.9595536959553695, |
|
"grad_norm": 1.8984375, |
|
"learning_rate": 4.464654816039686e-07, |
|
"loss": 0.1324, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.9614132961413296, |
|
"grad_norm": 2.0, |
|
"learning_rate": 4.2579578338156263e-07, |
|
"loss": 0.1408, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.9632728963272896, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 4.051260851591567e-07, |
|
"loss": 0.1336, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.9651324965132496, |
|
"grad_norm": 2.234375, |
|
"learning_rate": 3.8445638693675074e-07, |
|
"loss": 0.14, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.9669920966992097, |
|
"grad_norm": 2.296875, |
|
"learning_rate": 3.6378668871434477e-07, |
|
"loss": 0.1442, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.9688516968851697, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 3.431169904919388e-07, |
|
"loss": 0.1428, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.9707112970711297, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 3.2244729226953293e-07, |
|
"loss": 0.1405, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.9725708972570897, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 3.0177759404712695e-07, |
|
"loss": 0.1325, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.9744304974430498, |
|
"grad_norm": 1.921875, |
|
"learning_rate": 2.81107895824721e-07, |
|
"loss": 0.139, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.9762900976290098, |
|
"grad_norm": 2.171875, |
|
"learning_rate": 2.6043819760231506e-07, |
|
"loss": 0.139, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.9781496978149697, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 2.397684993799091e-07, |
|
"loss": 0.1393, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.9800092980009298, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 2.1909880115750314e-07, |
|
"loss": 0.1484, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.9818688981868898, |
|
"grad_norm": 2.09375, |
|
"learning_rate": 1.9842910293509717e-07, |
|
"loss": 0.1426, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.9837284983728498, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 1.777594047126912e-07, |
|
"loss": 0.1441, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.9855880985588099, |
|
"grad_norm": 2.140625, |
|
"learning_rate": 1.5708970649028525e-07, |
|
"loss": 0.1423, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.9874476987447699, |
|
"grad_norm": 2.03125, |
|
"learning_rate": 1.364200082678793e-07, |
|
"loss": 0.139, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.9893072989307299, |
|
"grad_norm": 2.0, |
|
"learning_rate": 1.1575031004547335e-07, |
|
"loss": 0.1355, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.9911668991166899, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 9.50806118230674e-08, |
|
"loss": 0.1443, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.99302649930265, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 7.441091360066144e-08, |
|
"loss": 0.1404, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.99488609948861, |
|
"grad_norm": 2.28125, |
|
"learning_rate": 5.3741215378255483e-08, |
|
"loss": 0.1501, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.9967456996745699, |
|
"grad_norm": 2.078125, |
|
"learning_rate": 3.3071517155849524e-08, |
|
"loss": 0.1454, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.99860529986053, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 1.2401818933443573e-08, |
|
"loss": 0.1473, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.999721059972106, |
|
"step": 2688, |
|
"total_flos": 2.4410094732092375e+18, |
|
"train_loss": 0.15163021730924292, |
|
"train_runtime": 3273.2323, |
|
"train_samples_per_second": 52.569, |
|
"train_steps_per_second": 0.821 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2688, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 269, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.4410094732092375e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|