|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 625, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 14.25, |
|
"learning_rate": 3.1746031746031746e-06, |
|
"loss": 2.4132, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 11.4375, |
|
"learning_rate": 6.349206349206349e-06, |
|
"loss": 2.462, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 11.0, |
|
"learning_rate": 9.523809523809525e-06, |
|
"loss": 2.3388, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 1.2698412698412699e-05, |
|
"loss": 2.1813, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 1.5873015873015872e-05, |
|
"loss": 1.9853, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 9.75, |
|
"learning_rate": 1.904761904761905e-05, |
|
"loss": 1.9632, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 1.9750889679715305e-05, |
|
"loss": 1.7842, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 7.75, |
|
"learning_rate": 1.939501779359431e-05, |
|
"loss": 1.7123, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 1.903914590747331e-05, |
|
"loss": 1.6547, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 1.8683274021352315e-05, |
|
"loss": 1.5779, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 1.832740213523132e-05, |
|
"loss": 1.5657, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 5.9375, |
|
"learning_rate": 1.7971530249110324e-05, |
|
"loss": 1.55, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 1.7615658362989325e-05, |
|
"loss": 1.504, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 1.725978647686833e-05, |
|
"loss": 1.5066, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 1.690391459074733e-05, |
|
"loss": 1.5028, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 1.6548042704626336e-05, |
|
"loss": 1.4306, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 7.875, |
|
"learning_rate": 1.619217081850534e-05, |
|
"loss": 1.4841, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 6.8125, |
|
"learning_rate": 1.583629893238434e-05, |
|
"loss": 1.4241, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 1.5480427046263346e-05, |
|
"loss": 1.4209, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 1.5124555160142349e-05, |
|
"loss": 1.4404, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 1.4768683274021354e-05, |
|
"loss": 1.382, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 1.4412811387900356e-05, |
|
"loss": 1.4079, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 6.0, |
|
"learning_rate": 1.4056939501779361e-05, |
|
"loss": 1.3182, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 6.0, |
|
"learning_rate": 1.3701067615658364e-05, |
|
"loss": 1.3337, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 6.0, |
|
"learning_rate": 1.3345195729537369e-05, |
|
"loss": 1.3165, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 1.298932384341637e-05, |
|
"loss": 1.2977, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 6.5625, |
|
"learning_rate": 1.2633451957295374e-05, |
|
"loss": 1.3135, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 7.65625, |
|
"learning_rate": 1.2277580071174377e-05, |
|
"loss": 1.281, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 6.125, |
|
"learning_rate": 1.1921708185053382e-05, |
|
"loss": 1.3239, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 1.1565836298932385e-05, |
|
"loss": 1.287, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 6.0, |
|
"learning_rate": 1.120996441281139e-05, |
|
"loss": 1.3248, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 1.0854092526690392e-05, |
|
"loss": 1.3225, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 6.375, |
|
"learning_rate": 1.0498220640569397e-05, |
|
"loss": 1.3121, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 1.01423487544484e-05, |
|
"loss": 1.2764, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 9.786476868327403e-06, |
|
"loss": 1.2815, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 9.430604982206405e-06, |
|
"loss": 1.2981, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 9.07473309608541e-06, |
|
"loss": 1.2226, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 7.125, |
|
"learning_rate": 8.718861209964413e-06, |
|
"loss": 1.2702, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 8.362989323843418e-06, |
|
"loss": 1.1996, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 6.21875, |
|
"learning_rate": 8.00711743772242e-06, |
|
"loss": 1.2685, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 7.651245551601423e-06, |
|
"loss": 1.3484, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 7.295373665480427e-06, |
|
"loss": 1.2453, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 5.875, |
|
"learning_rate": 6.939501779359431e-06, |
|
"loss": 1.2525, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 6.5836298932384346e-06, |
|
"loss": 1.1891, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 6.227758007117438e-06, |
|
"loss": 1.2031, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 5.871886120996442e-06, |
|
"loss": 1.2297, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 5.516014234875445e-06, |
|
"loss": 1.1854, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 5.160142348754449e-06, |
|
"loss": 1.277, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 4.8042704626334524e-06, |
|
"loss": 1.1824, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 7.625, |
|
"learning_rate": 4.448398576512456e-06, |
|
"loss": 1.2103, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 7.5, |
|
"learning_rate": 4.09252669039146e-06, |
|
"loss": 1.1943, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 3.7366548042704632e-06, |
|
"loss": 1.2306, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 3.3807829181494666e-06, |
|
"loss": 1.2586, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 3.0249110320284703e-06, |
|
"loss": 1.2192, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 2.669039145907473e-06, |
|
"loss": 1.207, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 5.875, |
|
"learning_rate": 2.313167259786477e-06, |
|
"loss": 1.2041, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 6.5625, |
|
"learning_rate": 1.9572953736654807e-06, |
|
"loss": 1.243, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 5.75, |
|
"learning_rate": 1.6014234875444842e-06, |
|
"loss": 1.2168, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 7.65625, |
|
"learning_rate": 1.2455516014234877e-06, |
|
"loss": 1.2393, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 8.896797153024913e-07, |
|
"loss": 1.2151, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 8.125, |
|
"learning_rate": 5.338078291814947e-07, |
|
"loss": 1.2361, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 1.7793594306049826e-07, |
|
"loss": 1.2307, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 625, |
|
"total_flos": 7089149865623552.0, |
|
"train_loss": 1.4169092582702636, |
|
"train_runtime": 351.5262, |
|
"train_samples_per_second": 28.447, |
|
"train_steps_per_second": 1.778 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 625, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7089149865623552.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|