|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.1988619347109912, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02395926924228811, |
|
"grad_norm": 16.852388381958008, |
|
"learning_rate": 1.1904761904761906e-06, |
|
"loss": 1.3705, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.04791853848457622, |
|
"grad_norm": 4.8825883865356445, |
|
"learning_rate": 2.380952380952381e-06, |
|
"loss": 0.9316, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07187780772686433, |
|
"grad_norm": 2.1795146465301514, |
|
"learning_rate": 3.5714285714285718e-06, |
|
"loss": 0.5955, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.09583707696915245, |
|
"grad_norm": 1.2667920589447021, |
|
"learning_rate": 4.761904761904762e-06, |
|
"loss": 0.4746, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.11979634621144056, |
|
"grad_norm": 1.1073658466339111, |
|
"learning_rate": 5.9523809523809525e-06, |
|
"loss": 0.421, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.14375561545372867, |
|
"grad_norm": 1.2127255201339722, |
|
"learning_rate": 7.1428571428571436e-06, |
|
"loss": 0.3877, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.16771488469601678, |
|
"grad_norm": 1.0557327270507812, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.3716, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1916741539383049, |
|
"grad_norm": 1.04291570186615, |
|
"learning_rate": 9.523809523809525e-06, |
|
"loss": 0.3575, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.215633423180593, |
|
"grad_norm": 1.1732438802719116, |
|
"learning_rate": 9.9984209464165e-06, |
|
"loss": 0.3494, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.23959269242288112, |
|
"grad_norm": 1.0016248226165771, |
|
"learning_rate": 9.988774786134235e-06, |
|
"loss": 0.343, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2635519616651692, |
|
"grad_norm": 1.0416539907455444, |
|
"learning_rate": 9.970376619680024e-06, |
|
"loss": 0.3379, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.28751123090745734, |
|
"grad_norm": 0.9761713743209839, |
|
"learning_rate": 9.94325872368957e-06, |
|
"loss": 0.3297, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3114705001497454, |
|
"grad_norm": 1.0316894054412842, |
|
"learning_rate": 9.907468672167165e-06, |
|
"loss": 0.3317, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.33542976939203356, |
|
"grad_norm": 0.9167680740356445, |
|
"learning_rate": 9.863069253024719e-06, |
|
"loss": 0.3275, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.35938903863432164, |
|
"grad_norm": 0.8503734469413757, |
|
"learning_rate": 9.81013835793043e-06, |
|
"loss": 0.3237, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3833483078766098, |
|
"grad_norm": 0.8351507186889648, |
|
"learning_rate": 9.748768845660335e-06, |
|
"loss": 0.3213, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.40730757711889787, |
|
"grad_norm": 0.8993275165557861, |
|
"learning_rate": 9.679068379192455e-06, |
|
"loss": 0.3178, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.431266846361186, |
|
"grad_norm": 0.8393663167953491, |
|
"learning_rate": 9.601159236829353e-06, |
|
"loss": 0.3162, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4552261156034741, |
|
"grad_norm": 0.7886475920677185, |
|
"learning_rate": 9.515178097680437e-06, |
|
"loss": 0.3165, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.47918538484576223, |
|
"grad_norm": 0.8455172777175903, |
|
"learning_rate": 9.421275801880363e-06, |
|
"loss": 0.3118, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5031446540880503, |
|
"grad_norm": 0.8376420140266418, |
|
"learning_rate": 9.319617085964177e-06, |
|
"loss": 0.3102, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5271039233303384, |
|
"grad_norm": 0.8927198052406311, |
|
"learning_rate": 9.210380293863462e-06, |
|
"loss": 0.311, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5510631925726265, |
|
"grad_norm": 0.7255620360374451, |
|
"learning_rate": 9.093757064030473e-06, |
|
"loss": 0.3086, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5750224618149147, |
|
"grad_norm": 0.7063446640968323, |
|
"learning_rate": 8.969951993239177e-06, |
|
"loss": 0.3053, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5989817310572028, |
|
"grad_norm": 0.7932593822479248, |
|
"learning_rate": 8.83918227765299e-06, |
|
"loss": 0.304, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6229410002994908, |
|
"grad_norm": 0.7496311664581299, |
|
"learning_rate": 8.701677331788891e-06, |
|
"loss": 0.3022, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6469002695417789, |
|
"grad_norm": 0.7745276689529419, |
|
"learning_rate": 8.557678386046429e-06, |
|
"loss": 0.3033, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6708595387840671, |
|
"grad_norm": 0.68818598985672, |
|
"learning_rate": 8.4074380635076e-06, |
|
"loss": 0.3017, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6948188080263552, |
|
"grad_norm": 0.8030797243118286, |
|
"learning_rate": 8.251219936750145e-06, |
|
"loss": 0.3049, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7187780772686433, |
|
"grad_norm": 0.7876457571983337, |
|
"learning_rate": 8.089298065451673e-06, |
|
"loss": 0.297, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7427373465109314, |
|
"grad_norm": 0.693367600440979, |
|
"learning_rate": 7.921956515595861e-06, |
|
"loss": 0.2955, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7666966157532196, |
|
"grad_norm": 0.7320050001144409, |
|
"learning_rate": 7.7494888611242e-06, |
|
"loss": 0.2955, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.7906558849955077, |
|
"grad_norm": 0.7215656042098999, |
|
"learning_rate": 7.572197668907533e-06, |
|
"loss": 0.2936, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8146151542377957, |
|
"grad_norm": 0.7600423693656921, |
|
"learning_rate": 7.390393967940962e-06, |
|
"loss": 0.2942, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8385744234800838, |
|
"grad_norm": 0.7513334155082703, |
|
"learning_rate": 7.2043967036932935e-06, |
|
"loss": 0.2938, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.862533692722372, |
|
"grad_norm": 0.6746324300765991, |
|
"learning_rate": 7.014532178568314e-06, |
|
"loss": 0.29, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8864929619646601, |
|
"grad_norm": 0.6711202263832092, |
|
"learning_rate": 6.821133479459492e-06, |
|
"loss": 0.2922, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9104522312069482, |
|
"grad_norm": 0.657655656337738, |
|
"learning_rate": 6.624539893402383e-06, |
|
"loss": 0.2867, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9344115004492363, |
|
"grad_norm": 0.762762188911438, |
|
"learning_rate": 6.425096312349881e-06, |
|
"loss": 0.2908, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9583707696915245, |
|
"grad_norm": 0.6959704756736755, |
|
"learning_rate": 6.223152628114537e-06, |
|
"loss": 0.2871, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9823300389338125, |
|
"grad_norm": 0.7447584271430969, |
|
"learning_rate": 6.019063118539425e-06, |
|
"loss": 0.2898, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.0071877807726863, |
|
"grad_norm": 0.6954111456871033, |
|
"learning_rate": 5.813185825974419e-06, |
|
"loss": 0.2862, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.0311470500149746, |
|
"grad_norm": 0.5891335606575012, |
|
"learning_rate": 5.605881929148254e-06, |
|
"loss": 0.2651, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.0551063192572627, |
|
"grad_norm": 0.6528168320655823, |
|
"learning_rate": 5.3975151095383e-06, |
|
"loss": 0.2648, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.0790655884995508, |
|
"grad_norm": 0.6022618412971497, |
|
"learning_rate": 5.188450913349674e-06, |
|
"loss": 0.2634, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.103024857741839, |
|
"grad_norm": 0.6223345994949341, |
|
"learning_rate": 4.979056110222982e-06, |
|
"loss": 0.2613, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.126984126984127, |
|
"grad_norm": 0.618979275226593, |
|
"learning_rate": 4.769698049795739e-06, |
|
"loss": 0.2626, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.150943396226415, |
|
"grad_norm": 0.5937119126319885, |
|
"learning_rate": 4.560744017246284e-06, |
|
"loss": 0.2597, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.1749026654687031, |
|
"grad_norm": 0.6550348401069641, |
|
"learning_rate": 4.352560588950766e-06, |
|
"loss": 0.263, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.1988619347109912, |
|
"grad_norm": 0.5479949116706848, |
|
"learning_rate": 4.145512989383618e-06, |
|
"loss": 0.2599, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 834, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0159083954242585e+19, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|