{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 100, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2, "grad_norm": 0.1518353819847107, "learning_rate": 5e-05, "loss": 1.1279, "step": 100 }, { "epoch": 0.2, "eval_loss": 1.0322937965393066, "eval_runtime": 186.6896, "eval_samples_per_second": 0.761, "eval_steps_per_second": 0.761, "step": 100 }, { "epoch": 0.4, "grad_norm": 0.3351062834262848, "learning_rate": 4.89795918367347e-05, "loss": 1.0369, "step": 200 }, { "epoch": 0.4, "eval_loss": 0.9710885882377625, "eval_runtime": 182.1459, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 200 }, { "epoch": 0.6, "grad_norm": 0.3858621120452881, "learning_rate": 4.795918367346939e-05, "loss": 1.004, "step": 300 }, { "epoch": 0.6, "eval_loss": 0.9358227252960205, "eval_runtime": 181.6505, "eval_samples_per_second": 0.782, "eval_steps_per_second": 0.782, "step": 300 }, { "epoch": 0.8, "grad_norm": 0.40071049332618713, "learning_rate": 4.6938775510204086e-05, "loss": 0.955, "step": 400 }, { "epoch": 0.8, "eval_loss": 0.9108649492263794, "eval_runtime": 181.8409, "eval_samples_per_second": 0.781, "eval_steps_per_second": 0.781, "step": 400 }, { "epoch": 1.0, "grad_norm": 0.41050222516059875, "learning_rate": 4.591836734693878e-05, "loss": 0.9817, "step": 500 }, { "epoch": 1.0, "eval_loss": 0.8939587473869324, "eval_runtime": 181.68, "eval_samples_per_second": 0.782, "eval_steps_per_second": 0.782, "step": 500 }, { "epoch": 1.2, "grad_norm": 0.5476531982421875, "learning_rate": 4.4897959183673474e-05, "loss": 0.8819, "step": 600 }, { "epoch": 1.2, "eval_loss": 0.8783901333808899, "eval_runtime": 182.0958, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 600 }, { "epoch": 1.4, "grad_norm": 0.5108929872512817, "learning_rate": 4.387755102040816e-05, "loss": 0.8373, "step": 700 }, { "epoch": 1.4, "eval_loss": 0.8652149438858032, "eval_runtime": 181.7715, "eval_samples_per_second": 0.781, "eval_steps_per_second": 0.781, "step": 700 }, { "epoch": 1.6, "grad_norm": 0.5909234881401062, "learning_rate": 4.2857142857142856e-05, "loss": 0.8588, "step": 800 }, { "epoch": 1.6, "eval_loss": 0.8540464639663696, "eval_runtime": 181.5147, "eval_samples_per_second": 0.782, "eval_steps_per_second": 0.782, "step": 800 }, { "epoch": 1.8, "grad_norm": 0.5984646081924438, "learning_rate": 4.183673469387756e-05, "loss": 0.8656, "step": 900 }, { "epoch": 1.8, "eval_loss": 0.8421266674995422, "eval_runtime": 181.6623, "eval_samples_per_second": 0.782, "eval_steps_per_second": 0.782, "step": 900 }, { "epoch": 2.0, "grad_norm": 0.6141483783721924, "learning_rate": 4.0816326530612245e-05, "loss": 0.8625, "step": 1000 }, { "epoch": 2.0, "eval_loss": 0.8279427289962769, "eval_runtime": 181.7062, "eval_samples_per_second": 0.781, "eval_steps_per_second": 0.781, "step": 1000 }, { "epoch": 2.2, "grad_norm": 0.6167280077934265, "learning_rate": 3.979591836734694e-05, "loss": 0.8022, "step": 1100 }, { "epoch": 2.2, "eval_loss": 0.8208828568458557, "eval_runtime": 182.0493, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 1100 }, { "epoch": 2.4, "grad_norm": 0.6359805464744568, "learning_rate": 3.8775510204081634e-05, "loss": 0.7596, "step": 1200 }, { "epoch": 2.4, "eval_loss": 0.814860463142395, "eval_runtime": 181.8235, "eval_samples_per_second": 0.781, "eval_steps_per_second": 0.781, "step": 1200 }, { "epoch": 2.6, "grad_norm": 0.723619818687439, "learning_rate": 3.775510204081633e-05, "loss": 0.7715, "step": 1300 }, { "epoch": 2.6, "eval_loss": 0.7978885173797607, "eval_runtime": 182.0693, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 1300 }, { "epoch": 2.8, "grad_norm": 0.7059823274612427, "learning_rate": 3.673469387755102e-05, "loss": 0.7314, "step": 1400 }, { "epoch": 2.8, "eval_loss": 0.7922654747962952, "eval_runtime": 182.558, "eval_samples_per_second": 0.778, "eval_steps_per_second": 0.778, "step": 1400 }, { "epoch": 3.0, "grad_norm": 0.5519229173660278, "learning_rate": 3.571428571428572e-05, "loss": 0.7402, "step": 1500 }, { "epoch": 3.0, "eval_loss": 0.782792329788208, "eval_runtime": 182.1181, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 1500 }, { "epoch": 3.2, "grad_norm": 0.6199519634246826, "learning_rate": 3.469387755102041e-05, "loss": 0.6767, "step": 1600 }, { "epoch": 3.2, "eval_loss": 0.7840178608894348, "eval_runtime": 178.9485, "eval_samples_per_second": 0.794, "eval_steps_per_second": 0.794, "step": 1600 }, { "epoch": 3.4, "grad_norm": 0.642126202583313, "learning_rate": 3.36734693877551e-05, "loss": 0.6481, "step": 1700 }, { "epoch": 3.4, "eval_loss": 0.7741957306861877, "eval_runtime": 179.1517, "eval_samples_per_second": 0.793, "eval_steps_per_second": 0.793, "step": 1700 }, { "epoch": 3.6, "grad_norm": 0.7174199819564819, "learning_rate": 3.265306122448979e-05, "loss": 0.6689, "step": 1800 }, { "epoch": 3.6, "eval_loss": 0.7609220743179321, "eval_runtime": 180.4754, "eval_samples_per_second": 0.787, "eval_steps_per_second": 0.787, "step": 1800 }, { "epoch": 3.8, "grad_norm": 0.6541227698326111, "learning_rate": 3.1632653061224494e-05, "loss": 0.6652, "step": 1900 }, { "epoch": 3.8, "eval_loss": 0.75916987657547, "eval_runtime": 179.9759, "eval_samples_per_second": 0.789, "eval_steps_per_second": 0.789, "step": 1900 }, { "epoch": 4.0, "grad_norm": 0.889131486415863, "learning_rate": 3.061224489795919e-05, "loss": 0.6938, "step": 2000 }, { "epoch": 4.0, "eval_loss": 0.7402585744857788, "eval_runtime": 181.8785, "eval_samples_per_second": 0.781, "eval_steps_per_second": 0.781, "step": 2000 }, { "epoch": 4.2, "grad_norm": 0.9711707234382629, "learning_rate": 2.959183673469388e-05, "loss": 0.5935, "step": 2100 }, { "epoch": 4.2, "eval_loss": 0.7453898191452026, "eval_runtime": 181.9504, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 2100 }, { "epoch": 4.4, "grad_norm": 1.0045188665390015, "learning_rate": 2.857142857142857e-05, "loss": 0.5731, "step": 2200 }, { "epoch": 4.4, "eval_loss": 0.7436273097991943, "eval_runtime": 182.1074, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 2200 }, { "epoch": 4.6, "grad_norm": 0.7766691446304321, "learning_rate": 2.7551020408163265e-05, "loss": 0.6094, "step": 2300 }, { "epoch": 4.6, "eval_loss": 0.7243757843971252, "eval_runtime": 182.5282, "eval_samples_per_second": 0.778, "eval_steps_per_second": 0.778, "step": 2300 }, { "epoch": 4.8, "grad_norm": 0.8756657242774963, "learning_rate": 2.6530612244897963e-05, "loss": 0.5904, "step": 2400 }, { "epoch": 4.8, "eval_loss": 0.7280852794647217, "eval_runtime": 182.6182, "eval_samples_per_second": 0.778, "eval_steps_per_second": 0.778, "step": 2400 }, { "epoch": 5.0, "grad_norm": 0.9650371670722961, "learning_rate": 2.5510204081632654e-05, "loss": 0.5649, "step": 2500 }, { "epoch": 5.0, "eval_loss": 0.714751124382019, "eval_runtime": 182.1779, "eval_samples_per_second": 0.779, "eval_steps_per_second": 0.779, "step": 2500 }, { "epoch": 5.2, "grad_norm": 0.9919518828392029, "learning_rate": 2.448979591836735e-05, "loss": 0.5211, "step": 2600 }, { "epoch": 5.2, "eval_loss": 0.719607949256897, "eval_runtime": 182.2031, "eval_samples_per_second": 0.779, "eval_steps_per_second": 0.779, "step": 2600 }, { "epoch": 5.4, "grad_norm": 0.979714572429657, "learning_rate": 2.3469387755102043e-05, "loss": 0.536, "step": 2700 }, { "epoch": 5.4, "eval_loss": 0.7172472476959229, "eval_runtime": 182.2624, "eval_samples_per_second": 0.779, "eval_steps_per_second": 0.779, "step": 2700 }, { "epoch": 5.6, "grad_norm": 0.9355886578559875, "learning_rate": 2.2448979591836737e-05, "loss": 0.4957, "step": 2800 }, { "epoch": 5.6, "eval_loss": 0.7044922113418579, "eval_runtime": 182.1756, "eval_samples_per_second": 0.779, "eval_steps_per_second": 0.779, "step": 2800 }, { "epoch": 5.8, "grad_norm": 0.907577633857727, "learning_rate": 2.1428571428571428e-05, "loss": 0.4935, "step": 2900 }, { "epoch": 5.8, "eval_loss": 0.6986051201820374, "eval_runtime": 182.1701, "eval_samples_per_second": 0.779, "eval_steps_per_second": 0.779, "step": 2900 }, { "epoch": 6.0, "grad_norm": 0.6769922971725464, "learning_rate": 2.0408163265306123e-05, "loss": 0.5038, "step": 3000 }, { "epoch": 6.0, "eval_loss": 0.6920154690742493, "eval_runtime": 182.1079, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 3000 }, { "epoch": 6.2, "grad_norm": 0.7743176817893982, "learning_rate": 1.9387755102040817e-05, "loss": 0.4357, "step": 3100 }, { "epoch": 6.2, "eval_loss": 0.7011306285858154, "eval_runtime": 182.6215, "eval_samples_per_second": 0.778, "eval_steps_per_second": 0.778, "step": 3100 }, { "epoch": 6.4, "grad_norm": 0.9698778986930847, "learning_rate": 1.836734693877551e-05, "loss": 0.435, "step": 3200 }, { "epoch": 6.4, "eval_loss": 0.6926498413085938, "eval_runtime": 182.1258, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 3200 }, { "epoch": 6.6, "grad_norm": 1.2019530534744263, "learning_rate": 1.7346938775510206e-05, "loss": 0.4521, "step": 3300 }, { "epoch": 6.6, "eval_loss": 0.6842972636222839, "eval_runtime": 182.0796, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 3300 }, { "epoch": 6.8, "grad_norm": 1.1843819618225098, "learning_rate": 1.6326530612244897e-05, "loss": 0.4559, "step": 3400 }, { "epoch": 6.8, "eval_loss": 0.6817460656166077, "eval_runtime": 182.1245, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 3400 }, { "epoch": 7.0, "grad_norm": 0.9009637832641602, "learning_rate": 1.5306122448979594e-05, "loss": 0.447, "step": 3500 }, { "epoch": 7.0, "eval_loss": 0.6794432401657104, "eval_runtime": 182.0907, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 3500 }, { "epoch": 7.2, "grad_norm": 1.0701977014541626, "learning_rate": 1.4285714285714285e-05, "loss": 0.3809, "step": 3600 }, { "epoch": 7.2, "eval_loss": 0.6871351599693298, "eval_runtime": 182.0665, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 3600 }, { "epoch": 7.4, "grad_norm": 0.9625453352928162, "learning_rate": 1.3265306122448982e-05, "loss": 0.3836, "step": 3700 }, { "epoch": 7.4, "eval_loss": 0.6862939596176147, "eval_runtime": 182.0265, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 3700 }, { "epoch": 7.6, "grad_norm": 1.1868125200271606, "learning_rate": 1.2244897959183674e-05, "loss": 0.3889, "step": 3800 }, { "epoch": 7.6, "eval_loss": 0.6783468723297119, "eval_runtime": 182.5364, "eval_samples_per_second": 0.778, "eval_steps_per_second": 0.778, "step": 3800 }, { "epoch": 7.8, "grad_norm": 0.9025134444236755, "learning_rate": 1.1224489795918369e-05, "loss": 0.4134, "step": 3900 }, { "epoch": 7.8, "eval_loss": 0.6690346002578735, "eval_runtime": 181.8517, "eval_samples_per_second": 0.781, "eval_steps_per_second": 0.781, "step": 3900 }, { "epoch": 8.0, "grad_norm": 1.2734243869781494, "learning_rate": 1.0204081632653061e-05, "loss": 0.4018, "step": 4000 }, { "epoch": 8.0, "eval_loss": 0.6694031953811646, "eval_runtime": 182.0072, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 4000 }, { "epoch": 8.2, "grad_norm": 0.974176824092865, "learning_rate": 9.183673469387756e-06, "loss": 0.3526, "step": 4100 }, { "epoch": 8.2, "eval_loss": 0.6763675212860107, "eval_runtime": 182.0229, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 4100 }, { "epoch": 8.4, "grad_norm": 0.9596600532531738, "learning_rate": 8.163265306122448e-06, "loss": 0.3373, "step": 4200 }, { "epoch": 8.4, "eval_loss": 0.6745020747184753, "eval_runtime": 181.8856, "eval_samples_per_second": 0.781, "eval_steps_per_second": 0.781, "step": 4200 }, { "epoch": 8.6, "grad_norm": 0.9683770537376404, "learning_rate": 7.142857142857143e-06, "loss": 0.3704, "step": 4300 }, { "epoch": 8.6, "eval_loss": 0.6758388876914978, "eval_runtime": 181.9951, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 4300 }, { "epoch": 8.8, "grad_norm": 1.0410691499710083, "learning_rate": 6.122448979591837e-06, "loss": 0.3482, "step": 4400 }, { "epoch": 8.8, "eval_loss": 0.6700472831726074, "eval_runtime": 181.8829, "eval_samples_per_second": 0.781, "eval_steps_per_second": 0.781, "step": 4400 }, { "epoch": 9.0, "grad_norm": 1.0724315643310547, "learning_rate": 5.102040816326531e-06, "loss": 0.3621, "step": 4500 }, { "epoch": 9.0, "eval_loss": 0.66424161195755, "eval_runtime": 182.2196, "eval_samples_per_second": 0.779, "eval_steps_per_second": 0.779, "step": 4500 }, { "epoch": 9.2, "grad_norm": 1.2040272951126099, "learning_rate": 4.081632653061224e-06, "loss": 0.3353, "step": 4600 }, { "epoch": 9.2, "eval_loss": 0.6743778586387634, "eval_runtime": 182.2041, "eval_samples_per_second": 0.779, "eval_steps_per_second": 0.779, "step": 4600 }, { "epoch": 9.4, "grad_norm": 1.0811737775802612, "learning_rate": 3.0612244897959185e-06, "loss": 0.3265, "step": 4700 }, { "epoch": 9.4, "eval_loss": 0.6780717372894287, "eval_runtime": 182.1426, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 4700 }, { "epoch": 9.6, "grad_norm": 0.8568335771560669, "learning_rate": 2.040816326530612e-06, "loss": 0.3247, "step": 4800 }, { "epoch": 9.6, "eval_loss": 0.6788098216056824, "eval_runtime": 182.2171, "eval_samples_per_second": 0.779, "eval_steps_per_second": 0.779, "step": 4800 }, { "epoch": 9.8, "grad_norm": 1.1256827116012573, "learning_rate": 1.020408163265306e-06, "loss": 0.3355, "step": 4900 }, { "epoch": 9.8, "eval_loss": 0.6737083196640015, "eval_runtime": 182.1568, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 4900 }, { "epoch": 10.0, "grad_norm": 1.1654198169708252, "learning_rate": 0.0, "loss": 0.3152, "step": 5000 }, { "epoch": 10.0, "eval_loss": 0.6746455430984497, "eval_runtime": 182.1302, "eval_samples_per_second": 0.78, "eval_steps_per_second": 0.78, "step": 5000 } ], "logging_steps": 100, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.670208882951987e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }