{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 0.6091650128364563, "learning_rate": 1.5384615384615387e-05, "loss": 1.7915, "mean_token_accuracy": 0.6200876832008362, "step": 1 }, { "epoch": 0.04, "grad_norm": 0.536028265953064, "learning_rate": 7.692307692307693e-05, "loss": 1.7044, "mean_token_accuracy": 0.6384097561240196, "step": 5 }, { "epoch": 0.08, "grad_norm": 0.3632589876651764, "learning_rate": 0.00015384615384615385, "loss": 1.5683, "mean_token_accuracy": 0.6574486196041107, "step": 10 }, { "epoch": 0.12, "grad_norm": 0.38857078552246094, "learning_rate": 0.00019984268150178167, "loss": 1.3468, "mean_token_accuracy": 0.6951748728752136, "step": 15 }, { "epoch": 0.16, "grad_norm": 0.5654380321502686, "learning_rate": 0.00019807852804032305, "loss": 1.1075, "mean_token_accuracy": 0.7350626409053802, "step": 20 }, { "epoch": 0.2, "grad_norm": 0.5035667419433594, "learning_rate": 0.00019438833303083678, "loss": 0.8489, "mean_token_accuracy": 0.8006348431110382, "step": 25 }, { "epoch": 0.24, "grad_norm": 0.28405892848968506, "learning_rate": 0.00018884456359788724, "loss": 0.8182, "mean_token_accuracy": 0.8062106907367707, "step": 30 }, { "epoch": 0.28, "grad_norm": 0.20907118916511536, "learning_rate": 0.00018155608689592604, "loss": 0.7165, "mean_token_accuracy": 0.8268509089946747, "step": 35 }, { "epoch": 0.32, "grad_norm": 0.22087477147579193, "learning_rate": 0.0001726660322034027, "loss": 0.7139, "mean_token_accuracy": 0.8272643625736237, "step": 40 }, { "epoch": 0.36, "grad_norm": 0.23085324466228485, "learning_rate": 0.00016234898018587337, "loss": 0.6703, "mean_token_accuracy": 0.8364481091499328, "step": 45 }, { "epoch": 0.4, "grad_norm": 0.20583884418010712, "learning_rate": 0.00015080753452465296, "loss": 0.6077, "mean_token_accuracy": 0.8467150866985321, "step": 50 }, { "epoch": 0.44, "grad_norm": 0.20109635591506958, "learning_rate": 0.000138268343236509, "loss": 0.6485, "mean_token_accuracy": 0.8379231691360474, "step": 55 }, { "epoch": 0.48, "grad_norm": 0.19621974229812622, "learning_rate": 0.0001249776478167227, "loss": 0.661, "mean_token_accuracy": 0.8345841407775879, "step": 60 }, { "epoch": 0.52, "grad_norm": 0.2204616218805313, "learning_rate": 0.00011119644761033078, "loss": 0.6226, "mean_token_accuracy": 0.8435723900794982, "step": 65 }, { "epoch": 0.56, "grad_norm": 0.18189997971057892, "learning_rate": 9.719537437241312e-05, "loss": 0.5835, "mean_token_accuracy": 0.8532270193099976, "step": 70 }, { "epoch": 0.6, "grad_norm": 0.2008037269115448, "learning_rate": 8.324937766952638e-05, "loss": 0.6159, "mean_token_accuracy": 0.8451593399047852, "step": 75 }, { "epoch": 0.64, "grad_norm": 0.19920995831489563, "learning_rate": 6.963232548903853e-05, "loss": 0.6134, "mean_token_accuracy": 0.844947737455368, "step": 80 }, { "epoch": 0.68, "grad_norm": 0.21071140468120575, "learning_rate": 5.6611626088244194e-05, "loss": 0.6287, "mean_token_accuracy": 0.8413995563983917, "step": 85 }, { "epoch": 0.72, "grad_norm": 0.2002287358045578, "learning_rate": 4.444297669803981e-05, "loss": 0.5997, "mean_token_accuracy": 0.8461678445339202, "step": 90 }, { "epoch": 0.76, "grad_norm": 0.2541356086730957, "learning_rate": 3.336534220479961e-05, "loss": 0.6379, "mean_token_accuracy": 0.8375702917575836, "step": 95 }, { "epoch": 0.8, "grad_norm": 0.20431411266326904, "learning_rate": 2.3596262417839255e-05, "loss": 0.5859, "mean_token_accuracy": 0.8490248620510101, "step": 100 }, { "epoch": 0.84, "grad_norm": 0.2237038016319275, "learning_rate": 1.5327580077171587e-05, "loss": 0.6047, "mean_token_accuracy": 0.8456766724586486, "step": 105 }, { "epoch": 0.88, "grad_norm": 0.22031715512275696, "learning_rate": 8.72167349386811e-06, "loss": 0.5933, "mean_token_accuracy": 0.848451578617096, "step": 110 }, { "epoch": 0.92, "grad_norm": 0.21197955310344696, "learning_rate": 3.908267805490051e-06, "loss": 0.6018, "mean_token_accuracy": 0.8468173623085022, "step": 115 }, { "epoch": 0.96, "grad_norm": 0.20556138455867767, "learning_rate": 9.818874663554357e-07, "loss": 0.6277, "mean_token_accuracy": 0.8388452410697937, "step": 120 }, { "epoch": 1.0, "grad_norm": 0.2163011133670807, "learning_rate": 0.0, "loss": 0.6224, "mean_token_accuracy": 0.8408988118171692, "step": 125 }, { "epoch": 1.0, "step": 125, "total_flos": 4503614496178176.0, "train_loss": 0.7746698780059814, "train_runtime": 1219.5612, "train_samples_per_second": 0.82, "train_steps_per_second": 0.102 } ], "logging_steps": 5, "max_steps": 125, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4503614496178176.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }