{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.875912408759124, "eval_steps": 100, "global_step": 51, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 17.3368057012558, "epoch": 0.058394160583941604, "grad_norm": 0.5516418814659119, "kl": 0.0, "learning_rate": 3.333333333333333e-07, "loss": 0.0, "reward": 0.635416679084301, "reward_std": 0.33513265289366245, "rewards/semantic_entropy": 0.635416679084301, "step": 1 }, { "completion_length": 17.217013835906982, "epoch": 0.11678832116788321, "grad_norm": 0.4640360474586487, "kl": 0.0, "learning_rate": 6.666666666666666e-07, "loss": 0.0, "reward": 0.725694440305233, "reward_std": 0.29016363993287086, "rewards/semantic_entropy": 0.725694440305233, "step": 2 }, { "completion_length": 18.128472328186035, "epoch": 0.17518248175182483, "grad_norm": 0.540382981300354, "kl": 0.0012841224670410156, "learning_rate": 1e-06, "loss": 0.0001, "reward": 0.6006944552063942, "reward_std": 0.3786292914301157, "rewards/semantic_entropy": 0.6006944552063942, "step": 3 }, { "completion_length": 17.270833373069763, "epoch": 0.23357664233576642, "grad_norm": 0.5315675139427185, "kl": 0.0009038448333740234, "learning_rate": 1.3333333333333332e-06, "loss": 0.0, "reward": 0.666666679084301, "reward_std": 0.2968092504888773, "rewards/semantic_entropy": 0.666666679084301, "step": 4 }, { "completion_length": 18.501736402511597, "epoch": 0.291970802919708, "grad_norm": 0.5848979353904724, "kl": 0.0011067390441894531, "learning_rate": 1.6666666666666667e-06, "loss": 0.0, "reward": 0.642361119389534, "reward_std": 0.37005409598350525, "rewards/semantic_entropy": 0.642361119389534, "step": 5 }, { "completion_length": 19.102431058883667, "epoch": 0.35036496350364965, "grad_norm": 0.8071303367614746, "kl": 0.001129150390625, "learning_rate": 2e-06, "loss": 0.0, "reward": 0.5868055522441864, "reward_std": 0.40071484073996544, "rewards/semantic_entropy": 0.5868055522441864, "step": 6 }, { "completion_length": 18.661458492279053, "epoch": 0.40875912408759124, "grad_norm": 0.4785407781600952, "kl": 0.0016431808471679688, "learning_rate": 1.997564050259824e-06, "loss": 0.0001, "reward": 0.6614583432674408, "reward_std": 0.35613277927041054, "rewards/semantic_entropy": 0.6614583432674408, "step": 7 }, { "completion_length": 18.498263835906982, "epoch": 0.46715328467153283, "grad_norm": 0.7530333995819092, "kl": 0.0045261383056640625, "learning_rate": 1.99026806874157e-06, "loss": 0.0002, "reward": 0.6145833469927311, "reward_std": 0.35138164833188057, "rewards/semantic_entropy": 0.6145833469927311, "step": 8 }, { "completion_length": 19.208333611488342, "epoch": 0.5255474452554745, "grad_norm": 0.741835355758667, "kl": 0.00656890869140625, "learning_rate": 1.9781476007338054e-06, "loss": 0.0003, "reward": 0.5364583395421505, "reward_std": 0.39881302043795586, "rewards/semantic_entropy": 0.5364583395421505, "step": 9 }, { "completion_length": 18.463541746139526, "epoch": 0.583941605839416, "grad_norm": 0.8095004558563232, "kl": 0.0157012939453125, "learning_rate": 1.9612616959383188e-06, "loss": 0.0006, "reward": 0.6388888880610466, "reward_std": 0.3762888126075268, "rewards/semantic_entropy": 0.6388888880610466, "step": 10 }, { "completion_length": 16.302083492279053, "epoch": 0.6423357664233577, "grad_norm": 1.0500741004943848, "kl": 0.05213165283203125, "learning_rate": 1.9396926207859082e-06, "loss": 0.0021, "reward": 0.7083333432674408, "reward_std": 0.3477053064852953, "rewards/semantic_entropy": 0.7083333432674408, "step": 11 }, { "completion_length": 15.480902791023254, "epoch": 0.7007299270072993, "grad_norm": 0.6965835690498352, "kl": 0.107666015625, "learning_rate": 1.9135454576426007e-06, "loss": 0.0043, "reward": 0.6979166641831398, "reward_std": 0.3180003799498081, "rewards/semantic_entropy": 0.6979166641831398, "step": 12 }, { "completion_length": 16.611111283302307, "epoch": 0.7591240875912408, "grad_norm": 0.8703776001930237, "kl": 0.080535888671875, "learning_rate": 1.8829475928589268e-06, "loss": 0.0032, "reward": 0.6857638955116272, "reward_std": 0.3688342422246933, "rewards/semantic_entropy": 0.6857638955116272, "step": 13 }, { "completion_length": 14.387152791023254, "epoch": 0.8175182481751825, "grad_norm": 0.7894781827926636, "kl": 0.4075927734375, "learning_rate": 1.8480480961564257e-06, "loss": 0.0163, "reward": 0.6805555745959282, "reward_std": 0.31897793617099524, "rewards/semantic_entropy": 0.6805555745959282, "step": 14 }, { "completion_length": 14.901041746139526, "epoch": 0.8759124087591241, "grad_norm": 0.8611342906951904, "kl": 0.185882568359375, "learning_rate": 1.8090169943749474e-06, "loss": 0.0074, "reward": 0.7274305671453476, "reward_std": 0.27765000611543655, "rewards/semantic_entropy": 0.7274305671453476, "step": 15 }, { "completion_length": 13.159722089767456, "epoch": 0.9343065693430657, "grad_norm": 0.9914915561676025, "kl": 0.35858154296875, "learning_rate": 1.766044443118978e-06, "loss": 0.0143, "reward": 0.7239583432674408, "reward_std": 0.34174920059740543, "rewards/semantic_entropy": 0.7239583432674408, "step": 16 }, { "completion_length": 15.265625238418579, "epoch": 0.9927007299270073, "grad_norm": 0.7431650757789612, "kl": 0.18798828125, "learning_rate": 1.719339800338651e-06, "loss": 0.0075, "reward": 0.7552083358168602, "reward_std": 0.2897039409726858, "rewards/semantic_entropy": 0.7552083358168602, "step": 17 }, { "completion_length": 2.0, "epoch": 1.0, "grad_norm": 0.7431650757789612, "kl": 1.125, "learning_rate": 1.669130606358858e-06, "loss": 0.0012, "reward": 1.0, "reward_std": 0.38924944400787354, "rewards/semantic_entropy": 1.0, "step": 18 }, { "completion_length": 15.090277791023254, "epoch": 1.0583941605839415, "grad_norm": 0.8040208220481873, "kl": 0.3365478515625, "learning_rate": 1.615661475325658e-06, "loss": 0.0135, "reward": 0.7135416716337204, "reward_std": 0.3099258504807949, "rewards/semantic_entropy": 0.7135416716337204, "step": 19 }, { "completion_length": 15.520833373069763, "epoch": 1.1167883211678833, "grad_norm": 0.8632144927978516, "kl": 0.32586669921875, "learning_rate": 1.5591929034707466e-06, "loss": 0.0131, "reward": 0.737847238779068, "reward_std": 0.28588614612817764, "rewards/semantic_entropy": 0.737847238779068, "step": 20 }, { "completion_length": 16.050347328186035, "epoch": 1.1751824817518248, "grad_norm": 0.74057936668396, "kl": 0.1895751953125, "learning_rate": 1.5e-06, "loss": 0.0076, "reward": 0.75, "reward_std": 0.31531847827136517, "rewards/semantic_entropy": 0.75, "step": 21 }, { "completion_length": 16.63194465637207, "epoch": 1.2335766423357664, "grad_norm": 0.4329465627670288, "kl": 0.22442626953125, "learning_rate": 1.4383711467890773e-06, "loss": 0.009, "reward": 0.734375, "reward_std": 0.2730935662984848, "rewards/semantic_entropy": 0.734375, "step": 22 }, { "completion_length": 19.901041984558105, "epoch": 1.2919708029197081, "grad_norm": 0.652396023273468, "kl": 0.12689208984375, "learning_rate": 1.374606593415912e-06, "loss": 0.0051, "reward": 0.7239583507180214, "reward_std": 0.33322223369032145, "rewards/semantic_entropy": 0.7239583507180214, "step": 23 }, { "completion_length": 17.720486402511597, "epoch": 1.3503649635036497, "grad_norm": 0.5013155937194824, "kl": 0.1468505859375, "learning_rate": 1.3090169943749473e-06, "loss": 0.0059, "reward": 0.75, "reward_std": 0.29686133936047554, "rewards/semantic_entropy": 0.75, "step": 24 }, { "completion_length": 19.574653148651123, "epoch": 1.4087591240875912, "grad_norm": 0.5545840263366699, "kl": 0.14691162109375, "learning_rate": 1.2419218955996676e-06, "loss": 0.0059, "reward": 0.7378472313284874, "reward_std": 0.29906335659325123, "rewards/semantic_entropy": 0.7378472313284874, "step": 25 }, { "completion_length": 15.946180701255798, "epoch": 1.4671532846715327, "grad_norm": 0.5206867456436157, "kl": 0.1771240234375, "learning_rate": 1.1736481776669305e-06, "loss": 0.0071, "reward": 0.8107638955116272, "reward_std": 0.24001463688910007, "rewards/semantic_entropy": 0.8107638955116272, "step": 26 }, { "completion_length": 18.86805558204651, "epoch": 1.5255474452554745, "grad_norm": 0.7857072949409485, "kl": 0.1768798828125, "learning_rate": 1.1045284632676535e-06, "loss": 0.0071, "reward": 0.7552083432674408, "reward_std": 0.3070409968495369, "rewards/semantic_entropy": 0.7552083432674408, "step": 27 }, { "completion_length": 19.182291865348816, "epoch": 1.583941605839416, "grad_norm": 0.6400216221809387, "kl": 0.2479248046875, "learning_rate": 1.034899496702501e-06, "loss": 0.0099, "reward": 0.7534722238779068, "reward_std": 0.2666480904445052, "rewards/semantic_entropy": 0.7534722238779068, "step": 28 }, { "completion_length": 16.279513955116272, "epoch": 1.6423357664233578, "grad_norm": 0.6639309525489807, "kl": 0.14581298828125, "learning_rate": 9.651005032974993e-07, "loss": 0.0058, "reward": 0.8368055522441864, "reward_std": 0.19957617949694395, "rewards/semantic_entropy": 0.8368055522441864, "step": 29 }, { "completion_length": 19.109375, "epoch": 1.7007299270072993, "grad_norm": 0.6287054419517517, "kl": 0.17852783203125, "learning_rate": 8.954715367323466e-07, "loss": 0.0071, "reward": 0.798611119389534, "reward_std": 0.2921114172786474, "rewards/semantic_entropy": 0.798611119389534, "step": 30 }, { "completion_length": 16.519097566604614, "epoch": 1.7591240875912408, "grad_norm": 0.6585462689399719, "kl": 0.157318115234375, "learning_rate": 8.263518223330696e-07, "loss": 0.0063, "reward": 0.7708333358168602, "reward_std": 0.2721500750631094, "rewards/semantic_entropy": 0.7708333358168602, "step": 31 }, { "completion_length": 18.239583730697632, "epoch": 1.8175182481751824, "grad_norm": 0.6048464775085449, "kl": 0.147125244140625, "learning_rate": 7.580781044003324e-07, "loss": 0.0059, "reward": 0.7326388955116272, "reward_std": 0.29634279757738113, "rewards/semantic_entropy": 0.7326388955116272, "step": 32 }, { "completion_length": 19.44270896911621, "epoch": 1.8759124087591241, "grad_norm": 0.430084228515625, "kl": 0.1063232421875, "learning_rate": 6.909830056250526e-07, "loss": 0.0043, "reward": 0.774305559694767, "reward_std": 0.27460889145731926, "rewards/semantic_entropy": 0.774305559694767, "step": 33 }, { "completion_length": 16.817708730697632, "epoch": 1.9343065693430657, "grad_norm": 0.40789568424224854, "kl": 0.070526123046875, "learning_rate": 6.253934065840879e-07, "loss": 0.0028, "reward": 0.8107639029622078, "reward_std": 0.2299627624452114, "rewards/semantic_entropy": 0.8107639029622078, "step": 34 }, { "completion_length": 20.5625, "epoch": 1.9927007299270074, "grad_norm": 0.4874630868434906, "kl": 0.13616943359375, "learning_rate": 5.616288532109224e-07, "loss": 0.0054, "reward": 0.7361111044883728, "reward_std": 0.3129718992859125, "rewards/semantic_entropy": 0.7361111044883728, "step": 35 }, { "completion_length": 34.0, "epoch": 2.0, "grad_norm": 0.4874630868434906, "kl": 0.036865234375, "learning_rate": 5.000000000000002e-07, "loss": 0.0023, "reward": 1.0, "reward_std": 0.0, "rewards/semantic_entropy": 1.0, "step": 36 }, { "completion_length": 18.72743058204651, "epoch": 2.0583941605839415, "grad_norm": 0.5846592783927917, "kl": 0.12078857421875, "learning_rate": 4.408070965292533e-07, "loss": 0.0048, "reward": 0.7465277835726738, "reward_std": 0.31662504002451897, "rewards/semantic_entropy": 0.7465277835726738, "step": 37 }, { "completion_length": 17.776041984558105, "epoch": 2.116788321167883, "grad_norm": 0.6230023503303528, "kl": 0.21380615234375, "learning_rate": 3.843385246743417e-07, "loss": 0.0085, "reward": 0.7482638880610466, "reward_std": 0.28513461723923683, "rewards/semantic_entropy": 0.7482638880610466, "step": 38 }, { "completion_length": 19.468750476837158, "epoch": 2.1751824817518246, "grad_norm": 0.6272074580192566, "kl": 0.096343994140625, "learning_rate": 3.308693936411421e-07, "loss": 0.0039, "reward": 0.7291666716337204, "reward_std": 0.32780924811959267, "rewards/semantic_entropy": 0.7291666716337204, "step": 39 }, { "completion_length": 17.072916984558105, "epoch": 2.2335766423357666, "grad_norm": 0.5045897960662842, "kl": 0.077362060546875, "learning_rate": 2.8066019966134904e-07, "loss": 0.0031, "reward": 0.8090277835726738, "reward_std": 0.18912154575809836, "rewards/semantic_entropy": 0.8090277835726738, "step": 40 }, { "completion_length": 18.598958492279053, "epoch": 2.291970802919708, "grad_norm": 0.48655831813812256, "kl": 0.09051513671875, "learning_rate": 2.339555568810221e-07, "loss": 0.0036, "reward": 0.7760416641831398, "reward_std": 0.26947965286672115, "rewards/semantic_entropy": 0.7760416641831398, "step": 41 }, { "completion_length": 17.996527791023254, "epoch": 2.3503649635036497, "grad_norm": 0.5561981797218323, "kl": 0.0677490234375, "learning_rate": 1.9098300562505264e-07, "loss": 0.0027, "reward": 0.7638888955116272, "reward_std": 0.2775236200541258, "rewards/semantic_entropy": 0.7638888955116272, "step": 42 }, { "completion_length": 17.697916746139526, "epoch": 2.408759124087591, "grad_norm": 0.5540634989738464, "kl": 0.15765380859375, "learning_rate": 1.5195190384357404e-07, "loss": 0.0063, "reward": 0.774305559694767, "reward_std": 0.2512203995138407, "rewards/semantic_entropy": 0.774305559694767, "step": 43 }, { "completion_length": 18.682291865348816, "epoch": 2.4671532846715327, "grad_norm": 0.4448810815811157, "kl": 0.11761474609375, "learning_rate": 1.1705240714107301e-07, "loss": 0.0047, "reward": 0.7447916716337204, "reward_std": 0.2556060552597046, "rewards/semantic_entropy": 0.7447916716337204, "step": 44 }, { "completion_length": 16.9149307012558, "epoch": 2.5255474452554747, "grad_norm": 0.5861647725105286, "kl": 0.09136962890625, "learning_rate": 8.645454235739902e-08, "loss": 0.0037, "reward": 0.798611119389534, "reward_std": 0.2555408189073205, "rewards/semantic_entropy": 0.798611119389534, "step": 45 }, { "completion_length": 17.83506965637207, "epoch": 2.5839416058394162, "grad_norm": 0.4424433708190918, "kl": 0.1358642578125, "learning_rate": 6.030737921409168e-08, "loss": 0.0054, "reward": 0.7934027835726738, "reward_std": 0.27376995235681534, "rewards/semantic_entropy": 0.7934027835726738, "step": 46 }, { "completion_length": 18.23263943195343, "epoch": 2.6423357664233578, "grad_norm": 0.4687785804271698, "kl": 0.10284423828125, "learning_rate": 3.87383040616811e-08, "loss": 0.0041, "reward": 0.7934027761220932, "reward_std": 0.2595429290086031, "rewards/semantic_entropy": 0.7934027761220932, "step": 47 }, { "completion_length": 18.30381965637207, "epoch": 2.7007299270072993, "grad_norm": 0.5063730478286743, "kl": 0.10589599609375, "learning_rate": 2.185239926619431e-08, "loss": 0.0042, "reward": 0.758680559694767, "reward_std": 0.29030087031424046, "rewards/semantic_entropy": 0.758680559694767, "step": 48 }, { "completion_length": 18.715277910232544, "epoch": 2.759124087591241, "grad_norm": 0.5857909321784973, "kl": 0.117889404296875, "learning_rate": 9.731931258429638e-09, "loss": 0.0047, "reward": 0.7777777835726738, "reward_std": 0.2577416365966201, "rewards/semantic_entropy": 0.7777777835726738, "step": 49 }, { "completion_length": 17.322916626930237, "epoch": 2.8175182481751824, "grad_norm": 0.4949776828289032, "kl": 0.09075927734375, "learning_rate": 2.435949740175802e-09, "loss": 0.0036, "reward": 0.760416679084301, "reward_std": 0.2918264754116535, "rewards/semantic_entropy": 0.760416679084301, "step": 50 }, { "completion_length": 19.02256977558136, "epoch": 2.875912408759124, "grad_norm": 0.554519772529602, "kl": 0.09710693359375, "learning_rate": 0.0, "loss": 0.0039, "reward": 0.7395833358168602, "reward_std": 0.2774972226470709, "rewards/semantic_entropy": 0.7395833358168602, "step": 51 }, { "epoch": 2.875912408759124, "step": 51, "total_flos": 0.0, "train_loss": 0.004883308103913604, "train_runtime": 5205.8109, "train_samples_per_second": 0.471, "train_steps_per_second": 0.01 } ], "logging_steps": 1, "max_steps": 51, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }