{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6890756302521008, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 523.3984375, "epoch": 0.01680672268907563, "grad_norm": 0.4084254801273346, "kl": 0.0, "learning_rate": 3.3333333333333335e-07, "loss": 0.0346, "num_tokens": 149643.0, "reward": 0.05572916800156236, "reward_std": 0.1236814484000206, "rewards/curriculum_aware_reward_fn": 0.024479168001562357, "rewards/format_reward": 0.03125, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 446.5390625, "epoch": 0.03361344537815126, "grad_norm": 0.5236720442771912, "kl": 0.0, "learning_rate": 6.666666666666667e-07, "loss": 0.015, "num_tokens": 282184.0, "reward": 0.17552083544433117, "reward_std": 0.2788553349673748, "rewards/curriculum_aware_reward_fn": 0.12864583544433117, "rewards/format_reward": 0.046875, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 404.5625, "epoch": 0.05042016806722689, "grad_norm": 0.5860257744789124, "kl": 0.00027751922607421875, "learning_rate": 1.0000000000000002e-06, "loss": 0.0033, "num_tokens": 411536.0, "reward": 0.1171875, "reward_std": 0.2090039700269699, "rewards/curriculum_aware_reward_fn": 0.078125, "rewards/format_reward": 0.0390625, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 437.8359375, "epoch": 0.06722689075630252, "grad_norm": 0.5572423338890076, "kl": 0.00028705596923828125, "learning_rate": 1.3333333333333334e-06, "loss": 0.0288, "num_tokens": 543811.0, "reward": 0.1276041716337204, "reward_std": 0.17299909517169, "rewards/curriculum_aware_reward_fn": 0.1041666716337204, "rewards/format_reward": 0.0234375, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 484.453125, "epoch": 0.08403361344537816, "grad_norm": 0.5424997806549072, "kl": 0.0003066062927246094, "learning_rate": 1.6666666666666667e-06, "loss": 0.0016, "num_tokens": 674637.0, "reward": 0.15625, "reward_std": 0.20851250365376472, "rewards/curriculum_aware_reward_fn": 0.109375, "rewards/format_reward": 0.046875, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 468.078125, "epoch": 0.10084033613445378, "grad_norm": 0.47155389189720154, "kl": 0.0003123283386230469, "learning_rate": 2.0000000000000003e-06, "loss": -0.0177, "num_tokens": 810527.0, "reward": 0.1276041716337204, "reward_std": 0.18760672956705093, "rewards/curriculum_aware_reward_fn": 0.0651041716337204, "rewards/format_reward": 0.0625, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 500.3125, "epoch": 0.11764705882352941, "grad_norm": 0.5090876817703247, "kl": 0.0004954338073730469, "learning_rate": 2.3333333333333336e-06, "loss": 0.0566, "num_tokens": 947063.0, "reward": 0.1354166679084301, "reward_std": 0.1753537617623806, "rewards/curriculum_aware_reward_fn": 0.07291666697710752, "rewards/format_reward": 0.0625, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 483.84375, "epoch": 0.13445378151260504, "grad_norm": 0.6272737979888916, "kl": 0.0008707046508789062, "learning_rate": 2.666666666666667e-06, "loss": 0.0704, "num_tokens": 1093987.0, "reward": 0.16250000149011612, "reward_std": 0.3029462620615959, "rewards/curriculum_aware_reward_fn": 0.037500000558793545, "rewards/format_reward": 0.125, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 423.1484375, "epoch": 0.15126050420168066, "grad_norm": 0.7402248978614807, "kl": 0.00279998779296875, "learning_rate": 3e-06, "loss": 0.0728, "num_tokens": 1218118.0, "reward": 0.3968750089406967, "reward_std": 0.45669952034950256, "rewards/curriculum_aware_reward_fn": 0.10781250474974513, "rewards/format_reward": 0.2890625, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 496.734375, "epoch": 0.16806722689075632, "grad_norm": 0.7178624272346497, "kl": 0.00432586669921875, "learning_rate": 3.3333333333333333e-06, "loss": -0.0207, "num_tokens": 1362188.0, "reward": 0.5677083432674408, "reward_std": 0.5023705065250397, "rewards/curriculum_aware_reward_fn": 0.013020833721384406, "rewards/format_reward": 0.5546875, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 500.3984375, "epoch": 0.18487394957983194, "grad_norm": 0.6241538524627686, "kl": 0.0084075927734375, "learning_rate": 3.6666666666666666e-06, "loss": 0.0165, "num_tokens": 1496743.0, "reward": 0.714062511920929, "reward_std": 0.344537615776062, "rewards/curriculum_aware_reward_fn": 0.010937500279396772, "rewards/format_reward": 0.703125, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 517.5625, "epoch": 0.20168067226890757, "grad_norm": 0.6269044280052185, "kl": 0.00717926025390625, "learning_rate": 4.000000000000001e-06, "loss": 0.0195, "num_tokens": 1650679.0, "reward": 0.7604166716337204, "reward_std": 0.2794100269675255, "rewards/curriculum_aware_reward_fn": 0.010416666977107525, "rewards/format_reward": 0.75, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 489.9921875, "epoch": 0.2184873949579832, "grad_norm": 0.6628085374832153, "kl": 0.011993408203125, "learning_rate": 4.333333333333334e-06, "loss": 0.0178, "num_tokens": 1791190.0, "reward": 0.9375000298023224, "reward_std": 0.23163868859410286, "rewards/curriculum_aware_reward_fn": 0.031250000931322575, "rewards/format_reward": 0.90625, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 418.90625, "epoch": 0.23529411764705882, "grad_norm": 0.7439326643943787, "kl": 0.0212249755859375, "learning_rate": 4.666666666666667e-06, "loss": -0.0283, "num_tokens": 1912322.0, "reward": 0.8645833432674408, "reward_std": 0.2743529714643955, "rewards/curriculum_aware_reward_fn": 0.02864583395421505, "rewards/format_reward": 0.8359375, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 440.5546875, "epoch": 0.25210084033613445, "grad_norm": 0.477730393409729, "kl": 0.0169830322265625, "learning_rate": 5e-06, "loss": 0.0314, "num_tokens": 2041473.0, "reward": 1.0234375298023224, "reward_std": 0.159461235627532, "rewards/curriculum_aware_reward_fn": 0.05468750325962901, "rewards/format_reward": 0.96875, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 497.265625, "epoch": 0.2689075630252101, "grad_norm": 0.3044699728488922, "kl": 0.021484375, "learning_rate": 4.999952797253148e-06, "loss": 0.0386, "num_tokens": 2178899.0, "reward": 0.9427083432674408, "reward_std": 0.10222155228257179, "rewards/curriculum_aware_reward_fn": 0.06770833395421505, "rewards/format_reward": 0.875, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 503.7421875, "epoch": 0.2857142857142857, "grad_norm": 0.4504559338092804, "kl": 0.033233642578125, "learning_rate": 4.9998111909931225e-06, "loss": 0.0346, "num_tokens": 2311042.0, "reward": 1.0927083492279053, "reward_std": 0.13363875821232796, "rewards/curriculum_aware_reward_fn": 0.09270833618938923, "rewards/format_reward": 1.0, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 582.390625, "epoch": 0.3025210084033613, "grad_norm": 0.3522663712501526, "kl": 0.033416748046875, "learning_rate": 4.999575187161439e-06, "loss": 0.0049, "num_tokens": 2477404.0, "reward": 0.997395858168602, "reward_std": 0.0840194127522409, "rewards/curriculum_aware_reward_fn": 0.05989583651535213, "rewards/format_reward": 0.9375, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 603.4140625, "epoch": 0.31932773109243695, "grad_norm": 0.2832561731338501, "kl": 0.0328369140625, "learning_rate": 4.9992447956603455e-06, "loss": 0.0152, "num_tokens": 2642801.0, "reward": 0.895833358168602, "reward_std": 0.05173455877229571, "rewards/curriculum_aware_reward_fn": 0.020833333721384406, "rewards/format_reward": 0.875, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 501.46875, "epoch": 0.33613445378151263, "grad_norm": 0.4254682660102844, "kl": 0.0416259765625, "learning_rate": 4.998820030352409e-06, "loss": 0.0009, "num_tokens": 2784445.0, "reward": 1.0010417103767395, "reward_std": 0.13327472284436226, "rewards/curriculum_aware_reward_fn": 0.07135416753590107, "rewards/format_reward": 0.9296875, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 598.765625, "epoch": 0.35294117647058826, "grad_norm": 0.3762480914592743, "kl": 0.033721923828125, "learning_rate": 4.998300909059929e-06, "loss": -0.0163, "num_tokens": 2951415.0, "reward": 0.9947916865348816, "reward_std": 0.11004853993654251, "rewards/curriculum_aware_reward_fn": 0.07291666604578495, "rewards/format_reward": 0.921875, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 489.6484375, "epoch": 0.3697478991596639, "grad_norm": 0.6521299481391907, "kl": 0.04229736328125, "learning_rate": 4.997687453564198e-06, "loss": 0.0083, "num_tokens": 3090354.0, "reward": 0.9609375298023224, "reward_std": 0.25490327179431915, "rewards/curriculum_aware_reward_fn": 0.1562500037252903, "rewards/format_reward": 0.8046875, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 446.625, "epoch": 0.3865546218487395, "grad_norm": 0.4581259489059448, "kl": 0.02838134765625, "learning_rate": 4.9969796896045775e-06, "loss": 0.0239, "num_tokens": 3234002.0, "reward": 1.1093750298023224, "reward_std": 0.15402578841894865, "rewards/curriculum_aware_reward_fn": 0.17187500139698386, "rewards/format_reward": 0.9375, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 438.296875, "epoch": 0.40336134453781514, "grad_norm": 0.469014436006546, "kl": 0.02874755859375, "learning_rate": 4.996177646877426e-06, "loss": 0.0065, "num_tokens": 3368280.0, "reward": 1.0302083790302277, "reward_std": 0.12476669438183308, "rewards/curriculum_aware_reward_fn": 0.045833335258066654, "rewards/format_reward": 0.984375, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 433.9921875, "epoch": 0.42016806722689076, "grad_norm": 0.5325790643692017, "kl": 0.028350830078125, "learning_rate": 4.995281359034851e-06, "loss": -0.0046, "num_tokens": 3495607.0, "reward": 1.0026041865348816, "reward_std": 0.2044544592499733, "rewards/curriculum_aware_reward_fn": 0.08072916977107525, "rewards/format_reward": 0.921875, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 380.015625, "epoch": 0.4369747899159664, "grad_norm": 0.5200158357620239, "kl": 0.03680419921875, "learning_rate": 4.994290863683296e-06, "loss": 0.0419, "num_tokens": 3609809.0, "reward": 1.1458334028720856, "reward_std": 0.20207119127735496, "rewards/curriculum_aware_reward_fn": 0.15364583837799728, "rewards/format_reward": 0.9921875, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 413.7265625, "epoch": 0.453781512605042, "grad_norm": 0.41265320777893066, "kl": 0.033966064453125, "learning_rate": 4.99320620238196e-06, "loss": -0.0204, "num_tokens": 3744550.0, "reward": 1.0963541865348816, "reward_std": 0.11664257757365704, "rewards/curriculum_aware_reward_fn": 0.09635416604578495, "rewards/format_reward": 1.0, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 461.4375, "epoch": 0.47058823529411764, "grad_norm": 0.3836808502674103, "kl": 0.032623291015625, "learning_rate": 4.99202742064106e-06, "loss": 0.035, "num_tokens": 3875238.0, "reward": 1.0182291865348816, "reward_std": 0.10335793904960155, "rewards/curriculum_aware_reward_fn": 0.0807291679084301, "rewards/format_reward": 0.9375, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 506.953125, "epoch": 0.48739495798319327, "grad_norm": 0.3787353038787842, "kl": 0.029510498046875, "learning_rate": 4.990754567919917e-06, "loss": 0.0725, "num_tokens": 4024312.0, "reward": 0.9531250298023224, "reward_std": 0.08838835544884205, "rewards/curriculum_aware_reward_fn": 0.08593750279396772, "rewards/format_reward": 0.8671875, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 474.1875, "epoch": 0.5042016806722689, "grad_norm": 0.37932828068733215, "kl": 0.03497314453125, "learning_rate": 4.989387697624881e-06, "loss": 0.0078, "num_tokens": 4161712.0, "reward": 1.1328125298023224, "reward_std": 0.14552949741482735, "rewards/curriculum_aware_reward_fn": 0.1328125037252903, "rewards/format_reward": 1.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 447.6796875, "epoch": 0.5210084033613446, "grad_norm": 0.396627813577652, "kl": 0.0396728515625, "learning_rate": 4.987926867107095e-06, "loss": 0.004, "num_tokens": 4304935.0, "reward": 0.9671875089406967, "reward_std": 0.12499183788895607, "rewards/curriculum_aware_reward_fn": 0.09218750381842256, "rewards/format_reward": 0.875, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 574.703125, "epoch": 0.5378151260504201, "grad_norm": 0.39759182929992676, "kl": 0.030426025390625, "learning_rate": 4.986372137660078e-06, "loss": 0.0566, "num_tokens": 4464105.0, "reward": 0.8953125327825546, "reward_std": 0.147516256198287, "rewards/curriculum_aware_reward_fn": 0.0906250006519258, "rewards/format_reward": 0.8046875, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 434.640625, "epoch": 0.5546218487394958, "grad_norm": 0.3933623433113098, "kl": 0.034210205078125, "learning_rate": 4.984723574517165e-06, "loss": 0.0163, "num_tokens": 4602139.0, "reward": 1.1234374940395355, "reward_std": 0.15428178012371063, "rewards/curriculum_aware_reward_fn": 0.18593750335276127, "rewards/format_reward": 0.9375, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 445.1640625, "epoch": 0.5714285714285714, "grad_norm": 0.3263511061668396, "kl": 0.04083251953125, "learning_rate": 4.9829812468487655e-06, "loss": 0.0145, "num_tokens": 4736544.0, "reward": 0.9843750298023224, "reward_std": 0.09849035926163197, "rewards/curriculum_aware_reward_fn": 0.05468750139698386, "rewards/format_reward": 0.9296875, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 477.53125, "epoch": 0.5882352941176471, "grad_norm": 0.336434543132782, "kl": 0.0318603515625, "learning_rate": 4.981145227759457e-06, "loss": 0.0017, "num_tokens": 4881308.0, "reward": 0.966145858168602, "reward_std": 0.07084779627621174, "rewards/curriculum_aware_reward_fn": 0.09114583441987634, "rewards/format_reward": 0.875, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 469.4609375, "epoch": 0.6050420168067226, "grad_norm": 0.3600573241710663, "kl": 0.0386962890625, "learning_rate": 4.979215594284924e-06, "loss": 0.008, "num_tokens": 5017415.0, "reward": 1.0843750536441803, "reward_std": 0.11371596809476614, "rewards/curriculum_aware_reward_fn": 0.08437500288709998, "rewards/format_reward": 1.0, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 422.7890625, "epoch": 0.6218487394957983, "grad_norm": 0.38463500142097473, "kl": 0.03997802734375, "learning_rate": 4.977192427388722e-06, "loss": 0.0195, "num_tokens": 5141804.0, "reward": 1.1130208671092987, "reward_std": 0.1456994889304042, "rewards/curriculum_aware_reward_fn": 0.12083333730697632, "rewards/format_reward": 0.9921875, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 415.4765625, "epoch": 0.6386554621848739, "grad_norm": 0.5343978404998779, "kl": 0.04315185546875, "learning_rate": 4.9750758119588824e-06, "loss": -0.0, "num_tokens": 5262529.0, "reward": 1.136979192495346, "reward_std": 0.29098181426525116, "rewards/curriculum_aware_reward_fn": 0.16822916036471725, "rewards/format_reward": 0.96875, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 442.2421875, "epoch": 0.6554621848739496, "grad_norm": 0.33322009444236755, "kl": 0.047119140625, "learning_rate": 4.972865836804349e-06, "loss": 0.0088, "num_tokens": 5399424.0, "reward": 1.03125, "reward_std": 0.13204573467373848, "rewards/curriculum_aware_reward_fn": 0.109375, "rewards/format_reward": 0.921875, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 482.59375, "epoch": 0.6722689075630253, "grad_norm": 0.35028988122940063, "kl": 0.0504150390625, "learning_rate": 4.970562594651254e-06, "loss": 0.0066, "num_tokens": 5539492.0, "reward": 1.09375, "reward_std": 0.1325825173407793, "rewards/curriculum_aware_reward_fn": 0.125, "rewards/format_reward": 0.96875, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 423.125, "epoch": 0.6890756302521008, "grad_norm": 0.3613918125629425, "kl": 0.06549072265625, "learning_rate": 4.968166182139026e-06, "loss": 0.0516, "num_tokens": 5667012.0, "reward": 1.122395858168602, "reward_std": 0.09522313997149467, "rewards/curriculum_aware_reward_fn": 0.13020833837799728, "rewards/format_reward": 0.9921875, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 391.953125, "epoch": 0.7058823529411765, "grad_norm": 0.6798313856124878, "kl": 0.080322265625, "learning_rate": 4.9656766998163306e-06, "loss": 0.0429, "num_tokens": 5786006.0, "reward": 1.1645833551883698, "reward_std": 0.23287939466536045, "rewards/curriculum_aware_reward_fn": 0.19583334028720856, "rewards/format_reward": 0.96875, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 406.59375, "epoch": 0.7226890756302521, "grad_norm": 0.4006165862083435, "kl": 0.06298828125, "learning_rate": 4.963094252136865e-06, "loss": 0.0119, "num_tokens": 5910762.0, "reward": 1.0989583432674408, "reward_std": 0.1059559416025877, "rewards/curriculum_aware_reward_fn": 0.16927083767950535, "rewards/format_reward": 0.9296875, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 411.28125, "epoch": 0.7394957983193278, "grad_norm": 0.3308926522731781, "kl": 0.068359375, "learning_rate": 4.960418947454958e-06, "loss": 0.0204, "num_tokens": 6042942.0, "reward": 0.9947916865348816, "reward_std": 0.10640286095440388, "rewards/curriculum_aware_reward_fn": 0.0572916679084301, "rewards/format_reward": 0.9375, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 380.9140625, "epoch": 0.7563025210084033, "grad_norm": 0.478098064661026, "kl": 0.0755615234375, "learning_rate": 4.957650898021038e-06, "loss": 0.0624, "num_tokens": 6162299.0, "reward": 1.158854216337204, "reward_std": 0.20395735278725624, "rewards/curriculum_aware_reward_fn": 0.15885416977107525, "rewards/format_reward": 1.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 333.5390625, "epoch": 0.773109243697479, "grad_norm": 0.47225892543792725, "kl": 0.0811767578125, "learning_rate": 4.954790219976915e-06, "loss": 0.0167, "num_tokens": 6272592.0, "reward": 1.170312523841858, "reward_std": 0.1300742938183248, "rewards/curriculum_aware_reward_fn": 0.17031250218860805, "rewards/format_reward": 1.0, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 366.34375, "epoch": 0.7899159663865546, "grad_norm": 0.4816167652606964, "kl": 0.0848388671875, "learning_rate": 4.95183703335091e-06, "loss": 0.0103, "num_tokens": 6397436.0, "reward": 1.1953125298023224, "reward_std": 0.12861409783363342, "rewards/curriculum_aware_reward_fn": 0.1953125074505806, "rewards/format_reward": 1.0, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 338.65625, "epoch": 0.8067226890756303, "grad_norm": 0.47690296173095703, "kl": 0.07958984375, "learning_rate": 4.948791462052819e-06, "loss": -0.0039, "num_tokens": 6519896.0, "reward": 1.0442708730697632, "reward_std": 0.11976211331784725, "rewards/curriculum_aware_reward_fn": 0.05208333441987634, "rewards/format_reward": 0.9921875, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 325.78125, "epoch": 0.8235294117647058, "grad_norm": 0.46987131237983704, "kl": 0.08935546875, "learning_rate": 4.945653633868716e-06, "loss": 0.023, "num_tokens": 6637036.0, "reward": 1.1223958730697632, "reward_std": 0.18378917127847672, "rewards/curriculum_aware_reward_fn": 0.14583333488553762, "rewards/format_reward": 0.9765625, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 302.6171875, "epoch": 0.8403361344537815, "grad_norm": 0.5635585784912109, "kl": 0.088623046875, "learning_rate": 4.942423680455584e-06, "loss": 0.0157, "num_tokens": 6749411.0, "reward": 1.1494792103767395, "reward_std": 0.19005249440670013, "rewards/curriculum_aware_reward_fn": 0.1494791703298688, "rewards/format_reward": 1.0, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 366.4140625, "epoch": 0.8571428571428571, "grad_norm": 0.5150569677352905, "kl": 0.0848388671875, "learning_rate": 4.939101737335802e-06, "loss": -0.0145, "num_tokens": 6879104.0, "reward": 0.9713541865348816, "reward_std": 0.12468592822551727, "rewards/curriculum_aware_reward_fn": 0.041666666977107525, "rewards/format_reward": 0.9296875, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 288.9609375, "epoch": 0.8739495798319328, "grad_norm": 0.6744732856750488, "kl": 0.088134765625, "learning_rate": 4.935687943891447e-06, "loss": 0.0413, "num_tokens": 6981627.0, "reward": 1.171354204416275, "reward_std": 0.26688926108181477, "rewards/curriculum_aware_reward_fn": 0.1791666648350656, "rewards/format_reward": 0.9921875, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 286.609375, "epoch": 0.8907563025210085, "grad_norm": 0.6298589706420898, "kl": 0.0899658203125, "learning_rate": 4.932182443358458e-06, "loss": 0.002, "num_tokens": 7088449.0, "reward": 1.1250000298023224, "reward_std": 0.14211241621524096, "rewards/curriculum_aware_reward_fn": 0.125000003259629, "rewards/format_reward": 1.0, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 302.4921875, "epoch": 0.907563025210084, "grad_norm": 0.6446663737297058, "kl": 0.0955810546875, "learning_rate": 4.928585382820616e-06, "loss": 0.0001, "num_tokens": 7197360.0, "reward": 1.1005208790302277, "reward_std": 0.08887648163363338, "rewards/curriculum_aware_reward_fn": 0.10052083828486502, "rewards/format_reward": 1.0, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 399.0625, "epoch": 0.9243697478991597, "grad_norm": 0.729965090751648, "kl": 0.1077880859375, "learning_rate": 4.924896913203376e-06, "loss": 0.0496, "num_tokens": 7331528.0, "reward": 0.9932291805744171, "reward_std": 0.15547171607613564, "rewards/curriculum_aware_reward_fn": 0.1807291698642075, "rewards/format_reward": 0.8125, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 494.09375, "epoch": 0.9411764705882353, "grad_norm": 0.41514354944229126, "kl": 0.0888671875, "learning_rate": 4.921117189267535e-06, "loss": -0.0255, "num_tokens": 7482548.0, "reward": 0.8671875149011612, "reward_std": 0.09729943191632628, "rewards/curriculum_aware_reward_fn": 0.05468750069849193, "rewards/format_reward": 0.8125, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 511.90625, "epoch": 0.957983193277311, "grad_norm": 0.6172528266906738, "kl": 0.1680908203125, "learning_rate": 4.917246369602742e-06, "loss": 0.0368, "num_tokens": 7621256.0, "reward": 1.050520896911621, "reward_std": 0.1373576819896698, "rewards/curriculum_aware_reward_fn": 0.17552083916962147, "rewards/format_reward": 0.875, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 415.8046875, "epoch": 0.9747899159663865, "grad_norm": 0.5895751118659973, "kl": 0.2454833984375, "learning_rate": 4.9132846166208355e-06, "loss": 0.0654, "num_tokens": 7740207.0, "reward": 1.1197916865348816, "reward_std": 0.09127403190359473, "rewards/curriculum_aware_reward_fn": 0.12760416674427688, "rewards/format_reward": 0.9921875, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 578.2857208251953, "epoch": 0.9915966386554622, "grad_norm": 0.4072950482368469, "kl": 0.12353515625, "learning_rate": 4.9092320965490365e-06, "loss": 0.0363, "num_tokens": 7885325.0, "reward": 1.0598958432674408, "reward_std": 0.11012758687138557, "rewards/curriculum_aware_reward_fn": 0.0598958358168602, "rewards/format_reward": 1.0, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 564.78125, "epoch": 1.0168067226890756, "grad_norm": 0.4054146707057953, "kl": 0.1219482421875, "learning_rate": 4.905088979422971e-06, "loss": 0.0515, "num_tokens": 8038297.0, "reward": 1.0937500596046448, "reward_std": 0.1636663186363876, "rewards/curriculum_aware_reward_fn": 0.1015625053551048, "rewards/format_reward": 0.9921875, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 578.2890625, "epoch": 1.0336134453781514, "grad_norm": 0.3728667199611664, "kl": 0.126708984375, "learning_rate": 4.900855439079536e-06, "loss": 0.0134, "num_tokens": 8185046.0, "reward": 1.1265625357627869, "reward_std": 0.14053087309002876, "rewards/curriculum_aware_reward_fn": 0.12656250409781933, "rewards/format_reward": 1.0, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 722.3125, "epoch": 1.050420168067227, "grad_norm": 0.32597795128822327, "kl": 0.1097412109375, "learning_rate": 4.8965316531496055e-06, "loss": 0.0184, "num_tokens": 8366894.0, "reward": 0.9677083492279053, "reward_std": 0.10382660711184144, "rewards/curriculum_aware_reward_fn": 0.04583333386108279, "rewards/format_reward": 0.921875, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 663.3671875, "epoch": 1.0672268907563025, "grad_norm": 0.4411354064941406, "kl": 0.1256103515625, "learning_rate": 4.892117803050578e-06, "loss": 0.0637, "num_tokens": 8514637.0, "reward": 1.0671875327825546, "reward_std": 0.21371026523411274, "rewards/curriculum_aware_reward_fn": 0.09843750204890966, "rewards/format_reward": 0.96875, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 644.1328125, "epoch": 1.084033613445378, "grad_norm": 0.36323124170303345, "kl": 0.1290283203125, "learning_rate": 4.887614073978761e-06, "loss": -0.0258, "num_tokens": 8683182.0, "reward": 0.9921875, "reward_std": 0.12415501847863197, "rewards/curriculum_aware_reward_fn": 0.0859375, "rewards/format_reward": 0.90625, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 659.5703125, "epoch": 1.1008403361344539, "grad_norm": 0.5263646245002747, "kl": 0.1494140625, "learning_rate": 4.883020654901609e-06, "loss": -0.03, "num_tokens": 8846031.0, "reward": 0.9375000149011612, "reward_std": 0.29325952008366585, "rewards/curriculum_aware_reward_fn": 0.08593750093132257, "rewards/format_reward": 0.8515625, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 542.421875, "epoch": 1.1176470588235294, "grad_norm": 0.5841398239135742, "kl": 0.1328125, "learning_rate": 4.878337738549785e-06, "loss": 0.0185, "num_tokens": 8994685.0, "reward": 0.9244791865348816, "reward_std": 0.350746251642704, "rewards/curriculum_aware_reward_fn": 0.0651041679084301, "rewards/format_reward": 0.859375, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 582.9453125, "epoch": 1.134453781512605, "grad_norm": 0.5570164322853088, "kl": 0.122314453125, "learning_rate": 4.873565521409082e-06, "loss": 0.0521, "num_tokens": 9155622.0, "reward": 0.9843750298023224, "reward_std": 0.3105625621974468, "rewards/curriculum_aware_reward_fn": 0.140625, "rewards/format_reward": 0.84375, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 485.3359375, "epoch": 1.1512605042016806, "grad_norm": 0.5985521078109741, "kl": 0.1263427734375, "learning_rate": 4.868704203712173e-06, "loss": 0.0059, "num_tokens": 9288609.0, "reward": 1.052083358168602, "reward_std": 0.3865399658679962, "rewards/curriculum_aware_reward_fn": 0.18489583488553762, "rewards/format_reward": 0.8671875, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 535.578125, "epoch": 1.1680672268907564, "grad_norm": 0.564681887626648, "kl": 0.1126708984375, "learning_rate": 4.86375398943021e-06, "loss": -0.0485, "num_tokens": 9437915.0, "reward": 0.9140625298023224, "reward_std": 0.28478266298770905, "rewards/curriculum_aware_reward_fn": 0.07031250256113708, "rewards/format_reward": 0.84375, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 496.0703125, "epoch": 1.184873949579832, "grad_norm": 0.5399004220962524, "kl": 0.099853515625, "learning_rate": 4.858715086264274e-06, "loss": 0.0, "num_tokens": 9580828.0, "reward": 0.934895858168602, "reward_std": 0.19700524397194386, "rewards/curriculum_aware_reward_fn": 0.04427083441987634, "rewards/format_reward": 0.890625, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 469.078125, "epoch": 1.2016806722689075, "grad_norm": 0.552808940410614, "kl": 0.104736328125, "learning_rate": 4.853587705636646e-06, "loss": -0.0129, "num_tokens": 9710926.0, "reward": 1.0234375447034836, "reward_std": 0.2662259414792061, "rewards/curriculum_aware_reward_fn": 0.07031250186264515, "rewards/format_reward": 0.953125, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 539.765625, "epoch": 1.2184873949579833, "grad_norm": 0.4241795837879181, "kl": 0.093505859375, "learning_rate": 4.84837206268195e-06, "loss": -0.0161, "num_tokens": 9867328.0, "reward": 0.796875, "reward_std": 0.12126770988106728, "rewards/curriculum_aware_reward_fn": 0.015625, "rewards/format_reward": 0.78125, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 502.6328125, "epoch": 1.2352941176470589, "grad_norm": 0.4336708188056946, "kl": 0.0975341796875, "learning_rate": 4.8430683762381195e-06, "loss": 0.0086, "num_tokens": 10007809.0, "reward": 1.0416666865348816, "reward_std": 0.1647402998059988, "rewards/curriculum_aware_reward_fn": 0.07291666697710752, "rewards/format_reward": 0.96875, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 523.640625, "epoch": 1.2521008403361344, "grad_norm": 0.46911758184432983, "kl": 0.1041259765625, "learning_rate": 4.837676868837213e-06, "loss": 0.0313, "num_tokens": 10147291.0, "reward": 1.0302083790302277, "reward_std": 0.18167300708591938, "rewards/curriculum_aware_reward_fn": 0.06145833572372794, "rewards/format_reward": 0.96875, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 628.6171875, "epoch": 1.26890756302521, "grad_norm": 0.5553966760635376, "kl": 0.091552734375, "learning_rate": 4.832197766696085e-06, "loss": 0.0771, "num_tokens": 10304234.0, "reward": 1.0401041805744171, "reward_std": 0.2994466572999954, "rewards/curriculum_aware_reward_fn": 0.14166666939854622, "rewards/format_reward": 0.8984375, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 541.03125, "epoch": 1.2857142857142856, "grad_norm": 0.4850277602672577, "kl": 0.092529296875, "learning_rate": 4.826631299706887e-06, "loss": -0.0076, "num_tokens": 10455718.0, "reward": 1.043229192495346, "reward_std": 0.18734892271459103, "rewards/curriculum_aware_reward_fn": 0.12135417200624943, "rewards/format_reward": 0.921875, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 513.34375, "epoch": 1.3025210084033614, "grad_norm": 0.5054645538330078, "kl": 0.098876953125, "learning_rate": 4.820977701427424e-06, "loss": 0.0342, "num_tokens": 10586514.0, "reward": 1.0703125149011612, "reward_std": 0.18218252062797546, "rewards/curriculum_aware_reward_fn": 0.10156250256113708, "rewards/format_reward": 0.96875, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 554.0390625, "epoch": 1.319327731092437, "grad_norm": 0.5157454013824463, "kl": 0.0882568359375, "learning_rate": 4.81523720907136e-06, "loss": 0.05, "num_tokens": 10726735.0, "reward": 1.1385416984558105, "reward_std": 0.23471001349389553, "rewards/curriculum_aware_reward_fn": 0.17760416865348816, "rewards/format_reward": 0.9609375, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 580.015625, "epoch": 1.3361344537815127, "grad_norm": 0.41959676146507263, "kl": 0.0958251953125, "learning_rate": 4.809410063498254e-06, "loss": 0.0059, "num_tokens": 10884185.0, "reward": 0.8411458283662796, "reward_std": 0.22234245762228966, "rewards/curriculum_aware_reward_fn": 0.06770833395421505, "rewards/format_reward": 0.7734375, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 534.84375, "epoch": 1.3529411764705883, "grad_norm": 0.6390196681022644, "kl": 0.093017578125, "learning_rate": 4.8034965092034656e-06, "loss": 0.0963, "num_tokens": 11024149.0, "reward": 1.0494791865348816, "reward_std": 0.2942212373018265, "rewards/curriculum_aware_reward_fn": 0.1432291679084301, "rewards/format_reward": 0.90625, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 525.671875, "epoch": 1.3697478991596639, "grad_norm": 0.4957650303840637, "kl": 0.105224609375, "learning_rate": 4.797496794307889e-06, "loss": -0.0236, "num_tokens": 11167547.0, "reward": 0.9828125238418579, "reward_std": 0.14313106751069427, "rewards/curriculum_aware_reward_fn": 0.06093750288709998, "rewards/format_reward": 0.921875, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 488.7890625, "epoch": 1.3865546218487395, "grad_norm": 0.5502341389656067, "kl": 0.104248046875, "learning_rate": 4.791411170547545e-06, "loss": 0.072, "num_tokens": 11300104.0, "reward": 1.143229216337204, "reward_std": 0.2514217048883438, "rewards/curriculum_aware_reward_fn": 0.1822916679084301, "rewards/format_reward": 0.9609375, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 508.1015625, "epoch": 1.403361344537815, "grad_norm": 0.489242285490036, "kl": 0.0986328125, "learning_rate": 4.785239893263017e-06, "loss": 0.1064, "num_tokens": 11446565.0, "reward": 0.9505208730697632, "reward_std": 0.18909456953406334, "rewards/curriculum_aware_reward_fn": 0.1145833358168602, "rewards/format_reward": 0.8359375, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 448.359375, "epoch": 1.4201680672268908, "grad_norm": 0.46208083629608154, "kl": 0.11083984375, "learning_rate": 4.778983221388742e-06, "loss": 0.0139, "num_tokens": 11580547.0, "reward": 1.0276041626930237, "reward_std": 0.12672015465795994, "rewards/curriculum_aware_reward_fn": 0.09791666641831398, "rewards/format_reward": 0.9296875, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 457.3515625, "epoch": 1.4369747899159664, "grad_norm": 0.44321703910827637, "kl": 0.1085205078125, "learning_rate": 4.77264141744214e-06, "loss": 0.0184, "num_tokens": 11712984.0, "reward": 1.160416692495346, "reward_std": 0.15310384705662727, "rewards/curriculum_aware_reward_fn": 0.16041666828095913, "rewards/format_reward": 1.0, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 384.3984375, "epoch": 1.453781512605042, "grad_norm": 0.4427275061607361, "kl": 0.123779296875, "learning_rate": 4.766214747512603e-06, "loss": 0.0233, "num_tokens": 11829315.0, "reward": 1.1666666865348816, "reward_std": 0.14362479094415903, "rewards/curriculum_aware_reward_fn": 0.1744791674427688, "rewards/format_reward": 0.9921875, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 389.265625, "epoch": 1.4705882352941178, "grad_norm": 0.5121983885765076, "kl": 0.12841796875, "learning_rate": 4.759703481250331e-06, "loss": 0.0229, "num_tokens": 11949533.0, "reward": 1.1343750357627869, "reward_std": 0.16322745941579342, "rewards/curriculum_aware_reward_fn": 0.1500000013038516, "rewards/format_reward": 0.984375, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 459.921875, "epoch": 1.4873949579831933, "grad_norm": 0.6308655738830566, "kl": 0.11328125, "learning_rate": 4.753107891855015e-06, "loss": 0.02, "num_tokens": 12092563.0, "reward": 0.8854166716337204, "reward_std": 0.22940894588828087, "rewards/curriculum_aware_reward_fn": 0.11197916697710752, "rewards/format_reward": 0.7734375, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 325.625, "epoch": 1.504201680672269, "grad_norm": 0.6008861660957336, "kl": 0.14013671875, "learning_rate": 4.746428256064375e-06, "loss": 0.0021, "num_tokens": 12195803.0, "reward": 1.261979192495346, "reward_std": 0.17762075550854206, "rewards/curriculum_aware_reward_fn": 0.2697916701436043, "rewards/format_reward": 0.9921875, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 348.2421875, "epoch": 1.5210084033613445, "grad_norm": 0.5485793948173523, "kl": 0.13720703125, "learning_rate": 4.7396648541425534e-06, "loss": 0.0053, "num_tokens": 12314866.0, "reward": 1.2442708611488342, "reward_std": 0.21353545226156712, "rewards/curriculum_aware_reward_fn": 0.24427084252238274, "rewards/format_reward": 1.0, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 519.265625, "epoch": 1.53781512605042, "grad_norm": 0.4276430010795593, "kl": 0.11962890625, "learning_rate": 4.732817969868348e-06, "loss": 0.0315, "num_tokens": 12463604.0, "reward": 0.895833358168602, "reward_std": 0.09582467563450336, "rewards/curriculum_aware_reward_fn": 0.0833333358168602, "rewards/format_reward": 0.8125, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 334.859375, "epoch": 1.5546218487394958, "grad_norm": 0.5961228013038635, "kl": 0.1279296875, "learning_rate": 4.7258878905233095e-06, "loss": 0.0697, "num_tokens": 12583698.0, "reward": 1.2239583432674408, "reward_std": 0.18167817778885365, "rewards/curriculum_aware_reward_fn": 0.2317708358168602, "rewards/format_reward": 0.9921875, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 363.859375, "epoch": 1.5714285714285714, "grad_norm": 0.7697988152503967, "kl": 0.1495361328125, "learning_rate": 4.718874906879688e-06, "loss": 0.1313, "num_tokens": 12705640.0, "reward": 1.123437523841858, "reward_std": 0.29410218447446823, "rewards/curriculum_aware_reward_fn": 0.17812500521540642, "rewards/format_reward": 0.9453125, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 297.1328125, "epoch": 1.5882352941176472, "grad_norm": 0.5211566686630249, "kl": 0.133056640625, "learning_rate": 4.711779313188231e-06, "loss": 0.04, "num_tokens": 12824305.0, "reward": 1.1119791865348816, "reward_std": 0.18469560518860817, "rewards/curriculum_aware_reward_fn": 0.1119791679084301, "rewards/format_reward": 1.0, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 253.859375, "epoch": 1.6050420168067228, "grad_norm": 0.6686471104621887, "kl": 0.1435546875, "learning_rate": 4.70460140716584e-06, "loss": -0.0138, "num_tokens": 12918559.0, "reward": 1.1968750357627869, "reward_std": 0.1724256370216608, "rewards/curriculum_aware_reward_fn": 0.20468750596046448, "rewards/format_reward": 0.9921875, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 575.90625, "epoch": 1.6218487394957983, "grad_norm": 0.3436340093612671, "kl": 0.160888671875, "learning_rate": 4.697341489983076e-06, "loss": 0.0223, "num_tokens": 13076699.0, "reward": 0.8567708432674408, "reward_std": 0.08664888702332973, "rewards/curriculum_aware_reward_fn": 0.10677083395421505, "rewards/format_reward": 0.75, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 301.9140625, "epoch": 1.638655462184874, "grad_norm": 1.0377366542816162, "kl": 0.162353515625, "learning_rate": 4.6899998662515215e-06, "loss": 0.0906, "num_tokens": 13192776.0, "reward": 1.1197916716337204, "reward_std": 0.12685893662273884, "rewards/curriculum_aware_reward_fn": 0.13541666674427688, "rewards/format_reward": 0.984375, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 331.28125, "epoch": 1.6554621848739495, "grad_norm": 0.477889746427536, "kl": 0.151123046875, "learning_rate": 4.682576844011007e-06, "loss": 0.0147, "num_tokens": 13309884.0, "reward": 1.2942708730697632, "reward_std": 0.10486710164695978, "rewards/curriculum_aware_reward_fn": 0.3567708432674408, "rewards/format_reward": 0.9375, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 418.96875, "epoch": 1.6722689075630253, "grad_norm": 1.2051795721054077, "kl": 0.1630859375, "learning_rate": 4.675072734716678e-06, "loss": 0.0986, "num_tokens": 13443272.0, "reward": 1.0937500298023224, "reward_std": 0.1639669369906187, "rewards/curriculum_aware_reward_fn": 0.1796875111758709, "rewards/format_reward": 0.9140625, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 257.484375, "epoch": 1.6890756302521008, "grad_norm": 0.6367622017860413, "kl": 0.157958984375, "learning_rate": 4.667487853225931e-06, "loss": 0.0274, "num_tokens": 13546814.0, "reward": 1.3072916865348816, "reward_std": 0.1563644390553236, "rewards/curriculum_aware_reward_fn": 0.3307291641831398, "rewards/format_reward": 0.9765625, "step": 100 }, { "epoch": 1.6890756302521008, "step": 100, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 1.1082, "train_samples_per_second": 1712.724, "train_steps_per_second": 52.338 } ], "logging_steps": 1, "max_steps": 58, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }