|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.6890756302521008, |
|
"eval_steps": 500, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 523.3984375, |
|
"epoch": 0.01680672268907563, |
|
"grad_norm": 0.4084254801273346, |
|
"kl": 0.0, |
|
"learning_rate": 3.3333333333333335e-07, |
|
"loss": 0.0346, |
|
"num_tokens": 149643.0, |
|
"reward": 0.05572916800156236, |
|
"reward_std": 0.1236814484000206, |
|
"rewards/curriculum_aware_reward_fn": 0.024479168001562357, |
|
"rewards/format_reward": 0.03125, |
|
"step": 1 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 446.5390625, |
|
"epoch": 0.03361344537815126, |
|
"grad_norm": 0.5236720442771912, |
|
"kl": 0.0, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 0.015, |
|
"num_tokens": 282184.0, |
|
"reward": 0.17552083544433117, |
|
"reward_std": 0.2788553349673748, |
|
"rewards/curriculum_aware_reward_fn": 0.12864583544433117, |
|
"rewards/format_reward": 0.046875, |
|
"step": 2 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 404.5625, |
|
"epoch": 0.05042016806722689, |
|
"grad_norm": 0.5860257744789124, |
|
"kl": 0.00027751922607421875, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.0033, |
|
"num_tokens": 411536.0, |
|
"reward": 0.1171875, |
|
"reward_std": 0.2090039700269699, |
|
"rewards/curriculum_aware_reward_fn": 0.078125, |
|
"rewards/format_reward": 0.0390625, |
|
"step": 3 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 437.8359375, |
|
"epoch": 0.06722689075630252, |
|
"grad_norm": 0.5572423338890076, |
|
"kl": 0.00028705596923828125, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 0.0288, |
|
"num_tokens": 543811.0, |
|
"reward": 0.1276041716337204, |
|
"reward_std": 0.17299909517169, |
|
"rewards/curriculum_aware_reward_fn": 0.1041666716337204, |
|
"rewards/format_reward": 0.0234375, |
|
"step": 4 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 484.453125, |
|
"epoch": 0.08403361344537816, |
|
"grad_norm": 0.5424997806549072, |
|
"kl": 0.0003066062927246094, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.0016, |
|
"num_tokens": 674637.0, |
|
"reward": 0.15625, |
|
"reward_std": 0.20851250365376472, |
|
"rewards/curriculum_aware_reward_fn": 0.109375, |
|
"rewards/format_reward": 0.046875, |
|
"step": 5 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 468.078125, |
|
"epoch": 0.10084033613445378, |
|
"grad_norm": 0.47155389189720154, |
|
"kl": 0.0003123283386230469, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": -0.0177, |
|
"num_tokens": 810527.0, |
|
"reward": 0.1276041716337204, |
|
"reward_std": 0.18760672956705093, |
|
"rewards/curriculum_aware_reward_fn": 0.0651041716337204, |
|
"rewards/format_reward": 0.0625, |
|
"step": 6 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 500.3125, |
|
"epoch": 0.11764705882352941, |
|
"grad_norm": 0.5090876817703247, |
|
"kl": 0.0004954338073730469, |
|
"learning_rate": 2.3333333333333336e-06, |
|
"loss": 0.0566, |
|
"num_tokens": 947063.0, |
|
"reward": 0.1354166679084301, |
|
"reward_std": 0.1753537617623806, |
|
"rewards/curriculum_aware_reward_fn": 0.07291666697710752, |
|
"rewards/format_reward": 0.0625, |
|
"step": 7 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 483.84375, |
|
"epoch": 0.13445378151260504, |
|
"grad_norm": 0.6272737979888916, |
|
"kl": 0.0008707046508789062, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 0.0704, |
|
"num_tokens": 1093987.0, |
|
"reward": 0.16250000149011612, |
|
"reward_std": 0.3029462620615959, |
|
"rewards/curriculum_aware_reward_fn": 0.037500000558793545, |
|
"rewards/format_reward": 0.125, |
|
"step": 8 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 423.1484375, |
|
"epoch": 0.15126050420168066, |
|
"grad_norm": 0.7402248978614807, |
|
"kl": 0.00279998779296875, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0728, |
|
"num_tokens": 1218118.0, |
|
"reward": 0.3968750089406967, |
|
"reward_std": 0.45669952034950256, |
|
"rewards/curriculum_aware_reward_fn": 0.10781250474974513, |
|
"rewards/format_reward": 0.2890625, |
|
"step": 9 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 496.734375, |
|
"epoch": 0.16806722689075632, |
|
"grad_norm": 0.7178624272346497, |
|
"kl": 0.00432586669921875, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": -0.0207, |
|
"num_tokens": 1362188.0, |
|
"reward": 0.5677083432674408, |
|
"reward_std": 0.5023705065250397, |
|
"rewards/curriculum_aware_reward_fn": 0.013020833721384406, |
|
"rewards/format_reward": 0.5546875, |
|
"step": 10 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 500.3984375, |
|
"epoch": 0.18487394957983194, |
|
"grad_norm": 0.6241538524627686, |
|
"kl": 0.0084075927734375, |
|
"learning_rate": 3.6666666666666666e-06, |
|
"loss": 0.0165, |
|
"num_tokens": 1496743.0, |
|
"reward": 0.714062511920929, |
|
"reward_std": 0.344537615776062, |
|
"rewards/curriculum_aware_reward_fn": 0.010937500279396772, |
|
"rewards/format_reward": 0.703125, |
|
"step": 11 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 517.5625, |
|
"epoch": 0.20168067226890757, |
|
"grad_norm": 0.6269044280052185, |
|
"kl": 0.00717926025390625, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.0195, |
|
"num_tokens": 1650679.0, |
|
"reward": 0.7604166716337204, |
|
"reward_std": 0.2794100269675255, |
|
"rewards/curriculum_aware_reward_fn": 0.010416666977107525, |
|
"rewards/format_reward": 0.75, |
|
"step": 12 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 489.9921875, |
|
"epoch": 0.2184873949579832, |
|
"grad_norm": 0.6628085374832153, |
|
"kl": 0.011993408203125, |
|
"learning_rate": 4.333333333333334e-06, |
|
"loss": 0.0178, |
|
"num_tokens": 1791190.0, |
|
"reward": 0.9375000298023224, |
|
"reward_std": 0.23163868859410286, |
|
"rewards/curriculum_aware_reward_fn": 0.031250000931322575, |
|
"rewards/format_reward": 0.90625, |
|
"step": 13 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 418.90625, |
|
"epoch": 0.23529411764705882, |
|
"grad_norm": 0.7439326643943787, |
|
"kl": 0.0212249755859375, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": -0.0283, |
|
"num_tokens": 1912322.0, |
|
"reward": 0.8645833432674408, |
|
"reward_std": 0.2743529714643955, |
|
"rewards/curriculum_aware_reward_fn": 0.02864583395421505, |
|
"rewards/format_reward": 0.8359375, |
|
"step": 14 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 440.5546875, |
|
"epoch": 0.25210084033613445, |
|
"grad_norm": 0.477730393409729, |
|
"kl": 0.0169830322265625, |
|
"learning_rate": 5e-06, |
|
"loss": 0.0314, |
|
"num_tokens": 2041473.0, |
|
"reward": 1.0234375298023224, |
|
"reward_std": 0.159461235627532, |
|
"rewards/curriculum_aware_reward_fn": 0.05468750325962901, |
|
"rewards/format_reward": 0.96875, |
|
"step": 15 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 497.265625, |
|
"epoch": 0.2689075630252101, |
|
"grad_norm": 0.3044699728488922, |
|
"kl": 0.021484375, |
|
"learning_rate": 4.999952797253148e-06, |
|
"loss": 0.0386, |
|
"num_tokens": 2178899.0, |
|
"reward": 0.9427083432674408, |
|
"reward_std": 0.10222155228257179, |
|
"rewards/curriculum_aware_reward_fn": 0.06770833395421505, |
|
"rewards/format_reward": 0.875, |
|
"step": 16 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 503.7421875, |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.4504559338092804, |
|
"kl": 0.033233642578125, |
|
"learning_rate": 4.9998111909931225e-06, |
|
"loss": 0.0346, |
|
"num_tokens": 2311042.0, |
|
"reward": 1.0927083492279053, |
|
"reward_std": 0.13363875821232796, |
|
"rewards/curriculum_aware_reward_fn": 0.09270833618938923, |
|
"rewards/format_reward": 1.0, |
|
"step": 17 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 582.390625, |
|
"epoch": 0.3025210084033613, |
|
"grad_norm": 0.3522663712501526, |
|
"kl": 0.033416748046875, |
|
"learning_rate": 4.999575187161439e-06, |
|
"loss": 0.0049, |
|
"num_tokens": 2477404.0, |
|
"reward": 0.997395858168602, |
|
"reward_std": 0.0840194127522409, |
|
"rewards/curriculum_aware_reward_fn": 0.05989583651535213, |
|
"rewards/format_reward": 0.9375, |
|
"step": 18 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 603.4140625, |
|
"epoch": 0.31932773109243695, |
|
"grad_norm": 0.2832561731338501, |
|
"kl": 0.0328369140625, |
|
"learning_rate": 4.9992447956603455e-06, |
|
"loss": 0.0152, |
|
"num_tokens": 2642801.0, |
|
"reward": 0.895833358168602, |
|
"reward_std": 0.05173455877229571, |
|
"rewards/curriculum_aware_reward_fn": 0.020833333721384406, |
|
"rewards/format_reward": 0.875, |
|
"step": 19 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 501.46875, |
|
"epoch": 0.33613445378151263, |
|
"grad_norm": 0.4254682660102844, |
|
"kl": 0.0416259765625, |
|
"learning_rate": 4.998820030352409e-06, |
|
"loss": 0.0009, |
|
"num_tokens": 2784445.0, |
|
"reward": 1.0010417103767395, |
|
"reward_std": 0.13327472284436226, |
|
"rewards/curriculum_aware_reward_fn": 0.07135416753590107, |
|
"rewards/format_reward": 0.9296875, |
|
"step": 20 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 598.765625, |
|
"epoch": 0.35294117647058826, |
|
"grad_norm": 0.3762480914592743, |
|
"kl": 0.033721923828125, |
|
"learning_rate": 4.998300909059929e-06, |
|
"loss": -0.0163, |
|
"num_tokens": 2951415.0, |
|
"reward": 0.9947916865348816, |
|
"reward_std": 0.11004853993654251, |
|
"rewards/curriculum_aware_reward_fn": 0.07291666604578495, |
|
"rewards/format_reward": 0.921875, |
|
"step": 21 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 489.6484375, |
|
"epoch": 0.3697478991596639, |
|
"grad_norm": 0.6521299481391907, |
|
"kl": 0.04229736328125, |
|
"learning_rate": 4.997687453564198e-06, |
|
"loss": 0.0083, |
|
"num_tokens": 3090354.0, |
|
"reward": 0.9609375298023224, |
|
"reward_std": 0.25490327179431915, |
|
"rewards/curriculum_aware_reward_fn": 0.1562500037252903, |
|
"rewards/format_reward": 0.8046875, |
|
"step": 22 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 446.625, |
|
"epoch": 0.3865546218487395, |
|
"grad_norm": 0.4581259489059448, |
|
"kl": 0.02838134765625, |
|
"learning_rate": 4.9969796896045775e-06, |
|
"loss": 0.0239, |
|
"num_tokens": 3234002.0, |
|
"reward": 1.1093750298023224, |
|
"reward_std": 0.15402578841894865, |
|
"rewards/curriculum_aware_reward_fn": 0.17187500139698386, |
|
"rewards/format_reward": 0.9375, |
|
"step": 23 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 438.296875, |
|
"epoch": 0.40336134453781514, |
|
"grad_norm": 0.469014436006546, |
|
"kl": 0.02874755859375, |
|
"learning_rate": 4.996177646877426e-06, |
|
"loss": 0.0065, |
|
"num_tokens": 3368280.0, |
|
"reward": 1.0302083790302277, |
|
"reward_std": 0.12476669438183308, |
|
"rewards/curriculum_aware_reward_fn": 0.045833335258066654, |
|
"rewards/format_reward": 0.984375, |
|
"step": 24 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 433.9921875, |
|
"epoch": 0.42016806722689076, |
|
"grad_norm": 0.5325790643692017, |
|
"kl": 0.028350830078125, |
|
"learning_rate": 4.995281359034851e-06, |
|
"loss": -0.0046, |
|
"num_tokens": 3495607.0, |
|
"reward": 1.0026041865348816, |
|
"reward_std": 0.2044544592499733, |
|
"rewards/curriculum_aware_reward_fn": 0.08072916977107525, |
|
"rewards/format_reward": 0.921875, |
|
"step": 25 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 380.015625, |
|
"epoch": 0.4369747899159664, |
|
"grad_norm": 0.5200158357620239, |
|
"kl": 0.03680419921875, |
|
"learning_rate": 4.994290863683296e-06, |
|
"loss": 0.0419, |
|
"num_tokens": 3609809.0, |
|
"reward": 1.1458334028720856, |
|
"reward_std": 0.20207119127735496, |
|
"rewards/curriculum_aware_reward_fn": 0.15364583837799728, |
|
"rewards/format_reward": 0.9921875, |
|
"step": 26 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 413.7265625, |
|
"epoch": 0.453781512605042, |
|
"grad_norm": 0.41265320777893066, |
|
"kl": 0.033966064453125, |
|
"learning_rate": 4.99320620238196e-06, |
|
"loss": -0.0204, |
|
"num_tokens": 3744550.0, |
|
"reward": 1.0963541865348816, |
|
"reward_std": 0.11664257757365704, |
|
"rewards/curriculum_aware_reward_fn": 0.09635416604578495, |
|
"rewards/format_reward": 1.0, |
|
"step": 27 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 461.4375, |
|
"epoch": 0.47058823529411764, |
|
"grad_norm": 0.3836808502674103, |
|
"kl": 0.032623291015625, |
|
"learning_rate": 4.99202742064106e-06, |
|
"loss": 0.035, |
|
"num_tokens": 3875238.0, |
|
"reward": 1.0182291865348816, |
|
"reward_std": 0.10335793904960155, |
|
"rewards/curriculum_aware_reward_fn": 0.0807291679084301, |
|
"rewards/format_reward": 0.9375, |
|
"step": 28 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 506.953125, |
|
"epoch": 0.48739495798319327, |
|
"grad_norm": 0.3787353038787842, |
|
"kl": 0.029510498046875, |
|
"learning_rate": 4.990754567919917e-06, |
|
"loss": 0.0725, |
|
"num_tokens": 4024312.0, |
|
"reward": 0.9531250298023224, |
|
"reward_std": 0.08838835544884205, |
|
"rewards/curriculum_aware_reward_fn": 0.08593750279396772, |
|
"rewards/format_reward": 0.8671875, |
|
"step": 29 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 474.1875, |
|
"epoch": 0.5042016806722689, |
|
"grad_norm": 0.37932828068733215, |
|
"kl": 0.03497314453125, |
|
"learning_rate": 4.989387697624881e-06, |
|
"loss": 0.0078, |
|
"num_tokens": 4161712.0, |
|
"reward": 1.1328125298023224, |
|
"reward_std": 0.14552949741482735, |
|
"rewards/curriculum_aware_reward_fn": 0.1328125037252903, |
|
"rewards/format_reward": 1.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 447.6796875, |
|
"epoch": 0.5210084033613446, |
|
"grad_norm": 0.396627813577652, |
|
"kl": 0.0396728515625, |
|
"learning_rate": 4.987926867107095e-06, |
|
"loss": 0.004, |
|
"num_tokens": 4304935.0, |
|
"reward": 0.9671875089406967, |
|
"reward_std": 0.12499183788895607, |
|
"rewards/curriculum_aware_reward_fn": 0.09218750381842256, |
|
"rewards/format_reward": 0.875, |
|
"step": 31 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 574.703125, |
|
"epoch": 0.5378151260504201, |
|
"grad_norm": 0.39759182929992676, |
|
"kl": 0.030426025390625, |
|
"learning_rate": 4.986372137660078e-06, |
|
"loss": 0.0566, |
|
"num_tokens": 4464105.0, |
|
"reward": 0.8953125327825546, |
|
"reward_std": 0.147516256198287, |
|
"rewards/curriculum_aware_reward_fn": 0.0906250006519258, |
|
"rewards/format_reward": 0.8046875, |
|
"step": 32 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 434.640625, |
|
"epoch": 0.5546218487394958, |
|
"grad_norm": 0.3933623433113098, |
|
"kl": 0.034210205078125, |
|
"learning_rate": 4.984723574517165e-06, |
|
"loss": 0.0163, |
|
"num_tokens": 4602139.0, |
|
"reward": 1.1234374940395355, |
|
"reward_std": 0.15428178012371063, |
|
"rewards/curriculum_aware_reward_fn": 0.18593750335276127, |
|
"rewards/format_reward": 0.9375, |
|
"step": 33 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 445.1640625, |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 0.3263511061668396, |
|
"kl": 0.04083251953125, |
|
"learning_rate": 4.9829812468487655e-06, |
|
"loss": 0.0145, |
|
"num_tokens": 4736544.0, |
|
"reward": 0.9843750298023224, |
|
"reward_std": 0.09849035926163197, |
|
"rewards/curriculum_aware_reward_fn": 0.05468750139698386, |
|
"rewards/format_reward": 0.9296875, |
|
"step": 34 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 477.53125, |
|
"epoch": 0.5882352941176471, |
|
"grad_norm": 0.336434543132782, |
|
"kl": 0.0318603515625, |
|
"learning_rate": 4.981145227759457e-06, |
|
"loss": 0.0017, |
|
"num_tokens": 4881308.0, |
|
"reward": 0.966145858168602, |
|
"reward_std": 0.07084779627621174, |
|
"rewards/curriculum_aware_reward_fn": 0.09114583441987634, |
|
"rewards/format_reward": 0.875, |
|
"step": 35 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 469.4609375, |
|
"epoch": 0.6050420168067226, |
|
"grad_norm": 0.3600573241710663, |
|
"kl": 0.0386962890625, |
|
"learning_rate": 4.979215594284924e-06, |
|
"loss": 0.008, |
|
"num_tokens": 5017415.0, |
|
"reward": 1.0843750536441803, |
|
"reward_std": 0.11371596809476614, |
|
"rewards/curriculum_aware_reward_fn": 0.08437500288709998, |
|
"rewards/format_reward": 1.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 422.7890625, |
|
"epoch": 0.6218487394957983, |
|
"grad_norm": 0.38463500142097473, |
|
"kl": 0.03997802734375, |
|
"learning_rate": 4.977192427388722e-06, |
|
"loss": 0.0195, |
|
"num_tokens": 5141804.0, |
|
"reward": 1.1130208671092987, |
|
"reward_std": 0.1456994889304042, |
|
"rewards/curriculum_aware_reward_fn": 0.12083333730697632, |
|
"rewards/format_reward": 0.9921875, |
|
"step": 37 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 415.4765625, |
|
"epoch": 0.6386554621848739, |
|
"grad_norm": 0.5343978404998779, |
|
"kl": 0.04315185546875, |
|
"learning_rate": 4.9750758119588824e-06, |
|
"loss": -0.0, |
|
"num_tokens": 5262529.0, |
|
"reward": 1.136979192495346, |
|
"reward_std": 0.29098181426525116, |
|
"rewards/curriculum_aware_reward_fn": 0.16822916036471725, |
|
"rewards/format_reward": 0.96875, |
|
"step": 38 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 442.2421875, |
|
"epoch": 0.6554621848739496, |
|
"grad_norm": 0.33322009444236755, |
|
"kl": 0.047119140625, |
|
"learning_rate": 4.972865836804349e-06, |
|
"loss": 0.0088, |
|
"num_tokens": 5399424.0, |
|
"reward": 1.03125, |
|
"reward_std": 0.13204573467373848, |
|
"rewards/curriculum_aware_reward_fn": 0.109375, |
|
"rewards/format_reward": 0.921875, |
|
"step": 39 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 482.59375, |
|
"epoch": 0.6722689075630253, |
|
"grad_norm": 0.35028988122940063, |
|
"kl": 0.0504150390625, |
|
"learning_rate": 4.970562594651254e-06, |
|
"loss": 0.0066, |
|
"num_tokens": 5539492.0, |
|
"reward": 1.09375, |
|
"reward_std": 0.1325825173407793, |
|
"rewards/curriculum_aware_reward_fn": 0.125, |
|
"rewards/format_reward": 0.96875, |
|
"step": 40 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 423.125, |
|
"epoch": 0.6890756302521008, |
|
"grad_norm": 0.3613918125629425, |
|
"kl": 0.06549072265625, |
|
"learning_rate": 4.968166182139026e-06, |
|
"loss": 0.0516, |
|
"num_tokens": 5667012.0, |
|
"reward": 1.122395858168602, |
|
"reward_std": 0.09522313997149467, |
|
"rewards/curriculum_aware_reward_fn": 0.13020833837799728, |
|
"rewards/format_reward": 0.9921875, |
|
"step": 41 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 391.953125, |
|
"epoch": 0.7058823529411765, |
|
"grad_norm": 0.6798313856124878, |
|
"kl": 0.080322265625, |
|
"learning_rate": 4.9656766998163306e-06, |
|
"loss": 0.0429, |
|
"num_tokens": 5786006.0, |
|
"reward": 1.1645833551883698, |
|
"reward_std": 0.23287939466536045, |
|
"rewards/curriculum_aware_reward_fn": 0.19583334028720856, |
|
"rewards/format_reward": 0.96875, |
|
"step": 42 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 406.59375, |
|
"epoch": 0.7226890756302521, |
|
"grad_norm": 0.4006165862083435, |
|
"kl": 0.06298828125, |
|
"learning_rate": 4.963094252136865e-06, |
|
"loss": 0.0119, |
|
"num_tokens": 5910762.0, |
|
"reward": 1.0989583432674408, |
|
"reward_std": 0.1059559416025877, |
|
"rewards/curriculum_aware_reward_fn": 0.16927083767950535, |
|
"rewards/format_reward": 0.9296875, |
|
"step": 43 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 411.28125, |
|
"epoch": 0.7394957983193278, |
|
"grad_norm": 0.3308926522731781, |
|
"kl": 0.068359375, |
|
"learning_rate": 4.960418947454958e-06, |
|
"loss": 0.0204, |
|
"num_tokens": 6042942.0, |
|
"reward": 0.9947916865348816, |
|
"reward_std": 0.10640286095440388, |
|
"rewards/curriculum_aware_reward_fn": 0.0572916679084301, |
|
"rewards/format_reward": 0.9375, |
|
"step": 44 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 380.9140625, |
|
"epoch": 0.7563025210084033, |
|
"grad_norm": 0.478098064661026, |
|
"kl": 0.0755615234375, |
|
"learning_rate": 4.957650898021038e-06, |
|
"loss": 0.0624, |
|
"num_tokens": 6162299.0, |
|
"reward": 1.158854216337204, |
|
"reward_std": 0.20395735278725624, |
|
"rewards/curriculum_aware_reward_fn": 0.15885416977107525, |
|
"rewards/format_reward": 1.0, |
|
"step": 45 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 333.5390625, |
|
"epoch": 0.773109243697479, |
|
"grad_norm": 0.47225892543792725, |
|
"kl": 0.0811767578125, |
|
"learning_rate": 4.954790219976915e-06, |
|
"loss": 0.0167, |
|
"num_tokens": 6272592.0, |
|
"reward": 1.170312523841858, |
|
"reward_std": 0.1300742938183248, |
|
"rewards/curriculum_aware_reward_fn": 0.17031250218860805, |
|
"rewards/format_reward": 1.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 366.34375, |
|
"epoch": 0.7899159663865546, |
|
"grad_norm": 0.4816167652606964, |
|
"kl": 0.0848388671875, |
|
"learning_rate": 4.95183703335091e-06, |
|
"loss": 0.0103, |
|
"num_tokens": 6397436.0, |
|
"reward": 1.1953125298023224, |
|
"reward_std": 0.12861409783363342, |
|
"rewards/curriculum_aware_reward_fn": 0.1953125074505806, |
|
"rewards/format_reward": 1.0, |
|
"step": 47 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 338.65625, |
|
"epoch": 0.8067226890756303, |
|
"grad_norm": 0.47690296173095703, |
|
"kl": 0.07958984375, |
|
"learning_rate": 4.948791462052819e-06, |
|
"loss": -0.0039, |
|
"num_tokens": 6519896.0, |
|
"reward": 1.0442708730697632, |
|
"reward_std": 0.11976211331784725, |
|
"rewards/curriculum_aware_reward_fn": 0.05208333441987634, |
|
"rewards/format_reward": 0.9921875, |
|
"step": 48 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 325.78125, |
|
"epoch": 0.8235294117647058, |
|
"grad_norm": 0.46987131237983704, |
|
"kl": 0.08935546875, |
|
"learning_rate": 4.945653633868716e-06, |
|
"loss": 0.023, |
|
"num_tokens": 6637036.0, |
|
"reward": 1.1223958730697632, |
|
"reward_std": 0.18378917127847672, |
|
"rewards/curriculum_aware_reward_fn": 0.14583333488553762, |
|
"rewards/format_reward": 0.9765625, |
|
"step": 49 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 302.6171875, |
|
"epoch": 0.8403361344537815, |
|
"grad_norm": 0.5635585784912109, |
|
"kl": 0.088623046875, |
|
"learning_rate": 4.942423680455584e-06, |
|
"loss": 0.0157, |
|
"num_tokens": 6749411.0, |
|
"reward": 1.1494792103767395, |
|
"reward_std": 0.19005249440670013, |
|
"rewards/curriculum_aware_reward_fn": 0.1494791703298688, |
|
"rewards/format_reward": 1.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 366.4140625, |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 0.5150569677352905, |
|
"kl": 0.0848388671875, |
|
"learning_rate": 4.939101737335802e-06, |
|
"loss": -0.0145, |
|
"num_tokens": 6879104.0, |
|
"reward": 0.9713541865348816, |
|
"reward_std": 0.12468592822551727, |
|
"rewards/curriculum_aware_reward_fn": 0.041666666977107525, |
|
"rewards/format_reward": 0.9296875, |
|
"step": 51 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 288.9609375, |
|
"epoch": 0.8739495798319328, |
|
"grad_norm": 0.6744732856750488, |
|
"kl": 0.088134765625, |
|
"learning_rate": 4.935687943891447e-06, |
|
"loss": 0.0413, |
|
"num_tokens": 6981627.0, |
|
"reward": 1.171354204416275, |
|
"reward_std": 0.26688926108181477, |
|
"rewards/curriculum_aware_reward_fn": 0.1791666648350656, |
|
"rewards/format_reward": 0.9921875, |
|
"step": 52 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 286.609375, |
|
"epoch": 0.8907563025210085, |
|
"grad_norm": 0.6298589706420898, |
|
"kl": 0.0899658203125, |
|
"learning_rate": 4.932182443358458e-06, |
|
"loss": 0.002, |
|
"num_tokens": 7088449.0, |
|
"reward": 1.1250000298023224, |
|
"reward_std": 0.14211241621524096, |
|
"rewards/curriculum_aware_reward_fn": 0.125000003259629, |
|
"rewards/format_reward": 1.0, |
|
"step": 53 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 302.4921875, |
|
"epoch": 0.907563025210084, |
|
"grad_norm": 0.6446663737297058, |
|
"kl": 0.0955810546875, |
|
"learning_rate": 4.928585382820616e-06, |
|
"loss": 0.0001, |
|
"num_tokens": 7197360.0, |
|
"reward": 1.1005208790302277, |
|
"reward_std": 0.08887648163363338, |
|
"rewards/curriculum_aware_reward_fn": 0.10052083828486502, |
|
"rewards/format_reward": 1.0, |
|
"step": 54 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 399.0625, |
|
"epoch": 0.9243697478991597, |
|
"grad_norm": 0.729965090751648, |
|
"kl": 0.1077880859375, |
|
"learning_rate": 4.924896913203376e-06, |
|
"loss": 0.0496, |
|
"num_tokens": 7331528.0, |
|
"reward": 0.9932291805744171, |
|
"reward_std": 0.15547171607613564, |
|
"rewards/curriculum_aware_reward_fn": 0.1807291698642075, |
|
"rewards/format_reward": 0.8125, |
|
"step": 55 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 494.09375, |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 0.41514354944229126, |
|
"kl": 0.0888671875, |
|
"learning_rate": 4.921117189267535e-06, |
|
"loss": -0.0255, |
|
"num_tokens": 7482548.0, |
|
"reward": 0.8671875149011612, |
|
"reward_std": 0.09729943191632628, |
|
"rewards/curriculum_aware_reward_fn": 0.05468750069849193, |
|
"rewards/format_reward": 0.8125, |
|
"step": 56 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 511.90625, |
|
"epoch": 0.957983193277311, |
|
"grad_norm": 0.6172528266906738, |
|
"kl": 0.1680908203125, |
|
"learning_rate": 4.917246369602742e-06, |
|
"loss": 0.0368, |
|
"num_tokens": 7621256.0, |
|
"reward": 1.050520896911621, |
|
"reward_std": 0.1373576819896698, |
|
"rewards/curriculum_aware_reward_fn": 0.17552083916962147, |
|
"rewards/format_reward": 0.875, |
|
"step": 57 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 415.8046875, |
|
"epoch": 0.9747899159663865, |
|
"grad_norm": 0.5895751118659973, |
|
"kl": 0.2454833984375, |
|
"learning_rate": 4.9132846166208355e-06, |
|
"loss": 0.0654, |
|
"num_tokens": 7740207.0, |
|
"reward": 1.1197916865348816, |
|
"reward_std": 0.09127403190359473, |
|
"rewards/curriculum_aware_reward_fn": 0.12760416674427688, |
|
"rewards/format_reward": 0.9921875, |
|
"step": 58 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 578.2857208251953, |
|
"epoch": 0.9915966386554622, |
|
"grad_norm": 0.4072950482368469, |
|
"kl": 0.12353515625, |
|
"learning_rate": 4.9092320965490365e-06, |
|
"loss": 0.0363, |
|
"num_tokens": 7885325.0, |
|
"reward": 1.0598958432674408, |
|
"reward_std": 0.11012758687138557, |
|
"rewards/curriculum_aware_reward_fn": 0.0598958358168602, |
|
"rewards/format_reward": 1.0, |
|
"step": 59 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 564.78125, |
|
"epoch": 1.0168067226890756, |
|
"grad_norm": 0.4054146707057953, |
|
"kl": 0.1219482421875, |
|
"learning_rate": 4.905088979422971e-06, |
|
"loss": 0.0515, |
|
"num_tokens": 8038297.0, |
|
"reward": 1.0937500596046448, |
|
"reward_std": 0.1636663186363876, |
|
"rewards/curriculum_aware_reward_fn": 0.1015625053551048, |
|
"rewards/format_reward": 0.9921875, |
|
"step": 60 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 578.2890625, |
|
"epoch": 1.0336134453781514, |
|
"grad_norm": 0.3728667199611664, |
|
"kl": 0.126708984375, |
|
"learning_rate": 4.900855439079536e-06, |
|
"loss": 0.0134, |
|
"num_tokens": 8185046.0, |
|
"reward": 1.1265625357627869, |
|
"reward_std": 0.14053087309002876, |
|
"rewards/curriculum_aware_reward_fn": 0.12656250409781933, |
|
"rewards/format_reward": 1.0, |
|
"step": 61 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 722.3125, |
|
"epoch": 1.050420168067227, |
|
"grad_norm": 0.32597795128822327, |
|
"kl": 0.1097412109375, |
|
"learning_rate": 4.8965316531496055e-06, |
|
"loss": 0.0184, |
|
"num_tokens": 8366894.0, |
|
"reward": 0.9677083492279053, |
|
"reward_std": 0.10382660711184144, |
|
"rewards/curriculum_aware_reward_fn": 0.04583333386108279, |
|
"rewards/format_reward": 0.921875, |
|
"step": 62 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 663.3671875, |
|
"epoch": 1.0672268907563025, |
|
"grad_norm": 0.4411354064941406, |
|
"kl": 0.1256103515625, |
|
"learning_rate": 4.892117803050578e-06, |
|
"loss": 0.0637, |
|
"num_tokens": 8514637.0, |
|
"reward": 1.0671875327825546, |
|
"reward_std": 0.21371026523411274, |
|
"rewards/curriculum_aware_reward_fn": 0.09843750204890966, |
|
"rewards/format_reward": 0.96875, |
|
"step": 63 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 644.1328125, |
|
"epoch": 1.084033613445378, |
|
"grad_norm": 0.36323124170303345, |
|
"kl": 0.1290283203125, |
|
"learning_rate": 4.887614073978761e-06, |
|
"loss": -0.0258, |
|
"num_tokens": 8683182.0, |
|
"reward": 0.9921875, |
|
"reward_std": 0.12415501847863197, |
|
"rewards/curriculum_aware_reward_fn": 0.0859375, |
|
"rewards/format_reward": 0.90625, |
|
"step": 64 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 659.5703125, |
|
"epoch": 1.1008403361344539, |
|
"grad_norm": 0.5263646245002747, |
|
"kl": 0.1494140625, |
|
"learning_rate": 4.883020654901609e-06, |
|
"loss": -0.03, |
|
"num_tokens": 8846031.0, |
|
"reward": 0.9375000149011612, |
|
"reward_std": 0.29325952008366585, |
|
"rewards/curriculum_aware_reward_fn": 0.08593750093132257, |
|
"rewards/format_reward": 0.8515625, |
|
"step": 65 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 542.421875, |
|
"epoch": 1.1176470588235294, |
|
"grad_norm": 0.5841398239135742, |
|
"kl": 0.1328125, |
|
"learning_rate": 4.878337738549785e-06, |
|
"loss": 0.0185, |
|
"num_tokens": 8994685.0, |
|
"reward": 0.9244791865348816, |
|
"reward_std": 0.350746251642704, |
|
"rewards/curriculum_aware_reward_fn": 0.0651041679084301, |
|
"rewards/format_reward": 0.859375, |
|
"step": 66 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 582.9453125, |
|
"epoch": 1.134453781512605, |
|
"grad_norm": 0.5570164322853088, |
|
"kl": 0.122314453125, |
|
"learning_rate": 4.873565521409082e-06, |
|
"loss": 0.0521, |
|
"num_tokens": 9155622.0, |
|
"reward": 0.9843750298023224, |
|
"reward_std": 0.3105625621974468, |
|
"rewards/curriculum_aware_reward_fn": 0.140625, |
|
"rewards/format_reward": 0.84375, |
|
"step": 67 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 485.3359375, |
|
"epoch": 1.1512605042016806, |
|
"grad_norm": 0.5985521078109741, |
|
"kl": 0.1263427734375, |
|
"learning_rate": 4.868704203712173e-06, |
|
"loss": 0.0059, |
|
"num_tokens": 9288609.0, |
|
"reward": 1.052083358168602, |
|
"reward_std": 0.3865399658679962, |
|
"rewards/curriculum_aware_reward_fn": 0.18489583488553762, |
|
"rewards/format_reward": 0.8671875, |
|
"step": 68 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 535.578125, |
|
"epoch": 1.1680672268907564, |
|
"grad_norm": 0.564681887626648, |
|
"kl": 0.1126708984375, |
|
"learning_rate": 4.86375398943021e-06, |
|
"loss": -0.0485, |
|
"num_tokens": 9437915.0, |
|
"reward": 0.9140625298023224, |
|
"reward_std": 0.28478266298770905, |
|
"rewards/curriculum_aware_reward_fn": 0.07031250256113708, |
|
"rewards/format_reward": 0.84375, |
|
"step": 69 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 496.0703125, |
|
"epoch": 1.184873949579832, |
|
"grad_norm": 0.5399004220962524, |
|
"kl": 0.099853515625, |
|
"learning_rate": 4.858715086264274e-06, |
|
"loss": 0.0, |
|
"num_tokens": 9580828.0, |
|
"reward": 0.934895858168602, |
|
"reward_std": 0.19700524397194386, |
|
"rewards/curriculum_aware_reward_fn": 0.04427083441987634, |
|
"rewards/format_reward": 0.890625, |
|
"step": 70 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 469.078125, |
|
"epoch": 1.2016806722689075, |
|
"grad_norm": 0.552808940410614, |
|
"kl": 0.104736328125, |
|
"learning_rate": 4.853587705636646e-06, |
|
"loss": -0.0129, |
|
"num_tokens": 9710926.0, |
|
"reward": 1.0234375447034836, |
|
"reward_std": 0.2662259414792061, |
|
"rewards/curriculum_aware_reward_fn": 0.07031250186264515, |
|
"rewards/format_reward": 0.953125, |
|
"step": 71 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 539.765625, |
|
"epoch": 1.2184873949579833, |
|
"grad_norm": 0.4241795837879181, |
|
"kl": 0.093505859375, |
|
"learning_rate": 4.84837206268195e-06, |
|
"loss": -0.0161, |
|
"num_tokens": 9867328.0, |
|
"reward": 0.796875, |
|
"reward_std": 0.12126770988106728, |
|
"rewards/curriculum_aware_reward_fn": 0.015625, |
|
"rewards/format_reward": 0.78125, |
|
"step": 72 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 502.6328125, |
|
"epoch": 1.2352941176470589, |
|
"grad_norm": 0.4336708188056946, |
|
"kl": 0.0975341796875, |
|
"learning_rate": 4.8430683762381195e-06, |
|
"loss": 0.0086, |
|
"num_tokens": 10007809.0, |
|
"reward": 1.0416666865348816, |
|
"reward_std": 0.1647402998059988, |
|
"rewards/curriculum_aware_reward_fn": 0.07291666697710752, |
|
"rewards/format_reward": 0.96875, |
|
"step": 73 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 523.640625, |
|
"epoch": 1.2521008403361344, |
|
"grad_norm": 0.46911758184432983, |
|
"kl": 0.1041259765625, |
|
"learning_rate": 4.837676868837213e-06, |
|
"loss": 0.0313, |
|
"num_tokens": 10147291.0, |
|
"reward": 1.0302083790302277, |
|
"reward_std": 0.18167300708591938, |
|
"rewards/curriculum_aware_reward_fn": 0.06145833572372794, |
|
"rewards/format_reward": 0.96875, |
|
"step": 74 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 628.6171875, |
|
"epoch": 1.26890756302521, |
|
"grad_norm": 0.5553966760635376, |
|
"kl": 0.091552734375, |
|
"learning_rate": 4.832197766696085e-06, |
|
"loss": 0.0771, |
|
"num_tokens": 10304234.0, |
|
"reward": 1.0401041805744171, |
|
"reward_std": 0.2994466572999954, |
|
"rewards/curriculum_aware_reward_fn": 0.14166666939854622, |
|
"rewards/format_reward": 0.8984375, |
|
"step": 75 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 541.03125, |
|
"epoch": 1.2857142857142856, |
|
"grad_norm": 0.4850277602672577, |
|
"kl": 0.092529296875, |
|
"learning_rate": 4.826631299706887e-06, |
|
"loss": -0.0076, |
|
"num_tokens": 10455718.0, |
|
"reward": 1.043229192495346, |
|
"reward_std": 0.18734892271459103, |
|
"rewards/curriculum_aware_reward_fn": 0.12135417200624943, |
|
"rewards/format_reward": 0.921875, |
|
"step": 76 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 513.34375, |
|
"epoch": 1.3025210084033614, |
|
"grad_norm": 0.5054645538330078, |
|
"kl": 0.098876953125, |
|
"learning_rate": 4.820977701427424e-06, |
|
"loss": 0.0342, |
|
"num_tokens": 10586514.0, |
|
"reward": 1.0703125149011612, |
|
"reward_std": 0.18218252062797546, |
|
"rewards/curriculum_aware_reward_fn": 0.10156250256113708, |
|
"rewards/format_reward": 0.96875, |
|
"step": 77 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 554.0390625, |
|
"epoch": 1.319327731092437, |
|
"grad_norm": 0.5157454013824463, |
|
"kl": 0.0882568359375, |
|
"learning_rate": 4.81523720907136e-06, |
|
"loss": 0.05, |
|
"num_tokens": 10726735.0, |
|
"reward": 1.1385416984558105, |
|
"reward_std": 0.23471001349389553, |
|
"rewards/curriculum_aware_reward_fn": 0.17760416865348816, |
|
"rewards/format_reward": 0.9609375, |
|
"step": 78 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 580.015625, |
|
"epoch": 1.3361344537815127, |
|
"grad_norm": 0.41959676146507263, |
|
"kl": 0.0958251953125, |
|
"learning_rate": 4.809410063498254e-06, |
|
"loss": 0.0059, |
|
"num_tokens": 10884185.0, |
|
"reward": 0.8411458283662796, |
|
"reward_std": 0.22234245762228966, |
|
"rewards/curriculum_aware_reward_fn": 0.06770833395421505, |
|
"rewards/format_reward": 0.7734375, |
|
"step": 79 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 534.84375, |
|
"epoch": 1.3529411764705883, |
|
"grad_norm": 0.6390196681022644, |
|
"kl": 0.093017578125, |
|
"learning_rate": 4.8034965092034656e-06, |
|
"loss": 0.0963, |
|
"num_tokens": 11024149.0, |
|
"reward": 1.0494791865348816, |
|
"reward_std": 0.2942212373018265, |
|
"rewards/curriculum_aware_reward_fn": 0.1432291679084301, |
|
"rewards/format_reward": 0.90625, |
|
"step": 80 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 525.671875, |
|
"epoch": 1.3697478991596639, |
|
"grad_norm": 0.4957650303840637, |
|
"kl": 0.105224609375, |
|
"learning_rate": 4.797496794307889e-06, |
|
"loss": -0.0236, |
|
"num_tokens": 11167547.0, |
|
"reward": 0.9828125238418579, |
|
"reward_std": 0.14313106751069427, |
|
"rewards/curriculum_aware_reward_fn": 0.06093750288709998, |
|
"rewards/format_reward": 0.921875, |
|
"step": 81 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 488.7890625, |
|
"epoch": 1.3865546218487395, |
|
"grad_norm": 0.5502341389656067, |
|
"kl": 0.104248046875, |
|
"learning_rate": 4.791411170547545e-06, |
|
"loss": 0.072, |
|
"num_tokens": 11300104.0, |
|
"reward": 1.143229216337204, |
|
"reward_std": 0.2514217048883438, |
|
"rewards/curriculum_aware_reward_fn": 0.1822916679084301, |
|
"rewards/format_reward": 0.9609375, |
|
"step": 82 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 508.1015625, |
|
"epoch": 1.403361344537815, |
|
"grad_norm": 0.489242285490036, |
|
"kl": 0.0986328125, |
|
"learning_rate": 4.785239893263017e-06, |
|
"loss": 0.1064, |
|
"num_tokens": 11446565.0, |
|
"reward": 0.9505208730697632, |
|
"reward_std": 0.18909456953406334, |
|
"rewards/curriculum_aware_reward_fn": 0.1145833358168602, |
|
"rewards/format_reward": 0.8359375, |
|
"step": 83 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 448.359375, |
|
"epoch": 1.4201680672268908, |
|
"grad_norm": 0.46208083629608154, |
|
"kl": 0.11083984375, |
|
"learning_rate": 4.778983221388742e-06, |
|
"loss": 0.0139, |
|
"num_tokens": 11580547.0, |
|
"reward": 1.0276041626930237, |
|
"reward_std": 0.12672015465795994, |
|
"rewards/curriculum_aware_reward_fn": 0.09791666641831398, |
|
"rewards/format_reward": 0.9296875, |
|
"step": 84 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 457.3515625, |
|
"epoch": 1.4369747899159664, |
|
"grad_norm": 0.44321703910827637, |
|
"kl": 0.1085205078125, |
|
"learning_rate": 4.77264141744214e-06, |
|
"loss": 0.0184, |
|
"num_tokens": 11712984.0, |
|
"reward": 1.160416692495346, |
|
"reward_std": 0.15310384705662727, |
|
"rewards/curriculum_aware_reward_fn": 0.16041666828095913, |
|
"rewards/format_reward": 1.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 384.3984375, |
|
"epoch": 1.453781512605042, |
|
"grad_norm": 0.4427275061607361, |
|
"kl": 0.123779296875, |
|
"learning_rate": 4.766214747512603e-06, |
|
"loss": 0.0233, |
|
"num_tokens": 11829315.0, |
|
"reward": 1.1666666865348816, |
|
"reward_std": 0.14362479094415903, |
|
"rewards/curriculum_aware_reward_fn": 0.1744791674427688, |
|
"rewards/format_reward": 0.9921875, |
|
"step": 86 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 389.265625, |
|
"epoch": 1.4705882352941178, |
|
"grad_norm": 0.5121983885765076, |
|
"kl": 0.12841796875, |
|
"learning_rate": 4.759703481250331e-06, |
|
"loss": 0.0229, |
|
"num_tokens": 11949533.0, |
|
"reward": 1.1343750357627869, |
|
"reward_std": 0.16322745941579342, |
|
"rewards/curriculum_aware_reward_fn": 0.1500000013038516, |
|
"rewards/format_reward": 0.984375, |
|
"step": 87 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 459.921875, |
|
"epoch": 1.4873949579831933, |
|
"grad_norm": 0.6308655738830566, |
|
"kl": 0.11328125, |
|
"learning_rate": 4.753107891855015e-06, |
|
"loss": 0.02, |
|
"num_tokens": 12092563.0, |
|
"reward": 0.8854166716337204, |
|
"reward_std": 0.22940894588828087, |
|
"rewards/curriculum_aware_reward_fn": 0.11197916697710752, |
|
"rewards/format_reward": 0.7734375, |
|
"step": 88 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 325.625, |
|
"epoch": 1.504201680672269, |
|
"grad_norm": 0.6008861660957336, |
|
"kl": 0.14013671875, |
|
"learning_rate": 4.746428256064375e-06, |
|
"loss": 0.0021, |
|
"num_tokens": 12195803.0, |
|
"reward": 1.261979192495346, |
|
"reward_std": 0.17762075550854206, |
|
"rewards/curriculum_aware_reward_fn": 0.2697916701436043, |
|
"rewards/format_reward": 0.9921875, |
|
"step": 89 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 348.2421875, |
|
"epoch": 1.5210084033613445, |
|
"grad_norm": 0.5485793948173523, |
|
"kl": 0.13720703125, |
|
"learning_rate": 4.7396648541425534e-06, |
|
"loss": 0.0053, |
|
"num_tokens": 12314866.0, |
|
"reward": 1.2442708611488342, |
|
"reward_std": 0.21353545226156712, |
|
"rewards/curriculum_aware_reward_fn": 0.24427084252238274, |
|
"rewards/format_reward": 1.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 519.265625, |
|
"epoch": 1.53781512605042, |
|
"grad_norm": 0.4276430010795593, |
|
"kl": 0.11962890625, |
|
"learning_rate": 4.732817969868348e-06, |
|
"loss": 0.0315, |
|
"num_tokens": 12463604.0, |
|
"reward": 0.895833358168602, |
|
"reward_std": 0.09582467563450336, |
|
"rewards/curriculum_aware_reward_fn": 0.0833333358168602, |
|
"rewards/format_reward": 0.8125, |
|
"step": 91 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 334.859375, |
|
"epoch": 1.5546218487394958, |
|
"grad_norm": 0.5961228013038635, |
|
"kl": 0.1279296875, |
|
"learning_rate": 4.7258878905233095e-06, |
|
"loss": 0.0697, |
|
"num_tokens": 12583698.0, |
|
"reward": 1.2239583432674408, |
|
"reward_std": 0.18167817778885365, |
|
"rewards/curriculum_aware_reward_fn": 0.2317708358168602, |
|
"rewards/format_reward": 0.9921875, |
|
"step": 92 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 363.859375, |
|
"epoch": 1.5714285714285714, |
|
"grad_norm": 0.7697988152503967, |
|
"kl": 0.1495361328125, |
|
"learning_rate": 4.718874906879688e-06, |
|
"loss": 0.1313, |
|
"num_tokens": 12705640.0, |
|
"reward": 1.123437523841858, |
|
"reward_std": 0.29410218447446823, |
|
"rewards/curriculum_aware_reward_fn": 0.17812500521540642, |
|
"rewards/format_reward": 0.9453125, |
|
"step": 93 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 297.1328125, |
|
"epoch": 1.5882352941176472, |
|
"grad_norm": 0.5211566686630249, |
|
"kl": 0.133056640625, |
|
"learning_rate": 4.711779313188231e-06, |
|
"loss": 0.04, |
|
"num_tokens": 12824305.0, |
|
"reward": 1.1119791865348816, |
|
"reward_std": 0.18469560518860817, |
|
"rewards/curriculum_aware_reward_fn": 0.1119791679084301, |
|
"rewards/format_reward": 1.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 253.859375, |
|
"epoch": 1.6050420168067228, |
|
"grad_norm": 0.6686471104621887, |
|
"kl": 0.1435546875, |
|
"learning_rate": 4.70460140716584e-06, |
|
"loss": -0.0138, |
|
"num_tokens": 12918559.0, |
|
"reward": 1.1968750357627869, |
|
"reward_std": 0.1724256370216608, |
|
"rewards/curriculum_aware_reward_fn": 0.20468750596046448, |
|
"rewards/format_reward": 0.9921875, |
|
"step": 95 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 575.90625, |
|
"epoch": 1.6218487394957983, |
|
"grad_norm": 0.3436340093612671, |
|
"kl": 0.160888671875, |
|
"learning_rate": 4.697341489983076e-06, |
|
"loss": 0.0223, |
|
"num_tokens": 13076699.0, |
|
"reward": 0.8567708432674408, |
|
"reward_std": 0.08664888702332973, |
|
"rewards/curriculum_aware_reward_fn": 0.10677083395421505, |
|
"rewards/format_reward": 0.75, |
|
"step": 96 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 301.9140625, |
|
"epoch": 1.638655462184874, |
|
"grad_norm": 1.0377366542816162, |
|
"kl": 0.162353515625, |
|
"learning_rate": 4.6899998662515215e-06, |
|
"loss": 0.0906, |
|
"num_tokens": 13192776.0, |
|
"reward": 1.1197916716337204, |
|
"reward_std": 0.12685893662273884, |
|
"rewards/curriculum_aware_reward_fn": 0.13541666674427688, |
|
"rewards/format_reward": 0.984375, |
|
"step": 97 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 331.28125, |
|
"epoch": 1.6554621848739495, |
|
"grad_norm": 0.477889746427536, |
|
"kl": 0.151123046875, |
|
"learning_rate": 4.682576844011007e-06, |
|
"loss": 0.0147, |
|
"num_tokens": 13309884.0, |
|
"reward": 1.2942708730697632, |
|
"reward_std": 0.10486710164695978, |
|
"rewards/curriculum_aware_reward_fn": 0.3567708432674408, |
|
"rewards/format_reward": 0.9375, |
|
"step": 98 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 418.96875, |
|
"epoch": 1.6722689075630253, |
|
"grad_norm": 1.2051795721054077, |
|
"kl": 0.1630859375, |
|
"learning_rate": 4.675072734716678e-06, |
|
"loss": 0.0986, |
|
"num_tokens": 13443272.0, |
|
"reward": 1.0937500298023224, |
|
"reward_std": 0.1639669369906187, |
|
"rewards/curriculum_aware_reward_fn": 0.1796875111758709, |
|
"rewards/format_reward": 0.9140625, |
|
"step": 99 |
|
}, |
|
{ |
|
"clip_ratio": 0.0, |
|
"completion_length": 257.484375, |
|
"epoch": 1.6890756302521008, |
|
"grad_norm": 0.6367622017860413, |
|
"kl": 0.157958984375, |
|
"learning_rate": 4.667487853225931e-06, |
|
"loss": 0.0274, |
|
"num_tokens": 13546814.0, |
|
"reward": 1.3072916865348816, |
|
"reward_std": 0.1563644390553236, |
|
"rewards/curriculum_aware_reward_fn": 0.3307291641831398, |
|
"rewards/format_reward": 0.9765625, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.6890756302521008, |
|
"step": 100, |
|
"total_flos": 0.0, |
|
"train_loss": 0.0, |
|
"train_runtime": 1.1082, |
|
"train_samples_per_second": 1712.724, |
|
"train_steps_per_second": 52.338 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 58, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|