diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,55045 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.18655846275826687, + "eval_steps": 500, + "global_step": 5000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 1049.732177734375, + "epoch": 3.731169255165338e-05, + "grad_norm": 0.05940868332982063, + "kl": 0.0, + "learning_rate": 6.666666666666668e-08, + "loss": -0.0089, + "num_tokens": 98111.0, + "reward": 0.20974336803192273, + "reward_std": 0.20873506763018668, + "rewards/code_reward": 0.20902906730771065, + "rewards/format_reward": 0.0071428571827709675, + "step": 1 + }, + { + "clip_ratio": 0.0, + "epoch": 7.462338510330675e-05, + "grad_norm": 0.05957343056797981, + "kl": 0.0, + "learning_rate": 1.3333333333333336e-07, + "loss": -0.0089, + "step": 2 + }, + { + "clip_ratio": 0.0039270849083550274, + "epoch": 0.00011193507765496012, + "grad_norm": 0.061085790395736694, + "kl": 0.0004949569702148438, + "learning_rate": 2.0000000000000002e-07, + "loss": -0.0086, + "step": 3 + }, + { + "clip_ratio": 0.003915057750418782, + "completion_length": 677.5178833007812, + "epoch": 0.0001492467702066135, + "grad_norm": 0.059134867042303085, + "kl": 0.0005044937133789062, + "learning_rate": 2.666666666666667e-07, + "loss": -0.0129, + "num_tokens": 159460.0, + "reward": 0.4316071637731511, + "reward_std": 0.3223567308159545, + "rewards/code_reward": 0.4285714253783226, + "rewards/format_reward": 0.030357143143191934, + "step": 4 + }, + { + "clip_ratio": 0.003978051187004894, + "epoch": 0.00018655846275826687, + "grad_norm": 0.05955991894006729, + "kl": 0.0004887580871582031, + "learning_rate": 3.3333333333333335e-07, + "loss": -0.0128, + "step": 5 + }, + { + "clip_ratio": 0.0039017166709527373, + "epoch": 0.00022387015530992023, + "grad_norm": 0.05978769063949585, + "kl": 0.0005064010620117188, + "learning_rate": 4.0000000000000003e-07, + "loss": -0.013, + "step": 6 + }, + { + "clip_ratio": 0.00371108966646716, + "completion_length": 698.1964569091797, + "epoch": 0.0002611818478615736, + "grad_norm": 0.06318207085132599, + "kl": 0.00051116943359375, + "learning_rate": 4.666666666666667e-07, + "loss": -0.004, + "num_tokens": 229163.0, + "reward": 0.3666626932681538, + "reward_std": 0.21847038459964097, + "rewards/code_reward": 0.3629126697778702, + "rewards/format_reward": 0.0375000003259629, + "step": 7 + }, + { + "clip_ratio": 0.0036112117813900113, + "epoch": 0.000298493540413227, + "grad_norm": 0.06814826279878616, + "kl": 0.00048065185546875, + "learning_rate": 5.333333333333335e-07, + "loss": -0.0037, + "step": 8 + }, + { + "clip_ratio": 0.0034014711854979396, + "epoch": 0.00033580523296488035, + "grad_norm": 0.06734953075647354, + "kl": 0.000514984130859375, + "learning_rate": 6.000000000000001e-07, + "loss": -0.0037, + "step": 9 + }, + { + "clip_ratio": 0.0032428089762106538, + "completion_length": 696.3393402099609, + "epoch": 0.00037311692551653374, + "grad_norm": 0.054221104830503464, + "kl": 0.0005764961242675781, + "learning_rate": 6.666666666666667e-07, + "loss": -0.0028, + "num_tokens": 295080.0, + "reward": 0.4569318314315751, + "reward_std": 0.21823891228996217, + "rewards/code_reward": 0.45568181574344635, + "rewards/format_reward": 0.012500000419095159, + "step": 10 + }, + { + "clip_ratio": 0.0032383272191509604, + "epoch": 0.00041042861806818713, + "grad_norm": 0.05372092127799988, + "kl": 0.0005884170532226562, + "learning_rate": 7.333333333333334e-07, + "loss": -0.0028, + "step": 11 + }, + { + "clip_ratio": 0.0036283275694586337, + "epoch": 0.00044774031061984047, + "grad_norm": 0.07763804495334625, + "kl": 0.0006227493286132812, + "learning_rate": 8.000000000000001e-07, + "loss": -0.0026, + "step": 12 + }, + { + "clip_ratio": 0.0017875952762551606, + "completion_length": 790.3214721679688, + "epoch": 0.00048505200317149386, + "grad_norm": 0.04330376535654068, + "kl": 0.0004963874816894531, + "learning_rate": 8.666666666666668e-07, + "loss": 0.001, + "num_tokens": 377082.0, + "reward": 0.12619049102067947, + "reward_std": 0.16022727265954018, + "rewards/code_reward": 0.12619048729538918, + "rewards/format_reward": 0.0, + "step": 13 + }, + { + "clip_ratio": 0.001712678058538586, + "epoch": 0.0005223636957231473, + "grad_norm": 0.044205956161022186, + "kl": 0.0004892349243164062, + "learning_rate": 9.333333333333334e-07, + "loss": 0.0009, + "step": 14 + }, + { + "clip_ratio": 0.0018372733611613512, + "epoch": 0.0005596753882748006, + "grad_norm": 0.04978034272789955, + "kl": 0.0005106925964355469, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.001, + "step": 15 + }, + { + "clip_ratio": 0.003791501745581627, + "completion_length": 801.5893249511719, + "epoch": 0.000596987080826454, + "grad_norm": 0.06478451937437057, + "kl": 0.0006046295166015625, + "learning_rate": 1.066666666666667e-06, + "loss": 0.0052, + "num_tokens": 458777.0, + "reward": 0.5235333237797022, + "reward_std": 0.13838129735086113, + "rewards/code_reward": 0.5233547473326325, + "rewards/format_reward": 0.0017857142956927419, + "step": 16 + }, + { + "clip_ratio": 0.003628058359026909, + "epoch": 0.0006342987733781074, + "grad_norm": 0.06502138823270798, + "kl": 0.0005402565002441406, + "learning_rate": 1.1333333333333334e-06, + "loss": 0.0051, + "step": 17 + }, + { + "clip_ratio": 0.003460305742919445, + "epoch": 0.0006716104659297607, + "grad_norm": 0.063479483127594, + "kl": 0.0005593299865722656, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.0051, + "step": 18 + }, + { + "clip_ratio": 0.004184611898381263, + "completion_length": 758.7143096923828, + "epoch": 0.0007089221584814141, + "grad_norm": 0.07176263630390167, + "kl": 0.000629425048828125, + "learning_rate": 1.2666666666666669e-06, + "loss": 0.0061, + "num_tokens": 536573.0, + "reward": 0.4228898340370506, + "reward_std": 0.19598172698169947, + "rewards/code_reward": 0.4087826581671834, + "rewards/format_reward": 0.1410714341327548, + "step": 19 + }, + { + "clip_ratio": 0.004373407049570233, + "epoch": 0.0007462338510330675, + "grad_norm": 0.06987442076206207, + "kl": 0.0005645751953125, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.0057, + "step": 20 + }, + { + "clip_ratio": 0.004203859542030841, + "epoch": 0.0007835455435847208, + "grad_norm": 0.06913967430591583, + "kl": 0.0006189346313476562, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.006, + "step": 21 + }, + { + "clip_ratio": 0.0038627973990514874, + "completion_length": 725.2857513427734, + "epoch": 0.0008208572361363743, + "grad_norm": 0.06347162276506424, + "kl": 0.0005159378051757812, + "learning_rate": 1.4666666666666669e-06, + "loss": 0.0079, + "num_tokens": 603999.0, + "reward": 0.5047638546675444, + "reward_std": 0.3621476925909519, + "rewards/code_reward": 0.5001209881156683, + "rewards/format_reward": 0.04642857238650322, + "step": 22 + }, + { + "clip_ratio": 0.00357841799268499, + "epoch": 0.0008581689286880276, + "grad_norm": 0.06200867146253586, + "kl": 0.0005159378051757812, + "learning_rate": 1.5333333333333334e-06, + "loss": 0.0077, + "step": 23 + }, + { + "clip_ratio": 0.003531730268150568, + "epoch": 0.0008954806212396809, + "grad_norm": 0.06560929864645004, + "kl": 0.0005412101745605469, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.0079, + "step": 24 + }, + { + "clip_ratio": 0.0042446642764844, + "completion_length": 977.6071929931641, + "epoch": 0.0009327923137913344, + "grad_norm": 0.06812455505132675, + "kl": 0.0005841255187988281, + "learning_rate": 1.6666666666666667e-06, + "loss": -0.0032, + "num_tokens": 698967.0, + "reward": 0.22176974860485643, + "reward_std": 0.10107220942154527, + "rewards/code_reward": 0.2174840106163174, + "rewards/format_reward": 0.0428571414668113, + "step": 25 + }, + { + "clip_ratio": 0.004441662575118244, + "epoch": 0.0009701040063429877, + "grad_norm": 0.062085483223199844, + "kl": 0.0005164146423339844, + "learning_rate": 1.7333333333333336e-06, + "loss": -0.003, + "step": 26 + }, + { + "clip_ratio": 0.003828263725154102, + "epoch": 0.0010074156988946412, + "grad_norm": 0.05835438892245293, + "kl": 0.0005173683166503906, + "learning_rate": 1.8000000000000001e-06, + "loss": -0.0031, + "step": 27 + }, + { + "clip_ratio": 0.0034778951667249203, + "completion_length": 798.0357513427734, + "epoch": 0.0010447273914462945, + "grad_norm": 0.04332362115383148, + "kl": 0.0005521774291992188, + "learning_rate": 1.8666666666666669e-06, + "loss": -0.0009, + "num_tokens": 778559.0, + "reward": 0.26539273746311665, + "reward_std": 0.29936444759368896, + "rewards/code_reward": 0.26539273746311665, + "rewards/format_reward": 0.0, + "step": 28 + }, + { + "clip_ratio": 0.002732276276219636, + "epoch": 0.0010820390839979478, + "grad_norm": 0.04406796768307686, + "kl": 0.00054931640625, + "learning_rate": 1.9333333333333336e-06, + "loss": -0.0011, + "step": 29 + }, + { + "clip_ratio": 0.0031425630440935493, + "epoch": 0.0011193507765496012, + "grad_norm": 0.043597735464572906, + "kl": 0.000518798828125, + "learning_rate": 2.0000000000000003e-06, + "loss": -0.0009, + "step": 30 + }, + { + "clip_ratio": 0.0053825476206839085, + "completion_length": 679.7678985595703, + "epoch": 0.0011566624691012545, + "grad_norm": 0.06638483703136444, + "kl": 0.000690460205078125, + "learning_rate": 2.0666666666666666e-06, + "loss": -0.0134, + "num_tokens": 837892.0, + "reward": 0.1316797339823097, + "reward_std": 0.182619922561571, + "rewards/code_reward": 0.12542973086237907, + "rewards/format_reward": 0.06250000093132257, + "step": 31 + }, + { + "clip_ratio": 0.005089090554974973, + "epoch": 0.001193974161652908, + "grad_norm": 0.06785798817873001, + "kl": 0.0005693435668945312, + "learning_rate": 2.133333333333334e-06, + "loss": -0.0133, + "step": 32 + }, + { + "clip_ratio": 0.005421160312835127, + "epoch": 0.0012312858542045614, + "grad_norm": 0.06917574256658554, + "kl": 0.0006308555603027344, + "learning_rate": 2.2e-06, + "loss": -0.0133, + "step": 33 + }, + { + "clip_ratio": 0.003806304943282157, + "completion_length": 866.5000305175781, + "epoch": 0.0012685975467562147, + "grad_norm": 0.05503791198134422, + "kl": 0.0005464553833007812, + "learning_rate": 2.266666666666667e-06, + "loss": -0.0404, + "num_tokens": 915340.0, + "reward": 0.45345591683872044, + "reward_std": 0.09049062535632402, + "rewards/code_reward": 0.45042017102241516, + "rewards/format_reward": 0.030357143376022577, + "step": 34 + }, + { + "clip_ratio": 0.004040958185214549, + "epoch": 0.001305909239307868, + "grad_norm": 0.055518608540296555, + "kl": 0.0004968643188476562, + "learning_rate": 2.3333333333333336e-06, + "loss": -0.0404, + "step": 35 + }, + { + "clip_ratio": 0.0037390358629636467, + "epoch": 0.0013432209318595214, + "grad_norm": 0.05297478288412094, + "kl": 0.0005311965942382812, + "learning_rate": 2.4000000000000003e-06, + "loss": -0.0405, + "step": 36 + }, + { + "clip_ratio": 0.003895837697200477, + "completion_length": 736.4464721679688, + "epoch": 0.0013805326244111747, + "grad_norm": 0.06452920287847519, + "kl": 0.0005474090576171875, + "learning_rate": 2.466666666666667e-06, + "loss": 0.006, + "num_tokens": 987479.0, + "reward": 0.4399922415614128, + "reward_std": 0.33294960111379623, + "rewards/code_reward": 0.4374922141432762, + "rewards/format_reward": 0.024999999441206455, + "step": 37 + }, + { + "clip_ratio": 0.0041656927787698805, + "epoch": 0.0014178443169628283, + "grad_norm": 0.0600845105946064, + "kl": 0.0005588531494140625, + "learning_rate": 2.5333333333333338e-06, + "loss": 0.006, + "step": 38 + }, + { + "clip_ratio": 0.003465810848865658, + "epoch": 0.0014551560095144816, + "grad_norm": 0.05872698500752449, + "kl": 0.0005807876586914062, + "learning_rate": 2.6e-06, + "loss": 0.0058, + "step": 39 + }, + { + "clip_ratio": 0.0035018770722672343, + "completion_length": 855.2857666015625, + "epoch": 0.001492467702066135, + "grad_norm": 0.05717320367693901, + "kl": 0.00057220458984375, + "learning_rate": 2.666666666666667e-06, + "loss": -0.0357, + "num_tokens": 1069157.0, + "reward": 0.2800316959619522, + "reward_std": 0.3199797496199608, + "rewards/code_reward": 0.2762816809117794, + "rewards/format_reward": 0.03749999881256372, + "step": 40 + }, + { + "clip_ratio": 0.003570113331079483, + "epoch": 0.0015297793946177883, + "grad_norm": 0.05596914514899254, + "kl": 0.0005483627319335938, + "learning_rate": 2.7333333333333336e-06, + "loss": -0.0359, + "step": 41 + }, + { + "clip_ratio": 0.0032328887027688324, + "epoch": 0.0015670910871694416, + "grad_norm": 0.057716649025678635, + "kl": 0.0005865097045898438, + "learning_rate": 2.8000000000000003e-06, + "loss": -0.036, + "step": 42 + }, + { + "clip_ratio": 0.004469216102734208, + "completion_length": 811.2678833007812, + "epoch": 0.0016044027797210952, + "grad_norm": 0.07835095375776291, + "kl": 0.0004749298095703125, + "learning_rate": 2.866666666666667e-06, + "loss": 0.0076, + "num_tokens": 1151940.0, + "reward": 0.1869555363082327, + "reward_std": 0.26948040002025664, + "rewards/code_reward": 0.18570553697645664, + "rewards/format_reward": 0.012500000186264515, + "step": 43 + }, + { + "clip_ratio": 0.003966361866332591, + "epoch": 0.0016417144722727485, + "grad_norm": 0.07419681549072266, + "kl": 0.0004496574401855469, + "learning_rate": 2.9333333333333338e-06, + "loss": 0.0076, + "step": 44 + }, + { + "clip_ratio": 0.004174922010861337, + "epoch": 0.0016790261648244019, + "grad_norm": 0.06450870633125305, + "kl": 0.000469207763671875, + "learning_rate": 3e-06, + "loss": 0.0078, + "step": 45 + }, + { + "clip_ratio": 0.003967344993725419, + "completion_length": 853.3393249511719, + "epoch": 0.0017163378573760552, + "grad_norm": 0.059824056923389435, + "kl": 0.0004911422729492188, + "learning_rate": 3.066666666666667e-06, + "loss": -0.0034, + "num_tokens": 1241377.0, + "reward": 0.20044754585251212, + "reward_std": 0.17971461778506637, + "rewards/code_reward": 0.18955466337502003, + "rewards/format_reward": 0.10892857261933386, + "step": 46 + }, + { + "clip_ratio": 0.003426427545491606, + "epoch": 0.0017536495499277085, + "grad_norm": 0.0592130608856678, + "kl": 0.0005521774291992188, + "learning_rate": 3.133333333333334e-06, + "loss": -0.0036, + "step": 47 + }, + { + "clip_ratio": 0.0040143177029676735, + "epoch": 0.0017909612424793619, + "grad_norm": 0.0592193678021431, + "kl": 0.0005445480346679688, + "learning_rate": 3.2000000000000003e-06, + "loss": -0.0035, + "step": 48 + }, + { + "clip_ratio": 0.004490456718485802, + "completion_length": 860.8571624755859, + "epoch": 0.0018282729350310154, + "grad_norm": 0.06518423557281494, + "kl": 0.000591278076171875, + "learning_rate": 3.266666666666667e-06, + "loss": 0.0089, + "num_tokens": 1327805.0, + "reward": 0.19068512473313604, + "reward_std": 0.23456245294073597, + "rewards/code_reward": 0.18818511627614498, + "rewards/format_reward": 0.02500000048894435, + "step": 49 + }, + { + "clip_ratio": 0.004301088105421513, + "epoch": 0.0018655846275826688, + "grad_norm": 0.06832009553909302, + "kl": 0.0006055831909179688, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0087, + "step": 50 + }, + { + "clip_ratio": 0.004653123498428613, + "epoch": 0.001902896320134322, + "grad_norm": 0.062982477247715, + "kl": 0.0006322860717773438, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.009, + "step": 51 + }, + { + "clip_ratio": 0.0042314904276281595, + "completion_length": 835.7500305175781, + "epoch": 0.0019402080126859754, + "grad_norm": 0.06419790536165237, + "kl": 0.0006418228149414062, + "learning_rate": 3.4666666666666672e-06, + "loss": 0.0029, + "num_tokens": 1405071.0, + "reward": 0.010999086720403284, + "reward_std": 0.03040636470541358, + "rewards/code_reward": 0.010106228815857321, + "rewards/format_reward": 0.008928571827709675, + "step": 52 + }, + { + "clip_ratio": 0.004082762636244297, + "epoch": 0.0019775197052376288, + "grad_norm": 0.06244908645749092, + "kl": 0.0006628036499023438, + "learning_rate": 3.5333333333333335e-06, + "loss": 0.003, + "step": 53 + }, + { + "clip_ratio": 0.004621072090230882, + "epoch": 0.0020148313977892823, + "grad_norm": 0.06093616038560867, + "kl": 0.0006303787231445312, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.003, + "step": 54 + }, + { + "clip_ratio": 0.0033400111133232713, + "completion_length": 884.4286193847656, + "epoch": 0.0020521430903409354, + "grad_norm": 0.06252560019493103, + "kl": 0.0005865097045898438, + "learning_rate": 3.6666666666666666e-06, + "loss": 0.0102, + "num_tokens": 1490579.0, + "reward": 0.3763005882501602, + "reward_std": 0.3044521752744913, + "rewards/code_reward": 0.37558628618717194, + "rewards/format_reward": 0.0071428571827709675, + "step": 55 + }, + { + "clip_ratio": 0.003634348511695862, + "epoch": 0.002089454782892589, + "grad_norm": 0.05814225226640701, + "kl": 0.0006551742553710938, + "learning_rate": 3.7333333333333337e-06, + "loss": 0.0103, + "step": 56 + }, + { + "clip_ratio": 0.00332397484453395, + "epoch": 0.0021267664754442426, + "grad_norm": 0.05923973396420479, + "kl": 0.00066375732421875, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.0102, + "step": 57 + }, + { + "clip_ratio": 0.0039900586707517505, + "completion_length": 809.1428985595703, + "epoch": 0.0021640781679958957, + "grad_norm": 0.051192838698625565, + "kl": 0.0008678436279296875, + "learning_rate": 3.866666666666667e-06, + "loss": -0.0014, + "num_tokens": 1575189.0, + "reward": 0.15413165837526321, + "reward_std": 0.28608471155166626, + "rewards/code_reward": 0.15056023001670837, + "rewards/format_reward": 0.035714286379516125, + "step": 58 + }, + { + "clip_ratio": 0.003875496331602335, + "epoch": 0.0022013898605475492, + "grad_norm": 0.0492444708943367, + "kl": 0.0009059906005859375, + "learning_rate": 3.9333333333333335e-06, + "loss": -0.0015, + "step": 59 + }, + { + "clip_ratio": 0.0037796080578118563, + "epoch": 0.0022387015530992023, + "grad_norm": 0.048009127378463745, + "kl": 0.0009241104125976562, + "learning_rate": 4.000000000000001e-06, + "loss": -0.0014, + "step": 60 + }, + { + "clip_ratio": 0.004798175417818129, + "completion_length": 650.6428833007812, + "epoch": 0.002276013245650856, + "grad_norm": 0.09358293563127518, + "kl": 0.0021953582763671875, + "learning_rate": 4.066666666666667e-06, + "loss": 0.0301, + "num_tokens": 1635565.0, + "reward": 0.17146163003053516, + "reward_std": 0.24065554316621274, + "rewards/code_reward": 0.15806878110743128, + "rewards/format_reward": 0.13392857927829027, + "step": 61 + }, + { + "clip_ratio": 0.004958610923495144, + "epoch": 0.002313324938202509, + "grad_norm": 0.06897728145122528, + "kl": 0.0033979415893554688, + "learning_rate": 4.133333333333333e-06, + "loss": 0.0298, + "step": 62 + }, + { + "clip_ratio": 0.004681366495788097, + "epoch": 0.0023506366307541626, + "grad_norm": 0.06931719183921814, + "kl": 0.004828453063964844, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.0297, + "step": 63 + }, + { + "clip_ratio": 0.0033571755629964173, + "completion_length": 1005.1607513427734, + "epoch": 0.002387948323305816, + "grad_norm": 0.0562518872320652, + "kl": 0.0010433197021484375, + "learning_rate": 4.266666666666668e-06, + "loss": 0.0017, + "num_tokens": 1729304.0, + "reward": 0.4781405180692673, + "reward_std": 0.3283190652728081, + "rewards/code_reward": 0.47456907480955124, + "rewards/format_reward": 0.0357142873108387, + "step": 64 + }, + { + "clip_ratio": 0.00342867209110409, + "epoch": 0.0024252600158574692, + "grad_norm": 0.05584600195288658, + "kl": 0.001285552978515625, + "learning_rate": 4.333333333333334e-06, + "loss": 0.0017, + "step": 65 + }, + { + "clip_ratio": 0.0039420510875061154, + "epoch": 0.002462571708409123, + "grad_norm": 0.05655444413423538, + "kl": 0.0016050338745117188, + "learning_rate": 4.4e-06, + "loss": 0.0017, + "step": 66 + }, + { + "clip_ratio": 0.0026107701705768704, + "completion_length": 817.482177734375, + "epoch": 0.002499883400960776, + "grad_norm": 0.058583132922649384, + "kl": 0.00113677978515625, + "learning_rate": 4.4666666666666665e-06, + "loss": -0.0102, + "num_tokens": 1810937.0, + "reward": 0.2550000019837171, + "reward_std": 0.014669906813651323, + "rewards/code_reward": 0.25, + "rewards/format_reward": 0.04999999888241291, + "step": 67 + }, + { + "clip_ratio": 0.0028111067367717624, + "epoch": 0.0025371950935124295, + "grad_norm": 0.05796598270535469, + "kl": 0.001255035400390625, + "learning_rate": 4.533333333333334e-06, + "loss": -0.0102, + "step": 68 + }, + { + "clip_ratio": 0.0029888658318668604, + "epoch": 0.002574506786064083, + "grad_norm": 0.05366106703877449, + "kl": 0.0014934539794921875, + "learning_rate": 4.600000000000001e-06, + "loss": -0.0103, + "step": 69 + }, + { + "clip_ratio": 0.00462610088288784, + "completion_length": 717.357177734375, + "epoch": 0.002611818478615736, + "grad_norm": 0.0670144259929657, + "kl": 0.006500244140625, + "learning_rate": 4.666666666666667e-06, + "loss": 0.0078, + "num_tokens": 1878941.0, + "reward": 0.3764443531399593, + "reward_std": 0.0955456921365112, + "rewards/code_reward": 0.37198004126548767, + "rewards/format_reward": 0.044642859138548374, + "step": 70 + }, + { + "clip_ratio": 0.004358199308626354, + "epoch": 0.0026491301711673897, + "grad_norm": 0.06330101937055588, + "kl": 0.0124664306640625, + "learning_rate": 4.7333333333333335e-06, + "loss": 0.0075, + "step": 71 + }, + { + "clip_ratio": 0.004729703883640468, + "epoch": 0.002686441863719043, + "grad_norm": 0.07420779764652252, + "kl": 0.0200958251953125, + "learning_rate": 4.800000000000001e-06, + "loss": 0.0078, + "step": 72 + }, + { + "clip_ratio": 0.004732472589239478, + "completion_length": 775.8214569091797, + "epoch": 0.0027237535562706964, + "grad_norm": 0.06615617126226425, + "kl": 0.0208282470703125, + "learning_rate": 4.866666666666667e-06, + "loss": 0.0026, + "num_tokens": 1955133.0, + "reward": 0.2911364585161209, + "reward_std": 0.2811088655143976, + "rewards/code_reward": 0.28845785185694695, + "rewards/format_reward": 0.026785715483129025, + "step": 73 + }, + { + "clip_ratio": 0.0042842793627642095, + "epoch": 0.0027610652488223495, + "grad_norm": 0.06702057272195816, + "kl": 0.019195556640625, + "learning_rate": 4.933333333333334e-06, + "loss": 0.0025, + "step": 74 + }, + { + "clip_ratio": 0.004420529876369983, + "epoch": 0.002798376941374003, + "grad_norm": 0.07710850238800049, + "kl": 0.021026611328125, + "learning_rate": 5e-06, + "loss": 0.0024, + "step": 75 + }, + { + "clip_ratio": 0.004974359122570604, + "completion_length": 715.3393249511719, + "epoch": 0.0028356886339256566, + "grad_norm": 0.08342790603637695, + "kl": 0.031095504760742188, + "learning_rate": 5.0666666666666676e-06, + "loss": 0.0082, + "num_tokens": 2039544.0, + "reward": 0.42287752963602543, + "reward_std": 0.38356511667370796, + "rewards/code_reward": 0.39519896917045116, + "rewards/format_reward": 0.27678572852164507, + "step": 76 + }, + { + "clip_ratio": 0.004815920896362513, + "epoch": 0.0028730003264773097, + "grad_norm": 0.08132918924093246, + "kl": 0.020502090454101562, + "learning_rate": 5.133333333333334e-06, + "loss": 0.0086, + "step": 77 + }, + { + "clip_ratio": 0.005259909084998071, + "epoch": 0.0029103120190289633, + "grad_norm": 0.07938116043806076, + "kl": 0.021482467651367188, + "learning_rate": 5.2e-06, + "loss": 0.0085, + "step": 78 + }, + { + "clip_ratio": 0.0048945666057989, + "completion_length": 975.3929290771484, + "epoch": 0.0029476237115806164, + "grad_norm": 0.09060864895582199, + "kl": 0.010547637939453125, + "learning_rate": 5.2666666666666665e-06, + "loss": 0.0129, + "num_tokens": 2133016.0, + "reward": 0.1108615673147142, + "reward_std": 0.12990539567545056, + "rewards/code_reward": 0.09621870703995228, + "rewards/format_reward": 0.1464285715483129, + "step": 79 + }, + { + "clip_ratio": 0.004619743092916906, + "epoch": 0.00298493540413227, + "grad_norm": 0.11827455461025238, + "kl": 0.013896942138671875, + "learning_rate": 5.333333333333334e-06, + "loss": 0.0128, + "step": 80 + }, + { + "clip_ratio": 0.0047619243850931525, + "epoch": 0.0030222470966839235, + "grad_norm": 0.0751243531703949, + "kl": 0.013629913330078125, + "learning_rate": 5.400000000000001e-06, + "loss": 0.0127, + "step": 81 + }, + { + "clip_ratio": 0.003945668286178261, + "completion_length": 785.2500305175781, + "epoch": 0.0030595587892355766, + "grad_norm": 0.0671067014336586, + "kl": 0.013296127319335938, + "learning_rate": 5.466666666666667e-06, + "loss": 0.0159, + "num_tokens": 2207398.0, + "reward": 0.27845072373747826, + "reward_std": 0.42263518273830414, + "rewards/code_reward": 0.26952214911580086, + "rewards/format_reward": 0.0892857126891613, + "step": 82 + }, + { + "clip_ratio": 0.0043056191061623394, + "epoch": 0.00309687048178723, + "grad_norm": 0.058061935007572174, + "kl": 0.013570785522460938, + "learning_rate": 5.533333333333334e-06, + "loss": 0.0159, + "step": 83 + }, + { + "clip_ratio": 0.004146330640651286, + "epoch": 0.0031341821743388833, + "grad_norm": 0.05876903608441353, + "kl": 0.014863967895507812, + "learning_rate": 5.600000000000001e-06, + "loss": 0.0158, + "step": 84 + }, + { + "clip_ratio": 0.004401201207656413, + "completion_length": 772.9464721679688, + "epoch": 0.003171493866890537, + "grad_norm": 0.07271997630596161, + "kl": 0.016445159912109375, + "learning_rate": 5.666666666666667e-06, + "loss": 0.0063, + "num_tokens": 2271837.0, + "reward": 0.31266172288451344, + "reward_std": 0.27769914641976357, + "rewards/code_reward": 0.29784028604626656, + "rewards/format_reward": 0.14821428433060646, + "step": 85 + }, + { + "clip_ratio": 0.004530630074441433, + "epoch": 0.0032088055594421904, + "grad_norm": 0.13163886964321136, + "kl": 0.022003173828125, + "learning_rate": 5.733333333333334e-06, + "loss": 0.0063, + "step": 86 + }, + { + "clip_ratio": 0.004695651703514159, + "epoch": 0.0032461172519938435, + "grad_norm": 0.0759764090180397, + "kl": 0.020854949951171875, + "learning_rate": 5.8e-06, + "loss": 0.0063, + "step": 87 + }, + { + "clip_ratio": 0.004327840346377343, + "completion_length": 780.8036041259766, + "epoch": 0.003283428944545497, + "grad_norm": 0.07094255834817886, + "kl": 0.02121734619140625, + "learning_rate": 5.8666666666666675e-06, + "loss": 0.0107, + "num_tokens": 2353362.0, + "reward": 0.4110119305551052, + "reward_std": 0.27901993272826076, + "rewards/code_reward": 0.40119049698114395, + "rewards/format_reward": 0.09821428172290325, + "step": 88 + }, + { + "clip_ratio": 0.0044831542181782424, + "epoch": 0.00332074063709715, + "grad_norm": 0.0691906288266182, + "kl": 0.020843505859375, + "learning_rate": 5.933333333333335e-06, + "loss": 0.0107, + "step": 89 + }, + { + "clip_ratio": 0.004610029864124954, + "epoch": 0.0033580523296488037, + "grad_norm": 0.06926675885915756, + "kl": 0.020450592041015625, + "learning_rate": 6e-06, + "loss": 0.0107, + "step": 90 + }, + { + "clip_ratio": 0.004367675574030727, + "completion_length": 823.6250457763672, + "epoch": 0.003395364022200457, + "grad_norm": 0.06650792062282562, + "kl": 0.0282135009765625, + "learning_rate": 6.066666666666667e-06, + "loss": 0.0303, + "num_tokens": 2424475.0, + "reward": 0.2385554249631241, + "reward_std": 0.12726482166908681, + "rewards/code_reward": 0.2280196992214769, + "rewards/format_reward": 0.10535714542493224, + "step": 91 + }, + { + "clip_ratio": 0.005137363099493086, + "epoch": 0.0034326757147521104, + "grad_norm": 0.08006864786148071, + "kl": 0.03302001953125, + "learning_rate": 6.133333333333334e-06, + "loss": 0.0304, + "step": 92 + }, + { + "clip_ratio": 0.005121207621414214, + "epoch": 0.003469987407303764, + "grad_norm": 0.07004494220018387, + "kl": 0.0242462158203125, + "learning_rate": 6.200000000000001e-06, + "loss": 0.0302, + "step": 93 + }, + { + "clip_ratio": 0.003610322601161897, + "completion_length": 815.7143249511719, + "epoch": 0.003507299099855417, + "grad_norm": 0.06056816503405571, + "kl": 0.0223846435546875, + "learning_rate": 6.266666666666668e-06, + "loss": -0.0026, + "num_tokens": 2506723.0, + "reward": 0.3572045974433422, + "reward_std": 0.3584906868636608, + "rewards/code_reward": 0.34345458820462227, + "rewards/format_reward": 0.13750000298023224, + "step": 94 + }, + { + "clip_ratio": 0.003887994505930692, + "epoch": 0.0035446107924070706, + "grad_norm": 0.06221781671047211, + "kl": 0.0209197998046875, + "learning_rate": 6.333333333333333e-06, + "loss": -0.0029, + "step": 95 + }, + { + "clip_ratio": 0.0038092390750534832, + "epoch": 0.0035819224849587238, + "grad_norm": 0.058073509484529495, + "kl": 0.0216522216796875, + "learning_rate": 6.4000000000000006e-06, + "loss": -0.0029, + "step": 96 + }, + { + "clip_ratio": 0.0024879479606170207, + "completion_length": 740.1964416503906, + "epoch": 0.0036192341775103773, + "grad_norm": 0.05851594731211662, + "kl": 0.02142333984375, + "learning_rate": 6.466666666666667e-06, + "loss": 0.0104, + "num_tokens": 2582110.0, + "reward": 0.3558743689209223, + "reward_std": 0.26609608344733715, + "rewards/code_reward": 0.3315886799246073, + "rewards/format_reward": 0.24285713955760002, + "step": 97 + }, + { + "clip_ratio": 0.0026627944316715, + "epoch": 0.003656545870062031, + "grad_norm": 0.05821840465068817, + "kl": 0.0213165283203125, + "learning_rate": 6.533333333333334e-06, + "loss": 0.0103, + "step": 98 + }, + { + "clip_ratio": 0.002564574417192489, + "epoch": 0.003693857562613684, + "grad_norm": 0.057189542800188065, + "kl": 0.0175018310546875, + "learning_rate": 6.600000000000001e-06, + "loss": 0.0103, + "step": 99 + }, + { + "clip_ratio": 0.004392108181491494, + "completion_length": 785.2857666015625, + "epoch": 0.0037311692551653375, + "grad_norm": 0.08328428864479065, + "kl": 0.011859893798828125, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0062, + "num_tokens": 2649592.0, + "reward": 0.17935334704816341, + "reward_std": 0.14873511716723442, + "rewards/code_reward": 0.1584604736417532, + "rewards/format_reward": 0.20892857806757092, + "step": 100 + }, + { + "clip_ratio": 0.0041271685040555894, + "epoch": 0.0037684809477169907, + "grad_norm": 0.06999722123146057, + "kl": 0.013751983642578125, + "learning_rate": 6.733333333333334e-06, + "loss": 0.0061, + "step": 101 + }, + { + "clip_ratio": 0.0044279383146204054, + "epoch": 0.003805792640268644, + "grad_norm": 0.071166031062603, + "kl": 0.014373779296875, + "learning_rate": 6.800000000000001e-06, + "loss": 0.006, + "step": 102 + }, + { + "clip_ratio": 0.0036900777486152947, + "completion_length": 745.732177734375, + "epoch": 0.0038431043328202978, + "grad_norm": 0.0611068531870842, + "kl": 0.02685546875, + "learning_rate": 6.866666666666667e-06, + "loss": 0.017, + "num_tokens": 2722727.0, + "reward": 0.14512375323101878, + "reward_std": 0.19863351341336966, + "rewards/code_reward": 0.13048090878874063, + "rewards/format_reward": 0.1464285748079419, + "step": 103 + }, + { + "clip_ratio": 0.0038304394111037254, + "epoch": 0.003880416025371951, + "grad_norm": 0.06294312328100204, + "kl": 0.027008056640625, + "learning_rate": 6.9333333333333344e-06, + "loss": 0.0173, + "step": 104 + }, + { + "clip_ratio": 0.0041868246626108885, + "epoch": 0.0039177277179236044, + "grad_norm": 0.05673138052225113, + "kl": 0.02336883544921875, + "learning_rate": 7e-06, + "loss": 0.017, + "step": 105 + }, + { + "clip_ratio": 0.005061717121861875, + "completion_length": 757.4464645385742, + "epoch": 0.0039550394104752576, + "grad_norm": 0.0737903043627739, + "kl": 0.012542724609375, + "learning_rate": 7.066666666666667e-06, + "loss": -0.0132, + "num_tokens": 2791422.0, + "reward": 0.06232143472880125, + "reward_std": 0.1196102099493146, + "rewards/code_reward": 0.0535714291036129, + "rewards/format_reward": 0.08750000223517418, + "step": 106 + }, + { + "clip_ratio": 0.004666511667892337, + "epoch": 0.003992351103026911, + "grad_norm": 0.06583179533481598, + "kl": 0.01580810546875, + "learning_rate": 7.133333333333334e-06, + "loss": -0.0134, + "step": 107 + }, + { + "clip_ratio": 0.004926500143483281, + "epoch": 0.004029662795578565, + "grad_norm": 0.06810552626848221, + "kl": 0.0206756591796875, + "learning_rate": 7.2000000000000005e-06, + "loss": -0.0134, + "step": 108 + }, + { + "clip_ratio": 0.0028108953847549856, + "completion_length": 680.1250305175781, + "epoch": 0.004066974488130218, + "grad_norm": 0.08575601130723953, + "kl": 0.04571533203125, + "learning_rate": 7.266666666666668e-06, + "loss": 0.0525, + "num_tokens": 2851951.0, + "reward": 0.7244047522544861, + "reward_std": 0.33616480231285095, + "rewards/code_reward": 0.6976190358400345, + "rewards/format_reward": 0.2678571380674839, + "step": 109 + }, + { + "clip_ratio": 0.002869019008358009, + "epoch": 0.004104286180681871, + "grad_norm": 0.07259955257177353, + "kl": 0.036468505859375, + "learning_rate": 7.333333333333333e-06, + "loss": 0.0523, + "step": 110 + }, + { + "clip_ratio": 0.003551799338310957, + "epoch": 0.004141597873233525, + "grad_norm": 0.07586399465799332, + "kl": 0.0316162109375, + "learning_rate": 7.4e-06, + "loss": 0.0522, + "step": 111 + }, + { + "clip_ratio": 0.003227207635063678, + "completion_length": 857.1607666015625, + "epoch": 0.004178909565785178, + "grad_norm": 0.06522615253925323, + "kl": 0.030364990234375, + "learning_rate": 7.4666666666666675e-06, + "loss": -0.0111, + "num_tokens": 2937990.0, + "reward": 0.5056872181594372, + "reward_std": 0.35115163773298264, + "rewards/code_reward": 0.48729434609413147, + "rewards/format_reward": 0.18392857257276773, + "step": 112 + }, + { + "clip_ratio": 0.003503412677673623, + "epoch": 0.004216221258336831, + "grad_norm": 0.061381928622722626, + "kl": 0.0333251953125, + "learning_rate": 7.533333333333334e-06, + "loss": -0.0111, + "step": 113 + }, + { + "clip_ratio": 0.0038908764836378396, + "epoch": 0.004253532950888485, + "grad_norm": 0.05990113690495491, + "kl": 0.035186767578125, + "learning_rate": 7.600000000000001e-06, + "loss": -0.0111, + "step": 114 + }, + { + "clip_ratio": 0.0032012666924856603, + "completion_length": 750.5893096923828, + "epoch": 0.004290844643440138, + "grad_norm": 0.07089807838201523, + "kl": 0.016693115234375, + "learning_rate": 7.666666666666667e-06, + "loss": 0.012, + "num_tokens": 3008933.0, + "reward": 0.6490611638873816, + "reward_std": 0.28249630704522133, + "rewards/code_reward": 0.6149539947509766, + "rewards/format_reward": 0.3410714380443096, + "step": 115 + }, + { + "clip_ratio": 0.0033260583877563477, + "epoch": 0.004328156335991791, + "grad_norm": 0.0705995038151741, + "kl": 0.01636505126953125, + "learning_rate": 7.733333333333334e-06, + "loss": 0.0121, + "step": 116 + }, + { + "clip_ratio": 0.0034832734963856637, + "epoch": 0.0043654680285434445, + "grad_norm": 0.06954685598611832, + "kl": 0.0183258056640625, + "learning_rate": 7.800000000000002e-06, + "loss": 0.0117, + "step": 117 + }, + { + "clip_ratio": 0.003809009911492467, + "completion_length": 876.1250305175781, + "epoch": 0.0044027797210950985, + "grad_norm": 0.08215485513210297, + "kl": 0.01677703857421875, + "learning_rate": 7.866666666666667e-06, + "loss": 0.0051, + "num_tokens": 3093752.0, + "reward": 0.2799456426873803, + "reward_std": 0.16286637261509895, + "rewards/code_reward": 0.23923134431242943, + "rewards/format_reward": 0.40714286267757416, + "step": 118 + }, + { + "clip_ratio": 0.0038696398842148483, + "epoch": 0.004440091413646752, + "grad_norm": 0.07865402847528458, + "kl": 0.01723480224609375, + "learning_rate": 7.933333333333334e-06, + "loss": 0.005, + "step": 119 + }, + { + "clip_ratio": 0.0045727608958259225, + "epoch": 0.004477403106198405, + "grad_norm": 0.07361030578613281, + "kl": 0.0187530517578125, + "learning_rate": 8.000000000000001e-06, + "loss": 0.0049, + "step": 120 + }, + { + "clip_ratio": 0.004446155042387545, + "completion_length": 699.2500152587891, + "epoch": 0.004514714798750059, + "grad_norm": 0.08172205835580826, + "kl": 0.0214996337890625, + "learning_rate": 8.066666666666667e-06, + "loss": 0.0019, + "num_tokens": 3162352.0, + "reward": 0.22111177747137845, + "reward_std": 0.14624769520014524, + "rewards/code_reward": 0.17914746701717377, + "rewards/format_reward": 0.4196428470313549, + "step": 121 + }, + { + "clip_ratio": 0.004072262905538082, + "epoch": 0.004552026491301712, + "grad_norm": 0.11189721524715424, + "kl": 0.023529052734375, + "learning_rate": 8.133333333333334e-06, + "loss": 0.0017, + "step": 122 + }, + { + "clip_ratio": 0.005029312625993043, + "epoch": 0.004589338183853365, + "grad_norm": 0.08434264361858368, + "kl": 0.0277557373046875, + "learning_rate": 8.2e-06, + "loss": 0.0015, + "step": 123 + }, + { + "clip_ratio": 0.003993882273789495, + "completion_length": 652.5714569091797, + "epoch": 0.004626649876405018, + "grad_norm": 0.07539169490337372, + "kl": 0.0352783203125, + "learning_rate": 8.266666666666667e-06, + "loss": 0.0074, + "num_tokens": 3228590.0, + "reward": 0.4290756364353001, + "reward_std": 0.28088532760739326, + "rewards/code_reward": 0.3944327626377344, + "rewards/format_reward": 0.3464285768568516, + "step": 124 + }, + { + "clip_ratio": 0.004117927455808967, + "epoch": 0.004663961568956672, + "grad_norm": 0.0732932761311531, + "kl": 0.03143310546875, + "learning_rate": 8.333333333333334e-06, + "loss": 0.0074, + "step": 125 + }, + { + "clip_ratio": 0.00439747708151117, + "epoch": 0.004701273261508325, + "grad_norm": 0.07341460883617401, + "kl": 0.027679443359375, + "learning_rate": 8.400000000000001e-06, + "loss": 0.0071, + "step": 126 + }, + { + "clip_ratio": 0.004193307424429804, + "completion_length": 847.9286041259766, + "epoch": 0.004738584954059978, + "grad_norm": 0.08505049347877502, + "kl": 0.0175628662109375, + "learning_rate": 8.466666666666668e-06, + "loss": 0.0258, + "num_tokens": 3314672.0, + "reward": 0.4675363376736641, + "reward_std": 0.19064880069345236, + "rewards/code_reward": 0.4253934621810913, + "rewards/format_reward": 0.4214285798370838, + "step": 127 + }, + { + "clip_ratio": 0.0044250357314012945, + "epoch": 0.004775896646611632, + "grad_norm": 0.07868269085884094, + "kl": 0.024078369140625, + "learning_rate": 8.533333333333335e-06, + "loss": 0.0261, + "step": 128 + }, + { + "clip_ratio": 0.004353826632723212, + "epoch": 0.004813208339163285, + "grad_norm": 0.08744551986455917, + "kl": 0.027069091796875, + "learning_rate": 8.6e-06, + "loss": 0.0255, + "step": 129 + }, + { + "clip_ratio": 0.004396414034999907, + "completion_length": 949.7679138183594, + "epoch": 0.0048505200317149385, + "grad_norm": 0.07004830241203308, + "kl": 0.0205230712890625, + "learning_rate": 8.666666666666668e-06, + "loss": -0.0096, + "num_tokens": 3403279.0, + "reward": 0.14528846414759755, + "reward_std": 0.15291004441678524, + "rewards/code_reward": 0.10885989118833095, + "rewards/format_reward": 0.3642857074737549, + "step": 130 + }, + { + "clip_ratio": 0.003812372393440455, + "epoch": 0.004887831724266592, + "grad_norm": 0.06713265180587769, + "kl": 0.0224151611328125, + "learning_rate": 8.733333333333333e-06, + "loss": -0.0099, + "step": 131 + }, + { + "clip_ratio": 0.00431096414104104, + "epoch": 0.004925143416818246, + "grad_norm": 0.06553714722394943, + "kl": 0.0241546630859375, + "learning_rate": 8.8e-06, + "loss": -0.01, + "step": 132 + }, + { + "clip_ratio": 0.004329068353399634, + "completion_length": 811.3928833007812, + "epoch": 0.004962455109369899, + "grad_norm": 0.3049370050430298, + "kl": 0.189483642578125, + "learning_rate": 8.866666666666668e-06, + "loss": -0.0016, + "num_tokens": 3489107.0, + "reward": 0.44756443426012993, + "reward_std": 0.30533435568213463, + "rewards/code_reward": 0.4114929661154747, + "rewards/format_reward": 0.36071427911520004, + "step": 133 + }, + { + "clip_ratio": 0.004392658127471805, + "epoch": 0.004999766801921552, + "grad_norm": 0.06861816346645355, + "kl": 0.040771484375, + "learning_rate": 8.933333333333333e-06, + "loss": -0.0033, + "step": 134 + }, + { + "clip_ratio": 0.00485595635836944, + "epoch": 0.005037078494473206, + "grad_norm": 0.09656067192554474, + "kl": 0.030487060546875, + "learning_rate": 9e-06, + "loss": -0.0031, + "step": 135 + }, + { + "clip_ratio": 0.00377564417431131, + "completion_length": 842.7500305175781, + "epoch": 0.005074390187024859, + "grad_norm": 0.08491750061511993, + "kl": 0.045440673828125, + "learning_rate": 9.066666666666667e-06, + "loss": 0.0151, + "num_tokens": 3574241.0, + "reward": 0.3939283899962902, + "reward_std": 0.3176419213414192, + "rewards/code_reward": 0.34071411937475204, + "rewards/format_reward": 0.532142847776413, + "step": 136 + }, + { + "clip_ratio": 0.0037570124841295183, + "epoch": 0.005111701879576512, + "grad_norm": 0.07498861104249954, + "kl": 0.03704833984375, + "learning_rate": 9.133333333333335e-06, + "loss": 0.0151, + "step": 137 + }, + { + "clip_ratio": 0.0040102729690261185, + "epoch": 0.005149013572128166, + "grad_norm": 0.07352706044912338, + "kl": 0.0359649658203125, + "learning_rate": 9.200000000000002e-06, + "loss": 0.0148, + "step": 138 + }, + { + "clip_ratio": 0.0027137938886880875, + "completion_length": 823.9107513427734, + "epoch": 0.005186325264679819, + "grad_norm": 0.060176532715559006, + "kl": 0.014190673828125, + "learning_rate": 9.266666666666667e-06, + "loss": 0.0122, + "num_tokens": 3652132.0, + "reward": 0.5471478942781687, + "reward_std": 0.24464463628828526, + "rewards/code_reward": 0.5098264720290899, + "rewards/format_reward": 0.37321428395807743, + "step": 139 + }, + { + "clip_ratio": 0.003150615724734962, + "epoch": 0.005223636957231472, + "grad_norm": 0.06173131242394447, + "kl": 0.01312255859375, + "learning_rate": 9.333333333333334e-06, + "loss": 0.0122, + "step": 140 + }, + { + "clip_ratio": 0.0029662439483217895, + "epoch": 0.005260948649783125, + "grad_norm": 0.062065936625003815, + "kl": 0.0125579833984375, + "learning_rate": 9.4e-06, + "loss": 0.0118, + "step": 141 + }, + { + "clip_ratio": 0.004243574803695083, + "completion_length": 717.1250305175781, + "epoch": 0.005298260342334779, + "grad_norm": 0.08131963014602661, + "kl": 0.0191497802734375, + "learning_rate": 9.466666666666667e-06, + "loss": 0.004, + "num_tokens": 3716609.0, + "reward": 0.3606767915189266, + "reward_std": 0.2845412641763687, + "rewards/code_reward": 0.3258553398773074, + "rewards/format_reward": 0.3482142835855484, + "step": 142 + }, + { + "clip_ratio": 0.0037234838237054646, + "epoch": 0.0053355720348864325, + "grad_norm": 0.08170482516288757, + "kl": 0.02264404296875, + "learning_rate": 9.533333333333334e-06, + "loss": 0.0037, + "step": 143 + }, + { + "clip_ratio": 0.004228211124427617, + "epoch": 0.005372883727438086, + "grad_norm": 0.07855211943387985, + "kl": 0.0264892578125, + "learning_rate": 9.600000000000001e-06, + "loss": 0.0035, + "step": 144 + }, + { + "clip_ratio": 0.004712335299700499, + "completion_length": 1065.982162475586, + "epoch": 0.00541019541998974, + "grad_norm": 0.07606414705514908, + "kl": 0.01282501220703125, + "learning_rate": 9.666666666666667e-06, + "loss": 0.0322, + "num_tokens": 3828048.0, + "reward": 0.16933136992156506, + "reward_std": 0.26417561434209347, + "rewards/code_reward": 0.1254027932882309, + "rewards/format_reward": 0.4392857141792774, + "step": 145 + }, + { + "clip_ratio": 0.004639646445866674, + "epoch": 0.005447507112541393, + "grad_norm": 0.07048355787992477, + "kl": 0.01430511474609375, + "learning_rate": 9.733333333333334e-06, + "loss": 0.0321, + "step": 146 + }, + { + "clip_ratio": 0.0051269931718707085, + "epoch": 0.005484818805093046, + "grad_norm": 0.0750993862748146, + "kl": 0.01401519775390625, + "learning_rate": 9.800000000000001e-06, + "loss": 0.0319, + "step": 147 + }, + { + "clip_ratio": 0.004979864112101495, + "completion_length": 755.6250457763672, + "epoch": 0.005522130497644699, + "grad_norm": 0.07934028655290604, + "kl": 0.0195465087890625, + "learning_rate": 9.866666666666668e-06, + "loss": 0.0085, + "num_tokens": 3908443.0, + "reward": 0.4618741311132908, + "reward_std": 0.2800837457180023, + "rewards/code_reward": 0.4256241247057915, + "rewards/format_reward": 0.36250000074505806, + "step": 148 + }, + { + "clip_ratio": 0.004592386307194829, + "epoch": 0.005559442190196353, + "grad_norm": 0.07558207958936691, + "kl": 0.01892852783203125, + "learning_rate": 9.933333333333334e-06, + "loss": 0.0082, + "step": 149 + }, + { + "clip_ratio": 0.005219215294346213, + "epoch": 0.005596753882748006, + "grad_norm": 0.07289914786815643, + "kl": 0.0185394287109375, + "learning_rate": 1e-05, + "loss": 0.0078, + "step": 150 + }, + { + "clip_ratio": 0.004364718799479306, + "completion_length": 1058.482177734375, + "epoch": 0.005634065575299659, + "grad_norm": 0.07894004881381989, + "kl": 0.01422119140625, + "learning_rate": 9.999999055941794e-06, + "loss": 0.0401, + "num_tokens": 4009620.0, + "reward": 0.32742974907159805, + "reward_std": 0.3156113885343075, + "rewards/code_reward": 0.2774297362193465, + "rewards/format_reward": 0.4999999888241291, + "step": 151 + }, + { + "clip_ratio": 0.004118499637115747, + "epoch": 0.005671377267851313, + "grad_norm": 0.07215942442417145, + "kl": 0.0142669677734375, + "learning_rate": 9.999996223767578e-06, + "loss": 0.0399, + "step": 152 + }, + { + "clip_ratio": 0.0040166189428418875, + "epoch": 0.005708688960402966, + "grad_norm": 0.06812794506549835, + "kl": 0.015411376953125, + "learning_rate": 9.999991503478531e-06, + "loss": 0.0397, + "step": 153 + }, + { + "clip_ratio": 0.004809446632862091, + "completion_length": 756.5357513427734, + "epoch": 0.005746000652954619, + "grad_norm": 0.09445882588624954, + "kl": 0.021484375, + "learning_rate": 9.999984895076643e-06, + "loss": 0.0078, + "num_tokens": 4080098.0, + "reward": 0.4602966010570526, + "reward_std": 0.35954274237155914, + "rewards/code_reward": 0.42868945375084877, + "rewards/format_reward": 0.31607142463326454, + "step": 154 + }, + { + "clip_ratio": 0.005294816684909165, + "epoch": 0.005783312345506273, + "grad_norm": 0.08862800896167755, + "kl": 0.02349853515625, + "learning_rate": 9.999976398564682e-06, + "loss": 0.0078, + "step": 155 + }, + { + "clip_ratio": 0.005058512033428997, + "epoch": 0.0058206240380579265, + "grad_norm": 0.12003374844789505, + "kl": 0.0249481201171875, + "learning_rate": 9.999966013946214e-06, + "loss": 0.0074, + "step": 156 + }, + { + "clip_ratio": 0.004765102698002011, + "completion_length": 815.0893249511719, + "epoch": 0.00585793573060958, + "grad_norm": 0.08698784559965134, + "kl": 0.0325469970703125, + "learning_rate": 9.999953741225595e-06, + "loss": 0.0383, + "num_tokens": 4164355.0, + "reward": 0.5324112884700298, + "reward_std": 0.34566505625844, + "rewards/code_reward": 0.4774112543091178, + "rewards/format_reward": 0.5499999895691872, + "step": 157 + }, + { + "clip_ratio": 0.0046009664656594396, + "epoch": 0.005895247423161233, + "grad_norm": 0.08533735573291779, + "kl": 0.033966064453125, + "learning_rate": 9.999939580407976e-06, + "loss": 0.038, + "step": 158 + }, + { + "clip_ratio": 0.005055032204836607, + "epoch": 0.005932559115712887, + "grad_norm": 0.08573473244905472, + "kl": 0.035064697265625, + "learning_rate": 9.999923531499298e-06, + "loss": 0.0376, + "step": 159 + }, + { + "clip_ratio": 0.005541750229895115, + "completion_length": 1141.5536499023438, + "epoch": 0.00596987080826454, + "grad_norm": 0.13502679765224457, + "kl": 0.01441192626953125, + "learning_rate": 9.999905594506296e-06, + "loss": 0.0141, + "num_tokens": 4271794.0, + "reward": 0.46044908836483955, + "reward_std": 0.1258612610399723, + "rewards/code_reward": 0.4193776547908783, + "rewards/format_reward": 0.4107142882421613, + "step": 160 + }, + { + "clip_ratio": 0.006172261433675885, + "epoch": 0.006007182500816193, + "grad_norm": 0.08383465558290482, + "kl": 0.01598358154296875, + "learning_rate": 9.999885769436492e-06, + "loss": 0.0142, + "step": 161 + }, + { + "clip_ratio": 0.005938812857493758, + "epoch": 0.006044494193367847, + "grad_norm": 0.09828599542379379, + "kl": 0.01805877685546875, + "learning_rate": 9.99986405629821e-06, + "loss": 0.0135, + "step": 162 + }, + { + "clip_ratio": 0.004686150990892202, + "completion_length": 590.8393096923828, + "epoch": 0.0060818058859195, + "grad_norm": 0.0906926691532135, + "kl": 0.039093017578125, + "learning_rate": 9.999840455100557e-06, + "loss": 0.0311, + "num_tokens": 4332335.0, + "reward": 0.4910562364384532, + "reward_std": 0.15648668771609664, + "rewards/code_reward": 0.42123477160930634, + "rewards/format_reward": 0.6982142999768257, + "step": 163 + }, + { + "clip_ratio": 0.003891480155289173, + "epoch": 0.006119117578471153, + "grad_norm": 0.24608157575130463, + "kl": 0.04949951171875, + "learning_rate": 9.999814965853435e-06, + "loss": 0.0305, + "step": 164 + }, + { + "clip_ratio": 0.00454916077433154, + "epoch": 0.006156429271022806, + "grad_norm": 0.0903649851679802, + "kl": 0.05157470703125, + "learning_rate": 9.99978758856754e-06, + "loss": 0.0302, + "step": 165 + }, + { + "clip_ratio": 0.004579223459586501, + "completion_length": 729.0178985595703, + "epoch": 0.00619374096357446, + "grad_norm": 0.08453750610351562, + "kl": 0.03759765625, + "learning_rate": 9.99975832325436e-06, + "loss": -0.0043, + "num_tokens": 4410022.0, + "reward": 0.6663115471601486, + "reward_std": 0.21798086911439896, + "rewards/code_reward": 0.5932757817208767, + "rewards/format_reward": 0.7303571403026581, + "step": 166 + }, + { + "clip_ratio": 0.004504561948124319, + "epoch": 0.0062310526561261135, + "grad_norm": 0.07903100550174713, + "kl": 0.035797119140625, + "learning_rate": 9.999727169926172e-06, + "loss": -0.0047, + "step": 167 + }, + { + "clip_ratio": 0.005287665408104658, + "epoch": 0.006268364348677767, + "grad_norm": 0.07424798607826233, + "kl": 0.03765869140625, + "learning_rate": 9.99969412859605e-06, + "loss": -0.0051, + "step": 168 + }, + { + "clip_ratio": 0.004337002173997462, + "completion_length": 679.803596496582, + "epoch": 0.0063056760412294206, + "grad_norm": 0.09585179388523102, + "kl": 0.08111572265625, + "learning_rate": 9.999659199277855e-06, + "loss": 0.0137, + "num_tokens": 4485667.0, + "reward": 0.58072929084301, + "reward_std": 0.3670716658234596, + "rewards/code_reward": 0.5003721192479134, + "rewards/format_reward": 0.803571417927742, + "step": 169 + }, + { + "clip_ratio": 0.004886757174972445, + "epoch": 0.006342987733781074, + "grad_norm": 0.08919905126094818, + "kl": 0.07818603515625, + "learning_rate": 9.999622381986245e-06, + "loss": 0.0137, + "step": 170 + }, + { + "clip_ratio": 0.005444412236101925, + "epoch": 0.006380299426332727, + "grad_norm": 0.08996913582086563, + "kl": 0.0726318359375, + "learning_rate": 9.999583676736665e-06, + "loss": 0.0135, + "step": 171 + }, + { + "clip_ratio": 0.0037258476950228214, + "completion_length": 857.0536193847656, + "epoch": 0.006417611118884381, + "grad_norm": 0.08795832097530365, + "kl": 0.050262451171875, + "learning_rate": 9.999543083545357e-06, + "loss": 0.0188, + "num_tokens": 4567626.0, + "reward": 0.40649982169270515, + "reward_std": 0.24810676649212837, + "rewards/code_reward": 0.3345355335623026, + "rewards/format_reward": 0.7196428552269936, + "step": 172 + }, + { + "clip_ratio": 0.0037726404261775315, + "epoch": 0.006454922811436034, + "grad_norm": 0.07794350385665894, + "kl": 0.03704833984375, + "learning_rate": 9.999500602429353e-06, + "loss": 0.0186, + "step": 173 + }, + { + "clip_ratio": 0.004370673676021397, + "epoch": 0.006492234503987687, + "grad_norm": 0.09754373878240585, + "kl": 0.0372314453125, + "learning_rate": 9.999456233406477e-06, + "loss": 0.0185, + "step": 174 + }, + { + "clip_ratio": 0.003991663514170796, + "completion_length": 835.7321929931641, + "epoch": 0.00652954619653934, + "grad_norm": 0.08194305002689362, + "kl": 0.02874755859375, + "learning_rate": 9.999409976495346e-06, + "loss": -0.0029, + "num_tokens": 4644051.0, + "reward": 0.2902037026360631, + "reward_std": 0.25118159502744675, + "rewards/code_reward": 0.24002512518200092, + "rewards/format_reward": 0.5017857104539871, + "step": 175 + }, + { + "clip_ratio": 0.004291087097954005, + "epoch": 0.006566857889090994, + "grad_norm": 0.07858309149742126, + "kl": 0.030181884765625, + "learning_rate": 9.999361831715367e-06, + "loss": -0.0031, + "step": 176 + }, + { + "clip_ratio": 0.0047472158330492675, + "epoch": 0.006604169581642647, + "grad_norm": 0.07457469403743744, + "kl": 0.029022216796875, + "learning_rate": 9.999311799086742e-06, + "loss": -0.0036, + "step": 177 + }, + { + "clip_ratio": 0.0038054902106523514, + "completion_length": 876.8750228881836, + "epoch": 0.0066414812741943, + "grad_norm": 0.08074542135000229, + "kl": 0.043731689453125, + "learning_rate": 9.999259878630463e-06, + "loss": 0.0029, + "num_tokens": 4719336.0, + "reward": 0.683481615036726, + "reward_std": 0.18650846742093563, + "rewards/code_reward": 0.6118744499981403, + "rewards/format_reward": 0.7160714343190193, + "step": 178 + }, + { + "clip_ratio": 0.004614856210537255, + "epoch": 0.006678792966745954, + "grad_norm": 0.07612542062997818, + "kl": 0.04437255859375, + "learning_rate": 9.999206070368316e-06, + "loss": 0.0028, + "step": 179 + }, + { + "clip_ratio": 0.004698079021181911, + "epoch": 0.0067161046592976075, + "grad_norm": 0.07453392446041107, + "kl": 0.04296875, + "learning_rate": 9.999150374322878e-06, + "loss": 0.0026, + "step": 180 + }, + { + "clip_ratio": 0.005900516640394926, + "completion_length": 620.6071548461914, + "epoch": 0.006753416351849261, + "grad_norm": 0.0943993479013443, + "kl": 0.035064697265625, + "learning_rate": 9.999092790517517e-06, + "loss": -0.0048, + "num_tokens": 4785072.0, + "reward": 0.1649925298988819, + "reward_std": 0.13010887429118156, + "rewards/code_reward": 0.09552823193371296, + "rewards/format_reward": 0.6946428567171097, + "step": 181 + }, + { + "clip_ratio": 0.005969358026050031, + "epoch": 0.006790728044400914, + "grad_norm": 0.09552862495183945, + "kl": 0.037872314453125, + "learning_rate": 9.999033318976393e-06, + "loss": -0.0056, + "step": 182 + }, + { + "clip_ratio": 0.005916992900893092, + "epoch": 0.006828039736952568, + "grad_norm": 0.09560307115316391, + "kl": 0.0460205078125, + "learning_rate": 9.998971959724461e-06, + "loss": -0.0061, + "step": 183 + }, + { + "clip_ratio": 0.004060279985424131, + "completion_length": 916.1786041259766, + "epoch": 0.006865351429504221, + "grad_norm": 0.09530257433652878, + "kl": 0.082977294921875, + "learning_rate": 9.998908712787465e-06, + "loss": 0.0323, + "num_tokens": 4877734.0, + "reward": 0.23372449725866318, + "reward_std": 0.2029952723532915, + "rewards/code_reward": 0.16604592185467482, + "rewards/format_reward": 0.6767857223749161, + "step": 184 + }, + { + "clip_ratio": 0.004730395565275103, + "epoch": 0.006902663122055874, + "grad_norm": 0.082708440721035, + "kl": 0.054168701171875, + "learning_rate": 9.998843578191943e-06, + "loss": 0.0321, + "step": 185 + }, + { + "clip_ratio": 0.0051338220946490765, + "epoch": 0.006939974814607528, + "grad_norm": 0.07914624363183975, + "kl": 0.055389404296875, + "learning_rate": 9.998776555965224e-06, + "loss": 0.0319, + "step": 186 + }, + { + "clip_ratio": 0.003520705970004201, + "completion_length": 687.5178985595703, + "epoch": 0.006977286507159181, + "grad_norm": 0.07230006158351898, + "kl": 0.031951904296875, + "learning_rate": 9.998707646135433e-06, + "loss": -0.0014, + "num_tokens": 4944459.0, + "reward": 0.382739270105958, + "reward_std": 0.22917547076940536, + "rewards/code_reward": 0.3154178336262703, + "rewards/format_reward": 0.6732142865657806, + "step": 187 + }, + { + "clip_ratio": 0.0043439631699584424, + "epoch": 0.007014598199710834, + "grad_norm": 0.07766614854335785, + "kl": 0.034942626953125, + "learning_rate": 9.998636848731477e-06, + "loss": -0.0014, + "step": 188 + }, + { + "clip_ratio": 0.0047292670351453125, + "epoch": 0.007051909892262488, + "grad_norm": 0.05223187431693077, + "kl": 0.0367431640625, + "learning_rate": 9.998564163783062e-06, + "loss": -0.0017, + "step": 189 + }, + { + "clip_ratio": 0.003680915106087923, + "completion_length": 740.6607284545898, + "epoch": 0.007089221584814141, + "grad_norm": 0.09088468551635742, + "kl": 0.0355224609375, + "learning_rate": 9.998489591320691e-06, + "loss": 0.005, + "num_tokens": 5017408.0, + "reward": 0.6728540826588869, + "reward_std": 0.23156581912189722, + "rewards/code_reward": 0.5817825943231583, + "rewards/format_reward": 0.9107142686843872, + "step": 190 + }, + { + "clip_ratio": 0.004008536925539374, + "epoch": 0.007126533277365794, + "grad_norm": 0.0853656604886055, + "kl": 0.035308837890625, + "learning_rate": 9.99841313137565e-06, + "loss": 0.0047, + "step": 191 + }, + { + "clip_ratio": 0.004636025114450604, + "epoch": 0.0071638449699174475, + "grad_norm": 0.08043695241212845, + "kl": 0.034423828125, + "learning_rate": 9.998334783980016e-06, + "loss": 0.0043, + "step": 192 + }, + { + "clip_ratio": 0.0038101774407550693, + "completion_length": 784.3393249511719, + "epoch": 0.0072011566624691015, + "grad_norm": 0.09266630560159683, + "kl": 0.0565185546875, + "learning_rate": 9.998254549166669e-06, + "loss": -0.001, + "num_tokens": 5093825.0, + "reward": 0.29586358461529016, + "reward_std": 0.11458997335284948, + "rewards/code_reward": 0.22068501636385918, + "rewards/format_reward": 0.7517857179045677, + "step": 193 + }, + { + "clip_ratio": 0.004176330054178834, + "epoch": 0.007238468355020755, + "grad_norm": 0.0868118554353714, + "kl": 0.04876708984375, + "learning_rate": 9.998172426969268e-06, + "loss": -0.0014, + "step": 194 + }, + { + "clip_ratio": 0.004666382854338735, + "epoch": 0.007275780047572408, + "grad_norm": 0.08124133944511414, + "kl": 0.04986572265625, + "learning_rate": 9.998088417422275e-06, + "loss": -0.0019, + "step": 195 + }, + { + "clip_ratio": 0.0036231164122000337, + "completion_length": 639.3393096923828, + "epoch": 0.007313091740124062, + "grad_norm": 0.10257536917924881, + "kl": 0.040740966796875, + "learning_rate": 9.998002520560935e-06, + "loss": 0.0008, + "num_tokens": 5166784.0, + "reward": 0.7271736338734627, + "reward_std": 0.12570815905928612, + "rewards/code_reward": 0.6169950738549232, + "rewards/format_reward": 1.1017856895923615, + "step": 196 + }, + { + "clip_ratio": 0.003542980703059584, + "epoch": 0.007350403432675715, + "grad_norm": 0.07866242527961731, + "kl": 0.04510498046875, + "learning_rate": 9.997914736421293e-06, + "loss": 0.0004, + "step": 197 + }, + { + "clip_ratio": 0.003185482113622129, + "epoch": 0.007387715125227368, + "grad_norm": 0.07647780328989029, + "kl": 0.04962158203125, + "learning_rate": 9.997825065040175e-06, + "loss": 0.0002, + "step": 198 + }, + { + "clip_ratio": 0.003281585988588631, + "completion_length": 727.9464721679688, + "epoch": 0.007425026817779021, + "grad_norm": 0.08867290616035461, + "kl": 0.08099365234375, + "learning_rate": 9.997733506455212e-06, + "loss": 0.0276, + "num_tokens": 5229473.0, + "reward": 0.5535299852490425, + "reward_std": 0.1372433938086033, + "rewards/code_reward": 0.4508513999171555, + "rewards/format_reward": 1.0267857313156128, + "step": 199 + }, + { + "clip_ratio": 0.003505714877974242, + "epoch": 0.007462338510330675, + "grad_norm": 0.08027646690607071, + "kl": 0.07379150390625, + "learning_rate": 9.997640060704818e-06, + "loss": 0.0274, + "step": 200 + }, + { + "clip_ratio": 0.003395806939806789, + "epoch": 0.007499650202882328, + "grad_norm": 0.07678469270467758, + "kl": 0.06866455078125, + "learning_rate": 9.9975447278282e-06, + "loss": 0.0267, + "step": 201 + }, + { + "clip_ratio": 0.0037414584076032043, + "completion_length": 651.8393096923828, + "epoch": 0.007536961895433981, + "grad_norm": 0.09600714594125748, + "kl": 0.0361328125, + "learning_rate": 9.997447507865358e-06, + "loss": 0.011, + "num_tokens": 5292870.0, + "reward": 0.8131565675139427, + "reward_std": 0.19494640827178955, + "rewards/code_reward": 0.6926208026707172, + "rewards/format_reward": 1.205357164144516, + "step": 202 + }, + { + "clip_ratio": 0.003825028834398836, + "epoch": 0.007574273587985635, + "grad_norm": 0.08885879069566727, + "kl": 0.036773681640625, + "learning_rate": 9.997348400857085e-06, + "loss": 0.0108, + "step": 203 + }, + { + "clip_ratio": 0.003767062444239855, + "epoch": 0.007611585280537288, + "grad_norm": 0.08421695977449417, + "kl": 0.037506103515625, + "learning_rate": 9.997247406844964e-06, + "loss": 0.0101, + "step": 204 + }, + { + "clip_ratio": 0.002661868347786367, + "completion_length": 777.9464721679688, + "epoch": 0.0076488969730889415, + "grad_norm": 0.07696408778429031, + "kl": 0.067626953125, + "learning_rate": 9.99714452587137e-06, + "loss": 0.0128, + "num_tokens": 5375649.0, + "reward": 0.5797934103757143, + "reward_std": 0.09092009626328945, + "rewards/code_reward": 0.46568629145622253, + "rewards/format_reward": 1.1410714089870453, + "step": 205 + }, + { + "clip_ratio": 0.0025482007185928524, + "epoch": 0.0076862086656405955, + "grad_norm": 0.08938346803188324, + "kl": 0.06842041015625, + "learning_rate": 9.99703975797947e-06, + "loss": 0.0124, + "step": 206 + }, + { + "clip_ratio": 0.002452238113619387, + "epoch": 0.007723520358192249, + "grad_norm": 0.06390335410833359, + "kl": 0.05352783203125, + "learning_rate": 9.996933103213224e-06, + "loss": 0.012, + "step": 207 + }, + { + "clip_ratio": 0.0037327160825952888, + "completion_length": 657.232177734375, + "epoch": 0.007760832050743902, + "grad_norm": 0.09617351740598679, + "kl": 0.0440673828125, + "learning_rate": 9.99682456161738e-06, + "loss": 0.0159, + "num_tokens": 5448924.0, + "reward": 0.5538814198225737, + "reward_std": 0.11581545695662498, + "rewards/code_reward": 0.4501313827931881, + "rewards/format_reward": 1.037500023841858, + "step": 208 + }, + { + "clip_ratio": 0.0042639413732104, + "epoch": 0.007798143743295555, + "grad_norm": 0.0927850529551506, + "kl": 0.041168212890625, + "learning_rate": 9.99671413323748e-06, + "loss": 0.0155, + "step": 209 + }, + { + "clip_ratio": 0.004250534577295184, + "epoch": 0.007835455435847209, + "grad_norm": 0.08422954380512238, + "kl": 0.034332275390625, + "learning_rate": 9.996601818119858e-06, + "loss": 0.0149, + "step": 210 + }, + { + "clip_ratio": 0.0035704015754163265, + "completion_length": 720.3571624755859, + "epoch": 0.007872767128398862, + "grad_norm": 0.09818711876869202, + "kl": 0.03826904296875, + "learning_rate": 9.996487616311643e-06, + "loss": 0.0235, + "num_tokens": 5513484.0, + "reward": 0.42669917829334736, + "reward_std": 0.09325571078807116, + "rewards/code_reward": 0.31848490610718727, + "rewards/format_reward": 1.0821428745985031, + "step": 211 + }, + { + "clip_ratio": 0.00371076149167493, + "epoch": 0.007910078820950515, + "grad_norm": 0.08701728284358978, + "kl": 0.038787841796875, + "learning_rate": 9.996371527860747e-06, + "loss": 0.0231, + "step": 212 + }, + { + "clip_ratio": 0.004054058576002717, + "epoch": 0.007947390513502168, + "grad_norm": 0.08270826935768127, + "kl": 0.039306640625, + "learning_rate": 9.996253552815883e-06, + "loss": 0.0225, + "step": 213 + }, + { + "clip_ratio": 0.004400565754622221, + "completion_length": 624.4285888671875, + "epoch": 0.007984702206053821, + "grad_norm": 0.17384770512580872, + "kl": 0.16192626953125, + "learning_rate": 9.996133691226547e-06, + "loss": 0.044, + "num_tokens": 5578846.0, + "reward": 0.7032306455075741, + "reward_std": 0.3111959816887975, + "rewards/code_reward": 0.5719806477427483, + "rewards/format_reward": 1.3124999701976776, + "step": 214 + }, + { + "clip_ratio": 0.004525794123765081, + "epoch": 0.008022013898605476, + "grad_norm": 0.37151721119880676, + "kl": 0.1083984375, + "learning_rate": 9.996011943143032e-06, + "loss": 0.0443, + "step": 215 + }, + { + "clip_ratio": 0.004639293882064521, + "epoch": 0.00805932559115713, + "grad_norm": 0.7010387182235718, + "kl": 0.333251953125, + "learning_rate": 9.995888308616421e-06, + "loss": 0.0444, + "step": 216 + }, + { + "clip_ratio": 0.004295735445339233, + "completion_length": 447.2500305175781, + "epoch": 0.008096637283708782, + "grad_norm": 0.39592331647872925, + "kl": 0.48138427734375, + "learning_rate": 9.99576278769859e-06, + "loss": 0.0228, + "num_tokens": 5630282.0, + "reward": 0.706756416708231, + "reward_std": 0.19267083797603846, + "rewards/code_reward": 0.5951492562890053, + "rewards/format_reward": 1.116071417927742, + "step": 217 + }, + { + "clip_ratio": 0.007925484445877373, + "epoch": 0.008133948976260436, + "grad_norm": 0.370220810174942, + "kl": 0.1400146484375, + "learning_rate": 9.995635380442206e-06, + "loss": 0.0219, + "step": 218 + }, + { + "clip_ratio": 0.008035031612962484, + "epoch": 0.008171260668812089, + "grad_norm": 0.40616756677627563, + "kl": 0.0849609375, + "learning_rate": 9.995506086900724e-06, + "loss": 0.0213, + "step": 219 + }, + { + "clip_ratio": 0.00446747214300558, + "completion_length": 666.2321853637695, + "epoch": 0.008208572361363742, + "grad_norm": 0.0942574217915535, + "kl": 0.04083251953125, + "learning_rate": 9.995374907128396e-06, + "loss": 0.0158, + "num_tokens": 5698545.0, + "reward": 0.338626928627491, + "reward_std": 0.32119051087647676, + "rewards/code_reward": 0.20648405887186527, + "rewards/format_reward": 1.321428582072258, + "step": 220 + }, + { + "clip_ratio": 0.0046544576762244105, + "epoch": 0.008245884053915395, + "grad_norm": 0.0862528458237648, + "kl": 0.0462646484375, + "learning_rate": 9.99524184118026e-06, + "loss": 0.0155, + "step": 221 + }, + { + "clip_ratio": 0.005203164531849325, + "epoch": 0.00828319574646705, + "grad_norm": 0.08757706731557846, + "kl": 0.060302734375, + "learning_rate": 9.99510688911215e-06, + "loss": 0.0154, + "step": 222 + }, + { + "clip_ratio": 0.00417603668756783, + "completion_length": 681.8214569091797, + "epoch": 0.008320507439018703, + "grad_norm": 0.0959044098854065, + "kl": 0.0675048828125, + "learning_rate": 9.994970050980688e-06, + "loss": 0.003, + "num_tokens": 5767667.0, + "reward": 0.5268604531884193, + "reward_std": 0.16591081535443664, + "rewards/code_reward": 0.385967580601573, + "rewards/format_reward": 1.4089286029338837, + "step": 223 + }, + { + "clip_ratio": 0.004512524988967925, + "epoch": 0.008357819131570356, + "grad_norm": 0.08638836443424225, + "kl": 0.07421875, + "learning_rate": 9.994831326843288e-06, + "loss": 0.0026, + "step": 224 + }, + { + "clip_ratio": 0.005140068591572344, + "epoch": 0.008395130824122009, + "grad_norm": 0.086151622235775, + "kl": 0.0665283203125, + "learning_rate": 9.994690716758159e-06, + "loss": 0.0023, + "step": 225 + }, + { + "clip_ratio": 0.0035791777190752327, + "completion_length": 823.3393249511719, + "epoch": 0.008432442516673662, + "grad_norm": 0.09167367219924927, + "kl": 0.02947998046875, + "learning_rate": 9.994548220784296e-06, + "loss": -0.0078, + "num_tokens": 5837448.0, + "reward": 0.5690591111779213, + "reward_std": 0.10750861000269651, + "rewards/code_reward": 0.43423763290047646, + "rewards/format_reward": 1.3482142686843872, + "step": 226 + }, + { + "clip_ratio": 0.0034919297322630882, + "epoch": 0.008469754209225315, + "grad_norm": 0.08677981793880463, + "kl": 0.0289306640625, + "learning_rate": 9.99440383898149e-06, + "loss": -0.0082, + "step": 227 + }, + { + "clip_ratio": 0.004110443114768714, + "epoch": 0.00850706590177697, + "grad_norm": 0.08398650586605072, + "kl": 0.0294952392578125, + "learning_rate": 9.994257571410316e-06, + "loss": -0.0083, + "step": 228 + }, + { + "clip_ratio": 0.0035436770413070917, + "completion_length": 822.8393249511719, + "epoch": 0.008544377594328623, + "grad_norm": 0.08238296955823898, + "kl": 0.040283203125, + "learning_rate": 9.99410941813215e-06, + "loss": -0.0129, + "num_tokens": 5921089.0, + "reward": 0.33519232645630836, + "reward_std": 0.17735979426652193, + "rewards/code_reward": 0.21840659715235233, + "rewards/format_reward": 1.167857140302658, + "step": 229 + }, + { + "clip_ratio": 0.003543863247614354, + "epoch": 0.008581689286880276, + "grad_norm": 0.08872734010219574, + "kl": 0.0447998046875, + "learning_rate": 9.993959379209155e-06, + "loss": -0.0131, + "step": 230 + }, + { + "clip_ratio": 0.003979889734182507, + "epoch": 0.00861900097943193, + "grad_norm": 0.07805692404508591, + "kl": 0.04638671875, + "learning_rate": 9.99380745470428e-06, + "loss": -0.0135, + "step": 231 + }, + { + "clip_ratio": 0.0049113406566902995, + "completion_length": 548.7857360839844, + "epoch": 0.008656312671983583, + "grad_norm": 0.10478541254997253, + "kl": 0.054107666015625, + "learning_rate": 9.993653644681273e-06, + "loss": 0.0157, + "num_tokens": 5981445.0, + "reward": 0.3511221632361412, + "reward_std": 0.24507475271821022, + "rewards/code_reward": 0.2109435871243477, + "rewards/format_reward": 1.4017857015132904, + "step": 232 + }, + { + "clip_ratio": 0.005106896220240742, + "epoch": 0.008693624364535236, + "grad_norm": 0.10202553123235703, + "kl": 0.047821044921875, + "learning_rate": 9.993497949204669e-06, + "loss": 0.0154, + "step": 233 + }, + { + "clip_ratio": 0.005104968964587897, + "epoch": 0.008730936057086889, + "grad_norm": 0.09199768304824829, + "kl": 0.048797607421875, + "learning_rate": 9.993340368339793e-06, + "loss": 0.0148, + "step": 234 + }, + { + "clip_ratio": 0.004332953772973269, + "completion_length": 663.6964492797852, + "epoch": 0.008768247749638544, + "grad_norm": 0.09609228372573853, + "kl": 0.1053466796875, + "learning_rate": 9.993180902152767e-06, + "loss": 0.0135, + "num_tokens": 6051568.0, + "reward": 0.3180071245878935, + "reward_std": 0.13934592343866825, + "rewards/code_reward": 0.1906856670975685, + "rewards/format_reward": 1.2732142508029938, + "step": 235 + }, + { + "clip_ratio": 0.004491167957894504, + "epoch": 0.008805559442190197, + "grad_norm": 0.10051500052213669, + "kl": 0.1072998046875, + "learning_rate": 9.993019550710498e-06, + "loss": 0.0131, + "step": 236 + }, + { + "clip_ratio": 0.004503439296968281, + "epoch": 0.00884287113474185, + "grad_norm": 0.08593375980854034, + "kl": 0.0911865234375, + "learning_rate": 9.992856314080684e-06, + "loss": 0.0123, + "step": 237 + }, + { + "clip_ratio": 0.004247071919962764, + "completion_length": 666.5893173217773, + "epoch": 0.008880182827293503, + "grad_norm": 1.300209641456604, + "kl": 0.68341064453125, + "learning_rate": 9.992691192331821e-06, + "loss": 0.0273, + "num_tokens": 6119249.0, + "reward": 0.5345271304249763, + "reward_std": 0.2392579847946763, + "rewards/code_reward": 0.40774137154221535, + "rewards/format_reward": 1.2678571045398712, + "step": 238 + }, + { + "clip_ratio": 0.0056593240005895495, + "epoch": 0.008917494519845156, + "grad_norm": 1.9461170434951782, + "kl": 0.05511474609375, + "learning_rate": 9.992524185533184e-06, + "loss": 0.0336, + "step": 239 + }, + { + "clip_ratio": 0.0057356912875548005, + "epoch": 0.00895480621239681, + "grad_norm": 0.9280481338500977, + "kl": 0.0595703125, + "learning_rate": 9.992355293754853e-06, + "loss": 0.027, + "step": 240 + }, + { + "clip_ratio": 0.0036855486687272787, + "completion_length": 997.3928985595703, + "epoch": 0.008992117904948463, + "grad_norm": 0.07582602649927139, + "kl": 0.029296875, + "learning_rate": 9.992184517067688e-06, + "loss": 0.034, + "num_tokens": 6221247.0, + "reward": 0.36026787757873535, + "reward_std": 0.23492727428674698, + "rewards/code_reward": 0.2263392861932516, + "rewards/format_reward": 1.3392857313156128, + "step": 241 + }, + { + "clip_ratio": 0.003983144706580788, + "epoch": 0.009029429597500117, + "grad_norm": 0.07959845662117004, + "kl": 0.0328369140625, + "learning_rate": 9.992011855543345e-06, + "loss": 0.0339, + "step": 242 + }, + { + "clip_ratio": 0.004240406444296241, + "epoch": 0.00906674129005177, + "grad_norm": 0.07523786276578903, + "kl": 0.03338623046875, + "learning_rate": 9.991837309254268e-06, + "loss": 0.0337, + "step": 243 + }, + { + "clip_ratio": 0.0030144195770844817, + "completion_length": 715.2143249511719, + "epoch": 0.009104052982603424, + "grad_norm": 0.09345444291830063, + "kl": 0.037628173828125, + "learning_rate": 9.991660878273694e-06, + "loss": 0.01, + "num_tokens": 6292953.0, + "reward": 0.7379702478647232, + "reward_std": 0.18863076530396938, + "rewards/code_reward": 0.6086845397949219, + "rewards/format_reward": 1.292857140302658, + "step": 244 + }, + { + "clip_ratio": 0.0035185081069357693, + "epoch": 0.009141364675155077, + "grad_norm": 0.08751847594976425, + "kl": 0.0406494140625, + "learning_rate": 9.991482562675654e-06, + "loss": 0.0099, + "step": 245 + }, + { + "clip_ratio": 0.0037192151648923755, + "epoch": 0.00917867636770673, + "grad_norm": 0.08098579943180084, + "kl": 0.037841796875, + "learning_rate": 9.99130236253496e-06, + "loss": 0.0096, + "step": 246 + }, + { + "clip_ratio": 0.0038726728525944054, + "completion_length": 807.2143096923828, + "epoch": 0.009215988060258383, + "grad_norm": 0.08818066865205765, + "kl": 0.037841796875, + "learning_rate": 9.991120277927224e-06, + "loss": 0.0009, + "num_tokens": 6373997.0, + "reward": 0.35033416375517845, + "reward_std": 0.18011424969881773, + "rewards/code_reward": 0.23265555500984192, + "rewards/format_reward": 1.1767856776714325, + "step": 247 + }, + { + "clip_ratio": 0.004007873183581978, + "epoch": 0.009253299752810036, + "grad_norm": 0.08709990233182907, + "kl": 0.03955078125, + "learning_rate": 9.990936308928843e-06, + "loss": 0.0009, + "step": 248 + }, + { + "clip_ratio": 0.003668102202937007, + "epoch": 0.009290611445361691, + "grad_norm": 0.08606981486082077, + "kl": 0.04107666015625, + "learning_rate": 9.99075045561701e-06, + "loss": 0.0004, + "step": 249 + }, + { + "clip_ratio": 0.004393384384457022, + "completion_length": 700.5000305175781, + "epoch": 0.009327923137913344, + "grad_norm": 0.6010595560073853, + "kl": 0.318634033203125, + "learning_rate": 9.990562718069703e-06, + "loss": 0.0163, + "num_tokens": 6452369.0, + "reward": 0.5319213382899761, + "reward_std": 0.15835293615236878, + "rewards/code_reward": 0.3988856226205826, + "rewards/format_reward": 1.330357164144516, + "step": 250 + }, + { + "clip_ratio": 0.004936377867124975, + "epoch": 0.009365234830464997, + "grad_norm": 0.0979296937584877, + "kl": 0.062530517578125, + "learning_rate": 9.990373096365695e-06, + "loss": 0.0135, + "step": 251 + }, + { + "clip_ratio": 0.005421827198006213, + "epoch": 0.00940254652301665, + "grad_norm": 0.10828126221895218, + "kl": 0.06109619140625, + "learning_rate": 9.990181590584548e-06, + "loss": 0.0134, + "step": 252 + }, + { + "clip_ratio": 0.0040123488288372755, + "completion_length": 628.7857360839844, + "epoch": 0.009439858215568303, + "grad_norm": 0.08404182642698288, + "kl": 0.054962158203125, + "learning_rate": 9.989988200806612e-06, + "loss": 0.0133, + "num_tokens": 6514811.0, + "reward": 0.33397576957941055, + "reward_std": 0.358185850083828, + "rewards/code_reward": 0.18933291174471378, + "rewards/format_reward": 1.4464285969734192, + "step": 253 + }, + { + "clip_ratio": 0.004523399518802762, + "epoch": 0.009477169908119957, + "grad_norm": 0.08408243954181671, + "kl": 0.050811767578125, + "learning_rate": 9.989792927113033e-06, + "loss": 0.0132, + "step": 254 + }, + { + "clip_ratio": 0.005030769854784012, + "epoch": 0.00951448160067161, + "grad_norm": 0.07827068865299225, + "kl": 0.041961669921875, + "learning_rate": 9.989595769585738e-06, + "loss": 0.013, + "step": 255 + }, + { + "clip_ratio": 0.00250210496596992, + "completion_length": 753.5893249511719, + "epoch": 0.009551793293223265, + "grad_norm": 0.38267484307289124, + "kl": 0.273284912109375, + "learning_rate": 9.989396728307458e-06, + "loss": 0.0086, + "num_tokens": 6588750.0, + "reward": 0.6343691535294056, + "reward_std": 0.027103726752102375, + "rewards/code_reward": 0.49597625964088365, + "rewards/format_reward": 1.3839285671710968, + "step": 256 + }, + { + "clip_ratio": 0.0029735419084317982, + "epoch": 0.009589104985774918, + "grad_norm": 0.0752052292227745, + "kl": 0.0570068359375, + "learning_rate": 9.989195803361704e-06, + "loss": 0.0068, + "step": 257 + }, + { + "clip_ratio": 0.0033153602853417397, + "epoch": 0.00962641667832657, + "grad_norm": 0.358904093503952, + "kl": 0.043304443359375, + "learning_rate": 9.98899299483278e-06, + "loss": 0.0085, + "step": 258 + }, + { + "clip_ratio": 0.004630948242265731, + "completion_length": 556.357177734375, + "epoch": 0.009663728370878224, + "grad_norm": 0.11582271754741669, + "kl": 0.057861328125, + "learning_rate": 9.98878830280578e-06, + "loss": 0.0075, + "num_tokens": 6639660.0, + "reward": 0.4800824411213398, + "reward_std": 0.10539746470749378, + "rewards/code_reward": 0.3550824150443077, + "rewards/format_reward": 1.25, + "step": 259 + }, + { + "clip_ratio": 0.004522126168012619, + "epoch": 0.009701040063429877, + "grad_norm": 0.10794132202863693, + "kl": 0.05816650390625, + "learning_rate": 9.988581727366591e-06, + "loss": 0.0072, + "step": 260 + }, + { + "clip_ratio": 0.003980654175393283, + "epoch": 0.00973835175598153, + "grad_norm": 0.09981204569339752, + "kl": 0.06549072265625, + "learning_rate": 9.988373268601888e-06, + "loss": 0.0065, + "step": 261 + }, + { + "clip_ratio": 0.003389013814739883, + "completion_length": 866.232177734375, + "epoch": 0.009775663448533183, + "grad_norm": 0.09706564247608185, + "kl": 0.03607177734375, + "learning_rate": 9.988162926599134e-06, + "loss": -0.001, + "num_tokens": 6725591.0, + "reward": 0.32432257011532784, + "reward_std": 0.20270952582359314, + "rewards/code_reward": 0.18860827200114727, + "rewards/format_reward": 1.3571428954601288, + "step": 262 + }, + { + "clip_ratio": 0.003564666782040149, + "epoch": 0.009812975141084838, + "grad_norm": 0.07987275719642639, + "kl": 0.03900146484375, + "learning_rate": 9.987950701446588e-06, + "loss": -0.001, + "step": 263 + }, + { + "clip_ratio": 0.003992884012404829, + "epoch": 0.009850286833636491, + "grad_norm": 0.07928963750600815, + "kl": 0.0384521484375, + "learning_rate": 9.987736593233292e-06, + "loss": -0.0012, + "step": 264 + }, + { + "clip_ratio": 0.002056614961475134, + "completion_length": 460.9643096923828, + "epoch": 0.009887598526188144, + "grad_norm": 0.08863948285579681, + "kl": 0.04400634765625, + "learning_rate": 9.987520602049084e-06, + "loss": 0.0075, + "num_tokens": 6775079.0, + "reward": 0.66396863758564, + "reward_std": 0.03699097875505686, + "rewards/code_reward": 0.51754005625844, + "rewards/format_reward": 1.4642857015132904, + "step": 265 + }, + { + "clip_ratio": 0.0020450044539757073, + "epoch": 0.009924910218739797, + "grad_norm": 0.08821898698806763, + "kl": 0.05389404296875, + "learning_rate": 9.98730272798459e-06, + "loss": 0.0075, + "step": 266 + }, + { + "clip_ratio": 0.0015095033159013838, + "epoch": 0.00996222191129145, + "grad_norm": 0.08055567741394043, + "kl": 0.05389404296875, + "learning_rate": 9.987082971131226e-06, + "loss": 0.0069, + "step": 267 + }, + { + "clip_ratio": 0.002515333762858063, + "completion_length": 557.4286041259766, + "epoch": 0.009999533603843104, + "grad_norm": 0.10972746461629868, + "kl": 0.09295654296875, + "learning_rate": 9.986861331581197e-06, + "loss": -0.0007, + "num_tokens": 6830627.0, + "reward": 0.8089285790920258, + "reward_std": 0.15993194375187159, + "rewards/code_reward": 0.6607142928987741, + "rewards/format_reward": 1.4821428656578064, + "step": 268 + }, + { + "clip_ratio": 0.002628769725561142, + "epoch": 0.010036845296394759, + "grad_norm": 0.07947108149528503, + "kl": 0.0797119140625, + "learning_rate": 9.9866378094275e-06, + "loss": -0.0011, + "step": 269 + }, + { + "clip_ratio": 0.002208319609053433, + "epoch": 0.010074156988946412, + "grad_norm": 0.0822838544845581, + "kl": 0.06378173828125, + "learning_rate": 9.98641240476392e-06, + "loss": -0.0014, + "step": 270 + }, + { + "clip_ratio": 0.0027288742712698877, + "completion_length": 608.6071701049805, + "epoch": 0.010111468681498065, + "grad_norm": 0.08706234395503998, + "kl": 0.037689208984375, + "learning_rate": 9.986185117685031e-06, + "loss": -0.0004, + "num_tokens": 6897679.0, + "reward": 0.7816144675016403, + "reward_std": 0.08521442487835884, + "rewards/code_reward": 0.6380430236458778, + "rewards/format_reward": 1.4357142746448517, + "step": 271 + }, + { + "clip_ratio": 0.0028228851151652634, + "epoch": 0.010148780374049718, + "grad_norm": 0.08141525089740753, + "kl": 0.03875732421875, + "learning_rate": 9.985955948286203e-06, + "loss": -0.0006, + "step": 272 + }, + { + "clip_ratio": 0.002587716095149517, + "epoch": 0.010186092066601371, + "grad_norm": 0.07097455859184265, + "kl": 0.03955078125, + "learning_rate": 9.985724896663586e-06, + "loss": -0.0009, + "step": 273 + }, + { + "clip_ratio": 0.0032859534258022904, + "completion_length": 699.7321701049805, + "epoch": 0.010223403759153024, + "grad_norm": 0.06546859443187714, + "kl": 0.03875732421875, + "learning_rate": 9.985491962914129e-06, + "loss": 0.013, + "num_tokens": 6974846.0, + "reward": 0.7142007313668728, + "reward_std": 0.13553646206855774, + "rewards/code_reward": 0.5668792426586151, + "rewards/format_reward": 1.4732142984867096, + "step": 274 + }, + { + "clip_ratio": 0.0031666463473811746, + "epoch": 0.010260715451704677, + "grad_norm": 0.08189915865659714, + "kl": 0.04241943359375, + "learning_rate": 9.985257147135564e-06, + "loss": 0.0127, + "step": 275 + }, + { + "clip_ratio": 0.002880616288166493, + "epoch": 0.010298027144256332, + "grad_norm": 0.05966382846236229, + "kl": 0.0386962890625, + "learning_rate": 9.985020449426415e-06, + "loss": 0.0126, + "step": 276 + }, + { + "clip_ratio": 0.004024980124086142, + "completion_length": 617.232177734375, + "epoch": 0.010335338836807985, + "grad_norm": 0.10013587027788162, + "kl": 0.038482666015625, + "learning_rate": 9.984781869885999e-06, + "loss": 0.0157, + "num_tokens": 7032427.0, + "reward": 0.40679580718278885, + "reward_std": 0.031236213631927967, + "rewards/code_reward": 0.25858152005821466, + "rewards/format_reward": 1.4821428656578064, + "step": 277 + }, + { + "clip_ratio": 0.0040133289294317365, + "epoch": 0.010372650529359638, + "grad_norm": 0.1592109203338623, + "kl": 0.037933349609375, + "learning_rate": 9.984541408614416e-06, + "loss": 0.0151, + "step": 278 + }, + { + "clip_ratio": 0.0038085189880803227, + "epoch": 0.010409962221911291, + "grad_norm": 0.08551512658596039, + "kl": 0.038238525390625, + "learning_rate": 9.984299065712565e-06, + "loss": 0.0147, + "step": 279 + }, + { + "clip_ratio": 0.0034396362025290728, + "completion_length": 850.6786041259766, + "epoch": 0.010447273914462945, + "grad_norm": 0.06537695974111557, + "kl": 0.0506591796875, + "learning_rate": 9.98405484128212e-06, + "loss": 0.0149, + "num_tokens": 7112657.0, + "reward": 0.25664781779050827, + "reward_std": 0.08567760325968266, + "rewards/code_reward": 0.11200495064258575, + "rewards/format_reward": 1.4464285969734192, + "step": 280 + }, + { + "clip_ratio": 0.0037408536882139742, + "epoch": 0.010484585607014598, + "grad_norm": 0.06536629796028137, + "kl": 0.050933837890625, + "learning_rate": 9.98380873542556e-06, + "loss": 0.0149, + "step": 281 + }, + { + "clip_ratio": 0.003248342836741358, + "epoch": 0.01052189729956625, + "grad_norm": 0.06726227700710297, + "kl": 0.0521240234375, + "learning_rate": 9.983560748246142e-06, + "loss": 0.0146, + "step": 282 + }, + { + "clip_ratio": 0.004160403041169047, + "completion_length": 775.2321701049805, + "epoch": 0.010559208992117906, + "grad_norm": 0.09233646094799042, + "kl": 0.030975341796875, + "learning_rate": 9.983310879847919e-06, + "loss": 0.006, + "num_tokens": 7194850.0, + "reward": 0.4028128385543823, + "reward_std": 0.2880722712725401, + "rewards/code_reward": 0.2599556976929307, + "rewards/format_reward": 1.4285714030265808, + "step": 283 + }, + { + "clip_ratio": 0.003676077409181744, + "epoch": 0.010596520684669559, + "grad_norm": 0.08340355008840561, + "kl": 0.03118896484375, + "learning_rate": 9.983059130335733e-06, + "loss": 0.0059, + "step": 284 + }, + { + "clip_ratio": 0.003797809418756515, + "epoch": 0.010633832377221212, + "grad_norm": 0.08707712590694427, + "kl": 0.030914306640625, + "learning_rate": 9.98280549981521e-06, + "loss": 0.0056, + "step": 285 + }, + { + "clip_ratio": 0.003889854997396469, + "completion_length": 764.0536193847656, + "epoch": 0.010671144069772865, + "grad_norm": 0.09759113192558289, + "kl": 0.03643798828125, + "learning_rate": 9.98254998839277e-06, + "loss": 0.0331, + "num_tokens": 7269935.0, + "reward": 0.6318517737090588, + "reward_std": 0.12087373156100512, + "rewards/code_reward": 0.48435176257044077, + "rewards/format_reward": 1.4749999940395355, + "step": 286 + }, + { + "clip_ratio": 0.003769294125959277, + "epoch": 0.010708455762324518, + "grad_norm": 0.09182238578796387, + "kl": 0.03485107421875, + "learning_rate": 9.982292596175621e-06, + "loss": 0.0328, + "step": 287 + }, + { + "clip_ratio": 0.003612221102230251, + "epoch": 0.010745767454876171, + "grad_norm": 0.08829570561647415, + "kl": 0.0350341796875, + "learning_rate": 9.98203332327176e-06, + "loss": 0.0324, + "step": 288 + }, + { + "clip_ratio": 0.002272983663715422, + "completion_length": 575.303596496582, + "epoch": 0.010783079147427824, + "grad_norm": 0.08192306011915207, + "kl": 0.04248046875, + "learning_rate": 9.981772169789973e-06, + "loss": -0.0027, + "num_tokens": 7334772.0, + "reward": 0.6790222078561783, + "reward_std": 0.18603406846523285, + "rewards/code_reward": 0.5325936302542686, + "rewards/format_reward": 1.4642857015132904, + "step": 289 + }, + { + "clip_ratio": 0.0021269143908284605, + "epoch": 0.01082039083997948, + "grad_norm": 0.07519153505563736, + "kl": 0.041748046875, + "learning_rate": 9.981509135839835e-06, + "loss": -0.0029, + "step": 290 + }, + { + "clip_ratio": 0.0021641626954078674, + "epoch": 0.010857702532531132, + "grad_norm": 0.08440684527158737, + "kl": 0.04345703125, + "learning_rate": 9.981244221531708e-06, + "loss": -0.0033, + "step": 291 + }, + { + "clip_ratio": 0.0031593175954185426, + "completion_length": 699.607177734375, + "epoch": 0.010895014225082785, + "grad_norm": 0.07548001408576965, + "kl": 0.03271484375, + "learning_rate": 9.98097742697675e-06, + "loss": 0.0047, + "num_tokens": 7409636.0, + "reward": 0.6193783134222031, + "reward_std": 0.14556205831468105, + "rewards/code_reward": 0.4765211511403322, + "rewards/format_reward": 1.4285714328289032, + "step": 292 + }, + { + "clip_ratio": 0.0032584117725491524, + "epoch": 0.010932325917634439, + "grad_norm": 0.07646626234054565, + "kl": 0.034759521484375, + "learning_rate": 9.980708752286899e-06, + "loss": 0.0046, + "step": 293 + }, + { + "clip_ratio": 0.0029166718013584614, + "epoch": 0.010969637610186092, + "grad_norm": 0.07404545694589615, + "kl": 0.035369873046875, + "learning_rate": 9.98043819757489e-06, + "loss": 0.0044, + "step": 294 + }, + { + "clip_ratio": 0.0033007319434545934, + "completion_length": 864.1250610351562, + "epoch": 0.011006949302737745, + "grad_norm": 0.0830797404050827, + "kl": 0.03759765625, + "learning_rate": 9.980165762954237e-06, + "loss": -0.009, + "num_tokens": 7500335.0, + "reward": 0.4038939066231251, + "reward_std": 0.3407976711168885, + "rewards/code_reward": 0.26639389246702194, + "rewards/format_reward": 1.375, + "step": 295 + }, + { + "clip_ratio": 0.00355309946462512, + "epoch": 0.011044260995289398, + "grad_norm": 0.10756263881921768, + "kl": 0.03765869140625, + "learning_rate": 9.979891448539251e-06, + "loss": -0.0091, + "step": 296 + }, + { + "clip_ratio": 0.0030425406293943524, + "epoch": 0.011081572687841053, + "grad_norm": 0.07346783578395844, + "kl": 0.037750244140625, + "learning_rate": 9.979615254445033e-06, + "loss": -0.0097, + "step": 297 + }, + { + "clip_ratio": 0.0024366335128434002, + "completion_length": 595.607177734375, + "epoch": 0.011118884380392706, + "grad_norm": 0.08195068687200546, + "kl": 0.0399169921875, + "learning_rate": 9.979337180787464e-06, + "loss": -0.0033, + "num_tokens": 7561661.0, + "reward": 0.8762274272739887, + "reward_std": 0.09444486163556576, + "rewards/code_reward": 0.734263144666329, + "rewards/format_reward": 1.4196428656578064, + "step": 298 + }, + { + "clip_ratio": 0.0023005552939139307, + "epoch": 0.011156196072944359, + "grad_norm": 0.08436665683984756, + "kl": 0.0404052734375, + "learning_rate": 9.97905722768322e-06, + "loss": -0.0035, + "step": 299 + }, + { + "clip_ratio": 0.002200211922172457, + "epoch": 0.011193507765496012, + "grad_norm": 0.07613036036491394, + "kl": 0.040771484375, + "learning_rate": 9.978775395249763e-06, + "loss": -0.004, + "step": 300 + }, + { + "clip_ratio": 0.003844133054371923, + "completion_length": 753.2143249511719, + "epoch": 0.011230819458047665, + "grad_norm": 0.0711805522441864, + "kl": 0.0274658203125, + "learning_rate": 9.97849168360535e-06, + "loss": 0.0106, + "num_tokens": 7634039.0, + "reward": 0.4717077724635601, + "reward_std": 0.26061033457517624, + "rewards/code_reward": 0.3261720575392246, + "rewards/format_reward": 1.455357164144516, + "step": 301 + }, + { + "clip_ratio": 0.0036021042615175247, + "epoch": 0.011268131150599318, + "grad_norm": 0.0693809762597084, + "kl": 0.0269775390625, + "learning_rate": 9.978206092869013e-06, + "loss": 0.0104, + "step": 302 + }, + { + "clip_ratio": 0.004092165210749954, + "epoch": 0.011305442843150973, + "grad_norm": 0.06909362971782684, + "kl": 0.026763916015625, + "learning_rate": 9.977918623160585e-06, + "loss": 0.0103, + "step": 303 + }, + { + "clip_ratio": 0.004827032214961946, + "completion_length": 519.053581237793, + "epoch": 0.011342754535702626, + "grad_norm": 0.0624174140393734, + "kl": 0.04888916015625, + "learning_rate": 9.977629274600685e-06, + "loss": -0.0059, + "num_tokens": 7684568.0, + "reward": 0.29586086794734, + "reward_std": 0.13796993345022202, + "rewards/code_reward": 0.15586083941161633, + "rewards/format_reward": 1.399999976158142, + "step": 304 + }, + { + "clip_ratio": 0.004637763951905072, + "epoch": 0.01138006622825428, + "grad_norm": 0.061529867351055145, + "kl": 0.04803466796875, + "learning_rate": 9.977338047310714e-06, + "loss": -0.006, + "step": 305 + }, + { + "clip_ratio": 0.004583583795465529, + "epoch": 0.011417377920805933, + "grad_norm": 0.05604119971394539, + "kl": 0.04681396484375, + "learning_rate": 9.977044941412868e-06, + "loss": -0.0063, + "step": 306 + }, + { + "clip_ratio": 0.004179073788691312, + "completion_length": 806.3036041259766, + "epoch": 0.011454689613357586, + "grad_norm": 0.13379234075546265, + "kl": 0.0430908203125, + "learning_rate": 9.97674995703013e-06, + "loss": 0.0145, + "num_tokens": 7765477.0, + "reward": 0.39951111003756523, + "reward_std": 0.22310838848352432, + "rewards/code_reward": 0.26343967020511627, + "rewards/format_reward": 1.3607142567634583, + "step": 307 + }, + { + "clip_ratio": 0.0038131920155137777, + "epoch": 0.011492001305909239, + "grad_norm": 0.07834887504577637, + "kl": 0.040863037109375, + "learning_rate": 9.976453094286266e-06, + "loss": 0.0143, + "step": 308 + }, + { + "clip_ratio": 0.004076011769939214, + "epoch": 0.011529312998460892, + "grad_norm": 0.07942645251750946, + "kl": 0.0438232421875, + "learning_rate": 9.976154353305838e-06, + "loss": 0.0142, + "step": 309 + }, + { + "clip_ratio": 0.004221897979732603, + "completion_length": 736.2321624755859, + "epoch": 0.011566624691012547, + "grad_norm": 0.07061327993869781, + "kl": 0.0357666015625, + "learning_rate": 9.97585373421419e-06, + "loss": -0.0, + "num_tokens": 7839004.0, + "reward": 0.45445627719163895, + "reward_std": 0.19946808042004704, + "rewards/code_reward": 0.30981343029998243, + "rewards/format_reward": 1.4464285969734192, + "step": 310 + }, + { + "clip_ratio": 0.0036802791291847825, + "epoch": 0.0116039363835642, + "grad_norm": 0.06927018612623215, + "kl": 0.034912109375, + "learning_rate": 9.975551237137456e-06, + "loss": -0.0004, + "step": 311 + }, + { + "clip_ratio": 0.004154088266659528, + "epoch": 0.011641248076115853, + "grad_norm": 0.06549149006605148, + "kl": 0.033721923828125, + "learning_rate": 9.975246862202558e-06, + "loss": -0.0004, + "step": 312 + }, + { + "clip_ratio": 0.0033532975357957184, + "completion_length": 738.3393402099609, + "epoch": 0.011678559768667506, + "grad_norm": 0.07512947171926498, + "kl": 0.0394287109375, + "learning_rate": 9.974940609537209e-06, + "loss": 0.0346, + "num_tokens": 7918879.0, + "reward": 0.6741632632911205, + "reward_std": 0.2646650765091181, + "rewards/code_reward": 0.5314846767578274, + "rewards/format_reward": 1.4267857074737549, + "step": 313 + }, + { + "clip_ratio": 0.0027966012130491436, + "epoch": 0.01171587146121916, + "grad_norm": 0.0746498629450798, + "kl": 0.037689208984375, + "learning_rate": 9.974632479269904e-06, + "loss": 0.0342, + "step": 314 + }, + { + "clip_ratio": 0.00278927682666108, + "epoch": 0.011753183153770812, + "grad_norm": 0.07165227830410004, + "kl": 0.038238525390625, + "learning_rate": 9.974322471529929e-06, + "loss": 0.0341, + "step": 315 + }, + { + "clip_ratio": 0.004201611154712737, + "completion_length": 624.5893096923828, + "epoch": 0.011790494846322466, + "grad_norm": 0.10360850393772125, + "kl": 0.052642822265625, + "learning_rate": 9.97401058644736e-06, + "loss": 0.0023, + "num_tokens": 7996248.0, + "reward": 0.7239807769656181, + "reward_std": 0.14663936104625463, + "rewards/code_reward": 0.5820164754986763, + "rewards/format_reward": 1.4196428656578064, + "step": 316 + }, + { + "clip_ratio": 0.004126974381506443, + "epoch": 0.01182780653887412, + "grad_norm": 0.1336342692375183, + "kl": 0.053955078125, + "learning_rate": 9.973696824153052e-06, + "loss": 0.0022, + "step": 317 + }, + { + "clip_ratio": 0.004279408836737275, + "epoch": 0.011865118231425774, + "grad_norm": 0.09911667555570602, + "kl": 0.05242919921875, + "learning_rate": 9.973381184778662e-06, + "loss": 0.002, + "step": 318 + }, + { + "clip_ratio": 0.0033795832423493266, + "completion_length": 679.5893173217773, + "epoch": 0.011902429923977427, + "grad_norm": 0.10870985686779022, + "kl": 0.04949951171875, + "learning_rate": 9.973063668456623e-06, + "loss": -0.0028, + "num_tokens": 8076137.0, + "reward": 0.6373305730521679, + "reward_std": 0.20241934061050415, + "rewards/code_reward": 0.4953662920743227, + "rewards/format_reward": 1.4196428656578064, + "step": 319 + }, + { + "clip_ratio": 0.003140786837320775, + "epoch": 0.01193974161652908, + "grad_norm": 0.08210621029138565, + "kl": 0.04876708984375, + "learning_rate": 9.972744275320156e-06, + "loss": -0.0029, + "step": 320 + }, + { + "clip_ratio": 0.003021313692443073, + "epoch": 0.011977053309080733, + "grad_norm": 0.08826208114624023, + "kl": 0.0482177734375, + "learning_rate": 9.972423005503277e-06, + "loss": -0.0032, + "step": 321 + }, + { + "clip_ratio": 0.003629481070674956, + "completion_length": 780.1071853637695, + "epoch": 0.012014365001632386, + "grad_norm": 0.08502256870269775, + "kl": 0.0233917236328125, + "learning_rate": 9.97209985914078e-06, + "loss": 0.0106, + "num_tokens": 8153927.0, + "reward": 0.6007840074598789, + "reward_std": 0.21200163662433624, + "rewards/code_reward": 0.45078397169709206, + "rewards/format_reward": 1.5, + "step": 322 + }, + { + "clip_ratio": 0.003686100128106773, + "epoch": 0.012051676694184039, + "grad_norm": 0.07484009116888046, + "kl": 0.023162841796875, + "learning_rate": 9.971774836368256e-06, + "loss": 0.0103, + "step": 323 + }, + { + "clip_ratio": 0.0034317022655159235, + "epoch": 0.012088988386735694, + "grad_norm": 0.07379964739084244, + "kl": 0.0230865478515625, + "learning_rate": 9.971447937322075e-06, + "loss": 0.0104, + "step": 324 + }, + { + "clip_ratio": 0.0033678209874778986, + "completion_length": 813.8393096923828, + "epoch": 0.012126300079287347, + "grad_norm": 0.08261311054229736, + "kl": 0.031158447265625, + "learning_rate": 9.971119162139401e-06, + "loss": 0.0096, + "num_tokens": 8236462.0, + "reward": 0.7037380635738373, + "reward_std": 0.245924293063581, + "rewards/code_reward": 0.5630237832665443, + "rewards/format_reward": 1.407142847776413, + "step": 325 + }, + { + "clip_ratio": 0.0033570644445717335, + "epoch": 0.012163611771839, + "grad_norm": 0.09359529614448547, + "kl": 0.030120849609375, + "learning_rate": 9.970788510958178e-06, + "loss": 0.0095, + "step": 326 + }, + { + "clip_ratio": 0.003198965569026768, + "epoch": 0.012200923464390653, + "grad_norm": 0.07044743746519089, + "kl": 0.031402587890625, + "learning_rate": 9.970455983917145e-06, + "loss": 0.0091, + "step": 327 + }, + { + "clip_ratio": 0.002765144221484661, + "completion_length": 728.1250305175781, + "epoch": 0.012238235156942306, + "grad_norm": 0.04425208643078804, + "kl": 0.030731201171875, + "learning_rate": 9.970121581155821e-06, + "loss": -0.0069, + "num_tokens": 8309759.0, + "reward": 0.44961485266685486, + "reward_std": 0.10954656451940536, + "rewards/code_reward": 0.30497198551893234, + "rewards/format_reward": 1.4464285671710968, + "step": 328 + }, + { + "clip_ratio": 0.0032077114447019994, + "epoch": 0.01227554684949396, + "grad_norm": 0.04088854044675827, + "kl": 0.03033447265625, + "learning_rate": 9.969785302814519e-06, + "loss": -0.0068, + "step": 329 + }, + { + "clip_ratio": 0.0027382729458622634, + "epoch": 0.012312858542045613, + "grad_norm": 0.03946107625961304, + "kl": 0.030426025390625, + "learning_rate": 9.96944714903433e-06, + "loss": -0.0069, + "step": 330 + }, + { + "clip_ratio": 0.0030313743627630174, + "completion_length": 662.3214569091797, + "epoch": 0.012350170234597268, + "grad_norm": 0.25426632165908813, + "kl": 0.220489501953125, + "learning_rate": 9.969107119957141e-06, + "loss": 0.0039, + "num_tokens": 8388009.0, + "reward": 0.5928571447730064, + "reward_std": 0.11981689091771841, + "rewards/code_reward": 0.4464285671710968, + "rewards/format_reward": 1.4642857313156128, + "step": 331 + }, + { + "clip_ratio": 0.0031967191607691348, + "epoch": 0.01238748192714892, + "grad_norm": 0.1225559264421463, + "kl": 0.073760986328125, + "learning_rate": 9.968765215725622e-06, + "loss": 0.0029, + "step": 332 + }, + { + "clip_ratio": 0.0036641339538618922, + "epoch": 0.012424793619700574, + "grad_norm": 0.22467811405658722, + "kl": 0.062255859375, + "learning_rate": 9.968421436483228e-06, + "loss": 0.0034, + "step": 333 + }, + { + "clip_ratio": 0.004948150599375367, + "completion_length": 733.982177734375, + "epoch": 0.012462105312252227, + "grad_norm": 0.07510225474834442, + "kl": 0.0360107421875, + "learning_rate": 9.968075782374201e-06, + "loss": 0.0009, + "num_tokens": 8455082.0, + "reward": 0.668910626322031, + "reward_std": 0.14747038017958403, + "rewards/code_reward": 0.524267740547657, + "rewards/format_reward": 1.4464285969734192, + "step": 334 + }, + { + "clip_ratio": 0.0043053856352344155, + "epoch": 0.01249941700480388, + "grad_norm": 0.06824354827404022, + "kl": 0.038360595703125, + "learning_rate": 9.967728253543574e-06, + "loss": 0.0008, + "step": 335 + }, + { + "clip_ratio": 0.004535709216725081, + "epoch": 0.012536728697355533, + "grad_norm": 0.06253088265657425, + "kl": 0.0391845703125, + "learning_rate": 9.967378850137162e-06, + "loss": 0.0003, + "step": 336 + }, + { + "clip_ratio": 0.00392182485666126, + "completion_length": 702.357177734375, + "epoch": 0.012574040389907188, + "grad_norm": 0.34059685468673706, + "kl": 0.293548583984375, + "learning_rate": 9.96702757230157e-06, + "loss": 0.0045, + "num_tokens": 8521798.0, + "reward": 0.47818340733647346, + "reward_std": 0.18864138063509017, + "rewards/code_reward": 0.3281833861838095, + "rewards/format_reward": 1.5, + "step": 337 + }, + { + "clip_ratio": 0.003795295604504645, + "epoch": 0.012611352082458841, + "grad_norm": 0.10051585733890533, + "kl": 0.08880615234375, + "learning_rate": 9.966674420184185e-06, + "loss": 0.0019, + "step": 338 + }, + { + "clip_ratio": 0.00418271980015561, + "epoch": 0.012648663775010494, + "grad_norm": 0.0810936689376831, + "kl": 0.040191650390625, + "learning_rate": 9.966319393933186e-06, + "loss": 0.0011, + "step": 339 + }, + { + "clip_ratio": 0.003456297388765961, + "completion_length": 592.5000381469727, + "epoch": 0.012685975467562147, + "grad_norm": 0.08174199610948563, + "kl": 0.041229248046875, + "learning_rate": 9.965962493697531e-06, + "loss": -0.0041, + "num_tokens": 8589684.0, + "reward": 0.5854461677372456, + "reward_std": 0.1866754274815321, + "rewards/code_reward": 0.43812474235892296, + "rewards/format_reward": 1.4732142984867096, + "step": 340 + }, + { + "clip_ratio": 0.0035702349268831313, + "epoch": 0.0127232871601138, + "grad_norm": 0.07917595654726028, + "kl": 0.0413818359375, + "learning_rate": 9.965603719626972e-06, + "loss": -0.0044, + "step": 341 + }, + { + "clip_ratio": 0.003406367148272693, + "epoch": 0.012760598852665454, + "grad_norm": 0.07737964391708374, + "kl": 0.044525146484375, + "learning_rate": 9.965243071872047e-06, + "loss": -0.0049, + "step": 342 + }, + { + "clip_ratio": 0.0038594387006014585, + "completion_length": 795.9821701049805, + "epoch": 0.012797910545217107, + "grad_norm": 0.0629672259092331, + "kl": 0.02880859375, + "learning_rate": 9.964880550584069e-06, + "loss": -0.0221, + "num_tokens": 8674943.0, + "reward": 0.4257022701203823, + "reward_std": 0.06337236403487623, + "rewards/code_reward": 0.2757022115984, + "rewards/format_reward": 1.5, + "step": 343 + }, + { + "clip_ratio": 0.004178298928309232, + "epoch": 0.012835222237768762, + "grad_norm": 0.06429681181907654, + "kl": 0.028656005859375, + "learning_rate": 9.964516155915152e-06, + "loss": -0.0221, + "step": 344 + }, + { + "clip_ratio": 0.003920155460946262, + "epoch": 0.012872533930320415, + "grad_norm": 0.05960626155138016, + "kl": 0.028411865234375, + "learning_rate": 9.964149888018187e-06, + "loss": -0.0222, + "step": 345 + }, + { + "clip_ratio": 0.0031817994313314557, + "completion_length": 619.5357437133789, + "epoch": 0.012909845622872068, + "grad_norm": 0.07873054593801498, + "kl": 0.04498291015625, + "learning_rate": 9.96378174704685e-06, + "loss": 0.0075, + "num_tokens": 8745001.0, + "reward": 0.8341180980205536, + "reward_std": 0.23752810433506966, + "rewards/code_reward": 0.6859038099646568, + "rewards/format_reward": 1.4821428656578064, + "step": 346 + }, + { + "clip_ratio": 0.002864739450160414, + "epoch": 0.012947157315423721, + "grad_norm": 0.24039849638938904, + "kl": 0.044830322265625, + "learning_rate": 9.963411733155612e-06, + "loss": 0.0072, + "step": 347 + }, + { + "clip_ratio": 0.0026126864831894636, + "epoch": 0.012984469007975374, + "grad_norm": 0.07050604373216629, + "kl": 0.044830322265625, + "learning_rate": 9.963039846499722e-06, + "loss": 0.0068, + "step": 348 + }, + { + "clip_ratio": 0.001769490074366331, + "completion_length": 445.51788330078125, + "epoch": 0.013021780700527027, + "grad_norm": 0.06162756681442261, + "kl": 0.02886962890625, + "learning_rate": 9.962666087235214e-06, + "loss": 0.0014, + "num_tokens": 8794786.0, + "reward": 1.0282467603683472, + "reward_std": 0.1916051283478737, + "rewards/code_reward": 0.8782467395067215, + "rewards/format_reward": 1.5, + "step": 349 + }, + { + "clip_ratio": 0.0014247017388697714, + "epoch": 0.01305909239307868, + "grad_norm": 0.05959663912653923, + "kl": 0.028961181640625, + "learning_rate": 9.962290455518914e-06, + "loss": 0.001, + "step": 350 + }, + { + "clip_ratio": 0.0017759694310370833, + "epoch": 0.013096404085630335, + "grad_norm": 0.05925162509083748, + "kl": 0.028472900390625, + "learning_rate": 9.961912951508428e-06, + "loss": 0.001, + "step": 351 + }, + { + "clip_ratio": 0.004170358821284026, + "completion_length": 662.1785888671875, + "epoch": 0.013133715778181988, + "grad_norm": 0.07206638157367706, + "kl": 0.040863037109375, + "learning_rate": 9.961533575362149e-06, + "loss": 0.0104, + "num_tokens": 8858342.0, + "reward": 0.38690752536058426, + "reward_std": 0.11152276676148176, + "rewards/code_reward": 0.24226465169340372, + "rewards/format_reward": 1.4464285969734192, + "step": 352 + }, + { + "clip_ratio": 0.004076501296367496, + "epoch": 0.013171027470733641, + "grad_norm": 0.06895661354064941, + "kl": 0.038116455078125, + "learning_rate": 9.961152327239259e-06, + "loss": 0.0105, + "step": 353 + }, + { + "clip_ratio": 0.004021344997454435, + "epoch": 0.013208339163285295, + "grad_norm": 0.06517249345779419, + "kl": 0.03912353515625, + "learning_rate": 9.96076920729972e-06, + "loss": 0.0097, + "step": 354 + }, + { + "clip_ratio": 0.002958056051284075, + "completion_length": 731.5893249511719, + "epoch": 0.013245650855836948, + "grad_norm": 0.07242201268672943, + "kl": 0.0306396484375, + "learning_rate": 9.960384215704284e-06, + "loss": -0.0067, + "num_tokens": 8931287.0, + "reward": 0.5920448377728462, + "reward_std": 0.13516366621479392, + "rewards/code_reward": 0.4513305281288922, + "rewards/format_reward": 1.407142847776413, + "step": 355 + }, + { + "clip_ratio": 0.0031350510544143617, + "epoch": 0.0132829625483886, + "grad_norm": 0.07069297879934311, + "kl": 0.031341552734375, + "learning_rate": 9.959997352614483e-06, + "loss": -0.0068, + "step": 356 + }, + { + "clip_ratio": 0.0027419382240623236, + "epoch": 0.013320274240940254, + "grad_norm": 0.0702805146574974, + "kl": 0.03253173828125, + "learning_rate": 9.959608618192642e-06, + "loss": -0.0074, + "step": 357 + }, + { + "clip_ratio": 0.00474268535617739, + "completion_length": 692.8928833007812, + "epoch": 0.013357585933491909, + "grad_norm": 2.727727174758911, + "kl": 1.927734375, + "learning_rate": 9.959218012601865e-06, + "loss": 0.0075, + "num_tokens": 9011823.0, + "reward": 0.39058488234877586, + "reward_std": 0.2148779910057783, + "rewards/code_reward": 0.24683486856520176, + "rewards/format_reward": 1.4375, + "step": 358 + }, + { + "clip_ratio": 0.005896827788092196, + "epoch": 0.013394897626043562, + "grad_norm": 0.7070395946502686, + "kl": 0.05780029296875, + "learning_rate": 9.95882553600604e-06, + "loss": -0.0055, + "step": 359 + }, + { + "clip_ratio": 0.006065872730687261, + "epoch": 0.013432209318595215, + "grad_norm": 1.1295528411865234, + "kl": 0.052001953125, + "learning_rate": 9.958431188569848e-06, + "loss": -0.002, + "step": 360 + }, + { + "clip_ratio": 0.00390944245737046, + "completion_length": 526.3750228881836, + "epoch": 0.013469521011146868, + "grad_norm": 0.0387735553085804, + "kl": 0.029083251953125, + "learning_rate": 9.958034970458746e-06, + "loss": -0.0026, + "num_tokens": 9064750.0, + "reward": 0.515406858175993, + "reward_std": 0.04423747956752777, + "rewards/code_reward": 0.3654068447649479, + "rewards/format_reward": 1.5, + "step": 361 + }, + { + "clip_ratio": 0.0031672436743974686, + "epoch": 0.013506832703698521, + "grad_norm": 0.0375300794839859, + "kl": 0.029815673828125, + "learning_rate": 9.95763688183898e-06, + "loss": -0.0026, + "step": 362 + }, + { + "clip_ratio": 0.0038860125932842493, + "epoch": 0.013544144396250174, + "grad_norm": 0.036534346640110016, + "kl": 0.029022216796875, + "learning_rate": 9.957236922877584e-06, + "loss": -0.0028, + "step": 363 + }, + { + "clip_ratio": 0.003631035448051989, + "completion_length": 809.8393096923828, + "epoch": 0.013581456088801827, + "grad_norm": 0.11062806844711304, + "kl": 0.03424072265625, + "learning_rate": 9.956835093742368e-06, + "loss": 0.0065, + "num_tokens": 9156889.0, + "reward": 0.7775120139122009, + "reward_std": 0.2702966872602701, + "rewards/code_reward": 0.631976280361414, + "rewards/format_reward": 1.4553571343421936, + "step": 364 + }, + { + "clip_ratio": 0.003152924997266382, + "epoch": 0.013618767781353482, + "grad_norm": 0.07130417972803116, + "kl": 0.03607177734375, + "learning_rate": 9.956431394601938e-06, + "loss": 0.0062, + "step": 365 + }, + { + "clip_ratio": 0.0032727839425206184, + "epoch": 0.013656079473905135, + "grad_norm": 0.06931007653474808, + "kl": 0.037109375, + "learning_rate": 9.956025825625673e-06, + "loss": 0.0061, + "step": 366 + }, + { + "clip_ratio": 0.0033742024679668248, + "completion_length": 915.2857666015625, + "epoch": 0.013693391166456789, + "grad_norm": 0.07338181138038635, + "kl": 0.05810546875, + "learning_rate": 9.955618386983745e-06, + "loss": -0.014, + "num_tokens": 9255437.0, + "reward": 0.549513004720211, + "reward_std": 0.17068626638501883, + "rewards/code_reward": 0.4048701226711273, + "rewards/format_reward": 1.4464285969734192, + "step": 367 + }, + { + "clip_ratio": 0.003724192560184747, + "epoch": 0.013730702859008442, + "grad_norm": 0.07208924740552902, + "kl": 0.0594482421875, + "learning_rate": 9.95520907884711e-06, + "loss": -0.014, + "step": 368 + }, + { + "clip_ratio": 0.003907369857188314, + "epoch": 0.013768014551560095, + "grad_norm": 0.06576503813266754, + "kl": 0.05511474609375, + "learning_rate": 9.9547979013875e-06, + "loss": -0.0143, + "step": 369 + }, + { + "clip_ratio": 0.003866637183818966, + "completion_length": 581.6607437133789, + "epoch": 0.013805326244111748, + "grad_norm": 0.1014467403292656, + "kl": 0.04595947265625, + "learning_rate": 9.954384854777444e-06, + "loss": 0.0068, + "num_tokens": 9313812.0, + "reward": 0.5125850588083267, + "reward_std": 0.1240949840284884, + "rewards/code_reward": 0.36437076330184937, + "rewards/format_reward": 1.4821428656578064, + "step": 370 + }, + { + "clip_ratio": 0.004086767847184092, + "epoch": 0.013842637936663403, + "grad_norm": 0.09413934499025345, + "kl": 0.04803466796875, + "learning_rate": 9.953969939190244e-06, + "loss": 0.0064, + "step": 371 + }, + { + "clip_ratio": 0.004614671866875142, + "epoch": 0.013879949629215056, + "grad_norm": 0.08039746433496475, + "kl": 0.04833984375, + "learning_rate": 9.953553154799993e-06, + "loss": 0.0061, + "step": 372 + }, + { + "clip_ratio": 0.0027823506388813257, + "completion_length": 584.3035888671875, + "epoch": 0.013917261321766709, + "grad_norm": 0.07648629695177078, + "kl": 0.044403076171875, + "learning_rate": 9.953134501781567e-06, + "loss": 0.0135, + "num_tokens": 9380049.0, + "reward": 0.916169136762619, + "reward_std": 0.27179497107863426, + "rewards/code_reward": 0.7688477113842964, + "rewards/format_reward": 1.4732142984867096, + "step": 373 + }, + { + "clip_ratio": 0.0027717198827303946, + "epoch": 0.013954573014318362, + "grad_norm": 0.07330768555402756, + "kl": 0.046844482421875, + "learning_rate": 9.952713980310621e-06, + "loss": 0.0134, + "step": 374 + }, + { + "clip_ratio": 0.0027018976397812366, + "epoch": 0.013991884706870015, + "grad_norm": 0.07288599759340286, + "kl": 0.043487548828125, + "learning_rate": 9.952291590563604e-06, + "loss": 0.0131, + "step": 375 + }, + { + "clip_ratio": 0.002986648993100971, + "completion_length": 698.6785888671875, + "epoch": 0.014029196399421668, + "grad_norm": 0.0867164134979248, + "kl": 0.056396484375, + "learning_rate": 9.951867332717737e-06, + "loss": 0.0315, + "num_tokens": 9450703.0, + "reward": 0.7611997202038765, + "reward_std": 0.1854964615777135, + "rewards/code_reward": 0.6211996953934431, + "rewards/format_reward": 1.4000000059604645, + "step": 376 + }, + { + "clip_ratio": 0.0028120041242800653, + "epoch": 0.014066508091973321, + "grad_norm": 0.08439499139785767, + "kl": 0.049102783203125, + "learning_rate": 9.951441206951033e-06, + "loss": 0.0313, + "step": 377 + }, + { + "clip_ratio": 0.002777107059955597, + "epoch": 0.014103819784524976, + "grad_norm": 0.07913133502006531, + "kl": 0.04949951171875, + "learning_rate": 9.951013213442288e-06, + "loss": 0.0308, + "step": 378 + }, + { + "clip_ratio": 0.004699056968092918, + "completion_length": 612.8928833007812, + "epoch": 0.01414113147707663, + "grad_norm": 0.07622236013412476, + "kl": 0.061676025390625, + "learning_rate": 9.950583352371077e-06, + "loss": -0.0003, + "num_tokens": 9512647.0, + "reward": 0.28610390797257423, + "reward_std": 0.20798139087855816, + "rewards/code_reward": 0.1461038962006569, + "rewards/format_reward": 1.4000000059604645, + "step": 379 + }, + { + "clip_ratio": 0.004345426918007433, + "epoch": 0.014178443169628283, + "grad_norm": 0.08017177134752274, + "kl": 0.061187744140625, + "learning_rate": 9.950151623917765e-06, + "loss": -0.0003, + "step": 380 + }, + { + "clip_ratio": 0.003987404692452401, + "epoch": 0.014215754862179936, + "grad_norm": 0.07087519764900208, + "kl": 0.0584716796875, + "learning_rate": 9.949718028263495e-06, + "loss": -0.0008, + "step": 381 + }, + { + "clip_ratio": 0.0031114821322262287, + "completion_length": 648.8571701049805, + "epoch": 0.014253066554731589, + "grad_norm": 0.06101280450820923, + "kl": 0.031585693359375, + "learning_rate": 9.949282565590196e-06, + "loss": -0.0001, + "num_tokens": 9580455.0, + "reward": 0.5653911642730236, + "reward_std": 0.1449222546070814, + "rewards/code_reward": 0.41539115458726883, + "rewards/format_reward": 1.5, + "step": 382 + }, + { + "clip_ratio": 0.0032078091171570122, + "epoch": 0.014290378247283242, + "grad_norm": 0.07388651371002197, + "kl": 0.031219482421875, + "learning_rate": 9.948845236080581e-06, + "loss": -0.0002, + "step": 383 + }, + { + "clip_ratio": 0.0034707450540736318, + "epoch": 0.014327689939834895, + "grad_norm": 0.055996738374233246, + "kl": 0.0316162109375, + "learning_rate": 9.948406039918146e-06, + "loss": -0.0003, + "step": 384 + }, + { + "clip_ratio": 0.00368214346235618, + "completion_length": 921.7857666015625, + "epoch": 0.01436500163238655, + "grad_norm": 0.07185804843902588, + "kl": 0.021392822265625, + "learning_rate": 9.947964977287169e-06, + "loss": -0.0012, + "num_tokens": 9662007.0, + "reward": 0.5069339983165264, + "reward_std": 0.2553109973669052, + "rewards/code_reward": 0.35693395510315895, + "rewards/format_reward": 1.5, + "step": 385 + }, + { + "clip_ratio": 0.0036343716783449054, + "epoch": 0.014402313324938203, + "grad_norm": 0.07071929425001144, + "kl": 0.021636962890625, + "learning_rate": 9.94752204837271e-06, + "loss": -0.0013, + "step": 386 + }, + { + "clip_ratio": 0.003497172030620277, + "epoch": 0.014439625017489856, + "grad_norm": 0.06906618922948837, + "kl": 0.02166748046875, + "learning_rate": 9.947077253360611e-06, + "loss": -0.0015, + "step": 387 + }, + { + "clip_ratio": 0.002348303736653179, + "completion_length": 837.482177734375, + "epoch": 0.01447693671004151, + "grad_norm": 0.06999661773443222, + "kl": 0.027191162109375, + "learning_rate": 9.946630592437508e-06, + "loss": -0.0139, + "num_tokens": 9743794.0, + "reward": 0.7702276110649109, + "reward_std": 0.2618505507707596, + "rewards/code_reward": 0.6221919134259224, + "rewards/format_reward": 1.480357140302658, + "step": 388 + }, + { + "clip_ratio": 0.0025491046835668385, + "epoch": 0.014514248402593162, + "grad_norm": 0.11652582138776779, + "kl": 0.027496337890625, + "learning_rate": 9.946182065790805e-06, + "loss": -0.0141, + "step": 389 + }, + { + "clip_ratio": 0.0024820746039040387, + "epoch": 0.014551560095144815, + "grad_norm": 0.061837829649448395, + "kl": 0.02642822265625, + "learning_rate": 9.945731673608698e-06, + "loss": -0.0142, + "step": 390 + }, + { + "clip_ratio": 0.0024589813547208905, + "completion_length": 477.3393096923828, + "epoch": 0.014588871787696469, + "grad_norm": 0.06738464534282684, + "kl": 0.05126953125, + "learning_rate": 9.945279416080163e-06, + "loss": 0.0194, + "num_tokens": 9789705.0, + "reward": 0.912500012665987, + "reward_std": 0.07834326848387718, + "rewards/code_reward": 0.7678571436554193, + "rewards/format_reward": 1.4464285969734192, + "step": 391 + }, + { + "clip_ratio": 0.0020776722230948508, + "epoch": 0.014626183480248123, + "grad_norm": 0.09088516980409622, + "kl": 0.05023193359375, + "learning_rate": 9.944825293394959e-06, + "loss": 0.0191, + "step": 392 + }, + { + "clip_ratio": 0.00268349121324718, + "epoch": 0.014663495172799777, + "grad_norm": 0.06426964700222015, + "kl": 0.0540771484375, + "learning_rate": 9.944369305743628e-06, + "loss": 0.0192, + "step": 393 + }, + { + "clip_ratio": 0.0045898022362962365, + "completion_length": 668.9464569091797, + "epoch": 0.01470080686535143, + "grad_norm": 0.09085007011890411, + "kl": 0.03021240234375, + "learning_rate": 9.943911453317491e-06, + "loss": 0.0207, + "num_tokens": 9850420.0, + "reward": 0.5988007001578808, + "reward_std": 0.1362283470807597, + "rewards/code_reward": 0.45326495775952935, + "rewards/format_reward": 1.455357164144516, + "step": 394 + }, + { + "clip_ratio": 0.004354513715952635, + "epoch": 0.014738118557903083, + "grad_norm": 0.09129197895526886, + "kl": 0.029510498046875, + "learning_rate": 9.94345173630866e-06, + "loss": 0.0206, + "step": 395 + }, + { + "clip_ratio": 0.004549291101284325, + "epoch": 0.014775430250454736, + "grad_norm": 0.08793345093727112, + "kl": 0.030242919921875, + "learning_rate": 9.942990154910016e-06, + "loss": 0.0201, + "step": 396 + }, + { + "clip_ratio": 0.00444439280545339, + "completion_length": 612.8571701049805, + "epoch": 0.014812741943006389, + "grad_norm": 0.03823050484061241, + "kl": 0.034423828125, + "learning_rate": 9.942526709315235e-06, + "loss": -0.004, + "num_tokens": 9913524.0, + "reward": 0.3007868118584156, + "reward_std": 0.09787388145923615, + "rewards/code_reward": 0.15346534550189972, + "rewards/format_reward": 1.4732142984867096, + "step": 397 + }, + { + "clip_ratio": 0.004718466138001531, + "epoch": 0.014850053635558042, + "grad_norm": 0.03618098422884941, + "kl": 0.033477783203125, + "learning_rate": 9.94206139971877e-06, + "loss": -0.004, + "step": 398 + }, + { + "clip_ratio": 0.0044235565583221614, + "epoch": 0.014887365328109697, + "grad_norm": 0.03586447983980179, + "kl": 0.033538818359375, + "learning_rate": 9.941594226315855e-06, + "loss": -0.0041, + "step": 399 + }, + { + "clip_ratio": 0.001658598834183067, + "completion_length": 579.928596496582, + "epoch": 0.01492467702066135, + "grad_norm": 0.06391527503728867, + "kl": 0.04730224609375, + "learning_rate": 9.941125189302508e-06, + "loss": 0.0008, + "num_tokens": 9980202.0, + "reward": 1.0562499910593033, + "reward_std": 0.13697139732539654, + "rewards/code_reward": 0.9107142835855484, + "rewards/format_reward": 1.455357164144516, + "step": 400 + }, + { + "clip_ratio": 0.0011551998322829604, + "epoch": 0.014961988713213003, + "grad_norm": 0.0650881752371788, + "kl": 0.05810546875, + "learning_rate": 9.94065428887553e-06, + "loss": 0.0007, + "step": 401 + }, + { + "clip_ratio": 0.0015275728073902428, + "epoch": 0.014999300405764656, + "grad_norm": 0.05484443157911301, + "kl": 0.047607421875, + "learning_rate": 9.940181525232498e-06, + "loss": 0.0007, + "step": 402 + }, + { + "clip_ratio": 0.0020619715796783566, + "completion_length": 564.3750305175781, + "epoch": 0.01503661209831631, + "grad_norm": 0.07248856127262115, + "kl": 0.052734375, + "learning_rate": 9.939706898571777e-06, + "loss": -0.0001, + "num_tokens": 10042551.0, + "reward": 0.8395563364028931, + "reward_std": 0.1875942200422287, + "rewards/code_reward": 0.6915206164121628, + "rewards/format_reward": 1.480357140302658, + "step": 403 + }, + { + "clip_ratio": 0.0020268808002583683, + "epoch": 0.015073923790867963, + "grad_norm": 0.06597252935171127, + "kl": 0.04302978515625, + "learning_rate": 9.939230409092512e-06, + "loss": -0.0004, + "step": 404 + }, + { + "clip_ratio": 0.0020155744277872145, + "epoch": 0.015111235483419617, + "grad_norm": 0.06373067200183868, + "kl": 0.0391845703125, + "learning_rate": 9.938752056994629e-06, + "loss": -0.0006, + "step": 405 + }, + { + "clip_ratio": 0.003512552648317069, + "completion_length": 749.6428833007812, + "epoch": 0.01514854717597127, + "grad_norm": 0.06110711768269539, + "kl": 0.023895263671875, + "learning_rate": 9.938271842478834e-06, + "loss": 0.0045, + "num_tokens": 10122849.0, + "reward": 0.458571445196867, + "reward_std": 0.0795586840249598, + "rewards/code_reward": 0.3125, + "rewards/format_reward": 1.4607142806053162, + "step": 406 + }, + { + "clip_ratio": 0.0033902206923812628, + "epoch": 0.015185858868522924, + "grad_norm": 0.057129230350255966, + "kl": 0.024658203125, + "learning_rate": 9.937789765746619e-06, + "loss": 0.0044, + "step": 407 + }, + { + "clip_ratio": 0.0033691705320961773, + "epoch": 0.015223170561074577, + "grad_norm": 0.05691225454211235, + "kl": 0.024993896484375, + "learning_rate": 9.93730582700025e-06, + "loss": 0.0044, + "step": 408 + }, + { + "clip_ratio": 0.003931487328372896, + "completion_length": 698.607177734375, + "epoch": 0.01526048225362623, + "grad_norm": 0.07090039551258087, + "kl": 0.0289306640625, + "learning_rate": 9.936820026442784e-06, + "loss": -0.0048, + "num_tokens": 10191175.0, + "reward": 0.2851525731384754, + "reward_std": 0.19832610385492444, + "rewards/code_reward": 0.1437239907681942, + "rewards/format_reward": 1.4142856895923615, + "step": 409 + }, + { + "clip_ratio": 0.0035225211177021265, + "epoch": 0.015297793946177883, + "grad_norm": 0.06841059029102325, + "kl": 0.03094482421875, + "learning_rate": 9.936332364278051e-06, + "loss": -0.0052, + "step": 410 + }, + { + "clip_ratio": 0.0034863606560975313, + "epoch": 0.015335105638729536, + "grad_norm": 0.06872402876615524, + "kl": 0.030853271484375, + "learning_rate": 9.935842840710665e-06, + "loss": -0.0051, + "step": 411 + }, + { + "clip_ratio": 0.003151845303364098, + "completion_length": 798.1785888671875, + "epoch": 0.015372417331281191, + "grad_norm": 0.11005644500255585, + "kl": 0.027984619140625, + "learning_rate": 9.93535145594602e-06, + "loss": 0.0095, + "num_tokens": 10276711.0, + "reward": 0.525582380592823, + "reward_std": 0.214183047413826, + "rewards/code_reward": 0.3827252183109522, + "rewards/format_reward": 1.4285714328289032, + "step": 412 + }, + { + "clip_ratio": 0.003145965514704585, + "epoch": 0.015409729023832844, + "grad_norm": 0.07595289498567581, + "kl": 0.02801513671875, + "learning_rate": 9.934858210190296e-06, + "loss": 0.0094, + "step": 413 + }, + { + "clip_ratio": 0.00308082188712433, + "epoch": 0.015447040716384497, + "grad_norm": 0.07384956628084183, + "kl": 0.0274658203125, + "learning_rate": 9.934363103650446e-06, + "loss": 0.0091, + "step": 414 + }, + { + "clip_ratio": 0.0051908541936427355, + "completion_length": 821.3036193847656, + "epoch": 0.01548435240893615, + "grad_norm": 0.0528886653482914, + "kl": 0.023590087890625, + "learning_rate": 9.933866136534208e-06, + "loss": 0.0039, + "num_tokens": 10353028.0, + "reward": 0.227678582072258, + "reward_std": 0.13318027555942535, + "rewards/code_reward": 0.08035714365541935, + "rewards/format_reward": 1.4732142984867096, + "step": 415 + }, + { + "clip_ratio": 0.005163317313417792, + "epoch": 0.015521664101487804, + "grad_norm": 0.05180484801530838, + "kl": 0.023590087890625, + "learning_rate": 9.933367309050102e-06, + "loss": 0.0038, + "step": 416 + }, + { + "clip_ratio": 0.005080681527033448, + "epoch": 0.015558975794039457, + "grad_norm": 0.05020344257354736, + "kl": 0.023834228515625, + "learning_rate": 9.932866621407424e-06, + "loss": 0.0036, + "step": 417 + }, + { + "clip_ratio": 0.00461467495188117, + "completion_length": 761.2143096923828, + "epoch": 0.01559628748659111, + "grad_norm": 0.08510478585958481, + "kl": 0.03594970703125, + "learning_rate": 9.932364073816255e-06, + "loss": -0.0268, + "num_tokens": 10433190.0, + "reward": 0.2453259564936161, + "reward_std": 0.14877348579466343, + "rewards/code_reward": 0.09532596310600638, + "rewards/format_reward": 1.5, + "step": 418 + }, + { + "clip_ratio": 0.004508159472607076, + "epoch": 0.015633599179142765, + "grad_norm": 0.08183611929416656, + "kl": 0.034881591796875, + "learning_rate": 9.931859666487454e-06, + "loss": -0.0269, + "step": 419 + }, + { + "clip_ratio": 0.004110742884222418, + "epoch": 0.015670910871694418, + "grad_norm": 0.07924588024616241, + "kl": 0.03515625, + "learning_rate": 9.931353399632661e-06, + "loss": -0.0273, + "step": 420 + }, + { + "clip_ratio": 0.0038677387055940926, + "completion_length": 677.6964569091797, + "epoch": 0.01570822256424607, + "grad_norm": 0.09528189897537231, + "kl": 0.0260009765625, + "learning_rate": 9.930845273464299e-06, + "loss": 0.0028, + "num_tokens": 10501479.0, + "reward": 0.41650721430778503, + "reward_std": 0.3949483707547188, + "rewards/code_reward": 0.2749000545591116, + "rewards/format_reward": 1.4160714149475098, + "step": 421 + }, + { + "clip_ratio": 0.004169431806076318, + "epoch": 0.015745534256797724, + "grad_norm": 0.09049475193023682, + "kl": 0.025634765625, + "learning_rate": 9.930335288195564e-06, + "loss": 0.0025, + "step": 422 + }, + { + "clip_ratio": 0.004037536273244768, + "epoch": 0.015782845949349377, + "grad_norm": 0.09991448372602463, + "kl": 0.02447509765625, + "learning_rate": 9.929823444040437e-06, + "loss": 0.0022, + "step": 423 + }, + { + "clip_ratio": 0.00423276872606948, + "completion_length": 640.5893096923828, + "epoch": 0.01582015764190103, + "grad_norm": 0.09079701453447342, + "kl": 0.04876708984375, + "learning_rate": 9.929309741213681e-06, + "loss": 0.0114, + "num_tokens": 10569230.0, + "reward": 0.3278161697089672, + "reward_std": 0.1507175052538514, + "rewards/code_reward": 0.18781614862382412, + "rewards/format_reward": 1.4000000059604645, + "step": 424 + }, + { + "clip_ratio": 0.003851137706078589, + "epoch": 0.015857469334452683, + "grad_norm": 0.0902690663933754, + "kl": 0.048248291015625, + "learning_rate": 9.928794179930836e-06, + "loss": 0.0108, + "step": 425 + }, + { + "clip_ratio": 0.0038802978524472564, + "epoch": 0.015894781027004336, + "grad_norm": 0.08508943021297455, + "kl": 0.04864501953125, + "learning_rate": 9.928276760408218e-06, + "loss": 0.0106, + "step": 426 + }, + { + "clip_ratio": 0.004512152750976384, + "completion_length": 793.1071624755859, + "epoch": 0.01593209271955599, + "grad_norm": 0.08441104739904404, + "kl": 0.03680419921875, + "learning_rate": 9.92775748286293e-06, + "loss": 0.0094, + "num_tokens": 10662210.0, + "reward": 0.4173332266509533, + "reward_std": 0.31858190055936575, + "rewards/code_reward": 0.27536892145872116, + "rewards/format_reward": 1.4196428656578064, + "step": 427 + }, + { + "clip_ratio": 0.0044147634180262685, + "epoch": 0.015969404412107643, + "grad_norm": 0.08109438419342041, + "kl": 0.036224365234375, + "learning_rate": 9.927236347512848e-06, + "loss": 0.0095, + "step": 428 + }, + { + "clip_ratio": 0.0043067128281109035, + "epoch": 0.0160067161046593, + "grad_norm": 0.08332722634077072, + "kl": 0.037261962890625, + "learning_rate": 9.926713354576636e-06, + "loss": 0.0089, + "step": 429 + }, + { + "clip_ratio": 0.0051824128604494035, + "completion_length": 542.9285888671875, + "epoch": 0.016044027797210952, + "grad_norm": 0.08738179504871368, + "kl": 0.037506103515625, + "learning_rate": 9.92618850427373e-06, + "loss": 0.0115, + "num_tokens": 10733984.0, + "reward": 0.6523956321179867, + "reward_std": 0.054231652058660984, + "rewards/code_reward": 0.5086455810815096, + "rewards/format_reward": 1.4375, + "step": 430 + }, + { + "clip_ratio": 0.004930829687509686, + "epoch": 0.016081339489762606, + "grad_norm": 0.0860501229763031, + "kl": 0.03814697265625, + "learning_rate": 9.925661796824345e-06, + "loss": 0.0113, + "step": 431 + }, + { + "clip_ratio": 0.0054250991670414805, + "epoch": 0.01611865118231426, + "grad_norm": 0.07886748015880585, + "kl": 0.03717041015625, + "learning_rate": 9.925133232449478e-06, + "loss": 0.0111, + "step": 432 + }, + { + "clip_ratio": 0.004473858396522701, + "completion_length": 648.053596496582, + "epoch": 0.016155962874865912, + "grad_norm": 0.08482695370912552, + "kl": 0.03021240234375, + "learning_rate": 9.924602811370909e-06, + "loss": 0.0167, + "num_tokens": 10801999.0, + "reward": 0.6000030338764191, + "reward_std": 0.24791091680526733, + "rewards/code_reward": 0.4616101458668709, + "rewards/format_reward": 1.3839285969734192, + "step": 433 + }, + { + "clip_ratio": 0.004425858613103628, + "epoch": 0.016193274567417565, + "grad_norm": 0.08653348684310913, + "kl": 0.03228759765625, + "learning_rate": 9.924070533811188e-06, + "loss": 0.0163, + "step": 434 + }, + { + "clip_ratio": 0.003894850437063724, + "epoch": 0.016230586259969218, + "grad_norm": 0.08413336426019669, + "kl": 0.03094482421875, + "learning_rate": 9.923536399993651e-06, + "loss": 0.016, + "step": 435 + }, + { + "clip_ratio": 0.003677779226563871, + "completion_length": 682.2143249511719, + "epoch": 0.01626789795252087, + "grad_norm": 0.0823432132601738, + "kl": 0.02532958984375, + "learning_rate": 9.923000410142411e-06, + "loss": -0.0051, + "num_tokens": 10865151.0, + "reward": 0.581865455955267, + "reward_std": 0.17507321760058403, + "rewards/code_reward": 0.44436544133350253, + "rewards/format_reward": 1.375, + "step": 436 + }, + { + "clip_ratio": 0.0037146799731999636, + "epoch": 0.016305209645072524, + "grad_norm": 0.08024854212999344, + "kl": 0.02581787109375, + "learning_rate": 9.922462564482356e-06, + "loss": -0.0051, + "step": 437 + }, + { + "clip_ratio": 0.00352960464078933, + "epoch": 0.016342521337624177, + "grad_norm": 0.07456513494253159, + "kl": 0.025543212890625, + "learning_rate": 9.92192286323916e-06, + "loss": -0.0056, + "step": 438 + }, + { + "clip_ratio": 0.0033312486484646797, + "completion_length": 590.1964721679688, + "epoch": 0.01637983303017583, + "grad_norm": 0.047322433441877365, + "kl": 0.022918701171875, + "learning_rate": 9.921381306639272e-06, + "loss": -0.0007, + "num_tokens": 10925222.0, + "reward": 0.5136204734444618, + "reward_std": 0.07688190042972565, + "rewards/code_reward": 0.36362045258283615, + "rewards/format_reward": 1.5, + "step": 439 + }, + { + "clip_ratio": 0.002714859030675143, + "epoch": 0.016417144722727484, + "grad_norm": 0.04758286848664284, + "kl": 0.023162841796875, + "learning_rate": 9.920837894909915e-06, + "loss": -0.0009, + "step": 440 + }, + { + "clip_ratio": 0.0030459932750090957, + "epoch": 0.016454456415279137, + "grad_norm": 0.045507773756980896, + "kl": 0.023223876953125, + "learning_rate": 9.9202926282791e-06, + "loss": -0.0011, + "step": 441 + }, + { + "clip_ratio": 0.003687795309815556, + "completion_length": 632.5535888671875, + "epoch": 0.01649176810783079, + "grad_norm": 0.08772286027669907, + "kl": 0.02545166015625, + "learning_rate": 9.919745506975606e-06, + "loss": 0.0067, + "num_tokens": 10993475.0, + "reward": 0.7611264325678349, + "reward_std": 0.18488734355196357, + "rewards/code_reward": 0.6182692646980286, + "rewards/format_reward": 1.4285714626312256, + "step": 442 + }, + { + "clip_ratio": 0.003384930023457855, + "epoch": 0.016529079800382446, + "grad_norm": 0.08102830499410629, + "kl": 0.027252197265625, + "learning_rate": 9.919196531228995e-06, + "loss": 0.0065, + "step": 443 + }, + { + "clip_ratio": 0.003738873405382037, + "epoch": 0.0165663914929341, + "grad_norm": 0.07707776874303818, + "kl": 0.02587890625, + "learning_rate": 9.918645701269612e-06, + "loss": 0.0064, + "step": 444 + }, + { + "clip_ratio": 0.0032231859513558447, + "completion_length": 794.4107360839844, + "epoch": 0.016603703185485753, + "grad_norm": 0.07514365762472153, + "kl": 0.036346435546875, + "learning_rate": 9.91809301732857e-06, + "loss": -0.0128, + "num_tokens": 11079710.0, + "reward": 0.7641697227954865, + "reward_std": 0.40273065865039825, + "rewards/code_reward": 0.6232768222689629, + "rewards/format_reward": 1.4089286029338837, + "step": 445 + }, + { + "clip_ratio": 0.0032355415751226246, + "epoch": 0.016641014878037406, + "grad_norm": 0.0747017115354538, + "kl": 0.037322998046875, + "learning_rate": 9.917538479637767e-06, + "loss": -0.0131, + "step": 446 + }, + { + "clip_ratio": 0.002708603657083586, + "epoch": 0.01667832657058906, + "grad_norm": 0.07047252357006073, + "kl": 0.036468505859375, + "learning_rate": 9.916982088429875e-06, + "loss": -0.0134, + "step": 447 + }, + { + "clip_ratio": 0.004563135444186628, + "completion_length": 586.9107360839844, + "epoch": 0.016715638263140712, + "grad_norm": 0.10933307558298111, + "kl": 0.029754638671875, + "learning_rate": 9.916423843938346e-06, + "loss": 0.0076, + "num_tokens": 11144329.0, + "reward": 0.3603978231549263, + "reward_std": 0.2925555817782879, + "rewards/code_reward": 0.21843352541327477, + "rewards/format_reward": 1.4196428656578064, + "step": 448 + }, + { + "clip_ratio": 0.004497806774452329, + "epoch": 0.016752949955692365, + "grad_norm": 0.28405052423477173, + "kl": 0.02935791015625, + "learning_rate": 9.915863746397413e-06, + "loss": 0.0071, + "step": 449 + }, + { + "clip_ratio": 0.0043468535877764225, + "epoch": 0.016790261648244018, + "grad_norm": 0.08661922812461853, + "kl": 0.0299072265625, + "learning_rate": 9.915301796042076e-06, + "loss": 0.0067, + "step": 450 + }, + { + "clip_ratio": 0.00405640626559034, + "completion_length": 520.9464569091797, + "epoch": 0.01682757334079567, + "grad_norm": 0.2036890834569931, + "kl": 0.045196533203125, + "learning_rate": 9.914737993108124e-06, + "loss": 0.0156, + "num_tokens": 11203602.0, + "reward": 0.9503007680177689, + "reward_std": 0.26499425573274493, + "rewards/code_reward": 0.8029793351888657, + "rewards/format_reward": 1.4732142984867096, + "step": 451 + }, + { + "clip_ratio": 0.004486247024033219, + "epoch": 0.016864885033347325, + "grad_norm": 0.10296270251274109, + "kl": 0.0477294921875, + "learning_rate": 9.914172337832116e-06, + "loss": 0.0155, + "step": 452 + }, + { + "clip_ratio": 0.004452614986803383, + "epoch": 0.016902196725898978, + "grad_norm": 0.0948900505900383, + "kl": 0.043182373046875, + "learning_rate": 9.913604830451392e-06, + "loss": 0.0148, + "step": 453 + }, + { + "clip_ratio": 0.003603132034186274, + "completion_length": 581.8928833007812, + "epoch": 0.01693950841845063, + "grad_norm": 0.12643904983997345, + "kl": 0.02899169921875, + "learning_rate": 9.913035471204065e-06, + "loss": 0.014, + "num_tokens": 11261934.0, + "reward": 0.6735501512885094, + "reward_std": 0.3648451864719391, + "rewards/code_reward": 0.5280144102871418, + "rewards/format_reward": 1.455357164144516, + "step": 454 + }, + { + "clip_ratio": 0.004292336816433817, + "epoch": 0.016976820111002284, + "grad_norm": 0.09490080922842026, + "kl": 0.02899169921875, + "learning_rate": 9.912464260329029e-06, + "loss": 0.0134, + "step": 455 + }, + { + "clip_ratio": 0.0037939391913823783, + "epoch": 0.01701413180355394, + "grad_norm": 0.08549856394529343, + "kl": 0.0291595458984375, + "learning_rate": 9.911891198065952e-06, + "loss": 0.0131, + "step": 456 + }, + { + "clip_ratio": 0.004818867484573275, + "completion_length": 801.3214721679688, + "epoch": 0.017051443496105594, + "grad_norm": 0.08996160328388214, + "kl": 0.030792236328125, + "learning_rate": 9.911316284655283e-06, + "loss": -0.0105, + "num_tokens": 11343460.0, + "reward": 0.4126952104270458, + "reward_std": 0.3576683634892106, + "rewards/code_reward": 0.2662666258402169, + "rewards/format_reward": 1.4642857313156128, + "step": 457 + }, + { + "clip_ratio": 0.005110301659442484, + "epoch": 0.017088755188657247, + "grad_norm": 0.08698266744613647, + "kl": 0.03277587890625, + "learning_rate": 9.910739520338243e-06, + "loss": -0.0104, + "step": 458 + }, + { + "clip_ratio": 0.0040649864240549505, + "epoch": 0.0171260668812089, + "grad_norm": 0.09906613081693649, + "kl": 0.032684326171875, + "learning_rate": 9.910160905356835e-06, + "loss": -0.0111, + "step": 459 + }, + { + "clip_ratio": 0.005110376514494419, + "completion_length": 675.9107360839844, + "epoch": 0.017163378573760553, + "grad_norm": 0.07728830724954605, + "kl": 0.035736083984375, + "learning_rate": 9.90958043995383e-06, + "loss": -0.0081, + "num_tokens": 11414155.0, + "reward": 0.35638851299881935, + "reward_std": 0.2802795087918639, + "rewards/code_reward": 0.2224599274341017, + "rewards/format_reward": 1.3392857313156128, + "step": 460 + }, + { + "clip_ratio": 0.00454036780865863, + "epoch": 0.017200690266312206, + "grad_norm": 0.08182122558355331, + "kl": 0.036041259765625, + "learning_rate": 9.908998124372784e-06, + "loss": -0.0084, + "step": 461 + }, + { + "clip_ratio": 0.0044173578498885036, + "epoch": 0.01723800195886386, + "grad_norm": 0.07403821498155594, + "kl": 0.036712646484375, + "learning_rate": 9.908413958858024e-06, + "loss": -0.0085, + "step": 462 + }, + { + "clip_ratio": 0.003660862217657268, + "completion_length": 550.1428756713867, + "epoch": 0.017275313651415512, + "grad_norm": 0.10384063422679901, + "kl": 0.0394287109375, + "learning_rate": 9.907827943654657e-06, + "loss": 0.0215, + "num_tokens": 11472949.0, + "reward": 0.9716270118951797, + "reward_std": 0.18535678554326296, + "rewards/code_reward": 0.8323412835597992, + "rewards/format_reward": 1.3928571939468384, + "step": 463 + }, + { + "clip_ratio": 0.003426253970246762, + "epoch": 0.017312625343967165, + "grad_norm": 0.10382093489170074, + "kl": 0.04241943359375, + "learning_rate": 9.90724007900856e-06, + "loss": 0.021, + "step": 464 + }, + { + "clip_ratio": 0.004241541144438088, + "epoch": 0.01734993703651882, + "grad_norm": 0.09678187221288681, + "kl": 0.04095458984375, + "learning_rate": 9.906650365166394e-06, + "loss": 0.0205, + "step": 465 + }, + { + "clip_ratio": 0.0035135726211592555, + "completion_length": 670.9464721679688, + "epoch": 0.01738724872907047, + "grad_norm": 0.0890529453754425, + "kl": 0.0303955078125, + "learning_rate": 9.906058802375593e-06, + "loss": 0.0143, + "num_tokens": 11543968.0, + "reward": 0.30482142791152, + "reward_std": 0.2588490443304181, + "rewards/code_reward": 0.160714291036129, + "rewards/format_reward": 1.4410714209079742, + "step": 466 + }, + { + "clip_ratio": 0.003711101016961038, + "epoch": 0.017424560421622125, + "grad_norm": 0.08673145622015, + "kl": 0.0311279296875, + "learning_rate": 9.905465390884363e-06, + "loss": 0.0141, + "step": 467 + }, + { + "clip_ratio": 0.0031802686862647533, + "epoch": 0.017461872114173778, + "grad_norm": 0.08217141032218933, + "kl": 0.0360107421875, + "learning_rate": 9.904870130941687e-06, + "loss": 0.0136, + "step": 468 + }, + { + "clip_ratio": 0.002741908421739936, + "completion_length": 696.1071701049805, + "epoch": 0.01749918380672543, + "grad_norm": 0.05931785702705383, + "kl": 0.02386474609375, + "learning_rate": 9.904273022797326e-06, + "loss": -0.001, + "num_tokens": 11609256.0, + "reward": 0.6322420798242092, + "reward_std": 0.1817549243569374, + "rewards/code_reward": 0.48224205523729324, + "rewards/format_reward": 1.5, + "step": 469 + }, + { + "clip_ratio": 0.0032152452622540295, + "epoch": 0.017536495499277088, + "grad_norm": 0.05506191402673721, + "kl": 0.02349853515625, + "learning_rate": 9.90367406670182e-06, + "loss": -0.001, + "step": 470 + }, + { + "clip_ratio": 0.0026094650966115296, + "epoch": 0.01757380719182874, + "grad_norm": 0.054710663855075836, + "kl": 0.02362060546875, + "learning_rate": 9.903073262906475e-06, + "loss": -0.0012, + "step": 471 + }, + { + "clip_ratio": 0.003500823862850666, + "completion_length": 563.6071701049805, + "epoch": 0.017611118884380394, + "grad_norm": 0.07176437228918076, + "kl": 0.026336669921875, + "learning_rate": 9.902470611663379e-06, + "loss": -0.0015, + "num_tokens": 11666032.0, + "reward": 0.7678776644170284, + "reward_std": 0.13865768536925316, + "rewards/code_reward": 0.6178776770830154, + "rewards/format_reward": 1.5, + "step": 472 + }, + { + "clip_ratio": 0.0037971411366015673, + "epoch": 0.017648430576932047, + "grad_norm": 0.0692606195807457, + "kl": 0.02685546875, + "learning_rate": 9.901866113225392e-06, + "loss": -0.0018, + "step": 473 + }, + { + "clip_ratio": 0.0030812949407845736, + "epoch": 0.0176857422694837, + "grad_norm": 0.07105837017297745, + "kl": 0.026885986328125, + "learning_rate": 9.90125976784615e-06, + "loss": -0.0019, + "step": 474 + }, + { + "clip_ratio": 0.004300131520722061, + "completion_length": 763.0714569091797, + "epoch": 0.017723053962035353, + "grad_norm": 0.08503256738185883, + "kl": 0.03204345703125, + "learning_rate": 9.90065157578007e-06, + "loss": 0.0073, + "num_tokens": 11747642.0, + "reward": 0.4607165567576885, + "reward_std": 0.2143979137763381, + "rewards/code_reward": 0.3133950945921242, + "rewards/format_reward": 1.4732142984867096, + "step": 475 + }, + { + "clip_ratio": 0.003960977133829147, + "epoch": 0.017760365654587006, + "grad_norm": 0.13627783954143524, + "kl": 0.0304718017578125, + "learning_rate": 9.900041537282328e-06, + "loss": 0.0071, + "step": 476 + }, + { + "clip_ratio": 0.004145053972024471, + "epoch": 0.01779767734713866, + "grad_norm": 0.08356675505638123, + "kl": 0.0303955078125, + "learning_rate": 9.899429652608894e-06, + "loss": 0.007, + "step": 477 + }, + { + "clip_ratio": 0.00368480192264542, + "completion_length": 711.1786041259766, + "epoch": 0.017834989039690313, + "grad_norm": 0.08279280364513397, + "kl": 0.02777099609375, + "learning_rate": 9.898815922016497e-06, + "loss": -0.0111, + "num_tokens": 11814936.0, + "reward": 0.36238666251301765, + "reward_std": 0.2485308339819312, + "rewards/code_reward": 0.2150652315467596, + "rewards/format_reward": 1.4732142984867096, + "step": 478 + }, + { + "clip_ratio": 0.003496191871818155, + "epoch": 0.017872300732241966, + "grad_norm": 0.08534844219684601, + "kl": 0.0279693603515625, + "learning_rate": 9.898200345762652e-06, + "loss": -0.0111, + "step": 479 + }, + { + "clip_ratio": 0.0034540639026090503, + "epoch": 0.01790961242479362, + "grad_norm": 0.08277982473373413, + "kl": 0.029937744140625, + "learning_rate": 9.897582924105638e-06, + "loss": -0.0114, + "step": 480 + }, + { + "clip_ratio": 0.004202622803859413, + "completion_length": 684.107177734375, + "epoch": 0.017946924117345272, + "grad_norm": 0.055416546761989594, + "kl": 0.024627685546875, + "learning_rate": 9.89696365730452e-06, + "loss": -0.0006, + "num_tokens": 11887294.0, + "reward": 0.31964289397001266, + "reward_std": 0.1083326954394579, + "rewards/code_reward": 0.16964285541325808, + "rewards/format_reward": 1.5, + "step": 481 + }, + { + "clip_ratio": 0.003771009505726397, + "epoch": 0.017984235809896925, + "grad_norm": 0.056374724954366684, + "kl": 0.024658203125, + "learning_rate": 9.896342545619126e-06, + "loss": -0.0005, + "step": 482 + }, + { + "clip_ratio": 0.003962401184253395, + "epoch": 0.018021547502448578, + "grad_norm": 0.054497066885232925, + "kl": 0.02447509765625, + "learning_rate": 9.895719589310066e-06, + "loss": -0.0006, + "step": 483 + }, + { + "clip_ratio": 0.0035213223891332746, + "completion_length": 821.8214721679688, + "epoch": 0.018058859195000235, + "grad_norm": 0.07616934925317764, + "kl": 0.050079345703125, + "learning_rate": 9.895094788638716e-06, + "loss": 0.0197, + "num_tokens": 11969702.0, + "reward": 0.5026452429592609, + "reward_std": 0.05625134217552841, + "rewards/code_reward": 0.3553237767191604, + "rewards/format_reward": 1.4732142984867096, + "step": 484 + }, + { + "clip_ratio": 0.004049155453685671, + "epoch": 0.018096170887551888, + "grad_norm": 0.07851938903331757, + "kl": 0.0493621826171875, + "learning_rate": 9.894468143867236e-06, + "loss": 0.0196, + "step": 485 + }, + { + "clip_ratio": 0.0038566383300349116, + "epoch": 0.01813348258010354, + "grad_norm": 0.07258377969264984, + "kl": 0.0579071044921875, + "learning_rate": 9.893839655258554e-06, + "loss": 0.0194, + "step": 486 + }, + { + "clip_ratio": 0.0027873956714756787, + "completion_length": 701.2321624755859, + "epoch": 0.018170794272655194, + "grad_norm": 0.1032683476805687, + "kl": 0.033447265625, + "learning_rate": 9.893209323076369e-06, + "loss": -0.0001, + "num_tokens": 12034983.0, + "reward": 0.6066552735865116, + "reward_std": 0.20780162140727043, + "rewards/code_reward": 0.4566552797332406, + "rewards/format_reward": 1.5, + "step": 487 + }, + { + "clip_ratio": 0.002951481379568577, + "epoch": 0.018208105965206847, + "grad_norm": 0.07482991367578506, + "kl": 0.033294677734375, + "learning_rate": 9.892577147585158e-06, + "loss": -0.0003, + "step": 488 + }, + { + "clip_ratio": 0.0024138594744727015, + "epoch": 0.0182454176577585, + "grad_norm": 0.06681593507528305, + "kl": 0.0345916748046875, + "learning_rate": 9.89194312905017e-06, + "loss": -0.0007, + "step": 489 + }, + { + "clip_ratio": 0.0013105264515616, + "completion_length": 708.6964569091797, + "epoch": 0.018282729350310153, + "grad_norm": 0.03996020555496216, + "kl": 0.028472900390625, + "learning_rate": 9.891307267737432e-06, + "loss": 0.0044, + "num_tokens": 12104854.0, + "reward": 0.7477540485560894, + "reward_std": 0.056080013513565063, + "rewards/code_reward": 0.5977540463209152, + "rewards/format_reward": 1.5, + "step": 490 + }, + { + "clip_ratio": 0.0015231905272230506, + "epoch": 0.018320041042861807, + "grad_norm": 0.04152441769838333, + "kl": 0.02789306640625, + "learning_rate": 9.890669563913732e-06, + "loss": 0.0044, + "step": 491 + }, + { + "clip_ratio": 0.0016022035852074623, + "epoch": 0.01835735273541346, + "grad_norm": 0.038361627608537674, + "kl": 0.0279541015625, + "learning_rate": 9.890030017846643e-06, + "loss": 0.0044, + "step": 492 + }, + { + "clip_ratio": 0.004383992520160973, + "completion_length": 677.1250305175781, + "epoch": 0.018394664427965113, + "grad_norm": 0.08295733481645584, + "kl": 0.02752685546875, + "learning_rate": 9.889388629804505e-06, + "loss": 0.0067, + "num_tokens": 12170255.0, + "reward": 0.2399894744157791, + "reward_std": 0.11576923471875489, + "rewards/code_reward": 0.08998945617349818, + "rewards/format_reward": 1.5, + "step": 493 + }, + { + "clip_ratio": 0.004393304930999875, + "epoch": 0.018431976120516766, + "grad_norm": 0.07681110501289368, + "kl": 0.027313232421875, + "learning_rate": 9.888745400056435e-06, + "loss": 0.0065, + "step": 494 + }, + { + "clip_ratio": 0.004446038510650396, + "epoch": 0.01846928781306842, + "grad_norm": 0.0721772313117981, + "kl": 0.028045654296875, + "learning_rate": 9.888100328872318e-06, + "loss": 0.0063, + "step": 495 + }, + { + "clip_ratio": 0.0028749662451446056, + "completion_length": 726.7143096923828, + "epoch": 0.018506599505620072, + "grad_norm": 0.07224322855472565, + "kl": 0.03460693359375, + "learning_rate": 9.887453416522813e-06, + "loss": -0.003, + "num_tokens": 12239189.0, + "reward": 0.70736263692379, + "reward_std": 0.24719670042395592, + "rewards/code_reward": 0.5600412115454674, + "rewards/format_reward": 1.4732142984867096, + "step": 496 + }, + { + "clip_ratio": 0.002726523205637932, + "epoch": 0.01854391119817173, + "grad_norm": 0.07728468626737595, + "kl": 0.036102294921875, + "learning_rate": 9.886804663279355e-06, + "loss": -0.0032, + "step": 497 + }, + { + "clip_ratio": 0.0026210222276858985, + "epoch": 0.018581222890723382, + "grad_norm": 0.06656786799430847, + "kl": 0.03436279296875, + "learning_rate": 9.88615406941415e-06, + "loss": -0.0033, + "step": 498 + }, + { + "clip_ratio": 0.00450645899400115, + "completion_length": 581.303596496582, + "epoch": 0.018618534583275035, + "grad_norm": 0.10749932378530502, + "kl": 0.037109375, + "learning_rate": 9.88550163520017e-06, + "loss": 0.0151, + "num_tokens": 12298286.0, + "reward": 0.712565865367651, + "reward_std": 0.22895601019263268, + "rewards/code_reward": 0.5652444064617157, + "rewards/format_reward": 1.4732142984867096, + "step": 499 + }, + { + "clip_ratio": 0.0037704621790908277, + "epoch": 0.018655846275826688, + "grad_norm": 0.09047386050224304, + "kl": 0.0362548828125, + "learning_rate": 9.884847360911168e-06, + "loss": 0.0147, + "step": 500 + }, + { + "clip_ratio": 0.004345539025962353, + "epoch": 0.01869315796837834, + "grad_norm": 0.06913299858570099, + "kl": 0.035919189453125, + "learning_rate": 9.884191246821663e-06, + "loss": 0.0143, + "step": 501 + }, + { + "clip_ratio": 0.0033822397235780954, + "completion_length": 520.7500305175781, + "epoch": 0.018730469660929994, + "grad_norm": 0.09839209914207458, + "kl": 0.04034423828125, + "learning_rate": 9.883533293206953e-06, + "loss": -0.0038, + "num_tokens": 12355462.0, + "reward": 0.9029529020190239, + "reward_std": 0.20949244499206543, + "rewards/code_reward": 0.7529528997838497, + "rewards/format_reward": 1.5, + "step": 502 + }, + { + "clip_ratio": 0.003448092087637633, + "epoch": 0.018767781353481647, + "grad_norm": 0.0936226099729538, + "kl": 0.040496826171875, + "learning_rate": 9.882873500343098e-06, + "loss": -0.004, + "step": 503 + }, + { + "clip_ratio": 0.0025299719418399036, + "epoch": 0.0188050930460333, + "grad_norm": 0.08655489236116409, + "kl": 0.04052734375, + "learning_rate": 9.882211868506935e-06, + "loss": -0.0044, + "step": 504 + }, + { + "clip_ratio": 0.003793374286033213, + "completion_length": 695.8393096923828, + "epoch": 0.018842404738584954, + "grad_norm": 0.08354032039642334, + "kl": 0.029052734375, + "learning_rate": 9.881548397976077e-06, + "loss": -0.0042, + "num_tokens": 12423437.0, + "reward": 0.7777301073074341, + "reward_std": 0.39608826488256454, + "rewards/code_reward": 0.6277301013469696, + "rewards/format_reward": 1.5, + "step": 505 + }, + { + "clip_ratio": 0.0038565085851587355, + "epoch": 0.018879716431136607, + "grad_norm": 0.08482751250267029, + "kl": 0.02899169921875, + "learning_rate": 9.880883089028898e-06, + "loss": -0.0041, + "step": 506 + }, + { + "clip_ratio": 0.0038637526449747384, + "epoch": 0.01891702812368826, + "grad_norm": 0.0851750299334526, + "kl": 0.029083251953125, + "learning_rate": 9.880215941944554e-06, + "loss": -0.0046, + "step": 507 + }, + { + "clip_ratio": 0.002141979173757136, + "completion_length": 594.0893096923828, + "epoch": 0.018954339816239913, + "grad_norm": 0.056602880358695984, + "kl": 0.0325927734375, + "learning_rate": 9.879546957002967e-06, + "loss": 0.0001, + "num_tokens": 12493064.0, + "reward": 0.854591827839613, + "reward_std": 0.05235202983021736, + "rewards/code_reward": 0.7072704136371613, + "rewards/format_reward": 1.4732142984867096, + "step": 508 + }, + { + "clip_ratio": 0.0025729156332090497, + "epoch": 0.018991651508791566, + "grad_norm": 0.05399477481842041, + "kl": 0.032501220703125, + "learning_rate": 9.878876134484828e-06, + "loss": -0.0001, + "step": 509 + }, + { + "clip_ratio": 0.0019495338783599436, + "epoch": 0.01902896320134322, + "grad_norm": 0.052827138453722, + "kl": 0.034027099609375, + "learning_rate": 9.878203474671603e-06, + "loss": -0.0002, + "step": 510 + }, + { + "clip_ratio": 0.003007915918715298, + "completion_length": 712.0178756713867, + "epoch": 0.019066274893894876, + "grad_norm": 0.0695330873131752, + "kl": 0.024261474609375, + "learning_rate": 9.877528977845526e-06, + "loss": -0.0008, + "num_tokens": 12560587.0, + "reward": 0.6886126324534416, + "reward_std": 0.19537803530693054, + "rewards/code_reward": 0.5412912182509899, + "rewards/format_reward": 1.4732142984867096, + "step": 511 + }, + { + "clip_ratio": 0.0026110309991054237, + "epoch": 0.01910358658644653, + "grad_norm": 0.06777067482471466, + "kl": 0.024932861328125, + "learning_rate": 9.876852644289608e-06, + "loss": -0.001, + "step": 512 + }, + { + "clip_ratio": 0.002564077964052558, + "epoch": 0.019140898278998182, + "grad_norm": 0.06769108027219772, + "kl": 0.02471923828125, + "learning_rate": 9.876174474287623e-06, + "loss": -0.0013, + "step": 513 + }, + { + "clip_ratio": 0.004310609714593738, + "completion_length": 757.3214416503906, + "epoch": 0.019178209971549835, + "grad_norm": 0.08220021426677704, + "kl": 0.0381011962890625, + "learning_rate": 9.875494468124118e-06, + "loss": 0.0527, + "num_tokens": 12631179.0, + "reward": 0.3866668902337551, + "reward_std": 0.308915882371366, + "rewards/code_reward": 0.23916687769815326, + "rewards/format_reward": 1.4749999940395355, + "step": 514 + }, + { + "clip_ratio": 0.004433032649103552, + "epoch": 0.01921552166410149, + "grad_norm": 0.08602520078420639, + "kl": 0.039398193359375, + "learning_rate": 9.87481262608441e-06, + "loss": 0.0525, + "step": 515 + }, + { + "clip_ratio": 0.004020405816845596, + "epoch": 0.01925283335665314, + "grad_norm": 0.07853934913873672, + "kl": 0.037322998046875, + "learning_rate": 9.87412894845459e-06, + "loss": 0.0522, + "step": 516 + }, + { + "clip_ratio": 0.005145972594618797, + "completion_length": 619.6250305175781, + "epoch": 0.019290145049204795, + "grad_norm": 0.05483057722449303, + "kl": 0.03607177734375, + "learning_rate": 9.873443435521514e-06, + "loss": -0.0007, + "num_tokens": 12702306.0, + "reward": 0.3388037532567978, + "reward_std": 0.17833997681736946, + "rewards/code_reward": 0.18880373425781727, + "rewards/format_reward": 1.5, + "step": 517 + }, + { + "clip_ratio": 0.005171466851606965, + "epoch": 0.019327456741756448, + "grad_norm": 0.056846119463443756, + "kl": 0.03460693359375, + "learning_rate": 9.872756087572813e-06, + "loss": -0.0007, + "step": 518 + }, + { + "clip_ratio": 0.004592430253978819, + "epoch": 0.0193647684343081, + "grad_norm": 0.05572538450360298, + "kl": 0.034759521484375, + "learning_rate": 9.872066904896883e-06, + "loss": -0.0009, + "step": 519 + }, + { + "clip_ratio": 0.00413447868777439, + "completion_length": 570.7857360839844, + "epoch": 0.019402080126859754, + "grad_norm": 0.1942586898803711, + "kl": 0.03338623046875, + "learning_rate": 9.871375887782894e-06, + "loss": 0.0113, + "num_tokens": 12760408.0, + "reward": 0.922686442732811, + "reward_std": 0.2685091746971011, + "rewards/code_reward": 0.7753650024533272, + "rewards/format_reward": 1.4732142984867096, + "step": 520 + }, + { + "clip_ratio": 0.004345785593613982, + "epoch": 0.019439391819411407, + "grad_norm": 0.19324171543121338, + "kl": 0.0318603515625, + "learning_rate": 9.870683036520785e-06, + "loss": 0.011, + "step": 521 + }, + { + "clip_ratio": 0.0047244803281500936, + "epoch": 0.01947670351196306, + "grad_norm": 0.13776187598705292, + "kl": 0.033050537109375, + "learning_rate": 9.86998835140126e-06, + "loss": 0.0104, + "step": 522 + }, + { + "clip_ratio": 0.0037872722023166716, + "completion_length": 644.3393249511719, + "epoch": 0.019514015204514713, + "grad_norm": 0.08136291056871414, + "kl": 0.04071044921875, + "learning_rate": 9.869291832715798e-06, + "loss": -0.0009, + "num_tokens": 12830217.0, + "reward": 0.35841044038534164, + "reward_std": 0.10444741370156407, + "rewards/code_reward": 0.2137675362173468, + "rewards/format_reward": 1.4464285969734192, + "step": 523 + }, + { + "clip_ratio": 0.0036878735991194844, + "epoch": 0.019551326897066366, + "grad_norm": 0.08214517682790756, + "kl": 0.0400390625, + "learning_rate": 9.868593480756646e-06, + "loss": -0.0013, + "step": 524 + }, + { + "clip_ratio": 0.0038076842902228236, + "epoch": 0.019588638589618023, + "grad_norm": 0.09505803138017654, + "kl": 0.041046142578125, + "learning_rate": 9.867893295816818e-06, + "loss": -0.0016, + "step": 525 + }, + { + "clip_ratio": 0.0036396703799255192, + "completion_length": 734.9107284545898, + "epoch": 0.019625950282169676, + "grad_norm": 0.08248738199472427, + "kl": 0.03485107421875, + "learning_rate": 9.867191278190099e-06, + "loss": 0.001, + "num_tokens": 12899778.0, + "reward": 0.6649234816431999, + "reward_std": 0.29410482943058014, + "rewards/code_reward": 0.5229591839015484, + "rewards/format_reward": 1.4196428656578064, + "step": 526 + }, + { + "clip_ratio": 0.004026541544590145, + "epoch": 0.01966326197472133, + "grad_norm": 0.08177682757377625, + "kl": 0.03387451171875, + "learning_rate": 9.866487428171042e-06, + "loss": 0.0008, + "step": 527 + }, + { + "clip_ratio": 0.004053512879181653, + "epoch": 0.019700573667272982, + "grad_norm": 0.07890861481428146, + "kl": 0.033966064453125, + "learning_rate": 9.865781746054971e-06, + "loss": 0.0005, + "step": 528 + }, + { + "clip_ratio": 0.002404346247203648, + "completion_length": 537.5357360839844, + "epoch": 0.019737885359824636, + "grad_norm": 0.06376765668392181, + "kl": 0.03997802734375, + "learning_rate": 9.865074232137977e-06, + "loss": -0.0033, + "num_tokens": 12961954.0, + "reward": 0.7366071455180645, + "reward_std": 0.19518418610095978, + "rewards/code_reward": 0.589285708963871, + "rewards/format_reward": 1.4732142984867096, + "step": 529 + }, + { + "clip_ratio": 0.002495193504728377, + "epoch": 0.01977519705237629, + "grad_norm": 0.06045370176434517, + "kl": 0.039703369140625, + "learning_rate": 9.864364886716917e-06, + "loss": -0.0036, + "step": 530 + }, + { + "clip_ratio": 0.0027806166326627135, + "epoch": 0.019812508744927942, + "grad_norm": 0.05474802851676941, + "kl": 0.04052734375, + "learning_rate": 9.863653710089422e-06, + "loss": -0.0038, + "step": 531 + }, + { + "clip_ratio": 0.002182339900173247, + "completion_length": 535.8214492797852, + "epoch": 0.019849820437479595, + "grad_norm": 0.055486638098955154, + "kl": 0.041717529296875, + "learning_rate": 9.86294070255389e-06, + "loss": 0.0014, + "num_tokens": 13020142.0, + "reward": 0.8299858532845974, + "reward_std": 0.11488097906112671, + "rewards/code_reward": 0.6799858659505844, + "rewards/format_reward": 1.5, + "step": 532 + }, + { + "clip_ratio": 0.002144314465112984, + "epoch": 0.019887132130031248, + "grad_norm": 0.051142144948244095, + "kl": 0.04083251953125, + "learning_rate": 9.86222586440948e-06, + "loss": 0.0013, + "step": 533 + }, + { + "clip_ratio": 0.0021606850204989314, + "epoch": 0.0199244438225829, + "grad_norm": 0.0499332994222641, + "kl": 0.0413818359375, + "learning_rate": 9.861509195956129e-06, + "loss": 0.0013, + "step": 534 + }, + { + "clip_ratio": 0.004566644376609474, + "completion_length": 596.0893173217773, + "epoch": 0.019961755515134554, + "grad_norm": 0.0843946635723114, + "kl": 0.024749755859375, + "learning_rate": 9.860790697494537e-06, + "loss": -0.0071, + "num_tokens": 13085989.0, + "reward": 0.5877453684806824, + "reward_std": 0.0745321037247777, + "rewards/code_reward": 0.43774537299759686, + "rewards/format_reward": 1.5, + "step": 535 + }, + { + "clip_ratio": 0.004427964682690799, + "epoch": 0.019999067207686207, + "grad_norm": 0.08352560549974442, + "kl": 0.024871826171875, + "learning_rate": 9.860070369326174e-06, + "loss": -0.007, + "step": 536 + }, + { + "clip_ratio": 0.004001747642178088, + "epoch": 0.02003637890023786, + "grad_norm": 0.07702326029539108, + "kl": 0.02496337890625, + "learning_rate": 9.859348211753273e-06, + "loss": -0.0074, + "step": 537 + }, + { + "clip_ratio": 0.003989632474258542, + "completion_length": 677.9464645385742, + "epoch": 0.020073690592789517, + "grad_norm": 0.08017770200967789, + "kl": 0.02374267578125, + "learning_rate": 9.858624225078841e-06, + "loss": -0.0056, + "num_tokens": 13150778.0, + "reward": 0.37904882803559303, + "reward_std": 0.18043866707012057, + "rewards/code_reward": 0.22904879599809647, + "rewards/format_reward": 1.5, + "step": 538 + }, + { + "clip_ratio": 0.003628081700298935, + "epoch": 0.02011100228534117, + "grad_norm": 0.0782821848988533, + "kl": 0.0235595703125, + "learning_rate": 9.857898409606648e-06, + "loss": -0.0058, + "step": 539 + }, + { + "clip_ratio": 0.0037010922096669674, + "epoch": 0.020148313977892823, + "grad_norm": 0.0725114569067955, + "kl": 0.024017333984375, + "learning_rate": 9.857170765641232e-06, + "loss": -0.0061, + "step": 540 + }, + { + "clip_ratio": 0.004127011517994106, + "completion_length": 695.3214569091797, + "epoch": 0.020185625670444476, + "grad_norm": 0.07021067291498184, + "kl": 0.02911376953125, + "learning_rate": 9.8564412934879e-06, + "loss": 0.0058, + "num_tokens": 13219410.0, + "reward": 0.5912687890231609, + "reward_std": 0.17663622461259365, + "rewards/code_reward": 0.44394733756780624, + "rewards/format_reward": 1.4732142984867096, + "step": 541 + }, + { + "clip_ratio": 0.004294544807635248, + "epoch": 0.02022293736299613, + "grad_norm": 0.07378441840410233, + "kl": 0.028900146484375, + "learning_rate": 9.855709993452725e-06, + "loss": 0.0058, + "step": 542 + }, + { + "clip_ratio": 0.0044132687035016716, + "epoch": 0.020260249055547783, + "grad_norm": 0.07145832479000092, + "kl": 0.02972412109375, + "learning_rate": 9.854976865842546e-06, + "loss": 0.0055, + "step": 543 + }, + { + "clip_ratio": 0.0038575426442548633, + "completion_length": 748.4821624755859, + "epoch": 0.020297560748099436, + "grad_norm": 0.08621951192617416, + "kl": 0.022369384765625, + "learning_rate": 9.854241910964971e-06, + "loss": 0.0126, + "num_tokens": 13284467.0, + "reward": 0.5655661672353745, + "reward_std": 0.32361817732453346, + "rewards/code_reward": 0.4155661500990391, + "rewards/format_reward": 1.5, + "step": 544 + }, + { + "clip_ratio": 0.0038967328728176653, + "epoch": 0.02033487244065109, + "grad_norm": 0.0824940949678421, + "kl": 0.023162841796875, + "learning_rate": 9.853505129128372e-06, + "loss": 0.0124, + "step": 545 + }, + { + "clip_ratio": 0.0037870010710321367, + "epoch": 0.020372184133202742, + "grad_norm": 0.08460056036710739, + "kl": 0.023590087890625, + "learning_rate": 9.85276652064189e-06, + "loss": 0.0122, + "step": 546 + }, + { + "clip_ratio": 0.0038901165826246142, + "completion_length": 584.357177734375, + "epoch": 0.020409495825754395, + "grad_norm": 0.08177228271961212, + "kl": 0.03607177734375, + "learning_rate": 9.852026085815433e-06, + "loss": -0.0011, + "num_tokens": 13344239.0, + "reward": 0.7054295651614666, + "reward_std": 0.18863899819552898, + "rewards/code_reward": 0.5634652450680733, + "rewards/format_reward": 1.4196428656578064, + "step": 547 + }, + { + "clip_ratio": 0.003992271493189037, + "epoch": 0.020446807518306048, + "grad_norm": 0.07648050040006638, + "kl": 0.03631591796875, + "learning_rate": 9.851283824959669e-06, + "loss": -0.0012, + "step": 548 + }, + { + "clip_ratio": 0.00403786008246243, + "epoch": 0.0204841192108577, + "grad_norm": 0.07238291949033737, + "kl": 0.03570556640625, + "learning_rate": 9.85053973838604e-06, + "loss": -0.0014, + "step": 549 + }, + { + "clip_ratio": 0.00462181813782081, + "completion_length": 704.9464569091797, + "epoch": 0.020521430903409354, + "grad_norm": 0.07029660046100616, + "kl": 0.035858154296875, + "learning_rate": 9.849793826406752e-06, + "loss": -0.0056, + "num_tokens": 13414264.0, + "reward": 0.47211929038167, + "reward_std": 0.20345989987254143, + "rewards/code_reward": 0.3247978389263153, + "rewards/format_reward": 1.4732142984867096, + "step": 550 + }, + { + "clip_ratio": 0.004732071305625141, + "epoch": 0.020558742595961008, + "grad_norm": 0.06930360198020935, + "kl": 0.036163330078125, + "learning_rate": 9.849046089334774e-06, + "loss": -0.0059, + "step": 551 + }, + { + "clip_ratio": 0.004282575915567577, + "epoch": 0.020596054288512664, + "grad_norm": 0.0679289698600769, + "kl": 0.03546142578125, + "learning_rate": 9.848296527483841e-06, + "loss": -0.0061, + "step": 552 + }, + { + "clip_ratio": 0.004237726680003107, + "completion_length": 690.3214569091797, + "epoch": 0.020633365981064317, + "grad_norm": 0.08910752087831497, + "kl": 0.030487060546875, + "learning_rate": 9.847545141168459e-06, + "loss": 0.0291, + "num_tokens": 13487376.0, + "reward": 0.7512649968266487, + "reward_std": 0.26728885620832443, + "rewards/code_reward": 0.6012649685144424, + "rewards/format_reward": 1.5, + "step": 553 + }, + { + "clip_ratio": 0.004181607917416841, + "epoch": 0.02067067767361597, + "grad_norm": 0.081914983689785, + "kl": 0.03076171875, + "learning_rate": 9.846791930703892e-06, + "loss": 0.029, + "step": 554 + }, + { + "clip_ratio": 0.003980765992309898, + "epoch": 0.020707989366167624, + "grad_norm": 0.07888168096542358, + "kl": 0.030426025390625, + "learning_rate": 9.846036896406176e-06, + "loss": 0.0287, + "step": 555 + }, + { + "clip_ratio": 0.004814712796360254, + "completion_length": 684.8928909301758, + "epoch": 0.020745301058719277, + "grad_norm": 0.08525583893060684, + "kl": 0.027557373046875, + "learning_rate": 9.845280038592113e-06, + "loss": 0.0163, + "num_tokens": 13559274.0, + "reward": 0.36096958443522453, + "reward_std": 0.2019502716138959, + "rewards/code_reward": 0.21096960082650185, + "rewards/format_reward": 1.5, + "step": 556 + }, + { + "clip_ratio": 0.005062451877165586, + "epoch": 0.02078261275127093, + "grad_norm": 0.0819048136472702, + "kl": 0.02777099609375, + "learning_rate": 9.844521357579256e-06, + "loss": 0.0161, + "step": 557 + }, + { + "clip_ratio": 0.005015514092519879, + "epoch": 0.020819924443822583, + "grad_norm": 0.07687193900346756, + "kl": 0.027801513671875, + "learning_rate": 9.843760853685942e-06, + "loss": 0.0155, + "step": 558 + }, + { + "clip_ratio": 0.0046284489799290895, + "completion_length": 704.1964721679688, + "epoch": 0.020857236136374236, + "grad_norm": 0.06513993442058563, + "kl": 0.02410888671875, + "learning_rate": 9.84299852723126e-06, + "loss": 0.0194, + "num_tokens": 13635403.0, + "reward": 0.4798612408339977, + "reward_std": 0.24142973870038986, + "rewards/code_reward": 0.3298611883074045, + "rewards/format_reward": 1.5, + "step": 559 + }, + { + "clip_ratio": 0.004275781218893826, + "epoch": 0.02089454782892589, + "grad_norm": 0.0651819258928299, + "kl": 0.02471923828125, + "learning_rate": 9.84223437853507e-06, + "loss": 0.0193, + "step": 560 + }, + { + "clip_ratio": 0.004137150361202657, + "epoch": 0.020931859521477542, + "grad_norm": 0.06731481105089188, + "kl": 0.024322509765625, + "learning_rate": 9.841468407917996e-06, + "loss": 0.0192, + "step": 561 + }, + { + "clip_ratio": 0.004286549345124513, + "completion_length": 612.9464569091797, + "epoch": 0.020969171214029195, + "grad_norm": 0.14732412993907928, + "kl": 0.088897705078125, + "learning_rate": 9.840700615701421e-06, + "loss": -0.0076, + "num_tokens": 13699016.0, + "reward": 0.8175745606422424, + "reward_std": 0.2660117093473673, + "rewards/code_reward": 0.6748959794640541, + "rewards/format_reward": 1.4267857074737549, + "step": 562 + }, + { + "clip_ratio": 0.003860855125822127, + "epoch": 0.02100648290658085, + "grad_norm": 0.09999147057533264, + "kl": 0.064361572265625, + "learning_rate": 9.8399310022075e-06, + "loss": -0.0081, + "step": 563 + }, + { + "clip_ratio": 0.0036184992059133947, + "epoch": 0.0210437945991325, + "grad_norm": 0.09819746762514114, + "kl": 0.055389404296875, + "learning_rate": 9.839159567759147e-06, + "loss": -0.0086, + "step": 564 + }, + { + "clip_ratio": 0.004028225608635694, + "completion_length": 681.5536041259766, + "epoch": 0.021081106291684158, + "grad_norm": 0.09040936827659607, + "kl": 0.03314208984375, + "learning_rate": 9.838386312680043e-06, + "loss": -0.0062, + "num_tokens": 13778357.0, + "reward": 0.3248027637600899, + "reward_std": 0.23738076258450747, + "rewards/code_reward": 0.17748131044209003, + "rewards/format_reward": 1.4732142984867096, + "step": 565 + }, + { + "clip_ratio": 0.0038276276318356395, + "epoch": 0.02111841798423581, + "grad_norm": 0.09737054258584976, + "kl": 0.03302001953125, + "learning_rate": 9.837611237294629e-06, + "loss": -0.0062, + "step": 566 + }, + { + "clip_ratio": 0.003726982220541686, + "epoch": 0.021155729676787464, + "grad_norm": 0.08696939796209335, + "kl": 0.03369140625, + "learning_rate": 9.836834341928114e-06, + "loss": -0.0066, + "step": 567 + }, + { + "clip_ratio": 0.003987731470260769, + "completion_length": 666.3214569091797, + "epoch": 0.021193041369339118, + "grad_norm": 0.07682638615369797, + "kl": 0.04425048828125, + "learning_rate": 9.836055626906466e-06, + "loss": 0.0177, + "num_tokens": 13850461.0, + "reward": 0.5026605799794197, + "reward_std": 0.08747120667248964, + "rewards/code_reward": 0.35533910244703293, + "rewards/format_reward": 1.4732142984867096, + "step": 568 + }, + { + "clip_ratio": 0.0035277629212941974, + "epoch": 0.02123035306189077, + "grad_norm": 0.07479701191186905, + "kl": 0.04473876953125, + "learning_rate": 9.835275092556426e-06, + "loss": 0.0176, + "step": 569 + }, + { + "clip_ratio": 0.003959621419198811, + "epoch": 0.021267664754442424, + "grad_norm": 0.06882442533969879, + "kl": 0.04473876953125, + "learning_rate": 9.834492739205484e-06, + "loss": 0.0172, + "step": 570 + }, + { + "clip_ratio": 0.0035189285408705473, + "completion_length": 552.2678985595703, + "epoch": 0.021304976446994077, + "grad_norm": 0.07539945840835571, + "kl": 0.03192138671875, + "learning_rate": 9.833708567181904e-06, + "loss": -0.0011, + "num_tokens": 13915002.0, + "reward": 0.6494419537484646, + "reward_std": 0.268716424703598, + "rewards/code_reward": 0.4994419636204839, + "rewards/format_reward": 1.5, + "step": 571 + }, + { + "clip_ratio": 0.0031447018263861537, + "epoch": 0.02134228813954573, + "grad_norm": 0.07084906846284866, + "kl": 0.032073974609375, + "learning_rate": 9.832922576814713e-06, + "loss": -0.0014, + "step": 572 + }, + { + "clip_ratio": 0.002798713743686676, + "epoch": 0.021379599832097383, + "grad_norm": 0.07489573210477829, + "kl": 0.031982421875, + "learning_rate": 9.832134768433694e-06, + "loss": -0.0016, + "step": 573 + }, + { + "clip_ratio": 0.004899334628134966, + "completion_length": 764.2500457763672, + "epoch": 0.021416911524649036, + "grad_norm": 0.08271732926368713, + "kl": 0.0452880859375, + "learning_rate": 9.831345142369398e-06, + "loss": 0.0321, + "num_tokens": 14009852.0, + "reward": 0.45359838008880615, + "reward_std": 0.27265359833836555, + "rewards/code_reward": 0.3082412015646696, + "rewards/format_reward": 1.4535714387893677, + "step": 574 + }, + { + "clip_ratio": 0.00477286521345377, + "epoch": 0.02145422321720069, + "grad_norm": 0.07834110409021378, + "kl": 0.04669189453125, + "learning_rate": 9.830553698953138e-06, + "loss": 0.0318, + "step": 575 + }, + { + "clip_ratio": 0.004875178681686521, + "epoch": 0.021491534909752343, + "grad_norm": 0.09520472586154938, + "kl": 0.04559326171875, + "learning_rate": 9.829760438516988e-06, + "loss": 0.0315, + "step": 576 + }, + { + "clip_ratio": 0.004739537602290511, + "completion_length": 578.8214645385742, + "epoch": 0.021528846602303996, + "grad_norm": 0.10582005232572556, + "kl": 0.034423828125, + "learning_rate": 9.828965361393784e-06, + "loss": 0.0137, + "num_tokens": 14074242.0, + "reward": 0.8028225898742676, + "reward_std": 0.20244868099689484, + "rewards/code_reward": 0.6528225615620613, + "rewards/format_reward": 1.5, + "step": 577 + }, + { + "clip_ratio": 0.0038432180881500244, + "epoch": 0.02156615829485565, + "grad_norm": 0.10271576792001724, + "kl": 0.034088134765625, + "learning_rate": 9.82816846791713e-06, + "loss": 0.0134, + "step": 578 + }, + { + "clip_ratio": 0.0043927234946750104, + "epoch": 0.021603469987407305, + "grad_norm": 0.09532078355550766, + "kl": 0.033294677734375, + "learning_rate": 9.827369758421384e-06, + "loss": 0.0129, + "step": 579 + }, + { + "clip_ratio": 0.0054228645749390125, + "completion_length": 644.0178756713867, + "epoch": 0.02164078167995896, + "grad_norm": 0.09929276257753372, + "kl": 0.04327392578125, + "learning_rate": 9.826569233241671e-06, + "loss": -0.0015, + "num_tokens": 14131237.0, + "reward": 0.25073671340942383, + "reward_std": 0.11272168811410666, + "rewards/code_reward": 0.10734382877126336, + "rewards/format_reward": 1.4339285790920258, + "step": 580 + }, + { + "clip_ratio": 0.005252769798971713, + "epoch": 0.02167809337251061, + "grad_norm": 0.11592892557382584, + "kl": 0.042449951171875, + "learning_rate": 9.825766892713877e-06, + "loss": -0.0015, + "step": 581 + }, + { + "clip_ratio": 0.005363767850212753, + "epoch": 0.021715405065062265, + "grad_norm": 0.11314118653535843, + "kl": 0.04534912109375, + "learning_rate": 9.824962737174645e-06, + "loss": -0.0018, + "step": 582 + }, + { + "clip_ratio": 0.004014039470348507, + "completion_length": 608.1428680419922, + "epoch": 0.021752716757613918, + "grad_norm": 0.09450048208236694, + "kl": 0.033111572265625, + "learning_rate": 9.82415676696139e-06, + "loss": 0.0035, + "num_tokens": 14194665.0, + "reward": 0.5941341333091259, + "reward_std": 0.22546476125717163, + "rewards/code_reward": 0.4460984244942665, + "rewards/format_reward": 1.480357140302658, + "step": 583 + }, + { + "clip_ratio": 0.004049510695040226, + "epoch": 0.02179002845016557, + "grad_norm": 0.09101826697587967, + "kl": 0.03411865234375, + "learning_rate": 9.823348982412281e-06, + "loss": 0.0031, + "step": 584 + }, + { + "clip_ratio": 0.0042833343031816185, + "epoch": 0.021827340142717224, + "grad_norm": 0.08654940873384476, + "kl": 0.035186767578125, + "learning_rate": 9.822539383866246e-06, + "loss": 0.0025, + "step": 585 + }, + { + "clip_ratio": 0.003524119849316776, + "completion_length": 756.8750457763672, + "epoch": 0.021864651835268877, + "grad_norm": 0.0796869546175003, + "kl": 0.032989501953125, + "learning_rate": 9.821727971662978e-06, + "loss": -0.005, + "num_tokens": 14271378.0, + "reward": 0.44518228247761726, + "reward_std": 0.1878432258963585, + "rewards/code_reward": 0.2991108000278473, + "rewards/format_reward": 1.4607142806053162, + "step": 586 + }, + { + "clip_ratio": 0.003849483618978411, + "epoch": 0.02190196352782053, + "grad_norm": 0.12102924287319183, + "kl": 0.0338134765625, + "learning_rate": 9.820914746142934e-06, + "loss": -0.005, + "step": 587 + }, + { + "clip_ratio": 0.003167739952914417, + "epoch": 0.021939275220372183, + "grad_norm": 0.07661031186580658, + "kl": 0.032318115234375, + "learning_rate": 9.820099707647323e-06, + "loss": -0.0055, + "step": 588 + }, + { + "clip_ratio": 0.004913267737720162, + "completion_length": 613.0893173217773, + "epoch": 0.021976586912923837, + "grad_norm": 0.07628662884235382, + "kl": 0.039459228515625, + "learning_rate": 9.819282856518126e-06, + "loss": -0.0122, + "num_tokens": 14339129.0, + "reward": 0.5363254472613335, + "reward_std": 0.3187742531299591, + "rewards/code_reward": 0.3882897272706032, + "rewards/format_reward": 1.480357140302658, + "step": 589 + }, + { + "clip_ratio": 0.004488885460887104, + "epoch": 0.02201389860547549, + "grad_norm": 0.08427531272172928, + "kl": 0.039337158203125, + "learning_rate": 9.818464193098073e-06, + "loss": -0.0123, + "step": 590 + }, + { + "clip_ratio": 0.004603097040671855, + "epoch": 0.022051210298027143, + "grad_norm": 0.07753891497850418, + "kl": 0.039886474609375, + "learning_rate": 9.817643717730666e-06, + "loss": -0.0126, + "step": 591 + }, + { + "clip_ratio": 0.0025160120567306876, + "completion_length": 818.0000305175781, + "epoch": 0.022088521990578796, + "grad_norm": 0.06814445555210114, + "kl": 0.053009033203125, + "learning_rate": 9.816821430760153e-06, + "loss": -0.0029, + "num_tokens": 14411215.0, + "reward": 0.40901968628168106, + "reward_std": 0.06482767034322023, + "rewards/code_reward": 0.26705539342947304, + "rewards/format_reward": 1.4196428656578064, + "step": 592 + }, + { + "clip_ratio": 0.0024100180016830564, + "epoch": 0.022125833683130453, + "grad_norm": 0.06353622674942017, + "kl": 0.051605224609375, + "learning_rate": 9.815997332531558e-06, + "loss": -0.0029, + "step": 593 + }, + { + "clip_ratio": 0.0022344260942190886, + "epoch": 0.022163145375682106, + "grad_norm": 0.06422707438468933, + "kl": 0.050872802734375, + "learning_rate": 9.815171423390655e-06, + "loss": -0.0032, + "step": 594 + }, + { + "clip_ratio": 0.0026580236153677106, + "completion_length": 553.5893096923828, + "epoch": 0.02220045706823376, + "grad_norm": 0.0769646093249321, + "kl": 0.035125732421875, + "learning_rate": 9.814343703683977e-06, + "loss": 0.008, + "num_tokens": 14468466.0, + "reward": 0.7595903314650059, + "reward_std": 0.2398202307522297, + "rewards/code_reward": 0.6115546310320497, + "rewards/format_reward": 1.480357140302658, + "step": 595 + }, + { + "clip_ratio": 0.002651196555234492, + "epoch": 0.022237768760785412, + "grad_norm": 0.07319202274084091, + "kl": 0.034820556640625, + "learning_rate": 9.813514173758824e-06, + "loss": 0.0077, + "step": 596 + }, + { + "clip_ratio": 0.0027482082368806005, + "epoch": 0.022275080453337065, + "grad_norm": 0.07098983973264694, + "kl": 0.034027099609375, + "learning_rate": 9.81268283396325e-06, + "loss": 0.0074, + "step": 597 + }, + { + "clip_ratio": 0.0031565858516842127, + "completion_length": 491.0178756713867, + "epoch": 0.022312392145888718, + "grad_norm": 0.11110988259315491, + "kl": 0.040496826171875, + "learning_rate": 9.81184968464607e-06, + "loss": 0.0209, + "num_tokens": 14521835.0, + "reward": 0.5249324254691601, + "reward_std": 0.14614539965987206, + "rewards/code_reward": 0.37886100402101874, + "rewards/format_reward": 1.4607142806053162, + "step": 598 + }, + { + "clip_ratio": 0.0027427654713392258, + "epoch": 0.02234970383844037, + "grad_norm": 0.09557485580444336, + "kl": 0.04315185546875, + "learning_rate": 9.811014726156856e-06, + "loss": 0.0203, + "step": 599 + }, + { + "clip_ratio": 0.0032361693447455764, + "epoch": 0.022387015530992024, + "grad_norm": 0.08271770924329758, + "kl": 0.046417236328125, + "learning_rate": 9.810177958845942e-06, + "loss": 0.0196, + "step": 600 + }, + { + "clip_ratio": 0.0027748391730710864, + "completion_length": 858.0893096923828, + "epoch": 0.022424327223543677, + "grad_norm": 0.06081134453415871, + "kl": 0.02862548828125, + "learning_rate": 9.809339383064422e-06, + "loss": 0.0105, + "num_tokens": 14602354.0, + "reward": 0.43177950754761696, + "reward_std": 0.16808597650378942, + "rewards/code_reward": 0.28713663783855736, + "rewards/format_reward": 1.4464285969734192, + "step": 601 + }, + { + "clip_ratio": 0.002626343455631286, + "epoch": 0.02246163891609533, + "grad_norm": 0.06305305659770966, + "kl": 0.028533935546875, + "learning_rate": 9.808498999164146e-06, + "loss": 0.0104, + "step": 602 + }, + { + "clip_ratio": 0.002869359275791794, + "epoch": 0.022498950608646984, + "grad_norm": 0.06685371696949005, + "kl": 0.0299072265625, + "learning_rate": 9.80765680749772e-06, + "loss": 0.0103, + "step": 603 + }, + { + "clip_ratio": 0.003793970972765237, + "completion_length": 613.7857284545898, + "epoch": 0.022536262301198637, + "grad_norm": 0.09690216183662415, + "kl": 0.04364013671875, + "learning_rate": 9.806812808418516e-06, + "loss": -0.0113, + "num_tokens": 14671838.0, + "reward": 0.844893790781498, + "reward_std": 0.22417658753693104, + "rewards/code_reward": 0.6975723206996918, + "rewards/format_reward": 1.4732142984867096, + "step": 604 + }, + { + "clip_ratio": 0.003681022033561021, + "epoch": 0.02257357399375029, + "grad_norm": 0.09720773994922638, + "kl": 0.044403076171875, + "learning_rate": 9.80596700228066e-06, + "loss": -0.0114, + "step": 605 + }, + { + "clip_ratio": 0.0036191405379213393, + "epoch": 0.022610885686301947, + "grad_norm": 0.09070476144552231, + "kl": 0.04266357421875, + "learning_rate": 9.805119389439034e-06, + "loss": -0.0118, + "step": 606 + }, + { + "clip_ratio": 0.0034805446630343795, + "completion_length": 677.5000381469727, + "epoch": 0.0226481973788536, + "grad_norm": 0.05869672819972038, + "kl": 0.033172607421875, + "learning_rate": 9.804269970249286e-06, + "loss": 0.0023, + "num_tokens": 14741810.0, + "reward": 0.5700191594660282, + "reward_std": 0.22959227114915848, + "rewards/code_reward": 0.4246619939804077, + "rewards/format_reward": 1.4535714387893677, + "step": 607 + }, + { + "clip_ratio": 0.0031112349242903292, + "epoch": 0.022685509071405253, + "grad_norm": 0.05866689234972, + "kl": 0.034088134765625, + "learning_rate": 9.80341874506781e-06, + "loss": 0.0021, + "step": 608 + }, + { + "clip_ratio": 0.003268954867962748, + "epoch": 0.022722820763956906, + "grad_norm": 0.062489960342645645, + "kl": 0.035064697265625, + "learning_rate": 9.802565714251767e-06, + "loss": 0.002, + "step": 609 + }, + { + "clip_ratio": 0.0045137551496736705, + "completion_length": 661.6428985595703, + "epoch": 0.02276013245650856, + "grad_norm": 0.08993616700172424, + "kl": 0.0330810546875, + "learning_rate": 9.801710878159072e-06, + "loss": -0.0174, + "num_tokens": 14812350.0, + "reward": 0.5553470849990845, + "reward_std": 0.24102066084742546, + "rewards/code_reward": 0.4053470864892006, + "rewards/format_reward": 1.5, + "step": 610 + }, + { + "clip_ratio": 0.004143317113630474, + "epoch": 0.022797444149060212, + "grad_norm": 0.08880582451820374, + "kl": 0.03314208984375, + "learning_rate": 9.800854237148402e-06, + "loss": -0.0178, + "step": 611 + }, + { + "clip_ratio": 0.004234972700942308, + "epoch": 0.022834755841611865, + "grad_norm": 0.08294712752103806, + "kl": 0.032623291015625, + "learning_rate": 9.799995791579183e-06, + "loss": -0.0179, + "step": 612 + }, + { + "clip_ratio": 0.0036375405616126955, + "completion_length": 712.1428985595703, + "epoch": 0.02287206753416352, + "grad_norm": 0.06578993052244186, + "kl": 0.04638671875, + "learning_rate": 9.799135541811606e-06, + "loss": 0.007, + "num_tokens": 14890618.0, + "reward": 0.6632843501865864, + "reward_std": 0.06969572138041258, + "rewards/code_reward": 0.5159628745168447, + "rewards/format_reward": 1.4732142984867096, + "step": 613 + }, + { + "clip_ratio": 0.0038852618308737874, + "epoch": 0.02290937922671517, + "grad_norm": 0.07086817920207977, + "kl": 0.0479736328125, + "learning_rate": 9.798273488206613e-06, + "loss": 0.0067, + "step": 614 + }, + { + "clip_ratio": 0.004001242690719664, + "epoch": 0.022946690919266825, + "grad_norm": 0.056542519479990005, + "kl": 0.048431396484375, + "learning_rate": 9.797409631125908e-06, + "loss": 0.0064, + "step": 615 + }, + { + "clip_ratio": 0.00483832013560459, + "completion_length": 506.5714416503906, + "epoch": 0.022984002611818478, + "grad_norm": 0.09385277330875397, + "kl": 0.050537109375, + "learning_rate": 9.796543970931947e-06, + "loss": 0.0032, + "num_tokens": 14947014.0, + "reward": 0.5774333216249943, + "reward_std": 0.2608437091112137, + "rewards/code_reward": 0.4301118850708008, + "rewards/format_reward": 1.4732142984867096, + "step": 616 + }, + { + "clip_ratio": 0.004865290597081184, + "epoch": 0.02302131430437013, + "grad_norm": 0.0899122878909111, + "kl": 0.0484619140625, + "learning_rate": 9.79567650798795e-06, + "loss": 0.0029, + "step": 617 + }, + { + "clip_ratio": 0.004402461461722851, + "epoch": 0.023058625996921784, + "grad_norm": 0.0825577825307846, + "kl": 0.0469970703125, + "learning_rate": 9.794807242657882e-06, + "loss": 0.0023, + "step": 618 + }, + { + "clip_ratio": 0.002895451500080526, + "completion_length": 751.053596496582, + "epoch": 0.023095937689473437, + "grad_norm": 0.062977135181427, + "kl": 0.028411865234375, + "learning_rate": 9.793936175306475e-06, + "loss": 0.0092, + "num_tokens": 15017423.0, + "reward": 0.5613080747425556, + "reward_std": 0.04973746812902391, + "rewards/code_reward": 0.41130804666318, + "rewards/format_reward": 1.5, + "step": 619 + }, + { + "clip_ratio": 0.002657483739312738, + "epoch": 0.023133249382025094, + "grad_norm": 0.06208345666527748, + "kl": 0.02862548828125, + "learning_rate": 9.793063306299211e-06, + "loss": 0.0092, + "step": 620 + }, + { + "clip_ratio": 0.002985748171340674, + "epoch": 0.023170561074576747, + "grad_norm": 0.05855400115251541, + "kl": 0.027679443359375, + "learning_rate": 9.79218863600233e-06, + "loss": 0.009, + "step": 621 + }, + { + "clip_ratio": 0.004003750276751816, + "completion_length": 802.6250152587891, + "epoch": 0.0232078727671284, + "grad_norm": 0.07541870325803757, + "kl": 0.0222320556640625, + "learning_rate": 9.791312164782828e-06, + "loss": 0.0171, + "num_tokens": 15090258.0, + "reward": 0.39433499798178673, + "reward_std": 0.13462700881063938, + "rewards/code_reward": 0.24433499574661255, + "rewards/format_reward": 1.5, + "step": 622 + }, + { + "clip_ratio": 0.003361461975146085, + "epoch": 0.023245184459680053, + "grad_norm": 0.08645598590373993, + "kl": 0.0221099853515625, + "learning_rate": 9.790433893008453e-06, + "loss": 0.017, + "step": 623 + }, + { + "clip_ratio": 0.003373440558789298, + "epoch": 0.023282496152231706, + "grad_norm": 0.070963554084301, + "kl": 0.022247314453125, + "learning_rate": 9.789553821047715e-06, + "loss": 0.0168, + "step": 624 + }, + { + "clip_ratio": 0.001967319520190358, + "completion_length": 567.9821548461914, + "epoch": 0.02331980784478336, + "grad_norm": 0.05598798021674156, + "kl": 0.0362548828125, + "learning_rate": 9.788671949269874e-06, + "loss": 0.014, + "num_tokens": 15149071.0, + "reward": 0.828738309442997, + "reward_std": 0.11782625474734232, + "rewards/code_reward": 0.6787383220071206, + "rewards/format_reward": 1.5, + "step": 625 + }, + { + "clip_ratio": 0.0021078676218166947, + "epoch": 0.023357119537335012, + "grad_norm": 0.05765242129564285, + "kl": 0.0362548828125, + "learning_rate": 9.787788278044948e-06, + "loss": 0.0139, + "step": 626 + }, + { + "clip_ratio": 0.002317325444892049, + "epoch": 0.023394431229886666, + "grad_norm": 0.06477253884077072, + "kl": 0.0367431640625, + "learning_rate": 9.786902807743708e-06, + "loss": 0.0137, + "step": 627 + }, + { + "clip_ratio": 0.0014680586173199117, + "completion_length": 640.9643173217773, + "epoch": 0.02343174292243832, + "grad_norm": 0.041530534625053406, + "kl": 0.028289794921875, + "learning_rate": 9.786015538737685e-06, + "loss": 0.0088, + "num_tokens": 15210683.0, + "reward": 0.6652310863137245, + "reward_std": 0.024663517251610756, + "rewards/code_reward": 0.5152310943230987, + "rewards/format_reward": 1.5, + "step": 628 + }, + { + "clip_ratio": 0.0016059914487414062, + "epoch": 0.023469054614989972, + "grad_norm": 0.03956856578588486, + "kl": 0.027618408203125, + "learning_rate": 9.785126471399155e-06, + "loss": 0.0087, + "step": 629 + }, + { + "clip_ratio": 0.00176699785515666, + "epoch": 0.023506366307541625, + "grad_norm": 0.037678226828575134, + "kl": 0.027618408203125, + "learning_rate": 9.784235606101155e-06, + "loss": 0.0086, + "step": 630 + }, + { + "clip_ratio": 0.003788027912378311, + "completion_length": 695.232177734375, + "epoch": 0.023543678000093278, + "grad_norm": 0.0681360587477684, + "kl": 0.023345947265625, + "learning_rate": 9.783342943217477e-06, + "loss": -0.0187, + "num_tokens": 15275334.0, + "reward": 0.5123523063957691, + "reward_std": 0.1812572181224823, + "rewards/code_reward": 0.36235230043530464, + "rewards/format_reward": 1.5, + "step": 631 + }, + { + "clip_ratio": 0.0041384827345609665, + "epoch": 0.02358098969264493, + "grad_norm": 0.066310353577137, + "kl": 0.02313232421875, + "learning_rate": 9.782448483122666e-06, + "loss": -0.0189, + "step": 632 + }, + { + "clip_ratio": 0.003925097640603781, + "epoch": 0.023618301385196588, + "grad_norm": 0.06021953001618385, + "kl": 0.023529052734375, + "learning_rate": 9.781552226192022e-06, + "loss": -0.0193, + "step": 633 + }, + { + "clip_ratio": 0.004688086803071201, + "completion_length": 634.4643096923828, + "epoch": 0.02365561307774824, + "grad_norm": 0.10968918353319168, + "kl": 0.037384033203125, + "learning_rate": 9.780654172801594e-06, + "loss": 0.0009, + "num_tokens": 15340068.0, + "reward": 0.5702130496501923, + "reward_std": 0.30788087844848633, + "rewards/code_reward": 0.4228915963321924, + "rewards/format_reward": 1.4732142984867096, + "step": 634 + }, + { + "clip_ratio": 0.004648245288990438, + "epoch": 0.023692924770299894, + "grad_norm": 0.09378158301115036, + "kl": 0.03802490234375, + "learning_rate": 9.779754323328192e-06, + "loss": 0.0007, + "step": 635 + }, + { + "clip_ratio": 0.00444416148820892, + "epoch": 0.023730236462851547, + "grad_norm": 0.0950494036078453, + "kl": 0.03875732421875, + "learning_rate": 9.778852678149376e-06, + "loss": 0.0002, + "step": 636 + }, + { + "clip_ratio": 0.0021789370803162456, + "completion_length": 599.0535926818848, + "epoch": 0.0237675481554032, + "grad_norm": 0.04782033711671829, + "kl": 0.034759521484375, + "learning_rate": 9.777949237643454e-06, + "loss": 0.0132, + "num_tokens": 15400369.0, + "reward": 0.6768723912537098, + "reward_std": 0.10054731741547585, + "rewards/code_reward": 0.5268724001944065, + "rewards/format_reward": 1.5, + "step": 637 + }, + { + "clip_ratio": 0.0024032911751419306, + "epoch": 0.023804859847954853, + "grad_norm": 0.04834016412496567, + "kl": 0.0340576171875, + "learning_rate": 9.7770440021895e-06, + "loss": 0.0131, + "step": 638 + }, + { + "clip_ratio": 0.002187039819546044, + "epoch": 0.023842171540506506, + "grad_norm": 0.047813985496759415, + "kl": 0.03497314453125, + "learning_rate": 9.776136972167333e-06, + "loss": 0.0131, + "step": 639 + }, + { + "clip_ratio": 0.0033047784818336368, + "completion_length": 722.535758972168, + "epoch": 0.02387948323305816, + "grad_norm": 0.0998101532459259, + "kl": 0.04931640625, + "learning_rate": 9.775228147957522e-06, + "loss": -0.02, + "num_tokens": 15467179.0, + "reward": 0.8400793597102165, + "reward_std": 0.12419067579321563, + "rewards/code_reward": 0.6954364926205017, + "rewards/format_reward": 1.4464285969734192, + "step": 640 + }, + { + "clip_ratio": 0.0033934051753021777, + "epoch": 0.023916794925609813, + "grad_norm": 0.08553320914506912, + "kl": 0.045196533203125, + "learning_rate": 9.774317529941395e-06, + "loss": -0.02, + "step": 641 + }, + { + "clip_ratio": 0.003951461578253657, + "epoch": 0.023954106618161466, + "grad_norm": 0.0742662250995636, + "kl": 0.0430908203125, + "learning_rate": 9.773405118501033e-06, + "loss": -0.0205, + "step": 642 + }, + { + "clip_ratio": 0.004347892478108406, + "completion_length": 579.2500305175781, + "epoch": 0.02399141831071312, + "grad_norm": 0.10515333712100983, + "kl": 0.039154052734375, + "learning_rate": 9.77249091401926e-06, + "loss": 0.0198, + "num_tokens": 15531047.0, + "reward": 0.5916322469711304, + "reward_std": 0.32679159566760063, + "rewards/code_reward": 0.44163220189511776, + "rewards/format_reward": 1.5, + "step": 643 + }, + { + "clip_ratio": 0.004247719654813409, + "epoch": 0.024028730003264772, + "grad_norm": 0.09132135659456253, + "kl": 0.04107666015625, + "learning_rate": 9.771574916879667e-06, + "loss": 0.0194, + "step": 644 + }, + { + "clip_ratio": 0.0044168829917907715, + "epoch": 0.024066041695816425, + "grad_norm": 0.08228785544633865, + "kl": 0.038116455078125, + "learning_rate": 9.770657127466583e-06, + "loss": 0.0189, + "step": 645 + }, + { + "clip_ratio": 0.0032234069076366723, + "completion_length": 614.7500305175781, + "epoch": 0.024103353388368078, + "grad_norm": 0.08424410223960876, + "kl": 0.058502197265625, + "learning_rate": 9.7697375461651e-06, + "loss": 0.0048, + "num_tokens": 15602769.0, + "reward": 0.6895487271249294, + "reward_std": 0.05330518609844148, + "rewards/code_reward": 0.5395487351343036, + "rewards/format_reward": 1.5, + "step": 646 + }, + { + "clip_ratio": 0.0031701382249593735, + "epoch": 0.024140665080919735, + "grad_norm": 0.08141103386878967, + "kl": 0.05859375, + "learning_rate": 9.768816173361055e-06, + "loss": 0.0046, + "step": 647 + }, + { + "clip_ratio": 0.0036615796852856874, + "epoch": 0.024177976773471388, + "grad_norm": 0.06956008821725845, + "kl": 0.0523681640625, + "learning_rate": 9.76789300944104e-06, + "loss": 0.0044, + "step": 648 + }, + { + "clip_ratio": 0.0037154400488361716, + "completion_length": 637.8750305175781, + "epoch": 0.02421528846602304, + "grad_norm": 0.08687446266412735, + "kl": 0.02557373046875, + "learning_rate": 9.766968054792397e-06, + "loss": 0.0218, + "num_tokens": 15667246.0, + "reward": 0.7177798897027969, + "reward_std": 0.3693353831768036, + "rewards/code_reward": 0.5677798986434937, + "rewards/format_reward": 1.5, + "step": 649 + }, + { + "clip_ratio": 0.003506977518554777, + "epoch": 0.024252600158574694, + "grad_norm": 0.08728500455617905, + "kl": 0.026123046875, + "learning_rate": 9.766041309803218e-06, + "loss": 0.0215, + "step": 650 + }, + { + "clip_ratio": 0.0038207624456845224, + "epoch": 0.024289911851126347, + "grad_norm": 0.1332753300666809, + "kl": 0.027008056640625, + "learning_rate": 9.76511277486235e-06, + "loss": 0.0211, + "step": 651 + }, + { + "clip_ratio": 0.0035317358560860157, + "completion_length": 647.4285888671875, + "epoch": 0.024327223543678, + "grad_norm": 0.08790852874517441, + "kl": 0.0372314453125, + "learning_rate": 9.76418245035939e-06, + "loss": 0.009, + "num_tokens": 15735002.0, + "reward": 0.956324428319931, + "reward_std": 0.3388652615249157, + "rewards/code_reward": 0.8090029805898666, + "rewards/format_reward": 1.4732142984867096, + "step": 652 + }, + { + "clip_ratio": 0.0032086222781799734, + "epoch": 0.024364535236229654, + "grad_norm": 0.09049242734909058, + "kl": 0.036865234375, + "learning_rate": 9.763250336684683e-06, + "loss": 0.0084, + "step": 653 + }, + { + "clip_ratio": 0.0028057389426976442, + "epoch": 0.024401846928781307, + "grad_norm": 0.07900263369083405, + "kl": 0.03521728515625, + "learning_rate": 9.762316434229325e-06, + "loss": 0.0081, + "step": 654 + }, + { + "clip_ratio": 0.004394150048028678, + "completion_length": 631.3571472167969, + "epoch": 0.02443915862133296, + "grad_norm": 0.09163382649421692, + "kl": 0.04022216796875, + "learning_rate": 9.761380743385167e-06, + "loss": -0.0047, + "num_tokens": 15799968.0, + "reward": 0.6394232660531998, + "reward_std": 0.23348088189959526, + "rewards/code_reward": 0.49478040263056755, + "rewards/format_reward": 1.4464285969734192, + "step": 655 + }, + { + "clip_ratio": 0.004229749902151525, + "epoch": 0.024476470313884613, + "grad_norm": 0.0896325409412384, + "kl": 0.039398193359375, + "learning_rate": 9.760443264544808e-06, + "loss": -0.0049, + "step": 656 + }, + { + "clip_ratio": 0.0036833694903180003, + "epoch": 0.024513782006436266, + "grad_norm": 0.08354759216308594, + "kl": 0.039154052734375, + "learning_rate": 9.759503998101595e-06, + "loss": -0.0057, + "step": 657 + }, + { + "clip_ratio": 0.004853549296967685, + "completion_length": 584.9286041259766, + "epoch": 0.02455109369898792, + "grad_norm": 0.09650494158267975, + "kl": 0.042022705078125, + "learning_rate": 9.758562944449628e-06, + "loss": -0.0055, + "num_tokens": 15858330.0, + "reward": 0.23370422422885895, + "reward_std": 0.1386744526680559, + "rewards/code_reward": 0.08370421989820898, + "rewards/format_reward": 1.5, + "step": 658 + }, + { + "clip_ratio": 0.004628597875125706, + "epoch": 0.024588405391539572, + "grad_norm": 0.09613670408725739, + "kl": 0.040191650390625, + "learning_rate": 9.757620103983754e-06, + "loss": -0.006, + "step": 659 + }, + { + "clip_ratio": 0.005110285594128072, + "epoch": 0.024625717084091225, + "grad_norm": 0.08428134024143219, + "kl": 0.03973388671875, + "learning_rate": 9.75667547709957e-06, + "loss": -0.0061, + "step": 660 + }, + { + "clip_ratio": 0.0034187943674623966, + "completion_length": 929.9286041259766, + "epoch": 0.024663028776642882, + "grad_norm": 0.07823214679956436, + "kl": 0.03717041015625, + "learning_rate": 9.755729064193428e-06, + "loss": -0.0013, + "num_tokens": 15949004.0, + "reward": 0.4578884206712246, + "reward_std": 0.211487234570086, + "rewards/code_reward": 0.3212812300771475, + "rewards/format_reward": 1.3660714328289032, + "step": 661 + }, + { + "clip_ratio": 0.003361632610904053, + "epoch": 0.024700340469194535, + "grad_norm": 0.07590502500534058, + "kl": 0.034912109375, + "learning_rate": 9.754780865662424e-06, + "loss": -0.0013, + "step": 662 + }, + { + "clip_ratio": 0.003522035083733499, + "epoch": 0.024737652161746188, + "grad_norm": 0.0722728744149208, + "kl": 0.0352783203125, + "learning_rate": 9.753830881904404e-06, + "loss": -0.0017, + "step": 663 + }, + { + "clip_ratio": 0.003769897622987628, + "completion_length": 678.107177734375, + "epoch": 0.02477496385429784, + "grad_norm": 0.07903086394071579, + "kl": 0.033935546875, + "learning_rate": 9.752879113317962e-06, + "loss": 0.0247, + "num_tokens": 16017666.0, + "reward": 0.3253242038190365, + "reward_std": 0.15743420273065567, + "rewards/code_reward": 0.17764561623334885, + "rewards/format_reward": 1.4767857193946838, + "step": 664 + }, + { + "clip_ratio": 0.00382449256721884, + "epoch": 0.024812275546849494, + "grad_norm": 0.0723605677485466, + "kl": 0.0355224609375, + "learning_rate": 9.751925560302443e-06, + "loss": 0.0246, + "step": 665 + }, + { + "clip_ratio": 0.0038885026588104665, + "epoch": 0.024849587239401148, + "grad_norm": 0.06787560135126114, + "kl": 0.039276123046875, + "learning_rate": 9.750970223257942e-06, + "loss": 0.0244, + "step": 666 + }, + { + "clip_ratio": 0.002770947525277734, + "completion_length": 566.1607360839844, + "epoch": 0.0248868989319528, + "grad_norm": 0.06406408548355103, + "kl": 0.033111572265625, + "learning_rate": 9.7500131025853e-06, + "loss": -0.0073, + "num_tokens": 16071813.0, + "reward": 0.6737032197415829, + "reward_std": 0.16376563161611557, + "rewards/code_reward": 0.5237032398581505, + "rewards/format_reward": 1.5, + "step": 667 + }, + { + "clip_ratio": 0.002609044313430786, + "epoch": 0.024924210624504454, + "grad_norm": 0.059586189687252045, + "kl": 0.03216552734375, + "learning_rate": 9.749054198686106e-06, + "loss": -0.0075, + "step": 668 + }, + { + "clip_ratio": 0.0023991194320842624, + "epoch": 0.024961522317056107, + "grad_norm": 0.05737625062465668, + "kl": 0.032806396484375, + "learning_rate": 9.748093511962698e-06, + "loss": -0.0079, + "step": 669 + }, + { + "clip_ratio": 0.003952798433601856, + "completion_length": 793.7857666015625, + "epoch": 0.02499883400960776, + "grad_norm": 0.08435793220996857, + "kl": 0.04345703125, + "learning_rate": 9.747131042818164e-06, + "loss": 0.0264, + "num_tokens": 16151475.0, + "reward": 0.4534314125776291, + "reward_std": 0.3027436351403594, + "rewards/code_reward": 0.30539566557854414, + "rewards/format_reward": 1.480357140302658, + "step": 670 + }, + { + "clip_ratio": 0.004041558189783245, + "epoch": 0.025036145702159413, + "grad_norm": 0.08250921964645386, + "kl": 0.041748046875, + "learning_rate": 9.746166791656337e-06, + "loss": 0.0261, + "step": 671 + }, + { + "clip_ratio": 0.0033444216824136674, + "epoch": 0.025073457394711066, + "grad_norm": 0.07660599052906036, + "kl": 0.0408935546875, + "learning_rate": 9.745200758881801e-06, + "loss": 0.0257, + "step": 672 + }, + { + "clip_ratio": 0.003276202594861388, + "completion_length": 1106.8036346435547, + "epoch": 0.02511076908726272, + "grad_norm": 0.08263489603996277, + "kl": 0.027496337890625, + "learning_rate": 9.744232944899883e-06, + "loss": 0.0079, + "num_tokens": 16258816.0, + "reward": 0.29176611825823784, + "reward_std": 0.1478812973946333, + "rewards/code_reward": 0.14980183730949648, + "rewards/format_reward": 1.4196428954601288, + "step": 673 + }, + { + "clip_ratio": 0.003073220723308623, + "epoch": 0.025148080779814376, + "grad_norm": 0.08075864613056183, + "kl": 0.02679443359375, + "learning_rate": 9.743263350116658e-06, + "loss": 0.0076, + "step": 674 + }, + { + "clip_ratio": 0.003246314183343202, + "epoch": 0.02518539247236603, + "grad_norm": 0.06788896024227142, + "kl": 0.026824951171875, + "learning_rate": 9.742291974938954e-06, + "loss": 0.0072, + "step": 675 + }, + { + "clip_ratio": 0.0038916125195100904, + "completion_length": 969.6607360839844, + "epoch": 0.025222704164917682, + "grad_norm": 0.132943794131279, + "kl": 0.15716552734375, + "learning_rate": 9.741318819774341e-06, + "loss": 0.0053, + "num_tokens": 16362229.0, + "reward": 0.451464906334877, + "reward_std": 0.21046864055097103, + "rewards/code_reward": 0.30860777432098985, + "rewards/format_reward": 1.4285714626312256, + "step": 676 + }, + { + "clip_ratio": 0.004294815065804869, + "epoch": 0.025260015857469335, + "grad_norm": 0.12265728414058685, + "kl": 0.145263671875, + "learning_rate": 9.740343885031135e-06, + "loss": 0.0051, + "step": 677 + }, + { + "clip_ratio": 0.004672415787354112, + "epoch": 0.02529732755002099, + "grad_norm": 0.08262819051742554, + "kl": 0.0855712890625, + "learning_rate": 9.739367171118404e-06, + "loss": 0.0042, + "step": 678 + }, + { + "clip_ratio": 0.005085436103399843, + "completion_length": 564.9286041259766, + "epoch": 0.02533463924257264, + "grad_norm": 0.05652032047510147, + "kl": 0.0406494140625, + "learning_rate": 9.738388678445954e-06, + "loss": 0.0015, + "num_tokens": 16416615.0, + "reward": 0.1485661379992962, + "reward_std": 0.008403682499192655, + "rewards/code_reward": 0.0005304101505316794, + "rewards/format_reward": 1.480357140302658, + "step": 679 + }, + { + "clip_ratio": 0.004880668770056218, + "epoch": 0.025371950935124295, + "grad_norm": 0.05733836814761162, + "kl": 0.0413818359375, + "learning_rate": 9.737408407424346e-06, + "loss": 0.0011, + "step": 680 + }, + { + "clip_ratio": 0.005869560060091317, + "epoch": 0.025409262627675948, + "grad_norm": 0.053949225693941116, + "kl": 0.04229736328125, + "learning_rate": 9.73642635846488e-06, + "loss": 0.0011, + "step": 681 + }, + { + "clip_ratio": 0.0038742513861507177, + "completion_length": 706.3928985595703, + "epoch": 0.0254465743202276, + "grad_norm": 0.08554106950759888, + "kl": 0.03973388671875, + "learning_rate": 9.735442531979612e-06, + "loss": 0.0044, + "num_tokens": 16487449.0, + "reward": 0.5900818705558777, + "reward_std": 0.34315144643187523, + "rewards/code_reward": 0.4427604414522648, + "rewards/format_reward": 1.4732142984867096, + "step": 682 + }, + { + "clip_ratio": 0.003939349844586104, + "epoch": 0.025483886012779254, + "grad_norm": 0.08483120054006577, + "kl": 0.03948974609375, + "learning_rate": 9.734456928381334e-06, + "loss": 0.004, + "step": 683 + }, + { + "clip_ratio": 0.003953918698243797, + "epoch": 0.025521197705330907, + "grad_norm": 0.07956794649362564, + "kl": 0.0396728515625, + "learning_rate": 9.733469548083581e-06, + "loss": 0.0035, + "step": 684 + }, + { + "clip_ratio": 0.0023489415762014687, + "completion_length": 644.5357360839844, + "epoch": 0.02555850939788256, + "grad_norm": 0.06683433055877686, + "kl": 0.038909912109375, + "learning_rate": 9.732480391500648e-06, + "loss": 0.0044, + "num_tokens": 16555673.0, + "reward": 0.4345153048634529, + "reward_std": 0.10927381692454219, + "rewards/code_reward": 0.2882653078995645, + "rewards/format_reward": 1.4625000059604645, + "step": 685 + }, + { + "clip_ratio": 0.0024396535591222346, + "epoch": 0.025595821090434213, + "grad_norm": 0.06862339377403259, + "kl": 0.04022216796875, + "learning_rate": 9.731489459047564e-06, + "loss": 0.0044, + "step": 686 + }, + { + "clip_ratio": 0.0027178525924682617, + "epoch": 0.025633132782985867, + "grad_norm": 0.05997862666845322, + "kl": 0.040618896484375, + "learning_rate": 9.730496751140105e-06, + "loss": 0.0041, + "step": 687 + }, + { + "clip_ratio": 0.0051106634782627225, + "completion_length": 572.4285888671875, + "epoch": 0.025670444475537523, + "grad_norm": 0.09271027892827988, + "kl": 0.0462646484375, + "learning_rate": 9.729502268194793e-06, + "loss": 0.0033, + "num_tokens": 16628735.0, + "reward": 0.395619697868824, + "reward_std": 0.14612728427164257, + "rewards/code_reward": 0.2527625519433059, + "rewards/format_reward": 1.4285714328289032, + "step": 688 + }, + { + "clip_ratio": 0.0039029809995554388, + "epoch": 0.025707756168089176, + "grad_norm": 0.08981873840093613, + "kl": 0.0452880859375, + "learning_rate": 9.728506010628895e-06, + "loss": 0.0028, + "step": 689 + }, + { + "clip_ratio": 0.004186424543149769, + "epoch": 0.02574506786064083, + "grad_norm": 0.08602425456047058, + "kl": 0.04571533203125, + "learning_rate": 9.72750797886042e-06, + "loss": 0.0024, + "step": 690 + }, + { + "clip_ratio": 0.0026485210401006043, + "completion_length": 709.5178833007812, + "epoch": 0.025782379553192483, + "grad_norm": 0.0441429540514946, + "kl": 0.03900146484375, + "learning_rate": 9.726508173308128e-06, + "loss": -0.0013, + "num_tokens": 16703888.0, + "reward": 0.6378236822783947, + "reward_std": 0.1441657394170761, + "rewards/code_reward": 0.4913951028138399, + "rewards/format_reward": 1.4642857015132904, + "step": 691 + }, + { + "clip_ratio": 0.003055864421185106, + "epoch": 0.025819691245744136, + "grad_norm": 0.04587646573781967, + "kl": 0.040191650390625, + "learning_rate": 9.725506594391517e-06, + "loss": -0.0013, + "step": 692 + }, + { + "clip_ratio": 0.003415958140976727, + "epoch": 0.02585700293829579, + "grad_norm": 0.04224787652492523, + "kl": 0.03936767578125, + "learning_rate": 9.724503242530827e-06, + "loss": -0.0014, + "step": 693 + }, + { + "clip_ratio": 0.002697092219023034, + "completion_length": 682.1428833007812, + "epoch": 0.025894314630847442, + "grad_norm": 0.0901390090584755, + "kl": 0.040130615234375, + "learning_rate": 9.72349811814705e-06, + "loss": 0.0105, + "num_tokens": 16777690.0, + "reward": 0.8376400619745255, + "reward_std": 0.19188097538426518, + "rewards/code_reward": 0.6894257683306932, + "rewards/format_reward": 1.4821428656578064, + "step": 694 + }, + { + "clip_ratio": 0.002586269343737513, + "epoch": 0.025931626323399095, + "grad_norm": 0.07659289240837097, + "kl": 0.041229248046875, + "learning_rate": 9.722491221661916e-06, + "loss": 0.0102, + "step": 695 + }, + { + "clip_ratio": 0.0023306127404794097, + "epoch": 0.025968938015950748, + "grad_norm": 0.06924735754728317, + "kl": 0.039337158203125, + "learning_rate": 9.721482553497903e-06, + "loss": 0.0099, + "step": 696 + }, + { + "clip_ratio": 0.002470326377078891, + "completion_length": 902.2857513427734, + "epoch": 0.0260062497085024, + "grad_norm": 0.06562488526105881, + "kl": 0.031341552734375, + "learning_rate": 9.720472114078226e-06, + "loss": -0.0189, + "num_tokens": 16868622.0, + "reward": 0.7438304200768471, + "reward_std": 0.17001502215862274, + "rewards/code_reward": 0.5938304178416729, + "rewards/format_reward": 1.5, + "step": 697 + }, + { + "clip_ratio": 0.0023579771514050663, + "epoch": 0.026043561401054054, + "grad_norm": 0.062330521643161774, + "kl": 0.0316162109375, + "learning_rate": 9.719459903826847e-06, + "loss": -0.0191, + "step": 698 + }, + { + "clip_ratio": 0.0021084268810227513, + "epoch": 0.026080873093605707, + "grad_norm": 0.06120261549949646, + "kl": 0.029815673828125, + "learning_rate": 9.718445923168473e-06, + "loss": -0.0193, + "step": 699 + }, + { + "clip_ratio": 0.0041809825343079865, + "completion_length": 840.7857666015625, + "epoch": 0.02611818478615736, + "grad_norm": 0.05254679545760155, + "kl": 0.057861328125, + "learning_rate": 9.717430172528548e-06, + "loss": 0.0084, + "num_tokens": 16953604.0, + "reward": 0.2842746563255787, + "reward_std": 0.09076095372438431, + "rewards/code_reward": 0.13427463173866272, + "rewards/format_reward": 1.5, + "step": 700 + }, + { + "clip_ratio": 0.004662310006096959, + "epoch": 0.026155496478709017, + "grad_norm": 0.05485280975699425, + "kl": 0.0557861328125, + "learning_rate": 9.716412652333263e-06, + "loss": 0.0082, + "step": 701 + }, + { + "clip_ratio": 0.004718215728644282, + "epoch": 0.02619280817126067, + "grad_norm": 0.04805172607302666, + "kl": 0.055389404296875, + "learning_rate": 9.715393363009552e-06, + "loss": 0.008, + "step": 702 + }, + { + "clip_ratio": 0.0028271101182326674, + "completion_length": 529.6250228881836, + "epoch": 0.026230119863812323, + "grad_norm": 0.057786229997873306, + "kl": 0.056365966796875, + "learning_rate": 9.714372304985092e-06, + "loss": 0.0031, + "num_tokens": 17005285.0, + "reward": 0.39625000208616257, + "reward_std": 0.01403121743351221, + "rewards/code_reward": 0.25, + "rewards/format_reward": 1.4625000059604645, + "step": 703 + }, + { + "clip_ratio": 0.0028304191073402762, + "epoch": 0.026267431556363977, + "grad_norm": 0.060363758355379105, + "kl": 0.0556640625, + "learning_rate": 9.713349478688293e-06, + "loss": 0.0032, + "step": 704 + }, + { + "clip_ratio": 0.0027640548069030046, + "epoch": 0.02630474324891563, + "grad_norm": 0.049025923013687134, + "kl": 0.04949951171875, + "learning_rate": 9.71232488454832e-06, + "loss": 0.0029, + "step": 705 + }, + { + "clip_ratio": 0.0032592995557934046, + "completion_length": 821.4643325805664, + "epoch": 0.026342054941467283, + "grad_norm": 0.06460193544626236, + "kl": 0.029205322265625, + "learning_rate": 9.711298522995071e-06, + "loss": -0.0007, + "num_tokens": 17095331.0, + "reward": 0.5975416488945484, + "reward_std": 0.1238340325653553, + "rewards/code_reward": 0.4475416373461485, + "rewards/format_reward": 1.5, + "step": 706 + }, + { + "clip_ratio": 0.0032384510850533843, + "epoch": 0.026379366634018936, + "grad_norm": 0.06528749316930771, + "kl": 0.02923583984375, + "learning_rate": 9.71027039445919e-06, + "loss": -0.0009, + "step": 707 + }, + { + "clip_ratio": 0.0027874052175320685, + "epoch": 0.02641667832657059, + "grad_norm": 0.051315467804670334, + "kl": 0.028717041015625, + "learning_rate": 9.70924049937206e-06, + "loss": -0.001, + "step": 708 + }, + { + "clip_ratio": 0.0031610133592039347, + "completion_length": 597.1607513427734, + "epoch": 0.026453990019122242, + "grad_norm": 0.0915776714682579, + "kl": 0.0325927734375, + "learning_rate": 9.708208838165806e-06, + "loss": -0.004, + "num_tokens": 17154672.0, + "reward": 0.7709945142269135, + "reward_std": 0.22005976364016533, + "rewards/code_reward": 0.6209944784641266, + "rewards/format_reward": 1.5, + "step": 709 + }, + { + "clip_ratio": 0.0029752982081845403, + "epoch": 0.026491301711673895, + "grad_norm": 0.0856356993317604, + "kl": 0.032318115234375, + "learning_rate": 9.707175411273292e-06, + "loss": -0.0043, + "step": 710 + }, + { + "clip_ratio": 0.0025810525403358042, + "epoch": 0.02652861340422555, + "grad_norm": 0.07996039092540741, + "kl": 0.03118896484375, + "learning_rate": 9.706140219128128e-06, + "loss": -0.0046, + "step": 711 + }, + { + "clip_ratio": 0.0037900206516496837, + "completion_length": 756.3928985595703, + "epoch": 0.0265659250967772, + "grad_norm": 0.07029080390930176, + "kl": 0.0313720703125, + "learning_rate": 9.705103262164657e-06, + "loss": 0.0223, + "num_tokens": 17226038.0, + "reward": 0.715542372316122, + "reward_std": 0.2613403294235468, + "rewards/code_reward": 0.5673280507326126, + "rewards/format_reward": 1.4821428656578064, + "step": 712 + }, + { + "clip_ratio": 0.003941509174183011, + "epoch": 0.026603236789328855, + "grad_norm": 0.06869374960660934, + "kl": 0.031402587890625, + "learning_rate": 9.704064540817971e-06, + "loss": 0.0221, + "step": 713 + }, + { + "clip_ratio": 0.0035677007399499416, + "epoch": 0.026640548481880508, + "grad_norm": 0.06875143945217133, + "kl": 0.030792236328125, + "learning_rate": 9.703024055523896e-06, + "loss": 0.0219, + "step": 714 + }, + { + "clip_ratio": 0.002629130380228162, + "completion_length": 750.4821701049805, + "epoch": 0.026677860174432164, + "grad_norm": 0.06328112632036209, + "kl": 0.022979736328125, + "learning_rate": 9.701981806719001e-06, + "loss": 0.1112, + "num_tokens": 17291781.0, + "reward": 0.6448179557919502, + "reward_std": 0.016415001009590924, + "rewards/code_reward": 0.5001750700321281, + "rewards/format_reward": 1.4464285671710968, + "step": 715 + }, + { + "clip_ratio": 0.0027578085428103805, + "epoch": 0.026715171866983817, + "grad_norm": 0.058517321944236755, + "kl": 0.022491455078125, + "learning_rate": 9.700937794840595e-06, + "loss": 0.1111, + "step": 716 + }, + { + "clip_ratio": 0.002827154065016657, + "epoch": 0.02675248355953547, + "grad_norm": 0.05946826934814453, + "kl": 0.02239990234375, + "learning_rate": 9.699892020326724e-06, + "loss": 0.1111, + "step": 717 + }, + { + "clip_ratio": 0.004121849138755351, + "completion_length": 848.1429061889648, + "epoch": 0.026789795252087124, + "grad_norm": 0.07875704020261765, + "kl": 0.02490234375, + "learning_rate": 9.698844483616177e-06, + "loss": 0.0357, + "num_tokens": 17364645.0, + "reward": 0.3513118736445904, + "reward_std": 0.17889759875833988, + "rewards/code_reward": 0.20309757854556665, + "rewards/format_reward": 1.4821428656578064, + "step": 718 + }, + { + "clip_ratio": 0.0039043856086209416, + "epoch": 0.026827106944638777, + "grad_norm": 0.07266589999198914, + "kl": 0.024200439453125, + "learning_rate": 9.69779518514848e-06, + "loss": 0.0355, + "step": 719 + }, + { + "clip_ratio": 0.003931555955205113, + "epoch": 0.02686441863719043, + "grad_norm": 0.07176872342824936, + "kl": 0.023712158203125, + "learning_rate": 9.6967441253639e-06, + "loss": 0.0351, + "step": 720 + }, + { + "clip_ratio": 0.0033447930472902954, + "completion_length": 851.410758972168, + "epoch": 0.026901730329742083, + "grad_norm": 0.06488914042711258, + "kl": 0.028289794921875, + "learning_rate": 9.695691304703443e-06, + "loss": 0.0301, + "num_tokens": 17443768.0, + "reward": 0.7176453769207001, + "reward_std": 0.24328166246414185, + "rewards/code_reward": 0.5676453709602356, + "rewards/format_reward": 1.5, + "step": 721 + }, + { + "clip_ratio": 0.003058555827010423, + "epoch": 0.026939042022293736, + "grad_norm": 0.06686447560787201, + "kl": 0.027862548828125, + "learning_rate": 9.694636723608853e-06, + "loss": 0.0298, + "step": 722 + }, + { + "clip_ratio": 0.0030112069216556847, + "epoch": 0.02697635371484539, + "grad_norm": 0.0607256218791008, + "kl": 0.027587890625, + "learning_rate": 9.69358038252261e-06, + "loss": 0.0295, + "step": 723 + }, + { + "clip_ratio": 0.005338232382200658, + "completion_length": 667.0178833007812, + "epoch": 0.027013665407397042, + "grad_norm": 0.9744699001312256, + "kl": 0.19451904296875, + "learning_rate": 9.692522281887934e-06, + "loss": 0.0063, + "num_tokens": 17520069.0, + "reward": 0.5533470287919044, + "reward_std": 0.24972904846072197, + "rewards/code_reward": 0.4033470042049885, + "rewards/format_reward": 1.5, + "step": 724 + }, + { + "clip_ratio": 0.00663722021272406, + "epoch": 0.027050977099948696, + "grad_norm": 0.355368971824646, + "kl": 0.060150146484375, + "learning_rate": 9.691462422148791e-06, + "loss": 0.0049, + "step": 725 + }, + { + "clip_ratio": 0.007290375244338065, + "epoch": 0.02708828879250035, + "grad_norm": 0.08652976900339127, + "kl": 0.053436279296875, + "learning_rate": 9.690400803749873e-06, + "loss": 0.0042, + "step": 726 + }, + { + "clip_ratio": 0.0032479050860274583, + "completion_length": 736.1250305175781, + "epoch": 0.027125600485052002, + "grad_norm": 0.07595648616552353, + "kl": 0.024444580078125, + "learning_rate": 9.689337427136618e-06, + "loss": -0.0162, + "num_tokens": 17593842.0, + "reward": 0.48750001192092896, + "reward_std": 0.22718430357053876, + "rewards/code_reward": 0.3392857164144516, + "rewards/format_reward": 1.4821428656578064, + "step": 727 + }, + { + "clip_ratio": 0.0032716632704250515, + "epoch": 0.027162912177603655, + "grad_norm": 0.07536167651414871, + "kl": 0.023773193359375, + "learning_rate": 9.688272292755197e-06, + "loss": -0.0162, + "step": 728 + }, + { + "clip_ratio": 0.003735521691851318, + "epoch": 0.02720022387015531, + "grad_norm": 0.0672072023153305, + "kl": 0.023162841796875, + "learning_rate": 9.687205401052521e-06, + "loss": -0.0164, + "step": 729 + }, + { + "clip_ratio": 0.003938148147426546, + "completion_length": 489.92859649658203, + "epoch": 0.027237535562706965, + "grad_norm": 0.09016937017440796, + "kl": 0.033050537109375, + "learning_rate": 9.686136752476239e-06, + "loss": 0.0013, + "num_tokens": 17646058.0, + "reward": 0.7150545231997967, + "reward_std": 0.13444645330309868, + "rewards/code_reward": 0.5650545107200742, + "rewards/format_reward": 1.5, + "step": 730 + }, + { + "clip_ratio": 0.0031964430236257613, + "epoch": 0.027274847255258618, + "grad_norm": 0.08689908683300018, + "kl": 0.036346435546875, + "learning_rate": 9.685066347474735e-06, + "loss": 0.0009, + "step": 731 + }, + { + "clip_ratio": 0.0037219725782051682, + "epoch": 0.02731215894781027, + "grad_norm": 0.077250637114048, + "kl": 0.034332275390625, + "learning_rate": 9.683994186497132e-06, + "loss": 0.0009, + "step": 732 + }, + { + "clip_ratio": 0.0033153906697407365, + "completion_length": 585.857177734375, + "epoch": 0.027349470640361924, + "grad_norm": 0.06577646732330322, + "kl": 0.026641845703125, + "learning_rate": 9.682920269993288e-06, + "loss": 0.0041, + "num_tokens": 17703338.0, + "reward": 0.8711399845778942, + "reward_std": 0.061390964314341545, + "rewards/code_reward": 0.72113998234272, + "rewards/format_reward": 1.5, + "step": 733 + }, + { + "clip_ratio": 0.0037391941295936704, + "epoch": 0.027386782332913577, + "grad_norm": 0.06614696234464645, + "kl": 0.02691650390625, + "learning_rate": 9.681844598413801e-06, + "loss": 0.004, + "step": 734 + }, + { + "clip_ratio": 0.0033459349651820958, + "epoch": 0.02742409402546523, + "grad_norm": 0.05754578858613968, + "kl": 0.0267333984375, + "learning_rate": 9.680767172209998e-06, + "loss": 0.0037, + "step": 735 + }, + { + "clip_ratio": 0.003778448444791138, + "completion_length": 706.5178985595703, + "epoch": 0.027461405718016883, + "grad_norm": 0.07564171403646469, + "kl": 0.024749755859375, + "learning_rate": 9.679687991833952e-06, + "loss": 0.0002, + "num_tokens": 17776349.0, + "reward": 0.5741851292550564, + "reward_std": 0.2111078016459942, + "rewards/code_reward": 0.424185112118721, + "rewards/format_reward": 1.5, + "step": 736 + }, + { + "clip_ratio": 0.004210481594782323, + "epoch": 0.027498717410568536, + "grad_norm": 0.0729103609919548, + "kl": 0.025299072265625, + "learning_rate": 9.678607057738463e-06, + "loss": 0.0001, + "step": 737 + }, + { + "clip_ratio": 0.0036711778957396746, + "epoch": 0.02753602910312019, + "grad_norm": 0.06681554764509201, + "kl": 0.024871826171875, + "learning_rate": 9.677524370377073e-06, + "loss": -0.0003, + "step": 738 + }, + { + "clip_ratio": 0.0031220251112245023, + "completion_length": 769.6786041259766, + "epoch": 0.027573340795671843, + "grad_norm": 0.2920572757720947, + "kl": 0.253448486328125, + "learning_rate": 9.676439930204057e-06, + "loss": -0.0025, + "num_tokens": 17845071.0, + "reward": 0.6627474874258041, + "reward_std": 0.1768129337579012, + "rewards/code_reward": 0.516318878158927, + "rewards/format_reward": 1.4642857015132904, + "step": 739 + }, + { + "clip_ratio": 0.0033183235791511834, + "epoch": 0.027610652488223496, + "grad_norm": 0.11768119782209396, + "kl": 0.108184814453125, + "learning_rate": 9.675353737674426e-06, + "loss": -0.0042, + "step": 740 + }, + { + "clip_ratio": 0.0037357822875492275, + "epoch": 0.02764796418077515, + "grad_norm": 0.06587764620780945, + "kl": 0.03533935546875, + "learning_rate": 9.674265793243928e-06, + "loss": -0.005, + "step": 741 + }, + { + "clip_ratio": 0.00449636485427618, + "completion_length": 720.6964721679688, + "epoch": 0.027685275873326805, + "grad_norm": 0.07789593189954758, + "kl": 0.02301025390625, + "learning_rate": 9.67317609736904e-06, + "loss": 0.0125, + "num_tokens": 17921004.0, + "reward": 0.46579671651124954, + "reward_std": 0.2384970746934414, + "rewards/code_reward": 0.322939564473927, + "rewards/format_reward": 1.4285714030265808, + "step": 742 + }, + { + "clip_ratio": 0.004080515529494733, + "epoch": 0.02772258756587846, + "grad_norm": 0.07443635165691376, + "kl": 0.02362060546875, + "learning_rate": 9.672084650506982e-06, + "loss": 0.0121, + "step": 743 + }, + { + "clip_ratio": 0.004771573643665761, + "epoch": 0.02775989925843011, + "grad_norm": 0.07313227653503418, + "kl": 0.023345947265625, + "learning_rate": 9.670991453115703e-06, + "loss": 0.012, + "step": 744 + }, + { + "clip_ratio": 0.0032178430701605976, + "completion_length": 614.5357513427734, + "epoch": 0.027797210950981765, + "grad_norm": 0.06926330178976059, + "kl": 0.047393798828125, + "learning_rate": 9.66989650565389e-06, + "loss": 0.0056, + "num_tokens": 17985168.0, + "reward": 0.6178069785237312, + "reward_std": 0.21357997134327888, + "rewards/code_reward": 0.4731641337275505, + "rewards/format_reward": 1.4464285969734192, + "step": 745 + }, + { + "clip_ratio": 0.0028915933216921985, + "epoch": 0.027834522643533418, + "grad_norm": 0.07593461871147156, + "kl": 0.044952392578125, + "learning_rate": 9.66879980858096e-06, + "loss": 0.0053, + "step": 746 + }, + { + "clip_ratio": 0.003091328893788159, + "epoch": 0.02787183433608507, + "grad_norm": 0.06685936450958252, + "kl": 0.044830322265625, + "learning_rate": 9.66770136235707e-06, + "loss": 0.0052, + "step": 747 + }, + { + "clip_ratio": 0.004340977116953582, + "completion_length": 864.9464569091797, + "epoch": 0.027909146028636724, + "grad_norm": 0.02995653636753559, + "kl": 0.0237579345703125, + "learning_rate": 9.666601167443104e-06, + "loss": 0.0154, + "num_tokens": 18067079.0, + "reward": 0.18571430817246437, + "reward_std": 0.023440366610884666, + "rewards/code_reward": 0.0357142873108387, + "rewards/format_reward": 1.5, + "step": 748 + }, + { + "clip_ratio": 0.003941610513720661, + "epoch": 0.027946457721188377, + "grad_norm": 0.029353097081184387, + "kl": 0.0227203369140625, + "learning_rate": 9.665499224300685e-06, + "loss": 0.0154, + "step": 749 + }, + { + "clip_ratio": 0.004010208649560809, + "epoch": 0.02798376941374003, + "grad_norm": 0.027544518932700157, + "kl": 0.02252197265625, + "learning_rate": 9.66439553339217e-06, + "loss": 0.0154, + "step": 750 + }, + { + "clip_ratio": 0.003786671150010079, + "completion_length": 792.2678833007812, + "epoch": 0.028021081106291684, + "grad_norm": 0.06262415647506714, + "kl": 0.0243682861328125, + "learning_rate": 9.663290095180644e-06, + "loss": -0.0136, + "num_tokens": 18141994.0, + "reward": 0.4604026600718498, + "reward_std": 0.19424903352046385, + "rewards/code_reward": 0.3104026265500579, + "rewards/format_reward": 1.5, + "step": 751 + }, + { + "clip_ratio": 0.003842622449155897, + "epoch": 0.028058392798843337, + "grad_norm": 0.05853721499443054, + "kl": 0.023956298828125, + "learning_rate": 9.662182910129929e-06, + "loss": -0.0136, + "step": 752 + }, + { + "clip_ratio": 0.003465545712970197, + "epoch": 0.02809570449139499, + "grad_norm": 0.06319680064916611, + "kl": 0.02423095703125, + "learning_rate": 9.66107397870458e-06, + "loss": -0.0139, + "step": 753 + }, + { + "clip_ratio": 0.0033459396800026298, + "completion_length": 681.5536041259766, + "epoch": 0.028133016183946643, + "grad_norm": 0.07511759549379349, + "kl": 0.02734375, + "learning_rate": 9.659963301369885e-06, + "loss": -0.0004, + "num_tokens": 18218143.0, + "reward": 0.4502119980752468, + "reward_std": 0.12509834952652454, + "rewards/code_reward": 0.3002120037563145, + "rewards/format_reward": 1.5, + "step": 754 + }, + { + "clip_ratio": 0.003710277029313147, + "epoch": 0.028170327876498296, + "grad_norm": 0.07442516088485718, + "kl": 0.026947021484375, + "learning_rate": 9.658850878591862e-06, + "loss": -0.0007, + "step": 755 + }, + { + "clip_ratio": 0.0036627332447096705, + "epoch": 0.028207639569049953, + "grad_norm": 0.0600881427526474, + "kl": 0.026824951171875, + "learning_rate": 9.657736710837264e-06, + "loss": -0.0007, + "step": 756 + }, + { + "clip_ratio": 0.004765044082887471, + "completion_length": 657.4643249511719, + "epoch": 0.028244951261601606, + "grad_norm": 0.09435152262449265, + "kl": 0.02947998046875, + "learning_rate": 9.656620798573572e-06, + "loss": 0.0156, + "num_tokens": 18289373.0, + "reward": 0.6962250620126724, + "reward_std": 0.3570348024368286, + "rewards/code_reward": 0.5462250411510468, + "rewards/format_reward": 1.5, + "step": 757 + }, + { + "clip_ratio": 0.003808257693890482, + "epoch": 0.02828226295415326, + "grad_norm": 0.09178141504526138, + "kl": 0.031158447265625, + "learning_rate": 9.655503142269007e-06, + "loss": 0.0152, + "step": 758 + }, + { + "clip_ratio": 0.004076414450537413, + "epoch": 0.028319574646704912, + "grad_norm": 0.09717664122581482, + "kl": 0.029693603515625, + "learning_rate": 9.654383742392515e-06, + "loss": 0.0147, + "step": 759 + }, + { + "clip_ratio": 0.004300171858631074, + "completion_length": 723.357177734375, + "epoch": 0.028356886339256565, + "grad_norm": 0.07057137042284012, + "kl": 0.028656005859375, + "learning_rate": 9.653262599413774e-06, + "loss": 0.0071, + "num_tokens": 18366715.0, + "reward": 0.6993016786873341, + "reward_std": 0.03454831917770207, + "rewards/code_reward": 0.5519802086055279, + "rewards/format_reward": 1.4732142984867096, + "step": 760 + }, + { + "clip_ratio": 0.004496539768297225, + "epoch": 0.028394198031808218, + "grad_norm": 0.06861842423677444, + "kl": 0.028411865234375, + "learning_rate": 9.652139713803193e-06, + "loss": 0.0068, + "step": 761 + }, + { + "clip_ratio": 0.004553244914859533, + "epoch": 0.02843150972435987, + "grad_norm": 0.06841633468866348, + "kl": 0.028045654296875, + "learning_rate": 9.651015086031919e-06, + "loss": 0.0067, + "step": 762 + }, + { + "clip_ratio": 0.0037327069439925253, + "completion_length": 878.5714645385742, + "epoch": 0.028468821416911524, + "grad_norm": 0.08551131933927536, + "kl": 0.0262451171875, + "learning_rate": 9.649888716571822e-06, + "loss": 0.0096, + "num_tokens": 18447625.0, + "reward": 0.368157722055912, + "reward_std": 0.10674746334552765, + "rewards/code_reward": 0.22797915036790073, + "rewards/format_reward": 1.4017857015132904, + "step": 763 + }, + { + "clip_ratio": 0.0035395475570112467, + "epoch": 0.028506133109463178, + "grad_norm": 0.07966896891593933, + "kl": 0.027374267578125, + "learning_rate": 9.648760605895505e-06, + "loss": 0.0094, + "step": 764 + }, + { + "clip_ratio": 0.003796232515014708, + "epoch": 0.02854344480201483, + "grad_norm": 0.07623524963855743, + "kl": 0.027435302734375, + "learning_rate": 9.647630754476306e-06, + "loss": 0.0089, + "step": 765 + }, + { + "clip_ratio": 0.004264712275471538, + "completion_length": 952.0893249511719, + "epoch": 0.028580756494566484, + "grad_norm": 0.07572798430919647, + "kl": 0.030487060546875, + "learning_rate": 9.646499162788286e-06, + "loss": 0.0053, + "num_tokens": 18548654.0, + "reward": 0.43303246423602104, + "reward_std": 0.26466431468725204, + "rewards/code_reward": 0.28571100160479546, + "rewards/format_reward": 1.4732142984867096, + "step": 766 + }, + { + "clip_ratio": 0.004438750678673387, + "epoch": 0.028618068187118137, + "grad_norm": 0.06957465410232544, + "kl": 0.030364990234375, + "learning_rate": 9.645365831306239e-06, + "loss": 0.0052, + "step": 767 + }, + { + "clip_ratio": 0.004492798412684351, + "epoch": 0.02865537987966979, + "grad_norm": 0.07068915665149689, + "kl": 0.030853271484375, + "learning_rate": 9.644230760505694e-06, + "loss": 0.0048, + "step": 768 + }, + { + "clip_ratio": 0.003192695148754865, + "completion_length": 893.107177734375, + "epoch": 0.028692691572221443, + "grad_norm": 0.0754704549908638, + "kl": 0.026031494140625, + "learning_rate": 9.643093950862902e-06, + "loss": 0.0048, + "num_tokens": 18627984.0, + "reward": 0.5128607489168644, + "reward_std": 0.14931945502758026, + "rewards/code_reward": 0.36464644642546773, + "rewards/format_reward": 1.4821428656578064, + "step": 769 + }, + { + "clip_ratio": 0.0035412764409556985, + "epoch": 0.0287300032647731, + "grad_norm": 0.07392465323209763, + "kl": 0.025970458984375, + "learning_rate": 9.641955402854848e-06, + "loss": 0.0046, + "step": 770 + }, + { + "clip_ratio": 0.0038643284933641553, + "epoch": 0.028767314957324753, + "grad_norm": 0.07214333117008209, + "kl": 0.02581787109375, + "learning_rate": 9.640815116959248e-06, + "loss": 0.0042, + "step": 771 + }, + { + "clip_ratio": 0.0027465890161693096, + "completion_length": 796.2857513427734, + "epoch": 0.028804626649876406, + "grad_norm": 0.06723640859127045, + "kl": 0.037994384765625, + "learning_rate": 9.639673093654542e-06, + "loss": -0.002, + "num_tokens": 18716732.0, + "reward": 0.6108149439096451, + "reward_std": 0.12448185007087886, + "rewards/code_reward": 0.467957794200629, + "rewards/format_reward": 1.4285714328289032, + "step": 772 + }, + { + "clip_ratio": 0.0028144462849013507, + "epoch": 0.02884193834242806, + "grad_norm": 0.06552080810070038, + "kl": 0.03759765625, + "learning_rate": 9.638529333419904e-06, + "loss": -0.002, + "step": 773 + }, + { + "clip_ratio": 0.002453698718454689, + "epoch": 0.028879250034979712, + "grad_norm": 0.06545063108205795, + "kl": 0.0369873046875, + "learning_rate": 9.637383836735231e-06, + "loss": -0.0023, + "step": 774 + }, + { + "clip_ratio": 0.003945053205825388, + "completion_length": 659.5536193847656, + "epoch": 0.028916561727531365, + "grad_norm": 0.1361463963985443, + "kl": 0.031707763671875, + "learning_rate": 9.636236604081156e-06, + "loss": 0.0052, + "num_tokens": 18789521.0, + "reward": 0.2642233371734619, + "reward_std": 0.05805782089009881, + "rewards/code_reward": 0.12493759486824274, + "rewards/format_reward": 1.392857164144516, + "step": 775 + }, + { + "clip_ratio": 0.0037746502785012126, + "epoch": 0.02895387342008302, + "grad_norm": 0.17721115052700043, + "kl": 0.032562255859375, + "learning_rate": 9.635087635939035e-06, + "loss": 0.0051, + "step": 776 + }, + { + "clip_ratio": 0.003757684666197747, + "epoch": 0.02899118511263467, + "grad_norm": 0.08390466123819351, + "kl": 0.0321044921875, + "learning_rate": 9.633936932790952e-06, + "loss": 0.0046, + "step": 777 + }, + { + "clip_ratio": 0.003673141007311642, + "completion_length": 711.928581237793, + "epoch": 0.029028496805186325, + "grad_norm": 0.06294823437929153, + "kl": 0.0194244384765625, + "learning_rate": 9.632784495119724e-06, + "loss": -0.0051, + "num_tokens": 18858075.0, + "reward": 0.6564767025411129, + "reward_std": 0.20863357186317444, + "rewards/code_reward": 0.5064766928553581, + "rewards/format_reward": 1.5, + "step": 778 + }, + { + "clip_ratio": 0.003534330171532929, + "epoch": 0.029065808497737978, + "grad_norm": 0.062443822622299194, + "kl": 0.0193328857421875, + "learning_rate": 9.631630323408888e-06, + "loss": -0.0052, + "step": 779 + }, + { + "clip_ratio": 0.003556630341336131, + "epoch": 0.02910312019028963, + "grad_norm": 0.059218909591436386, + "kl": 0.019500732421875, + "learning_rate": 9.63047441814272e-06, + "loss": -0.0054, + "step": 780 + }, + { + "clip_ratio": 0.004707033978775144, + "completion_length": 703.1250457763672, + "epoch": 0.029140431882841284, + "grad_norm": 0.06464865803718567, + "kl": 0.0260009765625, + "learning_rate": 9.629316779806209e-06, + "loss": 0.0061, + "num_tokens": 18932100.0, + "reward": 0.36275072768330574, + "reward_std": 0.10124765988439322, + "rewards/code_reward": 0.21542927727568895, + "rewards/format_reward": 1.4732142984867096, + "step": 781 + }, + { + "clip_ratio": 0.004417776013724506, + "epoch": 0.029177743575392937, + "grad_norm": 0.06291962414979935, + "kl": 0.027191162109375, + "learning_rate": 9.628157408885083e-06, + "loss": 0.0061, + "step": 782 + }, + { + "clip_ratio": 0.003987599979154766, + "epoch": 0.029215055267944594, + "grad_norm": 0.058003589510917664, + "kl": 0.02752685546875, + "learning_rate": 9.626996305865791e-06, + "loss": 0.0056, + "step": 783 + }, + { + "clip_ratio": 0.004074889176990837, + "completion_length": 779.3750305175781, + "epoch": 0.029252366960496247, + "grad_norm": 0.08460897952318192, + "kl": 0.028289794921875, + "learning_rate": 9.625833471235508e-06, + "loss": -0.0148, + "num_tokens": 18996633.0, + "reward": 0.514394611120224, + "reward_std": 0.23484092624858022, + "rewards/code_reward": 0.3690374605357647, + "rewards/format_reward": 1.4535714387893677, + "step": 784 + }, + { + "clip_ratio": 0.003996251791249961, + "epoch": 0.0292896786530479, + "grad_norm": 0.08559928089380264, + "kl": 0.0294036865234375, + "learning_rate": 9.624668905482144e-06, + "loss": -0.0151, + "step": 785 + }, + { + "clip_ratio": 0.0037624974502250552, + "epoch": 0.029326990345599553, + "grad_norm": 0.08056904375553131, + "kl": 0.029510498046875, + "learning_rate": 9.623502609094322e-06, + "loss": -0.0155, + "step": 786 + }, + { + "clip_ratio": 0.0032817095052450895, + "completion_length": 549.8214721679688, + "epoch": 0.029364302038151206, + "grad_norm": 0.07499027997255325, + "kl": 0.040985107421875, + "learning_rate": 9.622334582561403e-06, + "loss": 0.0103, + "num_tokens": 19055157.0, + "reward": 0.7834346741437912, + "reward_std": 0.16848662495613098, + "rewards/code_reward": 0.6378989703953266, + "rewards/format_reward": 1.455357164144516, + "step": 787 + }, + { + "clip_ratio": 0.002920141734648496, + "epoch": 0.02940161373070286, + "grad_norm": 0.07264088839292526, + "kl": 0.04229736328125, + "learning_rate": 9.62116482637347e-06, + "loss": 0.0101, + "step": 788 + }, + { + "clip_ratio": 0.0023997039534151554, + "epoch": 0.029438925423254513, + "grad_norm": 0.06988698244094849, + "kl": 0.0555419921875, + "learning_rate": 9.619993341021325e-06, + "loss": 0.0097, + "step": 789 + }, + { + "clip_ratio": 0.0027541282470338047, + "completion_length": 763.535758972168, + "epoch": 0.029476237115806166, + "grad_norm": 0.07136596739292145, + "kl": 0.026275634765625, + "learning_rate": 9.618820126996509e-06, + "loss": 0.0102, + "num_tokens": 19140853.0, + "reward": 0.6546809300780296, + "reward_std": 0.25961409881711006, + "rewards/code_reward": 0.5046809390187263, + "rewards/format_reward": 1.5, + "step": 790 + }, + { + "clip_ratio": 0.003147158888168633, + "epoch": 0.02951354880835782, + "grad_norm": 0.06589177250862122, + "kl": 0.026611328125, + "learning_rate": 9.617645184791275e-06, + "loss": 0.01, + "step": 791 + }, + { + "clip_ratio": 0.0026973412022925913, + "epoch": 0.029550860500909472, + "grad_norm": 0.06327617913484573, + "kl": 0.02703857421875, + "learning_rate": 9.616468514898609e-06, + "loss": 0.0097, + "step": 792 + }, + { + "clip_ratio": 0.0037582198274321854, + "completion_length": 554.7500228881836, + "epoch": 0.029588172193461125, + "grad_norm": 0.09062208235263824, + "kl": 0.041351318359375, + "learning_rate": 9.61529011781222e-06, + "loss": 0.0153, + "num_tokens": 19198323.0, + "reward": 0.9518877863883972, + "reward_std": 0.2949034422636032, + "rewards/code_reward": 0.8045663386583328, + "rewards/format_reward": 1.4732142984867096, + "step": 793 + }, + { + "clip_ratio": 0.0038821837515570223, + "epoch": 0.029625483886012778, + "grad_norm": 0.08808073401451111, + "kl": 0.041351318359375, + "learning_rate": 9.614109994026543e-06, + "loss": 0.015, + "step": 794 + }, + { + "clip_ratio": 0.0031883579213172197, + "epoch": 0.02966279557856443, + "grad_norm": 0.08405385911464691, + "kl": 0.040496826171875, + "learning_rate": 9.612928144036734e-06, + "loss": 0.0145, + "step": 795 + }, + { + "clip_ratio": 0.0029921172535978258, + "completion_length": 600.4107437133789, + "epoch": 0.029700107271116084, + "grad_norm": 0.07346513867378235, + "kl": 0.02593994140625, + "learning_rate": 9.611744568338674e-06, + "loss": 0.0097, + "num_tokens": 19261752.0, + "reward": 0.6042124591767788, + "reward_std": 0.13002232741564512, + "rewards/code_reward": 0.45421245880424976, + "rewards/format_reward": 1.5, + "step": 796 + }, + { + "clip_ratio": 0.0033622157643549144, + "epoch": 0.02973741896366774, + "grad_norm": 0.07873138040304184, + "kl": 0.026123046875, + "learning_rate": 9.610559267428974e-06, + "loss": 0.0094, + "step": 797 + }, + { + "clip_ratio": 0.0032116228248924017, + "epoch": 0.029774730656219394, + "grad_norm": 0.06577888876199722, + "kl": 0.026458740234375, + "learning_rate": 9.60937224180496e-06, + "loss": 0.0093, + "step": 798 + }, + { + "clip_ratio": 0.002919884049333632, + "completion_length": 484.8393020629883, + "epoch": 0.029812042348771047, + "grad_norm": 0.07221511751413345, + "kl": 0.0272216796875, + "learning_rate": 9.608183491964685e-06, + "loss": -0.0025, + "num_tokens": 19308265.0, + "reward": 0.6831168718636036, + "reward_std": 0.05631432682275772, + "rewards/code_reward": 0.5331168854609132, + "rewards/format_reward": 1.5, + "step": 799 + }, + { + "clip_ratio": 0.002761873882263899, + "epoch": 0.0298493540413227, + "grad_norm": 0.07000894844532013, + "kl": 0.026824951171875, + "learning_rate": 9.606993018406931e-06, + "loss": -0.0027, + "step": 800 + }, + { + "clip_ratio": 0.0028440243331715465, + "epoch": 0.029886665733874353, + "grad_norm": 0.06772661954164505, + "kl": 0.027130126953125, + "learning_rate": 9.605800821631195e-06, + "loss": -0.003, + "step": 801 + }, + { + "clip_ratio": 0.003954714571591467, + "completion_length": 502.89288330078125, + "epoch": 0.029923977426426007, + "grad_norm": 0.09040743857622147, + "kl": 0.023956298828125, + "learning_rate": 9.604606902137705e-06, + "loss": 0.0157, + "num_tokens": 19363615.0, + "reward": 0.6741442382335663, + "reward_std": 0.23131176456809044, + "rewards/code_reward": 0.5241442490369081, + "rewards/format_reward": 1.5, + "step": 802 + }, + { + "clip_ratio": 0.003939845657441765, + "epoch": 0.02996128911897766, + "grad_norm": 0.10706374049186707, + "kl": 0.024871826171875, + "learning_rate": 9.603411260427402e-06, + "loss": 0.0155, + "step": 803 + }, + { + "clip_ratio": 0.0035461405641399324, + "epoch": 0.029998600811529313, + "grad_norm": 0.0827750638127327, + "kl": 0.025390625, + "learning_rate": 9.602213897001957e-06, + "loss": 0.0146, + "step": 804 + }, + { + "clip_ratio": 0.004908216535113752, + "completion_length": 696.1607513427734, + "epoch": 0.030035912504080966, + "grad_norm": 0.0988248884677887, + "kl": 0.061126708984375, + "learning_rate": 9.601014812363762e-06, + "loss": 0.0003, + "num_tokens": 19443606.0, + "reward": 0.2898636907339096, + "reward_std": 0.18595536798238754, + "rewards/code_reward": 0.13986366894096136, + "rewards/format_reward": 1.5, + "step": 805 + }, + { + "clip_ratio": 0.005110140889883041, + "epoch": 0.03007322419663262, + "grad_norm": 0.08313244581222534, + "kl": 0.043609619140625, + "learning_rate": 9.599814007015929e-06, + "loss": 0.0003, + "step": 806 + }, + { + "clip_ratio": 0.005358141963370144, + "epoch": 0.030110535889184272, + "grad_norm": 0.09756236523389816, + "kl": 0.038543701171875, + "learning_rate": 9.598611481462297e-06, + "loss": -0.0, + "step": 807 + }, + { + "clip_ratio": 0.004178617964498699, + "completion_length": 605.8036117553711, + "epoch": 0.030147847581735925, + "grad_norm": 0.0656607449054718, + "kl": 0.03692626953125, + "learning_rate": 9.597407236207422e-06, + "loss": 0.0091, + "num_tokens": 19507407.0, + "reward": 0.5131547711789608, + "reward_std": 0.16650056280195713, + "rewards/code_reward": 0.36511904560029507, + "rewards/format_reward": 1.480357140302658, + "step": 808 + }, + { + "clip_ratio": 0.004412415204569697, + "epoch": 0.03018515927428758, + "grad_norm": 0.0686543732881546, + "kl": 0.03839111328125, + "learning_rate": 9.596201271756581e-06, + "loss": 0.0092, + "step": 809 + }, + { + "clip_ratio": 0.004433508205693215, + "epoch": 0.030222470966839235, + "grad_norm": 0.06250008195638657, + "kl": 0.03668212890625, + "learning_rate": 9.594993588615778e-06, + "loss": 0.0089, + "step": 810 + }, + { + "clip_ratio": 0.0033745261025615036, + "completion_length": 698.9286041259766, + "epoch": 0.030259782659390888, + "grad_norm": 0.05075261741876602, + "kl": 0.027435302734375, + "learning_rate": 9.59378418729173e-06, + "loss": -0.0017, + "num_tokens": 19577557.0, + "reward": 0.3282497152686119, + "reward_std": 0.2436935007572174, + "rewards/code_reward": 0.17824967950582504, + "rewards/format_reward": 1.5, + "step": 811 + }, + { + "clip_ratio": 0.0035331491963006556, + "epoch": 0.03029709435194254, + "grad_norm": 0.051697324961423874, + "kl": 0.027801513671875, + "learning_rate": 9.592573068291882e-06, + "loss": -0.0017, + "step": 812 + }, + { + "clip_ratio": 0.00372676900587976, + "epoch": 0.030334406044494194, + "grad_norm": 0.05024990439414978, + "kl": 0.027862548828125, + "learning_rate": 9.591360232124395e-06, + "loss": -0.0018, + "step": 813 + }, + { + "clip_ratio": 0.0031917409505695105, + "completion_length": 607.9464569091797, + "epoch": 0.030371717737045847, + "grad_norm": 0.08165678381919861, + "kl": 0.03466796875, + "learning_rate": 9.590145679298157e-06, + "loss": -0.0011, + "num_tokens": 19642612.0, + "reward": 0.899659089744091, + "reward_std": 0.23696057498455048, + "rewards/code_reward": 0.7516233697533607, + "rewards/format_reward": 1.480357140302658, + "step": 814 + }, + { + "clip_ratio": 0.0030304225510917604, + "epoch": 0.0304090294295975, + "grad_norm": 0.07627447694540024, + "kl": 0.0352783203125, + "learning_rate": 9.588929410322767e-06, + "loss": -0.0014, + "step": 815 + }, + { + "clip_ratio": 0.003074265900067985, + "epoch": 0.030446341122149154, + "grad_norm": 0.07312291115522385, + "kl": 0.036590576171875, + "learning_rate": 9.58771142570855e-06, + "loss": -0.0018, + "step": 816 + }, + { + "clip_ratio": 0.0035721984459087253, + "completion_length": 839.6250457763672, + "epoch": 0.030483652814700807, + "grad_norm": 0.0795898288488388, + "kl": 0.033538818359375, + "learning_rate": 9.586491725966551e-06, + "loss": 0.0142, + "num_tokens": 19732415.0, + "reward": 0.7235441952943802, + "reward_std": 0.11626304127275944, + "rewards/code_reward": 0.573544206097722, + "rewards/format_reward": 1.5, + "step": 817 + }, + { + "clip_ratio": 0.0027566119679249823, + "epoch": 0.03052096450725246, + "grad_norm": 0.0700603574514389, + "kl": 0.0341796875, + "learning_rate": 9.585270311608533e-06, + "loss": 0.0139, + "step": 818 + }, + { + "clip_ratio": 0.003398871631361544, + "epoch": 0.030558276199804113, + "grad_norm": 0.06282947212457657, + "kl": 0.0343017578125, + "learning_rate": 9.584047183146979e-06, + "loss": 0.0139, + "step": 819 + }, + { + "clip_ratio": 0.005123316077515483, + "completion_length": 643.6250381469727, + "epoch": 0.030595587892355766, + "grad_norm": 0.07787411659955978, + "kl": 0.031005859375, + "learning_rate": 9.58282234109509e-06, + "loss": 0.0115, + "num_tokens": 19802058.0, + "reward": 0.6404234208166599, + "reward_std": 0.14639717061072588, + "rewards/code_reward": 0.49042338505387306, + "rewards/format_reward": 1.5, + "step": 820 + }, + { + "clip_ratio": 0.00467827613465488, + "epoch": 0.03063289958490742, + "grad_norm": 0.0764201357960701, + "kl": 0.03070068359375, + "learning_rate": 9.581595785966786e-06, + "loss": 0.0112, + "step": 821 + }, + { + "clip_ratio": 0.004790327977389097, + "epoch": 0.030670211277459072, + "grad_norm": 0.0719211995601654, + "kl": 0.030975341796875, + "learning_rate": 9.58036751827671e-06, + "loss": 0.011, + "step": 822 + }, + { + "clip_ratio": 0.0015921257436275482, + "completion_length": 629.0893096923828, + "epoch": 0.030707522970010726, + "grad_norm": 0.058435868471860886, + "kl": 0.026702880859375, + "learning_rate": 9.579137538540218e-06, + "loss": 0.005, + "num_tokens": 19865301.0, + "reward": 0.9668296128511429, + "reward_std": 0.1492699608206749, + "rewards/code_reward": 0.8187939003109932, + "rewards/format_reward": 1.480357140302658, + "step": 823 + }, + { + "clip_ratio": 0.0015114251291379333, + "epoch": 0.030744834662562382, + "grad_norm": 0.059629347175359726, + "kl": 0.027069091796875, + "learning_rate": 9.577905847273387e-06, + "loss": 0.0048, + "step": 824 + }, + { + "clip_ratio": 0.0013944482197985053, + "epoch": 0.030782146355114035, + "grad_norm": 0.05386903136968613, + "kl": 0.026824951171875, + "learning_rate": 9.576672444993012e-06, + "loss": 0.0047, + "step": 825 + }, + { + "clip_ratio": 0.0038879411877132952, + "completion_length": 587.8750152587891, + "epoch": 0.03081945804766569, + "grad_norm": 0.09043409675359726, + "kl": 0.031341552734375, + "learning_rate": 9.575437332216604e-06, + "loss": 0.0174, + "num_tokens": 19930422.0, + "reward": 0.39897599816322327, + "reward_std": 0.1013263653148897, + "rewards/code_reward": 0.24897595797665417, + "rewards/format_reward": 1.5, + "step": 826 + }, + { + "clip_ratio": 0.00401117280125618, + "epoch": 0.03085676974021734, + "grad_norm": 0.08628899604082108, + "kl": 0.032012939453125, + "learning_rate": 9.574200509462397e-06, + "loss": 0.0172, + "step": 827 + }, + { + "clip_ratio": 0.003940253285691142, + "epoch": 0.030894081432768995, + "grad_norm": 0.08307154476642609, + "kl": 0.032318115234375, + "learning_rate": 9.572961977249338e-06, + "loss": 0.0168, + "step": 828 + }, + { + "clip_ratio": 0.003680263733258471, + "completion_length": 609.7857513427734, + "epoch": 0.030931393125320648, + "grad_norm": 0.09510792791843414, + "kl": 0.024871826171875, + "learning_rate": 9.571721736097089e-06, + "loss": 0.0099, + "num_tokens": 19999878.0, + "reward": 0.7202058359980583, + "reward_std": 0.20945744682103395, + "rewards/code_reward": 0.5721700824797153, + "rewards/format_reward": 1.480357140302658, + "step": 829 + }, + { + "clip_ratio": 0.003842084144707769, + "epoch": 0.0309687048178723, + "grad_norm": 0.0862003043293953, + "kl": 0.0254669189453125, + "learning_rate": 9.570479786526035e-06, + "loss": 0.0096, + "step": 830 + }, + { + "clip_ratio": 0.003383189847227186, + "epoch": 0.031006016510423954, + "grad_norm": 0.0800432488322258, + "kl": 0.02496337890625, + "learning_rate": 9.569236129057274e-06, + "loss": 0.0091, + "step": 831 + }, + { + "clip_ratio": 0.003514322394039482, + "completion_length": 581.8571624755859, + "epoch": 0.031043328202975607, + "grad_norm": 0.08474402129650116, + "kl": 0.04180908203125, + "learning_rate": 9.567990764212618e-06, + "loss": -0.0031, + "num_tokens": 20071368.0, + "reward": 0.6706973314285278, + "reward_std": 0.30114192701876163, + "rewards/code_reward": 0.5253401417285204, + "rewards/format_reward": 1.4535714089870453, + "step": 832 + }, + { + "clip_ratio": 0.002731029671849683, + "epoch": 0.03108063989552726, + "grad_norm": 0.08383109420537949, + "kl": 0.040771484375, + "learning_rate": 9.566743692514609e-06, + "loss": -0.0037, + "step": 833 + }, + { + "clip_ratio": 0.0030121903400868177, + "epoch": 0.031117951588078913, + "grad_norm": 0.09265285730361938, + "kl": 0.0421142578125, + "learning_rate": 9.565494914486486e-06, + "loss": -0.004, + "step": 834 + }, + { + "clip_ratio": 0.0022814818075858057, + "completion_length": 488.9464569091797, + "epoch": 0.031155263280630566, + "grad_norm": 0.04725905507802963, + "kl": 0.0478515625, + "learning_rate": 9.564244430652216e-06, + "loss": 0.0088, + "num_tokens": 20129731.0, + "reward": 0.8840225748717785, + "reward_std": 0.0579151026904583, + "rewards/code_reward": 0.7340225428342819, + "rewards/format_reward": 1.5, + "step": 835 + }, + { + "clip_ratio": 0.002366705215536058, + "epoch": 0.03119257497318222, + "grad_norm": 0.04429178312420845, + "kl": 0.04736328125, + "learning_rate": 9.562992241536476e-06, + "loss": 0.0087, + "step": 836 + }, + { + "clip_ratio": 0.002476133988238871, + "epoch": 0.031229886665733873, + "grad_norm": 0.04111822694540024, + "kl": 0.048065185546875, + "learning_rate": 9.561738347664666e-06, + "loss": 0.0087, + "step": 837 + }, + { + "clip_ratio": 0.0041818819008767605, + "completion_length": 591.4821701049805, + "epoch": 0.03126719835828553, + "grad_norm": 0.06756600737571716, + "kl": 0.03033447265625, + "learning_rate": 9.560482749562894e-06, + "loss": -0.0015, + "num_tokens": 20192016.0, + "reward": 0.3231869786977768, + "reward_std": 0.13497349992394447, + "rewards/code_reward": 0.17318697134032845, + "rewards/format_reward": 1.5, + "step": 838 + }, + { + "clip_ratio": 0.0043291503097862005, + "epoch": 0.03130451005083718, + "grad_norm": 0.06646870821714401, + "kl": 0.02947998046875, + "learning_rate": 9.559225447757986e-06, + "loss": -0.0018, + "step": 839 + }, + { + "clip_ratio": 0.00417615647893399, + "epoch": 0.031341821743388835, + "grad_norm": 0.06314951181411743, + "kl": 0.0286865234375, + "learning_rate": 9.557966442777484e-06, + "loss": -0.0019, + "step": 840 + }, + { + "clip_ratio": 0.003922065778169781, + "completion_length": 663.7857360839844, + "epoch": 0.03137913343594049, + "grad_norm": 0.10651123523712158, + "kl": 0.033905029296875, + "learning_rate": 9.55670573514964e-06, + "loss": -0.0015, + "num_tokens": 20271202.0, + "reward": 0.6496212258934975, + "reward_std": 0.34487318620085716, + "rewards/code_reward": 0.5015854723751545, + "rewards/format_reward": 1.480357140302658, + "step": 841 + }, + { + "clip_ratio": 0.003765575063880533, + "epoch": 0.03141644512849214, + "grad_norm": 0.08405885100364685, + "kl": 0.034027099609375, + "learning_rate": 9.555443325403427e-06, + "loss": -0.0019, + "step": 842 + }, + { + "clip_ratio": 0.003230676054954529, + "epoch": 0.031453756821043795, + "grad_norm": 0.08330560475587845, + "kl": 0.032989501953125, + "learning_rate": 9.554179214068526e-06, + "loss": -0.0023, + "step": 843 + }, + { + "clip_ratio": 0.0028582672239281237, + "completion_length": 720.2678833007812, + "epoch": 0.03149106851359545, + "grad_norm": 0.0684475302696228, + "kl": 0.0313720703125, + "learning_rate": 9.552913401675335e-06, + "loss": -0.008, + "num_tokens": 20345445.0, + "reward": 0.8842931240797043, + "reward_std": 0.22567491233348846, + "rewards/code_reward": 0.7342930808663368, + "rewards/format_reward": 1.5, + "step": 844 + }, + { + "clip_ratio": 0.002938974881544709, + "epoch": 0.0315283802061471, + "grad_norm": 0.06744751334190369, + "kl": 0.029998779296875, + "learning_rate": 9.551645888754966e-06, + "loss": -0.0077, + "step": 845 + }, + { + "clip_ratio": 0.0029194625676609576, + "epoch": 0.031565691898698754, + "grad_norm": 0.06603443622589111, + "kl": 0.0296630859375, + "learning_rate": 9.550376675839244e-06, + "loss": -0.0082, + "step": 846 + }, + { + "clip_ratio": 0.003629632876254618, + "completion_length": 585.9286041259766, + "epoch": 0.03160300359125041, + "grad_norm": 0.060099970549345016, + "kl": 0.02667236328125, + "learning_rate": 9.549105763460706e-06, + "loss": -0.0092, + "num_tokens": 20406117.0, + "reward": 0.7380101941525936, + "reward_std": 0.12831591628491879, + "rewards/code_reward": 0.5880102068185806, + "rewards/format_reward": 1.5, + "step": 847 + }, + { + "clip_ratio": 0.003764999913983047, + "epoch": 0.03164031528380206, + "grad_norm": 0.058207493275403976, + "kl": 0.02630615234375, + "learning_rate": 9.547833152152605e-06, + "loss": -0.0092, + "step": 848 + }, + { + "clip_ratio": 0.003689299337565899, + "epoch": 0.031677626976353714, + "grad_norm": 0.05573664978146553, + "kl": 0.026153564453125, + "learning_rate": 9.546558842448904e-06, + "loss": -0.0097, + "step": 849 + }, + { + "clip_ratio": 0.003914806875400245, + "completion_length": 520.0000228881836, + "epoch": 0.03171493866890537, + "grad_norm": 0.08954956382513046, + "kl": 0.0255126953125, + "learning_rate": 9.54528283488428e-06, + "loss": -0.0023, + "num_tokens": 20455705.0, + "reward": 0.5646905936300755, + "reward_std": 0.3220470920205116, + "rewards/code_reward": 0.4146905541419983, + "rewards/format_reward": 1.5, + "step": 850 + }, + { + "clip_ratio": 0.003752889286261052, + "epoch": 0.03175225036145702, + "grad_norm": 0.086339071393013, + "kl": 0.024627685546875, + "learning_rate": 9.544005129994122e-06, + "loss": -0.0026, + "step": 851 + }, + { + "clip_ratio": 0.003982186783105135, + "epoch": 0.03178956205400867, + "grad_norm": 0.07333740592002869, + "kl": 0.024688720703125, + "learning_rate": 9.542725728314532e-06, + "loss": -0.003, + "step": 852 + }, + { + "clip_ratio": 0.003655973298009485, + "completion_length": 590.4643173217773, + "epoch": 0.031826873746560326, + "grad_norm": 0.09021245688199997, + "kl": 0.0389404296875, + "learning_rate": 9.54144463038232e-06, + "loss": 0.0016, + "num_tokens": 20513593.0, + "reward": 0.6921225562691689, + "reward_std": 0.30279140919446945, + "rewards/code_reward": 0.5421225633472204, + "rewards/format_reward": 1.5, + "step": 853 + }, + { + "clip_ratio": 0.004007307637948543, + "epoch": 0.03186418543911198, + "grad_norm": 0.09435517340898514, + "kl": 0.040008544921875, + "learning_rate": 9.540161836735014e-06, + "loss": 0.0017, + "step": 854 + }, + { + "clip_ratio": 0.003503211890347302, + "epoch": 0.03190149713166363, + "grad_norm": 0.08737196773290634, + "kl": 0.037506103515625, + "learning_rate": 9.538877347910851e-06, + "loss": 0.0009, + "step": 855 + }, + { + "clip_ratio": 0.004221319337375462, + "completion_length": 519.053596496582, + "epoch": 0.031938808824215285, + "grad_norm": 0.08871160447597504, + "kl": 0.0367431640625, + "learning_rate": 9.537591164448776e-06, + "loss": 0.0098, + "num_tokens": 20573012.0, + "reward": 0.7569087818264961, + "reward_std": 0.13417457975447178, + "rewards/code_reward": 0.6069087665528059, + "rewards/format_reward": 1.5, + "step": 856 + }, + { + "clip_ratio": 0.0037635331973433495, + "epoch": 0.03197612051676694, + "grad_norm": 0.08719012886285782, + "kl": 0.037353515625, + "learning_rate": 9.53630328688845e-06, + "loss": 0.0092, + "step": 857 + }, + { + "clip_ratio": 0.0036851484328508377, + "epoch": 0.0320134322093186, + "grad_norm": 0.08182580024003983, + "kl": 0.03619384765625, + "learning_rate": 9.535013715770242e-06, + "loss": 0.0087, + "step": 858 + }, + { + "clip_ratio": 0.005214297445490956, + "completion_length": 886.5357513427734, + "epoch": 0.03205074390187025, + "grad_norm": 0.09338640421628952, + "kl": 0.05877685546875, + "learning_rate": 9.53372245163523e-06, + "loss": 0.0506, + "num_tokens": 20670230.0, + "reward": 0.3280488885939121, + "reward_std": 0.30296639259904623, + "rewards/code_reward": 0.18072745949029922, + "rewards/format_reward": 1.4732142984867096, + "step": 859 + }, + { + "clip_ratio": 0.005088376114144921, + "epoch": 0.032088055594421905, + "grad_norm": 0.09073115140199661, + "kl": 0.061553955078125, + "learning_rate": 9.532429495025206e-06, + "loss": 0.0503, + "step": 860 + }, + { + "clip_ratio": 0.004797311499714851, + "epoch": 0.03212536728697356, + "grad_norm": 0.080753855407238, + "kl": 0.0506591796875, + "learning_rate": 9.531134846482671e-06, + "loss": 0.0498, + "step": 861 + }, + { + "clip_ratio": 0.003685707284603268, + "completion_length": 710.5536117553711, + "epoch": 0.03216267897952521, + "grad_norm": 0.07679028809070587, + "kl": 0.042266845703125, + "learning_rate": 9.529838506550836e-06, + "loss": 0.0107, + "num_tokens": 20737629.0, + "reward": 0.41639216244220734, + "reward_std": 0.0784163506468758, + "rewards/code_reward": 0.2690707358997315, + "rewards/format_reward": 1.4732142984867096, + "step": 862 + }, + { + "clip_ratio": 0.004398910095915198, + "epoch": 0.032199990672076864, + "grad_norm": 0.07547535002231598, + "kl": 0.046630859375, + "learning_rate": 9.52854047577362e-06, + "loss": 0.0109, + "step": 863 + }, + { + "clip_ratio": 0.004586763505358249, + "epoch": 0.03223730236462852, + "grad_norm": 0.06482858210802078, + "kl": 0.04345703125, + "learning_rate": 9.527240754695653e-06, + "loss": 0.0105, + "step": 864 + }, + { + "clip_ratio": 0.004775337583851069, + "completion_length": 960.1607513427734, + "epoch": 0.03227461405718017, + "grad_norm": 0.07311834394931793, + "kl": 0.0304412841796875, + "learning_rate": 9.525939343862273e-06, + "loss": 0.0074, + "num_tokens": 20833118.0, + "reward": 0.21514412015676498, + "reward_std": 0.12339112348854542, + "rewards/code_reward": 0.06514410499949008, + "rewards/format_reward": 1.5, + "step": 865 + }, + { + "clip_ratio": 0.0039647434605285525, + "epoch": 0.032311925749731824, + "grad_norm": 0.07104367017745972, + "kl": 0.029998779296875, + "learning_rate": 9.524636243819527e-06, + "loss": 0.0071, + "step": 866 + }, + { + "clip_ratio": 0.003991328238043934, + "epoch": 0.03234923744228348, + "grad_norm": 0.07124081999063492, + "kl": 0.030517578125, + "learning_rate": 9.523331455114174e-06, + "loss": 0.0068, + "step": 867 + }, + { + "clip_ratio": 0.0014993137447163463, + "completion_length": 551.8928909301758, + "epoch": 0.03238654913483513, + "grad_norm": 0.061667393893003464, + "kl": 0.038665771484375, + "learning_rate": 9.522024978293681e-06, + "loss": -0.0024, + "num_tokens": 20890890.0, + "reward": 0.7417118027806282, + "reward_std": 0.030365572310984135, + "rewards/code_reward": 0.5943903475999832, + "rewards/format_reward": 1.4732142984867096, + "step": 868 + }, + { + "clip_ratio": 0.001562882971484214, + "epoch": 0.03242386082738678, + "grad_norm": 0.06561271846294403, + "kl": 0.038543701171875, + "learning_rate": 9.520716813906214e-06, + "loss": -0.0026, + "step": 869 + }, + { + "clip_ratio": 0.00164689717348665, + "epoch": 0.032461172519938436, + "grad_norm": 0.05863572657108307, + "kl": 0.03912353515625, + "learning_rate": 9.519406962500662e-06, + "loss": -0.0027, + "step": 870 + }, + { + "clip_ratio": 0.004205552337225527, + "completion_length": 796.0893173217773, + "epoch": 0.03249848421249009, + "grad_norm": 0.07827971130609512, + "kl": 0.030487060546875, + "learning_rate": 9.518095424626611e-06, + "loss": -0.0199, + "num_tokens": 20969967.0, + "reward": 0.40272733196616173, + "reward_std": 0.12062009237706661, + "rewards/code_reward": 0.2554058786481619, + "rewards/format_reward": 1.4732142984867096, + "step": 871 + }, + { + "clip_ratio": 0.0037592435837723315, + "epoch": 0.03253579590504174, + "grad_norm": 0.07568345218896866, + "kl": 0.030303955078125, + "learning_rate": 9.516782200834357e-06, + "loss": -0.02, + "step": 872 + }, + { + "clip_ratio": 0.00425542623270303, + "epoch": 0.032573107597593395, + "grad_norm": 0.06566576659679413, + "kl": 0.031463623046875, + "learning_rate": 9.515467291674906e-06, + "loss": -0.0202, + "step": 873 + }, + { + "clip_ratio": 0.003918480419088155, + "completion_length": 585.4107437133789, + "epoch": 0.03261041929014505, + "grad_norm": 0.09900378435850143, + "kl": 0.065399169921875, + "learning_rate": 9.514150697699969e-06, + "loss": 0.0106, + "num_tokens": 21033326.0, + "reward": 0.3354116529226303, + "reward_std": 0.17954642372205853, + "rewards/code_reward": 0.1900545060634613, + "rewards/format_reward": 1.4535714387893677, + "step": 874 + }, + { + "clip_ratio": 0.003627142170444131, + "epoch": 0.0326477309826967, + "grad_norm": 0.09669620543718338, + "kl": 0.05157470703125, + "learning_rate": 9.512832419461966e-06, + "loss": 0.0104, + "step": 875 + }, + { + "clip_ratio": 0.0037828420172445476, + "epoch": 0.032685042675248355, + "grad_norm": 0.08044969290494919, + "kl": 0.05096435546875, + "learning_rate": 9.511512457514018e-06, + "loss": 0.0095, + "step": 876 + }, + { + "clip_ratio": 0.003584542602766305, + "completion_length": 694.8750381469727, + "epoch": 0.03272235436780001, + "grad_norm": 0.09752579033374786, + "kl": 0.03717041015625, + "learning_rate": 9.51019081240996e-06, + "loss": -0.0155, + "num_tokens": 21106847.0, + "reward": 0.6822152994573116, + "reward_std": 0.2650728886947036, + "rewards/code_reward": 0.5348938703536987, + "rewards/format_reward": 1.4732142984867096, + "step": 877 + }, + { + "clip_ratio": 0.0035195864038541913, + "epoch": 0.03275966606035166, + "grad_norm": 0.08419207483530045, + "kl": 0.03802490234375, + "learning_rate": 9.50886748470433e-06, + "loss": -0.0157, + "step": 878 + }, + { + "clip_ratio": 0.003800875856541097, + "epoch": 0.032796977752903314, + "grad_norm": 0.07521255314350128, + "kl": 0.038238525390625, + "learning_rate": 9.50754247495237e-06, + "loss": -0.0164, + "step": 879 + }, + { + "clip_ratio": 0.003598135313950479, + "completion_length": 512.0893173217773, + "epoch": 0.03283428944545497, + "grad_norm": 0.055641788989305496, + "kl": 0.037994384765625, + "learning_rate": 9.50621578371003e-06, + "loss": 0.007, + "num_tokens": 21166520.0, + "reward": 0.53711012378335, + "reward_std": 0.06325222551822662, + "rewards/code_reward": 0.3871100563555956, + "rewards/format_reward": 1.5, + "step": 880 + }, + { + "clip_ratio": 0.003937399480491877, + "epoch": 0.03287160113800662, + "grad_norm": 0.06397485733032227, + "kl": 0.03875732421875, + "learning_rate": 9.504887411533963e-06, + "loss": 0.007, + "step": 881 + }, + { + "clip_ratio": 0.003700360539369285, + "epoch": 0.03290891283055827, + "grad_norm": 0.07812628895044327, + "kl": 0.037750244140625, + "learning_rate": 9.503557358981534e-06, + "loss": 0.0067, + "step": 882 + }, + { + "clip_ratio": 0.0030354695045389235, + "completion_length": 684.3035888671875, + "epoch": 0.03294622452310993, + "grad_norm": 0.07113184034824371, + "kl": 0.0399169921875, + "learning_rate": 9.502225626610802e-06, + "loss": 0.0112, + "num_tokens": 21235459.0, + "reward": 0.7431608103215694, + "reward_std": 0.15522429067641497, + "rewards/code_reward": 0.598517931997776, + "rewards/format_reward": 1.4464285671710968, + "step": 883 + }, + { + "clip_ratio": 0.0024310965673066676, + "epoch": 0.03298353621566158, + "grad_norm": 0.0723503977060318, + "kl": 0.0413818359375, + "learning_rate": 9.500892214980544e-06, + "loss": 0.0108, + "step": 884 + }, + { + "clip_ratio": 0.002662370912730694, + "epoch": 0.03302084790821324, + "grad_norm": 0.06749339401721954, + "kl": 0.04388427734375, + "learning_rate": 9.49955712465023e-06, + "loss": 0.0105, + "step": 885 + }, + { + "clip_ratio": 0.003662103263195604, + "completion_length": 793.6607360839844, + "epoch": 0.03305815960076489, + "grad_norm": 0.07443653792142868, + "kl": 0.029296875, + "learning_rate": 9.49822035618004e-06, + "loss": 0.0107, + "num_tokens": 21301940.0, + "reward": 0.512749582529068, + "reward_std": 0.09678029629867524, + "rewards/code_reward": 0.36542810179525986, + "rewards/format_reward": 1.4732142984867096, + "step": 886 + }, + { + "clip_ratio": 0.003516753436997533, + "epoch": 0.033095471293316546, + "grad_norm": 0.07099655270576477, + "kl": 0.029388427734375, + "learning_rate": 9.49688191013086e-06, + "loss": 0.0104, + "step": 887 + }, + { + "clip_ratio": 0.003912624844815582, + "epoch": 0.0331327829858682, + "grad_norm": 0.07087744772434235, + "kl": 0.029876708984375, + "learning_rate": 9.495541787064273e-06, + "loss": 0.0102, + "step": 888 + }, + { + "clip_ratio": 0.004652243398595601, + "completion_length": 992.3929138183594, + "epoch": 0.03317009467841985, + "grad_norm": 0.07389822602272034, + "kl": 0.03057861328125, + "learning_rate": 9.494199987542571e-06, + "loss": 0.0439, + "num_tokens": 21392710.0, + "reward": 0.26077867299318314, + "reward_std": 0.2755468273535371, + "rewards/code_reward": 0.11613581835990772, + "rewards/format_reward": 1.4464285671710968, + "step": 889 + }, + { + "clip_ratio": 0.004577949992381036, + "epoch": 0.033207406370971505, + "grad_norm": 0.08123765885829926, + "kl": 0.03082275390625, + "learning_rate": 9.49285651212875e-06, + "loss": 0.0436, + "step": 890 + }, + { + "clip_ratio": 0.004629838571418077, + "epoch": 0.03324471806352316, + "grad_norm": 0.06977388262748718, + "kl": 0.030731201171875, + "learning_rate": 9.491511361386506e-06, + "loss": 0.0436, + "step": 891 + }, + { + "clip_ratio": 0.00398042076267302, + "completion_length": 821.0536041259766, + "epoch": 0.03328202975607481, + "grad_norm": 0.06039390712976456, + "kl": 0.04840087890625, + "learning_rate": 9.49016453588024e-06, + "loss": 0.0736, + "num_tokens": 21471127.0, + "reward": 0.44068628549575806, + "reward_std": 0.0810028100386262, + "rewards/code_reward": 0.29604341834783554, + "rewards/format_reward": 1.4464285671710968, + "step": 892 + }, + { + "clip_ratio": 0.0034194402396678925, + "epoch": 0.033319341448626465, + "grad_norm": 0.05749569088220596, + "kl": 0.04693603515625, + "learning_rate": 9.488816036175054e-06, + "loss": 0.0734, + "step": 893 + }, + { + "clip_ratio": 0.0036215499276295304, + "epoch": 0.03335665314117812, + "grad_norm": 0.05673466995358467, + "kl": 0.045318603515625, + "learning_rate": 9.487465862836754e-06, + "loss": 0.0734, + "step": 894 + }, + { + "clip_ratio": 0.003460141713730991, + "completion_length": 671.4285888671875, + "epoch": 0.03339396483372977, + "grad_norm": 0.06763267517089844, + "kl": 0.03509521484375, + "learning_rate": 9.486114016431847e-06, + "loss": 0.0112, + "num_tokens": 21544455.0, + "reward": 0.4905134066939354, + "reward_std": 0.22664558049291372, + "rewards/code_reward": 0.3431919664144516, + "rewards/format_reward": 1.4732142984867096, + "step": 895 + }, + { + "clip_ratio": 0.0035167298337910324, + "epoch": 0.033431276526281424, + "grad_norm": 0.06620968878269196, + "kl": 0.0341796875, + "learning_rate": 9.484760497527543e-06, + "loss": 0.0112, + "step": 896 + }, + { + "clip_ratio": 0.0035886180121451616, + "epoch": 0.03346858821883308, + "grad_norm": 0.06623095273971558, + "kl": 0.03533935546875, + "learning_rate": 9.483405306691755e-06, + "loss": 0.0109, + "step": 897 + }, + { + "clip_ratio": 0.004896329541224986, + "completion_length": 754.732177734375, + "epoch": 0.03350589991138473, + "grad_norm": 0.08494674414396286, + "kl": 0.030853271484375, + "learning_rate": 9.482048444493092e-06, + "loss": 0.0208, + "num_tokens": 21617422.0, + "reward": 0.48868192732334137, + "reward_std": 0.33231569081544876, + "rewards/code_reward": 0.3413604870438576, + "rewards/format_reward": 1.4732142984867096, + "step": 898 + }, + { + "clip_ratio": 0.00440667598741129, + "epoch": 0.03354321160393638, + "grad_norm": 0.08057358860969543, + "kl": 0.031036376953125, + "learning_rate": 9.480689911500871e-06, + "loss": 0.0205, + "step": 899 + }, + { + "clip_ratio": 0.004399195779114962, + "epoch": 0.033580523296488037, + "grad_norm": 0.08057461678981781, + "kl": 0.030792236328125, + "learning_rate": 9.479329708285107e-06, + "loss": 0.0201, + "step": 900 + }, + { + "clip_ratio": 0.005003752710763365, + "completion_length": 736.8928833007812, + "epoch": 0.03361783498903969, + "grad_norm": 0.061701450496912, + "kl": 0.029754638671875, + "learning_rate": 9.477967835416517e-06, + "loss": 0.0187, + "num_tokens": 21695046.0, + "reward": 0.2683035805821419, + "reward_std": 0.14712523482739925, + "rewards/code_reward": 0.11830357275903225, + "rewards/format_reward": 1.5, + "step": 901 + }, + { + "clip_ratio": 0.004653093637898564, + "epoch": 0.03365514668159134, + "grad_norm": 0.05713028460741043, + "kl": 0.0291748046875, + "learning_rate": 9.476604293466514e-06, + "loss": 0.0183, + "step": 902 + }, + { + "clip_ratio": 0.005251180729828775, + "epoch": 0.033692458374142996, + "grad_norm": 0.05525389686226845, + "kl": 0.0289306640625, + "learning_rate": 9.475239083007218e-06, + "loss": 0.0183, + "step": 903 + }, + { + "clip_ratio": 0.0035696547711268067, + "completion_length": 808.6964645385742, + "epoch": 0.03372977006669465, + "grad_norm": 0.05262179300189018, + "kl": 0.027740478515625, + "learning_rate": 9.473872204611446e-06, + "loss": 0.0021, + "num_tokens": 21773247.0, + "reward": 0.5014007724821568, + "reward_std": 0.14502021297812462, + "rewards/code_reward": 0.35407931730151176, + "rewards/format_reward": 1.4732142984867096, + "step": 904 + }, + { + "clip_ratio": 0.003587437327951193, + "epoch": 0.0337670817592463, + "grad_norm": 0.051341935992240906, + "kl": 0.02801513671875, + "learning_rate": 9.472503658852714e-06, + "loss": 0.002, + "step": 905 + }, + { + "clip_ratio": 0.0035642364528030157, + "epoch": 0.033804393451797955, + "grad_norm": 0.04726048931479454, + "kl": 0.027252197265625, + "learning_rate": 9.471133446305238e-06, + "loss": 0.0018, + "step": 906 + }, + { + "clip_ratio": 0.004864144371822476, + "completion_length": 712.6964645385742, + "epoch": 0.03384170514434961, + "grad_norm": 0.06123898923397064, + "kl": 0.036041259765625, + "learning_rate": 9.469761567543932e-06, + "loss": -0.0178, + "num_tokens": 21847528.0, + "reward": 0.381889671087265, + "reward_std": 0.09424492157995701, + "rewards/code_reward": 0.23456821031868458, + "rewards/format_reward": 1.4732142984867096, + "step": 907 + }, + { + "clip_ratio": 0.004413050541188568, + "epoch": 0.03387901683690126, + "grad_norm": 0.05905822291970253, + "kl": 0.03662109375, + "learning_rate": 9.468388023144416e-06, + "loss": -0.0179, + "step": 908 + }, + { + "clip_ratio": 0.005040226969867945, + "epoch": 0.033916328529452915, + "grad_norm": 0.057133983820676804, + "kl": 0.0360107421875, + "learning_rate": 9.467012813682999e-06, + "loss": -0.018, + "step": 909 + }, + { + "clip_ratio": 0.004088623099960387, + "completion_length": 618.5714569091797, + "epoch": 0.03395364022200457, + "grad_norm": 0.06741844117641449, + "kl": 0.026763916015625, + "learning_rate": 9.465635939736696e-06, + "loss": -0.0034, + "num_tokens": 21908544.0, + "reward": 0.5571835860610008, + "reward_std": 0.09502632915973663, + "rewards/code_reward": 0.40718357264995575, + "rewards/format_reward": 1.5, + "step": 910 + }, + { + "clip_ratio": 0.003827205451671034, + "epoch": 0.03399095191455622, + "grad_norm": 0.06133883073925972, + "kl": 0.02593994140625, + "learning_rate": 9.464257401883214e-06, + "loss": -0.0034, + "step": 911 + }, + { + "clip_ratio": 0.0038135609356686473, + "epoch": 0.03402826360710788, + "grad_norm": 0.06249229609966278, + "kl": 0.02496337890625, + "learning_rate": 9.462877200700968e-06, + "loss": -0.0037, + "step": 912 + }, + { + "clip_ratio": 0.0030543655157089233, + "completion_length": 544.303596496582, + "epoch": 0.034065575299659534, + "grad_norm": 0.05662781372666359, + "kl": 0.03515625, + "learning_rate": 9.461495336769058e-06, + "loss": -0.0009, + "num_tokens": 21970203.0, + "reward": 0.4981292672455311, + "reward_std": 0.10229597240686417, + "rewards/code_reward": 0.34812925942242146, + "rewards/format_reward": 1.5, + "step": 913 + }, + { + "clip_ratio": 0.0032653920352458954, + "epoch": 0.03410288699221119, + "grad_norm": 0.05669175460934639, + "kl": 0.034271240234375, + "learning_rate": 9.460111810667295e-06, + "loss": -0.0009, + "step": 914 + }, + { + "clip_ratio": 0.002926331537310034, + "epoch": 0.03414019868476284, + "grad_norm": 0.054004911333322525, + "kl": 0.03302001953125, + "learning_rate": 9.458726622976176e-06, + "loss": -0.0013, + "step": 915 + }, + { + "clip_ratio": 0.002600991865620017, + "completion_length": 803.5714569091797, + "epoch": 0.03417751037731449, + "grad_norm": 0.08054956793785095, + "kl": 0.015533447265625, + "learning_rate": 9.4573397742769e-06, + "loss": 0.0072, + "num_tokens": 22049881.0, + "reward": 0.6634919531643391, + "reward_std": 0.0767142241820693, + "rewards/code_reward": 0.513491939753294, + "rewards/format_reward": 1.5, + "step": 916 + }, + { + "clip_ratio": 0.0028722260612994432, + "epoch": 0.034214822069866146, + "grad_norm": 0.04109775647521019, + "kl": 0.01513671875, + "learning_rate": 9.455951265151367e-06, + "loss": 0.0073, + "step": 917 + }, + { + "clip_ratio": 0.0026649533538147807, + "epoch": 0.0342521337624178, + "grad_norm": 0.040415771305561066, + "kl": 0.0150299072265625, + "learning_rate": 9.454561096182168e-06, + "loss": 0.0072, + "step": 918 + }, + { + "clip_ratio": 0.004081698716618121, + "completion_length": 868.0000457763672, + "epoch": 0.03428944545496945, + "grad_norm": 0.07393413782119751, + "kl": 0.026153564453125, + "learning_rate": 9.45316926795259e-06, + "loss": -0.0165, + "num_tokens": 22140629.0, + "reward": 0.451352845877409, + "reward_std": 0.14158951584249735, + "rewards/code_reward": 0.3013528254814446, + "rewards/format_reward": 1.5, + "step": 919 + }, + { + "clip_ratio": 0.0040016191196627915, + "epoch": 0.034326757147521106, + "grad_norm": 0.07288991659879684, + "kl": 0.02557373046875, + "learning_rate": 9.451775781046619e-06, + "loss": -0.0168, + "step": 920 + }, + { + "clip_ratio": 0.004202300158794969, + "epoch": 0.03436406884007276, + "grad_norm": 0.06963060051202774, + "kl": 0.024993896484375, + "learning_rate": 9.450380636048936e-06, + "loss": -0.017, + "step": 921 + }, + { + "clip_ratio": 0.0, + "completion_length": 663.1785888671875, + "epoch": 0.03440138053262441, + "grad_norm": 0.0026013918686658144, + "kl": 0.019927978515625, + "learning_rate": 9.448983833544919e-06, + "loss": 0.0002, + "num_tokens": 22205151.0, + "reward": 1.149999976158142, + "reward_std": 0.0, + "rewards/code_reward": 1.0, + "rewards/format_reward": 1.5, + "step": 922 + }, + { + "clip_ratio": 0.0, + "epoch": 0.034438692225176065, + "grad_norm": 0.0025978849735111, + "kl": 0.019561767578125, + "learning_rate": 9.447585374120641e-06, + "loss": 0.0002, + "step": 923 + }, + { + "clip_ratio": 0.0, + "epoch": 0.03447600391772772, + "grad_norm": 0.0025752035435289145, + "kl": 0.019500732421875, + "learning_rate": 9.44618525836287e-06, + "loss": 0.0002, + "step": 924 + }, + { + "clip_ratio": 0.0038300197338685393, + "completion_length": 709.3750305175781, + "epoch": 0.03451331561027937, + "grad_norm": 0.07882384955883026, + "kl": 0.0228271484375, + "learning_rate": 9.444783486859066e-06, + "loss": -0.0045, + "num_tokens": 22272568.0, + "reward": 0.6071945950388908, + "reward_std": 0.23349622637033463, + "rewards/code_reward": 0.45719458162784576, + "rewards/format_reward": 1.5, + "step": 925 + }, + { + "clip_ratio": 0.004192341351881623, + "epoch": 0.034550627302831025, + "grad_norm": 0.07428035140037537, + "kl": 0.022216796875, + "learning_rate": 9.443380060197387e-06, + "loss": -0.0044, + "step": 926 + }, + { + "clip_ratio": 0.0038968153530731797, + "epoch": 0.03458793899538268, + "grad_norm": 0.06716028600931168, + "kl": 0.0225830078125, + "learning_rate": 9.441974978966687e-06, + "loss": -0.0048, + "step": 927 + }, + { + "clip_ratio": 0.004480265837628394, + "completion_length": 624.2143249511719, + "epoch": 0.03462525068793433, + "grad_norm": 0.08811378479003906, + "kl": 0.024993896484375, + "learning_rate": 9.440568243756509e-06, + "loss": 0.011, + "num_tokens": 22340620.0, + "reward": 0.682870663702488, + "reward_std": 0.16310164442984387, + "rewards/code_reward": 0.5328706668078667, + "rewards/format_reward": 1.5, + "step": 928 + }, + { + "clip_ratio": 0.004059583239722997, + "epoch": 0.034662562380485984, + "grad_norm": 0.08691146224737167, + "kl": 0.0247802734375, + "learning_rate": 9.439159855157093e-06, + "loss": 0.0106, + "step": 929 + }, + { + "clip_ratio": 0.004057545156683773, + "epoch": 0.03469987407303764, + "grad_norm": 0.08255352824926376, + "kl": 0.024871826171875, + "learning_rate": 9.437749813759376e-06, + "loss": 0.0102, + "step": 930 + }, + { + "clip_ratio": 0.004541432834230363, + "completion_length": 785.732177734375, + "epoch": 0.03473718576558929, + "grad_norm": 0.09331943839788437, + "kl": 0.03314208984375, + "learning_rate": 9.436338120154982e-06, + "loss": 0.0278, + "num_tokens": 22418375.0, + "reward": 0.5887399911880493, + "reward_std": 0.28473297134041786, + "rewards/code_reward": 0.44141856767237186, + "rewards/format_reward": 1.4732142984867096, + "step": 931 + }, + { + "clip_ratio": 0.004224523843731731, + "epoch": 0.03477449745814094, + "grad_norm": 0.08570308238267899, + "kl": 0.03253173828125, + "learning_rate": 9.434924774936233e-06, + "loss": 0.0274, + "step": 932 + }, + { + "clip_ratio": 0.005047781043685973, + "epoch": 0.034811809150692596, + "grad_norm": 0.07960882782936096, + "kl": 0.03546142578125, + "learning_rate": 9.433509778696142e-06, + "loss": 0.0272, + "step": 933 + }, + { + "clip_ratio": 0.003423826885409653, + "completion_length": 526.9464569091797, + "epoch": 0.03484912084324425, + "grad_norm": 0.09510476887226105, + "kl": 0.0244140625, + "learning_rate": 9.432093132028415e-06, + "loss": -0.0058, + "num_tokens": 22476304.0, + "reward": 0.5304003842175007, + "reward_std": 0.15820863796398044, + "rewards/code_reward": 0.38218611729098484, + "rewards/format_reward": 1.4821428656578064, + "step": 934 + }, + { + "clip_ratio": 0.003813612915109843, + "epoch": 0.0348864325357959, + "grad_norm": 0.07954838871955872, + "kl": 0.023101806640625, + "learning_rate": 9.43067483552745e-06, + "loss": -0.0061, + "step": 935 + }, + { + "clip_ratio": 0.004242980328854173, + "epoch": 0.034923744228347556, + "grad_norm": 0.07263799011707306, + "kl": 0.0233154296875, + "learning_rate": 9.429254889788338e-06, + "loss": -0.0068, + "step": 936 + }, + { + "clip_ratio": 0.00333854608470574, + "completion_length": 710.5536041259766, + "epoch": 0.03496105592089921, + "grad_norm": 0.07214302569627762, + "kl": 0.024200439453125, + "learning_rate": 9.427833295406864e-06, + "loss": 0.0478, + "num_tokens": 22541855.0, + "reward": 0.3833829425275326, + "reward_std": 0.15543606411665678, + "rewards/code_reward": 0.2387400809675455, + "rewards/format_reward": 1.4464285969734192, + "step": 937 + }, + { + "clip_ratio": 0.004226793418638408, + "epoch": 0.03499836761345086, + "grad_norm": 0.06742609292268753, + "kl": 0.0266876220703125, + "learning_rate": 9.426410052979501e-06, + "loss": 0.0478, + "step": 938 + }, + { + "clip_ratio": 0.004233864485286176, + "epoch": 0.035035679306002515, + "grad_norm": 0.06557326018810272, + "kl": 0.026397705078125, + "learning_rate": 9.424985163103413e-06, + "loss": 0.0472, + "step": 939 + }, + { + "clip_ratio": 0.0017003653920255601, + "completion_length": 593.6250305175781, + "epoch": 0.035072990998554175, + "grad_norm": 0.06617343425750732, + "kl": 0.040985107421875, + "learning_rate": 9.423558626376462e-06, + "loss": -0.0095, + "num_tokens": 22609510.0, + "reward": 0.9421055391430855, + "reward_std": 0.14166581630706787, + "rewards/code_reward": 0.7921055369079113, + "rewards/format_reward": 1.5, + "step": 940 + }, + { + "clip_ratio": 0.0019669613684527576, + "epoch": 0.03511030269110583, + "grad_norm": 0.058002036064863205, + "kl": 0.04052734375, + "learning_rate": 9.422130443397192e-06, + "loss": -0.0097, + "step": 941 + }, + { + "clip_ratio": 0.0019554944592528045, + "epoch": 0.03514761438365748, + "grad_norm": 0.05645475164055824, + "kl": 0.040069580078125, + "learning_rate": 9.420700614764846e-06, + "loss": -0.0099, + "step": 942 + }, + { + "clip_ratio": 0.0033095848630182445, + "completion_length": 693.2500228881836, + "epoch": 0.035184926076209135, + "grad_norm": 0.06892628967761993, + "kl": 0.025665283203125, + "learning_rate": 9.41926914107935e-06, + "loss": 0.0043, + "num_tokens": 22674680.0, + "reward": 0.522977352142334, + "reward_std": 0.24248844385147095, + "rewards/code_reward": 0.37297734804451466, + "rewards/format_reward": 1.5, + "step": 943 + }, + { + "clip_ratio": 0.0035440208157524467, + "epoch": 0.03522223776876079, + "grad_norm": 0.06446530669927597, + "kl": 0.025543212890625, + "learning_rate": 9.417836022941325e-06, + "loss": 0.0041, + "step": 944 + }, + { + "clip_ratio": 0.003326660138554871, + "epoch": 0.03525954946131244, + "grad_norm": 0.0599253810942173, + "kl": 0.026275634765625, + "learning_rate": 9.416401260952082e-06, + "loss": 0.0039, + "step": 945 + }, + { + "clip_ratio": 0.004039239487610757, + "completion_length": 899.3393325805664, + "epoch": 0.035296861153864094, + "grad_norm": 0.08570820838212967, + "kl": 0.026763916015625, + "learning_rate": 9.414964855713616e-06, + "loss": 0.0271, + "num_tokens": 22753799.0, + "reward": 0.29650387540459633, + "reward_std": 0.1168527053669095, + "rewards/code_reward": 0.1500752749852836, + "rewards/format_reward": 1.4642857015132904, + "step": 946 + }, + { + "clip_ratio": 0.0033151479437947273, + "epoch": 0.03533417284641575, + "grad_norm": 0.08585751801729202, + "kl": 0.026885986328125, + "learning_rate": 9.413526807828622e-06, + "loss": 0.0267, + "step": 947 + }, + { + "clip_ratio": 0.0031460931641049683, + "epoch": 0.0353714845389674, + "grad_norm": 0.08370749652385712, + "kl": 0.02655029296875, + "learning_rate": 9.412087117900475e-06, + "loss": 0.0263, + "step": 948 + }, + { + "clip_ratio": 0.004212659841869026, + "completion_length": 763.9107513427734, + "epoch": 0.03540879623151905, + "grad_norm": 0.0919012501835823, + "kl": 0.0208740234375, + "learning_rate": 9.410645786533244e-06, + "loss": 0.0128, + "num_tokens": 22827056.0, + "reward": 0.5814484171569347, + "reward_std": 0.37720760703086853, + "rewards/code_reward": 0.4314484177157283, + "rewards/format_reward": 1.5, + "step": 949 + }, + { + "clip_ratio": 0.004150247317738831, + "epoch": 0.035446107924070706, + "grad_norm": 0.08619942516088486, + "kl": 0.0211334228515625, + "learning_rate": 9.40920281433168e-06, + "loss": 0.0123, + "step": 950 + }, + { + "clip_ratio": 0.00437651778338477, + "epoch": 0.03548341961662236, + "grad_norm": 0.08455240726470947, + "kl": 0.0218505859375, + "learning_rate": 9.407758201901233e-06, + "loss": 0.0119, + "step": 951 + }, + { + "clip_ratio": 0.004410386085510254, + "completion_length": 837.1071624755859, + "epoch": 0.03552073130917401, + "grad_norm": 0.09447121620178223, + "kl": 0.0244903564453125, + "learning_rate": 9.406311949848034e-06, + "loss": 0.0569, + "num_tokens": 22909942.0, + "reward": 0.5000047199428082, + "reward_std": 0.2941339500248432, + "rewards/code_reward": 0.3526832815259695, + "rewards/format_reward": 1.4732142984867096, + "step": 952 + }, + { + "clip_ratio": 0.004753854009322822, + "epoch": 0.035558043001725666, + "grad_norm": 0.0928969606757164, + "kl": 0.024627685546875, + "learning_rate": 9.404864058778901e-06, + "loss": 0.0568, + "step": 953 + }, + { + "clip_ratio": 0.00465699250344187, + "epoch": 0.03559535469427732, + "grad_norm": 0.14358839392662048, + "kl": 0.0244598388671875, + "learning_rate": 9.403414529301346e-06, + "loss": 0.0563, + "step": 954 + }, + { + "clip_ratio": 0.004371111863292754, + "completion_length": 657.2678680419922, + "epoch": 0.03563266638682897, + "grad_norm": 0.08138173818588257, + "kl": 0.02545166015625, + "learning_rate": 9.401963362023562e-06, + "loss": 0.0047, + "num_tokens": 22976961.0, + "reward": 0.6501439735293388, + "reward_std": 0.11131254583597183, + "rewards/code_reward": 0.5001439563930035, + "rewards/format_reward": 1.5, + "step": 955 + }, + { + "clip_ratio": 0.0039914860390126705, + "epoch": 0.035669978079380625, + "grad_norm": 0.07692191749811172, + "kl": 0.02581787109375, + "learning_rate": 9.400510557554431e-06, + "loss": 0.0043, + "step": 956 + }, + { + "clip_ratio": 0.003941457602195442, + "epoch": 0.03570728977193228, + "grad_norm": 0.07468171417713165, + "kl": 0.0260009765625, + "learning_rate": 9.399056116503526e-06, + "loss": 0.004, + "step": 957 + }, + { + "clip_ratio": 0.0036704427329823375, + "completion_length": 677.0893096923828, + "epoch": 0.03574460146448393, + "grad_norm": 0.09361574053764343, + "kl": 0.024017333984375, + "learning_rate": 9.397600039481101e-06, + "loss": -0.0057, + "num_tokens": 23050662.0, + "reward": 0.6543892435729504, + "reward_std": 0.26533421920612454, + "rewards/code_reward": 0.5043892209650949, + "rewards/format_reward": 1.5, + "step": 958 + }, + { + "clip_ratio": 0.0038765244535170496, + "epoch": 0.035781913157035584, + "grad_norm": 0.08661819249391556, + "kl": 0.0242919921875, + "learning_rate": 9.3961423270981e-06, + "loss": -0.0061, + "step": 959 + }, + { + "clip_ratio": 0.003637694870121777, + "epoch": 0.03581922484958724, + "grad_norm": 0.08288127183914185, + "kl": 0.025054931640625, + "learning_rate": 9.394682979966151e-06, + "loss": -0.0064, + "step": 960 + }, + { + "clip_ratio": 0.00348588265478611, + "completion_length": 858.9643249511719, + "epoch": 0.03585653654213889, + "grad_norm": 0.07163936644792557, + "kl": 0.0329742431640625, + "learning_rate": 9.39322199869757e-06, + "loss": 0.0942, + "num_tokens": 23139714.0, + "reward": 0.5792169608175755, + "reward_std": 0.15885293390601873, + "rewards/code_reward": 0.43457411229610443, + "rewards/format_reward": 1.4464285969734192, + "step": 961 + }, + { + "clip_ratio": 0.0032432041480205953, + "epoch": 0.035893848234690544, + "grad_norm": 0.06477635353803635, + "kl": 0.0321197509765625, + "learning_rate": 9.391759383905357e-06, + "loss": 0.0939, + "step": 962 + }, + { + "clip_ratio": 0.0033894767402671278, + "epoch": 0.0359311599272422, + "grad_norm": 0.06217913329601288, + "kl": 0.0313262939453125, + "learning_rate": 9.3902951362032e-06, + "loss": 0.0936, + "step": 963 + }, + { + "clip_ratio": 0.003369659127201885, + "completion_length": 594.1964569091797, + "epoch": 0.03596847161979385, + "grad_norm": 0.07068059593439102, + "kl": 0.03125, + "learning_rate": 9.388829256205466e-06, + "loss": -0.0013, + "num_tokens": 23201633.0, + "reward": 0.6433035954833031, + "reward_std": 0.015071461908519268, + "rewards/code_reward": 0.4933035671710968, + "rewards/format_reward": 1.5, + "step": 964 + }, + { + "clip_ratio": 0.0036048764595761895, + "epoch": 0.0360057833123455, + "grad_norm": 0.06088383495807648, + "kl": 0.032318115234375, + "learning_rate": 9.387361744527213e-06, + "loss": -0.0015, + "step": 965 + }, + { + "clip_ratio": 0.004090022179298103, + "epoch": 0.036043095004897156, + "grad_norm": 0.06945618242025375, + "kl": 0.03167724609375, + "learning_rate": 9.385892601784184e-06, + "loss": -0.0017, + "step": 966 + }, + { + "clip_ratio": 0.00281422957777977, + "completion_length": 582.1250152587891, + "epoch": 0.036080406697448816, + "grad_norm": 0.07881506532430649, + "kl": 0.02508544921875, + "learning_rate": 9.384421828592801e-06, + "loss": 0.0115, + "num_tokens": 23259866.0, + "reward": 0.649076234549284, + "reward_std": 0.2269187942147255, + "rewards/code_reward": 0.4990762360394001, + "rewards/format_reward": 1.5, + "step": 967 + }, + { + "clip_ratio": 0.0021178883616812527, + "epoch": 0.03611771839000047, + "grad_norm": 0.08080031722784042, + "kl": 0.025146484375, + "learning_rate": 9.382949425570173e-06, + "loss": 0.0112, + "step": 968 + }, + { + "clip_ratio": 0.0025173453614115715, + "epoch": 0.03615503008255212, + "grad_norm": 0.07420245558023453, + "kl": 0.02496337890625, + "learning_rate": 9.381475393334096e-06, + "loss": 0.0107, + "step": 969 + }, + { + "clip_ratio": 0.003251782094594091, + "completion_length": 611.2321548461914, + "epoch": 0.036192341775103776, + "grad_norm": 0.08712968975305557, + "kl": 0.027740478515625, + "learning_rate": 9.379999732503043e-06, + "loss": -0.02, + "num_tokens": 23326911.0, + "reward": 0.5934386253356934, + "reward_std": 0.12799225538037717, + "rewards/code_reward": 0.44343863404355943, + "rewards/format_reward": 1.5, + "step": 970 + }, + { + "clip_ratio": 0.003078072564676404, + "epoch": 0.03622965346765543, + "grad_norm": 0.08401865512132645, + "kl": 0.02752685546875, + "learning_rate": 9.378522443696177e-06, + "loss": -0.0204, + "step": 971 + }, + { + "clip_ratio": 0.0028088699909858406, + "epoch": 0.03626696516020708, + "grad_norm": 0.07989899069070816, + "kl": 0.02783203125, + "learning_rate": 9.377043527533339e-06, + "loss": -0.0209, + "step": 972 + }, + { + "clip_ratio": 0.0030367985018529, + "completion_length": 607.1964492797852, + "epoch": 0.036304276852758735, + "grad_norm": 0.0635046660900116, + "kl": 0.0250244140625, + "learning_rate": 9.375562984635055e-06, + "loss": 0.0008, + "num_tokens": 23384392.0, + "reward": 0.7513072304427624, + "reward_std": 0.113032978028059, + "rewards/code_reward": 0.6013071909546852, + "rewards/format_reward": 1.5, + "step": 973 + }, + { + "clip_ratio": 0.003208088339306414, + "epoch": 0.03634158854531039, + "grad_norm": 0.06585624068975449, + "kl": 0.0250244140625, + "learning_rate": 9.374080815622532e-06, + "loss": 0.0008, + "step": 974 + }, + { + "clip_ratio": 0.0037100136978551745, + "epoch": 0.03637890023786204, + "grad_norm": 0.05791882798075676, + "kl": 0.0252685546875, + "learning_rate": 9.372597021117664e-06, + "loss": 0.0006, + "step": 975 + }, + { + "clip_ratio": 0.002637782657984644, + "completion_length": 551.8750457763672, + "epoch": 0.036416211930413694, + "grad_norm": 0.06379517167806625, + "kl": 0.02587890625, + "learning_rate": 9.371111601743022e-06, + "loss": 0.0061, + "num_tokens": 23437977.0, + "reward": 0.7859028987586498, + "reward_std": 0.17293168976902962, + "rewards/code_reward": 0.6359028974547982, + "rewards/format_reward": 1.5, + "step": 976 + }, + { + "clip_ratio": 0.002358135941904038, + "epoch": 0.03645352362296535, + "grad_norm": 0.06250271946191788, + "kl": 0.026611328125, + "learning_rate": 9.369624558121859e-06, + "loss": 0.0059, + "step": 977 + }, + { + "clip_ratio": 0.002599135274067521, + "epoch": 0.036490835315517, + "grad_norm": 0.05840061604976654, + "kl": 0.02642822265625, + "learning_rate": 9.368135890878112e-06, + "loss": 0.0056, + "step": 978 + }, + { + "clip_ratio": 0.004454618087038398, + "completion_length": 629.3393096923828, + "epoch": 0.036528147008068654, + "grad_norm": 0.06738469004631042, + "kl": 0.028411865234375, + "learning_rate": 9.366645600636397e-06, + "loss": 0.0086, + "num_tokens": 23500310.0, + "reward": 0.39822088927030563, + "reward_std": 0.008835607208311558, + "rewards/code_reward": 0.24822089629014954, + "rewards/format_reward": 1.5, + "step": 979 + }, + { + "clip_ratio": 0.004602465953212231, + "epoch": 0.03656545870062031, + "grad_norm": 0.06303027272224426, + "kl": 0.028594970703125, + "learning_rate": 9.365153688022015e-06, + "loss": 0.0082, + "step": 980 + }, + { + "clip_ratio": 0.00430378841701895, + "epoch": 0.03660277039317196, + "grad_norm": 0.055744584649801254, + "kl": 0.02899169921875, + "learning_rate": 9.363660153660942e-06, + "loss": 0.0078, + "step": 981 + }, + { + "clip_ratio": 0.0028647987346630543, + "completion_length": 541.4643096923828, + "epoch": 0.03664008208572361, + "grad_norm": 0.08293138444423676, + "kl": 0.028289794921875, + "learning_rate": 9.362164998179839e-06, + "loss": 0.0008, + "num_tokens": 23559262.0, + "reward": 0.6774423234164715, + "reward_std": 0.15074267145246267, + "rewards/code_reward": 0.5301208943128586, + "rewards/format_reward": 1.4732142984867096, + "step": 982 + }, + { + "clip_ratio": 0.002536011568736285, + "epoch": 0.036677393778275266, + "grad_norm": 0.07888031005859375, + "kl": 0.0284423828125, + "learning_rate": 9.360668222206043e-06, + "loss": 0.0003, + "step": 983 + }, + { + "clip_ratio": 0.002304037829162553, + "epoch": 0.03671470547082692, + "grad_norm": 0.07472971826791763, + "kl": 0.02850341796875, + "learning_rate": 9.359169826367576e-06, + "loss": -0.0002, + "step": 984 + }, + { + "clip_ratio": 0.002032045100349933, + "completion_length": 582.5893173217773, + "epoch": 0.03675201716337857, + "grad_norm": 0.0601794496178627, + "kl": 0.021331787109375, + "learning_rate": 9.357669811293138e-06, + "loss": 0.0051, + "num_tokens": 23621007.0, + "reward": 0.9002515077590942, + "reward_std": 0.24160972237586975, + "rewards/code_reward": 0.7502515092492104, + "rewards/format_reward": 1.5, + "step": 985 + }, + { + "clip_ratio": 0.0019694818183779716, + "epoch": 0.036789328855930226, + "grad_norm": 0.059304095804691315, + "kl": 0.021392822265625, + "learning_rate": 9.356168177612102e-06, + "loss": 0.0049, + "step": 986 + }, + { + "clip_ratio": 0.0017509160679765046, + "epoch": 0.03682664054848188, + "grad_norm": 0.056348662823438644, + "kl": 0.021514892578125, + "learning_rate": 9.354664925954533e-06, + "loss": 0.0047, + "step": 987 + }, + { + "clip_ratio": 0.005199479288421571, + "completion_length": 726.5714569091797, + "epoch": 0.03686395224103353, + "grad_norm": 0.08393554389476776, + "kl": 0.061309814453125, + "learning_rate": 9.353160056951161e-06, + "loss": 0.0057, + "num_tokens": 23696233.0, + "reward": 0.5649434998631477, + "reward_std": 0.11174412444233894, + "rewards/code_reward": 0.4167291969060898, + "rewards/format_reward": 1.4821428656578064, + "step": 988 + }, + { + "clip_ratio": 0.004724144120700657, + "epoch": 0.036901263933585185, + "grad_norm": 0.06890065968036652, + "kl": 0.049774169921875, + "learning_rate": 9.351653571233404e-06, + "loss": 0.0052, + "step": 989 + }, + { + "clip_ratio": 0.004780091636348516, + "epoch": 0.03693857562613684, + "grad_norm": 0.0657135620713234, + "kl": 0.049652099609375, + "learning_rate": 9.350145469433356e-06, + "loss": 0.0051, + "step": 990 + }, + { + "clip_ratio": 0.00278476532548666, + "completion_length": 610.5000381469727, + "epoch": 0.03697588731868849, + "grad_norm": 0.061730336397886276, + "kl": 0.0245361328125, + "learning_rate": 9.348635752183788e-06, + "loss": -0.0003, + "num_tokens": 23760101.0, + "reward": 0.40392274037003517, + "reward_std": 0.21132676303386688, + "rewards/code_reward": 0.2539227083325386, + "rewards/format_reward": 1.5, + "step": 991 + }, + { + "clip_ratio": 0.0030979413422755897, + "epoch": 0.037013199011240144, + "grad_norm": 0.062197495251894, + "kl": 0.026824951171875, + "learning_rate": 9.347124420118148e-06, + "loss": -0.0004, + "step": 992 + }, + { + "clip_ratio": 0.0029465818661265075, + "epoch": 0.0370505107037918, + "grad_norm": 0.05803210660815239, + "kl": 0.026336669921875, + "learning_rate": 9.345611473870566e-06, + "loss": -0.0007, + "step": 993 + }, + { + "clip_ratio": 0.002845784300006926, + "completion_length": 662.5357437133789, + "epoch": 0.03708782239634346, + "grad_norm": 0.07036542892456055, + "kl": 0.026641845703125, + "learning_rate": 9.34409691407584e-06, + "loss": 0.008, + "num_tokens": 23824237.0, + "reward": 0.7373016141355038, + "reward_std": 0.18570900335907936, + "rewards/code_reward": 0.5873015895485878, + "rewards/format_reward": 1.5, + "step": 994 + }, + { + "clip_ratio": 0.003013899317011237, + "epoch": 0.03712513408889511, + "grad_norm": 0.07030674070119858, + "kl": 0.026397705078125, + "learning_rate": 9.34258074136946e-06, + "loss": 0.0079, + "step": 995 + }, + { + "clip_ratio": 0.002467009355314076, + "epoch": 0.037162445781446764, + "grad_norm": 0.0630888044834137, + "kl": 0.026580810546875, + "learning_rate": 9.341062956387576e-06, + "loss": 0.0075, + "step": 996 + }, + { + "clip_ratio": 0.0039284080266952515, + "completion_length": 688.5536041259766, + "epoch": 0.03719975747399842, + "grad_norm": 0.09107758849859238, + "kl": 0.02606201171875, + "learning_rate": 9.339543559767026e-06, + "loss": 0.0014, + "num_tokens": 23891188.0, + "reward": 0.6608281321823597, + "reward_std": 0.2518746294081211, + "rewards/code_reward": 0.5126138413324952, + "rewards/format_reward": 1.4821428656578064, + "step": 997 + }, + { + "clip_ratio": 0.0036177326692268252, + "epoch": 0.03723706916655007, + "grad_norm": 0.09290468692779541, + "kl": 0.02606201171875, + "learning_rate": 9.33802255214532e-06, + "loss": 0.0012, + "step": 998 + }, + { + "clip_ratio": 0.0031471707625314593, + "epoch": 0.03727438085910172, + "grad_norm": 0.08495660126209259, + "kl": 0.02606201171875, + "learning_rate": 9.336499934160645e-06, + "loss": 0.0007, + "step": 999 + }, + { + "clip_ratio": 0.0032481738016940653, + "completion_length": 818.9286193847656, + "epoch": 0.037311692551653376, + "grad_norm": 0.07068884372711182, + "kl": 0.02008056640625, + "learning_rate": 9.334975706451863e-06, + "loss": -0.0053, + "num_tokens": 23981148.0, + "reward": 0.5511904992163181, + "reward_std": 0.18659561593085527, + "rewards/code_reward": 0.4011905062943697, + "rewards/format_reward": 1.5, + "step": 1000 + }, + { + "clip_ratio": 0.0030841114348731935, + "epoch": 0.03734900424420503, + "grad_norm": 0.06877072155475616, + "kl": 0.019439697265625, + "learning_rate": 9.33344986965851e-06, + "loss": -0.0056, + "step": 1001 + }, + { + "clip_ratio": 0.0029330667457543314, + "epoch": 0.03738631593675668, + "grad_norm": 0.06665188074111938, + "kl": 0.019744873046875, + "learning_rate": 9.331922424420803e-06, + "loss": -0.0056, + "step": 1002 + }, + { + "clip_ratio": 0.003671571845188737, + "completion_length": 821.5536193847656, + "epoch": 0.037423627629308336, + "grad_norm": 0.0770421177148819, + "kl": 0.025115966796875, + "learning_rate": 9.330393371379624e-06, + "loss": 0.0483, + "num_tokens": 24053335.0, + "reward": 0.4566052705049515, + "reward_std": 0.11394684156402946, + "rewards/code_reward": 0.3128552446141839, + "rewards/format_reward": 1.4375000298023224, + "step": 1003 + }, + { + "clip_ratio": 0.003336434834636748, + "epoch": 0.03746093932185999, + "grad_norm": 0.07518985122442245, + "kl": 0.026763916015625, + "learning_rate": 9.328862711176539e-06, + "loss": 0.0479, + "step": 1004 + }, + { + "clip_ratio": 0.003141647612210363, + "epoch": 0.03749825101441164, + "grad_norm": 0.07410664856433868, + "kl": 0.02593994140625, + "learning_rate": 9.327330444453783e-06, + "loss": 0.0475, + "step": 1005 + }, + { + "clip_ratio": 0.00499309238512069, + "completion_length": 809.3214645385742, + "epoch": 0.037535562706963295, + "grad_norm": 0.13249045610427856, + "kl": 0.041351318359375, + "learning_rate": 9.32579657185427e-06, + "loss": -0.0125, + "num_tokens": 24142505.0, + "reward": 0.47165536880493164, + "reward_std": 0.1882252162322402, + "rewards/code_reward": 0.3243339229375124, + "rewards/format_reward": 1.4732142984867096, + "step": 1006 + }, + { + "clip_ratio": 0.004739236086606979, + "epoch": 0.03757287439951495, + "grad_norm": 0.08789905905723572, + "kl": 0.0413818359375, + "learning_rate": 9.32426109402158e-06, + "loss": -0.013, + "step": 1007 + }, + { + "clip_ratio": 0.00412282015895471, + "epoch": 0.0376101860920666, + "grad_norm": 0.07699283957481384, + "kl": 0.042388916015625, + "learning_rate": 9.322724011599973e-06, + "loss": -0.0135, + "step": 1008 + }, + { + "clip_ratio": 0.002722372068092227, + "completion_length": 675.4107513427734, + "epoch": 0.037647497784618254, + "grad_norm": 0.06331321597099304, + "kl": 0.023468017578125, + "learning_rate": 9.32118532523438e-06, + "loss": -0.0008, + "num_tokens": 24205906.0, + "reward": 0.8571052998304367, + "reward_std": 0.05646671401336789, + "rewards/code_reward": 0.707105299574323, + "rewards/format_reward": 1.5, + "step": 1009 + }, + { + "clip_ratio": 0.0025642318069003522, + "epoch": 0.03768480947716991, + "grad_norm": 0.059524934738874435, + "kl": 0.023345947265625, + "learning_rate": 9.319645035570406e-06, + "loss": -0.0012, + "step": 1010 + }, + { + "clip_ratio": 0.002009223389904946, + "epoch": 0.03772212116972156, + "grad_norm": 0.058410726487636566, + "kl": 0.023712158203125, + "learning_rate": 9.318103143254327e-06, + "loss": -0.0015, + "step": 1011 + }, + { + "clip_ratio": 0.005602831719443202, + "completion_length": 631.0714645385742, + "epoch": 0.037759432862273214, + "grad_norm": 0.059260446578264236, + "kl": 0.023468017578125, + "learning_rate": 9.316559648933092e-06, + "loss": -0.0022, + "num_tokens": 24274790.0, + "reward": 0.46666669473052025, + "reward_std": 0.1689186617732048, + "rewards/code_reward": 0.3166666813194752, + "rewards/format_reward": 1.5, + "step": 1012 + }, + { + "clip_ratio": 0.005034513829741627, + "epoch": 0.03779674455482487, + "grad_norm": 0.059515174478292465, + "kl": 0.02349853515625, + "learning_rate": 9.315014553254323e-06, + "loss": -0.0025, + "step": 1013 + }, + { + "clip_ratio": 0.005002433026675135, + "epoch": 0.03783405624737652, + "grad_norm": 0.05352368205785751, + "kl": 0.02386474609375, + "learning_rate": 9.313467856866313e-06, + "loss": -0.0027, + "step": 1014 + }, + { + "clip_ratio": 0.001956228748895228, + "completion_length": 567.5178756713867, + "epoch": 0.03787136793992817, + "grad_norm": 0.02363317459821701, + "kl": 0.037567138671875, + "learning_rate": 9.31191956041803e-06, + "loss": 0.0519, + "num_tokens": 24323791.0, + "reward": 0.7982954792678356, + "reward_std": 0.010022293776273727, + "rewards/code_reward": 0.6509740203619003, + "rewards/format_reward": 1.4732142984867096, + "step": 1015 + }, + { + "clip_ratio": 0.0016967188857961446, + "epoch": 0.037908679632479826, + "grad_norm": 0.02328268252313137, + "kl": 0.037078857421875, + "learning_rate": 9.310369664559106e-06, + "loss": 0.0518, + "step": 1016 + }, + { + "clip_ratio": 0.0023493466433137655, + "epoch": 0.03794599132503148, + "grad_norm": 0.025036374107003212, + "kl": 0.039215087890625, + "learning_rate": 9.30881816993985e-06, + "loss": 0.0519, + "step": 1017 + }, + { + "clip_ratio": 0.003911201667506248, + "completion_length": 795.4107360839844, + "epoch": 0.03798330301758313, + "grad_norm": 0.06765462458133698, + "kl": 0.0302886962890625, + "learning_rate": 9.307265077211243e-06, + "loss": 0.0109, + "num_tokens": 24401920.0, + "reward": 0.33665483072400093, + "reward_std": 0.20331135531887412, + "rewards/code_reward": 0.1911191043909639, + "rewards/format_reward": 1.455357164144516, + "step": 1018 + }, + { + "clip_ratio": 0.003981074434705079, + "epoch": 0.038020614710134785, + "grad_norm": 0.06631176918745041, + "kl": 0.027862548828125, + "learning_rate": 9.30571038702493e-06, + "loss": 0.011, + "step": 1019 + }, + { + "clip_ratio": 0.00411680486286059, + "epoch": 0.03805792640268644, + "grad_norm": 0.07072292268276215, + "kl": 0.02825927734375, + "learning_rate": 9.304154100033233e-06, + "loss": 0.0107, + "step": 1020 + }, + { + "clip_ratio": 0.0029157098615542054, + "completion_length": 594.6964721679688, + "epoch": 0.0380952380952381, + "grad_norm": 0.05484096705913544, + "kl": 0.023468017578125, + "learning_rate": 9.30259621688914e-06, + "loss": 0.0027, + "num_tokens": 24465673.0, + "reward": 0.5649350881576538, + "reward_std": 0.15646552294492722, + "rewards/code_reward": 0.4149350728839636, + "rewards/format_reward": 1.5, + "step": 1021 + }, + { + "clip_ratio": 0.003647981327958405, + "epoch": 0.03813254978778975, + "grad_norm": 0.05364915356040001, + "kl": 0.02325439453125, + "learning_rate": 9.301036738246308e-06, + "loss": 0.0027, + "step": 1022 + }, + { + "clip_ratio": 0.0036331398878246546, + "epoch": 0.038169861480341405, + "grad_norm": 0.05084182694554329, + "kl": 0.023040771484375, + "learning_rate": 9.29947566475907e-06, + "loss": 0.0025, + "step": 1023 + }, + { + "clip_ratio": 0.004250404832419008, + "completion_length": 530.0178756713867, + "epoch": 0.03820717317289306, + "grad_norm": 0.11061746627092361, + "kl": 0.035888671875, + "learning_rate": 9.297912997082417e-06, + "loss": 0.0282, + "num_tokens": 24528170.0, + "reward": 0.673251137137413, + "reward_std": 0.20507735572755337, + "rewards/code_reward": 0.5250368192791939, + "rewards/format_reward": 1.4821428656578064, + "step": 1024 + }, + { + "clip_ratio": 0.004368814057670534, + "epoch": 0.03824448486544471, + "grad_norm": 0.11700218170881271, + "kl": 0.036895751953125, + "learning_rate": 9.29634873587202e-06, + "loss": 0.0278, + "step": 1025 + }, + { + "clip_ratio": 0.0041979250381700695, + "epoch": 0.038281796557996364, + "grad_norm": 0.09059210866689682, + "kl": 0.035888671875, + "learning_rate": 9.294782881784214e-06, + "loss": 0.027, + "step": 1026 + }, + { + "clip_ratio": 0.003129621152766049, + "completion_length": 779.4286193847656, + "epoch": 0.03831910825054802, + "grad_norm": 0.07425401359796524, + "kl": 0.025115966796875, + "learning_rate": 9.293215435476002e-06, + "loss": -0.0063, + "num_tokens": 24598544.0, + "reward": 0.5152563750743866, + "reward_std": 0.10888168960809708, + "rewards/code_reward": 0.36722064577043056, + "rewards/format_reward": 1.480357140302658, + "step": 1027 + }, + { + "clip_ratio": 0.003633946005720645, + "epoch": 0.03835641994309967, + "grad_norm": 0.05226435139775276, + "kl": 0.025299072265625, + "learning_rate": 9.291646397605051e-06, + "loss": -0.0064, + "step": 1028 + }, + { + "clip_ratio": 0.0032459802459925413, + "epoch": 0.038393731635651324, + "grad_norm": 0.05063031241297722, + "kl": 0.026123046875, + "learning_rate": 9.290075768829706e-06, + "loss": -0.0065, + "step": 1029 + }, + { + "clip_ratio": 0.004410586901940405, + "completion_length": 582.928596496582, + "epoch": 0.03843104332820298, + "grad_norm": 0.07562192529439926, + "kl": 0.029388427734375, + "learning_rate": 9.288503549808973e-06, + "loss": 0.0054, + "num_tokens": 24657396.0, + "reward": 0.6262649111449718, + "reward_std": 0.06903212843462825, + "rewards/code_reward": 0.4762648716568947, + "rewards/format_reward": 1.5, + "step": 1030 + }, + { + "clip_ratio": 0.004203634860459715, + "epoch": 0.03846835502075463, + "grad_norm": 0.07460197061300278, + "kl": 0.029144287109375, + "learning_rate": 9.28692974120252e-06, + "loss": 0.0053, + "step": 1031 + }, + { + "clip_ratio": 0.0048498569522053, + "epoch": 0.03850566671330628, + "grad_norm": 0.07276192307472229, + "kl": 0.029327392578125, + "learning_rate": 9.285354343670694e-06, + "loss": 0.005, + "step": 1032 + }, + { + "clip_ratio": 0.004682947299443185, + "completion_length": 768.2678833007812, + "epoch": 0.038542978405857936, + "grad_norm": 0.07957222312688828, + "kl": 0.02716064453125, + "learning_rate": 9.283777357874501e-06, + "loss": 0.0012, + "num_tokens": 24736287.0, + "reward": 0.28472936898469925, + "reward_std": 0.19739464996382594, + "rewards/code_reward": 0.13472936861217022, + "rewards/format_reward": 1.5, + "step": 1033 + }, + { + "clip_ratio": 0.0045089153572916985, + "epoch": 0.03858029009840959, + "grad_norm": 0.07709285616874695, + "kl": 0.027130126953125, + "learning_rate": 9.282198784475615e-06, + "loss": 0.001, + "step": 1034 + }, + { + "clip_ratio": 0.00443292100680992, + "epoch": 0.03861760179096124, + "grad_norm": 0.06912980228662491, + "kl": 0.027069091796875, + "learning_rate": 9.280618624136374e-06, + "loss": 0.0006, + "step": 1035 + }, + { + "clip_ratio": 0.002334974007681012, + "completion_length": 809.1964797973633, + "epoch": 0.038654913483512895, + "grad_norm": 0.04713848978281021, + "kl": 0.032958984375, + "learning_rate": 9.279036877519786e-06, + "loss": 0.0456, + "num_tokens": 24816322.0, + "reward": 0.4175889641046524, + "reward_std": 0.0774157214909792, + "rewards/code_reward": 0.27294610638637096, + "rewards/format_reward": 1.4464285969734192, + "step": 1036 + }, + { + "clip_ratio": 0.0022623068653047085, + "epoch": 0.03869222517606455, + "grad_norm": 0.044516898691654205, + "kl": 0.033416748046875, + "learning_rate": 9.277453545289524e-06, + "loss": 0.0453, + "step": 1037 + }, + { + "clip_ratio": 0.002157221606466919, + "epoch": 0.0387295368686162, + "grad_norm": 0.04440119490027428, + "kl": 0.033233642578125, + "learning_rate": 9.27586862810992e-06, + "loss": 0.0453, + "step": 1038 + }, + { + "clip_ratio": 0.003976551903178915, + "completion_length": 733.7678985595703, + "epoch": 0.038766848561167855, + "grad_norm": 0.0877891331911087, + "kl": 0.029296875, + "learning_rate": 9.274282126645979e-06, + "loss": 0.0542, + "num_tokens": 24880233.0, + "reward": 0.7450379990041256, + "reward_std": 0.1840728772804141, + "rewards/code_reward": 0.5977164953947067, + "rewards/format_reward": 1.4732142984867096, + "step": 1039 + }, + { + "clip_ratio": 0.0031977380567695946, + "epoch": 0.03880416025371951, + "grad_norm": 0.09290570765733719, + "kl": 0.028778076171875, + "learning_rate": 9.272694041563368e-06, + "loss": 0.0538, + "step": 1040 + }, + { + "clip_ratio": 0.003713928919751197, + "epoch": 0.03884147194627116, + "grad_norm": 0.07826217263936996, + "kl": 0.029205322265625, + "learning_rate": 9.271104373528415e-06, + "loss": 0.0536, + "step": 1041 + }, + { + "clip_ratio": 0.004270482226274908, + "completion_length": 565.5893020629883, + "epoch": 0.038878783638822814, + "grad_norm": 0.08645724505186081, + "kl": 0.029754638671875, + "learning_rate": 9.26951312320812e-06, + "loss": -0.0131, + "num_tokens": 24935762.0, + "reward": 0.844235822558403, + "reward_std": 0.29770882427692413, + "rewards/code_reward": 0.6942358165979385, + "rewards/format_reward": 1.5, + "step": 1042 + }, + { + "clip_ratio": 0.003773388860281557, + "epoch": 0.03891609533137447, + "grad_norm": 0.08505848795175552, + "kl": 0.0301513671875, + "learning_rate": 9.267920291270139e-06, + "loss": -0.0133, + "step": 1043 + }, + { + "clip_ratio": 0.003444499569013715, + "epoch": 0.03895340702392612, + "grad_norm": 0.08919163793325424, + "kl": 0.0291748046875, + "learning_rate": 9.266325878382793e-06, + "loss": -0.0138, + "step": 1044 + }, + { + "clip_ratio": 0.0032756210421212018, + "completion_length": 781.857177734375, + "epoch": 0.038990718716477774, + "grad_norm": 0.07246002554893494, + "kl": 0.030303955078125, + "learning_rate": 9.264729885215072e-06, + "loss": 0.0381, + "num_tokens": 25013034.0, + "reward": 0.6710845977067947, + "reward_std": 0.27406319323927164, + "rewards/code_reward": 0.5257274303585291, + "rewards/format_reward": 1.4535714089870453, + "step": 1045 + }, + { + "clip_ratio": 0.003297218296211213, + "epoch": 0.03902803040902943, + "grad_norm": 0.07249487936496735, + "kl": 0.0313720703125, + "learning_rate": 9.263132312436624e-06, + "loss": 0.0381, + "step": 1046 + }, + { + "clip_ratio": 0.0033640859182924032, + "epoch": 0.03906534210158108, + "grad_norm": 0.07022958993911743, + "kl": 0.029083251953125, + "learning_rate": 9.261533160717759e-06, + "loss": 0.0377, + "step": 1047 + }, + { + "clip_ratio": 0.004738840216305107, + "completion_length": 611.0714569091797, + "epoch": 0.03910265379413273, + "grad_norm": 0.10802329331636429, + "kl": 0.04119873046875, + "learning_rate": 9.25993243072945e-06, + "loss": 0.0194, + "num_tokens": 25078992.0, + "reward": 0.7616686075925827, + "reward_std": 0.2608688175678253, + "rewards/code_reward": 0.6136328782886267, + "rewards/format_reward": 1.480357140302658, + "step": 1048 + }, + { + "clip_ratio": 0.004689826688263565, + "epoch": 0.03913996548668439, + "grad_norm": 0.10573717206716537, + "kl": 0.041656494140625, + "learning_rate": 9.25833012314334e-06, + "loss": 0.019, + "step": 1049 + }, + { + "clip_ratio": 0.003989055287092924, + "epoch": 0.039177277179236046, + "grad_norm": 0.0930517166852951, + "kl": 0.042144775390625, + "learning_rate": 9.256726238631721e-06, + "loss": 0.0181, + "step": 1050 + }, + { + "clip_ratio": 0.003077496076002717, + "completion_length": 618.9286041259766, + "epoch": 0.0392145888717877, + "grad_norm": 0.051519591361284256, + "kl": 0.035308837890625, + "learning_rate": 9.255120777867557e-06, + "loss": 0.0008, + "num_tokens": 25146860.0, + "reward": 0.7050595246255398, + "reward_std": 0.0686635822057724, + "rewards/code_reward": 0.5550595223903656, + "rewards/format_reward": 1.5, + "step": 1051 + }, + { + "clip_ratio": 0.0032552466727793217, + "epoch": 0.03925190056433935, + "grad_norm": 0.05093884840607643, + "kl": 0.0355224609375, + "learning_rate": 9.253513741524468e-06, + "loss": 0.0006, + "step": 1052 + }, + { + "clip_ratio": 0.003017779963556677, + "epoch": 0.039289212256891005, + "grad_norm": 0.04571753367781639, + "kl": 0.035186767578125, + "learning_rate": 9.251905130276734e-06, + "loss": 0.0004, + "step": 1053 + }, + { + "clip_ratio": 0.00298432435374707, + "completion_length": 804.4286193847656, + "epoch": 0.03932652394944266, + "grad_norm": 0.06088537722826004, + "kl": 0.0381317138671875, + "learning_rate": 9.250294944799305e-06, + "loss": 0.0618, + "num_tokens": 25220454.0, + "reward": 0.6453106291592121, + "reward_std": 0.08722019009292126, + "rewards/code_reward": 0.4979892075061798, + "rewards/format_reward": 1.4732142984867096, + "step": 1054 + }, + { + "clip_ratio": 0.0033902590512298048, + "epoch": 0.03936383564199431, + "grad_norm": 0.05120262876152992, + "kl": 0.0348358154296875, + "learning_rate": 9.248683185767778e-06, + "loss": 0.0617, + "step": 1055 + }, + { + "clip_ratio": 0.00326330098323524, + "epoch": 0.039401147334545965, + "grad_norm": 0.04517804831266403, + "kl": 0.0309295654296875, + "learning_rate": 9.247069853858422e-06, + "loss": 0.0614, + "step": 1056 + }, + { + "clip_ratio": 0.003833273018244654, + "completion_length": 735.2857360839844, + "epoch": 0.03943845902709762, + "grad_norm": 0.08522135764360428, + "kl": 0.039306640625, + "learning_rate": 9.24545494974816e-06, + "loss": 0.0039, + "num_tokens": 25297274.0, + "reward": 0.7193863913416862, + "reward_std": 0.19551336951553822, + "rewards/code_reward": 0.5713506415486336, + "rewards/format_reward": 1.480357140302658, + "step": 1057 + }, + { + "clip_ratio": 0.0038634692900814116, + "epoch": 0.03947577071964927, + "grad_norm": 0.08082570880651474, + "kl": 0.038299560546875, + "learning_rate": 9.243838474114573e-06, + "loss": 0.0036, + "step": 1058 + }, + { + "clip_ratio": 0.003645479737315327, + "epoch": 0.039513082412200924, + "grad_norm": 0.07572901993989944, + "kl": 0.0384521484375, + "learning_rate": 9.242220427635907e-06, + "loss": 0.0031, + "step": 1059 + }, + { + "clip_ratio": 0.0047526207054033875, + "completion_length": 839.3928985595703, + "epoch": 0.03955039410475258, + "grad_norm": 0.07848247140645981, + "kl": 0.033172607421875, + "learning_rate": 9.240600810991064e-06, + "loss": 0.0059, + "num_tokens": 25385104.0, + "reward": 0.5948491133749485, + "reward_std": 0.23099772073328495, + "rewards/code_reward": 0.44484908133745193, + "rewards/format_reward": 1.5, + "step": 1060 + }, + { + "clip_ratio": 0.0044238289701752365, + "epoch": 0.03958770579730423, + "grad_norm": 0.07808912545442581, + "kl": 0.0322265625, + "learning_rate": 9.238979624859604e-06, + "loss": 0.0057, + "step": 1061 + }, + { + "clip_ratio": 0.004105662228539586, + "epoch": 0.039625017489855884, + "grad_norm": 0.07030772417783737, + "kl": 0.03387451171875, + "learning_rate": 9.237356869921748e-06, + "loss": 0.0054, + "step": 1062 + }, + { + "clip_ratio": 0.0031104550580494106, + "completion_length": 702.0000228881836, + "epoch": 0.03966232918240754, + "grad_norm": 0.060391832143068314, + "kl": 0.02716064453125, + "learning_rate": 9.235732546858372e-06, + "loss": 0.0313, + "num_tokens": 25454698.0, + "reward": 0.4809948392212391, + "reward_std": 0.16178925707936287, + "rewards/code_reward": 0.33367341104894876, + "rewards/format_reward": 1.4732142984867096, + "step": 1063 + }, + { + "clip_ratio": 0.0032078386284410954, + "epoch": 0.03969964087495919, + "grad_norm": 0.05824555456638336, + "kl": 0.026702880859375, + "learning_rate": 9.234106656351013e-06, + "loss": 0.0311, + "step": 1064 + }, + { + "clip_ratio": 0.00348925415892154, + "epoch": 0.03973695256751084, + "grad_norm": 0.1791776567697525, + "kl": 0.026580810546875, + "learning_rate": 9.232479199081863e-06, + "loss": 0.031, + "step": 1065 + }, + { + "clip_ratio": 0.004294991143979132, + "completion_length": 662.5357360839844, + "epoch": 0.039774264260062496, + "grad_norm": 0.09336454421281815, + "kl": 0.03094482421875, + "learning_rate": 9.230850175733775e-06, + "loss": 0.0154, + "num_tokens": 25528970.0, + "reward": 0.7730890065431595, + "reward_std": 0.3607179783284664, + "rewards/code_reward": 0.6230890080332756, + "rewards/format_reward": 1.5, + "step": 1066 + }, + { + "clip_ratio": 0.00363527808804065, + "epoch": 0.03981157595261415, + "grad_norm": 0.09156283736228943, + "kl": 0.030303955078125, + "learning_rate": 9.229219586990256e-06, + "loss": 0.0149, + "step": 1067 + }, + { + "clip_ratio": 0.0036149139632470906, + "epoch": 0.0398488876451658, + "grad_norm": 0.08637526631355286, + "kl": 0.0301513671875, + "learning_rate": 9.227587433535471e-06, + "loss": 0.0145, + "step": 1068 + }, + { + "clip_ratio": 0.0017707915394566953, + "completion_length": 816.9643096923828, + "epoch": 0.039886199337717455, + "grad_norm": 0.05322569981217384, + "kl": 0.035369873046875, + "learning_rate": 9.225953716054243e-06, + "loss": 0.0465, + "num_tokens": 25605716.0, + "reward": 0.7090911418199539, + "reward_std": 0.05188449751585722, + "rewards/code_reward": 0.5617697238922119, + "rewards/format_reward": 1.4732142984867096, + "step": 1069 + }, + { + "clip_ratio": 0.0017323395004495978, + "epoch": 0.03992351103026911, + "grad_norm": 0.05091780424118042, + "kl": 0.03472900390625, + "learning_rate": 9.224318435232044e-06, + "loss": 0.0464, + "step": 1070 + }, + { + "clip_ratio": 0.0018878853297792375, + "epoch": 0.03996082272282076, + "grad_norm": 0.04925460368394852, + "kl": 0.035430908203125, + "learning_rate": 9.222681591755012e-06, + "loss": 0.0462, + "step": 1071 + }, + { + "clip_ratio": 0.0037768532056361437, + "completion_length": 613.2678909301758, + "epoch": 0.039998134415372415, + "grad_norm": 0.05713547021150589, + "kl": 0.024139404296875, + "learning_rate": 9.221043186309936e-06, + "loss": 0.0061, + "num_tokens": 25668633.0, + "reward": 0.2750000134110451, + "reward_std": 0.17378021404147148, + "rewards/code_reward": 0.12500000186264515, + "rewards/format_reward": 1.5, + "step": 1072 + }, + { + "clip_ratio": 0.004245938209351152, + "epoch": 0.04003544610792407, + "grad_norm": 0.057543378323316574, + "kl": 0.023590087890625, + "learning_rate": 9.219403219584257e-06, + "loss": 0.0061, + "step": 1073 + }, + { + "clip_ratio": 0.00453721807571128, + "epoch": 0.04007275780047572, + "grad_norm": 0.05214346572756767, + "kl": 0.0238037109375, + "learning_rate": 9.217761692266079e-06, + "loss": 0.006, + "step": 1074 + }, + { + "clip_ratio": 0.00379862857516855, + "completion_length": 677.3571929931641, + "epoch": 0.040110069493027374, + "grad_norm": 0.08740504086017609, + "kl": 0.033111572265625, + "learning_rate": 9.216118605044154e-06, + "loss": -0.0067, + "num_tokens": 25741817.0, + "reward": 0.5922888293862343, + "reward_std": 0.2274510283023119, + "rewards/code_reward": 0.44425310380756855, + "rewards/format_reward": 1.480357140302658, + "step": 1075 + }, + { + "clip_ratio": 0.0034135375753976405, + "epoch": 0.040147381185579034, + "grad_norm": 0.08159995079040527, + "kl": 0.0323486328125, + "learning_rate": 9.214473958607892e-06, + "loss": -0.0069, + "step": 1076 + }, + { + "clip_ratio": 0.002985846484079957, + "epoch": 0.04018469287813069, + "grad_norm": 0.07581112533807755, + "kl": 0.031494140625, + "learning_rate": 9.212827753647355e-06, + "loss": -0.0074, + "step": 1077 + }, + { + "clip_ratio": 0.0038503045798279345, + "completion_length": 922.357177734375, + "epoch": 0.04022200457068234, + "grad_norm": 0.05346745252609253, + "kl": 0.024322509765625, + "learning_rate": 9.211179990853262e-06, + "loss": 0.0596, + "num_tokens": 25828973.0, + "reward": 0.2718985043466091, + "reward_std": 0.10740025760605931, + "rewards/code_reward": 0.1254699220880866, + "rewards/format_reward": 1.4642857313156128, + "step": 1078 + }, + { + "clip_ratio": 0.0037423803005367517, + "epoch": 0.040259316263233993, + "grad_norm": 0.052079565823078156, + "kl": 0.024627685546875, + "learning_rate": 9.20953067091698e-06, + "loss": 0.0595, + "step": 1079 + }, + { + "clip_ratio": 0.004014936450403184, + "epoch": 0.04029662795578565, + "grad_norm": 0.04948502406477928, + "kl": 0.02386474609375, + "learning_rate": 9.207879794530536e-06, + "loss": 0.0593, + "step": 1080 + }, + { + "clip_ratio": 0.004000178014393896, + "completion_length": 899.5714721679688, + "epoch": 0.0403339396483373, + "grad_norm": 0.0735321193933487, + "kl": 0.03277587890625, + "learning_rate": 9.206227362386608e-06, + "loss": 0.0756, + "num_tokens": 25915063.0, + "reward": 0.7192117646336555, + "reward_std": 0.3084814166650176, + "rewards/code_reward": 0.5774260275065899, + "rewards/format_reward": 1.4178571701049805, + "step": 1081 + }, + { + "clip_ratio": 0.003932483203243464, + "epoch": 0.04037125134088895, + "grad_norm": 0.07042364031076431, + "kl": 0.031982421875, + "learning_rate": 9.204573375178522e-06, + "loss": 0.0754, + "step": 1082 + }, + { + "clip_ratio": 0.0037076197913847864, + "epoch": 0.040408563033440606, + "grad_norm": 0.06869091093540192, + "kl": 0.032012939453125, + "learning_rate": 9.202917833600261e-06, + "loss": 0.0749, + "step": 1083 + }, + { + "clip_ratio": 0.00244231236865744, + "completion_length": 623.0892944335938, + "epoch": 0.04044587472599226, + "grad_norm": 0.08359270542860031, + "kl": 0.029510498046875, + "learning_rate": 9.201260738346464e-06, + "loss": -0.0111, + "num_tokens": 25979678.0, + "reward": 0.9942583441734314, + "reward_std": 0.22786352783441544, + "rewards/code_reward": 0.8487226068973541, + "rewards/format_reward": 1.4553571343421936, + "step": 1084 + }, + { + "clip_ratio": 0.002509896003175527, + "epoch": 0.04048318641854391, + "grad_norm": 0.08499445766210556, + "kl": 0.029327392578125, + "learning_rate": 9.199602090112411e-06, + "loss": -0.0113, + "step": 1085 + }, + { + "clip_ratio": 0.002389529370702803, + "epoch": 0.040520498111095565, + "grad_norm": 0.07275501638650894, + "kl": 0.029693603515625, + "learning_rate": 9.197941889594047e-06, + "loss": -0.0119, + "step": 1086 + }, + { + "clip_ratio": 0.0034247389994561672, + "completion_length": 648.0535888671875, + "epoch": 0.04055780980364722, + "grad_norm": 0.03601553291082382, + "kl": 0.028594970703125, + "learning_rate": 9.196280137487954e-06, + "loss": 0.0024, + "num_tokens": 26042205.0, + "reward": 0.4792986661195755, + "reward_std": 0.1142958477139473, + "rewards/code_reward": 0.3312629386782646, + "rewards/format_reward": 1.480357140302658, + "step": 1087 + }, + { + "clip_ratio": 0.0035551211331039667, + "epoch": 0.04059512149619887, + "grad_norm": 0.037020232528448105, + "kl": 0.02862548828125, + "learning_rate": 9.194616834491378e-06, + "loss": 0.0024, + "step": 1088 + }, + { + "clip_ratio": 0.0036361218662932515, + "epoch": 0.040632433188750525, + "grad_norm": 0.03576742485165596, + "kl": 0.029083251953125, + "learning_rate": 9.192951981302207e-06, + "loss": 0.0023, + "step": 1089 + }, + { + "clip_ratio": 0.002482805517502129, + "completion_length": 570.4464416503906, + "epoch": 0.04066974488130218, + "grad_norm": 0.10781107097864151, + "kl": 0.02923583984375, + "learning_rate": 9.191285578618984e-06, + "loss": 0.0089, + "num_tokens": 26098706.0, + "reward": 1.110610693693161, + "reward_std": 0.08376232627779245, + "rewards/code_reward": 0.9625749439001083, + "rewards/format_reward": 1.480357140302658, + "step": 1090 + }, + { + "clip_ratio": 0.0025499824550934136, + "epoch": 0.04070705657385383, + "grad_norm": 0.12584204971790314, + "kl": 0.029296875, + "learning_rate": 9.1896176271409e-06, + "loss": 0.0087, + "step": 1091 + }, + { + "clip_ratio": 0.0025044531794264913, + "epoch": 0.040744368266405484, + "grad_norm": 0.07017864286899567, + "kl": 0.029266357421875, + "learning_rate": 9.187948127567795e-06, + "loss": 0.0081, + "step": 1092 + }, + { + "clip_ratio": 0.003947991062887013, + "completion_length": 591.2857284545898, + "epoch": 0.04078167995895714, + "grad_norm": 0.05670541897416115, + "kl": 0.025360107421875, + "learning_rate": 9.186277080600163e-06, + "loss": 0.0025, + "num_tokens": 26162884.0, + "reward": 0.49687742441892624, + "reward_std": 0.19366630166769028, + "rewards/code_reward": 0.34884166717529297, + "rewards/format_reward": 1.480357140302658, + "step": 1093 + }, + { + "clip_ratio": 0.00334525149082765, + "epoch": 0.04081899165150879, + "grad_norm": 0.05595582351088524, + "kl": 0.02386474609375, + "learning_rate": 9.184604486939142e-06, + "loss": 0.0022, + "step": 1094 + }, + { + "clip_ratio": 0.003618590591941029, + "epoch": 0.04085630334406044, + "grad_norm": 0.05318014696240425, + "kl": 0.02490234375, + "learning_rate": 9.182930347286522e-06, + "loss": 0.002, + "step": 1095 + }, + { + "clip_ratio": 0.0029320682515390217, + "completion_length": 742.107177734375, + "epoch": 0.040893615036612097, + "grad_norm": 0.04914858937263489, + "kl": 0.020721435546875, + "learning_rate": 9.18125466234474e-06, + "loss": 0.0053, + "num_tokens": 26230622.0, + "reward": 0.6321428641676903, + "reward_std": 0.06681530922651291, + "rewards/code_reward": 0.4821428507566452, + "rewards/format_reward": 1.5, + "step": 1096 + }, + { + "clip_ratio": 0.0027543625910766423, + "epoch": 0.04093092672916375, + "grad_norm": 0.04270845279097557, + "kl": 0.020782470703125, + "learning_rate": 9.179577432816885e-06, + "loss": 0.0053, + "step": 1097 + }, + { + "clip_ratio": 0.0028447714867070317, + "epoch": 0.0409682384217154, + "grad_norm": 0.03756280615925789, + "kl": 0.0203094482421875, + "learning_rate": 9.17789865940669e-06, + "loss": 0.0051, + "step": 1098 + }, + { + "clip_ratio": 0.002749040664639324, + "completion_length": 749.9286117553711, + "epoch": 0.041005550114267056, + "grad_norm": 0.07559466361999512, + "kl": 0.0355224609375, + "learning_rate": 9.176218342818533e-06, + "loss": 0.0566, + "num_tokens": 26313960.0, + "reward": 0.8698588237166405, + "reward_std": 0.10059643909335136, + "rewards/code_reward": 0.7245016554370522, + "rewards/format_reward": 1.4535714387893677, + "step": 1099 + }, + { + "clip_ratio": 0.0028327758773230016, + "epoch": 0.04104286180681871, + "grad_norm": 0.06531720608472824, + "kl": 0.0347900390625, + "learning_rate": 9.174536483757449e-06, + "loss": 0.0564, + "step": 1100 + }, + { + "clip_ratio": 0.002679202414583415, + "epoch": 0.04108017349937036, + "grad_norm": 0.05954798310995102, + "kl": 0.034820556640625, + "learning_rate": 9.172853082929114e-06, + "loss": 0.056, + "step": 1101 + }, + { + "clip_ratio": 0.003762493608519435, + "completion_length": 835.8750457763672, + "epoch": 0.041117485191922015, + "grad_norm": 0.06659568101167679, + "kl": 0.0227203369140625, + "learning_rate": 9.171168141039851e-06, + "loss": 0.0037, + "num_tokens": 26397869.0, + "reward": 0.4777338020503521, + "reward_std": 0.19906340911984444, + "rewards/code_reward": 0.32951950281858444, + "rewards/format_reward": 1.4821428656578064, + "step": 1102 + }, + { + "clip_ratio": 0.0035322435142006725, + "epoch": 0.041154796884473675, + "grad_norm": 0.06274719536304474, + "kl": 0.023468017578125, + "learning_rate": 9.169481658796628e-06, + "loss": 0.0036, + "step": 1103 + }, + { + "clip_ratio": 0.00331356143578887, + "epoch": 0.04119210857702533, + "grad_norm": 0.060053110122680664, + "kl": 0.023406982421875, + "learning_rate": 9.167793636907066e-06, + "loss": 0.0031, + "step": 1104 + }, + { + "clip_ratio": 0.004743140423670411, + "completion_length": 665.607177734375, + "epoch": 0.04122942026957698, + "grad_norm": 0.10433481633663177, + "kl": 0.025054931640625, + "learning_rate": 9.166104076079423e-06, + "loss": -0.0075, + "num_tokens": 26466349.0, + "reward": 0.3103807047009468, + "reward_std": 0.18890401162207127, + "rewards/code_reward": 0.16038068913621828, + "rewards/format_reward": 1.5, + "step": 1105 + }, + { + "clip_ratio": 0.0041993262129835784, + "epoch": 0.041266731962128635, + "grad_norm": 0.10003095120191574, + "kl": 0.025146484375, + "learning_rate": 9.164412977022612e-06, + "loss": -0.0079, + "step": 1106 + }, + { + "clip_ratio": 0.004663260537199676, + "epoch": 0.04130404365468029, + "grad_norm": 0.08416758477687836, + "kl": 0.0252685546875, + "learning_rate": 9.162720340446183e-06, + "loss": -0.0086, + "step": 1107 + }, + { + "clip_ratio": 0.004929172631818801, + "completion_length": 678.1607360839844, + "epoch": 0.04134135534723194, + "grad_norm": 0.07526934146881104, + "kl": 0.025787353515625, + "learning_rate": 9.161026167060336e-06, + "loss": 0.0075, + "num_tokens": 26535210.0, + "reward": 0.17282911762595177, + "reward_std": 0.03799323830753565, + "rewards/code_reward": 0.02479338925331831, + "rewards/format_reward": 1.480357140302658, + "step": 1108 + }, + { + "clip_ratio": 0.004598392639309168, + "epoch": 0.041378667039783594, + "grad_norm": 0.06924478709697723, + "kl": 0.026611328125, + "learning_rate": 9.159330457575915e-06, + "loss": 0.0073, + "step": 1109 + }, + { + "clip_ratio": 0.004838823922909796, + "epoch": 0.04141597873233525, + "grad_norm": 0.06812824308872223, + "kl": 0.025970458984375, + "learning_rate": 9.15763321270441e-06, + "loss": 0.0069, + "step": 1110 + }, + { + "clip_ratio": 0.0032876175828278065, + "completion_length": 618.7143096923828, + "epoch": 0.0414532904248869, + "grad_norm": 0.0878426656126976, + "kl": 0.021820068359375, + "learning_rate": 9.155934433157951e-06, + "loss": 0.0027, + "num_tokens": 26603122.0, + "reward": 0.7691507413983345, + "reward_std": 0.24790808092802763, + "rewards/code_reward": 0.6191507577896118, + "rewards/format_reward": 1.5, + "step": 1111 + }, + { + "clip_ratio": 0.0033667486859485507, + "epoch": 0.04149060211743855, + "grad_norm": 0.08385408669710159, + "kl": 0.02252197265625, + "learning_rate": 9.154234119649315e-06, + "loss": 0.0024, + "step": 1112 + }, + { + "clip_ratio": 0.002939904050435871, + "epoch": 0.041527913809990206, + "grad_norm": 0.07945457845926285, + "kl": 0.022430419921875, + "learning_rate": 9.15253227289192e-06, + "loss": 0.0021, + "step": 1113 + }, + { + "clip_ratio": 0.0028349481872282922, + "completion_length": 781.5178833007812, + "epoch": 0.04156522550254186, + "grad_norm": 0.06622999906539917, + "kl": 0.022552490234375, + "learning_rate": 9.150828893599833e-06, + "loss": 0.0068, + "num_tokens": 26673025.0, + "reward": 0.6383877843618393, + "reward_std": 0.032052715541794896, + "rewards/code_reward": 0.4901735048770206, + "rewards/format_reward": 1.4821428656578064, + "step": 1114 + }, + { + "clip_ratio": 0.0025009859236888587, + "epoch": 0.04160253719509351, + "grad_norm": 0.06962397694587708, + "kl": 0.02239990234375, + "learning_rate": 9.149123982487757e-06, + "loss": 0.0066, + "step": 1115 + }, + { + "clip_ratio": 0.0025727461907081306, + "epoch": 0.041639848887645166, + "grad_norm": 0.06134776771068573, + "kl": 0.02325439453125, + "learning_rate": 9.147417540271044e-06, + "loss": 0.0062, + "step": 1116 + }, + { + "clip_ratio": 0.003946287848521024, + "completion_length": 662.9821701049805, + "epoch": 0.04167716058019682, + "grad_norm": 0.06558782607316971, + "kl": 0.021942138671875, + "learning_rate": 9.145709567665683e-06, + "loss": 0.0421, + "num_tokens": 26741078.0, + "reward": 0.645892858505249, + "reward_std": 0.11652041785418987, + "rewards/code_reward": 0.4985714219510555, + "rewards/format_reward": 1.4732142984867096, + "step": 1117 + }, + { + "clip_ratio": 0.003913962282240391, + "epoch": 0.04171447227274847, + "grad_norm": 0.0719294622540474, + "kl": 0.0220184326171875, + "learning_rate": 9.144000065388308e-06, + "loss": 0.042, + "step": 1118 + }, + { + "clip_ratio": 0.003746178117580712, + "epoch": 0.041751783965300125, + "grad_norm": 0.059078916907310486, + "kl": 0.0222625732421875, + "learning_rate": 9.142289034156197e-06, + "loss": 0.0418, + "step": 1119 + }, + { + "clip_ratio": 0.002446836791932583, + "completion_length": 714.4286117553711, + "epoch": 0.04178909565785178, + "grad_norm": 0.07083624601364136, + "kl": 0.02362060546875, + "learning_rate": 9.140576474687263e-06, + "loss": -0.003, + "num_tokens": 26816590.0, + "reward": 1.022603616118431, + "reward_std": 0.23245255649089813, + "rewards/code_reward": 0.8726036250591278, + "rewards/format_reward": 1.5, + "step": 1120 + }, + { + "clip_ratio": 0.002149312582332641, + "epoch": 0.04182640735040343, + "grad_norm": 0.08532369881868362, + "kl": 0.0232696533203125, + "learning_rate": 9.138862387700068e-06, + "loss": -0.0031, + "step": 1121 + }, + { + "clip_ratio": 0.0027137695578858256, + "epoch": 0.041863719042955085, + "grad_norm": 0.06213632598519325, + "kl": 0.0232696533203125, + "learning_rate": 9.137146773913809e-06, + "loss": -0.0034, + "step": 1122 + }, + { + "clip_ratio": 0.0037429596995934844, + "completion_length": 665.8393249511719, + "epoch": 0.04190103073550674, + "grad_norm": 0.06457763910293579, + "kl": 0.0272216796875, + "learning_rate": 9.135429634048324e-06, + "loss": 0.0057, + "num_tokens": 26884033.0, + "reward": 0.5869206413626671, + "reward_std": 0.11247400287538767, + "rewards/code_reward": 0.43692063307389617, + "rewards/format_reward": 1.5, + "step": 1123 + }, + { + "clip_ratio": 0.0037246831343509257, + "epoch": 0.04193834242805839, + "grad_norm": 0.06899301707744598, + "kl": 0.02728271484375, + "learning_rate": 9.133710968824096e-06, + "loss": 0.0055, + "step": 1124 + }, + { + "clip_ratio": 0.003106677730102092, + "epoch": 0.041975654120610044, + "grad_norm": 0.05853154882788658, + "kl": 0.027496337890625, + "learning_rate": 9.131990778962241e-06, + "loss": 0.0052, + "step": 1125 + }, + { + "clip_ratio": 0.004592937766574323, + "completion_length": 730.3036041259766, + "epoch": 0.0420129658131617, + "grad_norm": 0.0696183517575264, + "kl": 0.03558349609375, + "learning_rate": 9.130269065184525e-06, + "loss": -0.0031, + "num_tokens": 26961848.0, + "reward": 0.5886388719081879, + "reward_std": 0.04386221687309444, + "rewards/code_reward": 0.4386388601269573, + "rewards/format_reward": 1.5, + "step": 1126 + }, + { + "clip_ratio": 0.003933870990294963, + "epoch": 0.04205027750571335, + "grad_norm": 0.07205943018198013, + "kl": 0.035614013671875, + "learning_rate": 9.128545828213343e-06, + "loss": -0.0033, + "step": 1127 + }, + { + "clip_ratio": 0.0035382502828724682, + "epoch": 0.042087589198265, + "grad_norm": 0.059240955859422684, + "kl": 0.035919189453125, + "learning_rate": 9.126821068771733e-06, + "loss": -0.0035, + "step": 1128 + }, + { + "clip_ratio": 0.00415497156791389, + "completion_length": 741.607177734375, + "epoch": 0.042124900890816656, + "grad_norm": 0.07559163123369217, + "kl": 0.039947509765625, + "learning_rate": 9.125094787583371e-06, + "loss": -0.0193, + "num_tokens": 27038616.0, + "reward": 0.6806786730885506, + "reward_std": 0.2644737996160984, + "rewards/code_reward": 0.5344286020845175, + "rewards/format_reward": 1.4625000059604645, + "step": 1129 + }, + { + "clip_ratio": 0.00428449414903298, + "epoch": 0.042162212583368316, + "grad_norm": 0.07470542192459106, + "kl": 0.040069580078125, + "learning_rate": 9.123366985372577e-06, + "loss": -0.0195, + "step": 1130 + }, + { + "clip_ratio": 0.003964928619097918, + "epoch": 0.04219952427591997, + "grad_norm": 0.06651497632265091, + "kl": 0.03875732421875, + "learning_rate": 9.121637662864304e-06, + "loss": -0.0199, + "step": 1131 + }, + { + "clip_ratio": 0.004062030464410782, + "completion_length": 786.1428833007812, + "epoch": 0.04223683596847162, + "grad_norm": 0.09124214947223663, + "kl": 0.03314208984375, + "learning_rate": 9.119906820784139e-06, + "loss": -0.0058, + "num_tokens": 27120566.0, + "reward": 0.3128802813589573, + "reward_std": 0.11365946545265615, + "rewards/code_reward": 0.16484456462785602, + "rewards/format_reward": 1.480357140302658, + "step": 1132 + }, + { + "clip_ratio": 0.0034809871576726437, + "epoch": 0.042274147661023276, + "grad_norm": 0.06797035783529282, + "kl": 0.03192138671875, + "learning_rate": 9.118174459858313e-06, + "loss": -0.0062, + "step": 1133 + }, + { + "clip_ratio": 0.0030830889591015875, + "epoch": 0.04231145935357493, + "grad_norm": 0.06633008271455765, + "kl": 0.030059814453125, + "learning_rate": 9.116440580813693e-06, + "loss": -0.0064, + "step": 1134 + }, + { + "clip_ratio": 0.0020265699713490903, + "completion_length": 807.7678985595703, + "epoch": 0.04234877104612658, + "grad_norm": 0.05473239719867706, + "kl": 0.030364990234375, + "learning_rate": 9.114705184377785e-06, + "loss": 0.042, + "num_tokens": 27204147.0, + "reward": 0.9877976179122925, + "reward_std": 0.21066520363092422, + "rewards/code_reward": 0.8404761850833893, + "rewards/format_reward": 1.4732142984867096, + "step": 1135 + }, + { + "clip_ratio": 0.0018248592386953533, + "epoch": 0.042386082738678235, + "grad_norm": 0.054262675344944, + "kl": 0.0313720703125, + "learning_rate": 9.112968271278725e-06, + "loss": 0.0419, + "step": 1136 + }, + { + "clip_ratio": 0.0019016240257769823, + "epoch": 0.04242339443122989, + "grad_norm": 0.052271611988544464, + "kl": 0.0303955078125, + "learning_rate": 9.111229842245293e-06, + "loss": 0.0418, + "step": 1137 + }, + { + "clip_ratio": 0.004657561308704317, + "completion_length": 716.7500381469727, + "epoch": 0.04246070612378154, + "grad_norm": 0.012121311388909817, + "kl": 0.027008056640625, + "learning_rate": 9.109489898006898e-06, + "loss": 0.0004, + "num_tokens": 27271809.0, + "reward": 0.15000002086162567, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.5, + "step": 1138 + }, + { + "clip_ratio": 0.005511316005140543, + "epoch": 0.042498017816333195, + "grad_norm": 0.007723301649093628, + "kl": 0.02398681640625, + "learning_rate": 9.107748439293591e-06, + "loss": 0.0004, + "step": 1139 + }, + { + "clip_ratio": 0.004971527028828859, + "epoch": 0.04253532950888485, + "grad_norm": 0.006053682882338762, + "kl": 0.022979736328125, + "learning_rate": 9.106005466836057e-06, + "loss": 0.0004, + "step": 1140 + }, + { + "clip_ratio": 0.005266623222269118, + "completion_length": 877.4821701049805, + "epoch": 0.0425726412014365, + "grad_norm": 0.06733787059783936, + "kl": 0.028839111328125, + "learning_rate": 9.104260981365609e-06, + "loss": 0.0463, + "num_tokens": 27353330.0, + "reward": 0.36275070160627365, + "reward_std": 0.18694429844617844, + "rewards/code_reward": 0.21542927087284625, + "rewards/format_reward": 1.4732142984867096, + "step": 1141 + }, + { + "clip_ratio": 0.004755206522531807, + "epoch": 0.042609952893988154, + "grad_norm": 0.0852365493774414, + "kl": 0.029022216796875, + "learning_rate": 9.102514983614208e-06, + "loss": 0.0459, + "step": 1142 + }, + { + "clip_ratio": 0.004649778013117611, + "epoch": 0.04264726458653981, + "grad_norm": 0.07479290664196014, + "kl": 0.02789306640625, + "learning_rate": 9.100767474314436e-06, + "loss": 0.0457, + "step": 1143 + }, + { + "clip_ratio": 0.004415114468429238, + "completion_length": 769.0178985595703, + "epoch": 0.04268457627909146, + "grad_norm": 0.08434013277292252, + "kl": 0.03863525390625, + "learning_rate": 9.09901845419952e-06, + "loss": 0.0028, + "num_tokens": 27434195.0, + "reward": 0.6290712170302868, + "reward_std": 0.1557808555662632, + "rewards/code_reward": 0.47907120175659657, + "rewards/format_reward": 1.5, + "step": 1144 + }, + { + "clip_ratio": 0.004183554323390126, + "epoch": 0.04272188797164311, + "grad_norm": 0.08141256868839264, + "kl": 0.03790283203125, + "learning_rate": 9.097267924003312e-06, + "loss": 0.0027, + "step": 1145 + }, + { + "clip_ratio": 0.0038348017260432243, + "epoch": 0.042759199664194766, + "grad_norm": 0.0720590129494667, + "kl": 0.0369873046875, + "learning_rate": 9.095515884460307e-06, + "loss": 0.0022, + "step": 1146 + }, + { + "clip_ratio": 0.003353129723109305, + "completion_length": 781.3571701049805, + "epoch": 0.04279651135674642, + "grad_norm": 0.07437347620725632, + "kl": 0.03240966796875, + "learning_rate": 9.093762336305625e-06, + "loss": 0.026, + "num_tokens": 27516199.0, + "reward": 0.5939462333917618, + "reward_std": 0.1705697402358055, + "rewards/code_reward": 0.4466248154640198, + "rewards/format_reward": 1.4732142984867096, + "step": 1147 + }, + { + "clip_ratio": 0.0035517719807103276, + "epoch": 0.04283382304929807, + "grad_norm": 0.07161848247051239, + "kl": 0.03271484375, + "learning_rate": 9.092007280275023e-06, + "loss": 0.026, + "step": 1148 + }, + { + "clip_ratio": 0.0029464695835486054, + "epoch": 0.042871134741849726, + "grad_norm": 0.06372758001089096, + "kl": 0.03326416015625, + "learning_rate": 9.090250717104886e-06, + "loss": 0.0254, + "step": 1149 + }, + { + "clip_ratio": 0.003607507620472461, + "completion_length": 1052.5893249511719, + "epoch": 0.04290844643440138, + "grad_norm": 0.08057577908039093, + "kl": 0.0194854736328125, + "learning_rate": 9.088492647532244e-06, + "loss": 0.0131, + "num_tokens": 27613770.0, + "reward": 0.43058422580361366, + "reward_std": 0.2018452798947692, + "rewards/code_reward": 0.2832627771422267, + "rewards/format_reward": 1.4732142984867096, + "step": 1150 + }, + { + "clip_ratio": 0.003310676838736981, + "epoch": 0.04294575812695303, + "grad_norm": 0.0794740542769432, + "kl": 0.0193634033203125, + "learning_rate": 9.086733072294744e-06, + "loss": 0.0127, + "step": 1151 + }, + { + "clip_ratio": 0.0034613371244631708, + "epoch": 0.042983069819504685, + "grad_norm": 0.07304228842258453, + "kl": 0.0196533203125, + "learning_rate": 9.084971992130673e-06, + "loss": 0.0126, + "step": 1152 + }, + { + "clip_ratio": 0.004614957666490227, + "completion_length": 851.3036193847656, + "epoch": 0.04302038151205634, + "grad_norm": 0.09627262502908707, + "kl": 0.039215087890625, + "learning_rate": 9.083209407778946e-06, + "loss": 0.0551, + "num_tokens": 27689163.0, + "reward": 0.2273639403283596, + "reward_std": 0.14361329656094313, + "rewards/code_reward": 0.08272106957156211, + "rewards/format_reward": 1.4464285969734192, + "step": 1153 + }, + { + "clip_ratio": 0.004537401779089123, + "epoch": 0.04305769320460799, + "grad_norm": 0.08239686489105225, + "kl": 0.03875732421875, + "learning_rate": 9.081445319979112e-06, + "loss": 0.0547, + "step": 1154 + }, + { + "clip_ratio": 0.0047527256538160145, + "epoch": 0.043095004897159644, + "grad_norm": 0.07294803857803345, + "kl": 0.0377197265625, + "learning_rate": 9.079679729471349e-06, + "loss": 0.0544, + "step": 1155 + }, + { + "clip_ratio": 0.0026936958311125636, + "completion_length": 567.6964569091797, + "epoch": 0.0431323165897113, + "grad_norm": 0.04417962208390236, + "kl": 0.03216552734375, + "learning_rate": 9.077912636996467e-06, + "loss": 0.0011, + "num_tokens": 27752622.0, + "reward": 0.8212903961539268, + "reward_std": 0.1249738335609436, + "rewards/code_reward": 0.6712903939187527, + "rewards/format_reward": 1.5, + "step": 1156 + }, + { + "clip_ratio": 0.002532464568503201, + "epoch": 0.04316962828226296, + "grad_norm": 0.04701172187924385, + "kl": 0.03228759765625, + "learning_rate": 9.076144043295904e-06, + "loss": 0.001, + "step": 1157 + }, + { + "clip_ratio": 0.0022863487247377634, + "epoch": 0.04320693997481461, + "grad_norm": 0.0479363389313221, + "kl": 0.031005859375, + "learning_rate": 9.074373949111727e-06, + "loss": 0.0008, + "step": 1158 + }, + { + "clip_ratio": 0.0040912426775321364, + "completion_length": 633.3571624755859, + "epoch": 0.043244251667366264, + "grad_norm": 0.052482590079307556, + "kl": 0.033843994140625, + "learning_rate": 9.07260235518664e-06, + "loss": -0.0165, + "num_tokens": 27812982.0, + "reward": 0.4533946216106415, + "reward_std": 0.13096175342798233, + "rewards/code_reward": 0.3033946231007576, + "rewards/format_reward": 1.5, + "step": 1159 + }, + { + "clip_ratio": 0.004404284409247339, + "epoch": 0.04328156335991792, + "grad_norm": 0.05251314491033554, + "kl": 0.034423828125, + "learning_rate": 9.070829262263966e-06, + "loss": -0.0167, + "step": 1160 + }, + { + "clip_ratio": 0.004931546573061496, + "epoch": 0.04331887505246957, + "grad_norm": 0.05029692128300667, + "kl": 0.034088134765625, + "learning_rate": 9.069054671087665e-06, + "loss": -0.0167, + "step": 1161 + }, + { + "clip_ratio": 0.0038081060047261417, + "completion_length": 858.9464569091797, + "epoch": 0.04335618674502122, + "grad_norm": 0.0717586800456047, + "kl": 0.033905029296875, + "learning_rate": 9.067278582402321e-06, + "loss": 0.0202, + "num_tokens": 27895159.0, + "reward": 0.5637085922062397, + "reward_std": 0.15000187675468624, + "rewards/code_reward": 0.41370859392918646, + "rewards/format_reward": 1.5, + "step": 1162 + }, + { + "clip_ratio": 0.003412397811189294, + "epoch": 0.043393498437572876, + "grad_norm": 0.07561934739351273, + "kl": 0.034820556640625, + "learning_rate": 9.065500996953149e-06, + "loss": 0.0199, + "step": 1163 + }, + { + "clip_ratio": 0.0034654869232326746, + "epoch": 0.04343081013012453, + "grad_norm": 0.06942389905452728, + "kl": 0.031829833984375, + "learning_rate": 9.063721915485988e-06, + "loss": 0.0196, + "step": 1164 + }, + { + "clip_ratio": 0.004441747325472534, + "completion_length": 830.4286041259766, + "epoch": 0.04346812182267618, + "grad_norm": 0.048069778829813004, + "kl": 0.028045654296875, + "learning_rate": 9.06194133874731e-06, + "loss": 0.0502, + "num_tokens": 27976313.0, + "reward": 0.24916020408272743, + "reward_std": 0.0844610771164298, + "rewards/code_reward": 0.10183876007795334, + "rewards/format_reward": 1.4732142984867096, + "step": 1165 + }, + { + "clip_ratio": 0.004392869654111564, + "epoch": 0.043505433515227836, + "grad_norm": 0.045663557946681976, + "kl": 0.027496337890625, + "learning_rate": 9.060159267484214e-06, + "loss": 0.0501, + "step": 1166 + }, + { + "clip_ratio": 0.004536898632068187, + "epoch": 0.04354274520777949, + "grad_norm": 0.049409449100494385, + "kl": 0.027435302734375, + "learning_rate": 9.058375702444418e-06, + "loss": 0.0501, + "step": 1167 + }, + { + "clip_ratio": 0.003419209795538336, + "completion_length": 593.6071624755859, + "epoch": 0.04358005690033114, + "grad_norm": 0.0923234075307846, + "kl": 0.02728271484375, + "learning_rate": 9.056590644376276e-06, + "loss": -0.0026, + "num_tokens": 28040019.0, + "reward": 0.6660216711461544, + "reward_std": 0.17571709211915731, + "rewards/code_reward": 0.5160216861404479, + "rewards/format_reward": 1.5, + "step": 1168 + }, + { + "clip_ratio": 0.0031509455875493586, + "epoch": 0.043617368592882795, + "grad_norm": 0.09980518370866776, + "kl": 0.02655029296875, + "learning_rate": 9.054804094028768e-06, + "loss": -0.0031, + "step": 1169 + }, + { + "clip_ratio": 0.003295930044259876, + "epoch": 0.04365468028543445, + "grad_norm": 0.08081680536270142, + "kl": 0.027130126953125, + "learning_rate": 9.053016052151492e-06, + "loss": -0.0035, + "step": 1170 + }, + { + "clip_ratio": 0.0020407591364346445, + "completion_length": 765.2143096923828, + "epoch": 0.0436919919779861, + "grad_norm": 0.041689179837703705, + "kl": 0.02154541015625, + "learning_rate": 9.051226519494682e-06, + "loss": 0.0004, + "num_tokens": 28115169.0, + "reward": 0.6802197806537151, + "reward_std": 0.03951851278543472, + "rewards/code_reward": 0.5302197802811861, + "rewards/format_reward": 1.5, + "step": 1171 + }, + { + "clip_ratio": 0.001928678306285292, + "epoch": 0.043729303670537754, + "grad_norm": 0.037141166627407074, + "kl": 0.0214691162109375, + "learning_rate": 9.049435496809189e-06, + "loss": 0.0003, + "step": 1172 + }, + { + "clip_ratio": 0.0020133137004449964, + "epoch": 0.04376661536308941, + "grad_norm": 0.040210917592048645, + "kl": 0.0210723876953125, + "learning_rate": 9.047642984846495e-06, + "loss": 0.0002, + "step": 1173 + }, + { + "clip_ratio": 0.0020972125639673322, + "completion_length": 684.0357437133789, + "epoch": 0.04380392705564106, + "grad_norm": 0.06867295503616333, + "kl": 0.0183563232421875, + "learning_rate": 9.045848984358705e-06, + "loss": -0.0215, + "num_tokens": 28173271.0, + "reward": 0.8709158599376678, + "reward_std": 0.2493196427822113, + "rewards/code_reward": 0.7209158167243004, + "rewards/format_reward": 1.5, + "step": 1174 + }, + { + "clip_ratio": 0.002343095082323998, + "epoch": 0.043841238748192714, + "grad_norm": 0.20237396657466888, + "kl": 0.0180511474609375, + "learning_rate": 9.044053496098546e-06, + "loss": -0.0215, + "step": 1175 + }, + { + "clip_ratio": 0.002169945277273655, + "epoch": 0.04387855044074437, + "grad_norm": 0.05737554281949997, + "kl": 0.0183258056640625, + "learning_rate": 9.042256520819373e-06, + "loss": -0.0219, + "step": 1176 + }, + { + "clip_ratio": 0.0028355723479762673, + "completion_length": 569.3036041259766, + "epoch": 0.04391586213329602, + "grad_norm": 0.08250407129526138, + "kl": 0.034820556640625, + "learning_rate": 9.040458059275162e-06, + "loss": 0.0035, + "num_tokens": 28228994.0, + "reward": 0.9502640068531036, + "reward_std": 0.26241549849510193, + "rewards/code_reward": 0.8002640306949615, + "rewards/format_reward": 1.5, + "step": 1177 + }, + { + "clip_ratio": 0.003006057580932975, + "epoch": 0.04395317382584767, + "grad_norm": 0.07659757882356644, + "kl": 0.03411865234375, + "learning_rate": 9.038658112220519e-06, + "loss": 0.0033, + "step": 1178 + }, + { + "clip_ratio": 0.0026733200065791607, + "epoch": 0.043990485518399326, + "grad_norm": 0.06776789575815201, + "kl": 0.034759521484375, + "learning_rate": 9.036856680410663e-06, + "loss": 0.0027, + "step": 1179 + }, + { + "clip_ratio": 0.004437008639797568, + "completion_length": 677.3750228881836, + "epoch": 0.04402779721095098, + "grad_norm": 0.0797518789768219, + "kl": 0.0323486328125, + "learning_rate": 9.035053764601441e-06, + "loss": 0.0144, + "num_tokens": 28307093.0, + "reward": 0.5480402074754238, + "reward_std": 0.16480992175638676, + "rewards/code_reward": 0.4000044818967581, + "rewards/format_reward": 1.480357140302658, + "step": 1180 + }, + { + "clip_ratio": 0.004067078873049468, + "epoch": 0.04406510890350263, + "grad_norm": 0.0717959925532341, + "kl": 0.030548095703125, + "learning_rate": 9.033249365549328e-06, + "loss": 0.0141, + "step": 1181 + }, + { + "clip_ratio": 0.004178760573267937, + "epoch": 0.044102420596054286, + "grad_norm": 0.07060413062572479, + "kl": 0.030181884765625, + "learning_rate": 9.031443484011411e-06, + "loss": 0.014, + "step": 1182 + }, + { + "clip_ratio": 0.005214205710217357, + "completion_length": 773.1964721679688, + "epoch": 0.04413973228860594, + "grad_norm": 0.10005509108304977, + "kl": 0.0428466796875, + "learning_rate": 9.029636120745408e-06, + "loss": 0.0002, + "num_tokens": 28391034.0, + "reward": 0.29788169637322426, + "reward_std": 0.28476395830512047, + "rewards/code_reward": 0.15056025609374046, + "rewards/format_reward": 1.4732142984867096, + "step": 1183 + }, + { + "clip_ratio": 0.0056727915070950985, + "epoch": 0.04417704398115759, + "grad_norm": 0.08844492584466934, + "kl": 0.0399169921875, + "learning_rate": 9.027827276509653e-06, + "loss": 0.0001, + "step": 1184 + }, + { + "clip_ratio": 0.005288945860229433, + "epoch": 0.04421435567370925, + "grad_norm": 0.08369864523410797, + "kl": 0.039276123046875, + "learning_rate": 9.026016952063107e-06, + "loss": -0.0006, + "step": 1185 + }, + { + "clip_ratio": 0.002054093172773719, + "completion_length": 600.9464569091797, + "epoch": 0.044251667366260905, + "grad_norm": 0.06113513559103012, + "kl": 0.024444580078125, + "learning_rate": 9.024205148165342e-06, + "loss": 0.0192, + "num_tokens": 28448809.0, + "reward": 1.0361947119235992, + "reward_std": 0.03706978110130876, + "rewards/code_reward": 0.886194720864296, + "rewards/format_reward": 1.5, + "step": 1186 + }, + { + "clip_ratio": 0.0018469314672984183, + "epoch": 0.04428897905881256, + "grad_norm": 0.06066041812300682, + "kl": 0.024658203125, + "learning_rate": 9.022391865576562e-06, + "loss": 0.019, + "step": 1187 + }, + { + "clip_ratio": 0.0018249432323500514, + "epoch": 0.04432629075136421, + "grad_norm": 0.05739743262529373, + "kl": 0.025390625, + "learning_rate": 9.020577105057588e-06, + "loss": 0.0189, + "step": 1188 + }, + { + "clip_ratio": 0.004505077318754047, + "completion_length": 825.3214569091797, + "epoch": 0.044363602443915864, + "grad_norm": 0.06588272750377655, + "kl": 0.026885986328125, + "learning_rate": 9.018760867369856e-06, + "loss": 0.0412, + "num_tokens": 28534871.0, + "reward": 0.35624999552965164, + "reward_std": 0.15007224213331938, + "rewards/code_reward": 0.20892857387661934, + "rewards/format_reward": 1.4732142984867096, + "step": 1189 + }, + { + "clip_ratio": 0.0044784919591620564, + "epoch": 0.04440091413646752, + "grad_norm": 0.06768105924129486, + "kl": 0.02667236328125, + "learning_rate": 9.016943153275426e-06, + "loss": 0.0408, + "step": 1190 + }, + { + "clip_ratio": 0.004720811499282718, + "epoch": 0.04443822582901917, + "grad_norm": 0.064885213971138, + "kl": 0.026519775390625, + "learning_rate": 9.015123963536978e-06, + "loss": 0.0406, + "step": 1191 + }, + { + "clip_ratio": 0.0043700417736545205, + "completion_length": 666.6786041259766, + "epoch": 0.044475537521570824, + "grad_norm": 0.05038855969905853, + "kl": 0.02777099609375, + "learning_rate": 9.013303298917812e-06, + "loss": 0.0033, + "num_tokens": 28602893.0, + "reward": 0.45194806158542633, + "reward_std": 0.05925111100077629, + "rewards/code_reward": 0.30194804817438126, + "rewards/format_reward": 1.5, + "step": 1192 + }, + { + "clip_ratio": 0.0036230909754522145, + "epoch": 0.04451284921412248, + "grad_norm": 0.04855521023273468, + "kl": 0.027984619140625, + "learning_rate": 9.011481160181841e-06, + "loss": 0.0031, + "step": 1193 + }, + { + "clip_ratio": 0.004365586908534169, + "epoch": 0.04455016090667413, + "grad_norm": 0.04762512072920799, + "kl": 0.0274658203125, + "learning_rate": 9.009657548093606e-06, + "loss": 0.0031, + "step": 1194 + }, + { + "clip_ratio": 0.005121399008203298, + "completion_length": 576.3928756713867, + "epoch": 0.04458747259922578, + "grad_norm": 0.097462497651577, + "kl": 0.052001953125, + "learning_rate": 9.007832463418256e-06, + "loss": 0.0426, + "num_tokens": 28668281.0, + "reward": 0.7132861576974392, + "reward_std": 0.23313025012612343, + "rewards/code_reward": 0.5659647285938263, + "rewards/format_reward": 1.4732142984867096, + "step": 1195 + }, + { + "clip_ratio": 0.005010282737202942, + "epoch": 0.044624784291777436, + "grad_norm": 0.09513916820287704, + "kl": 0.0428466796875, + "learning_rate": 9.006005906921565e-06, + "loss": 0.0425, + "step": 1196 + }, + { + "clip_ratio": 0.004829566925764084, + "epoch": 0.04466209598432909, + "grad_norm": 0.09130184352397919, + "kl": 0.041351318359375, + "learning_rate": 9.00417787936992e-06, + "loss": 0.0417, + "step": 1197 + }, + { + "clip_ratio": 0.005388858029618859, + "completion_length": 793.6429138183594, + "epoch": 0.04469940767688074, + "grad_norm": 0.087114118039608, + "kl": 0.038970947265625, + "learning_rate": 9.002348381530331e-06, + "loss": 0.0151, + "num_tokens": 28743973.0, + "reward": 0.3413049168884754, + "reward_std": 0.1833990514278412, + "rewards/code_reward": 0.1959477737545967, + "rewards/format_reward": 1.4535714387893677, + "step": 1198 + }, + { + "clip_ratio": 0.00484612985746935, + "epoch": 0.044736719369432396, + "grad_norm": 0.09952718764543533, + "kl": 0.03753662109375, + "learning_rate": 9.000517414170418e-06, + "loss": 0.0148, + "step": 1199 + }, + { + "clip_ratio": 0.005231782095506787, + "epoch": 0.04477403106198405, + "grad_norm": 0.07255447655916214, + "kl": 0.0368194580078125, + "learning_rate": 8.998684978058423e-06, + "loss": 0.0143, + "step": 1200 + }, + { + "clip_ratio": 0.004293364938348532, + "completion_length": 818.5893249511719, + "epoch": 0.0448113427545357, + "grad_norm": 0.05991440638899803, + "kl": 0.0238037109375, + "learning_rate": 8.9968510739632e-06, + "loss": 0.0522, + "num_tokens": 28830918.0, + "reward": 0.6004855744540691, + "reward_std": 0.21079964004456997, + "rewards/code_reward": 0.45316415280103683, + "rewards/format_reward": 1.4732142984867096, + "step": 1201 + }, + { + "clip_ratio": 0.0040023395558819175, + "epoch": 0.044848654447087355, + "grad_norm": 0.06023801118135452, + "kl": 0.0234375, + "learning_rate": 8.995015702654224e-06, + "loss": 0.0523, + "step": 1202 + }, + { + "clip_ratio": 0.0040342333959415555, + "epoch": 0.04488596613963901, + "grad_norm": 0.05532507225871086, + "kl": 0.023345947265625, + "learning_rate": 8.99317886490158e-06, + "loss": 0.0517, + "step": 1203 + }, + { + "clip_ratio": 0.002619779494125396, + "completion_length": 713.9464569091797, + "epoch": 0.04492327783219066, + "grad_norm": 0.08295637369155884, + "kl": 0.024261474609375, + "learning_rate": 8.991340561475973e-06, + "loss": 0.0094, + "num_tokens": 28907803.0, + "reward": 0.9332058429718018, + "reward_std": 0.25412724912166595, + "rewards/code_reward": 0.7832058370113373, + "rewards/format_reward": 1.5, + "step": 1204 + }, + { + "clip_ratio": 0.0026625432074069977, + "epoch": 0.044960589524742314, + "grad_norm": 0.07680745422840118, + "kl": 0.024383544921875, + "learning_rate": 8.989500793148719e-06, + "loss": 0.0092, + "step": 1205 + }, + { + "clip_ratio": 0.002872889570426196, + "epoch": 0.04499790121729397, + "grad_norm": 0.070435531437397, + "kl": 0.023590087890625, + "learning_rate": 8.987659560691752e-06, + "loss": 0.0088, + "step": 1206 + }, + { + "clip_ratio": 0.0039029669133014977, + "completion_length": 767.0357513427734, + "epoch": 0.04503521290984562, + "grad_norm": 0.07622598856687546, + "kl": 0.0182647705078125, + "learning_rate": 8.985816864877619e-06, + "loss": 0.0056, + "num_tokens": 28977693.0, + "reward": 0.5211103186011314, + "reward_std": 0.3418578878045082, + "rewards/code_reward": 0.371110319159925, + "rewards/format_reward": 1.5, + "step": 1207 + }, + { + "clip_ratio": 0.003531717404257506, + "epoch": 0.045072524602397274, + "grad_norm": 0.07747345417737961, + "kl": 0.017669677734375, + "learning_rate": 8.98397270647948e-06, + "loss": 0.0052, + "step": 1208 + }, + { + "clip_ratio": 0.0032491512247361243, + "epoch": 0.04510983629494893, + "grad_norm": 0.07477495074272156, + "kl": 0.0181427001953125, + "learning_rate": 8.98212708627111e-06, + "loss": 0.0049, + "step": 1209 + }, + { + "clip_ratio": 0.004009857715573162, + "completion_length": 771.8214721679688, + "epoch": 0.04514714798750058, + "grad_norm": 0.08695187419652939, + "kl": 0.018646240234375, + "learning_rate": 8.980280005026898e-06, + "loss": -0.0078, + "num_tokens": 29049895.0, + "reward": 0.4678157791495323, + "reward_std": 0.3989068418741226, + "rewards/code_reward": 0.31781577691435814, + "rewards/format_reward": 1.5, + "step": 1210 + }, + { + "clip_ratio": 0.004147466679569334, + "epoch": 0.04518445968005223, + "grad_norm": 0.0850985199213028, + "kl": 0.019012451171875, + "learning_rate": 8.978431463521845e-06, + "loss": -0.0081, + "step": 1211 + }, + { + "clip_ratio": 0.004301339853554964, + "epoch": 0.04522177137260389, + "grad_norm": 0.07974699139595032, + "kl": 0.01904296875, + "learning_rate": 8.976581462531561e-06, + "loss": -0.0085, + "step": 1212 + }, + { + "clip_ratio": 0.0042710715206339955, + "completion_length": 770.4821853637695, + "epoch": 0.045259083065155546, + "grad_norm": 0.14528723061084747, + "kl": 0.027923583984375, + "learning_rate": 8.974730002832278e-06, + "loss": 0.1007, + "num_tokens": 29120314.0, + "reward": 0.5373956188559532, + "reward_std": 0.18306024186313152, + "rewards/code_reward": 0.3954312801361084, + "rewards/format_reward": 1.4196428954601288, + "step": 1213 + }, + { + "clip_ratio": 0.003733236633706838, + "epoch": 0.0452963947577072, + "grad_norm": 0.07979262620210648, + "kl": 0.028167724609375, + "learning_rate": 8.972877085200829e-06, + "loss": 0.1006, + "step": 1214 + }, + { + "clip_ratio": 0.0040453189867548645, + "epoch": 0.04533370645025885, + "grad_norm": 0.07745829224586487, + "kl": 0.02813720703125, + "learning_rate": 8.971022710414666e-06, + "loss": 0.1001, + "step": 1215 + }, + { + "clip_ratio": 0.003255123447161168, + "completion_length": 717.8393249511719, + "epoch": 0.045371018142810506, + "grad_norm": 0.07470939308404922, + "kl": 0.0185546875, + "learning_rate": 8.969166879251849e-06, + "loss": -0.0075, + "num_tokens": 29187995.0, + "reward": 0.5495804585516453, + "reward_std": 0.15942050144076347, + "rewards/code_reward": 0.399580460973084, + "rewards/format_reward": 1.5, + "step": 1216 + }, + { + "clip_ratio": 0.003017220995388925, + "epoch": 0.04540832983536216, + "grad_norm": 0.07461985945701599, + "kl": 0.0186004638671875, + "learning_rate": 8.967309592491052e-06, + "loss": -0.0077, + "step": 1217 + }, + { + "clip_ratio": 0.0031385229667648673, + "epoch": 0.04544564152791381, + "grad_norm": 0.06951192021369934, + "kl": 0.0194091796875, + "learning_rate": 8.965450850911558e-06, + "loss": -0.0082, + "step": 1218 + }, + { + "clip_ratio": 0.00433696189429611, + "completion_length": 778.607177734375, + "epoch": 0.045482953220465465, + "grad_norm": 0.0761532112956047, + "kl": 0.0250396728515625, + "learning_rate": 8.96359065529326e-06, + "loss": 0.0587, + "num_tokens": 29262033.0, + "reward": 0.8391437232494354, + "reward_std": 0.2433333806693554, + "rewards/code_reward": 0.6937865242362022, + "rewards/format_reward": 1.4535714387893677, + "step": 1219 + }, + { + "clip_ratio": 0.004063681000843644, + "epoch": 0.04552026491301712, + "grad_norm": 0.07009672373533249, + "kl": 0.0253753662109375, + "learning_rate": 8.96172900641666e-06, + "loss": 0.0584, + "step": 1220 + }, + { + "clip_ratio": 0.0041127061122097075, + "epoch": 0.04555757660556877, + "grad_norm": 0.06895603984594345, + "kl": 0.0257568359375, + "learning_rate": 8.959865905062873e-06, + "loss": 0.0579, + "step": 1221 + }, + { + "clip_ratio": 0.004344396409578621, + "completion_length": 700.732177734375, + "epoch": 0.045594888298120424, + "grad_norm": 0.0900905653834343, + "kl": 0.04498291015625, + "learning_rate": 8.95800135201362e-06, + "loss": 0.0074, + "num_tokens": 29336582.0, + "reward": 0.9488936215639114, + "reward_std": 0.1718443278223276, + "rewards/code_reward": 0.7988936305046082, + "rewards/format_reward": 1.5, + "step": 1222 + }, + { + "clip_ratio": 0.004426638828590512, + "epoch": 0.04563219999067208, + "grad_norm": 0.08714178949594498, + "kl": 0.047393798828125, + "learning_rate": 8.956135348051237e-06, + "loss": 0.0071, + "step": 1223 + }, + { + "clip_ratio": 0.004190864914562553, + "epoch": 0.04566951168322373, + "grad_norm": 0.1145503893494606, + "kl": 0.045684814453125, + "learning_rate": 8.954267893958659e-06, + "loss": 0.0067, + "step": 1224 + }, + { + "clip_ratio": 0.004430174711160362, + "completion_length": 566.6785888671875, + "epoch": 0.045706823375775384, + "grad_norm": 0.0718425065279007, + "kl": 0.0245361328125, + "learning_rate": 8.952398990519438e-06, + "loss": 0.0149, + "num_tokens": 29390688.0, + "reward": 0.449426032602787, + "reward_std": 0.18365970253944397, + "rewards/code_reward": 0.29942601174116135, + "rewards/format_reward": 1.5, + "step": 1225 + }, + { + "clip_ratio": 0.003816180629655719, + "epoch": 0.04574413506832704, + "grad_norm": 0.06423383951187134, + "kl": 0.024383544921875, + "learning_rate": 8.950528638517732e-06, + "loss": 0.0147, + "step": 1226 + }, + { + "clip_ratio": 0.003948975296225399, + "epoch": 0.04578144676087869, + "grad_norm": 0.05919088423252106, + "kl": 0.024932861328125, + "learning_rate": 8.948656838738303e-06, + "loss": 0.0145, + "step": 1227 + }, + { + "clip_ratio": 0.0036236054729670286, + "completion_length": 545.6964569091797, + "epoch": 0.04581875845343034, + "grad_norm": 0.28763347864151, + "kl": 0.076507568359375, + "learning_rate": 8.946783591966528e-06, + "loss": 0.0045, + "num_tokens": 29448323.0, + "reward": 0.7038323432207108, + "reward_std": 0.1905626878142357, + "rewards/code_reward": 0.5556180588901043, + "rewards/format_reward": 1.4821428656578064, + "step": 1228 + }, + { + "clip_ratio": 0.003523721417877823, + "epoch": 0.045856070145981996, + "grad_norm": 0.10338588804006577, + "kl": 0.027313232421875, + "learning_rate": 8.944908898988381e-06, + "loss": 0.0038, + "step": 1229 + }, + { + "clip_ratio": 0.0033068133052438498, + "epoch": 0.04589338183853365, + "grad_norm": 0.08445089310407639, + "kl": 0.02728271484375, + "learning_rate": 8.943032760590453e-06, + "loss": 0.0031, + "step": 1230 + }, + { + "clip_ratio": 0.0038751306710764766, + "completion_length": 697.9107513427734, + "epoch": 0.0459306935310853, + "grad_norm": 0.09490235894918442, + "kl": 0.0374755859375, + "learning_rate": 8.941155177559934e-06, + "loss": -0.0024, + "num_tokens": 29516064.0, + "reward": 0.7711624875664711, + "reward_std": 0.2123185656964779, + "rewards/code_reward": 0.6229482032358646, + "rewards/format_reward": 1.4821428656578064, + "step": 1231 + }, + { + "clip_ratio": 0.0035428847186267376, + "epoch": 0.045968005223636955, + "grad_norm": 0.09095196425914764, + "kl": 0.036895751953125, + "learning_rate": 8.939276150684626e-06, + "loss": -0.0026, + "step": 1232 + }, + { + "clip_ratio": 0.003139618202112615, + "epoch": 0.04600531691618861, + "grad_norm": 0.08636380732059479, + "kl": 0.037506103515625, + "learning_rate": 8.937395680752929e-06, + "loss": -0.0032, + "step": 1233 + }, + { + "clip_ratio": 0.0040890445234254, + "completion_length": 862.9107513427734, + "epoch": 0.04604262860874026, + "grad_norm": 0.06112855672836304, + "kl": 0.0201873779296875, + "learning_rate": 8.935513768553859e-06, + "loss": 0.0138, + "num_tokens": 29599597.0, + "reward": 0.42230359837412834, + "reward_std": 0.12893700413405895, + "rewards/code_reward": 0.272303594276309, + "rewards/format_reward": 1.5, + "step": 1234 + }, + { + "clip_ratio": 0.0040947109228000045, + "epoch": 0.046079940301291915, + "grad_norm": 0.05982782319188118, + "kl": 0.0205078125, + "learning_rate": 8.933630414877026e-06, + "loss": 0.0134, + "step": 1235 + }, + { + "clip_ratio": 0.00429383700247854, + "epoch": 0.04611725199384357, + "grad_norm": 0.05624419450759888, + "kl": 0.0205230712890625, + "learning_rate": 8.931745620512652e-06, + "loss": 0.0134, + "step": 1236 + }, + { + "clip_ratio": 0.00474620028398931, + "completion_length": 627.3571701049805, + "epoch": 0.04615456368639522, + "grad_norm": 0.0743071511387825, + "kl": 0.0207672119140625, + "learning_rate": 8.929859386251564e-06, + "loss": -0.0014, + "num_tokens": 29664479.0, + "reward": 0.45321784913539886, + "reward_std": 0.10777689842507243, + "rewards/code_reward": 0.3032178245484829, + "rewards/format_reward": 1.5, + "step": 1237 + }, + { + "clip_ratio": 0.004772275744471699, + "epoch": 0.046191875378946874, + "grad_norm": 0.06036899611353874, + "kl": 0.0213165283203125, + "learning_rate": 8.927971712885187e-06, + "loss": -0.0014, + "step": 1238 + }, + { + "clip_ratio": 0.0042087751789949834, + "epoch": 0.046229187071498534, + "grad_norm": 0.05715563893318176, + "kl": 0.02032470703125, + "learning_rate": 8.92608260120556e-06, + "loss": -0.0019, + "step": 1239 + }, + { + "clip_ratio": 0.00298203865531832, + "completion_length": 538.1964569091797, + "epoch": 0.04626649876405019, + "grad_norm": 0.08811914920806885, + "kl": 0.036529541015625, + "learning_rate": 8.92419205200531e-06, + "loss": 0.0033, + "num_tokens": 29725194.0, + "reward": 0.8877861797809601, + "reward_std": 0.2258443534374237, + "rewards/code_reward": 0.7377861887216568, + "rewards/format_reward": 1.5, + "step": 1240 + }, + { + "clip_ratio": 0.003032732638530433, + "epoch": 0.04630381045660184, + "grad_norm": 0.08140786737203598, + "kl": 0.036651611328125, + "learning_rate": 8.922300066077683e-06, + "loss": 0.0032, + "step": 1241 + }, + { + "clip_ratio": 0.002456644957419485, + "epoch": 0.046341122149153494, + "grad_norm": 0.07806896418333054, + "kl": 0.03729248046875, + "learning_rate": 8.920406644216522e-06, + "loss": 0.0025, + "step": 1242 + }, + { + "clip_ratio": 0.004258761589881033, + "completion_length": 805.482177734375, + "epoch": 0.04637843384170515, + "grad_norm": 0.06845182925462723, + "kl": 0.042572021484375, + "learning_rate": 8.918511787216268e-06, + "loss": 0.0224, + "num_tokens": 29801311.0, + "reward": 0.4565667286515236, + "reward_std": 0.13693049922585487, + "rewards/code_reward": 0.3092452585697174, + "rewards/format_reward": 1.4732142984867096, + "step": 1243 + }, + { + "clip_ratio": 0.004919474245980382, + "epoch": 0.0464157455342568, + "grad_norm": 0.06606616079807281, + "kl": 0.04058837890625, + "learning_rate": 8.916615495871967e-06, + "loss": 0.022, + "step": 1244 + }, + { + "clip_ratio": 0.004857009742408991, + "epoch": 0.04645305722680845, + "grad_norm": 0.06141253933310509, + "kl": 0.034332275390625, + "learning_rate": 8.914717770979271e-06, + "loss": 0.0218, + "step": 1245 + }, + { + "clip_ratio": 0.0030789065058343112, + "completion_length": 635.232177734375, + "epoch": 0.046490368919360106, + "grad_norm": 0.14608079195022583, + "kl": 0.03460693359375, + "learning_rate": 8.912818613334429e-06, + "loss": -0.0107, + "num_tokens": 29877134.0, + "reward": 0.6752634458243847, + "reward_std": 0.05138664855621755, + "rewards/code_reward": 0.5272277239710093, + "rewards/format_reward": 1.480357140302658, + "step": 1246 + }, + { + "clip_ratio": 0.003341534757055342, + "epoch": 0.04652768061191176, + "grad_norm": 0.07169315218925476, + "kl": 0.03411865234375, + "learning_rate": 8.910918023734289e-06, + "loss": -0.0107, + "step": 1247 + }, + { + "clip_ratio": 0.0030821593827567995, + "epoch": 0.04656499230446341, + "grad_norm": 0.07551496475934982, + "kl": 0.03497314453125, + "learning_rate": 8.909016002976309e-06, + "loss": -0.011, + "step": 1248 + }, + { + "clip_ratio": 0.004079832520801574, + "completion_length": 682.6607437133789, + "epoch": 0.046602303997015065, + "grad_norm": 0.11041751503944397, + "kl": 0.025390625, + "learning_rate": 8.907112551858537e-06, + "loss": -0.0022, + "num_tokens": 29953093.0, + "reward": 0.7423808090388775, + "reward_std": 0.1901039369404316, + "rewards/code_reward": 0.5923808217048645, + "rewards/format_reward": 1.5, + "step": 1249 + }, + { + "clip_ratio": 0.0037291834596544504, + "epoch": 0.04663961568956672, + "grad_norm": 0.07015436887741089, + "kl": 0.025238037109375, + "learning_rate": 8.905207671179629e-06, + "loss": -0.0025, + "step": 1250 + }, + { + "clip_ratio": 0.003981273213867098, + "epoch": 0.04667692738211837, + "grad_norm": 0.06535869836807251, + "kl": 0.02490234375, + "learning_rate": 8.903301361738836e-06, + "loss": -0.0027, + "step": 1251 + }, + { + "clip_ratio": 0.003006451530382037, + "completion_length": 580.0714569091797, + "epoch": 0.046714239074670025, + "grad_norm": 0.06974302977323532, + "kl": 0.019500732421875, + "learning_rate": 8.901393624336008e-06, + "loss": 0.0099, + "num_tokens": 30004617.0, + "reward": 0.6107496321201324, + "reward_std": 0.2777090221643448, + "rewards/code_reward": 0.4607496503740549, + "rewards/format_reward": 1.5, + "step": 1252 + }, + { + "clip_ratio": 0.0030870504560880363, + "epoch": 0.04675155076722168, + "grad_norm": 0.07215604186058044, + "kl": 0.0195465087890625, + "learning_rate": 8.8994844597716e-06, + "loss": 0.0098, + "step": 1253 + }, + { + "clip_ratio": 0.0026612491346895695, + "epoch": 0.04678886245977333, + "grad_norm": 0.11256314069032669, + "kl": 0.019561767578125, + "learning_rate": 8.897573868846657e-06, + "loss": 0.0093, + "step": 1254 + }, + { + "clip_ratio": 0.0021441770950332284, + "completion_length": 559.9464645385742, + "epoch": 0.046826174152324984, + "grad_norm": 0.06333834677934647, + "kl": 0.0269775390625, + "learning_rate": 8.895661852362835e-06, + "loss": -0.0025, + "num_tokens": 30062420.0, + "reward": 0.8023944646120071, + "reward_std": 0.22802216559648514, + "rewards/code_reward": 0.6523944772779942, + "rewards/format_reward": 1.5, + "step": 1255 + }, + { + "clip_ratio": 0.001808778615668416, + "epoch": 0.04686348584487664, + "grad_norm": 0.060002245008945465, + "kl": 0.027618408203125, + "learning_rate": 8.893748411122373e-06, + "loss": -0.0027, + "step": 1256 + }, + { + "clip_ratio": 0.0017759662587195635, + "epoch": 0.04690079753742829, + "grad_norm": 0.06001506745815277, + "kl": 0.027862548828125, + "learning_rate": 8.891833545928119e-06, + "loss": -0.0029, + "step": 1257 + }, + { + "clip_ratio": 0.003576561401132494, + "completion_length": 882.6607360839844, + "epoch": 0.046938109229979944, + "grad_norm": 0.07683596760034561, + "kl": 0.021759033203125, + "learning_rate": 8.889917257583515e-06, + "loss": 0.0662, + "num_tokens": 30141935.0, + "reward": 0.5459462366998196, + "reward_std": 0.20565499737858772, + "rewards/code_reward": 0.40130338072776794, + "rewards/format_reward": 1.4464285969734192, + "step": 1258 + }, + { + "clip_ratio": 0.0037179323262535036, + "epoch": 0.0469754209225316, + "grad_norm": 0.07148604094982147, + "kl": 0.021270751953125, + "learning_rate": 8.887999546892598e-06, + "loss": 0.066, + "step": 1259 + }, + { + "clip_ratio": 0.0032034931355156004, + "epoch": 0.04701273261508325, + "grad_norm": 0.07292412966489792, + "kl": 0.0208740234375, + "learning_rate": 8.886080414660007e-06, + "loss": 0.0656, + "step": 1260 + }, + { + "clip_ratio": 0.003376230481080711, + "completion_length": 844.4286193847656, + "epoch": 0.0470500443076349, + "grad_norm": 0.07493031769990921, + "kl": 0.021331787109375, + "learning_rate": 8.884159861690973e-06, + "loss": 0.0103, + "num_tokens": 30229333.0, + "reward": 0.5912178084254265, + "reward_std": 0.1664906209334731, + "rewards/code_reward": 0.44657494127750397, + "rewards/format_reward": 1.4464285969734192, + "step": 1261 + }, + { + "clip_ratio": 0.004033105215057731, + "epoch": 0.047087356000186556, + "grad_norm": 0.06547587364912033, + "kl": 0.021514892578125, + "learning_rate": 8.882237888791322e-06, + "loss": 0.0099, + "step": 1262 + }, + { + "clip_ratio": 0.004094427160453051, + "epoch": 0.04712466769273821, + "grad_norm": 0.0644562616944313, + "kl": 0.021240234375, + "learning_rate": 8.88031449676748e-06, + "loss": 0.0098, + "step": 1263 + }, + { + "clip_ratio": 0.0046481332974508405, + "completion_length": 661.0000228881836, + "epoch": 0.04716197938528986, + "grad_norm": 0.051234468817710876, + "kl": 0.0278167724609375, + "learning_rate": 8.878389686426465e-06, + "loss": -0.0454, + "num_tokens": 30299305.0, + "reward": 0.41844093799591064, + "reward_std": 0.06576794944703579, + "rewards/code_reward": 0.2711195065639913, + "rewards/format_reward": 1.4732142984867096, + "step": 1264 + }, + { + "clip_ratio": 0.00456004380248487, + "epoch": 0.047199291077841515, + "grad_norm": 0.04968332126736641, + "kl": 0.027557373046875, + "learning_rate": 8.876463458575895e-06, + "loss": -0.0455, + "step": 1265 + }, + { + "clip_ratio": 0.004772420274093747, + "epoch": 0.047236602770393175, + "grad_norm": 0.04646913334727287, + "kl": 0.027618408203125, + "learning_rate": 8.874535814023979e-06, + "loss": -0.0457, + "step": 1266 + }, + { + "clip_ratio": 0.0035766412620432675, + "completion_length": 741.9286117553711, + "epoch": 0.04727391446294483, + "grad_norm": 0.057552825659513474, + "kl": 0.023406982421875, + "learning_rate": 8.872606753579516e-06, + "loss": -0.0052, + "num_tokens": 30371723.0, + "reward": 0.5939358286559582, + "reward_std": 0.17598630487918854, + "rewards/code_reward": 0.4466143548488617, + "rewards/format_reward": 1.4732142984867096, + "step": 1267 + }, + { + "clip_ratio": 0.0039619021117687225, + "epoch": 0.04731122615549648, + "grad_norm": 0.05986675247550011, + "kl": 0.0253143310546875, + "learning_rate": 8.87067627805191e-06, + "loss": -0.0054, + "step": 1268 + }, + { + "clip_ratio": 0.003865178965497762, + "epoch": 0.047348537848048135, + "grad_norm": 0.054247017949819565, + "kl": 0.0244903564453125, + "learning_rate": 8.868744388251149e-06, + "loss": -0.0056, + "step": 1269 + }, + { + "clip_ratio": 0.004101870174054056, + "completion_length": 931.6964721679688, + "epoch": 0.04738584954059979, + "grad_norm": 0.07298476994037628, + "kl": 0.0212249755859375, + "learning_rate": 8.866811084987818e-06, + "loss": 0.0957, + "num_tokens": 30456448.0, + "reward": 0.4800858907401562, + "reward_std": 0.19746193010360003, + "rewards/code_reward": 0.33544304221868515, + "rewards/format_reward": 1.4464285969734192, + "step": 1270 + }, + { + "clip_ratio": 0.003557304502464831, + "epoch": 0.04742316123315144, + "grad_norm": 0.07225508242845535, + "kl": 0.0216217041015625, + "learning_rate": 8.864876369073097e-06, + "loss": 0.0954, + "step": 1271 + }, + { + "clip_ratio": 0.003802433842793107, + "epoch": 0.047460472925703094, + "grad_norm": 0.0708756297826767, + "kl": 0.021484375, + "learning_rate": 8.862940241318757e-06, + "loss": 0.0951, + "step": 1272 + }, + { + "clip_ratio": 0.006292859208770096, + "completion_length": 660.0178909301758, + "epoch": 0.04749778461825475, + "grad_norm": 0.0710347443819046, + "kl": 0.02239990234375, + "learning_rate": 8.861002702537159e-06, + "loss": -0.0013, + "num_tokens": 30529809.0, + "reward": 0.35151291638612747, + "reward_std": 0.17700905352830887, + "rewards/code_reward": 0.20151289738714695, + "rewards/format_reward": 1.5, + "step": 1273 + }, + { + "clip_ratio": 0.005609896499663591, + "epoch": 0.0475350963108064, + "grad_norm": 0.06932906061410904, + "kl": 0.02203369140625, + "learning_rate": 8.85906375354126e-06, + "loss": -0.0014, + "step": 1274 + }, + { + "clip_ratio": 0.005770057439804077, + "epoch": 0.047572408003358053, + "grad_norm": 0.06265237182378769, + "kl": 0.02301025390625, + "learning_rate": 8.857123395144609e-06, + "loss": -0.0019, + "step": 1275 + }, + { + "clip_ratio": 0.004072179086506367, + "completion_length": 816.357177734375, + "epoch": 0.04760971969590971, + "grad_norm": 0.05920520797371864, + "kl": 0.0167694091796875, + "learning_rate": 8.855181628161339e-06, + "loss": 0.0026, + "num_tokens": 30605149.0, + "reward": 0.5904762223362923, + "reward_std": 0.12872559763491154, + "rewards/code_reward": 0.440476194024086, + "rewards/format_reward": 1.5, + "step": 1276 + }, + { + "clip_ratio": 0.004140972159802914, + "epoch": 0.04764703138846136, + "grad_norm": 0.0582873560488224, + "kl": 0.0170440673828125, + "learning_rate": 8.853238453406183e-06, + "loss": 0.0024, + "step": 1277 + }, + { + "clip_ratio": 0.00437511969357729, + "epoch": 0.04768434308101301, + "grad_norm": 0.05010085180401802, + "kl": 0.0172119140625, + "learning_rate": 8.851293871694461e-06, + "loss": 0.0022, + "step": 1278 + }, + { + "clip_ratio": 0.0043780962587334216, + "completion_length": 799.8214797973633, + "epoch": 0.047721654773564666, + "grad_norm": 0.04679470136761665, + "kl": 0.037841796875, + "learning_rate": 8.84934788384208e-06, + "loss": 0.0009, + "num_tokens": 30688369.0, + "reward": 0.6103365495800972, + "reward_std": 0.04241586849093437, + "rewards/code_reward": 0.4603365510702133, + "rewards/format_reward": 1.5, + "step": 1279 + }, + { + "clip_ratio": 0.004350993607658893, + "epoch": 0.04775896646611632, + "grad_norm": 0.04303732514381409, + "kl": 0.038055419921875, + "learning_rate": 8.847400490665547e-06, + "loss": 0.0008, + "step": 1280 + }, + { + "clip_ratio": 0.004496059729717672, + "epoch": 0.04779627815866797, + "grad_norm": 0.043120402842760086, + "kl": 0.037322998046875, + "learning_rate": 8.845451692981945e-06, + "loss": 0.0007, + "step": 1281 + }, + { + "clip_ratio": 0.005447155446745455, + "completion_length": 658.1428909301758, + "epoch": 0.047833589851219625, + "grad_norm": 0.09123614430427551, + "kl": 0.027130126953125, + "learning_rate": 8.843501491608956e-06, + "loss": 0.0308, + "num_tokens": 30759637.0, + "reward": 0.299107164144516, + "reward_std": 0.13610620703548193, + "rewards/code_reward": 0.1517857164144516, + "rewards/format_reward": 1.4732142984867096, + "step": 1282 + }, + { + "clip_ratio": 0.00477837841026485, + "epoch": 0.04787090154377128, + "grad_norm": 0.09549350291490555, + "kl": 0.02703857421875, + "learning_rate": 8.84154988736485e-06, + "loss": 0.0302, + "step": 1283 + }, + { + "clip_ratio": 0.0047474916791543365, + "epoch": 0.04790821323632293, + "grad_norm": 0.0750877782702446, + "kl": 0.027618408203125, + "learning_rate": 8.839596881068483e-06, + "loss": 0.0299, + "step": 1284 + }, + { + "clip_ratio": 0.005848693195730448, + "completion_length": 739.6429138183594, + "epoch": 0.047945524928874585, + "grad_norm": 0.0751296654343605, + "kl": 0.04046630859375, + "learning_rate": 8.8376424735393e-06, + "loss": -0.0076, + "num_tokens": 30840985.0, + "reward": 0.31178436800837517, + "reward_std": 0.2002396397292614, + "rewards/code_reward": 0.16178434528410435, + "rewards/format_reward": 1.5, + "step": 1285 + }, + { + "clip_ratio": 0.005493780772667378, + "epoch": 0.04798283662142624, + "grad_norm": 0.07226376980543137, + "kl": 0.040130615234375, + "learning_rate": 8.83568666559733e-06, + "loss": -0.008, + "step": 1286 + }, + { + "clip_ratio": 0.005390979931689799, + "epoch": 0.04802014831397789, + "grad_norm": 0.0724770799279213, + "kl": 0.039642333984375, + "learning_rate": 8.833729458063201e-06, + "loss": -0.0083, + "step": 1287 + }, + { + "clip_ratio": 0.004267815966159105, + "completion_length": 727.3571701049805, + "epoch": 0.048057460006529544, + "grad_norm": 0.07897859811782837, + "kl": 0.0226287841796875, + "learning_rate": 8.831770851758117e-06, + "loss": -0.0053, + "num_tokens": 30918971.0, + "reward": 0.62455939874053, + "reward_std": 0.17649163119494915, + "rewards/code_reward": 0.4745593760162592, + "rewards/format_reward": 1.5, + "step": 1288 + }, + { + "clip_ratio": 0.003950448706746101, + "epoch": 0.0480947716990812, + "grad_norm": 0.08056045323610306, + "kl": 0.022918701171875, + "learning_rate": 8.829810847503871e-06, + "loss": -0.0054, + "step": 1289 + }, + { + "clip_ratio": 0.0035438091144897044, + "epoch": 0.04813208339163285, + "grad_norm": 0.0904296487569809, + "kl": 0.022918701171875, + "learning_rate": 8.827849446122849e-06, + "loss": -0.0059, + "step": 1290 + }, + { + "clip_ratio": 0.0036337741767056286, + "completion_length": 665.7500305175781, + "epoch": 0.0481693950841845, + "grad_norm": 0.07724316418170929, + "kl": 0.016754150390625, + "learning_rate": 8.825886648438014e-06, + "loss": -0.002, + "num_tokens": 30984015.0, + "reward": 0.4468354806303978, + "reward_std": 0.07612908200826496, + "rewards/code_reward": 0.2995140589919174, + "rewards/format_reward": 1.4732142984867096, + "step": 1291 + }, + { + "clip_ratio": 0.0034006318310275674, + "epoch": 0.048206706776736157, + "grad_norm": 0.06675421446561813, + "kl": 0.01708984375, + "learning_rate": 8.823922455272925e-06, + "loss": -0.0025, + "step": 1292 + }, + { + "clip_ratio": 0.00367250555427745, + "epoch": 0.04824401846928781, + "grad_norm": 0.06353773921728134, + "kl": 0.01763916015625, + "learning_rate": 8.821956867451714e-06, + "loss": -0.0027, + "step": 1293 + }, + { + "clip_ratio": 0.003258144250139594, + "completion_length": 722.7857666015625, + "epoch": 0.04828133016183947, + "grad_norm": 0.08574020117521286, + "kl": 0.018585205078125, + "learning_rate": 8.819989885799111e-06, + "loss": 0.0144, + "num_tokens": 31049411.0, + "reward": 0.6153011247515678, + "reward_std": 0.21154308319091797, + "rewards/code_reward": 0.4679797012358904, + "rewards/format_reward": 1.4732142984867096, + "step": 1294 + }, + { + "clip_ratio": 0.003150680335238576, + "epoch": 0.04831864185439112, + "grad_norm": 0.07224798202514648, + "kl": 0.018585205078125, + "learning_rate": 8.818021511140423e-06, + "loss": 0.0143, + "step": 1295 + }, + { + "clip_ratio": 0.002915437740739435, + "epoch": 0.048355953546942776, + "grad_norm": 0.08505767583847046, + "kl": 0.0179901123046875, + "learning_rate": 8.81605174430154e-06, + "loss": 0.0138, + "step": 1296 + }, + { + "clip_ratio": 0.004003180656582117, + "completion_length": 741.7857360839844, + "epoch": 0.04839326523949443, + "grad_norm": 0.08729485422372818, + "kl": 0.0269622802734375, + "learning_rate": 8.814080586108945e-06, + "loss": -0.0084, + "num_tokens": 31121191.0, + "reward": 0.4289528578519821, + "reward_std": 0.3116193078458309, + "rewards/code_reward": 0.27895285096019506, + "rewards/format_reward": 1.5, + "step": 1297 + }, + { + "clip_ratio": 0.004141995159443468, + "epoch": 0.04843057693204608, + "grad_norm": 0.08428313583135605, + "kl": 0.026763916015625, + "learning_rate": 8.812108037389696e-06, + "loss": -0.0084, + "step": 1298 + }, + { + "clip_ratio": 0.004124921339098364, + "epoch": 0.048467888624597735, + "grad_norm": 0.07265902310609818, + "kl": 0.0264892578125, + "learning_rate": 8.81013409897144e-06, + "loss": -0.009, + "step": 1299 + }, + { + "clip_ratio": 0.0034990576677955687, + "completion_length": 755.5000381469727, + "epoch": 0.04850520031714939, + "grad_norm": 0.08025167137384415, + "kl": 0.015380859375, + "learning_rate": 8.808158771682402e-06, + "loss": 0.0086, + "num_tokens": 31191653.0, + "reward": 0.5200592465698719, + "reward_std": 0.18881333898752928, + "rewards/code_reward": 0.37005921883974224, + "rewards/format_reward": 1.5, + "step": 1300 + }, + { + "clip_ratio": 0.003766934387385845, + "epoch": 0.04854251200970104, + "grad_norm": 0.07889507710933685, + "kl": 0.015106201171875, + "learning_rate": 8.806182056351397e-06, + "loss": 0.0084, + "step": 1301 + }, + { + "clip_ratio": 0.00386454671388492, + "epoch": 0.048579823702252695, + "grad_norm": 0.06859076023101807, + "kl": 0.015594482421875, + "learning_rate": 8.804203953807813e-06, + "loss": 0.0078, + "step": 1302 + }, + { + "clip_ratio": 0.0026043213438242674, + "completion_length": 611.6071548461914, + "epoch": 0.04861713539480435, + "grad_norm": 0.07194503396749496, + "kl": 0.0366973876953125, + "learning_rate": 8.802224464881628e-06, + "loss": 0.0014, + "num_tokens": 31252013.0, + "reward": 0.6932214833796024, + "reward_std": 0.07432540785521269, + "rewards/code_reward": 0.543221477419138, + "rewards/format_reward": 1.5, + "step": 1303 + }, + { + "clip_ratio": 0.002530121011659503, + "epoch": 0.048654447087356, + "grad_norm": 0.06344405561685562, + "kl": 0.03643798828125, + "learning_rate": 8.800243590403398e-06, + "loss": 0.0013, + "step": 1304 + }, + { + "clip_ratio": 0.002598728402517736, + "epoch": 0.048691758779907654, + "grad_norm": 0.056420519948005676, + "kl": 0.0326385498046875, + "learning_rate": 8.798261331204262e-06, + "loss": 0.0012, + "step": 1305 + }, + { + "clip_ratio": 0.0017138910479843616, + "completion_length": 648.6428909301758, + "epoch": 0.04872907047245931, + "grad_norm": 0.060719966888427734, + "kl": 0.0179901123046875, + "learning_rate": 8.796277688115939e-06, + "loss": 0.0099, + "num_tokens": 31314279.0, + "reward": 0.9892857074737549, + "reward_std": 0.22050277143716812, + "rewards/code_reward": 0.8392857164144516, + "rewards/format_reward": 1.5, + "step": 1306 + }, + { + "clip_ratio": 0.0014312383718788624, + "epoch": 0.04876638216501096, + "grad_norm": 0.06205439195036888, + "kl": 0.01776123046875, + "learning_rate": 8.794292661970727e-06, + "loss": 0.0098, + "step": 1307 + }, + { + "clip_ratio": 0.0016051342827267945, + "epoch": 0.04880369385756261, + "grad_norm": 0.058316465467214584, + "kl": 0.01763916015625, + "learning_rate": 8.792306253601509e-06, + "loss": 0.0096, + "step": 1308 + }, + { + "clip_ratio": 0.004562842252198607, + "completion_length": 637.5893096923828, + "epoch": 0.048841005550114266, + "grad_norm": 0.08840080350637436, + "kl": 0.0222015380859375, + "learning_rate": 8.790318463841742e-06, + "loss": 0.0007, + "num_tokens": 31374204.0, + "reward": 0.8000233881175518, + "reward_std": 0.13802950829267502, + "rewards/code_reward": 0.6500233709812164, + "rewards/format_reward": 1.5, + "step": 1309 + }, + { + "clip_ratio": 0.0046781846904195845, + "epoch": 0.04887831724266592, + "grad_norm": 0.08436626195907593, + "kl": 0.022430419921875, + "learning_rate": 8.788329293525468e-06, + "loss": 0.0003, + "step": 1310 + }, + { + "clip_ratio": 0.004111170768737793, + "epoch": 0.04891562893521757, + "grad_norm": 0.07999914884567261, + "kl": 0.02276611328125, + "learning_rate": 8.786338743487306e-06, + "loss": -0.0002, + "step": 1311 + }, + { + "clip_ratio": 0.005346249672584236, + "completion_length": 670.3214569091797, + "epoch": 0.048952940627769226, + "grad_norm": 0.08963903039693832, + "kl": 0.030853271484375, + "learning_rate": 8.784346814562452e-06, + "loss": -0.0019, + "num_tokens": 31437166.0, + "reward": 0.5313109345734119, + "reward_std": 0.2759476751089096, + "rewards/code_reward": 0.38131090998649597, + "rewards/format_reward": 1.5, + "step": 1312 + }, + { + "clip_ratio": 0.005818774574436247, + "epoch": 0.04899025232032088, + "grad_norm": 0.08317296952009201, + "kl": 0.029266357421875, + "learning_rate": 8.782353507586683e-06, + "loss": -0.0023, + "step": 1313 + }, + { + "clip_ratio": 0.00508021202404052, + "epoch": 0.04902756401287253, + "grad_norm": 0.07862108945846558, + "kl": 0.029632568359375, + "learning_rate": 8.780358823396352e-06, + "loss": -0.0029, + "step": 1314 + }, + { + "clip_ratio": 0.0021328318398445845, + "completion_length": 600.6607284545898, + "epoch": 0.049064875705424185, + "grad_norm": 0.06048664078116417, + "kl": 0.01922607421875, + "learning_rate": 8.778362762828396e-06, + "loss": 0.0047, + "num_tokens": 31498635.0, + "reward": 0.8506922200322151, + "reward_std": 0.06413538195192814, + "rewards/code_reward": 0.7006922177970409, + "rewards/format_reward": 1.5, + "step": 1315 + }, + { + "clip_ratio": 0.0019935605814680457, + "epoch": 0.04910218739797584, + "grad_norm": 0.06487785279750824, + "kl": 0.01971435546875, + "learning_rate": 8.77636532672032e-06, + "loss": 0.0046, + "step": 1316 + }, + { + "clip_ratio": 0.0020358861656859517, + "epoch": 0.04913949909052749, + "grad_norm": 0.05225355178117752, + "kl": 0.019866943359375, + "learning_rate": 8.774366515910217e-06, + "loss": 0.0043, + "step": 1317 + }, + { + "clip_ratio": 0.0037815235555171967, + "completion_length": 543.7321701049805, + "epoch": 0.049176810783079145, + "grad_norm": 0.09930305927991867, + "kl": 0.017578125, + "learning_rate": 8.772366331236743e-06, + "loss": -0.0051, + "num_tokens": 31560108.0, + "reward": 0.7091877907514572, + "reward_std": 0.3167688101530075, + "rewards/code_reward": 0.5591877847909927, + "rewards/format_reward": 1.5, + "step": 1318 + }, + { + "clip_ratio": 0.003911617153789848, + "epoch": 0.0492141224756308, + "grad_norm": 0.10947313159704208, + "kl": 0.018585205078125, + "learning_rate": 8.770364773539146e-06, + "loss": -0.0052, + "step": 1319 + }, + { + "clip_ratio": 0.003982544643804431, + "epoch": 0.04925143416818245, + "grad_norm": 0.09513910114765167, + "kl": 0.0184326171875, + "learning_rate": 8.768361843657235e-06, + "loss": -0.0057, + "step": 1320 + }, + { + "clip_ratio": 0.004152832261752337, + "completion_length": 673.8036041259766, + "epoch": 0.04928874586073411, + "grad_norm": 0.07380026578903198, + "kl": 0.023834228515625, + "learning_rate": 8.766357542431409e-06, + "loss": 0.0104, + "num_tokens": 31623783.0, + "reward": 0.33805643394589424, + "reward_std": 0.23833181709051132, + "rewards/code_reward": 0.18805640935897827, + "rewards/format_reward": 1.5, + "step": 1321 + }, + { + "clip_ratio": 0.0038804521900601685, + "epoch": 0.049326057553285764, + "grad_norm": 0.0735209584236145, + "kl": 0.02423095703125, + "learning_rate": 8.764351870702632e-06, + "loss": 0.0102, + "step": 1322 + }, + { + "clip_ratio": 0.003993560501839966, + "epoch": 0.04936336924583742, + "grad_norm": 0.07123567163944244, + "kl": 0.02410888671875, + "learning_rate": 8.762344829312447e-06, + "loss": 0.0101, + "step": 1323 + }, + { + "clip_ratio": 0.003482003230601549, + "completion_length": 761.2678756713867, + "epoch": 0.04940068093838907, + "grad_norm": 0.05208456143736839, + "kl": 0.02325439453125, + "learning_rate": 8.760336419102971e-06, + "loss": 0.0022, + "num_tokens": 31700560.0, + "reward": 0.4518137462437153, + "reward_std": 0.06938888691365719, + "rewards/code_reward": 0.30181375064421445, + "rewards/format_reward": 1.5, + "step": 1324 + }, + { + "clip_ratio": 0.003321281459648162, + "epoch": 0.04943799263094072, + "grad_norm": 0.04827187582850456, + "kl": 0.023040771484375, + "learning_rate": 8.758326640916898e-06, + "loss": 0.002, + "step": 1325 + }, + { + "clip_ratio": 0.0032059389632195234, + "epoch": 0.049475304323492376, + "grad_norm": 0.0486091673374176, + "kl": 0.023712158203125, + "learning_rate": 8.756315495597489e-06, + "loss": 0.0019, + "step": 1326 + }, + { + "clip_ratio": 0.004269844968803227, + "completion_length": 664.4107437133789, + "epoch": 0.04951261601604403, + "grad_norm": 0.08502327650785446, + "kl": 0.025177001953125, + "learning_rate": 8.754302983988589e-06, + "loss": -0.0009, + "num_tokens": 31773131.0, + "reward": 0.48363472521305084, + "reward_std": 0.3418242484331131, + "rewards/code_reward": 0.33363472297787666, + "rewards/format_reward": 1.5, + "step": 1327 + }, + { + "clip_ratio": 0.004401217389386147, + "epoch": 0.04954992770859568, + "grad_norm": 0.09502480179071426, + "kl": 0.025146484375, + "learning_rate": 8.752289106934605e-06, + "loss": -0.001, + "step": 1328 + }, + { + "clip_ratio": 0.003876374918036163, + "epoch": 0.049587239401147336, + "grad_norm": 0.08124817162752151, + "kl": 0.024200439453125, + "learning_rate": 8.750273865280527e-06, + "loss": -0.0016, + "step": 1329 + }, + { + "clip_ratio": 0.004745194339193404, + "completion_length": 727.8750457763672, + "epoch": 0.04962455109369899, + "grad_norm": 0.10324031859636307, + "kl": 0.025634765625, + "learning_rate": 8.74825725987191e-06, + "loss": 0.0255, + "num_tokens": 31848220.0, + "reward": 0.5476170927286148, + "reward_std": 0.2635871134698391, + "rewards/code_reward": 0.39761707559227943, + "rewards/format_reward": 1.5, + "step": 1330 + }, + { + "clip_ratio": 0.004630443872883916, + "epoch": 0.04966186278625064, + "grad_norm": 0.099448062479496, + "kl": 0.0255126953125, + "learning_rate": 8.746239291554884e-06, + "loss": 0.0253, + "step": 1331 + }, + { + "clip_ratio": 0.004579945292789489, + "epoch": 0.049699174478802295, + "grad_norm": 0.09141512960195541, + "kl": 0.02630615234375, + "learning_rate": 8.744219961176153e-06, + "loss": 0.0245, + "step": 1332 + }, + { + "clip_ratio": 0.0029223154997453094, + "completion_length": 690.232177734375, + "epoch": 0.04973648617135395, + "grad_norm": 0.5602700114250183, + "kl": 0.022705078125, + "learning_rate": 8.74219926958299e-06, + "loss": -0.0118, + "num_tokens": 31913431.0, + "reward": 0.7496660873293877, + "reward_std": 0.22296547144651413, + "rewards/code_reward": 0.5996660515666008, + "rewards/format_reward": 1.5, + "step": 1333 + }, + { + "clip_ratio": 0.002776768116746098, + "epoch": 0.0497737978639056, + "grad_norm": 0.0626140907406807, + "kl": 0.023101806640625, + "learning_rate": 8.740177217623239e-06, + "loss": -0.0119, + "step": 1334 + }, + { + "clip_ratio": 0.0023954756325110793, + "epoch": 0.049811109556457255, + "grad_norm": 0.06205400452017784, + "kl": 0.023773193359375, + "learning_rate": 8.738153806145315e-06, + "loss": -0.0121, + "step": 1335 + }, + { + "clip_ratio": 0.004244347393978387, + "completion_length": 700.3036193847656, + "epoch": 0.04984842124900891, + "grad_norm": 0.1266721487045288, + "kl": 0.0264892578125, + "learning_rate": 8.736129035998204e-06, + "loss": 0.0009, + "num_tokens": 31990644.0, + "reward": 0.6635777354240417, + "reward_std": 0.26394644542597234, + "rewards/code_reward": 0.5135777075774968, + "rewards/format_reward": 1.5, + "step": 1336 + }, + { + "clip_ratio": 0.004205531964544207, + "epoch": 0.04988573294156056, + "grad_norm": 0.10663625597953796, + "kl": 0.02740478515625, + "learning_rate": 8.734102908031465e-06, + "loss": 0.0007, + "step": 1337 + }, + { + "clip_ratio": 0.004148676234763116, + "epoch": 0.049923044634112214, + "grad_norm": 0.10289544612169266, + "kl": 0.02978515625, + "learning_rate": 8.73207542309522e-06, + "loss": 0.0, + "step": 1338 + }, + { + "clip_ratio": 0.0030462733702734113, + "completion_length": 772.6964492797852, + "epoch": 0.04996035632666387, + "grad_norm": 0.08204079419374466, + "kl": 0.0252838134765625, + "learning_rate": 8.730046582040164e-06, + "loss": 0.0118, + "num_tokens": 32066773.0, + "reward": 0.7338607609272003, + "reward_std": 0.2874939888715744, + "rewards/code_reward": 0.5838607791811228, + "rewards/format_reward": 1.5, + "step": 1339 + }, + { + "clip_ratio": 0.0028988727717660367, + "epoch": 0.04999766801921552, + "grad_norm": 0.06832977384328842, + "kl": 0.02500152587890625, + "learning_rate": 8.728016385717561e-06, + "loss": 0.0116, + "step": 1340 + }, + { + "clip_ratio": 0.0028011160320602357, + "epoch": 0.05003497971176717, + "grad_norm": 0.05883968621492386, + "kl": 0.02802276611328125, + "learning_rate": 8.725984834979247e-06, + "loss": 0.0112, + "step": 1341 + }, + { + "clip_ratio": 0.004285758768673986, + "completion_length": 595.8393096923828, + "epoch": 0.050072291404318826, + "grad_norm": 0.10179895162582397, + "kl": 0.031951904296875, + "learning_rate": 8.72395193067762e-06, + "loss": 0.0078, + "num_tokens": 32136610.0, + "reward": 0.7585829794406891, + "reward_std": 0.32427702471613884, + "rewards/code_reward": 0.6085829483345151, + "rewards/format_reward": 1.5, + "step": 1342 + }, + { + "clip_ratio": 0.004861734691075981, + "epoch": 0.05010960309687048, + "grad_norm": 0.09987485408782959, + "kl": 0.031707763671875, + "learning_rate": 8.72191767366565e-06, + "loss": 0.0077, + "step": 1343 + }, + { + "clip_ratio": 0.004071730829309672, + "epoch": 0.05014691478942213, + "grad_norm": 0.09284502267837524, + "kl": 0.0321044921875, + "learning_rate": 8.719882064796874e-06, + "loss": 0.0067, + "step": 1344 + }, + { + "clip_ratio": 0.0027445750311017036, + "completion_length": 579.3928909301758, + "epoch": 0.050184226481973786, + "grad_norm": 0.09555502235889435, + "kl": 0.01953125, + "learning_rate": 8.717845104925393e-06, + "loss": 0.0052, + "num_tokens": 32193318.0, + "reward": 0.6874596104025841, + "reward_std": 0.14434813591651618, + "rewards/code_reward": 0.5374595823232085, + "rewards/format_reward": 1.5, + "step": 1345 + }, + { + "clip_ratio": 0.0028706109151244164, + "epoch": 0.05022153817452544, + "grad_norm": 0.08897962421178818, + "kl": 0.020538330078125, + "learning_rate": 8.715806794905881e-06, + "loss": 0.005, + "step": 1346 + }, + { + "clip_ratio": 0.002226909651653841, + "epoch": 0.05025884986707709, + "grad_norm": 0.08573922514915466, + "kl": 0.020904541015625, + "learning_rate": 8.713767135593572e-06, + "loss": 0.0044, + "step": 1347 + }, + { + "clip_ratio": 0.004050961113534868, + "completion_length": 806.2857437133789, + "epoch": 0.05029616155962875, + "grad_norm": 0.08131927251815796, + "kl": 0.01983642578125, + "learning_rate": 8.711726127844271e-06, + "loss": 0.0069, + "num_tokens": 32273820.0, + "reward": 0.501577340066433, + "reward_std": 0.42637383192777634, + "rewards/code_reward": 0.35425589978694916, + "rewards/format_reward": 1.4732142984867096, + "step": 1348 + }, + { + "clip_ratio": 0.0039030732586979866, + "epoch": 0.050333473252180405, + "grad_norm": 0.08295632153749466, + "kl": 0.019744873046875, + "learning_rate": 8.709683772514345e-06, + "loss": 0.0069, + "step": 1349 + }, + { + "clip_ratio": 0.004128038708586246, + "epoch": 0.05037078494473206, + "grad_norm": 0.07648130506277084, + "kl": 0.020416259765625, + "learning_rate": 8.707640070460733e-06, + "loss": 0.0065, + "step": 1350 + }, + { + "clip_ratio": 0.004730401386041194, + "completion_length": 702.1071701049805, + "epoch": 0.05040809663728371, + "grad_norm": 493.7107238769531, + "kl": 162.01953125, + "learning_rate": 8.705595022540929e-06, + "loss": 1.659, + "num_tokens": 32358358.0, + "reward": 0.7484559789299965, + "reward_std": 0.2805977687239647, + "rewards/code_reward": 0.6011345200240612, + "rewards/format_reward": 1.4732142984867096, + "step": 1351 + }, + { + "clip_ratio": 0.007603070873301476, + "epoch": 0.050445408329835364, + "grad_norm": 0.14509010314941406, + "kl": 0.086181640625, + "learning_rate": 8.703548629612998e-06, + "loss": 0.0402, + "step": 1352 + }, + { + "clip_ratio": 0.009110820712521672, + "epoch": 0.05048272002238702, + "grad_norm": 0.16616888344287872, + "kl": 0.057708740234375, + "learning_rate": 8.701500892535572e-06, + "loss": 0.0406, + "step": 1353 + }, + { + "clip_ratio": 0.0045948210172355175, + "completion_length": 607.3035888671875, + "epoch": 0.05052003171493867, + "grad_norm": 0.0779397189617157, + "kl": 0.023834228515625, + "learning_rate": 8.699451812167843e-06, + "loss": 0.0473, + "num_tokens": 32422341.0, + "reward": 0.4460070915520191, + "reward_std": 0.1609856216236949, + "rewards/code_reward": 0.2986856438219547, + "rewards/format_reward": 1.4732142984867096, + "step": 1354 + }, + { + "clip_ratio": 0.004973937291651964, + "epoch": 0.050557343407490324, + "grad_norm": 0.07982312142848969, + "kl": 0.024078369140625, + "learning_rate": 8.697401389369562e-06, + "loss": 0.0472, + "step": 1355 + }, + { + "clip_ratio": 0.004980003810487688, + "epoch": 0.05059465510004198, + "grad_norm": 0.07394333183765411, + "kl": 0.02496337890625, + "learning_rate": 8.695349625001051e-06, + "loss": 0.0469, + "step": 1356 + }, + { + "clip_ratio": 0.0029286390054039657, + "completion_length": 804.7857666015625, + "epoch": 0.05063196679259363, + "grad_norm": 0.08450876176357269, + "kl": 0.0180206298828125, + "learning_rate": 8.693296519923195e-06, + "loss": -0.0118, + "num_tokens": 32500127.0, + "reward": 0.48800773546099663, + "reward_std": 0.11968510411679745, + "rewards/code_reward": 0.33800768747460097, + "rewards/format_reward": 1.5, + "step": 1357 + }, + { + "clip_ratio": 0.003670817764941603, + "epoch": 0.05066927848514528, + "grad_norm": 0.09483238309621811, + "kl": 0.0174560546875, + "learning_rate": 8.691242074997436e-06, + "loss": -0.0119, + "step": 1358 + }, + { + "clip_ratio": 0.00404392828932032, + "epoch": 0.050706590177696936, + "grad_norm": 0.07335592806339264, + "kl": 0.017822265625, + "learning_rate": 8.68918629108578e-06, + "loss": -0.0123, + "step": 1359 + }, + { + "clip_ratio": 0.003990375087596476, + "completion_length": 587.6964645385742, + "epoch": 0.05074390187024859, + "grad_norm": 0.06091761961579323, + "kl": 0.019866943359375, + "learning_rate": 8.6871291690508e-06, + "loss": 0.0093, + "num_tokens": 32558826.0, + "reward": 0.6127976402640343, + "reward_std": 0.07421735674142838, + "rewards/code_reward": 0.462797611951828, + "rewards/format_reward": 1.5, + "step": 1360 + }, + { + "clip_ratio": 0.0037545639788731933, + "epoch": 0.05078121356280024, + "grad_norm": 0.059675708413124084, + "kl": 0.01953125, + "learning_rate": 8.685070709755622e-06, + "loss": 0.0092, + "step": 1361 + }, + { + "clip_ratio": 0.004267153097316623, + "epoch": 0.050818525255351896, + "grad_norm": 0.05858581140637398, + "kl": 0.0193328857421875, + "learning_rate": 8.683010914063938e-06, + "loss": 0.0089, + "step": 1362 + }, + { + "clip_ratio": 0.003909967024810612, + "completion_length": 667.2143211364746, + "epoch": 0.05085583694790355, + "grad_norm": 0.07782197743654251, + "kl": 0.023406982421875, + "learning_rate": 8.680949782840002e-06, + "loss": 0.0172, + "num_tokens": 32631596.0, + "reward": 0.6112843677401543, + "reward_std": 0.2077542580664158, + "rewards/code_reward": 0.4639629237353802, + "rewards/format_reward": 1.4732142984867096, + "step": 1363 + }, + { + "clip_ratio": 0.003339109825901687, + "epoch": 0.0508931486404552, + "grad_norm": 0.0869554802775383, + "kl": 0.023834228515625, + "learning_rate": 8.678887316948625e-06, + "loss": 0.0169, + "step": 1364 + }, + { + "clip_ratio": 0.003448780858889222, + "epoch": 0.050930460333006855, + "grad_norm": 0.07490275800228119, + "kl": 0.02362060546875, + "learning_rate": 8.676823517255178e-06, + "loss": 0.0165, + "step": 1365 + }, + { + "clip_ratio": 0.0051070344634354115, + "completion_length": 747.2143096923828, + "epoch": 0.05096777202555851, + "grad_norm": 0.07890152931213379, + "kl": 0.02386474609375, + "learning_rate": 8.674758384625597e-06, + "loss": 0.02, + "num_tokens": 32705780.0, + "reward": 0.3674503155052662, + "reward_std": 0.17377452389337122, + "rewards/code_reward": 0.22012887569144368, + "rewards/format_reward": 1.4732142984867096, + "step": 1366 + }, + { + "clip_ratio": 0.004736238741315901, + "epoch": 0.05100508371811016, + "grad_norm": 0.081419438123703, + "kl": 0.023590087890625, + "learning_rate": 8.67269191992637e-06, + "loss": 0.0197, + "step": 1367 + }, + { + "clip_ratio": 0.0041175009100697935, + "epoch": 0.051042395410661814, + "grad_norm": 0.0803910568356514, + "kl": 0.022796630859375, + "learning_rate": 8.670624124024546e-06, + "loss": 0.0193, + "step": 1368 + }, + { + "clip_ratio": 0.004736801725812256, + "completion_length": 637.910758972168, + "epoch": 0.05107970710321347, + "grad_norm": 0.09997569024562836, + "kl": 0.0204010009765625, + "learning_rate": 8.668554997787737e-06, + "loss": 0.0028, + "num_tokens": 32773115.0, + "reward": 0.561852652579546, + "reward_std": 0.27163419127464294, + "rewards/code_reward": 0.4118526428937912, + "rewards/format_reward": 1.5, + "step": 1369 + }, + { + "clip_ratio": 0.005053276137914509, + "epoch": 0.05111701879576512, + "grad_norm": 0.09518654644489288, + "kl": 0.0201416015625, + "learning_rate": 8.666484542084109e-06, + "loss": 0.0024, + "step": 1370 + }, + { + "clip_ratio": 0.005510588642209768, + "epoch": 0.051154330488316774, + "grad_norm": 0.08755352348089218, + "kl": 0.020416259765625, + "learning_rate": 8.664412757782387e-06, + "loss": 0.002, + "step": 1371 + }, + { + "clip_ratio": 0.004208258294966072, + "completion_length": 695.4107437133789, + "epoch": 0.05119164218086843, + "grad_norm": 0.1065686047077179, + "kl": 0.02459716796875, + "learning_rate": 8.66233964575185e-06, + "loss": -0.0025, + "num_tokens": 32845000.0, + "reward": 0.7161744982004166, + "reward_std": 0.22219368629157543, + "rewards/code_reward": 0.5688530467450619, + "rewards/format_reward": 1.4732142984867096, + "step": 1372 + }, + { + "clip_ratio": 0.005214069620706141, + "epoch": 0.05122895387342008, + "grad_norm": 0.09108000248670578, + "kl": 0.024932861328125, + "learning_rate": 8.660265206862342e-06, + "loss": -0.0028, + "step": 1373 + }, + { + "clip_ratio": 0.0042024513822980225, + "epoch": 0.05126626556597173, + "grad_norm": 0.08727757632732391, + "kl": 0.02484130859375, + "learning_rate": 8.658189441984258e-06, + "loss": -0.0033, + "step": 1374 + }, + { + "clip_ratio": 0.00503397494321689, + "completion_length": 894.0000305175781, + "epoch": 0.05130357725852339, + "grad_norm": 0.07045380026102066, + "kl": 0.0320892333984375, + "learning_rate": 8.656112351988547e-06, + "loss": 0.0237, + "num_tokens": 32943672.0, + "reward": 0.31026411429047585, + "reward_std": 0.09721151180565357, + "rewards/code_reward": 0.16026411252096295, + "rewards/format_reward": 1.5, + "step": 1375 + }, + { + "clip_ratio": 0.005176721548195928, + "epoch": 0.051340888951075046, + "grad_norm": 0.06736547499895096, + "kl": 0.0323486328125, + "learning_rate": 8.65403393774672e-06, + "loss": 0.0236, + "step": 1376 + }, + { + "clip_ratio": 0.004811759397853166, + "epoch": 0.0513782006436267, + "grad_norm": 0.06417091935873032, + "kl": 0.032135009765625, + "learning_rate": 8.651954200130841e-06, + "loss": 0.0234, + "step": 1377 + }, + { + "clip_ratio": 0.0018713170429691672, + "completion_length": 659.9286041259766, + "epoch": 0.05141551233617835, + "grad_norm": 0.050085678696632385, + "kl": 0.02655029296875, + "learning_rate": 8.64987314001353e-06, + "loss": 0.0449, + "num_tokens": 33008516.0, + "reward": 0.7425595335662365, + "reward_std": 0.104311085306108, + "rewards/code_reward": 0.5952380895614624, + "rewards/format_reward": 1.4732142984867096, + "step": 1378 + }, + { + "clip_ratio": 0.0019596212077885866, + "epoch": 0.051452824028730006, + "grad_norm": 0.04801948741078377, + "kl": 0.02655029296875, + "learning_rate": 8.64779075826796e-06, + "loss": 0.0448, + "step": 1379 + }, + { + "clip_ratio": 0.0019259793334640563, + "epoch": 0.05149013572128166, + "grad_norm": 0.04687933251261711, + "kl": 0.02691650390625, + "learning_rate": 8.64570705576786e-06, + "loss": 0.0446, + "step": 1380 + }, + { + "clip_ratio": 0.005248819710686803, + "completion_length": 673.0178833007812, + "epoch": 0.05152744741383331, + "grad_norm": 0.06504382193088531, + "kl": 0.0240478515625, + "learning_rate": 8.643622033387513e-06, + "loss": 0.005, + "num_tokens": 33080589.0, + "reward": 0.393281489610672, + "reward_std": 0.1605914980173111, + "rewards/code_reward": 0.24328147247433662, + "rewards/format_reward": 1.5, + "step": 1381 + }, + { + "clip_ratio": 0.004934556549414992, + "epoch": 0.051564759106384965, + "grad_norm": 0.058273013681173325, + "kl": 0.024139404296875, + "learning_rate": 8.641535692001753e-06, + "loss": 0.0049, + "step": 1382 + }, + { + "clip_ratio": 0.005173743469640613, + "epoch": 0.05160207079893662, + "grad_norm": 0.050488825887441635, + "kl": 0.02386474609375, + "learning_rate": 8.639448032485974e-06, + "loss": 0.0046, + "step": 1383 + }, + { + "clip_ratio": 0.004667554632760584, + "completion_length": 785.7143249511719, + "epoch": 0.05163938249148827, + "grad_norm": 0.07681185752153397, + "kl": 0.0217742919921875, + "learning_rate": 8.637359055716119e-06, + "loss": -0.006, + "num_tokens": 33152379.0, + "reward": 0.4186767227947712, + "reward_std": 0.2848752960562706, + "rewards/code_reward": 0.2686766944825649, + "rewards/format_reward": 1.5, + "step": 1384 + }, + { + "clip_ratio": 0.004192136810161173, + "epoch": 0.051676694184039924, + "grad_norm": 0.07256465405225754, + "kl": 0.0220947265625, + "learning_rate": 8.63526876256868e-06, + "loss": -0.0063, + "step": 1385 + }, + { + "clip_ratio": 0.004234466934576631, + "epoch": 0.05171400587659158, + "grad_norm": 0.06804154068231583, + "kl": 0.022125244140625, + "learning_rate": 8.633177153920707e-06, + "loss": -0.0065, + "step": 1386 + }, + { + "clip_ratio": 0.00487446750048548, + "completion_length": 708.7500305175781, + "epoch": 0.05175131756914323, + "grad_norm": 0.0790882557630539, + "kl": 0.021575927734375, + "learning_rate": 8.631084230649805e-06, + "loss": -0.001, + "num_tokens": 33227811.0, + "reward": 0.42404618486762047, + "reward_std": 0.24051284790039062, + "rewards/code_reward": 0.27404618076980114, + "rewards/format_reward": 1.5, + "step": 1387 + }, + { + "clip_ratio": 0.004965002823155373, + "epoch": 0.051788629261694884, + "grad_norm": 0.07488776743412018, + "kl": 0.0212860107421875, + "learning_rate": 8.628989993634118e-06, + "loss": -0.0013, + "step": 1388 + }, + { + "clip_ratio": 0.004129113571252674, + "epoch": 0.05182594095424654, + "grad_norm": 0.0746285691857338, + "kl": 0.0213775634765625, + "learning_rate": 8.626894443752352e-06, + "loss": -0.0017, + "step": 1389 + }, + { + "clip_ratio": 0.0025855842977762222, + "completion_length": 548.9285888671875, + "epoch": 0.05186325264679819, + "grad_norm": 0.06288418918848038, + "kl": 0.025909423828125, + "learning_rate": 8.624797581883763e-06, + "loss": 0.0087, + "num_tokens": 33283821.0, + "reward": 0.8041262030601501, + "reward_std": 0.18561450019478798, + "rewards/code_reward": 0.6541262120008469, + "rewards/format_reward": 1.5, + "step": 1390 + }, + { + "clip_ratio": 0.0022104326053522527, + "epoch": 0.05190056433934984, + "grad_norm": 0.06165161728858948, + "kl": 0.0264892578125, + "learning_rate": 8.622699408908154e-06, + "loss": 0.0084, + "step": 1391 + }, + { + "clip_ratio": 0.0018450032803229988, + "epoch": 0.051937876031901496, + "grad_norm": 0.0596005916595459, + "kl": 0.025726318359375, + "learning_rate": 8.620599925705878e-06, + "loss": 0.0081, + "step": 1392 + }, + { + "clip_ratio": 0.0030903767910785973, + "completion_length": 873.7678985595703, + "epoch": 0.05197518772445315, + "grad_norm": 0.05850737914443016, + "kl": 0.0239410400390625, + "learning_rate": 8.618499133157841e-06, + "loss": 0.004, + "num_tokens": 33377524.0, + "reward": 0.5973405726253986, + "reward_std": 0.10627019964158535, + "rewards/code_reward": 0.44734057504683733, + "rewards/format_reward": 1.5, + "step": 1393 + }, + { + "clip_ratio": 0.0028823852771893144, + "epoch": 0.0520124994170048, + "grad_norm": 0.06557387858629227, + "kl": 0.0246429443359375, + "learning_rate": 8.616397032145494e-06, + "loss": 0.0041, + "step": 1394 + }, + { + "clip_ratio": 0.002640288439579308, + "epoch": 0.052049811109556456, + "grad_norm": 0.0530531220138073, + "kl": 0.02410888671875, + "learning_rate": 8.614293623550842e-06, + "loss": 0.0038, + "step": 1395 + }, + { + "clip_ratio": 0.0034946976811625063, + "completion_length": 589.4107360839844, + "epoch": 0.05208712280210811, + "grad_norm": 0.06404884159564972, + "kl": 0.0184478759765625, + "learning_rate": 8.612188908256435e-06, + "loss": 0.0111, + "num_tokens": 33431839.0, + "reward": 0.6102272793650627, + "reward_std": 0.09059525467455387, + "rewards/code_reward": 0.46022728085517883, + "rewards/format_reward": 1.5, + "step": 1396 + }, + { + "clip_ratio": 0.0031500495388172567, + "epoch": 0.05212443449465976, + "grad_norm": 0.06306402385234833, + "kl": 0.0175933837890625, + "learning_rate": 8.610082887145375e-06, + "loss": 0.0109, + "step": 1397 + }, + { + "clip_ratio": 0.0035092317848466337, + "epoch": 0.052161746187211415, + "grad_norm": 0.058711450546979904, + "kl": 0.0186004638671875, + "learning_rate": 8.607975561101305e-06, + "loss": 0.0106, + "step": 1398 + }, + { + "clip_ratio": 0.003497640718705952, + "completion_length": 598.8393020629883, + "epoch": 0.05219905787976307, + "grad_norm": 0.07692062854766846, + "kl": 0.025848388671875, + "learning_rate": 8.605866931008423e-06, + "loss": 0.0038, + "num_tokens": 33504000.0, + "reward": 0.7315962426364422, + "reward_std": 0.20819789171218872, + "rewards/code_reward": 0.5815962450578809, + "rewards/format_reward": 1.5, + "step": 1399 + }, + { + "clip_ratio": 0.0036694867303594947, + "epoch": 0.05223636957231472, + "grad_norm": 0.07739820331335068, + "kl": 0.0253753662109375, + "learning_rate": 8.60375699775147e-06, + "loss": 0.0036, + "step": 1400 + }, + { + "clip_ratio": 0.0038159539690241218, + "epoch": 0.052273681264866374, + "grad_norm": 0.07447691261768341, + "kl": 0.024810791015625, + "learning_rate": 8.601645762215734e-06, + "loss": 0.0033, + "step": 1401 + }, + { + "clip_ratio": 0.0035731818643398583, + "completion_length": 586.4643096923828, + "epoch": 0.052310992957418034, + "grad_norm": 0.09137970209121704, + "kl": 0.0309906005859375, + "learning_rate": 8.599533225287052e-06, + "loss": 0.003, + "num_tokens": 33572262.0, + "reward": 0.7058761678636074, + "reward_std": 0.1222756989300251, + "rewards/code_reward": 0.5558761581778526, + "rewards/format_reward": 1.5, + "step": 1402 + }, + { + "clip_ratio": 0.003629576589446515, + "epoch": 0.05234830464996969, + "grad_norm": 0.07098503410816193, + "kl": 0.0306243896484375, + "learning_rate": 8.597419387851803e-06, + "loss": 0.0027, + "step": 1403 + }, + { + "clip_ratio": 0.004041228035930544, + "epoch": 0.05238561634252134, + "grad_norm": 0.07451147586107254, + "kl": 0.0302276611328125, + "learning_rate": 8.595304250796916e-06, + "loss": 0.0025, + "step": 1404 + }, + { + "clip_ratio": 0.002392880036495626, + "completion_length": 505.2500305175781, + "epoch": 0.052422928035072994, + "grad_norm": 0.07744338363409042, + "kl": 0.01531982421875, + "learning_rate": 8.593187815009863e-06, + "loss": 0.0235, + "num_tokens": 33625070.0, + "reward": 0.9783090502023697, + "reward_std": 0.2454826422035694, + "rewards/code_reward": 0.8283090591430664, + "rewards/format_reward": 1.5, + "step": 1405 + }, + { + "clip_ratio": 0.002725861908402294, + "epoch": 0.05246023972762465, + "grad_norm": 0.0794452503323555, + "kl": 0.0153045654296875, + "learning_rate": 8.59107008137866e-06, + "loss": 0.0234, + "step": 1406 + }, + { + "clip_ratio": 0.0023317281156778336, + "epoch": 0.0524975514201763, + "grad_norm": 0.07574892789125443, + "kl": 0.0157318115234375, + "learning_rate": 8.588951050791868e-06, + "loss": 0.023, + "step": 1407 + }, + { + "clip_ratio": 0.0028413228574208915, + "completion_length": 717.9464645385742, + "epoch": 0.05253486311272795, + "grad_norm": 0.06316661834716797, + "kl": 0.020233154296875, + "learning_rate": 8.586830724138595e-06, + "loss": 0.0513, + "num_tokens": 33696859.0, + "reward": 0.8301083147525787, + "reward_std": 0.08638249523937702, + "rewards/code_reward": 0.682786860037595, + "rewards/format_reward": 1.4732142984867096, + "step": 1408 + }, + { + "clip_ratio": 0.0025986716500483453, + "epoch": 0.052572174805279606, + "grad_norm": 0.06132559850811958, + "kl": 0.02056884765625, + "learning_rate": 8.58470910230849e-06, + "loss": 0.0509, + "step": 1409 + }, + { + "clip_ratio": 0.0024230367271229625, + "epoch": 0.05260948649783126, + "grad_norm": 0.06536705046892166, + "kl": 0.020294189453125, + "learning_rate": 8.582586186191747e-06, + "loss": 0.0509, + "step": 1410 + }, + { + "clip_ratio": 0.003034022229257971, + "completion_length": 716.9821929931641, + "epoch": 0.05264679819038291, + "grad_norm": 0.0598711334168911, + "kl": 0.0248870849609375, + "learning_rate": 8.5804619766791e-06, + "loss": 0.0018, + "num_tokens": 33768454.0, + "reward": 0.6197311691939831, + "reward_std": 0.11989659070968628, + "rewards/code_reward": 0.4697311632335186, + "rewards/format_reward": 1.5, + "step": 1411 + }, + { + "clip_ratio": 0.0029325243667699397, + "epoch": 0.052684109882934566, + "grad_norm": 0.061429694294929504, + "kl": 0.023681640625, + "learning_rate": 8.57833647466183e-06, + "loss": 0.0019, + "step": 1412 + }, + { + "clip_ratio": 0.0032922017853707075, + "epoch": 0.05272142157548622, + "grad_norm": 0.06329721957445145, + "kl": 0.0240478515625, + "learning_rate": 8.576209681031756e-06, + "loss": 0.0017, + "step": 1413 + }, + { + "clip_ratio": 0.0032916779746301472, + "completion_length": 626.8750305175781, + "epoch": 0.05275873326803787, + "grad_norm": 0.09298888593912125, + "kl": 0.042236328125, + "learning_rate": 8.574081596681241e-06, + "loss": 0.0102, + "num_tokens": 33836207.0, + "reward": 0.7327445410192013, + "reward_std": 0.15184721630066633, + "rewards/code_reward": 0.5827445185277611, + "rewards/format_reward": 1.5, + "step": 1414 + }, + { + "clip_ratio": 0.0031303323921747506, + "epoch": 0.052796044960589525, + "grad_norm": 0.0834529846906662, + "kl": 0.0411529541015625, + "learning_rate": 8.571952222503193e-06, + "loss": 0.0099, + "step": 1415 + }, + { + "clip_ratio": 0.002659466816112399, + "epoch": 0.05283335665314118, + "grad_norm": 0.1305980533361435, + "kl": 0.05023193359375, + "learning_rate": 8.569821559391054e-06, + "loss": 0.0094, + "step": 1416 + }, + { + "clip_ratio": 0.0043594075832515955, + "completion_length": 549.2500228881836, + "epoch": 0.05287066834569283, + "grad_norm": 0.07002268731594086, + "kl": 0.01690673828125, + "learning_rate": 8.567689608238814e-06, + "loss": 0.0014, + "num_tokens": 33892431.0, + "reward": 0.4359477236866951, + "reward_std": 0.24520783126354218, + "rewards/code_reward": 0.2859477177262306, + "rewards/format_reward": 1.5, + "step": 1417 + }, + { + "clip_ratio": 0.0048298349138349295, + "epoch": 0.052907980038244484, + "grad_norm": 0.06605233252048492, + "kl": 0.016998291015625, + "learning_rate": 8.565556369940999e-06, + "loss": 0.0014, + "step": 1418 + }, + { + "clip_ratio": 0.004992290632799268, + "epoch": 0.05294529173079614, + "grad_norm": 0.06324057281017303, + "kl": 0.0172119140625, + "learning_rate": 8.563421845392672e-06, + "loss": 0.0011, + "step": 1419 + }, + { + "clip_ratio": 0.00274049089057371, + "completion_length": 704.2857513427734, + "epoch": 0.05298260342334779, + "grad_norm": 0.031190795823931694, + "kl": 0.0160369873046875, + "learning_rate": 8.561286035489446e-06, + "loss": -0.006, + "num_tokens": 33963455.0, + "reward": 0.5249999985098839, + "reward_std": 0.1297186315059662, + "rewards/code_reward": 0.375, + "rewards/format_reward": 1.5, + "step": 1420 + }, + { + "clip_ratio": 0.0027353547629900277, + "epoch": 0.053019915115899444, + "grad_norm": 0.032183270901441574, + "kl": 0.016143798828125, + "learning_rate": 8.559148941127466e-06, + "loss": -0.0061, + "step": 1421 + }, + { + "clip_ratio": 0.0029449910507537425, + "epoch": 0.0530572268084511, + "grad_norm": 0.032017916440963745, + "kl": 0.01593017578125, + "learning_rate": 8.557010563203413e-06, + "loss": -0.0062, + "step": 1422 + }, + { + "clip_ratio": 0.005254663759842515, + "completion_length": 702.0536193847656, + "epoch": 0.05309453850100275, + "grad_norm": 0.04248325899243355, + "kl": 0.0240631103515625, + "learning_rate": 8.554870902614515e-06, + "loss": 0.0004, + "num_tokens": 34040808.0, + "reward": 0.36428574100136757, + "reward_std": 0.09078413993120193, + "rewards/code_reward": 0.2142857164144516, + "rewards/format_reward": 1.5, + "step": 1423 + }, + { + "clip_ratio": 0.005125192110426724, + "epoch": 0.0531318501935544, + "grad_norm": 0.04241453856229782, + "kl": 0.0237274169921875, + "learning_rate": 8.552729960258533e-06, + "loss": 0.0003, + "step": 1424 + }, + { + "clip_ratio": 0.005282876139972359, + "epoch": 0.053169161886106056, + "grad_norm": 0.03618345037102699, + "kl": 0.0237274169921875, + "learning_rate": 8.550587737033766e-06, + "loss": 0.0002, + "step": 1425 + }, + { + "clip_ratio": 0.005385028489399701, + "completion_length": 584.4643173217773, + "epoch": 0.05320647357865771, + "grad_norm": 0.10431883484125137, + "kl": 0.0255126953125, + "learning_rate": 8.548444233839049e-06, + "loss": -0.0006, + "num_tokens": 34104632.0, + "reward": 0.6119075752794743, + "reward_std": 0.14923108741641045, + "rewards/code_reward": 0.4645861145108938, + "rewards/format_reward": 1.4732142984867096, + "step": 1426 + }, + { + "clip_ratio": 0.00468732655281201, + "epoch": 0.05324378527120936, + "grad_norm": 0.09940428286790848, + "kl": 0.0255279541015625, + "learning_rate": 8.54629945157376e-06, + "loss": -0.0013, + "step": 1427 + }, + { + "clip_ratio": 0.004974550916813314, + "epoch": 0.053281096963761015, + "grad_norm": 0.07421709597110748, + "kl": 0.0273284912109375, + "learning_rate": 8.544153391137806e-06, + "loss": -0.0018, + "step": 1428 + }, + { + "clip_ratio": 0.004415580886416137, + "completion_length": 722.6071701049805, + "epoch": 0.05331840865631267, + "grad_norm": 0.08858367055654526, + "kl": 0.0233154296875, + "learning_rate": 8.542006053431637e-06, + "loss": -0.0066, + "num_tokens": 34189030.0, + "reward": 0.6888441666960716, + "reward_std": 0.23302000015974045, + "rewards/code_reward": 0.5388441383838654, + "rewards/format_reward": 1.5, + "step": 1429 + }, + { + "clip_ratio": 0.0037513942224904895, + "epoch": 0.05335572034886433, + "grad_norm": 0.08293943107128143, + "kl": 0.02325439453125, + "learning_rate": 8.539857439356234e-06, + "loss": -0.007, + "step": 1430 + }, + { + "clip_ratio": 0.0036792673636227846, + "epoch": 0.05339303204141598, + "grad_norm": 0.07564014941453934, + "kl": 0.0245361328125, + "learning_rate": 8.537707549813118e-06, + "loss": -0.0074, + "step": 1431 + }, + { + "clip_ratio": 0.0040745194419287145, + "completion_length": 829.7143249511719, + "epoch": 0.053430343733967635, + "grad_norm": 0.08131881803274155, + "kl": 0.030609130859375, + "learning_rate": 8.53555638570434e-06, + "loss": 0.0007, + "num_tokens": 34280434.0, + "reward": 0.6171234920620918, + "reward_std": 0.27174633368849754, + "rewards/code_reward": 0.46712348610162735, + "rewards/format_reward": 1.5, + "step": 1432 + }, + { + "clip_ratio": 0.0038532415637746453, + "epoch": 0.05346765542651929, + "grad_norm": 0.07118489593267441, + "kl": 0.03057861328125, + "learning_rate": 8.53340394793249e-06, + "loss": 0.0009, + "step": 1433 + }, + { + "clip_ratio": 0.003621563781052828, + "epoch": 0.05350496711907094, + "grad_norm": 0.0859898030757904, + "kl": 0.03045654296875, + "learning_rate": 8.531250237400693e-06, + "loss": 0.0005, + "step": 1434 + }, + { + "clip_ratio": 0.0036364008556120098, + "completion_length": 748.6607513427734, + "epoch": 0.053542278811622594, + "grad_norm": 0.0788608118891716, + "kl": 0.0237884521484375, + "learning_rate": 8.529095255012602e-06, + "loss": 0.0419, + "num_tokens": 34350065.0, + "reward": 0.6421021558344364, + "reward_std": 0.16682861000299454, + "rewards/code_reward": 0.4947806838899851, + "rewards/format_reward": 1.4732142984867096, + "step": 1435 + }, + { + "clip_ratio": 0.003073771600611508, + "epoch": 0.05357959050417425, + "grad_norm": 0.07685090601444244, + "kl": 0.02410888671875, + "learning_rate": 8.526939001672409e-06, + "loss": 0.0414, + "step": 1436 + }, + { + "clip_ratio": 0.0031957197934389114, + "epoch": 0.0536169021967259, + "grad_norm": 0.07786913216114044, + "kl": 0.024810791015625, + "learning_rate": 8.524781478284838e-06, + "loss": 0.0413, + "step": 1437 + }, + { + "clip_ratio": 0.0063374656601808965, + "completion_length": 381.89287185668945, + "epoch": 0.053654213889277554, + "grad_norm": 0.06641223281621933, + "kl": 0.020904541015625, + "learning_rate": 8.522622685755144e-06, + "loss": 0.0028, + "num_tokens": 34393207.0, + "reward": 0.4535714462399483, + "reward_std": 0.2348419800400734, + "rewards/code_reward": 0.3035714253783226, + "rewards/format_reward": 1.5, + "step": 1438 + }, + { + "clip_ratio": 0.005554335541091859, + "epoch": 0.05369152558182921, + "grad_norm": 0.06210204213857651, + "kl": 0.021392822265625, + "learning_rate": 8.52046262498912e-06, + "loss": 0.0025, + "step": 1439 + }, + { + "clip_ratio": 0.0061550100217573345, + "epoch": 0.05372883727438086, + "grad_norm": 0.057601042091846466, + "kl": 0.02105712890625, + "learning_rate": 8.518301296893085e-06, + "loss": 0.0024, + "step": 1440 + }, + { + "clip_ratio": 0.0031113342265598476, + "completion_length": 668.6428833007812, + "epoch": 0.05376614896693251, + "grad_norm": 0.059397559612989426, + "kl": 0.016143798828125, + "learning_rate": 8.516138702373893e-06, + "loss": 0.0008, + "num_tokens": 34457391.0, + "reward": 0.8599726110696793, + "reward_std": 0.1978992149233818, + "rewards/code_reward": 0.709972582757473, + "rewards/format_reward": 1.5, + "step": 1441 + }, + { + "clip_ratio": 0.002853106358088553, + "epoch": 0.053803460659484166, + "grad_norm": 0.05900612473487854, + "kl": 0.016143798828125, + "learning_rate": 8.513974842338929e-06, + "loss": 0.0005, + "step": 1442 + }, + { + "clip_ratio": 0.0029082027031108737, + "epoch": 0.05384077235203582, + "grad_norm": 0.055046409368515015, + "kl": 0.0168304443359375, + "learning_rate": 8.511809717696105e-06, + "loss": 0.0003, + "step": 1443 + }, + { + "clip_ratio": 0.0036738342023454607, + "completion_length": 783.4464797973633, + "epoch": 0.05387808404458747, + "grad_norm": 0.08553508669137955, + "kl": 0.023956298828125, + "learning_rate": 8.50964332935387e-06, + "loss": 0.0039, + "num_tokens": 34538630.0, + "reward": 0.3727457784116268, + "reward_std": 0.2835843393113464, + "rewards/code_reward": 0.22542433068156242, + "rewards/format_reward": 1.4732142984867096, + "step": 1444 + }, + { + "clip_ratio": 0.0037144822999835014, + "epoch": 0.053915395737139125, + "grad_norm": 0.08651585131883621, + "kl": 0.023895263671875, + "learning_rate": 8.507475678221201e-06, + "loss": 0.0038, + "step": 1445 + }, + { + "clip_ratio": 0.0033776915515773, + "epoch": 0.05395270742969078, + "grad_norm": 0.08139783143997192, + "kl": 0.024322509765625, + "learning_rate": 8.505306765207602e-06, + "loss": 0.0035, + "step": 1446 + }, + { + "clip_ratio": 0.003932184423319995, + "completion_length": 744.3928756713867, + "epoch": 0.05399001912224243, + "grad_norm": 0.1667274385690689, + "kl": 0.0190887451171875, + "learning_rate": 8.50313659122311e-06, + "loss": -0.0089, + "num_tokens": 34615372.0, + "reward": 0.4207373484969139, + "reward_std": 0.2178364247083664, + "rewards/code_reward": 0.2707373294979334, + "rewards/format_reward": 1.5, + "step": 1447 + }, + { + "clip_ratio": 0.003937184053938836, + "epoch": 0.054027330814794085, + "grad_norm": 0.06565093249082565, + "kl": 0.019805908203125, + "learning_rate": 8.50096515717829e-06, + "loss": -0.0092, + "step": 1448 + }, + { + "clip_ratio": 0.003979891713242978, + "epoch": 0.05406464250734574, + "grad_norm": 0.06382036954164505, + "kl": 0.0198974609375, + "learning_rate": 8.498792463984234e-06, + "loss": -0.0094, + "step": 1449 + }, + { + "clip_ratio": 0.004132852307520807, + "completion_length": 603.7321701049805, + "epoch": 0.05410195419989739, + "grad_norm": 0.08437658101320267, + "kl": 0.02410888671875, + "learning_rate": 8.496618512552566e-06, + "loss": 0.0001, + "num_tokens": 34684993.0, + "reward": 0.7045959495007992, + "reward_std": 0.1124102114699781, + "rewards/code_reward": 0.5545959174633026, + "rewards/format_reward": 1.5, + "step": 1450 + }, + { + "clip_ratio": 0.0038104300620034337, + "epoch": 0.054139265892449044, + "grad_norm": 0.07916023582220078, + "kl": 0.023590087890625, + "learning_rate": 8.494443303795433e-06, + "loss": -0.0002, + "step": 1451 + }, + { + "clip_ratio": 0.004616227932274342, + "epoch": 0.0541765775850007, + "grad_norm": 0.07833059132099152, + "kl": 0.024169921875, + "learning_rate": 8.492266838625512e-06, + "loss": -0.0005, + "step": 1452 + }, + { + "clip_ratio": 0.002892471617087722, + "completion_length": 528.1964645385742, + "epoch": 0.05421388927755235, + "grad_norm": 0.08305688947439194, + "kl": 0.02874755859375, + "learning_rate": 8.49008911795601e-06, + "loss": -0.0018, + "num_tokens": 34747668.0, + "reward": 0.7511904761195183, + "reward_std": 0.13509537279605865, + "rewards/code_reward": 0.6011904738843441, + "rewards/format_reward": 1.5, + "step": 1453 + }, + { + "clip_ratio": 0.0026406007818877697, + "epoch": 0.054251200970104003, + "grad_norm": 0.08454633504152298, + "kl": 0.028289794921875, + "learning_rate": 8.487910142700657e-06, + "loss": -0.002, + "step": 1454 + }, + { + "clip_ratio": 0.0026986285811290145, + "epoch": 0.05428851266265566, + "grad_norm": 0.0767415314912796, + "kl": 0.0291748046875, + "learning_rate": 8.485729913773707e-06, + "loss": -0.0024, + "step": 1455 + }, + { + "clip_ratio": 0.003182739659678191, + "completion_length": 775.0714721679688, + "epoch": 0.05432582435520731, + "grad_norm": 0.06917975842952728, + "kl": 0.024871826171875, + "learning_rate": 8.48354843208995e-06, + "loss": 0.042, + "num_tokens": 34826702.0, + "reward": 0.6796743683516979, + "reward_std": 0.1855982868000865, + "rewards/code_reward": 0.5323529541492462, + "rewards/format_reward": 1.4732142984867096, + "step": 1456 + }, + { + "clip_ratio": 0.002944020729046315, + "epoch": 0.05436313604775897, + "grad_norm": 0.062300775200128555, + "kl": 0.024688720703125, + "learning_rate": 8.481365698564691e-06, + "loss": 0.042, + "step": 1457 + }, + { + "clip_ratio": 0.003346188459545374, + "epoch": 0.05440044774031062, + "grad_norm": 0.06189906969666481, + "kl": 0.025421142578125, + "learning_rate": 8.479181714113767e-06, + "loss": 0.0418, + "step": 1458 + }, + { + "clip_ratio": 0.003317172988317907, + "completion_length": 533.107177734375, + "epoch": 0.054437759432862276, + "grad_norm": 0.03482196107506752, + "kl": 0.026580810546875, + "learning_rate": 8.476996479653536e-06, + "loss": -0.0026, + "num_tokens": 34892592.0, + "reward": 0.5682331100106239, + "reward_std": 0.11511970311403275, + "rewards/code_reward": 0.41823306679725647, + "rewards/format_reward": 1.5, + "step": 1459 + }, + { + "clip_ratio": 0.003430070704780519, + "epoch": 0.05447507112541393, + "grad_norm": 0.032941434532403946, + "kl": 0.02630615234375, + "learning_rate": 8.47480999610088e-06, + "loss": -0.0026, + "step": 1460 + }, + { + "clip_ratio": 0.0032534448546357453, + "epoch": 0.05451238281796558, + "grad_norm": 0.031646568328142166, + "kl": 0.0272216796875, + "learning_rate": 8.472622264373207e-06, + "loss": -0.0027, + "step": 1461 + }, + { + "clip_ratio": 0.0029000588692724705, + "completion_length": 584.535758972168, + "epoch": 0.054549694510517235, + "grad_norm": 0.08794906735420227, + "kl": 0.024322509765625, + "learning_rate": 8.470433285388452e-06, + "loss": -0.0176, + "num_tokens": 34960494.0, + "reward": 0.9175801873207092, + "reward_std": 0.28704098984599113, + "rewards/code_reward": 0.7702587395906448, + "rewards/format_reward": 1.4732142984867096, + "step": 1462 + }, + { + "clip_ratio": 0.002869464864488691, + "epoch": 0.05458700620306889, + "grad_norm": 0.08419640362262726, + "kl": 0.024139404296875, + "learning_rate": 8.468243060065067e-06, + "loss": -0.0176, + "step": 1463 + }, + { + "clip_ratio": 0.0024557413125876337, + "epoch": 0.05462431789562054, + "grad_norm": 0.08670420199632645, + "kl": 0.023956298828125, + "learning_rate": 8.466051589322032e-06, + "loss": -0.018, + "step": 1464 + }, + { + "clip_ratio": 0.005492831929586828, + "completion_length": 596.0714492797852, + "epoch": 0.054661629588172195, + "grad_norm": 0.08098531514406204, + "kl": 0.027130126953125, + "learning_rate": 8.463858874078847e-06, + "loss": -0.0015, + "num_tokens": 35025948.0, + "reward": 0.40804924815893173, + "reward_std": 0.22585312463343143, + "rewards/code_reward": 0.2580492157721892, + "rewards/format_reward": 1.5, + "step": 1465 + }, + { + "clip_ratio": 0.004939685575664043, + "epoch": 0.05469894128072385, + "grad_norm": 0.07177529484033585, + "kl": 0.0263671875, + "learning_rate": 8.461664915255531e-06, + "loss": -0.0017, + "step": 1466 + }, + { + "clip_ratio": 0.00512773881200701, + "epoch": 0.0547362529732755, + "grad_norm": 0.07312772423028946, + "kl": 0.02716064453125, + "learning_rate": 8.459469713772633e-06, + "loss": -0.002, + "step": 1467 + }, + { + "clip_ratio": 0.00492589600617066, + "completion_length": 665.1785888671875, + "epoch": 0.054773564665827154, + "grad_norm": 0.0026410571299493313, + "kl": 0.023406982421875, + "learning_rate": 8.457273270551218e-06, + "loss": 0.0004, + "num_tokens": 35093032.0, + "reward": 0.15000002086162567, + "reward_std": 0.0, + "rewards/code_reward": 0.0, + "rewards/format_reward": 1.5, + "step": 1468 + }, + { + "clip_ratio": 0.0050617961678653955, + "epoch": 0.05481087635837881, + "grad_norm": 0.002584310946986079, + "kl": 0.0230255126953125, + "learning_rate": 8.45507558651287e-06, + "loss": 0.0004, + "step": 1469 + }, + { + "clip_ratio": 0.004851850564591587, + "epoch": 0.05484818805093046, + "grad_norm": 0.0027088613715022802, + "kl": 0.02349853515625, + "learning_rate": 8.4528766625797e-06, + "loss": 0.0004, + "step": 1470 + }, + { + "clip_ratio": 0.003340436320286244, + "completion_length": 702.6428985595703, + "epoch": 0.05488549974348211, + "grad_norm": 0.051533278077840805, + "kl": 0.03692626953125, + "learning_rate": 8.45067649967433e-06, + "loss": -0.0048, + "num_tokens": 35164426.0, + "reward": 0.6226741187274456, + "reward_std": 0.09718158841133118, + "rewards/code_reward": 0.4746383838355541, + "rewards/format_reward": 1.480357140302658, + "step": 1471 + }, + { + "clip_ratio": 0.0032603295985609293, + "epoch": 0.05492281143603377, + "grad_norm": 0.052760545164346695, + "kl": 0.034820556640625, + "learning_rate": 8.448475098719915e-06, + "loss": -0.0048, + "step": 1472 + }, + { + "clip_ratio": 0.0037760368431918323, + "epoch": 0.05496012312858542, + "grad_norm": 0.04821248725056648, + "kl": 0.031005859375, + "learning_rate": 8.446272460640116e-06, + "loss": -0.0051, + "step": 1473 + }, + { + "clip_ratio": 0.004733344889245927, + "completion_length": 535.5714492797852, + "epoch": 0.05499743482113707, + "grad_norm": 0.06995493918657303, + "kl": 0.028289794921875, + "learning_rate": 8.444068586359122e-06, + "loss": -0.0031, + "num_tokens": 35223076.0, + "reward": 0.6183180138468742, + "reward_std": 0.03981872275471687, + "rewards/code_reward": 0.4683179706335068, + "rewards/format_reward": 1.5, + "step": 1474 + }, + { + "clip_ratio": 0.004668325302191079, + "epoch": 0.055034746513688726, + "grad_norm": 0.07999233901500702, + "kl": 0.02874755859375, + "learning_rate": 8.441863476801638e-06, + "loss": -0.0032, + "step": 1475 + }, + { + "clip_ratio": 0.004942800034768879, + "epoch": 0.05507205820624038, + "grad_norm": 0.04726424440741539, + "kl": 0.0274658203125, + "learning_rate": 8.439657132892883e-06, + "loss": -0.0034, + "step": 1476 + }, + { + "clip_ratio": 0.0023679876467213035, + "completion_length": 560.7678756713867, + "epoch": 0.05510936989879203, + "grad_norm": 0.07531487941741943, + "kl": 0.029144287109375, + "learning_rate": 8.437449555558604e-06, + "loss": 0.0019, + "num_tokens": 35280435.0, + "reward": 0.9975188076496124, + "reward_std": 0.17411112412810326, + "rewards/code_reward": 0.8475187867879868, + "rewards/format_reward": 1.5, + "step": 1477 + }, + { + "clip_ratio": 0.0019182113464921713, + "epoch": 0.055146681591343685, + "grad_norm": 0.07601532340049744, + "kl": 0.028472900390625, + "learning_rate": 8.435240745725055e-06, + "loss": 0.0016, + "step": 1478 + }, + { + "clip_ratio": 0.0021269842400215566, + "epoch": 0.05518399328389534, + "grad_norm": 0.0707043930888176, + "kl": 0.02850341796875, + "learning_rate": 8.433030704319011e-06, + "loss": 0.0013, + "step": 1479 + }, + { + "clip_ratio": 0.003001337347086519, + "completion_length": 525.5000305175781, + "epoch": 0.05522130497644699, + "grad_norm": 0.09750996530056, + "kl": 0.0252685546875, + "learning_rate": 8.43081943226777e-06, + "loss": 0.0025, + "num_tokens": 35336715.0, + "reward": 0.8436572030186653, + "reward_std": 0.32707415521144867, + "rewards/code_reward": 0.6956214755773544, + "rewards/format_reward": 1.480357140302658, + "step": 1480 + }, + { + "clip_ratio": 0.0029284472693689167, + "epoch": 0.055258616668998645, + "grad_norm": 0.08607978373765945, + "kl": 0.02569580078125, + "learning_rate": 8.428606930499134e-06, + "loss": 0.0022, + "step": 1481 + }, + { + "clip_ratio": 0.003107498516328633, + "epoch": 0.0552959283615503, + "grad_norm": 0.07936866581439972, + "kl": 0.02655029296875, + "learning_rate": 8.426393199941431e-06, + "loss": 0.0019, + "step": 1482 + }, + { + "clip_ratio": 0.0028830141527578235, + "completion_length": 582.7857360839844, + "epoch": 0.05533324005410195, + "grad_norm": 0.0432063452899456, + "kl": 0.0283203125, + "learning_rate": 8.424178241523503e-06, + "loss": -0.0002, + "num_tokens": 35409489.0, + "reward": 0.7524159662425518, + "reward_std": 0.08215596526861191, + "rewards/code_reward": 0.6024159714579582, + "rewards/format_reward": 1.5, + "step": 1483 + }, + { + "clip_ratio": 0.0026865870459005237, + "epoch": 0.05537055174665361, + "grad_norm": 0.04318898171186447, + "kl": 0.028411865234375, + "learning_rate": 8.421962056174701e-06, + "loss": -0.0003, + "step": 1484 + }, + { + "clip_ratio": 0.002769493614323437, + "epoch": 0.055407863439205264, + "grad_norm": 0.04203105717897415, + "kl": 0.0289306640625, + "learning_rate": 8.419744644824899e-06, + "loss": -0.0003, + "step": 1485 + }, + { + "clip_ratio": 0.0025223174598068, + "completion_length": 642.9643249511719, + "epoch": 0.05544517513175692, + "grad_norm": 0.06739164888858795, + "kl": 0.0150146484375, + "learning_rate": 8.417526008404477e-06, + "loss": -0.0046, + "num_tokens": 35480509.0, + "reward": 0.9295177608728409, + "reward_std": 0.18886446207761765, + "rewards/code_reward": 0.7795177474617958, + "rewards/format_reward": 1.5, + "step": 1486 + }, + { + "clip_ratio": 0.0021810946927871555, + "epoch": 0.05548248682430857, + "grad_norm": 0.0643877163529396, + "kl": 0.015655517578125, + "learning_rate": 8.41530614784434e-06, + "loss": -0.005, + "step": 1487 + }, + { + "clip_ratio": 0.0023820045171305537, + "epoch": 0.05551979851686022, + "grad_norm": 0.06180739402770996, + "kl": 0.015533447265625, + "learning_rate": 8.413085064075894e-06, + "loss": -0.0053, + "step": 1488 + }, + { + "clip_ratio": 0.006448949163313955, + "completion_length": 652.2143249511719, + "epoch": 0.05555711020941188, + "grad_norm": 0.09830169379711151, + "kl": 0.0357666015625, + "learning_rate": 8.41086275803107e-06, + "loss": 0.0067, + "num_tokens": 35560391.0, + "reward": 0.508357223123312, + "reward_std": 0.24143656343221664, + "rewards/code_reward": 0.35835722275078297, + "rewards/format_reward": 1.5, + "step": 1489 + }, + { + "clip_ratio": 0.006130315421614796, + "epoch": 0.05559442190196353, + "grad_norm": 0.08352508395910263, + "kl": 0.035186767578125, + "learning_rate": 8.408639230642302e-06, + "loss": 0.0066, + "step": 1490 + }, + { + "clip_ratio": 0.00586556748021394, + "epoch": 0.05563173359451518, + "grad_norm": 0.07389990240335464, + "kl": 0.035980224609375, + "learning_rate": 8.406414482842542e-06, + "loss": 0.006, + "step": 1491 + }, + { + "clip_ratio": 0.004843690316192806, + "completion_length": 748.4286041259766, + "epoch": 0.055669045287066836, + "grad_norm": 0.0960581973195076, + "kl": 0.023223876953125, + "learning_rate": 8.404188515565254e-06, + "loss": 0.0011, + "num_tokens": 35632305.0, + "reward": 0.424981739372015, + "reward_std": 0.28874703496694565, + "rewards/code_reward": 0.2749817334115505, + "rewards/format_reward": 1.5, + "step": 1492 + }, + { + "clip_ratio": 0.004352173942606896, + "epoch": 0.05570635697961849, + "grad_norm": 0.0742095559835434, + "kl": 0.023162841796875, + "learning_rate": 8.40196132974441e-06, + "loss": 0.0009, + "step": 1493 + }, + { + "clip_ratio": 0.004437928379047662, + "epoch": 0.05574366867217014, + "grad_norm": 0.07318661361932755, + "kl": 0.022735595703125, + "learning_rate": 8.3997329263145e-06, + "loss": 0.0004, + "step": 1494 + }, + { + "clip_ratio": 0.003717269340995699, + "completion_length": 650.4821624755859, + "epoch": 0.055780980364721795, + "grad_norm": 0.08326311409473419, + "kl": 0.022918701171875, + "learning_rate": 8.397503306210519e-06, + "loss": 0.0562, + "num_tokens": 35695738.0, + "reward": 0.8313408195972443, + "reward_std": 0.21983184479176998, + "rewards/code_reward": 0.6840193644165993, + "rewards/format_reward": 1.4732142984867096, + "step": 1495 + }, + { + "clip_ratio": 0.0036918598343618214, + "epoch": 0.05581829205727345, + "grad_norm": 0.08121925592422485, + "kl": 0.02264404296875, + "learning_rate": 8.395272470367972e-06, + "loss": 0.0562, + "step": 1496 + }, + { + "clip_ratio": 0.003404432034585625, + "epoch": 0.0558556037498251, + "grad_norm": 0.07633746415376663, + "kl": 0.02288818359375, + "learning_rate": 8.39304041972288e-06, + "loss": 0.0556, + "step": 1497 + }, + { + "clip_ratio": 0.0029203309095464647, + "completion_length": 540.6250228881836, + "epoch": 0.055892915442376755, + "grad_norm": 0.08720911294221878, + "kl": 0.029571533203125, + "learning_rate": 8.390807155211766e-06, + "loss": -0.0027, + "num_tokens": 35753607.0, + "reward": 0.9725585877895355, + "reward_std": 0.15267880074679852, + "rewards/code_reward": 0.824522852897644, + "rewards/format_reward": 1.480357140302658, + "step": 1498 + }, + { + "clip_ratio": 0.0028298062388785183, + "epoch": 0.05593022713492841, + "grad_norm": 0.07457781583070755, + "kl": 0.031768798828125, + "learning_rate": 8.388572677771668e-06, + "loss": -0.0028, + "step": 1499 + }, + { + "clip_ratio": 0.0022362328018061817, + "epoch": 0.05596753882748006, + "grad_norm": 0.07147800177335739, + "kl": 0.032318115234375, + "learning_rate": 8.38633698834013e-06, + "loss": -0.0036, + "step": 1500 + }, + { + "clip_ratio": 0.004817435401491821, + "completion_length": 795.0178985595703, + "epoch": 0.056004850520031714, + "grad_norm": 0.09378264099359512, + "kl": 0.0255126953125, + "learning_rate": 8.38410008785521e-06, + "loss": 0.0117, + "num_tokens": 35830216.0, + "reward": 0.3409876674413681, + "reward_std": 0.21685681492090225, + "rewards/code_reward": 0.19366621226072311, + "rewards/format_reward": 1.4732142984867096, + "step": 1501 + }, + { + "clip_ratio": 0.004449062398634851, + "epoch": 0.05604216221258337, + "grad_norm": 0.09114145487546921, + "kl": 0.025238037109375, + "learning_rate": 8.381861977255464e-06, + "loss": 0.0117, + "step": 1502 + }, + { + "clip_ratio": 0.00507266353815794, + "epoch": 0.05607947390513502, + "grad_norm": 0.08040880411863327, + "kl": 0.025177001953125, + "learning_rate": 8.379622657479966e-06, + "loss": 0.011, + "step": 1503 + }, + { + "clip_ratio": 0.004086273955181241, + "completion_length": 576.5714645385742, + "epoch": 0.05611678559768667, + "grad_norm": 0.03875662013888359, + "kl": 0.026885986328125, + "learning_rate": 8.377382129468293e-06, + "loss": 0.0016, + "num_tokens": 35894746.0, + "reward": 0.4535714387893677, + "reward_std": 0.10645382851362228, + "rewards/code_reward": 0.3035714291036129, + "rewards/format_reward": 1.5, + "step": 1504 + }, + { + "clip_ratio": 0.004110003123059869, + "epoch": 0.056154097290238326, + "grad_norm": 0.03828658536076546, + "kl": 0.026580810546875, + "learning_rate": 8.375140394160526e-06, + "loss": 0.0016, + "step": 1505 + }, + { + "clip_ratio": 0.004346172616351396, + "epoch": 0.05619140898278998, + "grad_norm": 0.03375622257590294, + "kl": 0.0260009765625, + "learning_rate": 8.372897452497256e-06, + "loss": 0.0015, + "step": 1506 + }, + { + "clip_ratio": 0.0029467453132383525, + "completion_length": 671.8750381469727, + "epoch": 0.05622872067534163, + "grad_norm": 0.0754072517156601, + "kl": 0.0178070068359375, + "learning_rate": 8.370653305419582e-06, + "loss": 0.0101, + "num_tokens": 35963311.0, + "reward": 0.5999299809336662, + "reward_std": 0.26021040976047516, + "rewards/code_reward": 0.44992997497320175, + "rewards/format_reward": 1.5, + "step": 1507 + }, + { + "clip_ratio": 0.002471093612257391, + "epoch": 0.056266032367893286, + "grad_norm": 0.3558266758918762, + "kl": 0.0180816650390625, + "learning_rate": 8.368407953869105e-06, + "loss": 0.0098, + "step": 1508 + }, + { + "clip_ratio": 0.002486397512257099, + "epoch": 0.05630334406044494, + "grad_norm": 0.0694352388381958, + "kl": 0.0181121826171875, + "learning_rate": 8.366161398787932e-06, + "loss": 0.0096, + "step": 1509 + }, + { + "clip_ratio": 0.0015509093645960093, + "completion_length": 791.8393249511719, + "epoch": 0.05634065575299659, + "grad_norm": 0.03475528582930565, + "kl": 0.016876220703125, + "learning_rate": 8.363913641118677e-06, + "loss": 0.0257, + "num_tokens": 36043480.0, + "reward": 0.6505304053425789, + "reward_std": 0.001053998013958335, + "rewards/code_reward": 0.5005304101505317, + "rewards/format_reward": 1.5, + "step": 1510 + }, + { + "clip_ratio": 0.0014796483155805618, + "epoch": 0.05637796744554825, + "grad_norm": 0.10016168653964996, + "kl": 0.0167236328125, + "learning_rate": 8.361664681804458e-06, + "loss": 0.0256, + "step": 1511 + }, + { + "clip_ratio": 0.0015696181799285114, + "epoch": 0.056415279138099905, + "grad_norm": 0.023871861398220062, + "kl": 0.0168914794921875, + "learning_rate": 8.359414521788896e-06, + "loss": 0.0256, + "step": 1512 + }, + { + "clip_ratio": 0.004210496845189482, + "completion_length": 843.0714721679688, + "epoch": 0.05645259083065156, + "grad_norm": 0.08476897329092026, + "kl": 0.0220489501953125, + "learning_rate": 8.357163162016112e-06, + "loss": 0.0182, + "num_tokens": 36126196.0, + "reward": 0.5122192017734051, + "reward_std": 0.21258736494928598, + "rewards/code_reward": 0.36418349319137633, + "rewards/format_reward": 1.480357140302658, + "step": 1513 + }, + { + "clip_ratio": 0.004579055646900088, + "epoch": 0.05648990252320321, + "grad_norm": 0.0832720696926117, + "kl": 0.021026611328125, + "learning_rate": 8.354910603430743e-06, + "loss": 0.0181, + "step": 1514 + }, + { + "clip_ratio": 0.003994301776401699, + "epoch": 0.056527214215754865, + "grad_norm": 0.08163436502218246, + "kl": 0.0221099853515625, + "learning_rate": 8.352656846977916e-06, + "loss": 0.0177, + "step": 1515 + }, + { + "clip_ratio": 0.003792014846112579, + "completion_length": 873.5000381469727, + "epoch": 0.05656452590830652, + "grad_norm": 0.07057210803031921, + "kl": 0.026519775390625, + "learning_rate": 8.350401893603264e-06, + "loss": 0.0546, + "num_tokens": 36214690.0, + "reward": 0.35922620818018913, + "reward_std": 0.13092639483511448, + "rewards/code_reward": 0.21190475765615702, + "rewards/format_reward": 1.4732142984867096, + "step": 1516 + }, + { + "clip_ratio": 0.00404945359332487, + "epoch": 0.05660183760085817, + "grad_norm": 0.07837039232254028, + "kl": 0.026580810546875, + "learning_rate": 8.348145744252926e-06, + "loss": 0.0543, + "step": 1517 + }, + { + "clip_ratio": 0.0038841289351694286, + "epoch": 0.056639149293409824, + "grad_norm": 0.06724509596824646, + "kl": 0.02728271484375, + "learning_rate": 8.34588839987354e-06, + "loss": 0.0542, + "step": 1518 + }, + { + "clip_ratio": 0.004394926014356315, + "completion_length": 613.5357360839844, + "epoch": 0.05667646098596148, + "grad_norm": 0.07470674812793732, + "kl": 0.025787353515625, + "learning_rate": 8.343629861412247e-06, + "loss": -0.0139, + "num_tokens": 36276810.0, + "reward": 0.32495197653770447, + "reward_std": 0.10090370289981365, + "rewards/code_reward": 0.1769162518903613, + "rewards/format_reward": 1.480357140302658, + "step": 1519 + }, + { + "clip_ratio": 0.004624889756087214, + "epoch": 0.05671377267851313, + "grad_norm": 0.07569186389446259, + "kl": 0.026519775390625, + "learning_rate": 8.341370129816684e-06, + "loss": -0.014, + "step": 1520 + }, + { + "clip_ratio": 0.004440257966052741, + "epoch": 0.05675108437106478, + "grad_norm": 0.08007165789604187, + "kl": 0.025360107421875, + "learning_rate": 8.339109206034994e-06, + "loss": -0.0144, + "step": 1521 + }, + { + "clip_ratio": 0.0030230687116272748, + "completion_length": 705.3036041259766, + "epoch": 0.056788396063616436, + "grad_norm": 0.0446382611989975, + "kl": 0.0205535888671875, + "learning_rate": 8.336847091015818e-06, + "loss": 0.0235, + "num_tokens": 36346505.0, + "reward": 0.8195823095738888, + "reward_std": 0.08131473790854216, + "rewards/code_reward": 0.6722608655691147, + "rewards/format_reward": 1.4732142984867096, + "step": 1522 + }, + { + "clip_ratio": 0.002841689740307629, + "epoch": 0.05682570775616809, + "grad_norm": 0.044071946293115616, + "kl": 0.0204620361328125, + "learning_rate": 8.334583785708302e-06, + "loss": 0.0234, + "step": 1523 + }, + { + "clip_ratio": 0.003073793603107333, + "epoch": 0.05686301944871974, + "grad_norm": 0.045011505484580994, + "kl": 0.0203399658203125, + "learning_rate": 8.332319291062083e-06, + "loss": 0.0233, + "step": 1524 + }, + { + "clip_ratio": 0.0033029775077011436, + "completion_length": 717.053596496582, + "epoch": 0.056900331141271396, + "grad_norm": 0.07434416562318802, + "kl": 0.0172119140625, + "learning_rate": 8.3300536080273e-06, + "loss": 0.002, + "num_tokens": 36412728.0, + "reward": 0.6361670009791851, + "reward_std": 0.16356317698955536, + "rewards/code_reward": 0.48616695404052734, + "rewards/format_reward": 1.5, + "step": 1525 + }, + { + "clip_ratio": 0.003186880116118118, + "epoch": 0.05693764283382305, + "grad_norm": 0.07008543610572815, + "kl": 0.0175018310546875, + "learning_rate": 8.327786737554595e-06, + "loss": 0.0019, + "step": 1526 + }, + { + "clip_ratio": 0.00271580446860753, + "epoch": 0.0569749545263747, + "grad_norm": 0.06603839248418808, + "kl": 0.01740264892578125, + "learning_rate": 8.325518680595101e-06, + "loss": 0.0014, + "step": 1527 + }, + { + "clip_ratio": 0.0031306726741604507, + "completion_length": 625.6250152587891, + "epoch": 0.057012266218926355, + "grad_norm": 0.1053902730345726, + "kl": 0.02392578125, + "learning_rate": 8.323249438100454e-06, + "loss": 0.0071, + "num_tokens": 36471689.0, + "reward": 0.6235449835658073, + "reward_std": 0.1860323417931795, + "rewards/code_reward": 0.47354498133063316, + "rewards/format_reward": 1.5, + "step": 1528 + }, + { + "clip_ratio": 0.003177153237629682, + "epoch": 0.05704957791147801, + "grad_norm": 0.08235081285238266, + "kl": 0.02459716796875, + "learning_rate": 8.320979011022788e-06, + "loss": 0.0068, + "step": 1529 + }, + { + "clip_ratio": 0.0032971977489069104, + "epoch": 0.05708688960402966, + "grad_norm": 0.07349062711000443, + "kl": 0.025360107421875, + "learning_rate": 8.31870740031473e-06, + "loss": 0.0065, + "step": 1530 + }, + { + "clip_ratio": 0.0038661323487758636, + "completion_length": 720.7143249511719, + "epoch": 0.057124201296581315, + "grad_norm": 0.07316794991493225, + "kl": 0.022186279296875, + "learning_rate": 8.316434606929405e-06, + "loss": 0.0179, + "num_tokens": 36546027.0, + "reward": 0.757407870143652, + "reward_std": 0.16101010027341545, + "rewards/code_reward": 0.607407808303833, + "rewards/format_reward": 1.5, + "step": 1531 + }, + { + "clip_ratio": 0.0033624411444179714, + "epoch": 0.05716151298913297, + "grad_norm": 0.07353859394788742, + "kl": 0.02203369140625, + "learning_rate": 8.314160631820435e-06, + "loss": 0.0176, + "step": 1532 + }, + { + "clip_ratio": 0.0031362599693238735, + "epoch": 0.05719882468168462, + "grad_norm": 0.06865502148866653, + "kl": 0.021820068359375, + "learning_rate": 8.311885475941941e-06, + "loss": 0.0174, + "step": 1533 + }, + { + "clip_ratio": 0.0028213999466970563, + "completion_length": 518.7857360839844, + "epoch": 0.057236136374236274, + "grad_norm": 0.05654983967542648, + "kl": 0.0217132568359375, + "learning_rate": 8.309609140248531e-06, + "loss": 0.0007, + "num_tokens": 36597913.0, + "reward": 0.5190476216375828, + "reward_std": 0.1599978655576706, + "rewards/code_reward": 0.3690476231276989, + "rewards/format_reward": 1.5, + "step": 1534 + }, + { + "clip_ratio": 0.002611853997223079, + "epoch": 0.05727344806678793, + "grad_norm": 0.05818675458431244, + "kl": 0.02093505859375, + "learning_rate": 8.307331625695319e-06, + "loss": 0.0006, + "step": 1535 + }, + { + "clip_ratio": 0.002621239284053445, + "epoch": 0.05731075975933958, + "grad_norm": 0.05805828422307968, + "kl": 0.0215606689453125, + "learning_rate": 8.305052933237902e-06, + "loss": 0.0004, + "step": 1536 + }, + { + "clip_ratio": 0.004665132088121027, + "completion_length": 803.857177734375, + "epoch": 0.05734807145189123, + "grad_norm": 0.059988152235746384, + "kl": 0.023406982421875, + "learning_rate": 8.302773063832382e-06, + "loss": 0.0145, + "num_tokens": 36675717.0, + "reward": 0.2796773612499237, + "reward_std": 0.12776269018650055, + "rewards/code_reward": 0.12967733154073358, + "rewards/format_reward": 1.5, + "step": 1537 + }, + { + "clip_ratio": 0.0044346615904942155, + "epoch": 0.057385383144442886, + "grad_norm": 0.06135326251387596, + "kl": 0.023895263671875, + "learning_rate": 8.300492018435344e-06, + "loss": 0.0143, + "step": 1538 + }, + { + "clip_ratio": 0.00452667148783803, + "epoch": 0.057422694836994546, + "grad_norm": 0.05679450184106827, + "kl": 0.023895263671875, + "learning_rate": 8.29820979800388e-06, + "loss": 0.0142, + "step": 1539 + }, + { + "clip_ratio": 0.004607684561051428, + "completion_length": 723.8214721679688, + "epoch": 0.0574600065295462, + "grad_norm": 0.09616316854953766, + "kl": 0.044189453125, + "learning_rate": 8.29592640349556e-06, + "loss": 0.0115, + "num_tokens": 36755353.0, + "reward": 0.5767436176538467, + "reward_std": 0.37843454629182816, + "rewards/code_reward": 0.42870787903666496, + "rewards/format_reward": 1.480357140302658, + "step": 1540 + }, + { + "clip_ratio": 0.004206991463433951, + "epoch": 0.05749731822209785, + "grad_norm": 0.09325989335775375, + "kl": 0.03802490234375, + "learning_rate": 8.293641835868459e-06, + "loss": 0.0109, + "step": 1541 + }, + { + "clip_ratio": 0.003735063422936946, + "epoch": 0.057534629914649506, + "grad_norm": 0.08732519298791885, + "kl": 0.038726806640625, + "learning_rate": 8.291356096081134e-06, + "loss": 0.0104, + "step": 1542 + }, + { + "clip_ratio": 0.0029796406743116677, + "completion_length": 562.3393096923828, + "epoch": 0.05757194160720116, + "grad_norm": 0.06832750886678696, + "kl": 0.02593994140625, + "learning_rate": 8.289069185092645e-06, + "loss": -0.0163, + "num_tokens": 36816272.0, + "reward": 1.1216680407524109, + "reward_std": 0.09071569889783859, + "rewards/code_reward": 0.9716680198907852, + "rewards/format_reward": 1.5, + "step": 1543 + }, + { + "clip_ratio": 0.003218043886590749, + "epoch": 0.05760925329975281, + "grad_norm": 0.06430184841156006, + "kl": 0.026153564453125, + "learning_rate": 8.286781103862534e-06, + "loss": -0.0164, + "step": 1544 + }, + { + "clip_ratio": 0.003275323542766273, + "epoch": 0.057646564992304465, + "grad_norm": 0.06023978441953659, + "kl": 0.026641845703125, + "learning_rate": 8.284491853350838e-06, + "loss": -0.0169, + "step": 1545 + }, + { + "clip_ratio": 0.003098177956417203, + "completion_length": 666.5178833007812, + "epoch": 0.05768387668485612, + "grad_norm": 0.057803329080343246, + "kl": 0.0152435302734375, + "learning_rate": 8.282201434518083e-06, + "loss": 0.0126, + "num_tokens": 36881149.0, + "reward": 0.40850821137428284, + "reward_std": 0.023068337701261044, + "rewards/code_reward": 0.25850820587947965, + "rewards/format_reward": 1.5, + "step": 1546 + }, + { + "clip_ratio": 0.0034735084627754986, + "epoch": 0.05772118837740777, + "grad_norm": 0.06430291384458542, + "kl": 0.0154266357421875, + "learning_rate": 8.279909848325289e-06, + "loss": 0.0125, + "step": 1547 + }, + { + "clip_ratio": 0.003197028301656246, + "epoch": 0.057758500069959424, + "grad_norm": 0.05789298564195633, + "kl": 0.015380859375, + "learning_rate": 8.27761709573396e-06, + "loss": 0.0123, + "step": 1548 + }, + { + "clip_ratio": 0.004337788908742368, + "completion_length": 739.5000305175781, + "epoch": 0.05779581176251108, + "grad_norm": 0.09915484488010406, + "kl": 0.0251922607421875, + "learning_rate": 8.275323177706092e-06, + "loss": 0.0031, + "num_tokens": 36948643.0, + "reward": 0.5422571673989296, + "reward_std": 0.3604711852967739, + "rewards/code_reward": 0.3969000205397606, + "rewards/format_reward": 1.4535714387893677, + "step": 1549 + }, + { + "clip_ratio": 0.004212598432786763, + "epoch": 0.05783312345506273, + "grad_norm": 0.10255494713783264, + "kl": 0.0245819091796875, + "learning_rate": 8.273028095204174e-06, + "loss": 0.0029, + "step": 1550 + }, + { + "clip_ratio": 0.004181560128927231, + "epoch": 0.057870435147614384, + "grad_norm": 0.10400275886058807, + "kl": 0.025665283203125, + "learning_rate": 8.270731849191177e-06, + "loss": 0.0022, + "step": 1551 + }, + { + "clip_ratio": 0.002788069425150752, + "completion_length": 530.7500228881836, + "epoch": 0.05790774684016604, + "grad_norm": 0.06915955245494843, + "kl": 0.02099609375, + "learning_rate": 8.268434440630563e-06, + "loss": 0.0004, + "num_tokens": 37017145.0, + "reward": 0.8464285843074322, + "reward_std": 0.15759944915771484, + "rewards/code_reward": 0.6964285671710968, + "rewards/format_reward": 1.5, + "step": 1552 + }, + { + "clip_ratio": 0.0028116556350141764, + "epoch": 0.05794505853271769, + "grad_norm": 0.05727088078856468, + "kl": 0.02081298828125, + "learning_rate": 8.266135870486284e-06, + "loss": 0.0001, + "step": 1553 + }, + { + "clip_ratio": 0.0033840840333141387, + "epoch": 0.05798237022526934, + "grad_norm": 0.05472441762685776, + "kl": 0.020965576171875, + "learning_rate": 8.263836139722776e-06, + "loss": -0.0001, + "step": 1554 + }, + { + "clip_ratio": 0.003643330419436097, + "completion_length": 568.1964492797852, + "epoch": 0.058019681917820996, + "grad_norm": 0.084551602602005, + "kl": 0.0272216796875, + "learning_rate": 8.261535249304964e-06, + "loss": 0.0093, + "num_tokens": 37079162.0, + "reward": 0.7175138592720032, + "reward_std": 0.17279232945293188, + "rewards/code_reward": 0.5675138384103775, + "rewards/format_reward": 1.5, + "step": 1555 + }, + { + "clip_ratio": 0.0034152076113969088, + "epoch": 0.05805699361037265, + "grad_norm": 0.08250147104263306, + "kl": 0.02789306640625, + "learning_rate": 8.259233200198259e-06, + "loss": 0.0092, + "step": 1556 + }, + { + "clip_ratio": 0.0031218278454616666, + "epoch": 0.0580943053029243, + "grad_norm": 0.07696948945522308, + "kl": 0.027435302734375, + "learning_rate": 8.256929993368556e-06, + "loss": 0.0088, + "step": 1557 + }, + { + "clip_ratio": 0.0042233733693137765, + "completion_length": 692.7857437133789, + "epoch": 0.058131616995475956, + "grad_norm": 0.0888817086815834, + "kl": 0.0216217041015625, + "learning_rate": 8.254625629782241e-06, + "loss": 0.0318, + "num_tokens": 37149570.0, + "reward": 0.5366596765816212, + "reward_std": 0.23903965950012207, + "rewards/code_reward": 0.3886239379644394, + "rewards/format_reward": 1.480357140302658, + "step": 1558 + }, + { + "clip_ratio": 0.0042689068941399455, + "epoch": 0.05816892868802761, + "grad_norm": 0.0756281316280365, + "kl": 0.021087646484375, + "learning_rate": 8.25232011040618e-06, + "loss": 0.0314, + "step": 1559 + }, + { + "clip_ratio": 0.004329306946601719, + "epoch": 0.05820624038057926, + "grad_norm": 0.07095357030630112, + "kl": 0.0206146240234375, + "learning_rate": 8.250013436207727e-06, + "loss": 0.0311, + "step": 1560 + }, + { + "clip_ratio": 0.003555001167114824, + "completion_length": 636.2500228881836, + "epoch": 0.058243552073130915, + "grad_norm": 1.588326096534729, + "kl": 0.1470489501953125, + "learning_rate": 8.247705608154717e-06, + "loss": 0.0513, + "num_tokens": 37217680.0, + "reward": 1.008530780673027, + "reward_std": 0.23031961545348167, + "rewards/code_reward": 0.8612093478441238, + "rewards/format_reward": 1.4732142984867096, + "step": 1561 + }, + { + "clip_ratio": 0.004088611691258848, + "epoch": 0.05828086376568257, + "grad_norm": 2.460813522338867, + "kl": 0.03179931640625, + "learning_rate": 8.245396627215478e-06, + "loss": 0.0596, + "step": 1562 + }, + { + "clip_ratio": 0.004335072298999876, + "epoch": 0.05831817545823422, + "grad_norm": 0.35778799653053284, + "kl": 0.0309600830078125, + "learning_rate": 8.243086494358808e-06, + "loss": 0.0539, + "step": 1563 + }, + { + "clip_ratio": 0.0037339432747103274, + "completion_length": 797.5714569091797, + "epoch": 0.058355487150785874, + "grad_norm": 0.09115169942378998, + "kl": 0.02142333984375, + "learning_rate": 8.240775210554e-06, + "loss": -0.0015, + "num_tokens": 37308474.0, + "reward": 0.7288723178207874, + "reward_std": 0.23974510468542576, + "rewards/code_reward": 0.5788722798461094, + "rewards/format_reward": 1.5, + "step": 1564 + }, + { + "clip_ratio": 0.0032109011663123965, + "epoch": 0.05839279884333753, + "grad_norm": 0.07416108250617981, + "kl": 0.021453857421875, + "learning_rate": 8.238462776770828e-06, + "loss": -0.0017, + "step": 1565 + }, + { + "clip_ratio": 0.0037584611563943326, + "epoch": 0.05843011053588919, + "grad_norm": 0.06928963959217072, + "kl": 0.020416259765625, + "learning_rate": 8.23614919397954e-06, + "loss": -0.002, + "step": 1566 + }, + { + "clip_ratio": 0.0032344621140509844, + "completion_length": 686.6607513427734, + "epoch": 0.05846742222844084, + "grad_norm": 0.08582348376512527, + "kl": 0.02362060546875, + "learning_rate": 8.233834463150877e-06, + "loss": 0.0509, + "num_tokens": 37375109.0, + "reward": 0.8883605226874352, + "reward_std": 0.037654777988791466, + "rewards/code_reward": 0.7430033838609233, + "rewards/format_reward": 1.4535714387893677, + "step": 1567 + }, + { + "clip_ratio": 0.003247863904107362, + "epoch": 0.058504733920992494, + "grad_norm": 0.08407115191221237, + "kl": 0.02362060546875, + "learning_rate": 8.231518585256055e-06, + "loss": 0.0505, + "step": 1568 + }, + { + "clip_ratio": 0.0034444844350218773, + "epoch": 0.05854204561354415, + "grad_norm": 0.07742664217948914, + "kl": 0.023101806640625, + "learning_rate": 8.229201561266774e-06, + "loss": 0.0502, + "step": 1569 + }, + { + "clip_ratio": 0.005454304104205221, + "completion_length": 690.0536041259766, + "epoch": 0.0585793573060958, + "grad_norm": 0.1741514354944229, + "kl": 0.024749755859375, + "learning_rate": 8.226883392155215e-06, + "loss": 0.0052, + "num_tokens": 37446274.0, + "reward": 0.46899354830384254, + "reward_std": 0.2931752875447273, + "rewards/code_reward": 0.31899350695312023, + "rewards/format_reward": 1.5, + "step": 1570 + }, + { + "clip_ratio": 0.005231374700088054, + "epoch": 0.05861666899864745, + "grad_norm": 0.08005734533071518, + "kl": 0.024688720703125, + "learning_rate": 8.22456407889404e-06, + "loss": 0.0048, + "step": 1571 + }, + { + "clip_ratio": 0.005280741723254323, + "epoch": 0.058653980691199106, + "grad_norm": 0.0775381401181221, + "kl": 0.024993896484375, + "learning_rate": 8.222243622456386e-06, + "loss": 0.0048, + "step": 1572 + }, + { + "clip_ratio": 0.003321955446153879, + "completion_length": 738.2500152587891, + "epoch": 0.05869129238375076, + "grad_norm": 0.08748921751976013, + "kl": 0.02161407470703125, + "learning_rate": 8.219922023815875e-06, + "loss": 0.0066, + "num_tokens": 37528482.0, + "reward": 0.4582805596292019, + "reward_std": 0.2920798510313034, + "rewards/code_reward": 0.3082805424928665, + "rewards/format_reward": 1.5, + "step": 1573 + }, + { + "clip_ratio": 0.0030073331436142325, + "epoch": 0.05872860407630241, + "grad_norm": 0.08702527731657028, + "kl": 0.02295684814453125, + "learning_rate": 8.217599283946608e-06, + "loss": 0.0063, + "step": 1574 + }, + { + "clip_ratio": 0.0029411190771497786, + "epoch": 0.058765915768854066, + "grad_norm": 0.07760459184646606, + "kl": 0.0229034423828125, + "learning_rate": 8.215275403823162e-06, + "loss": 0.0059, + "step": 1575 + }, + { + "clip_ratio": 0.003937374509405345, + "completion_length": 656.0178985595703, + "epoch": 0.05880322746140572, + "grad_norm": 0.08895273506641388, + "kl": 0.02386474609375, + "learning_rate": 8.212950384420595e-06, + "loss": 0.0107, + "num_tokens": 37599533.0, + "reward": 0.7219388000667095, + "reward_std": 0.19415829330682755, + "rewards/code_reward": 0.571938768029213, + "rewards/format_reward": 1.5, + "step": 1576 + }, + { + "clip_ratio": 0.003773079952225089, + "epoch": 0.05884053915395737, + "grad_norm": 0.08737504482269287, + "kl": 0.024444580078125, + "learning_rate": 8.21062422671444e-06, + "loss": 0.0103, + "step": 1577 + }, + { + "clip_ratio": 0.003156743128784001, + "epoch": 0.058877850846509025, + "grad_norm": 0.08264487236738205, + "kl": 0.0242767333984375, + "learning_rate": 8.20829693168071e-06, + "loss": 0.01, + "step": 1578 + }, + { + "clip_ratio": 0.002270538476295769, + "completion_length": 556.3571701049805, + "epoch": 0.05891516253906068, + "grad_norm": 0.06306931376457214, + "kl": 0.027130126953125, + "learning_rate": 8.205968500295897e-06, + "loss": 0.0086, + "num_tokens": 37656687.0, + "reward": 0.7068709917366505, + "reward_std": 0.06915665604174137, + "rewards/code_reward": 0.5568709904327989, + "rewards/format_reward": 1.5, + "step": 1579 + }, + { + "clip_ratio": 0.0024144643684849143, + "epoch": 0.05895247423161233, + "grad_norm": 0.07078391313552856, + "kl": 0.02685546875, + "learning_rate": 8.203638933536967e-06, + "loss": 0.0086, + "step": 1580 + }, + { + "clip_ratio": 0.002175042638555169, + "epoch": 0.058989785924163984, + "grad_norm": 0.060410864651203156, + "kl": 0.026611328125, + "learning_rate": 8.20130823238136e-06, + "loss": 0.0084, + "step": 1581 + }, + { + "clip_ratio": 0.0032572377822361887, + "completion_length": 640.3214721679688, + "epoch": 0.05902709761671564, + "grad_norm": 0.048525623977184296, + "kl": 0.026885986328125, + "learning_rate": 8.198976397807001e-06, + "loss": -0.0007, + "num_tokens": 37727783.0, + "reward": 0.6102040782570839, + "reward_std": 0.09033838659524918, + "rewards/code_reward": 0.4602040946483612, + "rewards/format_reward": 1.5, + "step": 1582 + }, + { + "clip_ratio": 0.003503905376419425, + "epoch": 0.05906440930926729, + "grad_norm": 0.04608646780252457, + "kl": 0.0272674560546875, + "learning_rate": 8.196643430792276e-06, + "loss": -0.0007, + "step": 1583 + }, + { + "clip_ratio": 0.0037140462081879377, + "epoch": 0.059101721001818944, + "grad_norm": 0.04328151047229767, + "kl": 0.0261688232421875, + "learning_rate": 8.194309332316062e-06, + "loss": -0.0008, + "step": 1584 + }, + { + "clip_ratio": 0.003990396042354405, + "completion_length": 611.5178909301758, + "epoch": 0.0591390326943706, + "grad_norm": 0.057701654732227325, + "kl": 0.021484375, + "learning_rate": 8.191974103357699e-06, + "loss": -0.0005, + "num_tokens": 37785044.0, + "reward": 0.2620130069553852, + "reward_std": 0.040202997624874115, + "rewards/code_reward": 0.11201298236846924, + "rewards/format_reward": 1.5, + "step": 1585 + }, + { + "clip_ratio": 0.004388793895486742, + "epoch": 0.05917634438692225, + "grad_norm": 0.053933367133140564, + "kl": 0.02203369140625, + "learning_rate": 8.189637744897008e-06, + "loss": -0.0008, + "step": 1586 + }, + { + "clip_ratio": 0.0047860159538686275, + "epoch": 0.0592136560794739, + "grad_norm": 0.051682956516742706, + "kl": 0.02178955078125, + "learning_rate": 8.18730025791428e-06, + "loss": -0.001, + "step": 1587 + }, + { + "clip_ratio": 0.00373645854415372, + "completion_length": 566.803596496582, + "epoch": 0.059250967772025556, + "grad_norm": 0.08040271699428558, + "kl": 0.0338897705078125, + "learning_rate": 8.184961643390283e-06, + "loss": 0.0556, + "num_tokens": 37851449.0, + "reward": 0.6146235689520836, + "reward_std": 0.08749190624803305, + "rewards/code_reward": 0.4673021137714386, + "rewards/format_reward": 1.4732142984867096, + "step": 1588 + }, + { + "clip_ratio": 0.0036835010978393257, + "epoch": 0.05928827946457721, + "grad_norm": 0.06920985132455826, + "kl": 0.0326690673828125, + "learning_rate": 8.182621902306254e-06, + "loss": 0.0555, + "step": 1589 + }, + { + "clip_ratio": 0.0041888029663823545, + "epoch": 0.05932559115712886, + "grad_norm": 0.060543712228536606, + "kl": 0.0324249267578125, + "learning_rate": 8.180281035643907e-06, + "loss": 0.0552, + "step": 1590 + }, + { + "clip_ratio": 0.0030273234588094056, + "completion_length": 708.1964645385742, + "epoch": 0.059362902849680516, + "grad_norm": 0.07339049130678177, + "kl": 0.0263214111328125, + "learning_rate": 8.177939044385424e-06, + "loss": 0.009, + "num_tokens": 37924904.0, + "reward": 0.7122874371707439, + "reward_std": 0.2943221405148506, + "rewards/code_reward": 0.5649659633636475, + "rewards/format_reward": 1.4732142984867096, + "step": 1591 + }, + { + "clip_ratio": 0.0023862036177888513, + "epoch": 0.05940021454223217, + "grad_norm": 0.07185934484004974, + "kl": 0.028900146484375, + "learning_rate": 8.175595929513466e-06, + "loss": 0.0086, + "step": 1592 + }, + { + "clip_ratio": 0.002705960359890014, + "epoch": 0.05943752623478383, + "grad_norm": 0.07315634936094284, + "kl": 0.028350830078125, + "learning_rate": 8.173251692011152e-06, + "loss": 0.0086, + "step": 1593 + }, + { + "clip_ratio": 0.004009437165223062, + "completion_length": 678.8393096923828, + "epoch": 0.05947483792733548, + "grad_norm": 0.082923024892807, + "kl": 0.0256195068359375, + "learning_rate": 8.170906332862088e-06, + "loss": 0.0026, + "num_tokens": 37995665.0, + "reward": 0.3511168360710144, + "reward_std": 0.1742014375049621, + "rewards/code_reward": 0.203795374953188, + "rewards/format_reward": 1.4732142984867096, + "step": 1594 + }, + { + "clip_ratio": 0.00402810686500743, + "epoch": 0.059512149619887135, + "grad_norm": 0.07293787598609924, + "kl": 0.0242767333984375, + "learning_rate": 8.168559853050338e-06, + "loss": 0.0023, + "step": 1595 + }, + { + "clip_ratio": 0.004056494915857911, + "epoch": 0.05954946131243879, + "grad_norm": 0.0725814625620842, + "kl": 0.02435302734375, + "learning_rate": 8.166212253560445e-06, + "loss": 0.002, + "step": 1596 + }, + { + "clip_ratio": 0.002568533760495484, + "completion_length": 725.8393096923828, + "epoch": 0.05958677300499044, + "grad_norm": 0.05876294523477554, + "kl": 0.02227783203125, + "learning_rate": 8.163863535377419e-06, + "loss": 0.0643, + "num_tokens": 38068694.0, + "reward": 0.5320303663611412, + "reward_std": 0.10298771318048239, + "rewards/code_reward": 0.38470893632620573, + "rewards/format_reward": 1.4732142984867096, + "step": 1597 + }, + { + "clip_ratio": 0.002451381296850741, + "epoch": 0.059624084697542094, + "grad_norm": 0.05839603766798973, + "kl": 0.02239990234375, + "learning_rate": 8.161513699486733e-06, + "loss": 0.0642, + "step": 1598 + }, + { + "clip_ratio": 0.002516321372240782, + "epoch": 0.05966139639009375, + "grad_norm": 0.05661454051733017, + "kl": 0.02197265625, + "learning_rate": 8.15916274687434e-06, + "loss": 0.064, + "step": 1599 + }, + { + "clip_ratio": 0.003244508581701666, + "completion_length": 666.9642944335938, + "epoch": 0.0596987080826454, + "grad_norm": 0.0329345278441906, + "kl": 0.0179290771484375, + "learning_rate": 8.156810678526652e-06, + "loss": -0.005, + "num_tokens": 38139672.0, + "reward": 0.5357143059372902, + "reward_std": 0.0770328938961029, + "rewards/code_reward": 0.3857142776250839, + "rewards/format_reward": 1.5, + "step": 1600 + }, + { + "clip_ratio": 0.002873104182071984, + "epoch": 0.059736019775197054, + "grad_norm": 0.03235320374369621, + "kl": 0.0182952880859375, + "learning_rate": 8.154457495430557e-06, + "loss": -0.005, + "step": 1601 + }, + { + "clip_ratio": 0.003289966785814613, + "epoch": 0.05977333146774871, + "grad_norm": 0.031850818544626236, + "kl": 0.018157958984375, + "learning_rate": 8.152103198573403e-06, + "loss": -0.0051, + "step": 1602 + }, + { + "clip_ratio": 0.005172173958271742, + "completion_length": 701.5893173217773, + "epoch": 0.05981064316030036, + "grad_norm": 0.05783623456954956, + "kl": 0.02276611328125, + "learning_rate": 8.149747788943014e-06, + "loss": 0.0063, + "num_tokens": 38209663.0, + "reward": 0.3622201420366764, + "reward_std": 0.10651194304227829, + "rewards/code_reward": 0.21222013467922807, + "rewards/format_reward": 1.5, + "step": 1603 + }, + { + "clip_ratio": 0.00513819616753608, + "epoch": 0.05984795485285201, + "grad_norm": 0.058369435369968414, + "kl": 0.023101806640625, + "learning_rate": 8.147391267527672e-06, + "loss": 0.0063, + "step": 1604 + }, + { + "clip_ratio": 0.004612705612089485, + "epoch": 0.059885266545403666, + "grad_norm": 0.05366721376776695, + "kl": 0.022979736328125, + "learning_rate": 8.14503363531613e-06, + "loss": 0.0059, + "step": 1605 + }, + { + "clip_ratio": 0.0038680853322148323, + "completion_length": 761.9643096923828, + "epoch": 0.05992257823795532, + "grad_norm": 0.11406312882900238, + "kl": 0.0478515625, + "learning_rate": 8.142674893297609e-06, + "loss": 0.0404, + "num_tokens": 38288187.0, + "reward": 0.6346323005855083, + "reward_std": 0.2964787930250168, + "rewards/code_reward": 0.489275187253952, + "rewards/format_reward": 1.4535714089870453, + "step": 1606 + }, + { + "clip_ratio": 0.0036516059772111475, + "epoch": 0.05995988993050697, + "grad_norm": 0.1000857800245285, + "kl": 0.0565643310546875, + "learning_rate": 8.140315042461789e-06, + "loss": 0.04, + "step": 1607 + }, + { + "clip_ratio": 0.0038869892596267164, + "epoch": 0.059997201623058626, + "grad_norm": 0.09104280173778534, + "kl": 0.0415496826171875, + "learning_rate": 8.137954083798826e-06, + "loss": 0.0395, + "step": 1608 + }, + { + "clip_ratio": 0.003182401938829571, + "completion_length": 556.428596496582, + "epoch": 0.06003451331561028, + "grad_norm": 0.0835004448890686, + "kl": 0.028472900390625, + "learning_rate": 8.13559201829933e-06, + "loss": -0.0056, + "num_tokens": 38347277.0, + "reward": 0.43587033078074455, + "reward_std": 0.13442546036094427, + "rewards/code_reward": 0.29122745990753174, + "rewards/format_reward": 1.4464285969734192, + "step": 1609 + }, + { + "clip_ratio": 0.003483023843728006, + "epoch": 0.06007182500816193, + "grad_norm": 0.10551528632640839, + "kl": 0.028533935546875, + "learning_rate": 8.13322884695438e-06, + "loss": -0.0058, + "step": 1610 + }, + { + "clip_ratio": 0.003122272144537419, + "epoch": 0.060109136700713585, + "grad_norm": 0.07960875332355499, + "kl": 0.027801513671875, + "learning_rate": 8.130864570755522e-06, + "loss": -0.0062, + "step": 1611 + }, + { + "clip_ratio": 0.004623334156349301, + "completion_length": 898.9643249511719, + "epoch": 0.06014644839326524, + "grad_norm": 0.0992894247174263, + "kl": 0.02001953125, + "learning_rate": 8.128499190694758e-06, + "loss": -0.0066, + "num_tokens": 38430631.0, + "reward": 0.2175269052386284, + "reward_std": 0.11071607074700296, + "rewards/code_reward": 0.07020544842816889, + "rewards/format_reward": 1.4732142984867096, + "step": 1612 + }, + { + "clip_ratio": 0.004290788841899484, + "epoch": 0.06018376008581689, + "grad_norm": 0.09197894483804703, + "kl": 0.0199737548828125, + "learning_rate": 8.126132707764558e-06, + "loss": -0.0069, + "step": 1613 + }, + { + "clip_ratio": 0.004215037974063307, + "epoch": 0.060221071778368544, + "grad_norm": 0.0798080787062645, + "kl": 0.0197601318359375, + "learning_rate": 8.123765122957858e-06, + "loss": -0.0073, + "step": 1614 + }, + { + "clip_ratio": 0.002387825632467866, + "completion_length": 590.7321624755859, + "epoch": 0.0602583834709202, + "grad_norm": 0.08222388476133347, + "kl": 0.017547607421875, + "learning_rate": 8.121396437268049e-06, + "loss": 0.051, + "num_tokens": 38488590.0, + "reward": 0.9298542588949203, + "reward_std": 0.20748872496187687, + "rewards/code_reward": 0.7825328409671783, + "rewards/format_reward": 1.4732142984867096, + "step": 1615 + }, + { + "clip_ratio": 0.002709310210775584, + "epoch": 0.06029569516347185, + "grad_norm": 0.07737685739994049, + "kl": 0.0176849365234375, + "learning_rate": 8.11902665168899e-06, + "loss": 0.0508, + "step": 1616 + }, + { + "clip_ratio": 0.002479560731444508, + "epoch": 0.060333006856023504, + "grad_norm": 0.07090195268392563, + "kl": 0.0175628662109375, + "learning_rate": 8.116655767214998e-06, + "loss": 0.0504, + "step": 1617 + }, + { + "clip_ratio": 0.003838001110125333, + "completion_length": 783.0357513427734, + "epoch": 0.06037031854857516, + "grad_norm": 0.08751675486564636, + "kl": 0.021697998046875, + "learning_rate": 8.114283784840852e-06, + "loss": -0.0192, + "num_tokens": 38568238.0, + "reward": 0.8024332225322723, + "reward_std": 0.2910071797668934, + "rewards/code_reward": 0.6551117785274982, + "rewards/format_reward": 1.4732142984867096, + "step": 1618 + }, + { + "clip_ratio": 0.003264850820414722, + "epoch": 0.06040763024112681, + "grad_norm": 0.08709808439016342, + "kl": 0.0217742919921875, + "learning_rate": 8.11191070556179e-06, + "loss": -0.0196, + "step": 1619 + }, + { + "clip_ratio": 0.0032103590201586485, + "epoch": 0.06044494193367847, + "grad_norm": 0.0893637090921402, + "kl": 0.021453857421875, + "learning_rate": 8.109536530373516e-06, + "loss": -0.0202, + "step": 1620 + }, + { + "clip_ratio": 0.0034393072710372508, + "completion_length": 640.0000381469727, + "epoch": 0.06048225362623012, + "grad_norm": 0.11764048039913177, + "kl": 0.032958984375, + "learning_rate": 8.107161260272185e-06, + "loss": 0.0105, + "num_tokens": 38634108.0, + "reward": 0.8012702018022537, + "reward_std": 0.176805698312819, + "rewards/code_reward": 0.6551987463608384, + "rewards/format_reward": 1.4607142806053162, + "step": 1621 + }, + { + "clip_ratio": 0.003274232614785433, + "epoch": 0.060519565318781776, + "grad_norm": 0.1028149425983429, + "kl": 0.03253173828125, + "learning_rate": 8.10478489625442e-06, + "loss": 0.0102, + "step": 1622 + }, + { + "clip_ratio": 0.003089517937041819, + "epoch": 0.06055687701133343, + "grad_norm": 0.09111734479665756, + "kl": 0.032958984375, + "learning_rate": 8.102407439317299e-06, + "loss": 0.0095, + "step": 1623 + }, + { + "clip_ratio": 0.005826237262226641, + "completion_length": 634.6250381469727, + "epoch": 0.06059418870388508, + "grad_norm": 0.13269920647144318, + "kl": 0.09429931640625, + "learning_rate": 8.100028890458357e-06, + "loss": 0.0142, + "num_tokens": 38717737.0, + "reward": 0.6281872466206551, + "reward_std": 0.389748964458704, + "rewards/code_reward": 0.4781872443854809, + "rewards/format_reward": 1.5, + "step": 1624 + }, + { + "clip_ratio": 0.005745542119257152, + "epoch": 0.060631500396436736, + "grad_norm": 0.1144213154911995, + "kl": 0.083648681640625, + "learning_rate": 8.097649250675588e-06, + "loss": 0.0138, + "step": 1625 + }, + { + "clip_ratio": 0.0051528733456507325, + "epoch": 0.06066881208898839, + "grad_norm": 0.09902413934469223, + "kl": 0.06488037109375, + "learning_rate": 8.095268520967448e-06, + "loss": 0.0128, + "step": 1626 + }, + { + "clip_ratio": 0.004084289073944092, + "completion_length": 722.1786041259766, + "epoch": 0.06070612378154004, + "grad_norm": 0.08110187947750092, + "kl": 0.019378662109375, + "learning_rate": 8.092886702332844e-06, + "loss": -0.0001, + "num_tokens": 38788895.0, + "reward": 0.5459133423864841, + "reward_std": 0.21180351451039314, + "rewards/code_reward": 0.3959133103489876, + "rewards/format_reward": 1.5, + "step": 1627 + }, + { + "clip_ratio": 0.0036682610225398093, + "epoch": 0.060743435474091695, + "grad_norm": 0.08382410556077957, + "kl": 0.0190887451171875, + "learning_rate": 8.090503795771145e-06, + "loss": -0.0002, + "step": 1628 + }, + { + "clip_ratio": 0.0037900476600043476, + "epoch": 0.06078074716664335, + "grad_norm": 0.07141174376010895, + "kl": 0.0192108154296875, + "learning_rate": 8.088119802282174e-06, + "loss": -0.0006, + "step": 1629 + }, + { + "clip_ratio": 0.003026043879799545, + "completion_length": 704.5000305175781, + "epoch": 0.060818058859195, + "grad_norm": 0.05437391996383667, + "kl": 0.022430419921875, + "learning_rate": 8.085734722866207e-06, + "loss": -0.0083, + "num_tokens": 38865475.0, + "reward": 0.9541750252246857, + "reward_std": 0.12708362750709057, + "rewards/code_reward": 0.8041749894618988, + "rewards/format_reward": 1.5, + "step": 1630 + }, + { + "clip_ratio": 0.0030019291443750262, + "epoch": 0.060855370551746654, + "grad_norm": 0.05362410470843315, + "kl": 0.023468017578125, + "learning_rate": 8.083348558523985e-06, + "loss": -0.0083, + "step": 1631 + }, + { + "clip_ratio": 0.00293729332042858, + "epoch": 0.06089268224429831, + "grad_norm": 0.04926226660609245, + "kl": 0.02398681640625, + "learning_rate": 8.080961310256694e-06, + "loss": -0.0086, + "step": 1632 + }, + { + "clip_ratio": 0.0037777769612148404, + "completion_length": 559.2857437133789, + "epoch": 0.06092999393684996, + "grad_norm": 0.24254615604877472, + "kl": 0.029205322265625, + "learning_rate": 8.07857297906598e-06, + "loss": 0.0039, + "num_tokens": 38928113.0, + "reward": 0.7244884185492992, + "reward_std": 0.1514110304415226, + "rewards/code_reward": 0.5744884014129639, + "rewards/format_reward": 1.5, + "step": 1633 + }, + { + "clip_ratio": 0.003960117232054472, + "epoch": 0.060967305629401614, + "grad_norm": 0.1293250471353531, + "kl": 0.046661376953125, + "learning_rate": 8.076183565953941e-06, + "loss": 0.004, + "step": 1634 + }, + { + "clip_ratio": 0.004359671613201499, + "epoch": 0.06100461732195327, + "grad_norm": 0.10464290529489517, + "kl": 0.033172607421875, + "learning_rate": 8.073793071923134e-06, + "loss": 0.0033, + "step": 1635 + }, + { + "clip_ratio": 0.0019922760548070073, + "completion_length": 545.8750305175781, + "epoch": 0.06104192901450492, + "grad_norm": 0.06979891657829285, + "kl": 0.02801513671875, + "learning_rate": 8.071401497976563e-06, + "loss": 0.0154, + "num_tokens": 38985618.0, + "reward": 0.8504551947116852, + "reward_std": 0.17711904272437096, + "rewards/code_reward": 0.7004551887512207, + "rewards/format_reward": 1.5, + "step": 1636 + }, + { + "clip_ratio": 0.001553655427414924, + "epoch": 0.06107924070705657, + "grad_norm": 0.07043148577213287, + "kl": 0.025970458984375, + "learning_rate": 8.06900884511769e-06, + "loss": 0.0153, + "step": 1637 + }, + { + "clip_ratio": 0.0015447247424162924, + "epoch": 0.061116552399608226, + "grad_norm": 0.06417836248874664, + "kl": 0.027008056640625, + "learning_rate": 8.066615114350423e-06, + "loss": 0.0149, + "step": 1638 + }, + { + "clip_ratio": 0.0025056636659428477, + "completion_length": 760.6607666015625, + "epoch": 0.06115386409215988, + "grad_norm": 0.0658804327249527, + "kl": 0.0196990966796875, + "learning_rate": 8.064220306679132e-06, + "loss": -0.006, + "num_tokens": 39060401.0, + "reward": 0.7483531311154366, + "reward_std": 0.30198393017053604, + "rewards/code_reward": 0.5983531437814236, + "rewards/format_reward": 1.5, + "step": 1639 + }, + { + "clip_ratio": 0.002631567360367626, + "epoch": 0.06119117578471153, + "grad_norm": 0.06625731289386749, + "kl": 0.0204620361328125, + "learning_rate": 8.061824423108633e-06, + "loss": -0.0061, + "step": 1640 + }, + { + "clip_ratio": 0.0027322195819579065, + "epoch": 0.061228487477263185, + "grad_norm": 0.06381222605705261, + "kl": 0.0200958251953125, + "learning_rate": 8.059427464644192e-06, + "loss": -0.0063, + "step": 1641 + }, + { + "clip_ratio": 0.0028140374924987555, + "completion_length": 716.0000305175781, + "epoch": 0.06126579916981484, + "grad_norm": 0.06551537662744522, + "kl": 0.020263671875, + "learning_rate": 8.05702943229153e-06, + "loss": 0.0081, + "num_tokens": 39131983.0, + "reward": 0.5980229675769806, + "reward_std": 0.2150501310825348, + "rewards/code_reward": 0.4480229653418064, + "rewards/format_reward": 1.5, + "step": 1642 + }, + { + "clip_ratio": 0.0027384335407987237, + "epoch": 0.06130311086236649, + "grad_norm": 0.09413152188062668, + "kl": 0.020599365234375, + "learning_rate": 8.05463032705682e-06, + "loss": 0.008, + "step": 1643 + }, + { + "clip_ratio": 0.002658357785549015, + "epoch": 0.061340422554918145, + "grad_norm": 0.06473200768232346, + "kl": 0.020782470703125, + "learning_rate": 8.052230149946674e-06, + "loss": 0.0078, + "step": 1644 + }, + { + "clip_ratio": 0.005086545716039836, + "completion_length": 665.2678680419922, + "epoch": 0.0613777342474698, + "grad_norm": 0.0892825722694397, + "kl": 0.0214996337890625, + "learning_rate": 8.049828901968167e-06, + "loss": 0.0054, + "num_tokens": 39199226.0, + "reward": 0.5889658629894257, + "reward_std": 0.47730761766433716, + "rewards/code_reward": 0.43896588683128357, + "rewards/format_reward": 1.5, + "step": 1645 + }, + { + "clip_ratio": 0.004175537498667836, + "epoch": 0.06141504594002145, + "grad_norm": 0.08670166879892349, + "kl": 0.02142333984375, + "learning_rate": 8.04742658412882e-06, + "loss": 0.0048, + "step": 1646 + }, + { + "clip_ratio": 0.0031326781027019024, + "epoch": 0.061452357632573104, + "grad_norm": 0.0827481746673584, + "kl": 0.021331787109375, + "learning_rate": 8.045023197436597e-06, + "loss": 0.0042, + "step": 1647 + }, + { + "clip_ratio": 0.004467041639145464, + "completion_length": 673.3928756713867, + "epoch": 0.061489669325124764, + "grad_norm": 0.08983433246612549, + "kl": 0.0269775390625, + "learning_rate": 8.042618742899914e-06, + "loss": 0.0062, + "num_tokens": 39272132.0, + "reward": 0.7487741969525814, + "reward_std": 0.1925436146557331, + "rewards/code_reward": 0.598774179816246, + "rewards/format_reward": 1.5, + "step": 1648 + }, + { + "clip_ratio": 0.0037606034602504224, + "epoch": 0.06152698101767642, + "grad_norm": 0.08768902719020844, + "kl": 0.026611328125, + "learning_rate": 8.040213221527639e-06, + "loss": 0.0058, + "step": 1649 + }, + { + "clip_ratio": 0.004334817174822092, + "epoch": 0.06156429271022807, + "grad_norm": 0.08191915601491928, + "kl": 0.026763916015625, + "learning_rate": 8.037806634329079e-06, + "loss": 0.0056, + "step": 1650 + }, + { + "clip_ratio": 0.0038486664998345077, + "completion_length": 724.1607360839844, + "epoch": 0.061601604402779724, + "grad_norm": 0.08940981328487396, + "kl": 0.021453857421875, + "learning_rate": 8.035398982314e-06, + "loss": -0.0075, + "num_tokens": 39340153.0, + "reward": 0.3058817759156227, + "reward_std": 0.17789436224848032, + "rewards/code_reward": 0.15588176809251308, + "rewards/format_reward": 1.5, + "step": 1651 + }, + { + "clip_ratio": 0.003819800796918571, + "epoch": 0.06163891609533138, + "grad_norm": 0.08779502660036087, + "kl": 0.021240234375, + "learning_rate": 8.032990266492603e-06, + "loss": -0.0077, + "step": 1652 + }, + { + "clip_ratio": 0.003914843313395977, + "epoch": 0.06167622778788303, + "grad_norm": 0.08251442015171051, + "kl": 0.02203369140625, + "learning_rate": 8.030580487875542e-06, + "loss": -0.0082, + "step": 1653 + }, + { + "clip_ratio": 0.0032967014121823013, + "completion_length": 628.0178833007812, + "epoch": 0.06171353948043468, + "grad_norm": 0.0913461446762085, + "kl": 0.030975341796875, + "learning_rate": 8.028169647473915e-06, + "loss": -0.0007, + "num_tokens": 39402776.0, + "reward": 0.4181208610534668, + "reward_std": 0.18666450586169958, + "rewards/code_reward": 0.2707993872463703, + "rewards/format_reward": 1.4732142984867096, + "step": 1654 + }, + { + "clip_ratio": 0.003416134277358651, + "epoch": 0.061750851172986336, + "grad_norm": 0.08764567226171494, + "kl": 0.03076171875, + "learning_rate": 8.025757746299267e-06, + "loss": -0.001, + "step": 1655 + }, + { + "clip_ratio": 0.0032842528889887035, + "epoch": 0.06178816286553799, + "grad_norm": 0.08254673331975937, + "kl": 0.03363037109375, + "learning_rate": 8.023344785363586e-06, + "loss": -0.0015, + "step": 1656 + }, + { + "clip_ratio": 0.003807689528912306, + "completion_length": 727.982177734375, + "epoch": 0.06182547455808964, + "grad_norm": 0.08366947621107101, + "kl": 0.01806640625, + "learning_rate": 8.020930765679304e-06, + "loss": 0.0, + "num_tokens": 39473629.0, + "reward": 0.5518435128033161, + "reward_std": 0.21157710999250412, + "rewards/code_reward": 0.40184350311756134, + "rewards/format_reward": 1.5, + "step": 1657 + }, + { + "clip_ratio": 0.0034797312691807747, + "epoch": 0.061862786250641295, + "grad_norm": 0.08279012143611908, + "kl": 0.0183563232421875, + "learning_rate": 8.018515688259304e-06, + "loss": -0.0003, + "step": 1658 + }, + { + "clip_ratio": 0.003394222934730351, + "epoch": 0.06190009794319295, + "grad_norm": 0.06979889422655106, + "kl": 0.018310546875, + "learning_rate": 8.0160995541169e-06, + "loss": -0.0004, + "step": 1659 + }, + { + "clip_ratio": 0.0038353222189471126, + "completion_length": 724.3928909301758, + "epoch": 0.0619374096357446, + "grad_norm": 0.08384953439235687, + "kl": 0.0232391357421875, + "learning_rate": 8.013682364265863e-06, + "loss": -0.0023, + "num_tokens": 39554543.0, + "reward": 0.6915683709084988, + "reward_std": 0.25470658764243126, + "rewards/code_reward": 0.5415683649480343, + "rewards/format_reward": 1.5, + "step": 1660 + }, + { + "clip_ratio": 0.0033977185958065093, + "epoch": 0.061974721328296255, + "grad_norm": 0.0901564434170723, + "kl": 0.0228424072265625, + "learning_rate": 8.0112641197204e-06, + "loss": -0.0025, + "step": 1661 + }, + { + "clip_ratio": 0.003655387961771339, + "epoch": 0.06201203302084791, + "grad_norm": 0.08427656441926956, + "kl": 0.0233306884765625, + "learning_rate": 8.008844821495159e-06, + "loss": -0.0026, + "step": 1662 + }, + { + "clip_ratio": 0.0049681790405884385, + "completion_length": 701.4286041259766, + "epoch": 0.06204934471339956, + "grad_norm": 0.0962432473897934, + "kl": 0.027557373046875, + "learning_rate": 8.006424470605235e-06, + "loss": -0.0007, + "num_tokens": 39627633.0, + "reward": 0.5411552302539349, + "reward_std": 0.2599896937608719, + "rewards/code_reward": 0.3911552131175995, + "rewards/format_reward": 1.5, + "step": 1663 + }, + { + "clip_ratio": 0.004956982797011733, + "epoch": 0.062086656405951214, + "grad_norm": 0.09671957045793533, + "kl": 0.027374267578125, + "learning_rate": 8.004003068066161e-06, + "loss": -0.001, + "step": 1664 + }, + { + "clip_ratio": 0.004697132564615458, + "epoch": 0.06212396809850287, + "grad_norm": 0.08465433120727539, + "kl": 0.028411865234375, + "learning_rate": 8.001580614893912e-06, + "loss": -0.0015, + "step": 1665 + }, + { + "clip_ratio": 0.003153868834488094, + "completion_length": 567.2143096923828, + "epoch": 0.06216127979105452, + "grad_norm": 0.07260246574878693, + "kl": 0.029205322265625, + "learning_rate": 7.999157112104907e-06, + "loss": 0.0077, + "num_tokens": 39684387.0, + "reward": 0.79335717856884, + "reward_std": 0.23193710297346115, + "rewards/code_reward": 0.6433571726083755, + "rewards/format_reward": 1.5, + "step": 1666 + }, + { + "clip_ratio": 0.002654622192494571, + "epoch": 0.06219859148360617, + "grad_norm": 0.07160737365484238, + "kl": 0.02716064453125, + "learning_rate": 7.996732560716e-06, + "loss": 0.0074, + "step": 1667 + }, + { + "clip_ratio": 0.00265527016017586, + "epoch": 0.06223590317615783, + "grad_norm": 0.0697118267416954, + "kl": 0.02728271484375, + "learning_rate": 7.994306961744489e-06, + "loss": 0.0071, + "step": 1668 + }, + { + "clip_ratio": 0.0044986295979470015, + "completion_length": 660.1786041259766, + "epoch": 0.06227321486870948, + "grad_norm": 0.08658070117235184, + "kl": 0.0176849365234375, + "learning_rate": 7.99188031620811e-06, + "loss": -0.0059, + "num_tokens": 39743211.0, + "reward": 0.48427532985806465, + "reward_std": 0.2449983563274145, + "rewards/code_reward": 0.334275308996439, + "rewards/format_reward": 1.5, + "step": 1669 + }, + { + "clip_ratio": 0.0036964023602195084, + "epoch": 0.06231052656126113, + "grad_norm": 0.11114390194416046, + "kl": 0.017364501953125, + "learning_rate": 7.989452625125039e-06, + "loss": -0.0061, + "step": 1670 + }, + { + "clip_ratio": 0.0035904208198189735, + "epoch": 0.062347838253812786, + "grad_norm": 0.07542326301336288, + "kl": 0.01751708984375, + "learning_rate": 7.98702388951389e-06, + "loss": -0.0066, + "step": 1671 + }, + { + "clip_ratio": 0.004152427893131971, + "completion_length": 596.303596496582, + "epoch": 0.06238514994636444, + "grad_norm": 0.0888257697224617, + "kl": 0.041412353515625, + "learning_rate": 7.984594110393714e-06, + "loss": -0.0008, + "num_tokens": 39816868.0, + "reward": 0.6793498955667019, + "reward_std": 0.10486217215657234, + "rewards/code_reward": 0.5293498933315277, + "rewards/format_reward": 1.5, + "step": 1672 + }, + { + "clip_ratio": 0.004228562233038247, + "epoch": 0.06242246163891609, + "grad_norm": 0.08207224309444427, + "kl": 0.0406494140625, + "learning_rate": 7.982163288784001e-06, + "loss": -0.0009, + "step": 1673 + }, + { + "clip_ratio": 0.004261562484316528, + "epoch": 0.062459773331467745, + "grad_norm": 0.07756693661212921, + "kl": 0.040435791015625, + "learning_rate": 7.97973142570468e-06, + "loss": -0.0012, + "step": 1674 + }, + { + "clip_ratio": 0.005239745136350393, + "completion_length": 598.1964416503906, + "epoch": 0.062497085024019405, + "grad_norm": 0.10967458784580231, + "kl": 0.025665283203125, + "learning_rate": 7.977298522176117e-06, + "loss": 0.0035, + "num_tokens": 39881741.0, + "reward": 0.589494626969099, + "reward_std": 0.08914984157308936, + "rewards/code_reward": 0.43949462845921516, + "rewards/format_reward": 1.5, + "step": 1675 + }, + { + "clip_ratio": 0.004990900168195367, + "epoch": 0.06253439671657106, + "grad_norm": 0.10037180781364441, + "kl": 0.025604248046875, + "learning_rate": 7.974864579219109e-06, + "loss": 0.0032, + "step": 1676 + }, + { + "clip_ratio": 0.004562372399959713, + "epoch": 0.06257170840912271, + "grad_norm": 0.07737666368484497, + "kl": 0.02532958984375, + "learning_rate": 7.972429597854898e-06, + "loss": 0.0027, + "step": 1677 + }, + { + "clip_ratio": 0.00238790619187057, + "completion_length": 577.7321701049805, + "epoch": 0.06260902010167436, + "grad_norm": 0.07421645522117615, + "kl": 0.0245361328125, + "learning_rate": 7.969993579105153e-06, + "loss": 0.0067, + "num_tokens": 39943284.0, + "reward": 1.062337651848793, + "reward_std": 0.11015208065509796, + "rewards/code_reward": 0.9123376607894897, + "rewards/format_reward": 1.5, + "step": 1678 + }, + { + "clip_ratio": 0.002067523484583944, + "epoch": 0.06264633179422602, + "grad_norm": 0.07710065692663193, + "kl": 0.02410888671875, + "learning_rate": 7.967556523991984e-06, + "loss": 0.0065, + "step": 1679 + }, + { + "clip_ratio": 0.002196636574808508, + "epoch": 0.06268364348677767, + "grad_norm": 0.06420893967151642, + "kl": 0.0242919921875, + "learning_rate": 7.965118433537934e-06, + "loss": 0.0061, + "step": 1680 + }, + { + "clip_ratio": 0.0028885225183330476, + "completion_length": 711.285758972168, + "epoch": 0.06272095517932932, + "grad_norm": 0.08084763586521149, + "kl": 0.022125244140625, + "learning_rate": 7.962679308765981e-06, + "loss": 0.0083, + "num_tokens": 40020440.0, + "reward": 0.7886354178190231, + "reward_std": 0.23248260468244553, + "rewards/code_reward": 0.6386353895068169, + "rewards/format_reward": 1.5, + "step": 1681 + }, + { + "clip_ratio": 0.002766440564300865, + "epoch": 0.06275826687188098, + "grad_norm": 0.07245895266532898, + "kl": 0.0220947265625, + "learning_rate": 7.960239150699534e-06, + "loss": 0.0081, + "step": 1682 + }, + { + "clip_ratio": 0.0027103080647066236, + "epoch": 0.06279557856443263, + "grad_norm": 0.0697702094912529, + "kl": 0.022064208984375, + "learning_rate": 7.95779796036244e-06, + "loss": 0.008, + "step": 1683 + }, + { + "clip_ratio": 0.004342648375313729, + "completion_length": 583.2142944335938, + "epoch": 0.06283289025698428, + "grad_norm": 0.09835421293973923, + "kl": 0.043731689453125, + "learning_rate": 7.95535573877898e-06, + "loss": 0.0027, + "num_tokens": 40080694.0, + "reward": 0.6505546197295189, + "reward_std": 0.3584822230041027, + "rewards/code_reward": 0.5005546323955059, + "rewards/format_reward": 1.5, + "step": 1684 + }, + { + "clip_ratio": 0.004220591101329774, + "epoch": 0.06287020194953594, + "grad_norm": 0.09913326799869537, + "kl": 0.043182373046875, + "learning_rate": 7.952912486973859e-06, + "loss": 0.0025, + "step": 1685 + }, + { + "clip_ratio": 0.004571093886625022, + "epoch": 0.06290751364208759, + "grad_norm": 0.09260256588459015, + "kl": 0.03363037109375, + "learning_rate": 7.950468205972221e-06, + "loss": 0.002, + "step": 1686 + }, + { + "clip_ratio": 0.0040361895225942135, + "completion_length": 734.8214645385742, + "epoch": 0.06294482533463924, + "grad_norm": 0.0795912891626358, + "kl": 0.024932861328125, + "learning_rate": 7.948022896799646e-06, + "loss": 0.0656, + "num_tokens": 40157124.0, + "reward": 0.5115528367459774, + "reward_std": 0.17022179113700986, + "rewards/code_reward": 0.36423139995895326, + "rewards/format_reward": 1.4732142984867096, + "step": 1687 + }, + { + "clip_ratio": 0.003631629340816289, + "epoch": 0.0629821370271909, + "grad_norm": 0.07427388429641724, + "kl": 0.0242919921875, + "learning_rate": 7.945576560482135e-06, + "loss": 0.0654, + "step": 1688 + }, + { + "clip_ratio": 0.0034238153602927923, + "epoch": 0.06301944871974255, + "grad_norm": 0.07012948393821716, + "kl": 0.02459716796875, + "learning_rate": 7.943129198046128e-06, + "loss": 0.0648, + "step": 1689 + }, + { + "clip_ratio": 0.0028756303945556283, + "completion_length": 626.3214569091797, + "epoch": 0.0630567604122942, + "grad_norm": 0.07922284305095673, + "kl": 0.025482177734375, + "learning_rate": 7.94068081051849e-06, + "loss": 0.0097, + "num_tokens": 40228206.0, + "reward": 0.736973263323307, + "reward_std": 0.25641893595457077, + "rewards/code_reward": 0.5869732834398746, + "rewards/format_reward": 1.5, + "step": 1690 + }, + { + "clip_ratio": 0.0028047889936715364, + "epoch": 0.06309407210484586, + "grad_norm": 0.07681523263454437, + "kl": 0.025787353515625, + "learning_rate": 7.938231398926524e-06, + "loss": 0.0098, + "step": 1691 + }, + { + "clip_ratio": 0.0031115083256736398, + "epoch": 0.06313138379739751, + "grad_norm": 0.09858463704586029, + "kl": 0.0264892578125, + "learning_rate": 7.935780964297952e-06, + "loss": 0.0094, + "step": 1692 + }, + { + "clip_ratio": 0.004735630005598068, + "completion_length": 681.3393249511719, + "epoch": 0.06316869548994916, + "grad_norm": 0.07745592296123505, + "kl": 0.021697998046875, + "learning_rate": 7.933329507660935e-06, + "loss": -0.0045, + "num_tokens": 40303741.0, + "reward": 0.6017374508082867, + "reward_std": 0.24661973025649786, + "rewards/code_reward": 0.45441602170467377, + "rewards/format_reward": 1.4732142984867096, + "step": 1693 + }, + { + "clip_ratio": 0.004213473410345614, + "epoch": 0.06320600718250081, + "grad_norm": 0.08155640214681625, + "kl": 0.02313232421875, + "learning_rate": 7.930877030044058e-06, + "loss": -0.0046, + "step": 1694 + }, + { + "clip_ratio": 0.004636735422536731, + "epoch": 0.06324331887505247, + "grad_norm": 0.07875591516494751, + "kl": 0.022674560546875, + "learning_rate": 7.928423532476332e-06, + "loss": -0.0049, + "step": 1695 + }, + { + "clip_ratio": 0.002488162717781961, + "completion_length": 527.8928909301758, + "epoch": 0.06328063056760412, + "grad_norm": 0.08961089700460434, + "kl": 0.0216522216796875, + "learning_rate": 7.925969015987202e-06, + "loss": 0.0079, + "num_tokens": 40360981.0, + "reward": 0.97971610724926, + "reward_std": 0.13424190878868103, + "rewards/code_reward": 0.8297161087393761, + "rewards/format_reward": 1.5, + "step": 1696 + }, + { + "clip_ratio": 0.002350923721678555, + "epoch": 0.06331794226015577, + "grad_norm": 0.07712612301111221, + "kl": 0.0220947265625, + "learning_rate": 7.923513481606535e-06, + "loss": 0.0078, + "step": 1697 + }, + { + "clip_ratio": 0.002323988825082779, + "epoch": 0.06335525395270743, + "grad_norm": 0.07054990530014038, + "kl": 0.022064208984375, + "learning_rate": 7.921056930364632e-06, + "loss": 0.0073, + "step": 1698 + }, + { + "clip_ratio": 0.0030587284127250314, + "completion_length": 650.9821624755859, + "epoch": 0.06339256564525908, + "grad_norm": 0.07826701551675797, + "kl": 0.019866943359375, + "learning_rate": 7.918599363292214e-06, + "loss": 0.0056, + "num_tokens": 40431414.0, + "reward": 0.9531765580177307, + "reward_std": 0.22249002009630203, + "rewards/code_reward": 0.8031765818595886, + "rewards/format_reward": 1.5, + "step": 1699 + }, + { + "clip_ratio": 0.0032113741035573184, + "epoch": 0.06342987733781073, + "grad_norm": 0.07995730638504028, + "kl": 0.0193939208984375, + "learning_rate": 7.916140781420428e-06, + "loss": 0.0055, + "step": 1700 + }, + { + "clip_ratio": 0.0026966589502990246, + "epoch": 0.06346718903036239, + "grad_norm": 0.0740559920668602, + "kl": 0.0202484130859375, + "learning_rate": 7.91368118578085e-06, + "loss": 0.0051, + "step": 1701 + }, + { + "clip_ratio": 0.004149332991801202, + "completion_length": 575.6071624755859, + "epoch": 0.06350450072291404, + "grad_norm": 0.09114234149456024, + "kl": 0.0198516845703125, + "learning_rate": 7.911220577405485e-06, + "loss": -0.0165, + "num_tokens": 40492992.0, + "reward": 0.3060414008796215, + "reward_std": 0.18922686763107777, + "rewards/code_reward": 0.15604139119386673, + "rewards/format_reward": 1.5, + "step": 1702 + }, + { + "clip_ratio": 0.004479059367440641, + "epoch": 0.06354181241546569, + "grad_norm": 0.07977958023548126, + "kl": 0.0205230712890625, + "learning_rate": 7.908758957326754e-06, + "loss": -0.0166, + "step": 1703 + }, + { + "clip_ratio": 0.004292303405236453, + "epoch": 0.06357912410801735, + "grad_norm": 0.09255650639533997, + "kl": 0.0201873779296875, + "learning_rate": 7.90629632657751e-06, + "loss": -0.0169, + "step": 1704 + }, + { + "clip_ratio": 0.002615749544929713, + "completion_length": 742.6428985595703, + "epoch": 0.063616435800569, + "grad_norm": 0.08569285273551941, + "kl": 0.0236053466796875, + "learning_rate": 7.903832686191026e-06, + "loss": -0.0206, + "num_tokens": 40573682.0, + "reward": 0.7758994810283184, + "reward_std": 0.10165299475193024, + "rewards/code_reward": 0.625899463891983, + "rewards/format_reward": 1.5, + "step": 1705 + }, + { + "clip_ratio": 0.0029751715483143926, + "epoch": 0.06365374749312065, + "grad_norm": 0.0556645430624485, + "kl": 0.019866943359375, + "learning_rate": 7.901368037200995e-06, + "loss": -0.0205, + "step": 1706 + }, + { + "clip_ratio": 0.002621763211209327, + "epoch": 0.0636910591856723, + "grad_norm": 0.05189719796180725, + "kl": 0.0196380615234375, + "learning_rate": 7.898902380641547e-06, + "loss": -0.0208, + "step": 1707 + }, + { + "clip_ratio": 0.0028323192382231355, + "completion_length": 639.3571701049805, + "epoch": 0.06372837087822396, + "grad_norm": 0.05529942363500595, + "kl": 0.023681640625, + "learning_rate": 7.89643571754722e-06, + "loss": 0.0048, + "num_tokens": 40652970.0, + "reward": 0.8048763833940029, + "reward_std": 0.06770746409893036, + "rewards/code_reward": 0.6548763662576675, + "rewards/format_reward": 1.5, + "step": 1708 + }, + { + "clip_ratio": 0.002817489323206246, + "epoch": 0.06376568257077561, + "grad_norm": 0.054101597517728806, + "kl": 0.023651123046875, + "learning_rate": 7.89396804895298e-06, + "loss": 0.0047, + "step": 1709 + }, + { + "clip_ratio": 0.0025327648036181927, + "epoch": 0.06380299426332726, + "grad_norm": 0.05002938210964203, + "kl": 0.023651123046875, + "learning_rate": 7.891499375894217e-06, + "loss": 0.0046, + "step": 1710 + }, + { + "clip_ratio": 0.0033931073267012835, + "completion_length": 561.428596496582, + "epoch": 0.06384030595587892, + "grad_norm": 0.08265974372625351, + "kl": 0.02520751953125, + "learning_rate": 7.889029699406739e-06, + "loss": -0.0173, + "num_tokens": 40714468.0, + "reward": 0.8011160716414452, + "reward_std": 0.18362649576738477, + "rewards/code_reward": 0.6511160619556904, + "rewards/format_reward": 1.5, + "step": 1711 + }, + { + "clip_ratio": 0.003138632280752063, + "epoch": 0.06387761764843057, + "grad_norm": 0.07976257801055908, + "kl": 0.02435302734375, + "learning_rate": 7.886559020526777e-06, + "loss": -0.0175, + "step": 1712 + }, + { + "clip_ratio": 0.0026043151738122106, + "epoch": 0.06391492934098222, + "grad_norm": 0.07570911198854446, + "kl": 0.024688720703125, + "learning_rate": 7.884087340290981e-06, + "loss": -0.018, + "step": 1713 + }, + { + "clip_ratio": 0.0029598167166113853, + "completion_length": 733.7857360839844, + "epoch": 0.06395224103353388, + "grad_norm": 0.07194045186042786, + "kl": 0.02178955078125, + "learning_rate": 7.881614659736425e-06, + "loss": 0.0165, + "num_tokens": 40788516.0, + "reward": 0.8988553434610367, + "reward_std": 0.2179737649857998, + "rewards/code_reward": 0.748855322599411, + "rewards/format_reward": 1.5, + "step": 1714 + }, + { + "clip_ratio": 0.0023532776394858956, + "epoch": 0.06398955272608553, + "grad_norm": 0.08663106709718704, + "kl": 0.0251312255859375, + "learning_rate": 7.879140979900594e-06, + "loss": 0.0162, + "step": 1715 + }, + { + "clip_ratio": 0.0023333175922743976, + "epoch": 0.0640268644186372, + "grad_norm": 0.07092786580324173, + "kl": 0.0277557373046875, + "learning_rate": 7.876666301821404e-06, + "loss": 0.016, + "step": 1716 + }, + { + "clip_ratio": 0.004939124104566872, + "completion_length": 717.5893096923828, + "epoch": 0.06406417611118885, + "grad_norm": 0.0848652645945549, + "kl": 0.0250244140625, + "learning_rate": 7.874190626537182e-06, + "loss": 0.0152, + "num_tokens": 40864583.0, + "reward": 0.5099748708307743, + "reward_std": 0.2006189627572894, + "rewards/code_reward": 0.3599748616106808, + "rewards/format_reward": 1.5, + "step": 1717 + }, + { + "clip_ratio": 0.0050440129125490785, + "epoch": 0.0641014878037405, + "grad_norm": 0.0834367573261261, + "kl": 0.024383544921875, + "learning_rate": 7.871713955086675e-06, + "loss": 0.015, + "step": 1718 + }, + { + "clip_ratio": 0.004818392277229577, + "epoch": 0.06413879949629216, + "grad_norm": 0.07319964468479156, + "kl": 0.02325439453125, + "learning_rate": 7.869236288509048e-06, + "loss": 0.0145, + "step": 1719 + }, + { + "clip_ratio": 0.0025630895397625864, + "completion_length": 532.6607437133789, + "epoch": 0.06417611118884381, + "grad_norm": 0.07005325704813004, + "kl": 0.02899169921875, + "learning_rate": 7.866757627843883e-06, + "loss": -0.0056, + "num_tokens": 40929006.0, + "reward": 0.9519891440868378, + "reward_std": 0.21372897922992706, + "rewards/code_reward": 0.8019891381263733, + "rewards/format_reward": 1.5, + "step": 1720 + }, + { + "clip_ratio": 0.002690241322852671, + "epoch": 0.06421342288139546, + "grad_norm": 0.06633561104536057, + "kl": 0.029754638671875, + "learning_rate": 7.864277974131184e-06, + "loss": -0.0057, + "step": 1721 + }, + { + "clip_ratio": 0.001905975106637925, + "epoch": 0.06425073457394712, + "grad_norm": 0.06459130346775055, + "kl": 0.0299072265625, + "learning_rate": 7.861797328411362e-06, + "loss": -0.0061, + "step": 1722 + }, + { + "clip_ratio": 0.004619207000359893, + "completion_length": 805.8214721679688, + "epoch": 0.06428804626649877, + "grad_norm": 0.04977060481905937, + "kl": 0.0185089111328125, + "learning_rate": 7.859315691725254e-06, + "loss": -0.0084, + "num_tokens": 40999346.0, + "reward": 0.21027876809239388, + "reward_std": 0.07769401837140322, + "rewards/code_reward": 0.06027874443680048, + "rewards/format_reward": 1.5, + "step": 1723 + }, + { + "clip_ratio": 0.004418315424118191, + "epoch": 0.06432535795905042, + "grad_norm": 0.04980326071381569, + "kl": 0.0183258056640625, + "learning_rate": 7.856833065114107e-06, + "loss": -0.0084, + "step": 1724 + }, + { + "clip_ratio": 0.004880479071289301, + "epoch": 0.06436266965160208, + "grad_norm": 0.04741090163588524, + "kl": 0.01824951171875, + "learning_rate": 7.854349449619586e-06, + "loss": -0.0086, + "step": 1725 + }, + { + "clip_ratio": 0.0026414847816340625, + "completion_length": 763.7678833007812, + "epoch": 0.06439998134415373, + "grad_norm": 0.08004157990217209, + "kl": 0.0244598388671875, + "learning_rate": 7.851864846283769e-06, + "loss": 0.0206, + "num_tokens": 41082633.0, + "reward": 0.8370147943496704, + "reward_std": 0.1177615374326706, + "rewards/code_reward": 0.6870148181915283, + "rewards/format_reward": 1.5, + "step": 1726 + }, + { + "clip_ratio": 0.0025629500159993768, + "epoch": 0.06443729303670538, + "grad_norm": 0.07818935811519623, + "kl": 0.025054931640625, + "learning_rate": 7.84937925614915e-06, + "loss": 0.0203, + "step": 1727 + }, + { + "clip_ratio": 0.0026443313690833747, + "epoch": 0.06447460472925703, + "grad_norm": 0.07158324122428894, + "kl": 0.0241546630859375, + "learning_rate": 7.846892680258637e-06, + "loss": 0.0199, + "step": 1728 + }, + { + "clip_ratio": 0.0032414698507636786, + "completion_length": 767.4643096923828, + "epoch": 0.06451191642180869, + "grad_norm": 0.08050180226564407, + "kl": 0.0203399658203125, + "learning_rate": 7.844405119655548e-06, + "loss": -0.0047, + "num_tokens": 41153877.0, + "reward": 0.7159471772611141, + "reward_std": 0.14745199563913047, + "rewards/code_reward": 0.5659471752587706, + "rewards/format_reward": 1.5, + "step": 1729 + }, + { + "clip_ratio": 0.0031240281532518566, + "epoch": 0.06454922811436034, + "grad_norm": 0.08529524505138397, + "kl": 0.0207061767578125, + "learning_rate": 7.841916575383622e-06, + "loss": -0.0047, + "step": 1730 + }, + { + "clip_ratio": 0.002825055562425405, + "epoch": 0.064586539806912, + "grad_norm": 0.06822271645069122, + "kl": 0.0201568603515625, + "learning_rate": 7.839427048487002e-06, + "loss": -0.0053, + "step": 1731 + }, + { + "clip_ratio": 0.0044126425054855645, + "completion_length": 570.928596496582, + "epoch": 0.06462385149946365, + "grad_norm": 0.0724129006266594, + "kl": 0.0227508544921875, + "learning_rate": 7.836936540010248e-06, + "loss": 0.0093, + "num_tokens": 41208417.0, + "reward": 0.5827731601893902, + "reward_std": 0.12241697404533625, + "rewards/code_reward": 0.4327731216326356, + "rewards/format_reward": 1.5, + "step": 1732 + }, + { + "clip_ratio": 0.003361605224199593, + "epoch": 0.0646611631920153, + "grad_norm": 0.08602968603372574, + "kl": 0.0233154296875, + "learning_rate": 7.834445050998335e-06, + "loss": 0.0089, + "step": 1733 + }, + { + "clip_ratio": 0.004277729312889278, + "epoch": 0.06469847488456695, + "grad_norm": 0.06434500217437744, + "kl": 0.0234832763671875, + "learning_rate": 7.831952582496637e-06, + "loss": 0.009, + "step": 1734 + }, + { + "clip_ratio": 0.0022362337331287563, + "completion_length": 699.5714569091797, + "epoch": 0.0647357865771186, + "grad_norm": 0.06006172299385071, + "kl": 0.018157958984375, + "learning_rate": 7.829459135550957e-06, + "loss": -0.0014, + "num_tokens": 41276181.0, + "reward": 0.8297052048146725, + "reward_std": 0.12144405487924814, + "rewards/code_reward": 0.6797052196925506, + "rewards/format_reward": 1.5, + "step": 1735 + }, + { + "clip_ratio": 0.002054299518931657, + "epoch": 0.06477309826967026, + "grad_norm": 0.05867690593004227, + "kl": 0.01849365234375, + "learning_rate": 7.826964711207494e-06, + "loss": -0.0015, + "step": 1736 + }, + { + "clip_ratio": 0.0017557345563545823, + "epoch": 0.06481040996222191, + "grad_norm": 0.05863317474722862, + "kl": 0.01824951171875, + "learning_rate": 7.82446931051286e-06, + "loss": -0.0017, + "step": 1737 + }, + { + "clip_ratio": 0.004566355026327074, + "completion_length": 629.2500228881836, + "epoch": 0.06484772165477357, + "grad_norm": 0.09001891314983368, + "kl": 0.033355712890625, + "learning_rate": 7.821972934514084e-06, + "loss": -0.005, + "num_tokens": 41350899.0, + "reward": 0.6069462634623051, + "reward_std": 0.11328242532908916, + "rewards/code_reward": 0.4569462686777115, + "rewards/format_reward": 1.5, + "step": 1738 + }, + { + "clip_ratio": 0.004610534000676125, + "epoch": 0.06488503334732522, + "grad_norm": 0.08168256282806396, + "kl": 0.0327911376953125, + "learning_rate": 7.819475584258597e-06, + "loss": -0.0051, + "step": 1739 + }, + { + "clip_ratio": 0.0041666540782898664, + "epoch": 0.06492234503987687, + "grad_norm": 0.0850692093372345, + "kl": 0.032562255859375, + "learning_rate": 7.816977260794242e-06, + "loss": -0.0055, + "step": 1740 + }, + { + "clip_ratio": 0.004390174639411271, + "completion_length": 687.1250305175781, + "epoch": 0.06495965673242853, + "grad_norm": 0.085722416639328, + "kl": 0.0200347900390625, + "learning_rate": 7.814477965169264e-06, + "loss": 0.0017, + "num_tokens": 41419254.0, + "reward": 0.5704389996826649, + "reward_std": 0.12799024395644665, + "rewards/code_reward": 0.4204389937222004, + "rewards/format_reward": 1.5, + "step": 1741 + }, + { + "clip_ratio": 0.004268302523996681, + "epoch": 0.06499696842498018, + "grad_norm": 0.0811707153916359, + "kl": 0.0203857421875, + "learning_rate": 7.811977698432327e-06, + "loss": 0.0015, + "step": 1742 + }, + { + "clip_ratio": 0.0041286893538199365, + "epoch": 0.06503428011753183, + "grad_norm": 0.07586263120174408, + "kl": 0.021728515625, + "learning_rate": 7.809476461632493e-06, + "loss": 0.0009, + "step": 1743 + }, + { + "clip_ratio": 0.0033344003604725003, + "completion_length": 549.0714569091797, + "epoch": 0.06507159181008348, + "grad_norm": 0.08387462794780731, + "kl": 0.02154541015625, + "learning_rate": 7.806974255819236e-06, + "loss": 0.0016, + "num_tokens": 41476252.0, + "reward": 0.9663557857275009, + "reward_std": 0.0758720375597477, + "rewards/code_reward": 0.8163557946681976, + "rewards/format_reward": 1.5, + "step": 1744 + }, + { + "clip_ratio": 0.0032539668027311563, + "epoch": 0.06510890350263514, + "grad_norm": 0.08188910037279129, + "kl": 0.022247314453125, + "learning_rate": 7.804471082042436e-06, + "loss": 0.0018, + "step": 1745 + }, + { + "clip_ratio": 0.0029547287267632782, + "epoch": 0.06514621519518679, + "grad_norm": 0.0759081095457077, + "kl": 0.02203369140625, + "learning_rate": 7.801966941352374e-06, + "loss": 0.0012, + "step": 1746 + }, + { + "clip_ratio": 0.002506556047592312, + "completion_length": 509.16075134277344, + "epoch": 0.06518352688773844, + "grad_norm": 0.06380236893892288, + "kl": 0.0369415283203125, + "learning_rate": 7.799461834799746e-06, + "loss": 0.011, + "num_tokens": 41532793.0, + "reward": 0.7571428529918194, + "reward_std": 0.1283881515264511, + "rewards/code_reward": 0.6071428582072258, + "rewards/format_reward": 1.5, + "step": 1747 + }, + { + "clip_ratio": 0.002374905743636191, + "epoch": 0.0652208385802901, + "grad_norm": 0.06205475702881813, + "kl": 0.0379486083984375, + "learning_rate": 7.796955763435642e-06, + "loss": 0.0108, + "step": 1748 + }, + { + "clip_ratio": 0.002631044713780284, + "epoch": 0.06525815027284175, + "grad_norm": 0.05252491682767868, + "kl": 0.0350189208984375, + "learning_rate": 7.794448728311572e-06, + "loss": 0.0107, + "step": 1749 + }, + { + "clip_ratio": 0.0031785957980901003, + "completion_length": 733.8214569091797, + "epoch": 0.0652954619653934, + "grad_norm": 0.06653374433517456, + "kl": 0.024932861328125, + "learning_rate": 7.791940730479435e-06, + "loss": -0.0027, + "num_tokens": 41600711.0, + "reward": 0.6289286240935326, + "reward_std": 0.04012607503682375, + "rewards/code_reward": 0.4789286144077778, + "rewards/format_reward": 1.5, + "step": 1750 + }, + { + "clip_ratio": 0.003183325403369963, + "epoch": 0.06533277365794506, + "grad_norm": 0.06619340181350708, + "kl": 0.024871826171875, + "learning_rate": 7.78943177099154e-06, + "loss": -0.0027, + "step": 1751 + }, + { + "clip_ratio": 0.0029050021548755467, + "epoch": 0.06537008535049671, + "grad_norm": 0.0609198622405529, + "kl": 0.024688720703125, + "learning_rate": 7.786921850900602e-06, + "loss": -0.0032, + "step": 1752 + }, + { + "clip_ratio": 0.003280701639596373, + "completion_length": 722.5000305175781, + "epoch": 0.06540739704304836, + "grad_norm": 0.07852388918399811, + "kl": 0.02197265625, + "learning_rate": 7.784410971259736e-06, + "loss": 0.008, + "num_tokens": 41680973.0, + "reward": 0.7447296530008316, + "reward_std": 0.2414987776428461, + "rewards/code_reward": 0.594729620963335, + "rewards/format_reward": 1.5, + "step": 1753 + }, + { + "clip_ratio": 0.0036491461214609444, + "epoch": 0.06544470873560002, + "grad_norm": 0.07679404318332672, + "kl": 0.022613525390625, + "learning_rate": 7.781899133122462e-06, + "loss": 0.0079, + "step": 1754 + }, + { + "clip_ratio": 0.0028558908961713314, + "epoch": 0.06548202042815167, + "grad_norm": 0.07264018803834915, + "kl": 0.02227783203125, + "learning_rate": 7.7793863375427e-06, + "loss": 0.0073, + "step": 1755 + }, + { + "clip_ratio": 0.00298546435078606, + "completion_length": 452.7857360839844, + "epoch": 0.06551933212070332, + "grad_norm": 0.09804187715053558, + "kl": 0.0171966552734375, + "learning_rate": 7.77687258557477e-06, + "loss": 0.0075, + "num_tokens": 41726111.0, + "reward": 0.6761793307960033, + "reward_std": 0.05466401274316013, + "rewards/code_reward": 0.5261793322861195, + "rewards/format_reward": 1.5, + "step": 1756 + }, + { + "clip_ratio": 0.002989925560541451, + "epoch": 0.06555664381325498, + "grad_norm": 0.07830090820789337, + "kl": 0.017059326171875, + "learning_rate": 7.774357878273396e-06, + "loss": 0.0071, + "step": 1757 + }, + { + "clip_ratio": 0.002690827182959765, + "epoch": 0.06559395550580663, + "grad_norm": 0.0619870163500309, + "kl": 0.0168609619140625, + "learning_rate": 7.771842216693707e-06, + "loss": 0.0067, + "step": 1758 + }, + { + "clip_ratio": 0.001528427324956283, + "completion_length": 455.30359649658203, + "epoch": 0.06563126719835828, + "grad_norm": 0.07229357212781906, + "kl": 0.02459716796875, + "learning_rate": 7.769325601891223e-06, + "loss": -0.0101, + "num_tokens": 41780154.0, + "reward": 1.0358203500509262, + "reward_std": 0.09063149616122246, + "rewards/code_reward": 0.8858203738927841, + "rewards/format_reward": 1.5, + "step": 1759 + }, + { + "clip_ratio": 0.0015853845397941768, + "epoch": 0.06566857889090993, + "grad_norm": 0.06849514693021774, + "kl": 0.025177001953125, + "learning_rate": 7.76680803492187e-06, + "loss": -0.0103, + "step": 1760 + }, + { + "clip_ratio": 0.001985928858630359, + "epoch": 0.06570589058346159, + "grad_norm": 0.062456436455249786, + "kl": 0.025848388671875, + "learning_rate": 7.764289516841974e-06, + "loss": -0.0107, + "step": 1761 + }, + { + "clip_ratio": 0.004450974927749485, + "completion_length": 672.6250228881836, + "epoch": 0.06574320227601324, + "grad_norm": 0.02974986657500267, + "kl": 0.01397705078125, + "learning_rate": 7.761770048708254e-06, + "loss": -0.011, + "num_tokens": 41843539.0, + "reward": 0.15312068536877632, + "reward_std": 0.0024138125590980053, + "rewards/code_reward": 0.003120665904134512, + "rewards/format_reward": 1.5, + "step": 1762 + }, + { + "clip_ratio": 0.004456299357116222, + "epoch": 0.0657805139685649, + "grad_norm": 0.028335528448224068, + "kl": 0.01361083984375, + "learning_rate": 7.759249631577841e-06, + "loss": -0.011, + "step": 1763 + }, + { + "clip_ratio": 0.004737455979920924, + "epoch": 0.06581782566111655, + "grad_norm": 0.029749423265457153, + "kl": 0.0136260986328125, + "learning_rate": 7.756728266508244e-06, + "loss": -0.0111, + "step": 1764 + }, + { + "clip_ratio": 0.0019724367884919047, + "completion_length": 593.2321701049805, + "epoch": 0.0658551373536682, + "grad_norm": 0.07552271336317062, + "kl": 0.024932861328125, + "learning_rate": 7.75420595455739e-06, + "loss": 0.0063, + "num_tokens": 41910822.0, + "reward": 0.9191677421331406, + "reward_std": 0.09631205350160599, + "rewards/code_reward": 0.7691677436232567, + "rewards/format_reward": 1.5, + "step": 1765 + }, + { + "clip_ratio": 0.002068153233267367, + "epoch": 0.06589244904621985, + "grad_norm": 0.12690415978431702, + "kl": 0.02471923828125, + "learning_rate": 7.751682696783588e-06, + "loss": 0.0063, + "step": 1766 + }, + { + "clip_ratio": 0.002114555798470974, + "epoch": 0.0659297607387715, + "grad_norm": 0.06411364674568176, + "kl": 0.02484130859375, + "learning_rate": 7.749158494245555e-06, + "loss": 0.006, + "step": 1767 + }, + { + "clip_ratio": 0.0034312193747609854, + "completion_length": 615.5535888671875, + "epoch": 0.06596707243132316, + "grad_norm": 0.0833631232380867, + "kl": 0.02142333984375, + "learning_rate": 7.746633348002395e-06, + "loss": -0.0057, + "num_tokens": 41977787.0, + "reward": 0.7474408000707626, + "reward_std": 0.15242088469676673, + "rewards/code_reward": 0.5974407736212015, + "rewards/format_reward": 1.5, + "step": 1768 + }, + { + "clip_ratio": 0.003437241422943771, + "epoch": 0.06600438412387481, + "grad_norm": 0.07484552264213562, + "kl": 0.02093505859375, + "learning_rate": 7.744107259113616e-06, + "loss": -0.0059, + "step": 1769 + }, + { + "clip_ratio": 0.002624260087031871, + "epoch": 0.06604169581642648, + "grad_norm": 0.07450208067893982, + "kl": 0.021026611328125, + "learning_rate": 7.741580228639118e-06, + "loss": -0.0064, + "step": 1770 + }, + { + "clip_ratio": 0.003952873754315078, + "completion_length": 737.2321624755859, + "epoch": 0.06607900750897813, + "grad_norm": 0.07510177046060562, + "kl": 0.0149993896484375, + "learning_rate": 7.739052257639193e-06, + "loss": -0.0053, + "num_tokens": 42042396.0, + "reward": 0.684880968183279, + "reward_std": 0.17331849038600922, + "rewards/code_reward": 0.5348809584975243, + "rewards/format_reward": 1.5, + "step": 1771 + }, + { + "clip_ratio": 0.004012248187791556, + "epoch": 0.06611631920152979, + "grad_norm": 0.06780865043401718, + "kl": 0.0152435302734375, + "learning_rate": 7.736523347174533e-06, + "loss": -0.0055, + "step": 1772 + }, + { + "clip_ratio": 0.0035028468701057136, + "epoch": 0.06615363089408144, + "grad_norm": 0.06691618263721466, + "kl": 0.015655517578125, + "learning_rate": 7.73399349830622e-06, + "loss": -0.0061, + "step": 1773 + }, + { + "clip_ratio": 0.004093696770723909, + "completion_length": 686.5536117553711, + "epoch": 0.06619094258663309, + "grad_norm": 0.09247713536024094, + "kl": 0.022552490234375, + "learning_rate": 7.731462712095735e-06, + "loss": 0.0058, + "num_tokens": 42114093.0, + "reward": 0.4757889434695244, + "reward_std": 0.2883440963923931, + "rewards/code_reward": 0.32578892447054386, + "rewards/format_reward": 1.5, + "step": 1774 + }, + { + "clip_ratio": 0.004160707117989659, + "epoch": 0.06622825427918475, + "grad_norm": 0.08995673805475235, + "kl": 0.0233154296875, + "learning_rate": 7.728930989604945e-06, + "loss": 0.0057, + "step": 1775 + }, + { + "clip_ratio": 0.004276876512449235, + "epoch": 0.0662655659717364, + "grad_norm": 0.09018738567829132, + "kl": 0.022613525390625, + "learning_rate": 7.726398331896115e-06, + "loss": 0.0053, + "step": 1776 + }, + { + "clip_ratio": 0.005128921591676772, + "completion_length": 569.2857437133789, + "epoch": 0.06630287766428805, + "grad_norm": 0.10511496663093567, + "kl": 0.026702880859375, + "learning_rate": 7.723864740031902e-06, + "loss": 0.0213, + "num_tokens": 42170865.0, + "reward": 0.545577097684145, + "reward_std": 0.22739433031529188, + "rewards/code_reward": 0.39557707763742656, + "rewards/format_reward": 1.5, + "step": 1777 + }, + { + "clip_ratio": 0.00449029728770256, + "epoch": 0.0663401893568397, + "grad_norm": 0.1003253385424614, + "kl": 0.025604248046875, + "learning_rate": 7.721330215075356e-06, + "loss": 0.0205, + "step": 1778 + }, + { + "clip_ratio": 0.004943897598423064, + "epoch": 0.06637750104939136, + "grad_norm": 0.08583760261535645, + "kl": 0.0252685546875, + "learning_rate": 7.718794758089912e-06, + "loss": 0.0198, + "step": 1779 + }, + { + "clip_ratio": 0.0033611629623919725, + "completion_length": 665.5714721679688, + "epoch": 0.06641481274194301, + "grad_norm": 0.09179621189832687, + "kl": 0.02880859375, + "learning_rate": 7.716258370139403e-06, + "loss": 0.0163, + "num_tokens": 42237145.0, + "reward": 0.8070229738950729, + "reward_std": 0.05324483383446932, + "rewards/code_reward": 0.6570229753851891, + "rewards/format_reward": 1.5, + "step": 1780 + }, + { + "clip_ratio": 0.003095240506809205, + "epoch": 0.06645212443449466, + "grad_norm": 0.07715875655412674, + "kl": 0.028961181640625, + "learning_rate": 7.71372105228805e-06, + "loss": 0.016, + "step": 1781 + }, + { + "clip_ratio": 0.0032126157311722636, + "epoch": 0.06648943612704632, + "grad_norm": 0.07974278181791306, + "kl": 0.0283050537109375, + "learning_rate": 7.711182805600465e-06, + "loss": 0.0158, + "step": 1782 + }, + { + "clip_ratio": 0.003997117397375405, + "completion_length": 686.0178909301758, + "epoch": 0.06652674781959797, + "grad_norm": 0.0770910382270813, + "kl": 0.0196533203125, + "learning_rate": 7.708643631141649e-06, + "loss": 0.0129, + "num_tokens": 42305284.0, + "reward": 0.5030812472105026, + "reward_std": 0.11379368789494038, + "rewards/code_reward": 0.3530812356621027, + "rewards/format_reward": 1.5, + "step": 1783 + }, + { + "clip_ratio": 0.0038459748029708862, + "epoch": 0.06656405951214962, + "grad_norm": 0.08431559801101685, + "kl": 0.0203857421875, + "learning_rate": 7.706103529976993e-06, + "loss": 0.0127, + "step": 1784 + }, + { + "clip_ratio": 0.0032511656172573566, + "epoch": 0.06660137120470128, + "grad_norm": 0.0756567046046257, + "kl": 0.0203857421875, + "learning_rate": 7.703562503172273e-06, + "loss": 0.0123, + "step": 1785 + }, + { + "clip_ratio": 0.0038541440153494477, + "completion_length": 615.8928909301758, + "epoch": 0.06663868289725293, + "grad_norm": 0.10650145262479782, + "kl": 0.020843505859375, + "learning_rate": 7.701020551793662e-06, + "loss": -0.0041, + "num_tokens": 42366850.0, + "reward": 0.6742817834019661, + "reward_std": 0.2390694934874773, + "rewards/code_reward": 0.5242817532271147, + "rewards/format_reward": 1.5, + "step": 1786 + }, + { + "clip_ratio": 0.00427454593591392, + "epoch": 0.06667599458980458, + "grad_norm": 0.09205096960067749, + "kl": 0.02142333984375, + "learning_rate": 7.698477676907711e-06, + "loss": -0.0044, + "step": 1787 + }, + { + "clip_ratio": 0.003440388769377023, + "epoch": 0.06671330628235624, + "grad_norm": 0.0863930732011795, + "kl": 0.021209716796875, + "learning_rate": 7.695933879581366e-06, + "loss": -0.0051, + "step": 1788 + }, + { + "clip_ratio": 0.002546081697801128, + "completion_length": 664.1071701049805, + "epoch": 0.06675061797490789, + "grad_norm": 0.05918009579181671, + "kl": 0.02423095703125, + "learning_rate": 7.693389160881955e-06, + "loss": -0.005, + "num_tokens": 42429590.0, + "reward": 0.7109453864395618, + "reward_std": 0.04343477636575699, + "rewards/code_reward": 0.5609453767538071, + "rewards/format_reward": 1.5, + "step": 1789 + }, + { + "clip_ratio": 0.0030477921827696264, + "epoch": 0.06678792966745954, + "grad_norm": 0.06008215993642807, + "kl": 0.0233612060546875, + "learning_rate": 7.690843521877194e-06, + "loss": -0.0049, + "step": 1790 + }, + { + "clip_ratio": 0.002552852442022413, + "epoch": 0.0668252413600112, + "grad_norm": 0.057248473167419434, + "kl": 0.024505615234375, + "learning_rate": 7.688296963635189e-06, + "loss": -0.0052, + "step": 1791 + }, + { + "clip_ratio": 0.003175390709657222, + "completion_length": 650.7857360839844, + "epoch": 0.06686255305256285, + "grad_norm": 0.09069722890853882, + "kl": 0.01727294921875, + "learning_rate": 7.685749487224428e-06, + "loss": -0.0134, + "num_tokens": 42501384.0, + "reward": 0.6884987875819206, + "reward_std": 0.22549138218164444, + "rewards/code_reward": 0.5384987872093916, + "rewards/format_reward": 1.5, + "step": 1792 + }, + { + "clip_ratio": 0.0026870170258916914, + "epoch": 0.0668998647451145, + "grad_norm": 0.08450327068567276, + "kl": 0.01751708984375, + "learning_rate": 7.683201093713783e-06, + "loss": -0.014, + "step": 1793 + }, + { + "clip_ratio": 0.003315431938972324, + "epoch": 0.06693717643766615, + "grad_norm": 0.0745653361082077, + "kl": 0.017852783203125, + "learning_rate": 7.680651784172511e-06, + "loss": -0.0141, + "step": 1794 + }, + { + "clip_ratio": 0.0046046500210650265, + "completion_length": 686.5178909301758, + "epoch": 0.06697448813021781, + "grad_norm": 0.09942453354597092, + "kl": 0.023895263671875, + "learning_rate": 7.678101559670259e-06, + "loss": 0.0047, + "num_tokens": 42579715.0, + "reward": 0.29852933436632156, + "reward_std": 0.1771911396062933, + "rewards/code_reward": 0.14852933368820231, + "rewards/format_reward": 1.5, + "step": 1795 + }, + { + "clip_ratio": 0.005120194051414728, + "epoch": 0.06701179982276946, + "grad_norm": 0.07838843017816544, + "kl": 0.023956298828125, + "learning_rate": 7.675550421277052e-06, + "loss": 0.0046, + "step": 1796 + }, + { + "clip_ratio": 0.004095554526429623, + "epoch": 0.06704911151532111, + "grad_norm": 0.21034230291843414, + "kl": 0.0250091552734375, + "learning_rate": 7.672998370063301e-06, + "loss": 0.0041, + "step": 1797 + }, + { + "clip_ratio": 0.004349512862972915, + "completion_length": 882.9464721679688, + "epoch": 0.06708642320787277, + "grad_norm": 0.06602897495031357, + "kl": 0.025360107421875, + "learning_rate": 7.6704454070998e-06, + "loss": 0.0025, + "num_tokens": 42673512.0, + "reward": 0.17925507947802544, + "reward_std": 0.060625605285167694, + "rewards/code_reward": 0.03104078397154808, + "rewards/format_reward": 1.4821428656578064, + "step": 1798 + }, + { + "clip_ratio": 0.004760652780532837, + "epoch": 0.06712373490042442, + "grad_norm": 0.07028081268072128, + "kl": 0.02471923828125, + "learning_rate": 7.667891533457718e-06, + "loss": 0.0025, + "step": 1799 + }, + { + "clip_ratio": 0.004369884554762393, + "epoch": 0.06716104659297607, + "grad_norm": 0.0656822919845581, + "kl": 0.0250244140625, + "learning_rate": 7.665336750208624e-06, + "loss": 0.0024, + "step": 1800 + }, + { + "clip_ratio": 0.0008127439650706947, + "completion_length": 375.5893020629883, + "epoch": 0.06719835828552773, + "grad_norm": 0.04239721968770027, + "kl": 0.022064208984375, + "learning_rate": 7.662781058424448e-06, + "loss": -0.0031, + "num_tokens": 42714859.0, + "reward": 1.0038960874080658, + "reward_std": 0.11301025748252869, + "rewards/code_reward": 0.8538961037993431, + "rewards/format_reward": 1.5, + "step": 1801 + }, + { + "clip_ratio": 0.0007220918196253479, + "epoch": 0.06723566997807938, + "grad_norm": 0.0385536290705204, + "kl": 0.0225830078125, + "learning_rate": 7.660224459177518e-06, + "loss": -0.0032, + "step": 1802 + }, + { + "clip_ratio": 0.000879107101354748, + "epoch": 0.06727298167063103, + "grad_norm": 0.03956189379096031, + "kl": 0.02276611328125, + "learning_rate": 7.65766695354053e-06, + "loss": -0.0031, + "step": 1803 + }, + { + "clip_ratio": 0.003508158610202372, + "completion_length": 836.7143249511719, + "epoch": 0.06731029336318269, + "grad_norm": 0.06361272931098938, + "kl": 0.0243682861328125, + "learning_rate": 7.655108542586569e-06, + "loss": -0.0033, + "num_tokens": 42797611.0, + "reward": 0.5789530239999294, + "reward_std": 0.1544178519397974, + "rewards/code_reward": 0.42895299941301346, + "rewards/format_reward": 1.5, + "step": 1804 + }, + { + "clip_ratio": 0.004014591337181628, + "epoch": 0.06734760505573434, + "grad_norm": 0.066889688372612, + "kl": 0.0245361328125, + "learning_rate": 7.652549227389097e-06, + "loss": -0.0034, + "step": 1805 + }, + { + "clip_ratio": 0.003523796971421689, + "epoch": 0.06738491674828599, + "grad_norm": 0.06353912502527237, + "kl": 0.0244598388671875, + "learning_rate": 7.649989009021954e-06, + "loss": -0.0036, + "step": 1806 + }, + { + "clip_ratio": 0.00422074826201424, + "completion_length": 622.3928909301758, + "epoch": 0.06742222844083764, + "grad_norm": 0.07086041569709778, + "kl": 0.022064208984375, + "learning_rate": 7.647427888559359e-06, + "loss": -0.0002, + "num_tokens": 42857553.0, + "reward": 0.3486533761024475, + "reward_std": 0.18810723046772182, + "rewards/code_reward": 0.19865335640497506, + "rewards/format_reward": 1.5, + "step": 1807 + }, + { + "clip_ratio": 0.004197214962914586, + "epoch": 0.0674595401333893, + "grad_norm": 0.11475314944982529, + "kl": 0.0226593017578125, + "learning_rate": 7.644865867075913e-06, + "loss": -0.0001, + "step": 1808 + }, + { + "clip_ratio": 0.004241318674758077, + "epoch": 0.06749685182594095, + "grad_norm": 0.0649724006652832, + "kl": 0.022369384765625, + "learning_rate": 7.64230294564659e-06, + "loss": -0.0007, + "step": 1809 + }, + { + "clip_ratio": 0.004357471247203648, + "completion_length": 703.6250152587891, + "epoch": 0.0675341635184926, + "grad_norm": 0.4570963680744171, + "kl": 0.165924072265625, + "learning_rate": 7.639739125346745e-06, + "loss": -0.0022, + "num_tokens": 42935134.0, + "reward": 0.6146536506712437, + "reward_std": 0.1401793211698532, + "rewards/code_reward": 0.4646536335349083, + "rewards/format_reward": 1.5, + "step": 1810 + }, + { + "clip_ratio": 0.004865562659688294, + "epoch": 0.06757147521104426, + "grad_norm": 0.07051994651556015, + "kl": 0.035247802734375, + "learning_rate": 7.637174407252112e-06, + "loss": -0.0035, + "step": 1811 + }, + { + "clip_ratio": 0.004853724851273, + "epoch": 0.06760878690359591, + "grad_norm": 0.06518469005823135, + "kl": 0.03228759765625, + "learning_rate": 7.634608792438794e-06, + "loss": -0.0039, + "step": 1812 + }, + { + "clip_ratio": 0.0025386103661730886, + "completion_length": 532.053596496582, + "epoch": 0.06764609859614756, + "grad_norm": 0.0043950993567705154, + "kl": 0.026519775390625, + "learning_rate": 7.63204228198328e-06, + "loss": 0.0003, + "num_tokens": 42996527.0, + "reward": 0.6499999985098839, + "reward_std": 0.0, + "rewards/code_reward": 0.5, + "rewards/format_reward": 1.5, + "step": 1813 + }, + { + "clip_ratio": 0.0029514814959838986, + "epoch": 0.06768341028869922, + "grad_norm": 0.004371731076389551, + "kl": 0.026336669921875, + "learning_rate": 7.629474876962429e-06, + "loss": 0.0003, + "step": 1814 + }, + { + "clip_ratio": 0.0032529812306165695, + "epoch": 0.06772072198125087, + "grad_norm": 0.0037224129773676395, + "kl": 0.0247802734375, + "learning_rate": 7.626906578453476e-06, + "loss": 0.0003, + "step": 1815 + }, + { + "clip_ratio": 0.0020333861466497183, + "completion_length": 597.5893020629883, + "epoch": 0.06775803367380252, + "grad_norm": 0.06117049977183342, + "kl": 0.0230712890625, + "learning_rate": 7.624337387534029e-06, + "loss": 0.0071, + "num_tokens": 43058356.0, + "reward": 0.721718605607748, + "reward_std": 0.11895932257175446, + "rewards/code_reward": 0.571718612452969, + "rewards/format_reward": 1.5, + "step": 1816 + }, + { + "clip_ratio": 0.0019798357971012592, + "epoch": 0.06779534536635418, + "grad_norm": 0.056870706379413605, + "kl": 0.02288818359375, + "learning_rate": 7.621767305282077e-06, + "loss": 0.007, + "step": 1817 + }, + { + "clip_ratio": 0.002162992430385202, + "epoch": 0.06783265705890583, + "grad_norm": 0.05252622812986374, + "kl": 0.0230712890625, + "learning_rate": 7.619196332775976e-06, + "loss": 0.0068, + "step": 1818 + }, + { + "clip_ratio": 0.002134671784006059, + "completion_length": 537.0536041259766, + "epoch": 0.06786996875145748, + "grad_norm": 0.06688041985034943, + "kl": 0.025543212890625, + "learning_rate": 7.616624471094459e-06, + "loss": 0.0033, + "num_tokens": 43122745.0, + "reward": 0.861627846956253, + "reward_std": 0.19008132070302963, + "rewards/code_reward": 0.7116278186440468, + "rewards/format_reward": 1.5, + "step": 1819 + }, + { + "clip_ratio": 0.0020877752685919404, + "epoch": 0.06790728044400914, + "grad_norm": 0.06412360817193985, + "kl": 0.0247802734375, + "learning_rate": 7.614051721316631e-06, + "loss": 0.003, + "step": 1820 + }, + { + "clip_ratio": 0.002119241515174508, + "epoch": 0.06794459213656079, + "grad_norm": 0.06289082765579224, + "kl": 0.024871826171875, + "learning_rate": 7.611478084521972e-06, + "loss": 0.0029, + "step": 1821 + }, + { + "clip_ratio": 0.0045931460335850716, + "completion_length": 556.8393096923828, + "epoch": 0.06798190382911244, + "grad_norm": 0.08444079011678696, + "kl": 0.022064208984375, + "learning_rate": 7.608903561790331e-06, + "loss": -0.0047, + "num_tokens": 43174620.0, + "reward": 0.48723458126187325, + "reward_std": 0.156945638358593, + "rewards/code_reward": 0.3372345566749573, + "rewards/format_reward": 1.5, + "step": 1822 + }, + { + "clip_ratio": 0.004048091999720782, + "epoch": 0.0680192155216641, + "grad_norm": 0.08404603600502014, + "kl": 0.021881103515625, + "learning_rate": 7.6063281542019265e-06, + "loss": -0.0048, + "step": 1823 + }, + { + "clip_ratio": 0.004465762118343264, + "epoch": 0.06805652721421576, + "grad_norm": 0.07990040630102158, + "kl": 0.02239990234375, + "learning_rate": 7.603751862837357e-06, + "loss": -0.0053, + "step": 1824 + }, + { + "clip_ratio": 0.0040215798653662205, + "completion_length": 680.2500457763672, + "epoch": 0.06809383890676741, + "grad_norm": 0.08065181225538254, + "kl": 0.0238037109375, + "learning_rate": 7.601174688777583e-06, + "loss": -0.0078, + "num_tokens": 43249366.0, + "reward": 0.6533240638673306, + "reward_std": 0.09066157042980194, + "rewards/code_reward": 0.5033240709453821, + "rewards/format_reward": 1.5, + "step": 1825 + }, + { + "clip_ratio": 0.004367640241980553, + "epoch": 0.06813115059931907, + "grad_norm": 0.06935954838991165, + "kl": 0.02337646484375, + "learning_rate": 7.5985966331039415e-06, + "loss": -0.0078, + "step": 1826 + }, + { + "clip_ratio": 0.003956733969971538, + "epoch": 0.06816846229187072, + "grad_norm": 0.06032222881913185, + "kl": 0.023590087890625, + "learning_rate": 7.596017696898134e-06, + "loss": -0.0081, + "step": 1827 + }, + { + "clip_ratio": 0.003308865358121693, + "completion_length": 669.3214492797852, + "epoch": 0.06820577398442237, + "grad_norm": 0.08495116978883743, + "kl": 0.030242919921875, + "learning_rate": 7.5934378812422375e-06, + "loss": 0.016, + "num_tokens": 43324606.0, + "reward": 0.6676085442304611, + "reward_std": 0.13666736870072782, + "rewards/code_reward": 0.517608497350011, + "rewards/format_reward": 1.5, + "step": 1828 + }, + { + "clip_ratio": 0.0024647266836836934, + "epoch": 0.06824308567697403, + "grad_norm": 0.08005453646183014, + "kl": 0.03033447265625, + "learning_rate": 7.590857187218691e-06, + "loss": 0.0156, + "step": 1829 + }, + { + "clip_ratio": 0.0024562819744460285, + "epoch": 0.06828039736952568, + "grad_norm": 0.07360588014125824, + "kl": 0.0307464599609375, + "learning_rate": 7.588275615910309e-06, + "loss": 0.0153, + "step": 1830 + }, + { + "clip_ratio": 0.004202295618597418, + "completion_length": 570.2143249511719, + "epoch": 0.06831770906207733, + "grad_norm": 0.09632505476474762, + "kl": 0.027618408203125, + "learning_rate": 7.58569316840027e-06, + "loss": 0.008, + "num_tokens": 43389298.0, + "reward": 0.7966081202030182, + "reward_std": 0.2928166016936302, + "rewards/code_reward": 0.6466080620884895, + "rewards/format_reward": 1.5, + "step": 1831 + }, + { + "clip_ratio": 0.003638710593804717, + "epoch": 0.06835502075462899, + "grad_norm": 0.10067827999591827, + "kl": 0.02752685546875, + "learning_rate": 7.5831098457721174e-06, + "loss": 0.0078, + "step": 1832 + }, + { + "clip_ratio": 0.003703240887261927, + "epoch": 0.06839233244718064, + "grad_norm": 0.08831631392240524, + "kl": 0.0272216796875, + "learning_rate": 7.580525649109771e-06, + "loss": 0.0072, + "step": 1833 + }, + { + "clip_ratio": 0.0036773186293430626, + "completion_length": 737.982177734375, + "epoch": 0.06842964413973229, + "grad_norm": 0.07628070563077927, + "kl": 0.02093505859375, + "learning_rate": 7.577940579497508e-06, + "loss": -0.0028, + "num_tokens": 43468775.0, + "reward": 0.807119645178318, + "reward_std": 0.20955900102853775, + "rewards/code_reward": 0.6571196168661118, + "rewards/format_reward": 1.5, + "step": 1834 + }, + { + "clip_ratio": 0.0034286058507859707, + "epoch": 0.06846695583228395, + "grad_norm": 0.06974999606609344, + "kl": 0.021331787109375, + "learning_rate": 7.5753546380199785e-06, + "loss": -0.0031, + "step": 1835 + }, + { + "clip_ratio": 0.0033682057983241975, + "epoch": 0.0685042675248356, + "grad_norm": 0.0663921907544136, + "kl": 0.021087646484375, + "learning_rate": 7.572767825762193e-06, + "loss": -0.0032, + "step": 1836 + }, + { + "clip_ratio": 0.004749990126583725, + "completion_length": 738.4286041259766, + "epoch": 0.06854157921738725, + "grad_norm": 0.07711069285869598, + "kl": 0.0259552001953125, + "learning_rate": 7.57018014380953e-06, + "loss": -0.0019, + "num_tokens": 43537637.0, + "reward": 0.2639685645699501, + "reward_std": 0.19567714864388108, + "rewards/code_reward": 0.11396856373175979, + "rewards/format_reward": 1.5, + "step": 1837 + }, + { + "clip_ratio": 0.004612574470229447, + "epoch": 0.0685788909099389, + "grad_norm": 0.07172377407550812, + "kl": 0.0272064208984375, + "learning_rate": 7.567591593247734e-06, + "loss": -0.0021, + "step": 1838 + }, + { + "clip_ratio": 0.004411455593071878, + "epoch": 0.06861620260249056, + "grad_norm": 0.06907638907432556, + "kl": 0.0261993408203125, + "learning_rate": 7.565002175162911e-06, + "loss": -0.0024, + "step": 1839 + }, + { + "clip_ratio": 0.0033153981785289943, + "completion_length": 717.8571929931641, + "epoch": 0.06865351429504221, + "grad_norm": 0.08143416047096252, + "kl": 0.022979736328125, + "learning_rate": 7.5624118906415355e-06, + "loss": 0.0054, + "num_tokens": 43607993.0, + "reward": 0.46221479400992393, + "reward_std": 0.1989527978003025, + "rewards/code_reward": 0.31221477687358856, + "rewards/format_reward": 1.5, + "step": 1840 + }, + { + "clip_ratio": 0.0032091522589325905, + "epoch": 0.06869082598759386, + "grad_norm": 0.07611919194459915, + "kl": 0.0225830078125, + "learning_rate": 7.55982074077044e-06, + "loss": 0.005, + "step": 1841 + }, + { + "clip_ratio": 0.003109895915258676, + "epoch": 0.06872813768014552, + "grad_norm": 0.07597800344228745, + "kl": 0.0231170654296875, + "learning_rate": 7.557228726636826e-06, + "loss": 0.0046, + "step": 1842 + }, + { + "clip_ratio": 0.0030799086089245975, + "completion_length": 589.1964492797852, + "epoch": 0.06876544937269717, + "grad_norm": 0.07159169018268585, + "kl": 0.022674560546875, + "learning_rate": 7.55463584932825e-06, + "loss": 0.0017, + "num_tokens": 43667154.0, + "reward": 0.852714266628027, + "reward_std": 0.04741728724911809, + "rewards/code_reward": 0.7027142494916916, + "rewards/format_reward": 1.5, + "step": 1843 + }, + { + "clip_ratio": 0.002520043170079589, + "epoch": 0.06880276106524882, + "grad_norm": 0.06713968515396118, + "kl": 0.02374267578125, + "learning_rate": 7.55204210993264e-06, + "loss": 0.0017, + "step": 1844 + }, + { + "clip_ratio": 0.0028951787389814854, + "epoch": 0.06884007275780048, + "grad_norm": 0.06334257870912552, + "kl": 0.0233154296875, + "learning_rate": 7.549447509538278e-06, + "loss": 0.0013, + "step": 1845 + }, + { + "clip_ratio": 0.003728954354301095, + "completion_length": 796.5714721679688, + "epoch": 0.06887738445035213, + "grad_norm": 0.07599860429763794, + "kl": 0.0224609375, + "learning_rate": 7.54685204923381e-06, + "loss": -0.0076, + "num_tokens": 43744914.0, + "reward": 0.582805946469307, + "reward_std": 0.21490824222564697, + "rewards/code_reward": 0.43280594423413277, + "rewards/format_reward": 1.5, + "step": 1846 + }, + { + "clip_ratio": 0.0030508239869959652, + "epoch": 0.06891469614290378, + "grad_norm": 0.07307583093643188, + "kl": 0.02227783203125, + "learning_rate": 7.544255730108243e-06, + "loss": -0.008, + "step": 1847 + }, + { + "clip_ratio": 0.003363161231391132, + "epoch": 0.06895200783545544, + "grad_norm": 0.07184191793203354, + "kl": 0.021728515625, + "learning_rate": 7.541658553250945e-06, + "loss": -0.0082, + "step": 1848 + }, + { + "clip_ratio": 0.004481347277760506, + "completion_length": 672.1071624755859, + "epoch": 0.06898931952800709, + "grad_norm": 0.1519007682800293, + "kl": 0.02911376953125, + "learning_rate": 7.539060519751644e-06, + "loss": 0.0102, + "num_tokens": 43813898.0, + "reward": 0.7803633362054825, + "reward_std": 0.3881106525659561, + "rewards/code_reward": 0.6339347586035728, + "rewards/format_reward": 1.4642857015132904, + "step": 1849 + }, + { + "clip_ratio": 0.004260823130607605, + "epoch": 0.06902663122055874, + "grad_norm": 0.09622755646705627, + "kl": 0.029327392578125, + "learning_rate": 7.536461630700426e-06, + "loss": 0.0101, + "step": 1850 + }, + { + "clip_ratio": 0.004222780466079712, + "epoch": 0.0690639429131104, + "grad_norm": 0.09028258174657822, + "kl": 0.029296875, + "learning_rate": 7.533861887187736e-06, + "loss": 0.0095, + "step": 1851 + }, + { + "clip_ratio": 0.005265508778393269, + "completion_length": 800.0714721679688, + "epoch": 0.06910125460566205, + "grad_norm": 0.07589341700077057, + "kl": 0.053009033203125, + "learning_rate": 7.5312612903043755e-06, + "loss": -0.0058, + "num_tokens": 43893716.0, + "reward": 0.39929841086268425, + "reward_std": 0.2511008009314537, + "rewards/code_reward": 0.24929838813841343, + "rewards/format_reward": 1.5, + "step": 1852 + }, + { + "clip_ratio": 0.005698285414837301, + "epoch": 0.0691385662982137, + "grad_norm": 0.07726412266492844, + "kl": 0.049896240234375, + "learning_rate": 7.528659841141514e-06, + "loss": -0.006, + "step": 1853 + }, + { + "clip_ratio": 0.005828781402669847, + "epoch": 0.06917587799076536, + "grad_norm": 0.07297910749912262, + "kl": 0.05072021484375, + "learning_rate": 7.526057540790663e-06, + "loss": -0.0062, + "step": 1854 + }, + { + "clip_ratio": 0.004099147743545473, + "completion_length": 687.857177734375, + "epoch": 0.06921318968331701, + "grad_norm": 0.09313453733921051, + "kl": 0.0272216796875, + "learning_rate": 7.5234543903437065e-06, + "loss": 0.004, + "num_tokens": 43960194.0, + "reward": 0.4105120711028576, + "reward_std": 0.12221845996100456, + "rewards/code_reward": 0.26051205629482865, + "rewards/format_reward": 1.5, + "step": 1855 + }, + { + "clip_ratio": 0.0039057006943039596, + "epoch": 0.06925050137586866, + "grad_norm": 0.09359204024076462, + "kl": 0.02703857421875, + "learning_rate": 7.52085039089287e-06, + "loss": 0.0039, + "step": 1856 + }, + { + "clip_ratio": 0.0042210425599478185, + "epoch": 0.06928781306842031, + "grad_norm": 0.08624608814716339, + "kl": 0.0267333984375, + "learning_rate": 7.51824554353075e-06, + "loss": 0.0034, + "step": 1857 + }, + { + "clip_ratio": 0.004781689727678895, + "completion_length": 730.6786117553711, + "epoch": 0.06932512476097197, + "grad_norm": 0.09087567776441574, + "kl": 0.02667236328125, + "learning_rate": 7.515639849350291e-06, + "loss": 0.0091, + "num_tokens": 44033746.0, + "reward": 0.49765926226973534, + "reward_std": 0.3206370249390602, + "rewards/code_reward": 0.34765926003456116, + "rewards/format_reward": 1.5, + "step": 1858 + }, + { + "clip_ratio": 0.004386233864352107, + "epoch": 0.06936243645352362, + "grad_norm": 0.08795223385095596, + "kl": 0.02642822265625, + "learning_rate": 7.513033309444793e-06, + "loss": 0.0086, + "step": 1859 + }, + { + "clip_ratio": 0.004159023694228381, + "epoch": 0.06939974814607527, + "grad_norm": 0.08252807706594467, + "kl": 0.026824951171875, + "learning_rate": 7.5104259249079115e-06, + "loss": 0.0083, + "step": 1860 + }, + { + "clip_ratio": 0.002305361907929182, + "completion_length": 583.3750381469727, + "epoch": 0.06943705983862693, + "grad_norm": 0.04575178772211075, + "kl": 0.024871826171875, + "learning_rate": 7.507817696833655e-06, + "loss": -0.0003, + "num_tokens": 44092483.0, + "reward": 0.6530788168311119, + "reward_std": 0.00642195250838995, + "rewards/code_reward": 0.5030788178555667, + "rewards/format_reward": 1.5, + "step": 1861 + }, + { + "clip_ratio": 0.002415999071672559, + "epoch": 0.06947437153117858, + "grad_norm": 0.044815633445978165, + "kl": 0.0255126953125, + "learning_rate": 7.50520862631639e-06, + "loss": -0.0003, + "step": 1862 + }, + { + "clip_ratio": 0.002331957104615867, + "epoch": 0.06951168322373023, + "grad_norm": 0.04244854673743248, + "kl": 0.025146484375, + "learning_rate": 7.502598714450835e-06, + "loss": -0.0004, + "step": 1863 + }, + { + "clip_ratio": 0.0031453146366402507, + "completion_length": 764.5714569091797, + "epoch": 0.06954899491628189, + "grad_norm": 0.0545707568526268, + "kl": 0.0181121826171875, + "learning_rate": 7.499987962332059e-06, + "loss": 0.0046, + "num_tokens": 44169263.0, + "reward": 0.47036831080913544, + "reward_std": 0.08937107026576996, + "rewards/code_reward": 0.3203683039173484, + "rewards/format_reward": 1.5, + "step": 1864 + }, + { + "clip_ratio": 0.00279629766009748, + "epoch": 0.06958630660883354, + "grad_norm": 0.050305694341659546, + "kl": 0.0179901123046875, + "learning_rate": 7.497376371055483e-06, + "loss": 0.0044, + "step": 1865 + }, + { + "clip_ratio": 0.0029539394308812916, + "epoch": 0.06962361830138519, + "grad_norm": 0.05057225376367569, + "kl": 0.0182647705078125, + "learning_rate": 7.494763941716884e-06, + "loss": 0.0043, + "step": 1866 + }, + { + "clip_ratio": 0.0032636993564665318, + "completion_length": 664.0000305175781, + "epoch": 0.06966092999393685, + "grad_norm": 0.07427576929330826, + "kl": 0.0249481201171875, + "learning_rate": 7.49215067541239e-06, + "loss": -0.0063, + "num_tokens": 44233411.0, + "reward": 0.7411277666687965, + "reward_std": 0.16672952100634575, + "rewards/code_reward": 0.5911277625709772, + "rewards/format_reward": 1.5, + "step": 1867 + }, + { + "clip_ratio": 0.0031044385978020728, + "epoch": 0.0696982416864885, + "grad_norm": 0.07437895238399506, + "kl": 0.02471923828125, + "learning_rate": 7.489536573238478e-06, + "loss": -0.0067, + "step": 1868 + }, + { + "clip_ratio": 0.002611784962937236, + "epoch": 0.06973555337904015, + "grad_norm": 0.06783199310302734, + "kl": 0.0243682861328125, + "learning_rate": 7.486921636291976e-06, + "loss": -0.0069, + "step": 1869 + }, + { + "clip_ratio": 0.004429178545251489, + "completion_length": 576.928596496582, + "epoch": 0.0697728650715918, + "grad_norm": 0.09290256351232529, + "kl": 0.021240234375, + "learning_rate": 7.484305865670063e-06, + "loss": -0.0044, + "num_tokens": 44287489.0, + "reward": 0.6279696375131607, + "reward_std": 0.22806342225521803, + "rewards/code_reward": 0.4779696445912123, + "rewards/format_reward": 1.5, + "step": 1870 + }, + { + "clip_ratio": 0.004612509277649224, + "epoch": 0.06981017676414346, + "grad_norm": 0.16358844935894012, + "kl": 0.020904541015625, + "learning_rate": 7.481689262470269e-06, + "loss": -0.0047, + "step": 1871 + }, + { + "clip_ratio": 0.004468354454729706, + "epoch": 0.06984748845669511, + "grad_norm": 0.09223896265029907, + "kl": 0.021148681640625, + "learning_rate": 7.479071827790473e-06, + "loss": -0.005, + "step": 1872 + }, + { + "clip_ratio": 0.0030045928433537483, + "completion_length": 674.2857513427734, + "epoch": 0.06988480014924676, + "grad_norm": 0.07943755388259888, + "kl": 0.023712158203125, + "learning_rate": 7.476453562728899e-06, + "loss": 0.0003, + "num_tokens": 44357491.0, + "reward": 0.7327969297766685, + "reward_std": 0.2058674432337284, + "rewards/code_reward": 0.5827969014644623, + "rewards/format_reward": 1.5, + "step": 1873 + }, + { + "clip_ratio": 0.0026546629960648715, + "epoch": 0.06992211184179842, + "grad_norm": 0.07585611939430237, + "kl": 0.02471923828125, + "learning_rate": 7.473834468384125e-06, + "loss": 0.0001, + "step": 1874 + }, + { + "clip_ratio": 0.0027724133105948567, + "epoch": 0.06995942353435007, + "grad_norm": 0.08138382434844971, + "kl": 0.024658203125, + "learning_rate": 7.471214545855071e-06, + "loss": -0.0003, + "step": 1875 + }, + { + "clip_ratio": 0.0037038002628833055, + "completion_length": 606.4464645385742, + "epoch": 0.06999673522690172, + "grad_norm": 0.09821690618991852, + "kl": 0.028289794921875, + "learning_rate": 7.468593796241011e-06, + "loss": -0.0121, + "num_tokens": 44421132.0, + "reward": 0.9601813554763794, + "reward_std": 0.16872391401557252, + "rewards/code_reward": 0.8101812824606895, + "rewards/format_reward": 1.5, + "step": 1876 + }, + { + "clip_ratio": 0.004096750286407769, + "epoch": 0.07003404691945338, + "grad_norm": 0.08625872433185577, + "kl": 0.0281982421875, + "learning_rate": 7.465972220641563e-06, + "loss": -0.0121, + "step": 1877 + }, + { + "clip_ratio": 0.003745619615074247, + "epoch": 0.07007135861200503, + "grad_norm": 0.0807986930012703, + "kl": 0.02642822265625, + "learning_rate": 7.463349820156689e-06, + "loss": -0.0129, + "step": 1878 + }, + { + "clip_ratio": 0.003358893096446991, + "completion_length": 688.5714721679688, + "epoch": 0.0701086703045567, + "grad_norm": 0.09716184437274933, + "kl": 0.020355224609375, + "learning_rate": 7.4607265958867e-06, + "loss": 0.0034, + "num_tokens": 44490408.0, + "reward": 0.5308934636414051, + "reward_std": 0.2571963733062148, + "rewards/code_reward": 0.3808934548869729, + "rewards/format_reward": 1.5, + "step": 1879 + }, + { + "clip_ratio": 0.0034833898534998298, + "epoch": 0.07014598199710835, + "grad_norm": 0.08975052833557129, + "kl": 0.02044677734375, + "learning_rate": 7.458102548932255e-06, + "loss": 0.0029, + "step": 1880 + }, + { + "clip_ratio": 0.003453884623013437, + "epoch": 0.07018329368966, + "grad_norm": 0.08342441916465759, + "kl": 0.02093505859375, + "learning_rate": 7.455477680394353e-06, + "loss": 0.0025, + "step": 1881 + }, + { + "clip_ratio": 0.0067311933962628245, + "completion_length": 622.3036079406738, + "epoch": 0.07022060538221166, + "grad_norm": 0.069577656686306, + "kl": 0.03643798828125, + "learning_rate": 7.4528519913743406e-06, + "loss": 0.0043, + "num_tokens": 44558101.0, + "reward": 0.3254777379333973, + "reward_std": 0.07091080397367477, + "rewards/code_reward": 0.1754777291789651, + "rewards/format_reward": 1.5, + "step": 1882 + }, + { + "clip_ratio": 0.007040532771497965, + "epoch": 0.07025791707476331, + "grad_norm": 0.06275437772274017, + "kl": 0.034576416015625, + "learning_rate": 7.450225482973908e-06, + "loss": 0.0041, + "step": 1883 + }, + { + "clip_ratio": 0.006516874767839909, + "epoch": 0.07029522876731496, + "grad_norm": 0.058843959122896194, + "kl": 0.03558349609375, + "learning_rate": 7.447598156295087e-06, + "loss": 0.0037, + "step": 1884 + }, + { + "clip_ratio": 0.003992752579506487, + "completion_length": 616.8571701049805, + "epoch": 0.07033254045986662, + "grad_norm": 0.18114344775676727, + "kl": 0.0171966552734375, + "learning_rate": 7.444970012440259e-06, + "loss": 0.0094, + "num_tokens": 44618475.0, + "reward": 0.47300422564148903, + "reward_std": 0.25263811647892, + "rewards/code_reward": 0.32300421223044395, + "rewards/format_reward": 1.5, + "step": 1885 + }, + { + "clip_ratio": 0.003446749411523342, + "epoch": 0.07036985215241827, + "grad_norm": 0.07507801800966263, + "kl": 0.01702880859375, + "learning_rate": 7.442341052512144e-06, + "loss": 0.0094, + "step": 1886 + }, + { + "clip_ratio": 0.00397374271415174, + "epoch": 0.07040716384496992, + "grad_norm": 0.06827020645141602, + "kl": 0.016845703125, + "learning_rate": 7.439711277613802e-06, + "loss": 0.0091, + "step": 1887 + }, + { + "clip_ratio": 0.0033326935372315347, + "completion_length": 641.5000381469727, + "epoch": 0.07044447553752158, + "grad_norm": 0.060275398194789886, + "kl": 0.0289306640625, + "learning_rate": 7.437080688848638e-06, + "loss": -0.0013, + "num_tokens": 44695965.0, + "reward": 0.7520408146083355, + "reward_std": 0.12015001475811005, + "rewards/code_reward": 0.6020408123731613, + "rewards/format_reward": 1.5, + "step": 1888 + }, + { + "clip_ratio": 0.0031029191450215876, + "epoch": 0.07048178723007323, + "grad_norm": 0.06965687870979309, + "kl": 0.028717041015625, + "learning_rate": 7.434449287320401e-06, + "loss": -0.0012, + "step": 1889 + }, + { + "clip_ratio": 0.003176158876158297, + "epoch": 0.07051909892262488, + "grad_norm": 0.0547051802277565, + "kl": 0.0299072265625, + "learning_rate": 7.431817074133178e-06, + "loss": -0.0014, + "step": 1890 + }, + { + "clip_ratio": 0.00459154014242813, + "completion_length": 643.5000152587891, + "epoch": 0.07055641061517653, + "grad_norm": 0.06026053428649902, + "kl": 0.018951416015625, + "learning_rate": 7.429184050391394e-06, + "loss": -0.0056, + "num_tokens": 44771607.0, + "reward": 0.43472225219011307, + "reward_std": 0.21489089727401733, + "rewards/code_reward": 0.2847222238779068, + "rewards/format_reward": 1.5, + "step": 1891 + }, + { + "clip_ratio": 0.004510606755502522, + "epoch": 0.07059372230772819, + "grad_norm": 0.05976993590593338, + "kl": 0.0189666748046875, + "learning_rate": 7.426550217199816e-06, + "loss": -0.0059, + "step": 1892 + }, + { + "clip_ratio": 0.004198897746391594, + "epoch": 0.07063103400027984, + "grad_norm": 0.05413532257080078, + "kl": 0.019073486328125, + "learning_rate": 7.423915575663555e-06, + "loss": -0.006, + "step": 1893 + }, + { + "clip_ratio": 0.0032135185319930315, + "completion_length": 702.2143325805664, + "epoch": 0.0706683456928315, + "grad_norm": 0.06928512454032898, + "kl": 0.020416259765625, + "learning_rate": 7.421280126888058e-06, + "loss": 0.0045, + "num_tokens": 44850565.0, + "reward": 0.6887281760573387, + "reward_std": 0.2944457083940506, + "rewards/code_reward": 0.5387281402945518, + "rewards/format_reward": 1.5, + "step": 1894 + }, + { + "clip_ratio": 0.0025157411582767963, + "epoch": 0.07070565738538315, + "grad_norm": 0.06748481839895248, + "kl": 0.02008056640625, + "learning_rate": 7.418643871979107e-06, + "loss": 0.004, + "step": 1895 + }, + { + "clip_ratio": 0.002631173876579851, + "epoch": 0.0707429690779348, + "grad_norm": 0.06652607768774033, + "kl": 0.0203857421875, + "learning_rate": 7.416006812042827e-06, + "loss": 0.0038, + "step": 1896 + }, + { + "clip_ratio": 0.004479480441659689, + "completion_length": 624.6250228881836, + "epoch": 0.07078028077048645, + "grad_norm": 0.0646536573767662, + "kl": 0.0242919921875, + "learning_rate": 7.413368948185681e-06, + "loss": 0.0216, + "num_tokens": 44918192.0, + "reward": 0.4468177855014801, + "reward_std": 0.15855877101421356, + "rewards/code_reward": 0.29681776463985443, + "rewards/format_reward": 1.5, + "step": 1897 + }, + { + "clip_ratio": 0.0042151038069278, + "epoch": 0.0708175924630381, + "grad_norm": 0.06090511754155159, + "kl": 0.02392578125, + "learning_rate": 7.410730281514464e-06, + "loss": 0.0217, + "step": 1898 + }, + { + "clip_ratio": 0.004643023828975856, + "epoch": 0.07085490415558976, + "grad_norm": 0.05902664363384247, + "kl": 0.0233154296875, + "learning_rate": 7.408090813136317e-06, + "loss": 0.0214, + "step": 1899 + }, + { + "clip_ratio": 0.0037862976314499974, + "completion_length": 538.0000381469727, + "epoch": 0.07089221584814141, + "grad_norm": 0.09191620349884033, + "kl": 0.02581787109375, + "learning_rate": 7.4054505441587075e-06, + "loss": -0.007, + "num_tokens": 44983964.0, + "reward": 0.6876043453812599, + "reward_std": 0.37633731216192245, + "rewards/code_reward": 0.5376043431460857, + "rewards/format_reward": 1.5, + "step": 1900 + }, + { + "clip_ratio": 0.0039055831148289144, + "epoch": 0.07092952754069307, + "grad_norm": 0.08801250904798508, + "kl": 0.025421142578125, + "learning_rate": 7.402809475689443e-06, + "loss": -0.0072, + "step": 1901 + }, + { + "clip_ratio": 0.0036793910549022257, + "epoch": 0.07096683923324472, + "grad_norm": 0.09844241291284561, + "kl": 0.02447509765625, + "learning_rate": 7.400167608836668e-06, + "loss": -0.0075, + "step": 1902 + }, + { + "clip_ratio": 0.004264927818439901, + "completion_length": 694.6428833007812, + "epoch": 0.07100415092579637, + "grad_norm": 0.06640040874481201, + "kl": 0.019744873046875, + "learning_rate": 7.397524944708864e-06, + "loss": 0.0157, + "num_tokens": 45053482.0, + "reward": 0.40000002831220627, + "reward_std": 0.1806170754134655, + "rewards/code_reward": 0.2500000074505806, + "rewards/format_reward": 1.5, + "step": 1903 + }, + { + "clip_ratio": 0.004224850214086473, + "epoch": 0.07104146261834803, + "grad_norm": 0.06486807763576508, + "kl": 0.019927978515625, + "learning_rate": 7.39488148441484e-06, + "loss": 0.0156, + "step": 1904 + }, + { + "clip_ratio": 0.004011384095065296, + "epoch": 0.07107877431089968, + "grad_norm": 0.06206608936190605, + "kl": 0.020233154296875, + "learning_rate": 7.392237229063741e-06, + "loss": 0.0153, + "step": 1905 + }, + { + "clip_ratio": 0.0035124887945130467, + "completion_length": 585.982177734375, + "epoch": 0.07111608600345133, + "grad_norm": 0.10738975554704666, + "kl": 0.02606201171875, + "learning_rate": 7.389592179765051e-06, + "loss": -0.0118, + "num_tokens": 45116537.0, + "reward": 0.8047614172101021, + "reward_std": 0.1750434326240793, + "rewards/code_reward": 0.6547614187002182, + "rewards/format_reward": 1.5, + "step": 1906 + }, + { + "clip_ratio": 0.0032676392002031207, + "epoch": 0.07115339769600298, + "grad_norm": 0.0868254080414772, + "kl": 0.026885986328125, + "learning_rate": 7.386946337628584e-06, + "loss": -0.0118, + "step": 1907 + }, + { + "clip_ratio": 0.0026609248016029596, + "epoch": 0.07119070938855464, + "grad_norm": 0.07992259413003922, + "kl": 0.0289306640625, + "learning_rate": 7.3842997037644846e-06, + "loss": -0.0127, + "step": 1908 + }, + { + "clip_ratio": 0.0059480773052200675, + "completion_length": 784.2500457763672, + "epoch": 0.07122802108110629, + "grad_norm": 0.17482036352157593, + "kl": 0.136993408203125, + "learning_rate": 7.381652279283231e-06, + "loss": 0.0039, + "num_tokens": 45203491.0, + "reward": 0.366279486566782, + "reward_std": 0.12354657240211964, + "rewards/code_reward": 0.21627947874367237, + "rewards/format_reward": 1.5, + "step": 1909 + }, + { + "clip_ratio": 0.0063567847246304154, + "epoch": 0.07126533277365794, + "grad_norm": 0.08786481618881226, + "kl": 0.0711669921875, + "learning_rate": 7.379004065295634e-06, + "loss": 0.003, + "step": 1910 + }, + { + "clip_ratio": 0.00649605190847069, + "epoch": 0.0713026444662096, + "grad_norm": 0.06819401681423187, + "kl": 0.04779052734375, + "learning_rate": 7.376355062912836e-06, + "loss": 0.0024, + "step": 1911 + }, + { + "clip_ratio": 0.004041236476041377, + "completion_length": 646.7678833007812, + "epoch": 0.07133995615876125, + "grad_norm": 0.10471457988023758, + "kl": 0.023284912109375, + "learning_rate": 7.3737052732463055e-06, + "loss": 0.0079, + "num_tokens": 45273128.0, + "reward": 0.7617518901824951, + "reward_std": 0.3277807831764221, + "rewards/code_reward": 0.6117518953979015, + "rewards/format_reward": 1.5, + "step": 1912 + }, + { + "clip_ratio": 0.003891141968779266, + "epoch": 0.0713772678513129, + "grad_norm": 0.1061345711350441, + "kl": 0.023193359375, + "learning_rate": 7.37105469740785e-06, + "loss": 0.0079, + "step": 1913 + }, + { + "clip_ratio": 0.0033661643392406404, + "epoch": 0.07141457954386456, + "grad_norm": 0.09159156680107117, + "kl": 0.022979736328125, + "learning_rate": 7.368403336509598e-06, + "loss": 0.0073, + "step": 1914 + }, + { + "clip_ratio": 0.004957814002409577, + "completion_length": 686.7321624755859, + "epoch": 0.07145189123641621, + "grad_norm": 0.06945565342903137, + "kl": 0.021697998046875, + "learning_rate": 7.365751191664012e-06, + "loss": 0.0037, + "num_tokens": 45344499.0, + "reward": 0.24327420815825462, + "reward_std": 0.18403409887105227, + "rewards/code_reward": 0.09327418077737093, + "rewards/format_reward": 1.5, + "step": 1915 + }, + { + "clip_ratio": 0.004851198231335729, + "epoch": 0.07148920292896786, + "grad_norm": 0.0766894668340683, + "kl": 0.021820068359375, + "learning_rate": 7.3630982639838855e-06, + "loss": 0.0034, + "step": 1916 + }, + { + "clip_ratio": 0.004436352581251413, + "epoch": 0.07152651462151952, + "grad_norm": 0.0571858286857605, + "kl": 0.022857666015625, + "learning_rate": 7.360444554582333e-06, + "loss": 0.0033, + "step": 1917 + }, + { + "clip_ratio": 0.004854643833823502, + "completion_length": 720.8750381469727, + "epoch": 0.07156382631407117, + "grad_norm": 0.0868053138256073, + "kl": 0.0242919921875, + "learning_rate": 7.3577900645728065e-06, + "loss": 0.0132, + "num_tokens": 45410460.0, + "reward": 0.7603398561477661, + "reward_std": 0.4080974869430065, + "rewards/code_reward": 0.6103398576378822, + "rewards/format_reward": 1.5, + "step": 1918 + }, + { + "clip_ratio": 0.004272604419384152, + "epoch": 0.07160113800662282, + "grad_norm": 0.08127342164516449, + "kl": 0.024383544921875, + "learning_rate": 7.3551347950690775e-06, + "loss": 0.0133, + "step": 1919 + }, + { + "clip_ratio": 0.003792135219555348, + "epoch": 0.07163844969917448, + "grad_norm": 0.0777520164847374, + "kl": 0.0250244140625, + "learning_rate": 7.352478747185249e-06, + "loss": 0.0126, + "step": 1920 + }, + { + "clip_ratio": 0.0028402076568454504, + "completion_length": 533.2857284545898, + "epoch": 0.07167576139172613, + "grad_norm": 0.055782634764909744, + "kl": 0.0211334228515625, + "learning_rate": 7.349821922035747e-06, + "loss": -0.0098, + "num_tokens": 45471852.0, + "reward": 0.763154175132513, + "reward_std": 0.007912970148026943, + "rewards/code_reward": 0.6131541728973389, + "rewards/format_reward": 1.5, + "step": 1921 + }, + { + "clip_ratio": 0.002176681417040527, + "epoch": 0.07171307308427778, + "grad_norm": 0.05776240676641464, + "kl": 0.0212860107421875, + "learning_rate": 7.34716432073533e-06, + "loss": -0.0098, + "step": 1922 + }, + { + "clip_ratio": 0.0024686630349606276, + "epoch": 0.07175038477682943, + "grad_norm": 0.05411768704652786, + "kl": 0.0212554931640625, + "learning_rate": 7.344505944399076e-06, + "loss": -0.01, + "step": 1923 + }, + { + "clip_ratio": 0.00407543801702559, + "completion_length": 743.8928985595703, + "epoch": 0.07178769646938109, + "grad_norm": 0.09358955919742584, + "kl": 0.03094482421875, + "learning_rate": 7.341846794142391e-06, + "loss": -0.0019, + "num_tokens": 45553760.0, + "reward": 0.6477288603782654, + "reward_std": 0.3202534168958664, + "rewards/code_reward": 0.4977288395166397, + "rewards/format_reward": 1.5, + "step": 1924 + }, + { + "clip_ratio": 0.0041111715399893, + "epoch": 0.07182500816193274, + "grad_norm": 0.09203270077705383, + "kl": 0.032135009765625, + "learning_rate": 7.339186871081005e-06, + "loss": -0.0021, + "step": 1925 + }, + { + "clip_ratio": 0.003652742743724957, + "epoch": 0.0718623198544844, + "grad_norm": 0.08515685051679611, + "kl": 0.03204345703125, + "learning_rate": 7.336526176330969e-06, + "loss": -0.0027, + "step": 1926 + }, + { + "clip_ratio": 0.0033831405453383923, + "completion_length": 625.9107437133789, + "epoch": 0.07189963154703605, + "grad_norm": 0.09108500927686691, + "kl": 0.031402587890625, + "learning_rate": 7.333864711008666e-06, + "loss": 0.0127, + "num_tokens": 45618449.0, + "reward": 0.6387700662016869, + "reward_std": 0.11117779789492488, + "rewards/code_reward": 0.488770077819936, + "rewards/format_reward": 1.5, + "step": 1927 + }, + { + "clip_ratio": 0.002784859447274357, + "epoch": 0.0719369432395877, + "grad_norm": 0.09382595866918564, + "kl": 0.03057861328125, + "learning_rate": 7.331202476230796e-06, + "loss": 0.0124, + "step": 1928 + }, + { + "clip_ratio": 0.0030475918320007622, + "epoch": 0.07197425493213935, + "grad_norm": 0.35318809747695923, + "kl": 0.0273284912109375, + "learning_rate": 7.32853947311438e-06, + "loss": 0.0124, + "step": 1929 + }, + { + "clip_ratio": 0.0038322582840919495, + "completion_length": 613.5357513427734, + "epoch": 0.072011566624691, + "grad_norm": 0.07326297461986542, + "kl": 0.0195159912109375, + "learning_rate": 7.3258757027767705e-06, + "loss": 0.0057, + "num_tokens": 45687205.0, + "reward": 0.6512682028114796, + "reward_std": 0.23027318064123392, + "rewards/code_reward": 0.5012681782245636, + "rewards/format_reward": 1.5, + "step": 1930 + }, + { + "clip_ratio": 0.003360241826158017, + "epoch": 0.07204887831724266, + "grad_norm": 0.07234030961990356, + "kl": 0.0199432373046875, + "learning_rate": 7.3232111663356305e-06, + "loss": 0.0054, + "step": 1931 + }, + { + "clip_ratio": 0.0035376036539673805, + "epoch": 0.07208619000979431, + "grad_norm": 0.0694832131266594, + "kl": 0.0196533203125, + "learning_rate": 7.320545864908952e-06, + "loss": 0.0055, + "step": 1932 + }, + { + "clip_ratio": 0.00356169615406543, + "completion_length": 646.1428833007812, + "epoch": 0.07212350170234598, + "grad_norm": 0.07966778427362442, + "kl": 0.024444580078125, + "learning_rate": 7.3178797996150475e-06, + "loss": -0.0082, + "num_tokens": 45757619.0, + "reward": 0.4755677878856659, + "reward_std": 0.16022305097430944, + "rewards/code_reward": 0.32556779053993523, + "rewards/format_reward": 1.5, + "step": 1933 + }, + { + "clip_ratio": 0.0028912897978443652, + "epoch": 0.07216081339489763, + "grad_norm": 0.0751623883843422, + "kl": 0.024444580078125, + "learning_rate": 7.315212971572548e-06, + "loss": -0.0087, + "step": 1934 + }, + { + "clip_ratio": 0.003239455501898192, + "epoch": 0.07219812508744929, + "grad_norm": 0.0735340490937233, + "kl": 0.024627685546875, + "learning_rate": 7.312545381900402e-06, + "loss": -0.0089, + "step": 1935 + }, + { + "clip_ratio": 0.0031612776801921427, + "completion_length": 531.8393020629883, + "epoch": 0.07223543678000094, + "grad_norm": 0.10105051100254059, + "kl": 0.0244140625, + "learning_rate": 7.309877031717884e-06, + "loss": 0.0039, + "num_tokens": 45812154.0, + "reward": 0.8371148556470871, + "reward_std": 0.31053633987903595, + "rewards/code_reward": 0.687114842236042, + "rewards/format_reward": 1.5, + "step": 1936 + }, + { + "clip_ratio": 0.0029136426164768636, + "epoch": 0.07227274847255259, + "grad_norm": 0.07893256098031998, + "kl": 0.0247650146484375, + "learning_rate": 7.307207922144583e-06, + "loss": 0.0032, + "step": 1937 + }, + { + "clip_ratio": 0.002440693322569132, + "epoch": 0.07231006016510425, + "grad_norm": 0.1223767027258873, + "kl": 0.0255126953125, + "learning_rate": 7.304538054300408e-06, + "loss": 0.0031, + "step": 1938 + }, + { + "clip_ratio": 0.0043902547331526875, + "completion_length": 665.9821853637695, + "epoch": 0.0723473718576559, + "grad_norm": 0.09255357086658478, + "kl": 0.02996826171875, + "learning_rate": 7.301867429305587e-06, + "loss": 0.0049, + "num_tokens": 45892947.0, + "reward": 0.6846840605139732, + "reward_std": 0.2590767592191696, + "rewards/code_reward": 0.5346840731799603, + "rewards/format_reward": 1.5, + "step": 1939 + }, + { + "clip_ratio": 0.0038514239713549614, + "epoch": 0.07238468355020755, + "grad_norm": 0.09167145192623138, + "kl": 0.030303955078125, + "learning_rate": 7.299196048280661e-06, + "loss": 0.0045, + "step": 1940 + }, + { + "clip_ratio": 0.0036270832642912865, + "epoch": 0.0724219952427592, + "grad_norm": 0.11102720350027084, + "kl": 0.0308837890625, + "learning_rate": 7.296523912346495e-06, + "loss": 0.004, + "step": 1941 + }, + { + "clip_ratio": 0.0056831237161532044, + "completion_length": 568.0357437133789, + "epoch": 0.07245930693531086, + "grad_norm": 0.0482507050037384, + "kl": 0.025604248046875, + "learning_rate": 7.293851022624268e-06, + "loss": -0.0002, + "num_tokens": 45955711.0, + "reward": 0.331547636538744, + "reward_std": 0.056866031140089035, + "rewards/code_reward": 0.181547611951828, + "rewards/format_reward": 1.5, + "step": 1942 + }, + { + "clip_ratio": 0.005433658370748162, + "epoch": 0.07249661862786251, + "grad_norm": 0.04552896320819855, + "kl": 0.026092529296875, + "learning_rate": 7.291177380235474e-06, + "loss": -0.0003, + "step": 1943 + }, + { + "clip_ratio": 0.005065001198090613, + "epoch": 0.07253393032041416, + "grad_norm": 0.04000428318977356, + "kl": 0.025634765625, + "learning_rate": 7.288502986301921e-06, + "loss": -0.0004, + "step": 1944 + }, + { + "clip_ratio": 0.004803598450962454, + "completion_length": 731.5714569091797, + "epoch": 0.07257124201296582, + "grad_norm": 0.06452543288469315, + "kl": 0.03131103515625, + "learning_rate": 7.285827841945739e-06, + "loss": 0.0386, + "num_tokens": 46029453.0, + "reward": 0.47140734642744064, + "reward_std": 0.08713942393660545, + "rewards/code_reward": 0.32408589124679565, + "rewards/format_reward": 1.4732142984867096, + "step": 1945 + }, + { + "clip_ratio": 0.004645942244678736, + "epoch": 0.07260855370551747, + "grad_norm": 0.0572662428021431, + "kl": 0.02996826171875, + "learning_rate": 7.2831519482893665e-06, + "loss": 0.0386, + "step": 1946 + }, + { + "clip_ratio": 0.0043988413526676595, + "epoch": 0.07264586539806912, + "grad_norm": 0.05306127294898033, + "kl": 0.03057861328125, + "learning_rate": 7.2804753064555614e-06, + "loss": 0.0382, + "step": 1947 + }, + { + "clip_ratio": 0.0011004700791090727, + "completion_length": 542.2857284545898, + "epoch": 0.07268317709062078, + "grad_norm": 0.002688393695279956, + "kl": 0.0196685791015625, + "learning_rate": 7.277797917567389e-06, + "loss": 0.0002, + "num_tokens": 46088017.0, + "reward": 0.899999987334013, + "reward_std": 0.0, + "rewards/code_reward": 0.75, + "rewards/format_reward": 1.5, + "step": 1948 + }, + { + "clip_ratio": 0.0011723467614501715, + "epoch": 0.07272048878317243, + "grad_norm": 0.0027009977493435144, + "kl": 0.0197296142578125, + "learning_rate": 7.275119782748234e-06, + "loss": 0.0002, + "step": 1949 + }, + { + "clip_ratio": 0.0015447271289303899, + "epoch": 0.07275780047572408, + "grad_norm": 0.0024487869814038277, + "kl": 0.0193023681640625, + "learning_rate": 7.272440903121792e-06, + "loss": 0.0002, + "step": 1950 + }, + { + "clip_ratio": 0.003965856449212879, + "completion_length": 738.6607513427734, + "epoch": 0.07279511216827574, + "grad_norm": 0.07665945589542389, + "kl": 0.04034423828125, + "learning_rate": 7.2697612798120734e-06, + "loss": 0.0077, + "num_tokens": 46166734.0, + "reward": 0.4978238381445408, + "reward_std": 0.1643480248749256, + "rewards/code_reward": 0.34782382287085056, + "rewards/format_reward": 1.5, + "step": 1951 + }, + { + "clip_ratio": 0.0041931961313821375, + "epoch": 0.07283242386082739, + "grad_norm": 0.07775919884443283, + "kl": 0.037017822265625, + "learning_rate": 7.267080913943395e-06, + "loss": 0.0075, + "step": 1952 + }, + { + "clip_ratio": 0.00398824131116271, + "epoch": 0.07286973555337904, + "grad_norm": 0.06921818107366562, + "kl": 0.033905029296875, + "learning_rate": 7.264399806640389e-06, + "loss": 0.0073, + "step": 1953 + }, + { + "clip_ratio": 0.003279275610111654, + "completion_length": 644.4643096923828, + "epoch": 0.0729070472459307, + "grad_norm": 0.09109409153461456, + "kl": 0.023681640625, + "learning_rate": 7.261717959027998e-06, + "loss": 0.0088, + "num_tokens": 46238300.0, + "reward": 0.778721496462822, + "reward_std": 0.27197252213954926, + "rewards/code_reward": 0.6287214830517769, + "rewards/format_reward": 1.5, + "step": 1954 + }, + { + "clip_ratio": 0.0029869991121813655, + "epoch": 0.07294435893848235, + "grad_norm": 0.07965117692947388, + "kl": 0.023956298828125, + "learning_rate": 7.2590353722314774e-06, + "loss": 0.0084, + "step": 1955 + }, + { + "clip_ratio": 0.00293815357144922, + "epoch": 0.072981670631034, + "grad_norm": 0.06712111830711365, + "kl": 0.023345947265625, + "learning_rate": 7.256352047376391e-06, + "loss": 0.0082, + "step": 1956 + }, + { + "clip_ratio": 0.0035128804156556726, + "completion_length": 777.7678833007812, + "epoch": 0.07301898232358565, + "grad_norm": 0.07232493907213211, + "kl": 0.01837158203125, + "learning_rate": 7.253667985588609e-06, + "loss": 0.0036, + "num_tokens": 46309267.0, + "reward": 0.5769958011806011, + "reward_std": 0.2691146284341812, + "rewards/code_reward": 0.42699579149484634, + "rewards/format_reward": 1.5, + "step": 1957 + }, + { + "clip_ratio": 0.0033216215088032186, + "epoch": 0.07305629401613731, + "grad_norm": 0.06544221937656403, + "kl": 0.0184478759765625, + "learning_rate": 7.250983187994316e-06, + "loss": 0.0034, + "step": 1958 + }, + { + "clip_ratio": 0.0033378131920471787, + "epoch": 0.07309360570868896, + "grad_norm": 0.06682408601045609, + "kl": 0.0191802978515625, + "learning_rate": 7.248297655720004e-06, + "loss": 0.0031, + "step": 1959 + }, + { + "clip_ratio": 0.002689948887564242, + "completion_length": 540.1428756713867, + "epoch": 0.07313091740124061, + "grad_norm": 0.07910895347595215, + "kl": 0.02325439453125, + "learning_rate": 7.24561138989247e-06, + "loss": 0.0006, + "num_tokens": 46371309.0, + "reward": 0.8856293708086014, + "reward_std": 0.29787248373031616, + "rewards/code_reward": 0.7356293499469757, + "rewards/format_reward": 1.5, + "step": 1960 + }, + { + "clip_ratio": 0.002046084322500974, + "epoch": 0.07316822909379227, + "grad_norm": 0.07850798964500427, + "kl": 0.02294921875, + "learning_rate": 7.2429243916388236e-06, + "loss": 0.0006, + "step": 1961 + }, + { + "clip_ratio": 0.0022488177055492997, + "epoch": 0.07320554078634392, + "grad_norm": 0.07352264970541, + "kl": 0.023101806640625, + "learning_rate": 7.240236662086477e-06, + "loss": 0.0002, + "step": 1962 + }, + { + "clip_ratio": 0.0031029521487653255, + "completion_length": 821.4464721679688, + "epoch": 0.07324285247889557, + "grad_norm": 0.06288424879312515, + "kl": 0.026275634765625, + "learning_rate": 7.237548202363153e-06, + "loss": 0.0042, + "num_tokens": 46459212.0, + "reward": 0.5586532391607761, + "reward_std": 0.13980219885706902, + "rewards/code_reward": 0.40865323692560196, + "rewards/format_reward": 1.5, + "step": 1963 + }, + { + "clip_ratio": 0.0031544484081678092, + "epoch": 0.07328016417144723, + "grad_norm": 0.06881005316972733, + "kl": 0.0267333984375, + "learning_rate": 7.23485901359688e-06, + "loss": 0.0041, + "step": 1964 + }, + { + "clip_ratio": 0.002966865198686719, + "epoch": 0.07331747586399888, + "grad_norm": 0.05928759276866913, + "kl": 0.02606201171875, + "learning_rate": 7.2321690969159904e-06, + "loss": 0.0039, + "step": 1965 + }, + { + "clip_ratio": 0.004815163265448064, + "completion_length": 627.5536117553711, + "epoch": 0.07335478755655053, + "grad_norm": 0.046548642218112946, + "kl": 0.0221405029296875, + "learning_rate": 7.2294784534491245e-06, + "loss": 0.0085, + "num_tokens": 46518575.0, + "reward": 0.3504161275923252, + "reward_std": 0.010240938514471054, + "rewards/code_reward": 0.20041608810424805, + "rewards/format_reward": 1.5, + "step": 1966 + }, + { + "clip_ratio": 0.004119937482755631, + "epoch": 0.07339209924910219, + "grad_norm": 0.0429682694375515, + "kl": 0.022491455078125, + "learning_rate": 7.226787084325224e-06, + "loss": 0.0082, + "step": 1967 + }, + { + "clip_ratio": 0.004477882175706327, + "epoch": 0.07342941094165384, + "grad_norm": 0.03906874731183052, + "kl": 0.0221405029296875, + "learning_rate": 7.224094990673541e-06, + "loss": 0.0081, + "step": 1968 + }, + { + "clip_ratio": 0.0033621449256315827, + "completion_length": 628.7500381469727, + "epoch": 0.07346672263420549, + "grad_norm": 0.06152952462434769, + "kl": 0.025543212890625, + "learning_rate": 7.221402173623625e-06, + "loss": -0.0061, + "num_tokens": 46575933.0, + "reward": 0.23306920006871223, + "reward_std": 0.07093996368348598, + "rewards/code_reward": 0.08306917594745755, + "rewards/format_reward": 1.5, + "step": 1969 + }, + { + "clip_ratio": 0.003760304069146514, + "epoch": 0.07350403432675714, + "grad_norm": 0.06092012673616409, + "kl": 0.026336669921875, + "learning_rate": 7.218708634305334e-06, + "loss": -0.0061, + "step": 1970 + }, + { + "clip_ratio": 0.0036867392482236028, + "epoch": 0.0735413460193088, + "grad_norm": 0.06430985778570175, + "kl": 0.02642822265625, + "learning_rate": 7.216014373848824e-06, + "loss": -0.0064, + "step": 1971 + }, + { + "clip_ratio": 0.004714792361482978, + "completion_length": 669.0178985595703, + "epoch": 0.07357865771186045, + "grad_norm": 0.08896001428365707, + "kl": 0.02783203125, + "learning_rate": 7.21331939338456e-06, + "loss": 0.0009, + "num_tokens": 46652346.0, + "reward": 0.27565763890743256, + "reward_std": 0.1320190727710724, + "rewards/code_reward": 0.12565761525183916, + "rewards/format_reward": 1.5, + "step": 1972 + }, + { + "clip_ratio": 0.004705418716184795, + "epoch": 0.0736159694044121, + "grad_norm": 0.0795099139213562, + "kl": 0.02801513671875, + "learning_rate": 7.2106236940433035e-06, + "loss": 0.0007, + "step": 1973 + }, + { + "clip_ratio": 0.004596685175783932, + "epoch": 0.07365328109696376, + "grad_norm": 0.07827934622764587, + "kl": 0.02703857421875, + "learning_rate": 7.207927276956123e-06, + "loss": 0.0004, + "step": 1974 + }, + { + "clip_ratio": 0.0038837537285871804, + "completion_length": 551.3928756713867, + "epoch": 0.07369059278951541, + "grad_norm": 0.06645648926496506, + "kl": 0.0286865234375, + "learning_rate": 7.205230143254382e-06, + "loss": 0.0062, + "num_tokens": 46716894.0, + "reward": 0.6099871583282948, + "reward_std": 0.1960584744811058, + "rewards/code_reward": 0.45998714864254, + "rewards/format_reward": 1.5, + "step": 1975 + }, + { + "clip_ratio": 0.003411108104046434, + "epoch": 0.07372790448206706, + "grad_norm": 0.06522858142852783, + "kl": 0.028289794921875, + "learning_rate": 7.202532294069747e-06, + "loss": 0.0061, + "step": 1976 + }, + { + "clip_ratio": 0.0038718251744285226, + "epoch": 0.07376521617461872, + "grad_norm": 0.06002403050661087, + "kl": 0.028656005859375, + "learning_rate": 7.199833730534187e-06, + "loss": 0.0058, + "step": 1977 + }, + { + "clip_ratio": 0.004379337537102401, + "completion_length": 751.3750152587891, + "epoch": 0.07380252786717037, + "grad_norm": 0.06440170854330063, + "kl": 0.0201416015625, + "learning_rate": 7.19713445377997e-06, + "loss": 0.0122, + "num_tokens": 46793131.0, + "reward": 0.19172747805714607, + "reward_std": 0.05078306212089956, + "rewards/code_reward": 0.041727465111762285, + "rewards/format_reward": 1.5, + "step": 1978 + }, + { + "clip_ratio": 0.004258617700543255, + "epoch": 0.07383983955972202, + "grad_norm": 0.0644465982913971, + "kl": 0.0206298828125, + "learning_rate": 7.1944344649396625e-06, + "loss": 0.012, + "step": 1979 + }, + { + "clip_ratio": 0.004176486167125404, + "epoch": 0.07387715125227368, + "grad_norm": 0.059023573994636536, + "kl": 0.02093505859375, + "learning_rate": 7.191733765146126e-06, + "loss": 0.0118, + "step": 1980 + }, + { + "clip_ratio": 0.0034483474446460605, + "completion_length": 862.6428985595703, + "epoch": 0.07391446294482533, + "grad_norm": 0.1011565774679184, + "kl": 0.020904541015625, + "learning_rate": 7.189032355532528e-06, + "loss": 0.0112, + "num_tokens": 46879617.0, + "reward": 0.5181436464190483, + "reward_std": 0.16839465266093612, + "rewards/code_reward": 0.36814362462610006, + "rewards/format_reward": 1.5, + "step": 1981 + }, + { + "clip_ratio": 0.0034510244731791317, + "epoch": 0.07395177463737698, + "grad_norm": 0.08135794848203659, + "kl": 0.020965576171875, + "learning_rate": 7.186330237232329e-06, + "loss": 0.0111, + "step": 1982 + }, + { + "clip_ratio": 0.00331986416131258, + "epoch": 0.07398908632992864, + "grad_norm": 0.07146086543798447, + "kl": 0.021270751953125, + "learning_rate": 7.183627411379286e-06, + "loss": 0.0106, + "step": 1983 + }, + { + "clip_ratio": 0.0031160515500232577, + "completion_length": 663.1786041259766, + "epoch": 0.07402639802248029, + "grad_norm": 0.07776926457881927, + "kl": 0.02777099609375, + "learning_rate": 7.180923879107456e-06, + "loss": 0.0016, + "num_tokens": 46959945.0, + "reward": 0.5577943660318851, + "reward_std": 0.1611664853990078, + "rewards/code_reward": 0.40779436752200127, + "rewards/format_reward": 1.5, + "step": 1984 + }, + { + "clip_ratio": 0.0033240781631320715, + "epoch": 0.07406370971503194, + "grad_norm": 0.14191469550132751, + "kl": 0.0286865234375, + "learning_rate": 7.17821964155119e-06, + "loss": 0.0015, + "step": 1985 + }, + { + "clip_ratio": 0.0031440292368642986, + "epoch": 0.0741010214075836, + "grad_norm": 0.06216135993599892, + "kl": 0.030181884765625, + "learning_rate": 7.1755146998451375e-06, + "loss": 0.0014, + "step": 1986 + }, + { + "clip_ratio": 0.0018929140060208738, + "completion_length": 496.232177734375, + "epoch": 0.07413833310013525, + "grad_norm": 0.06257088482379913, + "kl": 0.026824951171875, + "learning_rate": 7.172809055124239e-06, + "loss": 0.0068, + "num_tokens": 47013886.0, + "reward": 1.049531176686287, + "reward_std": 0.17549537122249603, + "rewards/code_reward": 0.8995311558246613, + "rewards/format_reward": 1.5, + "step": 1987 + }, + { + "clip_ratio": 0.0014926641597412527, + "epoch": 0.07417564479268692, + "grad_norm": 0.060302406549453735, + "kl": 0.02740478515625, + "learning_rate": 7.170102708523736e-06, + "loss": 0.0065, + "step": 1988 + }, + { + "clip_ratio": 0.0011724347132258117, + "epoch": 0.07421295648523857, + "grad_norm": 0.05903962254524231, + "kl": 0.027130126953125, + "learning_rate": 7.167395661179158e-06, + "loss": 0.0063, + "step": 1989 + }, + { + "clip_ratio": 0.004759554052725434, + "completion_length": 733.107177734375, + "epoch": 0.07425026817779022, + "grad_norm": 0.07479026913642883, + "kl": 0.03173828125, + "learning_rate": 7.164687914226335e-06, + "loss": -0.0014, + "num_tokens": 47089310.0, + "reward": 0.3995603509247303, + "reward_std": 0.0655060219578445, + "rewards/code_reward": 0.2495603293646127, + "rewards/format_reward": 1.5, + "step": 1990 + }, + { + "clip_ratio": 0.004345093155279756, + "epoch": 0.07428757987034187, + "grad_norm": 0.08209564536809921, + "kl": 0.0316619873046875, + "learning_rate": 7.161979468801385e-06, + "loss": -0.0017, + "step": 1991 + }, + { + "clip_ratio": 0.004549145058263093, + "epoch": 0.07432489156289353, + "grad_norm": 0.08162767440080643, + "kl": 0.030120849609375, + "learning_rate": 7.1592703260407215e-06, + "loss": -0.0019, + "step": 1992 + }, + { + "clip_ratio": 0.004110110574401915, + "completion_length": 588.2500152587891, + "epoch": 0.07436220325544518, + "grad_norm": 0.1162894144654274, + "kl": 0.02484130859375, + "learning_rate": 7.156560487081052e-06, + "loss": 0.0155, + "num_tokens": 47165694.0, + "reward": 0.6822132021188736, + "reward_std": 0.25865787640213966, + "rewards/code_reward": 0.5322131924331188, + "rewards/format_reward": 1.5, + "step": 1993 + }, + { + "clip_ratio": 0.004271031706593931, + "epoch": 0.07439951494799683, + "grad_norm": 0.09128385782241821, + "kl": 0.02447509765625, + "learning_rate": 7.153849953059373e-06, + "loss": 0.0151, + "step": 1994 + }, + { + "clip_ratio": 0.004174312693066895, + "epoch": 0.07443682664054849, + "grad_norm": 0.08899909257888794, + "kl": 0.0249481201171875, + "learning_rate": 7.1511387251129745e-06, + "loss": 0.0148, + "step": 1995 + }, + { + "clip_ratio": 0.004714775597676635, + "completion_length": 715.7143173217773, + "epoch": 0.07447413833310014, + "grad_norm": 0.07745671272277832, + "kl": 0.02593994140625, + "learning_rate": 7.148426804379438e-06, + "loss": 0.0117, + "num_tokens": 47241138.0, + "reward": 0.6565327793359756, + "reward_std": 0.1987793631851673, + "rewards/code_reward": 0.5065327733755112, + "rewards/format_reward": 1.5, + "step": 1996 + }, + { + "clip_ratio": 0.00452458905056119, + "epoch": 0.0745114500256518, + "grad_norm": 0.07221713662147522, + "kl": 0.026153564453125, + "learning_rate": 7.145714191996636e-06, + "loss": 0.0115, + "step": 1997 + }, + { + "clip_ratio": 0.004285515460651368, + "epoch": 0.07454876171820345, + "grad_norm": 0.06802409887313843, + "kl": 0.0264892578125, + "learning_rate": 7.143000889102728e-06, + "loss": 0.0112, + "step": 1998 + }, + { + "clip_ratio": 0.003408748365473002, + "completion_length": 711.1250305175781, + "epoch": 0.0745860734107551, + "grad_norm": 0.08806417137384415, + "kl": 0.02410888671875, + "learning_rate": 7.140286896836168e-06, + "loss": 0.0105, + "num_tokens": 47318845.0, + "reward": 0.6413819454610348, + "reward_std": 0.21658667922019958, + "rewards/code_reward": 0.491381898522377, + "rewards/format_reward": 1.5, + "step": 1999 + }, + { + "clip_ratio": 0.00358980312012136, + "epoch": 0.07462338510330675, + "grad_norm": 0.077362559735775, + "kl": 0.0242919921875, + "learning_rate": 7.137572216335695e-06, + "loss": 0.0103, + "step": 2000 + }, + { + "clip_ratio": 0.0035194832016713917, + "epoch": 0.0746606967958584, + "grad_norm": 0.06679049134254456, + "kl": 0.02423095703125, + "learning_rate": 7.134856848740338e-06, + "loss": 0.0098, + "step": 2001 + }, + { + "clip_ratio": 0.002718106989050284, + "completion_length": 649.6607666015625, + "epoch": 0.07469800848841006, + "grad_norm": 0.0740065649151802, + "kl": 0.025360107421875, + "learning_rate": 7.132140795189416e-06, + "loss": 0.0109, + "num_tokens": 47379656.0, + "reward": 0.5911405980587006, + "reward_std": 0.19253409560769796, + "rewards/code_reward": 0.4411406018771231, + "rewards/format_reward": 1.5, + "step": 2002 + }, + { + "clip_ratio": 0.0035252280067652464, + "epoch": 0.07473532018096171, + "grad_norm": 0.07627386599779129, + "kl": 0.025634765625, + "learning_rate": 7.129424056822534e-06, + "loss": 0.0111, + "step": 2003 + }, + { + "clip_ratio": 0.0034463960328139365, + "epoch": 0.07477263187351336, + "grad_norm": 0.07077978551387787, + "kl": 0.02557373046875, + "learning_rate": 7.1267066347795854e-06, + "loss": 0.0107, + "step": 2004 + }, + { + "clip_ratio": 0.0018709455616772175, + "completion_length": 538.9643096923828, + "epoch": 0.07480994356606502, + "grad_norm": 0.06616933643817902, + "kl": 0.0217437744140625, + "learning_rate": 7.123988530200751e-06, + "loss": 0.0091, + "num_tokens": 47439812.0, + "reward": 0.8269890397787094, + "reward_std": 0.19333912432193756, + "rewards/code_reward": 0.6769890375435352, + "rewards/format_reward": 1.5, + "step": 2005 + }, + { + "clip_ratio": 0.0017049391753971577, + "epoch": 0.07484725525861667, + "grad_norm": 0.06616534292697906, + "kl": 0.0212860107421875, + "learning_rate": 7.121269744226494e-06, + "loss": 0.009, + "step": 2006 + }, + { + "clip_ratio": 0.001910310995299369, + "epoch": 0.07488456695116832, + "grad_norm": 0.06829256564378738, + "kl": 0.02178955078125, + "learning_rate": 7.118550277997569e-06, + "loss": 0.0087, + "step": 2007 + }, + { + "clip_ratio": 0.003821214020717889, + "completion_length": 594.7321701049805, + "epoch": 0.07492187864371998, + "grad_norm": 0.09078340232372284, + "kl": 0.02734375, + "learning_rate": 7.115830132655014e-06, + "loss": -0.0015, + "num_tokens": 47506759.0, + "reward": 0.761334739625454, + "reward_std": 0.3473145291209221, + "rewards/code_reward": 0.6113347299396992, + "rewards/format_reward": 1.5, + "step": 2008 + }, + { + "clip_ratio": 0.0034199530782643706, + "epoch": 0.07495919033627163, + "grad_norm": 0.08839789032936096, + "kl": 0.027496337890625, + "learning_rate": 7.113109309340149e-06, + "loss": -0.0016, + "step": 2009 + }, + { + "clip_ratio": 0.0033757142955437303, + "epoch": 0.07499650202882328, + "grad_norm": 0.08447742462158203, + "kl": 0.02764892578125, + "learning_rate": 7.110387809194582e-06, + "loss": -0.0023, + "step": 2010 + }, + { + "clip_ratio": 0.00498470471939072, + "completion_length": 608.803596496582, + "epoch": 0.07503381372137494, + "grad_norm": 0.11361449956893921, + "kl": 0.029388427734375, + "learning_rate": 7.107665633360206e-06, + "loss": 0.007, + "num_tokens": 47574718.0, + "reward": 0.4955572560429573, + "reward_std": 0.3106734938919544, + "rewards/code_reward": 0.34823581390082836, + "rewards/format_reward": 1.4732142984867096, + "step": 2011 + }, + { + "clip_ratio": 0.003977060900069773, + "epoch": 0.07507112541392659, + "grad_norm": 0.09963659197092056, + "kl": 0.02899169921875, + "learning_rate": 7.104942782979192e-06, + "loss": 0.0068, + "step": 2012 + }, + { + "clip_ratio": 0.004126884683500975, + "epoch": 0.07510843710647824, + "grad_norm": 0.09261474013328552, + "kl": 0.029449462890625, + "learning_rate": 7.1022192591939985e-06, + "loss": 0.0061, + "step": 2013 + }, + { + "clip_ratio": 0.0017818387132138014, + "completion_length": 671.6785888671875, + "epoch": 0.0751457487990299, + "grad_norm": 0.05507858842611313, + "kl": 0.0234375, + "learning_rate": 7.0994950631473684e-06, + "loss": 0.0021, + "num_tokens": 47647094.0, + "reward": 0.8422268778085709, + "reward_std": 0.2251264452934265, + "rewards/code_reward": 0.6922268867492676, + "rewards/format_reward": 1.5, + "step": 2014 + }, + { + "clip_ratio": 0.0018788682646118104, + "epoch": 0.07518306049158155, + "grad_norm": 0.052819643169641495, + "kl": 0.023956298828125, + "learning_rate": 7.0967701959823185e-06, + "loss": 0.002, + "step": 2015 + }, + { + "clip_ratio": 0.0013775923289358616, + "epoch": 0.0752203721841332, + "grad_norm": 0.047467030584812164, + "kl": 0.02435302734375, + "learning_rate": 7.094044658842156e-06, + "loss": 0.0017, + "step": 2016 + }, + { + "clip_ratio": 0.0035457992926239967, + "completion_length": 573.6964645385742, + "epoch": 0.07525768387668486, + "grad_norm": 0.07602495700120926, + "kl": 0.035491943359375, + "learning_rate": 7.091318452870466e-06, + "loss": 0.0128, + "num_tokens": 47716755.0, + "reward": 0.35706470161676407, + "reward_std": 0.01756937149912119, + "rewards/code_reward": 0.2070647026412189, + "rewards/format_reward": 1.5, + "step": 2017 + }, + { + "clip_ratio": 0.0037210326408967376, + "epoch": 0.07529499556923651, + "grad_norm": 0.07165718823671341, + "kl": 0.036285400390625, + "learning_rate": 7.088591579211113e-06, + "loss": 0.0125, + "step": 2018 + }, + { + "clip_ratio": 0.0036175338900648057, + "epoch": 0.07533230726178816, + "grad_norm": 0.06877605617046356, + "kl": 0.03619384765625, + "learning_rate": 7.085864039008242e-06, + "loss": 0.0123, + "step": 2019 + }, + { + "clip_ratio": 0.003644620592240244, + "completion_length": 871.303596496582, + "epoch": 0.07536961895433981, + "grad_norm": 0.08622518181800842, + "kl": 0.0251312255859375, + "learning_rate": 7.083135833406276e-06, + "loss": 0.1184, + "num_tokens": 47788200.0, + "reward": 0.4329707883298397, + "reward_std": 0.05120203737169504, + "rewards/code_reward": 0.2952922089025378, + "rewards/format_reward": 1.3767857253551483, + "step": 2020 + }, + { + "clip_ratio": 0.003338955924846232, + "epoch": 0.07540693064689147, + "grad_norm": 0.07999008148908615, + "kl": 0.0267486572265625, + "learning_rate": 7.080406963549925e-06, + "loss": 0.1181, + "step": 2021 + }, + { + "clip_ratio": 0.0027429951587691903, + "epoch": 0.07544424233944312, + "grad_norm": 0.073165163397789, + "kl": 0.026580810546875, + "learning_rate": 7.077677430584169e-06, + "loss": 0.1176, + "step": 2022 + }, + { + "clip_ratio": 0.004369795438833535, + "completion_length": 568.7143020629883, + "epoch": 0.07548155403199477, + "grad_norm": 0.0775269865989685, + "kl": 0.029510498046875, + "learning_rate": 7.07494723565427e-06, + "loss": -0.0076, + "num_tokens": 47843148.0, + "reward": 0.2257554903626442, + "reward_std": 0.013502325455192477, + "rewards/code_reward": 0.07575547732994892, + "rewards/format_reward": 1.5, + "step": 2023 + }, + { + "clip_ratio": 0.004585331305861473, + "epoch": 0.07551886572454643, + "grad_norm": 0.07463868707418442, + "kl": 0.029571533203125, + "learning_rate": 7.0722163799057646e-06, + "loss": -0.0076, + "step": 2024 + }, + { + "clip_ratio": 0.004510969738475978, + "epoch": 0.07555617741709808, + "grad_norm": 0.07705334573984146, + "kl": 0.0294189453125, + "learning_rate": 7.069484864484471e-06, + "loss": -0.0079, + "step": 2025 + }, + { + "clip_ratio": 0.004351783660240471, + "completion_length": 787.6607360839844, + "epoch": 0.07559348910964973, + "grad_norm": 0.09050148725509644, + "kl": 0.02899169921875, + "learning_rate": 7.066752690536483e-06, + "loss": 0.0098, + "num_tokens": 47924133.0, + "reward": 0.6672390177845955, + "reward_std": 0.27755211293697357, + "rewards/code_reward": 0.5199175830930471, + "rewards/format_reward": 1.4732142984867096, + "step": 2026 + }, + { + "clip_ratio": 0.003571663866750896, + "epoch": 0.07563080080220139, + "grad_norm": 0.0853273868560791, + "kl": 0.027862548828125, + "learning_rate": 7.06401985920817e-06, + "loss": 0.0094, + "step": 2027 + }, + { + "clip_ratio": 0.0036633124691434205, + "epoch": 0.07566811249475304, + "grad_norm": 0.07968306541442871, + "kl": 0.0284423828125, + "learning_rate": 7.0612863716461754e-06, + "loss": 0.0091, + "step": 2028 + }, + { + "clip_ratio": 0.004309072275646031, + "completion_length": 631.6785888671875, + "epoch": 0.07570542418730469, + "grad_norm": 0.12688767910003662, + "kl": 0.032989501953125, + "learning_rate": 7.058552228997421e-06, + "loss": -0.002, + "num_tokens": 47999995.0, + "reward": 0.6649813055992126, + "reward_std": 0.2977083809673786, + "rewards/code_reward": 0.5149813070893288, + "rewards/format_reward": 1.5, + "step": 2029 + }, + { + "clip_ratio": 0.004016008344478905, + "epoch": 0.07574273587985635, + "grad_norm": 0.10281506925821304, + "kl": 0.034088134765625, + "learning_rate": 7.055817432409103e-06, + "loss": -0.0023, + "step": 2030 + }, + { + "clip_ratio": 0.004083916253875941, + "epoch": 0.075780047572408, + "grad_norm": 0.09690448641777039, + "kl": 0.03369140625, + "learning_rate": 7.05308198302869e-06, + "loss": -0.0028, + "step": 2031 + }, + { + "clip_ratio": 0.004198928421828896, + "completion_length": 655.0714569091797, + "epoch": 0.07581735926495965, + "grad_norm": 0.09466149657964706, + "kl": 0.04058837890625, + "learning_rate": 7.050345882003928e-06, + "loss": 0.049, + "num_tokens": 48065715.0, + "reward": 0.7276785783469677, + "reward_std": 0.2626022379845381, + "rewards/code_reward": 0.5803571464493871, + "rewards/format_reward": 1.4732142984867096, + "step": 2032 + }, + { + "clip_ratio": 0.004166237602476031, + "epoch": 0.0758546709575113, + "grad_norm": 0.08551596105098724, + "kl": 0.04046630859375, + "learning_rate": 7.047609130482831e-06, + "loss": 0.0488, + "step": 2033 + }, + { + "clip_ratio": 0.004079105332493782, + "epoch": 0.07589198265006296, + "grad_norm": 0.09186595678329468, + "kl": 0.041748046875, + "learning_rate": 7.04487172961369e-06, + "loss": 0.0482, + "step": 2034 + }, + { + "clip_ratio": 0.004408658598549664, + "completion_length": 763.1786041259766, + "epoch": 0.07592929434261461, + "grad_norm": 0.07627338171005249, + "kl": 0.022735595703125, + "learning_rate": 7.04213368054507e-06, + "loss": -0.0079, + "num_tokens": 48147359.0, + "reward": 0.23478785157203674, + "reward_std": 0.05299810180440545, + "rewards/code_reward": 0.08478781627491117, + "rewards/format_reward": 1.5, + "step": 2035 + }, + { + "clip_ratio": 0.004188002465525642, + "epoch": 0.07596660603516626, + "grad_norm": 0.0779477059841156, + "kl": 0.023162841796875, + "learning_rate": 7.039394984425802e-06, + "loss": -0.008, + "step": 2036 + }, + { + "clip_ratio": 0.004017817846033722, + "epoch": 0.07600391772771792, + "grad_norm": 0.07481900602579117, + "kl": 0.02337646484375, + "learning_rate": 7.036655642404996e-06, + "loss": -0.0084, + "step": 2037 + }, + { + "clip_ratio": 0.004292829835321754, + "completion_length": 546.8393020629883, + "epoch": 0.07604122942026957, + "grad_norm": 0.07981923222541809, + "kl": 0.0291290283203125, + "learning_rate": 7.033915655632023e-06, + "loss": 0.007, + "num_tokens": 48201572.0, + "reward": 0.5891741216182709, + "reward_std": 0.1984277586452663, + "rewards/code_reward": 0.43917411379516125, + "rewards/format_reward": 1.5, + "step": 2038 + }, + { + "clip_ratio": 0.0035581703414209187, + "epoch": 0.07607854111282122, + "grad_norm": 0.08350853621959686, + "kl": 0.0296630859375, + "learning_rate": 7.031175025256537e-06, + "loss": 0.0067, + "step": 2039 + }, + { + "clip_ratio": 0.0037105337250977755, + "epoch": 0.07611585280537288, + "grad_norm": 0.07734107971191406, + "kl": 0.029754638671875, + "learning_rate": 7.028433752428453e-06, + "loss": 0.0065, + "step": 2040 + }, + { + "clip_ratio": 0.003961622714996338, + "completion_length": 641.0536041259766, + "epoch": 0.07615316449792453, + "grad_norm": 0.05272841081023216, + "kl": 0.0260009765625, + "learning_rate": 7.025691838297958e-06, + "loss": 0.0218, + "num_tokens": 48256889.0, + "reward": 0.21071430295705795, + "reward_std": 0.08497213944792747, + "rewards/code_reward": 0.06071428768336773, + "rewards/format_reward": 1.5, + "step": 2041 + }, + { + "clip_ratio": 0.003561708115739748, + "epoch": 0.0761904761904762, + "grad_norm": 0.05652724951505661, + "kl": 0.026397705078125, + "learning_rate": 7.0229492840155055e-06, + "loss": 0.0217, + "step": 2042 + }, + { + "clip_ratio": 0.00405958344344981, + "epoch": 0.07622778788302785, + "grad_norm": 0.052204690873622894, + "kl": 0.026336669921875, + "learning_rate": 7.020206090731826e-06, + "loss": 0.0215, + "step": 2043 + }, + { + "clip_ratio": 0.00350855675060302, + "completion_length": 582.0000305175781, + "epoch": 0.0762650995755795, + "grad_norm": 0.08030658215284348, + "kl": 0.03717041015625, + "learning_rate": 7.017462259597908e-06, + "loss": -0.0119, + "num_tokens": 48322605.0, + "reward": 0.8888866528868675, + "reward_std": 0.14264278672635555, + "rewards/code_reward": 0.738886646926403, + "rewards/format_reward": 1.5, + "step": 2044 + }, + { + "clip_ratio": 0.002646047214511782, + "epoch": 0.07630241126813116, + "grad_norm": 0.07677503675222397, + "kl": 0.037506103515625, + "learning_rate": 7.0147177917650145e-06, + "loss": -0.0123, + "step": 2045 + }, + { + "clip_ratio": 0.0028728112229146063, + "epoch": 0.07633972296068281, + "grad_norm": 0.09579726308584213, + "kl": 0.034820556640625, + "learning_rate": 7.011972688384673e-06, + "loss": -0.0124, + "step": 2046 + }, + { + "clip_ratio": 0.00503242714330554, + "completion_length": 752.8393173217773, + "epoch": 0.07637703465323446, + "grad_norm": 0.06358505040407181, + "kl": 0.033966064453125, + "learning_rate": 7.0092269506086765e-06, + "loss": 0.004, + "num_tokens": 48407016.0, + "reward": 0.29688379541039467, + "reward_std": 0.05002240464091301, + "rewards/code_reward": 0.1468837410211563, + "rewards/format_reward": 1.5, + "step": 2047 + }, + { + "clip_ratio": 0.004615456215105951, + "epoch": 0.07641434634578612, + "grad_norm": 0.05880720168352127, + "kl": 0.02960205078125, + "learning_rate": 7.006480579589085e-06, + "loss": 0.0039, + "step": 2048 + }, + { + "clip_ratio": 0.0047384200734086335, + "epoch": 0.07645165803833777, + "grad_norm": 0.049698054790496826, + "kl": 0.02886962890625, + "learning_rate": 7.003733576478227e-06, + "loss": 0.0038, + "step": 2049 + }, + { + "clip_ratio": 0.0031999649945646524, + "completion_length": 714.4464797973633, + "epoch": 0.07648896973088942, + "grad_norm": 0.08972039073705673, + "kl": 0.024627685546875, + "learning_rate": 7.000985942428694e-06, + "loss": -0.0018, + "num_tokens": 48485119.0, + "reward": 0.5508715212345123, + "reward_std": 0.12273811921477318, + "rewards/code_reward": 0.40087148919701576, + "rewards/format_reward": 1.5, + "step": 2050 + }, + { + "clip_ratio": 0.0032655037939548492, + "epoch": 0.07652628142344108, + "grad_norm": 0.08308298885822296, + "kl": 0.0243988037109375, + "learning_rate": 6.99823767859334e-06, + "loss": -0.0019, + "step": 2051 + }, + { + "clip_ratio": 0.0034997552866116166, + "epoch": 0.07656359311599273, + "grad_norm": 0.06430862843990326, + "kl": 0.0251007080078125, + "learning_rate": 6.995488786125287e-06, + "loss": -0.0022, + "step": 2052 + }, + { + "clip_ratio": 0.003025997953955084, + "completion_length": 799.178596496582, + "epoch": 0.07660090480854438, + "grad_norm": 0.06878901273012161, + "kl": 0.02899169921875, + "learning_rate": 6.992739266177918e-06, + "loss": 0.0285, + "num_tokens": 48567575.0, + "reward": 0.44864967092871666, + "reward_std": 0.07282950729131699, + "rewards/code_reward": 0.3040068056434393, + "rewards/format_reward": 1.4464285671710968, + "step": 2053 + }, + { + "clip_ratio": 0.00346033344976604, + "epoch": 0.07663821650109603, + "grad_norm": 0.06069648638367653, + "kl": 0.028656005859375, + "learning_rate": 6.989989119904883e-06, + "loss": 0.0282, + "step": 2054 + }, + { + "clip_ratio": 0.0034614038304425776, + "epoch": 0.07667552819364769, + "grad_norm": 0.06076701357960701, + "kl": 0.02911376953125, + "learning_rate": 6.987238348460089e-06, + "loss": 0.0281, + "step": 2055 + }, + { + "clip_ratio": 0.004019030777271837, + "completion_length": 635.4643249511719, + "epoch": 0.07671283988619934, + "grad_norm": 0.09845862537622452, + "kl": 0.025054931640625, + "learning_rate": 6.984486952997713e-06, + "loss": 0.0028, + "num_tokens": 48632463.0, + "reward": 0.7324654161930084, + "reward_std": 0.25034582428634167, + "rewards/code_reward": 0.5824654195457697, + "rewards/format_reward": 1.5, + "step": 2056 + }, + { + "clip_ratio": 0.003283090831246227, + "epoch": 0.076750151578751, + "grad_norm": 0.08844514936208725, + "kl": 0.024810791015625, + "learning_rate": 6.9817349346721845e-06, + "loss": 0.0022, + "step": 2057 + }, + { + "clip_ratio": 0.0038273954414762557, + "epoch": 0.07678746327130265, + "grad_norm": 0.08042728155851364, + "kl": 0.025482177734375, + "learning_rate": 6.978982294638203e-06, + "loss": 0.0017, + "step": 2058 + }, + { + "clip_ratio": 0.0032430937862955034, + "completion_length": 819.3035888671875, + "epoch": 0.0768247749638543, + "grad_norm": 0.08425258845090866, + "kl": 0.03692626953125, + "learning_rate": 6.976229034050723e-06, + "loss": 0.0379, + "num_tokens": 48719602.0, + "reward": 0.5059913955628872, + "reward_std": 0.09039392601698637, + "rewards/code_reward": 0.35866992734372616, + "rewards/format_reward": 1.4732142984867096, + "step": 2059 + }, + { + "clip_ratio": 0.003167800430674106, + "epoch": 0.07686208665640595, + "grad_norm": 0.07009594142436981, + "kl": 0.033294677734375, + "learning_rate": 6.97347515406496e-06, + "loss": 0.0377, + "step": 2060 + }, + { + "clip_ratio": 0.0024579549790360034, + "epoch": 0.0768993983489576, + "grad_norm": 0.06438075006008148, + "kl": 0.034423828125, + "learning_rate": 6.970720655836396e-06, + "loss": 0.0372, + "step": 2061 + }, + { + "clip_ratio": 0.0053412000997923315, + "completion_length": 569.5893020629883, + "epoch": 0.07693671004150926, + "grad_norm": 0.06987496465444565, + "kl": 0.029388427734375, + "learning_rate": 6.967965540520763e-06, + "loss": 0.0137, + "num_tokens": 48777889.0, + "reward": 0.5268367975950241, + "reward_std": 0.13308561220765114, + "rewards/code_reward": 0.37683673249557614, + "rewards/format_reward": 1.5, + "step": 2062 + }, + { + "clip_ratio": 0.004828117787837982, + "epoch": 0.07697402173406091, + "grad_norm": 0.07345501333475113, + "kl": 0.029510498046875, + "learning_rate": 6.9652098092740585e-06, + "loss": 0.0136, + "step": 2063 + }, + { + "clip_ratio": 0.0050323292962275445, + "epoch": 0.07701133342661257, + "grad_norm": 0.06736921519041061, + "kl": 0.028900146484375, + "learning_rate": 6.962453463252533e-06, + "loss": 0.0133, + "step": 2064 + }, + { + "clip_ratio": 0.0047674610977992415, + "completion_length": 760.7678985595703, + "epoch": 0.07704864511916422, + "grad_norm": 0.19995316863059998, + "kl": 0.0244140625, + "learning_rate": 6.9596965036127015e-06, + "loss": 0.0003, + "num_tokens": 48851712.0, + "reward": 0.16510776430368423, + "reward_std": 0.03627623803913593, + "rewards/code_reward": 0.015107748098671436, + "rewards/format_reward": 1.5, + "step": 2065 + }, + { + "clip_ratio": 0.004709688131697476, + "epoch": 0.07708595681171587, + "grad_norm": 0.0554807223379612, + "kl": 0.024658203125, + "learning_rate": 6.956938931511332e-06, + "loss": 0.0001, + "step": 2066 + }, + { + "clip_ratio": 0.004735390073619783, + "epoch": 0.07712326850426753, + "grad_norm": 0.05388617888092995, + "kl": 0.0238037109375, + "learning_rate": 6.95418074810545e-06, + "loss": -0.0002, + "step": 2067 + }, + { + "clip_ratio": 0.003622811404056847, + "completion_length": 720.9107513427734, + "epoch": 0.07716058019681918, + "grad_norm": 0.0756058543920517, + "kl": 0.0224609375, + "learning_rate": 6.951421954552339e-06, + "loss": 0.0031, + "num_tokens": 48928455.0, + "reward": 0.621682420372963, + "reward_std": 0.19854185730218887, + "rewards/code_reward": 0.47168242558836937, + "rewards/format_reward": 1.5, + "step": 2068 + }, + { + "clip_ratio": 0.0030070245265960693, + "epoch": 0.07719789188937083, + "grad_norm": 0.07438717037439346, + "kl": 0.02276611328125, + "learning_rate": 6.948662552009535e-06, + "loss": 0.0028, + "step": 2069 + }, + { + "clip_ratio": 0.002773958840407431, + "epoch": 0.07723520358192248, + "grad_norm": 0.06727827340364456, + "kl": 0.022247314453125, + "learning_rate": 6.945902541634836e-06, + "loss": 0.0025, + "step": 2070 + }, + { + "clip_ratio": 0.0035161316045559943, + "completion_length": 843.8214492797852, + "epoch": 0.07727251527447414, + "grad_norm": 0.0966213122010231, + "kl": 0.023651123046875, + "learning_rate": 6.943141924586287e-06, + "loss": 0.002, + "num_tokens": 49002589.0, + "reward": 0.49527909234166145, + "reward_std": 0.09733750484883785, + "rewards/code_reward": 0.34527906589210033, + "rewards/format_reward": 1.5, + "step": 2071 + }, + { + "clip_ratio": 0.002944752079201862, + "epoch": 0.07730982696702579, + "grad_norm": 0.09443225711584091, + "kl": 0.023040771484375, + "learning_rate": 6.940380702022197e-06, + "loss": 0.0016, + "step": 2072 + }, + { + "clip_ratio": 0.0026436843909323215, + "epoch": 0.07734713865957744, + "grad_norm": 0.08302570879459381, + "kl": 0.0233612060546875, + "learning_rate": 6.937618875101117e-06, + "loss": 0.001, + "step": 2073 + }, + { + "clip_ratio": 0.0036373567418195307, + "completion_length": 727.6428985595703, + "epoch": 0.0773844503521291, + "grad_norm": 0.10694549977779388, + "kl": 0.027130126953125, + "learning_rate": 6.934856444981863e-06, + "loss": 0.0593, + "num_tokens": 49072457.0, + "reward": 0.754113283008337, + "reward_std": 0.10425138659775257, + "rewards/code_reward": 0.6094703823328018, + "rewards/format_reward": 1.4464285671710968, + "step": 2074 + }, + { + "clip_ratio": 0.003519778198096901, + "epoch": 0.07742176204468075, + "grad_norm": 0.07891993224620819, + "kl": 0.027587890625, + "learning_rate": 6.9320934128234985e-06, + "loss": 0.0592, + "step": 2075 + }, + { + "clip_ratio": 0.0037120076594874263, + "epoch": 0.0774590737372324, + "grad_norm": 0.07575897872447968, + "kl": 0.03045654296875, + "learning_rate": 6.929329779785339e-06, + "loss": 0.0588, + "step": 2076 + }, + { + "clip_ratio": 0.004645224311389029, + "completion_length": 724.0714721679688, + "epoch": 0.07749638542978406, + "grad_norm": 0.08164651691913605, + "kl": 0.024993896484375, + "learning_rate": 6.926565547026955e-06, + "loss": -0.0072, + "num_tokens": 49143777.0, + "reward": 0.4679250046610832, + "reward_std": 0.14856922253966331, + "rewards/code_reward": 0.31792498379945755, + "rewards/format_reward": 1.5, + "step": 2077 + }, + { + "clip_ratio": 0.004337604506872594, + "epoch": 0.07753369712233571, + "grad_norm": 0.07008609920740128, + "kl": 0.0244140625, + "learning_rate": 6.923800715708167e-06, + "loss": -0.0076, + "step": 2078 + }, + { + "clip_ratio": 0.0045277237659320235, + "epoch": 0.07757100881488736, + "grad_norm": 0.06214187666773796, + "kl": 0.02471923828125, + "learning_rate": 6.921035286989045e-06, + "loss": -0.0078, + "step": 2079 + }, + { + "clip_ratio": 0.002697729680221528, + "completion_length": 686.8393096923828, + "epoch": 0.07760832050743902, + "grad_norm": 0.07556142657995224, + "kl": 0.023895263671875, + "learning_rate": 6.9182692620299155e-06, + "loss": 0.0053, + "num_tokens": 49211780.0, + "reward": 0.509440690279007, + "reward_std": 0.11044824682176113, + "rewards/code_reward": 0.35944068897515535, + "rewards/format_reward": 1.5, + "step": 2080 + }, + { + "clip_ratio": 0.0026756193256005645, + "epoch": 0.07764563219999067, + "grad_norm": 0.07105974107980728, + "kl": 0.024078369140625, + "learning_rate": 6.915502641991348e-06, + "loss": 0.0051, + "step": 2081 + }, + { + "clip_ratio": 0.002851439989171922, + "epoch": 0.07768294389254232, + "grad_norm": 0.06639215350151062, + "kl": 0.023956298828125, + "learning_rate": 6.912735428034163e-06, + "loss": 0.0047, + "step": 2082 + }, + { + "clip_ratio": 0.004795789369381964, + "completion_length": 741.9643249511719, + "epoch": 0.07772025558509398, + "grad_norm": 0.079480841755867, + "kl": 0.0537109375, + "learning_rate": 6.909967621319437e-06, + "loss": -0.0114, + "num_tokens": 49301742.0, + "reward": 0.6189477555453777, + "reward_std": 0.06519469420891255, + "rewards/code_reward": 0.4689477451320272, + "rewards/format_reward": 1.5, + "step": 2083 + }, + { + "clip_ratio": 0.004423842648975551, + "epoch": 0.07775756727764563, + "grad_norm": 0.0782000720500946, + "kl": 0.048858642578125, + "learning_rate": 6.90719922300849e-06, + "loss": -0.0115, + "step": 2084 + }, + { + "clip_ratio": 0.004545401141513139, + "epoch": 0.07779487897019728, + "grad_norm": 0.06620746105909348, + "kl": 0.046295166015625, + "learning_rate": 6.904430234262886e-06, + "loss": -0.0119, + "step": 2085 + }, + { + "clip_ratio": 0.003553982183802873, + "completion_length": 639.7143096923828, + "epoch": 0.07783219066274893, + "grad_norm": 0.0777672529220581, + "kl": 0.0200347900390625, + "learning_rate": 6.901660656244444e-06, + "loss": -0.0004, + "num_tokens": 49366322.0, + "reward": 0.491736751049757, + "reward_std": 0.2567731998860836, + "rewards/code_reward": 0.34173670411109924, + "rewards/format_reward": 1.5, + "step": 2086 + }, + { + "clip_ratio": 0.003529657202307135, + "epoch": 0.07786950235530059, + "grad_norm": 0.07037633657455444, + "kl": 0.0197906494140625, + "learning_rate": 6.898890490115228e-06, + "loss": -0.0006, + "step": 2087 + }, + { + "clip_ratio": 0.003682441427372396, + "epoch": 0.07790681404785224, + "grad_norm": 0.06752961874008179, + "kl": 0.019561767578125, + "learning_rate": 6.896119737037548e-06, + "loss": -0.0011, + "step": 2088 + }, + { + "clip_ratio": 0.0037623451789841056, + "completion_length": 644.7500305175781, + "epoch": 0.0779441257404039, + "grad_norm": 0.07850802689790726, + "kl": 0.0189056396484375, + "learning_rate": 6.893348398173959e-06, + "loss": -0.0086, + "num_tokens": 49434992.0, + "reward": 0.6989394277334213, + "reward_std": 0.15934859961271286, + "rewards/code_reward": 0.5489394143223763, + "rewards/format_reward": 1.5, + "step": 2089 + }, + { + "clip_ratio": 0.003499924670904875, + "epoch": 0.07798143743295555, + "grad_norm": 0.07991218566894531, + "kl": 0.01971435546875, + "learning_rate": 6.890576474687264e-06, + "loss": -0.0087, + "step": 2090 + }, + { + "clip_ratio": 0.0026180963031947613, + "epoch": 0.0780187491255072, + "grad_norm": 0.06627506762742996, + "kl": 0.01934814453125, + "learning_rate": 6.887803967740512e-06, + "loss": -0.0093, + "step": 2091 + }, + { + "clip_ratio": 0.0008647252689115703, + "completion_length": 548.928596496582, + "epoch": 0.07805606081805885, + "grad_norm": 0.040795158594846725, + "kl": 0.02490234375, + "learning_rate": 6.885030878496993e-06, + "loss": 0.0024, + "num_tokens": 49496168.0, + "reward": 1.0056230872869492, + "reward_std": 0.06922309100627899, + "rewards/code_reward": 0.8556231036782265, + "rewards/format_reward": 1.5, + "step": 2092 + }, + { + "clip_ratio": 0.0009352655615657568, + "epoch": 0.0780933725106105, + "grad_norm": 0.04456767439842224, + "kl": 0.023345947265625, + "learning_rate": 6.882257208120247e-06, + "loss": 0.0024, + "step": 2093 + }, + { + "clip_ratio": 0.000935759162530303, + "epoch": 0.07813068420316216, + "grad_norm": 0.03722403943538666, + "kl": 0.023284912109375, + "learning_rate": 6.879482957774053e-06, + "loss": 0.0023, + "step": 2094 + }, + { + "clip_ratio": 0.0030120331794023514, + "completion_length": 485.6428756713867, + "epoch": 0.07816799589571381, + "grad_norm": 0.07754051685333252, + "kl": 0.02630615234375, + "learning_rate": 6.876708128622432e-06, + "loss": -0.0031, + "num_tokens": 49549992.0, + "reward": 0.8527453988790512, + "reward_std": 0.24551354348659515, + "rewards/code_reward": 0.7027453780174255, + "rewards/format_reward": 1.5, + "step": 2095 + }, + { + "clip_ratio": 0.0026117003289982677, + "epoch": 0.07820530758826547, + "grad_norm": 0.07795640826225281, + "kl": 0.026885986328125, + "learning_rate": 6.873932721829655e-06, + "loss": -0.0034, + "step": 2096 + }, + { + "clip_ratio": 0.0026831431314349174, + "epoch": 0.07824261928081713, + "grad_norm": 0.0735989436507225, + "kl": 0.026458740234375, + "learning_rate": 6.871156738560231e-06, + "loss": -0.0039, + "step": 2097 + }, + { + "clip_ratio": 0.002862066787201911, + "completion_length": 616.857177734375, + "epoch": 0.07827993097336879, + "grad_norm": 0.08644985407590866, + "kl": 0.025360107421875, + "learning_rate": 6.868380179978909e-06, + "loss": 0.0104, + "num_tokens": 49613348.0, + "reward": 0.8098563328385353, + "reward_std": 0.21663019806146622, + "rewards/code_reward": 0.6598562747240067, + "rewards/format_reward": 1.5, + "step": 2098 + }, + { + "clip_ratio": 0.002704748709220439, + "epoch": 0.07831724266592044, + "grad_norm": 0.08230523765087128, + "kl": 0.02545166015625, + "learning_rate": 6.865603047250678e-06, + "loss": 0.0103, + "step": 2099 + }, + { + "clip_ratio": 0.0026983808784279972, + "epoch": 0.07835455435847209, + "grad_norm": 0.07475920021533966, + "kl": 0.02593994140625, + "learning_rate": 6.862825341540779e-06, + "loss": 0.0098, + "step": 2100 + }, + { + "clip_ratio": 0.0032981361728161573, + "completion_length": 767.5357513427734, + "epoch": 0.07839186605102375, + "grad_norm": 0.07433145493268967, + "kl": 0.031524658203125, + "learning_rate": 6.860047064014681e-06, + "loss": 0.0055, + "num_tokens": 49702922.0, + "reward": 0.768916018307209, + "reward_std": 0.13060832023620605, + "rewards/code_reward": 0.6189160104840994, + "rewards/format_reward": 1.5, + "step": 2101 + }, + { + "clip_ratio": 0.003305580059532076, + "epoch": 0.0784291777435754, + "grad_norm": 0.06731320917606354, + "kl": 0.031646728515625, + "learning_rate": 6.857268215838098e-06, + "loss": 0.0055, + "step": 2102 + }, + { + "clip_ratio": 0.0032548473682254553, + "epoch": 0.07846648943612705, + "grad_norm": 0.06261119991540909, + "kl": 0.032470703125, + "learning_rate": 6.854488798176986e-06, + "loss": 0.0051, + "step": 2103 + }, + { + "clip_ratio": 0.0040468950173817575, + "completion_length": 689.6250228881836, + "epoch": 0.0785038011286787, + "grad_norm": 0.07345064729452133, + "kl": 0.023468017578125, + "learning_rate": 6.8517088121975325e-06, + "loss": 0.0062, + "num_tokens": 49768463.0, + "reward": 0.7355508804321289, + "reward_std": 0.21544107794761658, + "rewards/code_reward": 0.5855508539825678, + "rewards/format_reward": 1.5, + "step": 2104 + }, + { + "clip_ratio": 0.00406959920655936, + "epoch": 0.07854111282123036, + "grad_norm": 0.07283982634544373, + "kl": 0.022796630859375, + "learning_rate": 6.848928259066171e-06, + "loss": 0.0058, + "step": 2105 + }, + { + "clip_ratio": 0.0036098965210840106, + "epoch": 0.07857842451378201, + "grad_norm": 0.06550870090723038, + "kl": 0.022491455078125, + "learning_rate": 6.846147139949568e-06, + "loss": 0.0055, + "step": 2106 + }, + { + "clip_ratio": 0.0014394563622772694, + "completion_length": 610.0178985595703, + "epoch": 0.07861573620633366, + "grad_norm": 0.06309067457914352, + "kl": 0.02764892578125, + "learning_rate": 6.843365456014632e-06, + "loss": -0.0, + "num_tokens": 49833802.0, + "reward": 1.0054859966039658, + "reward_std": 0.15374475345015526, + "rewards/code_reward": 0.8554859906435013, + "rewards/format_reward": 1.5, + "step": 2107 + }, + { + "clip_ratio": 0.0018109639058820903, + "epoch": 0.07865304789888532, + "grad_norm": 0.06247290223836899, + "kl": 0.027252197265625, + "learning_rate": 6.840583208428504e-06, + "loss": -0.0001, + "step": 2108 + }, + { + "clip_ratio": 0.0016263863071799278, + "epoch": 0.07869035959143697, + "grad_norm": 0.05796550214290619, + "kl": 0.026458740234375, + "learning_rate": 6.8378003983585626e-06, + "loss": -0.0004, + "step": 2109 + }, + { + "clip_ratio": 0.004458974406588823, + "completion_length": 605.2143096923828, + "epoch": 0.07872767128398862, + "grad_norm": 0.114713653922081, + "kl": 0.02423095703125, + "learning_rate": 6.8350170269724255e-06, + "loss": 0.0287, + "num_tokens": 49907300.0, + "reward": 0.7946335077285767, + "reward_std": 0.2549171410501003, + "rewards/code_reward": 0.6446335092186928, + "rewards/format_reward": 1.5, + "step": 2110 + }, + { + "clip_ratio": 0.00482666352763772, + "epoch": 0.07876498297654028, + "grad_norm": 0.1489880234003067, + "kl": 0.02471923828125, + "learning_rate": 6.832233095437944e-06, + "loss": 0.0283, + "step": 2111 + }, + { + "clip_ratio": 0.004468266328331083, + "epoch": 0.07880229466909193, + "grad_norm": 0.09912370145320892, + "kl": 0.024322509765625, + "learning_rate": 6.8294486049232014e-06, + "loss": 0.0278, + "step": 2112 + }, + { + "clip_ratio": 0.0041903870878741145, + "completion_length": 710.1964874267578, + "epoch": 0.07883960636164358, + "grad_norm": 0.135378897190094, + "kl": 0.0589599609375, + "learning_rate": 6.826663556596519e-06, + "loss": 0.0362, + "num_tokens": 49981847.0, + "reward": 0.45994898676872253, + "reward_std": 0.2569291237741709, + "rewards/code_reward": 0.3099489863961935, + "rewards/format_reward": 1.5, + "step": 2113 + }, + { + "clip_ratio": 0.004645151144359261, + "epoch": 0.07887691805419524, + "grad_norm": 0.09901300072669983, + "kl": 0.044921875, + "learning_rate": 6.823877951626452e-06, + "loss": 0.0358, + "step": 2114 + }, + { + "clip_ratio": 0.004411026544403285, + "epoch": 0.07891422974674689, + "grad_norm": 0.10765837877988815, + "kl": 0.044952392578125, + "learning_rate": 6.821091791181788e-06, + "loss": 0.0353, + "step": 2115 + }, + { + "clip_ratio": 0.003687740652821958, + "completion_length": 562.6071624755859, + "epoch": 0.07895154143929854, + "grad_norm": 0.09548664838075638, + "kl": 0.029510498046875, + "learning_rate": 6.8183050764315495e-06, + "loss": -0.0012, + "num_tokens": 50051699.0, + "reward": 0.9525500684976578, + "reward_std": 0.2583514638245106, + "rewards/code_reward": 0.8025500178337097, + "rewards/format_reward": 1.5, + "step": 2116 + }, + { + "clip_ratio": 0.0036900289123877883, + "epoch": 0.0789888531318502, + "grad_norm": 0.09058550000190735, + "kl": 0.028656005859375, + "learning_rate": 6.815517808544988e-06, + "loss": -0.0017, + "step": 2117 + }, + { + "clip_ratio": 0.0037035643472336233, + "epoch": 0.07902616482440185, + "grad_norm": 0.08509614318609238, + "kl": 0.0291748046875, + "learning_rate": 6.812729988691591e-06, + "loss": -0.0023, + "step": 2118 + }, + { + "clip_ratio": 0.0046578567125834525, + "completion_length": 716.2678833007812, + "epoch": 0.0790634765169535, + "grad_norm": 0.08298401534557343, + "kl": 0.031646728515625, + "learning_rate": 6.809941618041076e-06, + "loss": 0.0108, + "num_tokens": 50131850.0, + "reward": 0.3673246316611767, + "reward_std": 0.09420641342876479, + "rewards/code_reward": 0.21732460768544115, + "rewards/format_reward": 1.5, + "step": 2119 + }, + { + "clip_ratio": 0.004883303830865771, + "epoch": 0.07910078820950515, + "grad_norm": 0.08225830644369125, + "kl": 0.031646728515625, + "learning_rate": 6.807152697763391e-06, + "loss": 0.0103, + "step": 2120 + }, + { + "clip_ratio": 0.004715811170171946, + "epoch": 0.07913809990205681, + "grad_norm": 0.07474755495786667, + "kl": 0.032012939453125, + "learning_rate": 6.804363229028715e-06, + "loss": 0.0097, + "step": 2121 + }, + { + "clip_ratio": 0.004261898924596608, + "completion_length": 593.0000305175781, + "epoch": 0.07917541159460846, + "grad_norm": 0.3251110017299652, + "kl": 0.026611328125, + "learning_rate": 6.801573213007456e-06, + "loss": 0.0023, + "num_tokens": 50201080.0, + "reward": 0.618406631052494, + "reward_std": 0.3375258259475231, + "rewards/code_reward": 0.4684066064655781, + "rewards/format_reward": 1.5, + "step": 2122 + }, + { + "clip_ratio": 0.003676584514323622, + "epoch": 0.07921272328716011, + "grad_norm": 0.09088651835918427, + "kl": 0.0267333984375, + "learning_rate": 6.798782650870255e-06, + "loss": 0.0017, + "step": 2123 + }, + { + "clip_ratio": 0.004046582849696279, + "epoch": 0.07925003497971177, + "grad_norm": 0.11278238147497177, + "kl": 0.0262451171875, + "learning_rate": 6.7959915437879805e-06, + "loss": 0.0015, + "step": 2124 + }, + { + "clip_ratio": 0.002196963643655181, + "completion_length": 630.160758972168, + "epoch": 0.07928734667226342, + "grad_norm": 0.05978940427303314, + "kl": 0.025970458984375, + "learning_rate": 6.793199892931727e-06, + "loss": -0.0054, + "num_tokens": 50273427.0, + "reward": 0.7638205178081989, + "reward_std": 0.05095734295900911, + "rewards/code_reward": 0.6138205269817263, + "rewards/format_reward": 1.5, + "step": 2125 + }, + { + "clip_ratio": 0.0020272666588425636, + "epoch": 0.07932465836481507, + "grad_norm": 0.05684466287493706, + "kl": 0.026123046875, + "learning_rate": 6.790407699472821e-06, + "loss": -0.0058, + "step": 2126 + }, + { + "clip_ratio": 0.00209851935505867, + "epoch": 0.07936197005736673, + "grad_norm": 0.05178897827863693, + "kl": 0.024932861328125, + "learning_rate": 6.787614964582815e-06, + "loss": -0.0059, + "step": 2127 + }, + { + "clip_ratio": 0.0025215059285983443, + "completion_length": 717.0893173217773, + "epoch": 0.07939928174991838, + "grad_norm": 0.0928526371717453, + "kl": 0.022613525390625, + "learning_rate": 6.784821689433485e-06, + "loss": 0.0593, + "num_tokens": 50342830.0, + "reward": 0.9320520311594009, + "reward_std": 0.11163684260100126, + "rewards/code_reward": 0.7874091416597366, + "rewards/format_reward": 1.4464285969734192, + "step": 2128 + }, + { + "clip_ratio": 0.0023552889470010996, + "epoch": 0.07943659344247003, + "grad_norm": 0.07227384299039841, + "kl": 0.0224609375, + "learning_rate": 6.782027875196844e-06, + "loss": 0.0593, + "step": 2129 + }, + { + "clip_ratio": 0.002839389839209616, + "epoch": 0.07947390513502169, + "grad_norm": 0.06689515709877014, + "kl": 0.02239990234375, + "learning_rate": 6.779233523045119e-06, + "loss": 0.059, + "step": 2130 + }, + { + "clip_ratio": 0.005105825490318239, + "completion_length": 676.1071701049805, + "epoch": 0.07951121682757334, + "grad_norm": 0.04494389519095421, + "kl": 0.0340576171875, + "learning_rate": 6.77643863415077e-06, + "loss": 0.0545, + "num_tokens": 50420956.0, + "reward": 0.4062500335276127, + "reward_std": 0.04342994373291731, + "rewards/code_reward": 0.2589285718277097, + "rewards/format_reward": 1.4732142984867096, + "step": 2131 + }, + { + "clip_ratio": 0.004840543551836163, + "epoch": 0.07954852852012499, + "grad_norm": 0.04259540140628815, + "kl": 0.03338623046875, + "learning_rate": 6.7736432096864814e-06, + "loss": 0.0542, + "step": 2132 + }, + { + "clip_ratio": 0.005274631432257593, + "epoch": 0.07958584021267665, + "grad_norm": 0.03890315070748329, + "kl": 0.03240966796875, + "learning_rate": 6.770847250825161e-06, + "loss": 0.0543, + "step": 2133 + }, + { + "clip_ratio": 0.003808980924077332, + "completion_length": 635.0893096923828, + "epoch": 0.0796231519052283, + "grad_norm": 0.08888822048902512, + "kl": 0.0234527587890625, + "learning_rate": 6.768050758739942e-06, + "loss": -0.0161, + "num_tokens": 50487195.0, + "reward": 0.27601150050759315, + "reward_std": 0.22279789671301842, + "rewards/code_reward": 0.12601149454712868, + "rewards/format_reward": 1.5, + "step": 2134 + }, + { + "clip_ratio": 0.003666449279990047, + "epoch": 0.07966046359777995, + "grad_norm": 0.08648344874382019, + "kl": 0.02276611328125, + "learning_rate": 6.765253734604175e-06, + "loss": -0.0164, + "step": 2135 + }, + { + "clip_ratio": 0.0036212377599440515, + "epoch": 0.0796977752903316, + "grad_norm": 0.07155846804380417, + "kl": 0.022735595703125, + "learning_rate": 6.762456179591448e-06, + "loss": -0.0167, + "step": 2136 + }, + { + "clip_ratio": 0.003114953520707786, + "completion_length": 710.3214569091797, + "epoch": 0.07973508698288326, + "grad_norm": 0.08020918071269989, + "kl": 0.0261993408203125, + "learning_rate": 6.759658094875557e-06, + "loss": -0.024, + "num_tokens": 50563345.0, + "reward": 0.5671873092651367, + "reward_std": 0.20243405923247337, + "rewards/code_reward": 0.4198658801615238, + "rewards/format_reward": 1.4732142984867096, + "step": 2137 + }, + { + "clip_ratio": 0.0031962257344275713, + "epoch": 0.07977239867543491, + "grad_norm": 0.0746135339140892, + "kl": 0.0255889892578125, + "learning_rate": 6.75685948163053e-06, + "loss": -0.024, + "step": 2138 + }, + { + "clip_ratio": 0.0028188092983327806, + "epoch": 0.07980971036798656, + "grad_norm": 0.07168792933225632, + "kl": 0.026123046875, + "learning_rate": 6.75406034103061e-06, + "loss": -0.0246, + "step": 2139 + }, + { + "clip_ratio": 0.0038631450152024627, + "completion_length": 826.6786041259766, + "epoch": 0.07984702206053822, + "grad_norm": 0.07906301319599152, + "kl": 0.02630615234375, + "learning_rate": 6.751260674250266e-06, + "loss": 0.0125, + "num_tokens": 50646529.0, + "reward": 0.716428779065609, + "reward_std": 0.27192900888621807, + "rewards/code_reward": 0.5717859081923962, + "rewards/format_reward": 1.4464285969734192, + "step": 2140 + }, + { + "clip_ratio": 0.0034543413785286248, + "epoch": 0.07988433375308987, + "grad_norm": 0.07660606503486633, + "kl": 0.026824951171875, + "learning_rate": 6.748460482464186e-06, + "loss": 0.0122, + "step": 2141 + }, + { + "clip_ratio": 0.0033054464729502797, + "epoch": 0.07992164544564152, + "grad_norm": 0.0736742690205574, + "kl": 0.026947021484375, + "learning_rate": 6.745659766847279e-06, + "loss": 0.0118, + "step": 2142 + }, + { + "clip_ratio": 0.003960239235311747, + "completion_length": 638.303596496582, + "epoch": 0.07995895713819318, + "grad_norm": 0.048017892986536026, + "kl": 0.0242156982421875, + "learning_rate": 6.742858528574672e-06, + "loss": -0.0005, + "num_tokens": 50712108.0, + "reward": 0.46475300192832947, + "reward_std": 0.034757114946842194, + "rewards/code_reward": 0.3147530034184456, + "rewards/format_reward": 1.5, + "step": 2143 + }, + { + "clip_ratio": 0.003458852763287723, + "epoch": 0.07999626883074483, + "grad_norm": 0.04585712030529976, + "kl": 0.0239105224609375, + "learning_rate": 6.740056768821713e-06, + "loss": -0.0006, + "step": 2144 + }, + { + "clip_ratio": 0.0036826353752985597, + "epoch": 0.08003358052329648, + "grad_norm": 0.04290793091058731, + "kl": 0.024169921875, + "learning_rate": 6.7372544887639694e-06, + "loss": -0.0007, + "step": 2145 + }, + { + "clip_ratio": 0.004011576296761632, + "completion_length": 505.0000305175781, + "epoch": 0.08007089221584814, + "grad_norm": 0.08214171975851059, + "kl": 0.029693603515625, + "learning_rate": 6.734451689577224e-06, + "loss": 0.0083, + "num_tokens": 50761668.0, + "reward": 0.6824580021202564, + "reward_std": 0.1777501404285431, + "rewards/code_reward": 0.532457984983921, + "rewards/format_reward": 1.5, + "step": 2146 + }, + { + "clip_ratio": 0.003644414711743593, + "epoch": 0.08010820390839979, + "grad_norm": 0.07535674422979355, + "kl": 0.03045654296875, + "learning_rate": 6.731648372437481e-06, + "loss": 0.0079, + "step": 2147 + }, + { + "clip_ratio": 0.0032101604738272727, + "epoch": 0.08014551560095144, + "grad_norm": 0.06664730608463287, + "kl": 0.0306396484375, + "learning_rate": 6.72884453852096e-06, + "loss": 0.0076, + "step": 2148 + }, + { + "clip_ratio": 0.0024147850926965475, + "completion_length": 809.6607360839844, + "epoch": 0.0801828272935031, + "grad_norm": 0.03746306151151657, + "kl": 0.028350830078125, + "learning_rate": 6.726040189004095e-06, + "loss": -0.0105, + "num_tokens": 50845775.0, + "reward": 0.737675528973341, + "reward_std": 0.06338877975940704, + "rewards/code_reward": 0.5903540924191475, + "rewards/format_reward": 1.4732142984867096, + "step": 2149 + }, + { + "clip_ratio": 0.0026112613268196583, + "epoch": 0.08022013898605475, + "grad_norm": 0.04276233911514282, + "kl": 0.0283203125, + "learning_rate": 6.723235325063544e-06, + "loss": -0.0106, + "step": 2150 + }, + { + "clip_ratio": 0.0026650899089872837, + "epoch": 0.08025745067860642, + "grad_norm": 0.03986501321196556, + "kl": 0.028167724609375, + "learning_rate": 6.720429947876171e-06, + "loss": -0.0107, + "step": 2151 + }, + { + "clip_ratio": 0.0033625689102336764, + "completion_length": 955.232177734375, + "epoch": 0.08029476237115807, + "grad_norm": 0.04316667094826698, + "kl": 0.0266265869140625, + "learning_rate": 6.717624058619062e-06, + "loss": 0.0047, + "num_tokens": 50936410.0, + "reward": 0.48196248337626457, + "reward_std": 0.053587223403155804, + "rewards/code_reward": 0.33196248119929805, + "rewards/format_reward": 1.5, + "step": 2152 + }, + { + "clip_ratio": 0.0034670476452447474, + "epoch": 0.08033207406370972, + "grad_norm": 0.04259771853685379, + "kl": 0.02496337890625, + "learning_rate": 6.714817658469517e-06, + "loss": 0.0047, + "step": 2153 + }, + { + "clip_ratio": 0.003396535583306104, + "epoch": 0.08036938575626137, + "grad_norm": 0.03952707722783089, + "kl": 0.0260009765625, + "learning_rate": 6.712010748605049e-06, + "loss": 0.0046, + "step": 2154 + }, + { + "clip_ratio": 0.004212828062009066, + "completion_length": 654.6785888671875, + "epoch": 0.08040669744881303, + "grad_norm": 0.06491146236658096, + "kl": 0.02703857421875, + "learning_rate": 6.709203330203383e-06, + "loss": 0.0078, + "num_tokens": 51002360.0, + "reward": 0.557212870568037, + "reward_std": 0.20960865169763565, + "rewards/code_reward": 0.4072128888219595, + "rewards/format_reward": 1.5, + "step": 2155 + }, + { + "clip_ratio": 0.003969678597059101, + "epoch": 0.08044400914136468, + "grad_norm": 0.06449099630117416, + "kl": 0.0263519287109375, + "learning_rate": 6.706395404442462e-06, + "loss": 0.0078, + "step": 2156 + }, + { + "clip_ratio": 0.0034934222348965704, + "epoch": 0.08048132083391633, + "grad_norm": 0.06321100145578384, + "kl": 0.0271759033203125, + "learning_rate": 6.703586972500439e-06, + "loss": 0.0075, + "step": 2157 + }, + { + "clip_ratio": 0.003224493353627622, + "completion_length": 582.7678833007812, + "epoch": 0.08051863252646799, + "grad_norm": 0.05789591744542122, + "kl": 0.022735595703125, + "learning_rate": 6.700778035555679e-06, + "loss": 0.0017, + "num_tokens": 51063373.0, + "reward": 0.563202578574419, + "reward_std": 0.08113952912390232, + "rewards/code_reward": 0.41320257633924484, + "rewards/format_reward": 1.5, + "step": 2158 + }, + { + "clip_ratio": 0.0030624918290413916, + "epoch": 0.08055594421901964, + "grad_norm": 0.05593932420015335, + "kl": 0.022857666015625, + "learning_rate": 6.697968594786761e-06, + "loss": 0.0015, + "step": 2159 + }, + { + "clip_ratio": 0.00288010312942788, + "epoch": 0.0805932559115713, + "grad_norm": 0.055590298026800156, + "kl": 0.022308349609375, + "learning_rate": 6.695158651372474e-06, + "loss": 0.0014, + "step": 2160 + }, + { + "clip_ratio": 0.001958807755727321, + "completion_length": 512.803596496582, + "epoch": 0.08063056760412295, + "grad_norm": 0.0322042852640152, + "kl": 0.03094482421875, + "learning_rate": 6.692348206491817e-06, + "loss": -0.0006, + "num_tokens": 51117934.0, + "reward": 0.8979195468127728, + "reward_std": 0.0008814065367914736, + "rewards/code_reward": 0.7479195594787598, + "rewards/format_reward": 1.5, + "step": 2161 + }, + { + "clip_ratio": 0.002136244031134993, + "epoch": 0.0806678792966746, + "grad_norm": 0.032026100903749466, + "kl": 0.031280517578125, + "learning_rate": 6.689537261324001e-06, + "loss": -0.0006, + "step": 2162 + }, + { + "clip_ratio": 0.001614646869711578, + "epoch": 0.08070519098922625, + "grad_norm": 0.02916775457561016, + "kl": 0.027862548828125, + "learning_rate": 6.6867258170484476e-06, + "loss": -0.0007, + "step": 2163 + }, + { + "clip_ratio": 0.0030964184552431107, + "completion_length": 634.4464645385742, + "epoch": 0.0807425026817779, + "grad_norm": 0.05708720535039902, + "kl": 0.02374267578125, + "learning_rate": 6.6839138748447856e-06, + "loss": 0.0061, + "num_tokens": 51188365.0, + "reward": 0.6476289071142673, + "reward_std": 0.07801885902881622, + "rewards/code_reward": 0.49762891232967377, + "rewards/format_reward": 1.5, + "step": 2164 + }, + { + "clip_ratio": 0.0027217394672334194, + "epoch": 0.08077981437432956, + "grad_norm": 0.05789165943861008, + "kl": 0.02337646484375, + "learning_rate": 6.68110143589285e-06, + "loss": 0.006, + "step": 2165 + }, + { + "clip_ratio": 0.0025080175837501884, + "epoch": 0.08081712606688121, + "grad_norm": 0.0541006475687027, + "kl": 0.0242462158203125, + "learning_rate": 6.678288501372695e-06, + "loss": 0.0058, + "step": 2166 + }, + { + "clip_ratio": 0.003672745544463396, + "completion_length": 753.5357513427734, + "epoch": 0.08085443775943287, + "grad_norm": 0.07934238016605377, + "kl": 0.0313720703125, + "learning_rate": 6.675475072464572e-06, + "loss": -0.0023, + "num_tokens": 51271219.0, + "reward": 0.517887320369482, + "reward_std": 0.12731550633907318, + "rewards/code_reward": 0.36788731813430786, + "rewards/format_reward": 1.5, + "step": 2167 + }, + { + "clip_ratio": 0.0037955306470394135, + "epoch": 0.08089174945198452, + "grad_norm": 0.07628854364156723, + "kl": 0.0323486328125, + "learning_rate": 6.672661150348943e-06, + "loss": -0.0024, + "step": 2168 + }, + { + "clip_ratio": 0.003327726328279823, + "epoch": 0.08092906114453617, + "grad_norm": 0.07033511996269226, + "kl": 0.032196044921875, + "learning_rate": 6.669846736206479e-06, + "loss": -0.0028, + "step": 2169 + }, + { + "clip_ratio": 0.0009489679941907525, + "completion_length": 497.0535888671875, + "epoch": 0.08096637283708782, + "grad_norm": 0.03534972667694092, + "kl": 0.028656005859375, + "learning_rate": 6.667031831218054e-06, + "loss": 0.0275, + "num_tokens": 51328972.0, + "reward": 1.0580357015132904, + "reward_std": 0.12837812304496765, + "rewards/code_reward": 0.9107142835855484, + "rewards/format_reward": 1.4732142984867096, + "step": 2170 + }, + { + "clip_ratio": 0.0008442089892923832, + "epoch": 0.08100368452963948, + "grad_norm": 0.03480515629053116, + "kl": 0.0283203125, + "learning_rate": 6.664216436564751e-06, + "loss": 0.0275, + "step": 2171 + }, + { + "clip_ratio": 0.000737890659365803, + "epoch": 0.08104099622219113, + "grad_norm": 0.03517135977745056, + "kl": 0.029571533203125, + "learning_rate": 6.6614005534278584e-06, + "loss": 0.0274, + "step": 2172 + }, + { + "clip_ratio": 0.004568680538795888, + "completion_length": 599.1071624755859, + "epoch": 0.08107830791474278, + "grad_norm": 0.08067076653242111, + "kl": 0.027008056640625, + "learning_rate": 6.658584182988867e-06, + "loss": -0.0065, + "num_tokens": 51398642.0, + "reward": 0.5118575803935528, + "reward_std": 0.2469283491373062, + "rewards/code_reward": 0.3618575781583786, + "rewards/format_reward": 1.5, + "step": 2173 + }, + { + "clip_ratio": 0.0038542969268746674, + "epoch": 0.08111561960729444, + "grad_norm": 0.07716774940490723, + "kl": 0.02532958984375, + "learning_rate": 6.655767326429471e-06, + "loss": -0.0069, + "step": 2174 + }, + { + "clip_ratio": 0.003591995278839022, + "epoch": 0.08115293129984609, + "grad_norm": 0.07267686724662781, + "kl": 0.024871826171875, + "learning_rate": 6.652949984931575e-06, + "loss": -0.0073, + "step": 2175 + }, + { + "clip_ratio": 0.0028908447129651904, + "completion_length": 609.8928985595703, + "epoch": 0.08119024299239774, + "grad_norm": 0.07713909447193146, + "kl": 0.0200347900390625, + "learning_rate": 6.650132159677284e-06, + "loss": 0.0197, + "num_tokens": 51458178.0, + "reward": 0.6660609170794487, + "reward_std": 0.2803139165043831, + "rewards/code_reward": 0.5183823499828577, + "rewards/format_reward": 1.4767857193946838, + "step": 2176 + }, + { + "clip_ratio": 0.0031419568695127964, + "epoch": 0.0812275546849494, + "grad_norm": 0.10891403257846832, + "kl": 0.0202178955078125, + "learning_rate": 6.647313851848903e-06, + "loss": 0.0196, + "step": 2177 + }, + { + "clip_ratio": 0.002877349907066673, + "epoch": 0.08126486637750105, + "grad_norm": 0.06930629909038544, + "kl": 0.02001953125, + "learning_rate": 6.644495062628942e-06, + "loss": 0.0192, + "step": 2178 + }, + { + "clip_ratio": 0.004333524731919169, + "completion_length": 726.357177734375, + "epoch": 0.0813021780700527, + "grad_norm": 0.08792535215616226, + "kl": 0.02752685546875, + "learning_rate": 6.641675793200111e-06, + "loss": -0.0443, + "num_tokens": 51536276.0, + "reward": 0.43945107981562614, + "reward_std": 0.16616259701550007, + "rewards/code_reward": 0.2921296269632876, + "rewards/format_reward": 1.4732142984867096, + "step": 2179 + }, + { + "clip_ratio": 0.004572688834741712, + "epoch": 0.08133948976260436, + "grad_norm": 0.08915039151906967, + "kl": 0.03021240234375, + "learning_rate": 6.638856044745323e-06, + "loss": -0.0443, + "step": 2180 + }, + { + "clip_ratio": 0.00452611839864403, + "epoch": 0.08137680145515601, + "grad_norm": 0.07629755139350891, + "kl": 0.02789306640625, + "learning_rate": 6.636035818447697e-06, + "loss": -0.045, + "step": 2181 + }, + { + "clip_ratio": 0.0041999733075499535, + "completion_length": 744.9286041259766, + "epoch": 0.08141411314770766, + "grad_norm": 0.07756790518760681, + "kl": 0.01861572265625, + "learning_rate": 6.6332151154905415e-06, + "loss": 0.0068, + "num_tokens": 51602128.0, + "reward": 0.6066712439060211, + "reward_std": 0.14456716924905777, + "rewards/code_reward": 0.45667121745646, + "rewards/format_reward": 1.5, + "step": 2182 + }, + { + "clip_ratio": 0.0036358493962325156, + "epoch": 0.08145142484025931, + "grad_norm": 0.07214932888746262, + "kl": 0.0180816650390625, + "learning_rate": 6.630393937057374e-06, + "loss": 0.0064, + "step": 2183 + }, + { + "clip_ratio": 0.003603067569201812, + "epoch": 0.08148873653281097, + "grad_norm": 0.07327113300561905, + "kl": 0.0192718505859375, + "learning_rate": 6.627572284331909e-06, + "loss": 0.006, + "step": 2184 + }, + { + "clip_ratio": 0.004057125363033265, + "completion_length": 551.0357437133789, + "epoch": 0.08152604822536262, + "grad_norm": 0.0790264904499054, + "kl": 0.020904541015625, + "learning_rate": 6.62475015849806e-06, + "loss": -0.0005, + "num_tokens": 51660244.0, + "reward": 0.47917912155389786, + "reward_std": 0.21566829085350037, + "rewards/code_reward": 0.3291791193187237, + "rewards/format_reward": 1.5, + "step": 2185 + }, + { + "clip_ratio": 0.0035126348375342786, + "epoch": 0.08156335991791427, + "grad_norm": 0.07680415362119675, + "kl": 0.020355224609375, + "learning_rate": 6.621927560739936e-06, + "loss": -0.0009, + "step": 2186 + }, + { + "clip_ratio": 0.0036948248744010925, + "epoch": 0.08160067161046593, + "grad_norm": 0.07443638145923615, + "kl": 0.020965576171875, + "learning_rate": 6.619104492241848e-06, + "loss": -0.0012, + "step": 2187 + }, + { + "clip_ratio": 0.0037198474747128785, + "completion_length": 605.9821701049805, + "epoch": 0.08163798330301758, + "grad_norm": 0.083676278591156, + "kl": 0.023956298828125, + "learning_rate": 6.616280954188302e-06, + "loss": 0.0079, + "num_tokens": 51732441.0, + "reward": 0.6505151055753231, + "reward_std": 0.13221744447946548, + "rewards/code_reward": 0.5005151173099875, + "rewards/format_reward": 1.5, + "step": 2188 + }, + { + "clip_ratio": 0.0033747180132195354, + "epoch": 0.08167529499556923, + "grad_norm": 0.07345272600650787, + "kl": 0.023681640625, + "learning_rate": 6.613456947764004e-06, + "loss": 0.0078, + "step": 2189 + }, + { + "clip_ratio": 0.0034223473630845547, + "epoch": 0.08171260668812089, + "grad_norm": 0.06484512984752655, + "kl": 0.024139404296875, + "learning_rate": 6.610632474153854e-06, + "loss": 0.0073, + "step": 2190 + }, + { + "clip_ratio": 0.004493479733355343, + "completion_length": 852.107177734375, + "epoch": 0.08174991838067254, + "grad_norm": 0.057672806084156036, + "kl": 0.023223876953125, + "learning_rate": 6.607807534542949e-06, + "loss": 0.0185, + "num_tokens": 51820129.0, + "reward": 0.3086484894156456, + "reward_std": 0.12595941498875618, + "rewards/code_reward": 0.16132702678442, + "rewards/format_reward": 1.4732142984867096, + "step": 2191 + }, + { + "clip_ratio": 0.00511956715490669, + "epoch": 0.08178723007322419, + "grad_norm": 0.09616554528474808, + "kl": 0.0230712890625, + "learning_rate": 6.604982130116581e-06, + "loss": 0.0186, + "step": 2192 + }, + { + "clip_ratio": 0.00501980260014534, + "epoch": 0.08182454176577585, + "grad_norm": 0.052537184208631516, + "kl": 0.023406982421875, + "learning_rate": 6.602156262060235e-06, + "loss": 0.0184, + "step": 2193 + }, + { + "clip_ratio": 0.003804087929893285, + "completion_length": 581.232177734375, + "epoch": 0.0818618534583275, + "grad_norm": 0.0739353597164154, + "kl": 0.032806396484375, + "learning_rate": 6.599329931559596e-06, + "loss": 0.0003, + "num_tokens": 51882036.0, + "reward": 0.5267467126250267, + "reward_std": 0.11129532009363174, + "rewards/code_reward": 0.37674668431282043, + "rewards/format_reward": 1.5, + "step": 2194 + }, + { + "clip_ratio": 0.003830950357951224, + "epoch": 0.08189916515087915, + "grad_norm": 0.07211441546678543, + "kl": 0.033203125, + "learning_rate": 6.596503139800538e-06, + "loss": 0.0001, + "step": 2195 + }, + { + "clip_ratio": 0.004416764713823795, + "epoch": 0.0819364768434308, + "grad_norm": 0.06488452106714249, + "kl": 0.028594970703125, + "learning_rate": 6.593675887969131e-06, + "loss": -0.0001, + "step": 2196 + }, + { + "clip_ratio": 0.002299406100064516, + "completion_length": 687.3393173217773, + "epoch": 0.08197378853598246, + "grad_norm": 0.0370863676071167, + "kl": 0.02362060546875, + "learning_rate": 6.590848177251638e-06, + "loss": -0.0269, + "num_tokens": 51957823.0, + "reward": 0.6911219619214535, + "reward_std": 0.08317086100578308, + "rewards/code_reward": 0.5438005402684212, + "rewards/format_reward": 1.4732142984867096, + "step": 2197 + }, + { + "clip_ratio": 0.002468607504852116, + "epoch": 0.08201110022853411, + "grad_norm": 0.03638512268662453, + "kl": 0.021881103515625, + "learning_rate": 6.5880200088345104e-06, + "loss": -0.027, + "step": 2198 + }, + { + "clip_ratio": 0.002076158532872796, + "epoch": 0.08204841192108576, + "grad_norm": 0.035626281052827835, + "kl": 0.021453857421875, + "learning_rate": 6.585191383904399e-06, + "loss": -0.0271, + "step": 2199 + }, + { + "clip_ratio": 0.0031015227432362735, + "completion_length": 732.6428680419922, + "epoch": 0.08208572361363742, + "grad_norm": 0.08496934175491333, + "kl": 0.0317535400390625, + "learning_rate": 6.582362303648142e-06, + "loss": 0.0049, + "num_tokens": 52033949.0, + "reward": 0.6884044297039509, + "reward_std": 0.12517081201076508, + "rewards/code_reward": 0.5384044144302607, + "rewards/format_reward": 1.5, + "step": 2200 + }, + { + "clip_ratio": 0.002850778109859675, + "epoch": 0.08212303530618907, + "grad_norm": 0.07058586180210114, + "kl": 0.0314483642578125, + "learning_rate": 6.5795327692527665e-06, + "loss": 0.0049, + "step": 2201 + }, + { + "clip_ratio": 0.0024289024877361953, + "epoch": 0.08216034699874072, + "grad_norm": 0.0752149447798729, + "kl": 0.031494140625, + "learning_rate": 6.576702781905495e-06, + "loss": 0.0045, + "step": 2202 + }, + { + "clip_ratio": 0.0030157205183058977, + "completion_length": 640.0357360839844, + "epoch": 0.08219765869129238, + "grad_norm": 0.04582086205482483, + "kl": 0.02197265625, + "learning_rate": 6.573872342793736e-06, + "loss": 0.0515, + "num_tokens": 52095411.0, + "reward": 0.6977801658213139, + "reward_std": 0.10195048525929451, + "rewards/code_reward": 0.5504587180912495, + "rewards/format_reward": 1.4732142984867096, + "step": 2203 + }, + { + "clip_ratio": 0.0030885706073604524, + "epoch": 0.08223497038384403, + "grad_norm": 0.04419352486729622, + "kl": 0.021331787109375, + "learning_rate": 6.57104145310509e-06, + "loss": 0.0516, + "step": 2204 + }, + { + "clip_ratio": 0.0026401213835924864, + "epoch": 0.08227228207639568, + "grad_norm": 0.04373009875416756, + "kl": 0.02093505859375, + "learning_rate": 6.568210114027346e-06, + "loss": 0.0514, + "step": 2205 + }, + { + "clip_ratio": 0.0034631043672561646, + "completion_length": 700.6250305175781, + "epoch": 0.08230959376894735, + "grad_norm": 0.05155074968934059, + "kl": 0.020233154296875, + "learning_rate": 6.565378326748479e-06, + "loss": 0.0458, + "num_tokens": 52171130.0, + "reward": 0.670504380017519, + "reward_std": 0.049230000004172325, + "rewards/code_reward": 0.5231829583644867, + "rewards/format_reward": 1.4732142984867096, + "step": 2206 + }, + { + "clip_ratio": 0.0031321484711952507, + "epoch": 0.082346905461499, + "grad_norm": 0.04999002814292908, + "kl": 0.0204620361328125, + "learning_rate": 6.562546092456658e-06, + "loss": 0.0456, + "step": 2207 + }, + { + "clip_ratio": 0.0028858596924692392, + "epoch": 0.08238421715405066, + "grad_norm": 0.06131253391504288, + "kl": 0.0202484130859375, + "learning_rate": 6.559713412340233e-06, + "loss": 0.0455, + "step": 2208 + }, + { + "clip_ratio": 0.004987267137039453, + "completion_length": 653.0000457763672, + "epoch": 0.08242152884660231, + "grad_norm": 0.10413622856140137, + "kl": 0.026611328125, + "learning_rate": 6.556880287587746e-06, + "loss": -0.0071, + "num_tokens": 52244728.0, + "reward": 0.2919052839279175, + "reward_std": 0.24763468652963638, + "rewards/code_reward": 0.14190527237951756, + "rewards/format_reward": 1.5, + "step": 2209 + }, + { + "clip_ratio": 0.004926198860630393, + "epoch": 0.08245884053915396, + "grad_norm": 0.0723404735326767, + "kl": 0.0255126953125, + "learning_rate": 6.55404671938792e-06, + "loss": -0.0073, + "step": 2210 + }, + { + "clip_ratio": 0.004720722965430468, + "epoch": 0.08249615223170562, + "grad_norm": 0.07933097332715988, + "kl": 0.0265350341796875, + "learning_rate": 6.551212708929672e-06, + "loss": -0.0077, + "step": 2211 + }, + { + "clip_ratio": 0.004501481889747083, + "completion_length": 865.9286041259766, + "epoch": 0.08253346392425727, + "grad_norm": 0.07281764596700668, + "kl": 0.0252685546875, + "learning_rate": 6.5483782574020985e-06, + "loss": 0.0054, + "num_tokens": 52330124.0, + "reward": 0.34504806250333786, + "reward_std": 0.16130083706229925, + "rewards/code_reward": 0.1950480416417122, + "rewards/format_reward": 1.5, + "step": 2212 + }, + { + "clip_ratio": 0.005099546164274216, + "epoch": 0.08257077561680892, + "grad_norm": 0.0714949369430542, + "kl": 0.02496337890625, + "learning_rate": 6.545543365994481e-06, + "loss": 0.0053, + "step": 2213 + }, + { + "clip_ratio": 0.004181672818958759, + "epoch": 0.08260808730936058, + "grad_norm": 0.06786421686410904, + "kl": 0.02520751953125, + "learning_rate": 6.542708035896291e-06, + "loss": 0.0048, + "step": 2214 + }, + { + "clip_ratio": 0.0030802389374002814, + "completion_length": 540.7857284545898, + "epoch": 0.08264539900191223, + "grad_norm": 0.04009777680039406, + "kl": 0.0198516845703125, + "learning_rate": 6.539872268297176e-06, + "loss": 0.0011, + "num_tokens": 52387218.0, + "reward": 0.5589285865426064, + "reward_std": 0.033407650887966156, + "rewards/code_reward": 0.40892857778817415, + "rewards/format_reward": 1.5, + "step": 2215 + }, + { + "clip_ratio": 0.0037689190357923508, + "epoch": 0.08268271069446388, + "grad_norm": 0.03800993412733078, + "kl": 0.0194854736328125, + "learning_rate": 6.537036064386974e-06, + "loss": 0.0011, + "step": 2216 + }, + { + "clip_ratio": 0.0034673468326218426, + "epoch": 0.08272002238701553, + "grad_norm": 0.03522901609539986, + "kl": 0.0195159912109375, + "learning_rate": 6.534199425355703e-06, + "loss": 0.0009, + "step": 2217 + }, + { + "clip_ratio": 0.00314640678698197, + "completion_length": 723.1785888671875, + "epoch": 0.08275733407956719, + "grad_norm": 0.053958434611558914, + "kl": 0.0171356201171875, + "learning_rate": 6.531362352393564e-06, + "loss": -0.0039, + "num_tokens": 52463150.0, + "reward": 0.39856790006160736, + "reward_std": 0.006841843947768211, + "rewards/code_reward": 0.25035360679612495, + "rewards/format_reward": 1.4821428656578064, + "step": 2218 + }, + { + "clip_ratio": 0.0031009784433990717, + "epoch": 0.08279464577211884, + "grad_norm": 0.022059237584471703, + "kl": 0.0167694091796875, + "learning_rate": 6.52852484669094e-06, + "loss": -0.004, + "step": 2219 + }, + { + "clip_ratio": 0.0031630536541342735, + "epoch": 0.0828319574646705, + "grad_norm": 0.020746132358908653, + "kl": 0.0171051025390625, + "learning_rate": 6.525686909438397e-06, + "loss": -0.004, + "step": 2220 + }, + { + "clip_ratio": 0.004407446656841785, + "completion_length": 676.4643249511719, + "epoch": 0.08286926915722215, + "grad_norm": 0.06319835036993027, + "kl": 0.02392578125, + "learning_rate": 6.5228485418266765e-06, + "loss": 0.045, + "num_tokens": 52539812.0, + "reward": 0.3233390301465988, + "reward_std": 0.2380820047110319, + "rewards/code_reward": 0.1760175988310948, + "rewards/format_reward": 1.4732142984867096, + "step": 2221 + }, + { + "clip_ratio": 0.004583072441164404, + "epoch": 0.0829065808497738, + "grad_norm": 0.06278514862060547, + "kl": 0.0233001708984375, + "learning_rate": 6.52000974504671e-06, + "loss": 0.0449, + "step": 2222 + }, + { + "clip_ratio": 0.004076450830325484, + "epoch": 0.08294389254232545, + "grad_norm": 0.07398798316717148, + "kl": 0.0233612060546875, + "learning_rate": 6.517170520289603e-06, + "loss": 0.0447, + "step": 2223 + }, + { + "clip_ratio": 0.005241179722361267, + "completion_length": 685.4643249511719, + "epoch": 0.0829812042348771, + "grad_norm": 0.04076528549194336, + "kl": 0.01849365234375, + "learning_rate": 6.514330868746638e-06, + "loss": 0.0059, + "num_tokens": 52606548.0, + "reward": 0.15013738349080086, + "reward_std": 0.0005139642744325101, + "rewards/code_reward": 0.00013736264372710139, + "rewards/format_reward": 1.5, + "step": 2224 + }, + { + "clip_ratio": 0.005667906836606562, + "epoch": 0.08301851592742876, + "grad_norm": 0.03329139202833176, + "kl": 0.018341064453125, + "learning_rate": 6.511490791609283e-06, + "loss": 0.0058, + "step": 2225 + }, + { + "clip_ratio": 0.005541287362575531, + "epoch": 0.08305582761998041, + "grad_norm": 0.030292203649878502, + "kl": 0.0184326171875, + "learning_rate": 6.5086502900691805e-06, + "loss": 0.0056, + "step": 2226 + }, + { + "clip_ratio": 0.005558039993047714, + "completion_length": 773.0357513427734, + "epoch": 0.08309313931253207, + "grad_norm": 0.09352164715528488, + "kl": 0.031463623046875, + "learning_rate": 6.505809365318153e-06, + "loss": 0.0199, + "num_tokens": 52687876.0, + "reward": 0.39538973569869995, + "reward_std": 0.30014998093247414, + "rewards/code_reward": 0.24538971669971943, + "rewards/format_reward": 1.5, + "step": 2227 + }, + { + "clip_ratio": 0.005180837179068476, + "epoch": 0.08313045100508372, + "grad_norm": 0.10489870607852936, + "kl": 0.03082275390625, + "learning_rate": 6.502968018548199e-06, + "loss": 0.0196, + "step": 2228 + }, + { + "clip_ratio": 0.00532500131521374, + "epoch": 0.08316776269763537, + "grad_norm": 0.10484808683395386, + "kl": 0.031036376953125, + "learning_rate": 6.500126250951495e-06, + "loss": 0.0192, + "step": 2229 + }, + { + "clip_ratio": 0.005292442510835826, + "completion_length": 666.2143173217773, + "epoch": 0.08320507439018703, + "grad_norm": 0.06428078562021255, + "kl": 0.01910400390625, + "learning_rate": 6.497284063720395e-06, + "loss": -0.0033, + "num_tokens": 52754696.0, + "reward": 0.42795033007860184, + "reward_std": 0.1417042724788189, + "rewards/code_reward": 0.27795030549168587, + "rewards/format_reward": 1.5, + "step": 2230 + }, + { + "clip_ratio": 0.004536502878181636, + "epoch": 0.08324238608273868, + "grad_norm": 0.06018954515457153, + "kl": 0.0193634033203125, + "learning_rate": 6.494441458047426e-06, + "loss": -0.0037, + "step": 2231 + }, + { + "clip_ratio": 0.004447558138053864, + "epoch": 0.08327969777529033, + "grad_norm": 0.059155289083719254, + "kl": 0.0186614990234375, + "learning_rate": 6.491598435125293e-06, + "loss": -0.0039, + "step": 2232 + }, + { + "clip_ratio": 0.004000146174803376, + "completion_length": 651.7500305175781, + "epoch": 0.08331700946784198, + "grad_norm": 0.07228229939937592, + "kl": 0.0192718505859375, + "learning_rate": 6.4887549961468775e-06, + "loss": 0.0188, + "num_tokens": 52823576.0, + "reward": 0.6406511105597019, + "reward_std": 0.23265334963798523, + "rewards/code_reward": 0.49065109714865685, + "rewards/format_reward": 1.5, + "step": 2233 + }, + { + "clip_ratio": 0.0041050928412005305, + "epoch": 0.08335432116039364, + "grad_norm": 0.06581064313650131, + "kl": 0.0191802978515625, + "learning_rate": 6.485911142305233e-06, + "loss": 0.0187, + "step": 2234 + }, + { + "clip_ratio": 0.004119156685192138, + "epoch": 0.08339163285294529, + "grad_norm": 0.09197518974542618, + "kl": 0.018585205078125, + "learning_rate": 6.483066874793584e-06, + "loss": 0.0186, + "step": 2235 + }, + { + "clip_ratio": 0.0048054681974463165, + "completion_length": 718.1964569091797, + "epoch": 0.08342894454549694, + "grad_norm": 0.06167571246623993, + "kl": 0.0383148193359375, + "learning_rate": 6.480222194805339e-06, + "loss": 0.0077, + "num_tokens": 52898585.0, + "reward": 0.42643214762210846, + "reward_std": 0.07326561555964872, + "rewards/code_reward": 0.27643210999667645, + "rewards/format_reward": 1.5, + "step": 2236 + }, + { + "clip_ratio": 0.004536793916486204, + "epoch": 0.0834662562380486, + "grad_norm": 0.06123540922999382, + "kl": 0.03515625, + "learning_rate": 6.477377103534066e-06, + "loss": 0.0076, + "step": 2237 + }, + { + "clip_ratio": 0.004971751826815307, + "epoch": 0.08350356793060025, + "grad_norm": 0.061336319893598557, + "kl": 0.03570556640625, + "learning_rate": 6.474531602173519e-06, + "loss": 0.0075, + "step": 2238 + }, + { + "clip_ratio": 0.003241691447328776, + "completion_length": 794.3928680419922, + "epoch": 0.0835408796231519, + "grad_norm": 0.07697806507349014, + "kl": 0.0165863037109375, + "learning_rate": 6.471685691917612e-06, + "loss": -0.0014, + "num_tokens": 52976055.0, + "reward": 0.7014561221003532, + "reward_std": 0.2649077062960714, + "rewards/code_reward": 0.551456093788147, + "rewards/format_reward": 1.5, + "step": 2239 + }, + { + "clip_ratio": 0.0031722340499982238, + "epoch": 0.08357819131570356, + "grad_norm": 0.07172980904579163, + "kl": 0.01702880859375, + "learning_rate": 6.468839373960437e-06, + "loss": -0.0018, + "step": 2240 + }, + { + "clip_ratio": 0.0027705097454600036, + "epoch": 0.08361550300825521, + "grad_norm": 0.06734520941972733, + "kl": 0.0167694091796875, + "learning_rate": 6.46599264949626e-06, + "loss": -0.0021, + "step": 2241 + }, + { + "clip_ratio": 0.0036353838513605297, + "completion_length": 746.2500228881836, + "epoch": 0.08365281470080686, + "grad_norm": 0.055701397359371185, + "kl": 0.0171661376953125, + "learning_rate": 6.463145519719509e-06, + "loss": 0.0059, + "num_tokens": 53043269.0, + "reward": 0.4575045071542263, + "reward_std": 0.15477361530065536, + "rewards/code_reward": 0.30750449001789093, + "rewards/format_reward": 1.5, + "step": 2242 + }, + { + "clip_ratio": 0.0035733984550461173, + "epoch": 0.08369012639335852, + "grad_norm": 0.054816633462905884, + "kl": 0.016937255859375, + "learning_rate": 6.460297985824792e-06, + "loss": 0.0058, + "step": 2243 + }, + { + "clip_ratio": 0.003309688763692975, + "epoch": 0.08372743808591017, + "grad_norm": 0.050557997077703476, + "kl": 0.0169219970703125, + "learning_rate": 6.457450049006877e-06, + "loss": 0.0058, + "step": 2244 + }, + { + "clip_ratio": 0.004156523966230452, + "completion_length": 652.107177734375, + "epoch": 0.08376474977846182, + "grad_norm": 0.09865551441907883, + "kl": 0.0425567626953125, + "learning_rate": 6.454601710460704e-06, + "loss": 0.0527, + "num_tokens": 53110797.0, + "reward": 0.6715435199439526, + "reward_std": 0.2360462723299861, + "rewards/code_reward": 0.5242220684885979, + "rewards/format_reward": 1.4732142984867096, + "step": 2245 + }, + { + "clip_ratio": 0.004453275410924107, + "epoch": 0.08380206147101348, + "grad_norm": 0.10090573132038116, + "kl": 0.0455322265625, + "learning_rate": 6.451752971381387e-06, + "loss": 0.0526, + "step": 2246 + }, + { + "clip_ratio": 0.003926549921743572, + "epoch": 0.08383937316356513, + "grad_norm": 0.081901915371418, + "kl": 0.0426483154296875, + "learning_rate": 6.4489038329642036e-06, + "loss": 0.052, + "step": 2247 + }, + { + "clip_ratio": 0.003179488063324243, + "completion_length": 855.5714721679688, + "epoch": 0.08387668485611678, + "grad_norm": 0.08511997759342194, + "kl": 0.0192718505859375, + "learning_rate": 6.446054296404599e-06, + "loss": 0.0118, + "num_tokens": 53196607.0, + "reward": 0.646565955132246, + "reward_std": 0.13379905931651592, + "rewards/code_reward": 0.49656593054533005, + "rewards/format_reward": 1.5, + "step": 2248 + }, + { + "clip_ratio": 0.0031441141036339104, + "epoch": 0.08391399654866843, + "grad_norm": 0.07625972479581833, + "kl": 0.01953125, + "learning_rate": 6.443204362898184e-06, + "loss": 0.0115, + "step": 2249 + }, + { + "clip_ratio": 0.00325679691741243, + "epoch": 0.08395130824122009, + "grad_norm": 0.0722414180636406, + "kl": 0.0197296142578125, + "learning_rate": 6.440354033640739e-06, + "loss": 0.0114, + "step": 2250 + }, + { + "clip_ratio": 0.0038677797419950366, + "completion_length": 828.3928985595703, + "epoch": 0.08398861993377174, + "grad_norm": 0.0957849770784378, + "kl": 0.0260009765625, + "learning_rate": 6.437503309828209e-06, + "loss": -0.0081, + "num_tokens": 53272103.0, + "reward": 0.7059432230889797, + "reward_std": 0.19932842534035444, + "rewards/code_reward": 0.5559431947767735, + "rewards/format_reward": 1.5, + "step": 2251 + }, + { + "clip_ratio": 0.0036501132417470217, + "epoch": 0.0840259316263234, + "grad_norm": 0.09307233989238739, + "kl": 0.025054931640625, + "learning_rate": 6.434652192656705e-06, + "loss": -0.0086, + "step": 2252 + }, + { + "clip_ratio": 0.003740235057193786, + "epoch": 0.08406324331887505, + "grad_norm": 0.09399892389774323, + "kl": 0.02618408203125, + "learning_rate": 6.431800683322502e-06, + "loss": -0.0091, + "step": 2253 + }, + { + "clip_ratio": 0.004544740193523467, + "completion_length": 596.7500152587891, + "epoch": 0.0841005550114267, + "grad_norm": 0.06707696616649628, + "kl": 0.016510009765625, + "learning_rate": 6.428948783022042e-06, + "loss": 0.0057, + "num_tokens": 53331519.0, + "reward": 0.5431523583829403, + "reward_std": 0.16561484150588512, + "rewards/code_reward": 0.3931523412466049, + "rewards/format_reward": 1.5, + "step": 2254 + }, + { + "clip_ratio": 0.004202517447993159, + "epoch": 0.08413786670397835, + "grad_norm": 0.06568772345781326, + "kl": 0.0165557861328125, + "learning_rate": 6.426096492951927e-06, + "loss": 0.0055, + "step": 2255 + }, + { + "clip_ratio": 0.003651194681879133, + "epoch": 0.08417517839653, + "grad_norm": 0.06556608527898788, + "kl": 0.0168609619140625, + "learning_rate": 6.423243814308927e-06, + "loss": 0.0052, + "step": 2256 + }, + { + "clip_ratio": 0.002071692084427923, + "completion_length": 633.3214645385742, + "epoch": 0.08421249008908166, + "grad_norm": 0.03803621232509613, + "kl": 0.016326904296875, + "learning_rate": 6.420390748289974e-06, + "loss": -0.0055, + "num_tokens": 53394971.0, + "reward": 0.8060782961547375, + "reward_std": 0.05768048018217087, + "rewards/code_reward": 0.6560783088207245, + "rewards/format_reward": 1.5, + "step": 2257 + }, + { + "clip_ratio": 0.0016797619755379856, + "epoch": 0.08424980178163331, + "grad_norm": 0.036048125475645065, + "kl": 0.0169830322265625, + "learning_rate": 6.417537296092159e-06, + "loss": -0.0056, + "step": 2258 + }, + { + "clip_ratio": 0.0022213852498680353, + "epoch": 0.08428711347418497, + "grad_norm": 0.053664736449718475, + "kl": 0.0164031982421875, + "learning_rate": 6.414683458912736e-06, + "loss": -0.0057, + "step": 2259 + }, + { + "clip_ratio": 0.0025464819045737386, + "completion_length": 560.4107360839844, + "epoch": 0.08432442516673663, + "grad_norm": 0.11232728511095047, + "kl": 0.024993896484375, + "learning_rate": 6.411829237949126e-06, + "loss": 0.0029, + "num_tokens": 53452954.0, + "reward": 0.805652379989624, + "reward_std": 0.08933072350919247, + "rewards/code_reward": 0.6556523684412241, + "rewards/format_reward": 1.5, + "step": 2260 + }, + { + "clip_ratio": 0.0017348311375826597, + "epoch": 0.08436173685928829, + "grad_norm": 0.06570233404636383, + "kl": 0.0269775390625, + "learning_rate": 6.408974634398907e-06, + "loss": 0.0027, + "step": 2261 + }, + { + "clip_ratio": 0.0017901999526657164, + "epoch": 0.08439904855183994, + "grad_norm": 0.0632985383272171, + "kl": 0.0250244140625, + "learning_rate": 6.406119649459814e-06, + "loss": 0.0023, + "step": 2262 + }, + { + "clip_ratio": 0.0020168066257610917, + "completion_length": 486.0178756713867, + "epoch": 0.08443636024439159, + "grad_norm": 0.07132554054260254, + "kl": 0.01959228515625, + "learning_rate": 6.403264284329749e-06, + "loss": -0.0068, + "num_tokens": 53500751.0, + "reward": 1.0387209504842758, + "reward_std": 0.1111663430929184, + "rewards/code_reward": 0.8887209445238113, + "rewards/format_reward": 1.5, + "step": 2263 + }, + { + "clip_ratio": 0.0019794589607045054, + "epoch": 0.08447367193694325, + "grad_norm": 0.06824646145105362, + "kl": 0.01959228515625, + "learning_rate": 6.400408540206767e-06, + "loss": -0.0069, + "step": 2264 + }, + { + "clip_ratio": 0.0017971127526834607, + "epoch": 0.0845109836294949, + "grad_norm": 0.06264341622591019, + "kl": 0.02001953125, + "learning_rate": 6.397552418289089e-06, + "loss": -0.0072, + "step": 2265 + }, + { + "clip_ratio": 0.0037764523876830935, + "completion_length": 518.2857360839844, + "epoch": 0.08454829532204655, + "grad_norm": 0.08091452717781067, + "kl": 0.019866943359375, + "learning_rate": 6.3946959197750894e-06, + "loss": -0.0041, + "num_tokens": 53558223.0, + "reward": 0.4661899246275425, + "reward_std": 0.1333039328455925, + "rewards/code_reward": 0.31618992052972317, + "rewards/format_reward": 1.5, + "step": 2266 + }, + { + "clip_ratio": 0.003400945686735213, + "epoch": 0.0845856070145982, + "grad_norm": 0.1397024691104889, + "kl": 0.0207366943359375, + "learning_rate": 6.391839045863301e-06, + "loss": -0.0043, + "step": 2267 + }, + { + "clip_ratio": 0.0030449633486568928, + "epoch": 0.08462291870714986, + "grad_norm": 0.08420228213071823, + "kl": 0.021575927734375, + "learning_rate": 6.388981797752413e-06, + "loss": -0.0048, + "step": 2268 + }, + { + "clip_ratio": 0.003496445482596755, + "completion_length": 802.3928985595703, + "epoch": 0.08466023039970151, + "grad_norm": 0.06008485332131386, + "kl": 0.01641845703125, + "learning_rate": 6.386124176641279e-06, + "loss": 0.0097, + "num_tokens": 53642777.0, + "reward": 0.4651556983590126, + "reward_std": 0.18579971860162914, + "rewards/code_reward": 0.3151556722878013, + "rewards/format_reward": 1.5, + "step": 2269 + }, + { + "clip_ratio": 0.003263993072323501, + "epoch": 0.08469754209225316, + "grad_norm": 0.059608545154333115, + "kl": 0.01629638671875, + "learning_rate": 6.383266183728899e-06, + "loss": 0.0096, + "step": 2270 + }, + { + "clip_ratio": 0.003570459899492562, + "epoch": 0.08473485378480482, + "grad_norm": 0.05992894247174263, + "kl": 0.016510009765625, + "learning_rate": 6.380407820214437e-06, + "loss": 0.0094, + "step": 2271 + }, + { + "clip_ratio": 0.0030770832672715187, + "completion_length": 629.928596496582, + "epoch": 0.08477216547735647, + "grad_norm": 0.08031794428825378, + "kl": 0.020294189453125, + "learning_rate": 6.377549087297207e-06, + "loss": 0.0075, + "num_tokens": 53702329.0, + "reward": 0.5971243157982826, + "reward_std": 0.09858077019453049, + "rewards/code_reward": 0.447124307975173, + "rewards/format_reward": 1.5, + "step": 2272 + }, + { + "clip_ratio": 0.003378487133886665, + "epoch": 0.08480947716990812, + "grad_norm": 0.059187427163124084, + "kl": 0.0193939208984375, + "learning_rate": 6.37468998617668e-06, + "loss": 0.0074, + "step": 2273 + }, + { + "clip_ratio": 0.003007908759173006, + "epoch": 0.08484678886245978, + "grad_norm": 0.056473858654499054, + "kl": 0.0201568603515625, + "learning_rate": 6.371830518052483e-06, + "loss": 0.0072, + "step": 2274 + }, + { + "clip_ratio": 0.003539633471518755, + "completion_length": 570.5357284545898, + "epoch": 0.08488410055501143, + "grad_norm": 0.03763359785079956, + "kl": 0.018310546875, + "learning_rate": 6.368970684124397e-06, + "loss": 0.0004, + "num_tokens": 53765051.0, + "reward": 0.6808441691100597, + "reward_std": 0.11720181256532669, + "rewards/code_reward": 0.5308441668748856, + "rewards/format_reward": 1.5, + "step": 2275 + }, + { + "clip_ratio": 0.003286710474640131, + "epoch": 0.08492141224756308, + "grad_norm": 0.03603711351752281, + "kl": 0.0181427001953125, + "learning_rate": 6.366110485592353e-06, + "loss": 0.0004, + "step": 2276 + }, + { + "clip_ratio": 0.003254020819440484, + "epoch": 0.08495872394011474, + "grad_norm": 0.036349814385175705, + "kl": 0.0184478759765625, + "learning_rate": 6.363249923656436e-06, + "loss": 0.0004, + "step": 2277 + }, + { + "clip_ratio": 0.0026300065801478922, + "completion_length": 651.3750228881836, + "epoch": 0.08499603563266639, + "grad_norm": 0.08233775943517685, + "kl": 0.025787353515625, + "learning_rate": 6.360388999516888e-06, + "loss": 0.004, + "num_tokens": 53828660.0, + "reward": 0.6459276154637337, + "reward_std": 0.33597060292959213, + "rewards/code_reward": 0.49860616074874997, + "rewards/format_reward": 1.4732142984867096, + "step": 2278 + }, + { + "clip_ratio": 0.0028502450441010296, + "epoch": 0.08503334732521804, + "grad_norm": 0.07976087927818298, + "kl": 0.02520751953125, + "learning_rate": 6.357527714374097e-06, + "loss": 0.0041, + "step": 2279 + }, + { + "clip_ratio": 0.0026775739388540387, + "epoch": 0.0850706590177697, + "grad_norm": 0.07471586763858795, + "kl": 0.025848388671875, + "learning_rate": 6.354666069428606e-06, + "loss": 0.0036, + "step": 2280 + }, + { + "clip_ratio": 0.004966569831594825, + "completion_length": 712.3928833007812, + "epoch": 0.08510797071032135, + "grad_norm": 0.21759983897209167, + "kl": 0.07928466796875, + "learning_rate": 6.351804065881109e-06, + "loss": 0.0563, + "num_tokens": 53906228.0, + "reward": 0.6700701825320721, + "reward_std": 0.18352179415524006, + "rewards/code_reward": 0.5227487023948925, + "rewards/format_reward": 1.4732142984867096, + "step": 2281 + }, + { + "clip_ratio": 0.00538579688873142, + "epoch": 0.085145282402873, + "grad_norm": 0.07523051649332047, + "kl": 0.037384033203125, + "learning_rate": 6.348941704932448e-06, + "loss": 0.0557, + "step": 2282 + }, + { + "clip_ratio": 0.0053266267059370875, + "epoch": 0.08518259409542465, + "grad_norm": 0.06701017916202545, + "kl": 0.03314208984375, + "learning_rate": 6.346078987783618e-06, + "loss": 0.0553, + "step": 2283 + }, + { + "clip_ratio": 0.0021327795111574233, + "completion_length": 606.6964569091797, + "epoch": 0.08521990578797631, + "grad_norm": 0.05768049880862236, + "kl": 0.021026611328125, + "learning_rate": 6.343215915635762e-06, + "loss": -0.0176, + "num_tokens": 53961791.0, + "reward": 0.9325091540813446, + "reward_std": 0.2014266699552536, + "rewards/code_reward": 0.7825091630220413, + "rewards/format_reward": 1.5, + "step": 2284 + }, + { + "clip_ratio": 0.0020278862211853266, + "epoch": 0.08525721748052796, + "grad_norm": 0.05615384131669998, + "kl": 0.02105712890625, + "learning_rate": 6.3403524896901695e-06, + "loss": -0.0177, + "step": 2285 + }, + { + "clip_ratio": 0.0021334121702238917, + "epoch": 0.08529452917307961, + "grad_norm": 0.054643429815769196, + "kl": 0.020477294921875, + "learning_rate": 6.337488711148283e-06, + "loss": -0.018, + "step": 2286 + }, + { + "clip_ratio": 0.004527118464466184, + "completion_length": 619.1071929931641, + "epoch": 0.08533184086563127, + "grad_norm": 0.0669570118188858, + "kl": 0.021759033203125, + "learning_rate": 6.334624581211689e-06, + "loss": 0.0052, + "num_tokens": 54026743.0, + "reward": 0.47108136117458344, + "reward_std": 0.15327735245227814, + "rewards/code_reward": 0.32108134869486094, + "rewards/format_reward": 1.5, + "step": 2287 + }, + { + "clip_ratio": 0.004481503856368363, + "epoch": 0.08536915255818292, + "grad_norm": 0.06921535730361938, + "kl": 0.021392822265625, + "learning_rate": 6.331760101082125e-06, + "loss": 0.0052, + "step": 2288 + }, + { + "clip_ratio": 0.004707074665930122, + "epoch": 0.08540646425073457, + "grad_norm": 0.060752902179956436, + "kl": 0.020721435546875, + "learning_rate": 6.328895271961475e-06, + "loss": 0.0048, + "step": 2289 + }, + { + "clip_ratio": 0.003833804279565811, + "completion_length": 818.7321929931641, + "epoch": 0.08544377594328623, + "grad_norm": 0.07792864739894867, + "kl": 0.0195770263671875, + "learning_rate": 6.326030095051763e-06, + "loss": 0.0023, + "num_tokens": 54105086.0, + "reward": 0.489620428532362, + "reward_std": 0.16595003567636013, + "rewards/code_reward": 0.3396203685551882, + "rewards/format_reward": 1.5, + "step": 2290 + }, + { + "clip_ratio": 0.003968177596107125, + "epoch": 0.08548108763583788, + "grad_norm": 0.0717332661151886, + "kl": 0.01898193359375, + "learning_rate": 6.323164571555169e-06, + "loss": 0.0021, + "step": 2291 + }, + { + "clip_ratio": 0.0034688730956986547, + "epoch": 0.08551839932838953, + "grad_norm": 0.06893118470907211, + "kl": 0.0187225341796875, + "learning_rate": 6.32029870267401e-06, + "loss": 0.0016, + "step": 2292 + }, + { + "clip_ratio": 0.0038718642899766564, + "completion_length": 878.607177734375, + "epoch": 0.08555571102094119, + "grad_norm": 0.06212557107210159, + "kl": 0.0171966552734375, + "learning_rate": 6.317432489610755e-06, + "loss": 0.014, + "num_tokens": 54189288.0, + "reward": 0.2347612977027893, + "reward_std": 0.15246876887977123, + "rewards/code_reward": 0.08743985742330551, + "rewards/format_reward": 1.4732142984867096, + "step": 2293 + }, + { + "clip_ratio": 0.00443222769536078, + "epoch": 0.08559302271349284, + "grad_norm": 0.0612935870885849, + "kl": 0.01753997802734375, + "learning_rate": 6.31456593356801e-06, + "loss": 0.014, + "step": 2294 + }, + { + "clip_ratio": 0.004168917192146182, + "epoch": 0.08563033440604449, + "grad_norm": 0.05966435745358467, + "kl": 0.0179443359375, + "learning_rate": 6.311699035748531e-06, + "loss": 0.0137, + "step": 2295 + }, + { + "clip_ratio": 0.0046691371826455, + "completion_length": 749.9643249511719, + "epoch": 0.08566764609859615, + "grad_norm": 0.08783304691314697, + "kl": 0.02703857421875, + "learning_rate": 6.308831797355214e-06, + "loss": 0.0097, + "num_tokens": 54268274.0, + "reward": 0.6792330667376518, + "reward_std": 0.21793344523757696, + "rewards/code_reward": 0.5292330486699939, + "rewards/format_reward": 1.5, + "step": 2296 + }, + { + "clip_ratio": 0.003993768303189427, + "epoch": 0.0857049577911478, + "grad_norm": 0.09656884521245956, + "kl": 0.026397705078125, + "learning_rate": 6.3059642195911e-06, + "loss": 0.0094, + "step": 2297 + }, + { + "clip_ratio": 0.004186834208667278, + "epoch": 0.08574226948369945, + "grad_norm": 0.09327604621648788, + "kl": 0.026824951171875, + "learning_rate": 6.303096303659371e-06, + "loss": 0.0093, + "step": 2298 + }, + { + "clip_ratio": 0.004933837277349085, + "completion_length": 773.5536193847656, + "epoch": 0.0857795811762511, + "grad_norm": 0.05525004491209984, + "kl": 0.024322509765625, + "learning_rate": 6.30022805076335e-06, + "loss": 0.0078, + "num_tokens": 54356785.0, + "reward": 0.5317992307245731, + "reward_std": 0.12553079053759575, + "rewards/code_reward": 0.38179919123649597, + "rewards/format_reward": 1.5, + "step": 2299 + }, + { + "clip_ratio": 0.004749401123262942, + "epoch": 0.08581689286880276, + "grad_norm": 0.05443299189209938, + "kl": 0.02447509765625, + "learning_rate": 6.297359462106504e-06, + "loss": 0.0077, + "step": 2300 + }, + { + "clip_ratio": 0.004891049116849899, + "epoch": 0.08585420456135441, + "grad_norm": 0.06581161916255951, + "kl": 0.0247344970703125, + "learning_rate": 6.294490538892438e-06, + "loss": 0.0075, + "step": 2301 + }, + { + "clip_ratio": 0.0042780249495990574, + "completion_length": 620.2143096923828, + "epoch": 0.08589151625390606, + "grad_norm": 0.12110596895217896, + "kl": 0.029876708984375, + "learning_rate": 6.2916212823249015e-06, + "loss": 0.0124, + "num_tokens": 54416521.0, + "reward": 0.6002077870070934, + "reward_std": 0.19622324965894222, + "rewards/code_reward": 0.4502077726647258, + "rewards/format_reward": 1.5, + "step": 2302 + }, + { + "clip_ratio": 0.003961318463552743, + "epoch": 0.08592882794645772, + "grad_norm": 0.10678421705961227, + "kl": 0.0296630859375, + "learning_rate": 6.288751693607777e-06, + "loss": 0.0121, + "step": 2303 + }, + { + "clip_ratio": 0.0039945938042365015, + "epoch": 0.08596613963900937, + "grad_norm": 0.1432289332151413, + "kl": 0.028106689453125, + "learning_rate": 6.2858817739450974e-06, + "loss": 0.0115, + "step": 2304 + }, + { + "clip_ratio": 0.002088404609821737, + "completion_length": 653.0893173217773, + "epoch": 0.08600345133156102, + "grad_norm": 0.0394509993493557, + "kl": 0.01613616943359375, + "learning_rate": 6.283011524541022e-06, + "loss": -0.0043, + "num_tokens": 54482774.0, + "reward": 0.4301277585327625, + "reward_std": 0.03417889098636806, + "rewards/code_reward": 0.28012775955721736, + "rewards/format_reward": 1.5, + "step": 2305 + }, + { + "clip_ratio": 0.002087934728479013, + "epoch": 0.08604076302411268, + "grad_norm": 0.03709514066576958, + "kl": 0.0165557861328125, + "learning_rate": 6.2801409465998555e-06, + "loss": -0.0044, + "step": 2306 + }, + { + "clip_ratio": 0.002065499109448865, + "epoch": 0.08607807471666433, + "grad_norm": 0.03820553049445152, + "kl": 0.0166015625, + "learning_rate": 6.277270041326039e-06, + "loss": -0.0045, + "step": 2307 + }, + { + "clip_ratio": 0.003369065059814602, + "completion_length": 572.6607437133789, + "epoch": 0.08611538640921598, + "grad_norm": 0.055157847702503204, + "kl": 0.02142333984375, + "learning_rate": 6.274398809924152e-06, + "loss": 0.0004, + "num_tokens": 54538013.0, + "reward": 0.524927195161581, + "reward_std": 0.05386695172637701, + "rewards/code_reward": 0.3749271873384714, + "rewards/format_reward": 1.5, + "step": 2308 + }, + { + "clip_ratio": 0.0030791779281571507, + "epoch": 0.08615269810176764, + "grad_norm": 0.05421833321452141, + "kl": 0.020751953125, + "learning_rate": 6.271527253598909e-06, + "loss": 0.0002, + "step": 2309 + }, + { + "clip_ratio": 0.0030008181056473404, + "epoch": 0.08619000979431929, + "grad_norm": 0.0516531765460968, + "kl": 0.020599365234375, + "learning_rate": 6.268655373555161e-06, + "loss": 0.0001, + "step": 2310 + }, + { + "clip_ratio": 0.00238158245338127, + "completion_length": 758.7143249511719, + "epoch": 0.08622732148687094, + "grad_norm": 0.05300157144665718, + "kl": 0.021514892578125, + "learning_rate": 6.265783170997899e-06, + "loss": -0.0016, + "num_tokens": 54620905.0, + "reward": 0.6957826875150204, + "reward_std": 0.2374269664287567, + "rewards/code_reward": 0.545782670378685, + "rewards/format_reward": 1.5, + "step": 2311 + }, + { + "clip_ratio": 0.0023047184222377837, + "epoch": 0.0862646331794226, + "grad_norm": 0.05502832308411598, + "kl": 0.0217132568359375, + "learning_rate": 6.262910647132241e-06, + "loss": -0.0016, + "step": 2312 + }, + { + "clip_ratio": 0.002607641275972128, + "epoch": 0.08630194487197425, + "grad_norm": 0.052380893379449844, + "kl": 0.021331787109375, + "learning_rate": 6.260037803163449e-06, + "loss": -0.0017, + "step": 2313 + }, + { + "clip_ratio": 0.002653928706422448, + "completion_length": 462.32144927978516, + "epoch": 0.08633925656452592, + "grad_norm": 0.05334401875734329, + "kl": 0.0218658447265625, + "learning_rate": 6.257164640296911e-06, + "loss": 0.0006, + "num_tokens": 54673311.0, + "reward": 0.8285714201629162, + "reward_std": 0.11720181256532669, + "rewards/code_reward": 0.6785714328289032, + "rewards/format_reward": 1.5, + "step": 2314 + }, + { + "clip_ratio": 0.0023980613914318383, + "epoch": 0.08637656825707757, + "grad_norm": 0.051961660385131836, + "kl": 0.02227783203125, + "learning_rate": 6.254291159738155e-06, + "loss": 0.0003, + "step": 2315 + }, + { + "clip_ratio": 0.002532281680032611, + "epoch": 0.08641387994962922, + "grad_norm": 0.050381600856781006, + "kl": 0.02166748046875, + "learning_rate": 6.251417362692841e-06, + "loss": 0.0001, + "step": 2316 + }, + { + "clip_ratio": 0.0029815444140695035, + "completion_length": 680.1786041259766, + "epoch": 0.08645119164218087, + "grad_norm": 0.03243245184421539, + "kl": 0.0215301513671875, + "learning_rate": 6.248543250366759e-06, + "loss": 0.0024, + "num_tokens": 54740913.0, + "reward": 0.6321428641676903, + "reward_std": 0.06681530922651291, + "rewards/code_reward": 0.4821428507566452, + "rewards/format_reward": 1.5, + "step": 2317 + }, + { + "clip_ratio": 0.002997817820869386, + "epoch": 0.08648850333473253, + "grad_norm": 0.031405139714479446, + "kl": 0.0216217041015625, + "learning_rate": 6.245668823965834e-06, + "loss": 0.0024, + "step": 2318 + }, + { + "clip_ratio": 0.002715440990868956, + "epoch": 0.08652581502728418, + "grad_norm": 0.026772387325763702, + "kl": 0.0216217041015625, + "learning_rate": 6.242794084696122e-06, + "loss": 0.0023, + "step": 2319 + }, + { + "clip_ratio": 0.0037950464175082743, + "completion_length": 592.6607513427734, + "epoch": 0.08656312671983583, + "grad_norm": 0.07766923308372498, + "kl": 0.0147247314453125, + "learning_rate": 6.239919033763809e-06, + "loss": -0.0155, + "num_tokens": 54799680.0, + "reward": 0.35680893808603287, + "reward_std": 0.2794981896877289, + "rewards/code_reward": 0.20680894423276186, + "rewards/format_reward": 1.5, + "step": 2320 + }, + { + "clip_ratio": 0.0037232402828522027, + "epoch": 0.08660043841238749, + "grad_norm": 0.077971450984478, + "kl": 0.014892578125, + "learning_rate": 6.2370436723752125e-06, + "loss": -0.0155, + "step": 2321 + }, + { + "clip_ratio": 0.003716148086823523, + "epoch": 0.08663775010493914, + "grad_norm": 0.0691564530134201, + "kl": 0.0142059326171875, + "learning_rate": 6.234168001736785e-06, + "loss": -0.0159, + "step": 2322 + }, + { + "clip_ratio": 0.0024136220454238355, + "completion_length": 547.4464492797852, + "epoch": 0.0866750617974908, + "grad_norm": 0.08253864198923111, + "kl": 0.01947021484375, + "learning_rate": 6.2312920230551e-06, + "loss": 0.0071, + "num_tokens": 54861613.0, + "reward": 0.8345237970352173, + "reward_std": 0.2553676627576351, + "rewards/code_reward": 0.6845238246023655, + "rewards/format_reward": 1.5, + "step": 2323 + }, + { + "clip_ratio": 0.0033804058330133557, + "epoch": 0.08671237349004245, + "grad_norm": 0.07981792837381363, + "kl": 0.0195465087890625, + "learning_rate": 6.228415737536867e-06, + "loss": 0.0072, + "step": 2324 + }, + { + "clip_ratio": 0.002643083280418068, + "epoch": 0.0867496851825941, + "grad_norm": 0.0741792693734169, + "kl": 0.019805908203125, + "learning_rate": 6.225539146388919e-06, + "loss": 0.0066, + "step": 2325 + }, + { + "clip_ratio": 0.003328070160932839, + "completion_length": 463.5893096923828, + "epoch": 0.08678699687514575, + "grad_norm": 0.09352989494800568, + "kl": 0.03607177734375, + "learning_rate": 6.222662250818224e-06, + "loss": -0.0054, + "num_tokens": 54916456.0, + "reward": 0.6180694214999676, + "reward_std": 0.16118674259632826, + "rewards/code_reward": 0.4707479923963547, + "rewards/format_reward": 1.4732142984867096, + "step": 2326 + }, + { + "clip_ratio": 0.0031217822688631713, + "epoch": 0.0868243085676974, + "grad_norm": 0.09011625498533249, + "kl": 0.033599853515625, + "learning_rate": 6.219785052031874e-06, + "loss": -0.0058, + "step": 2327 + }, + { + "clip_ratio": 0.0028831648523919284, + "epoch": 0.08686162026024906, + "grad_norm": 0.07988005131483078, + "kl": 0.033843994140625, + "learning_rate": 6.216907551237086e-06, + "loss": -0.0061, + "step": 2328 + }, + { + "clip_ratio": 0.004882155917584896, + "completion_length": 660.7678985595703, + "epoch": 0.08689893195280071, + "grad_norm": 0.07980436831712723, + "kl": 0.0193328857421875, + "learning_rate": 6.214029749641205e-06, + "loss": 0.0322, + "num_tokens": 54987017.0, + "reward": 0.6331685595214367, + "reward_std": 0.29140833765268326, + "rewards/code_reward": 0.4831685349345207, + "rewards/format_reward": 1.5, + "step": 2329 + }, + { + "clip_ratio": 0.004735725407954305, + "epoch": 0.08693624364535237, + "grad_norm": 0.09941209852695465, + "kl": 0.019989013671875, + "learning_rate": 6.211151648451704e-06, + "loss": 0.0322, + "step": 2330 + }, + { + "clip_ratio": 0.004807521472685039, + "epoch": 0.08697355533790402, + "grad_norm": 0.07352316379547119, + "kl": 0.0198211669921875, + "learning_rate": 6.208273248876184e-06, + "loss": 0.0316, + "step": 2331 + }, + { + "clip_ratio": 0.0018659875495359302, + "completion_length": 528.1786041259766, + "epoch": 0.08701086703045567, + "grad_norm": 0.06756175309419632, + "kl": 0.015838623046875, + "learning_rate": 6.2053945521223646e-06, + "loss": -0.0024, + "num_tokens": 55041095.0, + "reward": 0.9081834703683853, + "reward_std": 0.1401127502322197, + "rewards/code_reward": 0.7581834681332111, + "rewards/format_reward": 1.5, + "step": 2332 + }, + { + "clip_ratio": 0.0018286207923665643, + "epoch": 0.08704817872300732, + "grad_norm": 0.060487303882837296, + "kl": 0.0160980224609375, + "learning_rate": 6.202515559398094e-06, + "loss": -0.0021, + "step": 2333 + }, + { + "clip_ratio": 0.001804795494535938, + "epoch": 0.08708549041555898, + "grad_norm": 0.056109316647052765, + "kl": 0.015716552734375, + "learning_rate": 6.199636271911344e-06, + "loss": -0.0026, + "step": 2334 + }, + { + "clip_ratio": 0.003949746082071215, + "completion_length": 737.0714721679688, + "epoch": 0.08712280210811063, + "grad_norm": 0.1019149124622345, + "kl": 0.02337646484375, + "learning_rate": 6.196756690870209e-06, + "loss": -0.0046, + "num_tokens": 55117791.0, + "reward": 0.6444659754633904, + "reward_std": 0.08012205781415105, + "rewards/code_reward": 0.49446598440408707, + "rewards/format_reward": 1.5, + "step": 2335 + }, + { + "clip_ratio": 0.003844713093712926, + "epoch": 0.08716011380066228, + "grad_norm": 0.09185194224119186, + "kl": 0.0233154296875, + "learning_rate": 6.1938768174829115e-06, + "loss": -0.0048, + "step": 2336 + }, + { + "clip_ratio": 0.00404626049567014, + "epoch": 0.08719742549321394, + "grad_norm": 0.08579563349485397, + "kl": 0.02313232421875, + "learning_rate": 6.190996652957789e-06, + "loss": -0.0052, + "step": 2337 + }, + { + "clip_ratio": 0.003928172751329839, + "completion_length": 635.7857513427734, + "epoch": 0.08723473718576559, + "grad_norm": 7.287168979644775, + "kl": 0.594757080078125, + "learning_rate": 6.188116198503308e-06, + "loss": 0.0057, + "num_tokens": 55188857.0, + "reward": 0.648843239992857, + "reward_std": 0.13586999475955963, + "rewards/code_reward": 0.5008075013756752, + "rewards/format_reward": 1.480357140302658, + "step": 2338 + }, + { + "clip_ratio": 0.0037518120370805264, + "epoch": 0.08727204887831724, + "grad_norm": 0.06676819920539856, + "kl": 0.03460693359375, + "learning_rate": 6.18523545532805e-06, + "loss": 0.1243, + "step": 2339 + }, + { + "clip_ratio": 0.004658019635826349, + "epoch": 0.0873093605708689, + "grad_norm": 0.0637965202331543, + "kl": 0.03460693359375, + "learning_rate": 6.182354424640726e-06, + "loss": 0.1242, + "step": 2340 + }, + { + "clip_ratio": 0.004248167562764138, + "completion_length": 701.5178833007812, + "epoch": 0.08734667226342055, + "grad_norm": 0.09053874015808105, + "kl": 0.0135040283203125, + "learning_rate": 6.17947310765016e-06, + "loss": -0.0198, + "num_tokens": 55267776.0, + "reward": 0.453571442514658, + "reward_std": 0.3550764247775078, + "rewards/code_reward": 0.3035714291036129, + "rewards/format_reward": 1.5, + "step": 2341 + }, + { + "clip_ratio": 0.00419115019030869, + "epoch": 0.0873839839559722, + "grad_norm": 0.3535628020763397, + "kl": 0.013671875, + "learning_rate": 6.1765915055653e-06, + "loss": -0.02, + "step": 2342 + }, + { + "clip_ratio": 0.003612412081565708, + "epoch": 0.08742129564852386, + "grad_norm": 0.06539521366357803, + "kl": 0.0138397216796875, + "learning_rate": 6.173709619595211e-06, + "loss": -0.0202, + "step": 2343 + }, + { + "clip_ratio": 0.0039036301895976067, + "completion_length": 699.4643096923828, + "epoch": 0.08745860734107551, + "grad_norm": 0.10528738796710968, + "kl": 0.02630615234375, + "learning_rate": 6.170827450949081e-06, + "loss": 0.0287, + "num_tokens": 55341960.0, + "reward": 0.9321933835744858, + "reward_std": 0.2657363675534725, + "rewards/code_reward": 0.7848718911409378, + "rewards/format_reward": 1.4732142984867096, + "step": 2344 + }, + { + "clip_ratio": 0.003894842986483127, + "epoch": 0.08749591903362716, + "grad_norm": 0.09773049503564835, + "kl": 0.027618408203125, + "learning_rate": 6.1679450008362126e-06, + "loss": 0.0284, + "step": 2345 + }, + { + "clip_ratio": 0.003562093188520521, + "epoch": 0.08753323072617882, + "grad_norm": 0.0834876149892807, + "kl": 0.026885986328125, + "learning_rate": 6.165062270466031e-06, + "loss": 0.0281, + "step": 2346 + }, + { + "clip_ratio": 0.003275698167271912, + "completion_length": 824.1428756713867, + "epoch": 0.08757054241873047, + "grad_norm": 0.08124657720327377, + "kl": 0.0176239013671875, + "learning_rate": 6.1621792610480735e-06, + "loss": -0.0033, + "num_tokens": 55423322.0, + "reward": 0.647243469953537, + "reward_std": 0.2769697606563568, + "rewards/code_reward": 0.4972434267401695, + "rewards/format_reward": 1.5, + "step": 2347 + }, + { + "clip_ratio": 0.002751624269876629, + "epoch": 0.08760785411128212, + "grad_norm": 0.0723409652709961, + "kl": 0.0174102783203125, + "learning_rate": 6.159295973791998e-06, + "loss": -0.0037, + "step": 2348 + }, + { + "clip_ratio": 0.0031436809804290533, + "epoch": 0.08764516580383377, + "grad_norm": 0.06727493554353714, + "kl": 0.017120361328125, + "learning_rate": 6.156412409907579e-06, + "loss": -0.0037, + "step": 2349 + }, + { + "clip_ratio": 0.004982312151696533, + "completion_length": 667.1428833007812, + "epoch": 0.08768247749638543, + "grad_norm": 0.1069624274969101, + "kl": 0.0216217041015625, + "learning_rate": 6.1535285706047075e-06, + "loss": 0.0261, + "num_tokens": 55490978.0, + "reward": 0.4233182519674301, + "reward_std": 0.08271390199661255, + "rewards/code_reward": 0.2733182404190302, + "rewards/format_reward": 1.5, + "step": 2350 + }, + { + "clip_ratio": 0.004490515333600342, + "epoch": 0.08771978918893708, + "grad_norm": 0.07848677039146423, + "kl": 0.021728515625, + "learning_rate": 6.150644457093387e-06, + "loss": 0.0257, + "step": 2351 + }, + { + "clip_ratio": 0.004710841574706137, + "epoch": 0.08775710088148873, + "grad_norm": 0.0729421004652977, + "kl": 0.02197265625, + "learning_rate": 6.147760070583736e-06, + "loss": 0.0254, + "step": 2352 + }, + { + "clip_ratio": 0.004467007762286812, + "completion_length": 583.1786041259766, + "epoch": 0.08779441257404039, + "grad_norm": 0.08511505275964737, + "kl": 0.02557373046875, + "learning_rate": 6.144875412285994e-06, + "loss": 0.0097, + "num_tokens": 55556774.0, + "reward": 0.9363562315702438, + "reward_std": 0.22713781986385584, + "rewards/code_reward": 0.7863561734557152, + "rewards/format_reward": 1.5, + "step": 2353 + }, + { + "clip_ratio": 0.0042327349074184895, + "epoch": 0.08783172426659204, + "grad_norm": 0.0846407487988472, + "kl": 0.025482177734375, + "learning_rate": 6.141990483410507e-06, + "loss": 0.0094, + "step": 2354 + }, + { + "clip_ratio": 0.0044933538301847875, + "epoch": 0.0878690359591437, + "grad_norm": 0.08561908453702927, + "kl": 0.024871826171875, + "learning_rate": 6.13910528516774e-06, + "loss": 0.0092, + "step": 2355 + }, + { + "clip_ratio": 0.002326592046301812, + "completion_length": 832.5893096923828, + "epoch": 0.08790634765169535, + "grad_norm": 0.07253477722406387, + "kl": 0.020660400390625, + "learning_rate": 6.136219818768267e-06, + "loss": 0.0492, + "num_tokens": 55641479.0, + "reward": 0.5906611159443855, + "reward_std": 0.15876769181340933, + "rewards/code_reward": 0.4406611006706953, + "rewards/format_reward": 1.5, + "step": 2356 + }, + { + "clip_ratio": 0.002103680104482919, + "epoch": 0.087943659344247, + "grad_norm": 0.06730269640684128, + "kl": 0.0202789306640625, + "learning_rate": 6.133334085422774e-06, + "loss": 0.0489, + "step": 2357 + }, + { + "clip_ratio": 0.0019385216000955552, + "epoch": 0.08798097103679865, + "grad_norm": 0.06381136178970337, + "kl": 0.018707275390625, + "learning_rate": 6.130448086342065e-06, + "loss": 0.0487, + "step": 2358 + }, + { + "clip_ratio": 0.003574384783860296, + "completion_length": 583.1964569091797, + "epoch": 0.0880182827293503, + "grad_norm": 0.07406807690858841, + "kl": 0.02813720703125, + "learning_rate": 6.127561822737049e-06, + "loss": 0.0101, + "num_tokens": 55695642.0, + "reward": 0.7345120310783386, + "reward_std": 0.1763426847755909, + "rewards/code_reward": 0.5845120269805193, + "rewards/format_reward": 1.5, + "step": 2359 + }, + { + "clip_ratio": 0.003441056120209396, + "epoch": 0.08805559442190196, + "grad_norm": 0.0715903639793396, + "kl": 0.028961181640625, + "learning_rate": 6.124675295818752e-06, + "loss": 0.01, + "step": 2360 + }, + { + "clip_ratio": 0.003129095130134374, + "epoch": 0.08809290611445361, + "grad_norm": 0.07205428928136826, + "kl": 0.0283203125, + "learning_rate": 6.121788506798301e-06, + "loss": 0.0097, + "step": 2361 + }, + { + "clip_ratio": 0.002562187728472054, + "completion_length": 518.0178756713867, + "epoch": 0.08813021780700526, + "grad_norm": 0.06232240051031113, + "kl": 0.019195556640625, + "learning_rate": 6.1189014568869445e-06, + "loss": 0.0234, + "num_tokens": 55753351.0, + "reward": 0.800813302397728, + "reward_std": 0.22720932960510254, + "rewards/code_reward": 0.6508133001625538, + "rewards/format_reward": 1.5, + "step": 2362 + }, + { + "clip_ratio": 0.0026799917686730623, + "epoch": 0.08816752949955692, + "grad_norm": 0.05949331447482109, + "kl": 0.01953125, + "learning_rate": 6.116014147296033e-06, + "loss": 0.0232, + "step": 2363 + }, + { + "clip_ratio": 0.0022341166622936726, + "epoch": 0.08820484119210857, + "grad_norm": 0.056040968745946884, + "kl": 0.019378662109375, + "learning_rate": 6.1131265792370285e-06, + "loss": 0.0231, + "step": 2364 + }, + { + "clip_ratio": 0.003551323607098311, + "completion_length": 720.0893249511719, + "epoch": 0.08824215288466022, + "grad_norm": 0.11294920742511749, + "kl": 0.0213165283203125, + "learning_rate": 6.110238753921499e-06, + "loss": 0.0027, + "num_tokens": 55830524.0, + "reward": 0.7919157594442368, + "reward_std": 0.3144466895610094, + "rewards/code_reward": 0.6419157162308693, + "rewards/format_reward": 1.5, + "step": 2365 + }, + { + "clip_ratio": 0.003662358329165727, + "epoch": 0.08827946457721188, + "grad_norm": 0.07538233697414398, + "kl": 0.0227203369140625, + "learning_rate": 6.107350672561125e-06, + "loss": 0.0026, + "step": 2366 + }, + { + "clip_ratio": 0.003339176415465772, + "epoch": 0.08831677626976353, + "grad_norm": 0.07683496177196503, + "kl": 0.0235748291015625, + "learning_rate": 6.104462336367691e-06, + "loss": 0.002, + "step": 2367 + }, + { + "clip_ratio": 0.003267127205617726, + "completion_length": 550.4464340209961, + "epoch": 0.08835408796231518, + "grad_norm": 0.051512062549591064, + "kl": 0.017608642578125, + "learning_rate": 6.101573746553089e-06, + "loss": 0.0024, + "num_tokens": 55892107.0, + "reward": 0.6142857111990452, + "reward_std": 0.2455899640917778, + "rewards/code_reward": 0.4642857238650322, + "rewards/format_reward": 1.5, + "step": 2368 + }, + { + "clip_ratio": 0.0031158701167441905, + "epoch": 0.08839139965486685, + "grad_norm": 0.04954453557729721, + "kl": 0.017547607421875, + "learning_rate": 6.098684904329317e-06, + "loss": 0.0022, + "step": 2369 + }, + { + "clip_ratio": 0.003103230206761509, + "epoch": 0.0884287113474185, + "grad_norm": 0.04625619202852249, + "kl": 0.017547607421875, + "learning_rate": 6.09579581090848e-06, + "loss": 0.0021, + "step": 2370 + }, + { + "clip_ratio": 0.0029641653527505696, + "completion_length": 640.1607513427734, + "epoch": 0.08846602303997016, + "grad_norm": 0.6416929960250854, + "kl": 0.02101898193359375, + "learning_rate": 6.092906467502787e-06, + "loss": 0.0642, + "num_tokens": 55959610.0, + "reward": 0.5901785902678967, + "reward_std": 0.2180082518607378, + "rewards/code_reward": 0.44285715371370316, + "rewards/format_reward": 1.4732142984867096, + "step": 2371 + }, + { + "clip_ratio": 0.0029348712414503098, + "epoch": 0.08850333473252181, + "grad_norm": 0.06231872737407684, + "kl": 0.01999664306640625, + "learning_rate": 6.0900168753245555e-06, + "loss": 0.0639, + "step": 2372 + }, + { + "clip_ratio": 0.0034736136440187693, + "epoch": 0.08854064642507346, + "grad_norm": 0.059792712330818176, + "kl": 0.02069854736328125, + "learning_rate": 6.087127035586203e-06, + "loss": 0.0638, + "step": 2373 + }, + { + "clip_ratio": 0.0019345525652170181, + "completion_length": 603.2857513427734, + "epoch": 0.08857795811762512, + "grad_norm": 0.03773865848779678, + "kl": 0.0157928466796875, + "learning_rate": 6.08423694950025e-06, + "loss": 0.0031, + "num_tokens": 56023900.0, + "reward": 0.8799783401191235, + "reward_std": 0.06640290468931198, + "rewards/code_reward": 0.7299783527851105, + "rewards/format_reward": 1.5, + "step": 2374 + }, + { + "clip_ratio": 0.0016837079310789704, + "epoch": 0.08861526981017677, + "grad_norm": 0.03784659877419472, + "kl": 0.0152587890625, + "learning_rate": 6.081346618279329e-06, + "loss": 0.003, + "step": 2375 + }, + { + "clip_ratio": 0.0017323361244052649, + "epoch": 0.08865258150272842, + "grad_norm": 0.03893575817346573, + "kl": 0.0151824951171875, + "learning_rate": 6.078456043136165e-06, + "loss": 0.0029, + "step": 2376 + }, + { + "clip_ratio": 0.004745252197608352, + "completion_length": 580.3571624755859, + "epoch": 0.08868989319528008, + "grad_norm": 0.08162026107311249, + "kl": 0.02020263671875, + "learning_rate": 6.0755652252835895e-06, + "loss": 0.003, + "num_tokens": 56077442.0, + "reward": 0.513191033154726, + "reward_std": 0.1444067433476448, + "rewards/code_reward": 0.36319103883579373, + "rewards/format_reward": 1.5, + "step": 2377 + }, + { + "clip_ratio": 0.004712258116342127, + "epoch": 0.08872720488783173, + "grad_norm": 0.07893233746290207, + "kl": 0.020843505859375, + "learning_rate": 6.072674165934538e-06, + "loss": 0.0029, + "step": 2378 + }, + { + "clip_ratio": 0.00457202730467543, + "epoch": 0.08876451658038338, + "grad_norm": 0.0812821313738823, + "kl": 0.020355224609375, + "learning_rate": 6.069782866302043e-06, + "loss": 0.0025, + "step": 2379 + }, + { + "clip_ratio": 0.0020165564492344856, + "completion_length": 514.3035888671875, + "epoch": 0.08880182827293504, + "grad_norm": 0.056368693709373474, + "kl": 0.0179443359375, + "learning_rate": 6.066891327599242e-06, + "loss": 0.0203, + "num_tokens": 56132745.0, + "reward": 0.6651785634458065, + "reward_std": 0.07683760579675436, + "rewards/code_reward": 0.5178571436554193, + "rewards/format_reward": 1.4732142984867096, + "step": 2380 + }, + { + "clip_ratio": 0.0024213987053371966, + "epoch": 0.08883913996548669, + "grad_norm": 0.0726659893989563, + "kl": 0.01776123046875, + "learning_rate": 6.06399955103937e-06, + "loss": 0.0204, + "step": 2381 + }, + { + "clip_ratio": 0.0020900501986034214, + "epoch": 0.08887645165803834, + "grad_norm": 0.05414893478155136, + "kl": 0.0179595947265625, + "learning_rate": 6.061107537835763e-06, + "loss": 0.0199, + "step": 2382 + }, + { + "clip_ratio": 0.005365824967157096, + "completion_length": 751.5714492797852, + "epoch": 0.08891376335059, + "grad_norm": 0.07203670591115952, + "kl": 0.02874755859375, + "learning_rate": 6.058215289201855e-06, + "loss": -0.002, + "num_tokens": 56207467.0, + "reward": 0.40733836218714714, + "reward_std": 0.2475307732820511, + "rewards/code_reward": 0.25733834877610207, + "rewards/format_reward": 1.5, + "step": 2383 + }, + { + "clip_ratio": 0.005404197843745351, + "epoch": 0.08895107504314165, + "grad_norm": 0.07203629612922668, + "kl": 0.0284423828125, + "learning_rate": 6.055322806351183e-06, + "loss": -0.002, + "step": 2384 + }, + { + "clip_ratio": 0.005033921333961189, + "epoch": 0.0889883867356933, + "grad_norm": 0.08625837415456772, + "kl": 0.027740478515625, + "learning_rate": 6.0524300904973764e-06, + "loss": -0.0023, + "step": 2385 + }, + { + "clip_ratio": 0.005084313685074449, + "completion_length": 700.1250305175781, + "epoch": 0.08902569842824495, + "grad_norm": 0.09043604135513306, + "kl": 0.01971435546875, + "learning_rate": 6.049537142854165e-06, + "loss": 0.0131, + "num_tokens": 56272056.0, + "reward": 0.6324672289192677, + "reward_std": 0.25206058099865913, + "rewards/code_reward": 0.48246720246970654, + "rewards/format_reward": 1.5, + "step": 2386 + }, + { + "clip_ratio": 0.004408718552440405, + "epoch": 0.0890630101207966, + "grad_norm": 0.08939442783594131, + "kl": 0.019439697265625, + "learning_rate": 6.046643964635377e-06, + "loss": 0.0125, + "step": 2387 + }, + { + "clip_ratio": 0.004388839588500559, + "epoch": 0.08910032181334826, + "grad_norm": 0.09343080967664719, + "kl": 0.01959228515625, + "learning_rate": 6.043750557054934e-06, + "loss": 0.0124, + "step": 2388 + }, + { + "clip_ratio": 0.005011465982533991, + "completion_length": 662.4286041259766, + "epoch": 0.08913763350589991, + "grad_norm": 0.07081765681505203, + "kl": 0.020263671875, + "learning_rate": 6.040856921326859e-06, + "loss": -0.011, + "num_tokens": 56349402.0, + "reward": 0.3651120588183403, + "reward_std": 0.1857328563928604, + "rewards/code_reward": 0.21511204540729523, + "rewards/format_reward": 1.5, + "step": 2389 + }, + { + "clip_ratio": 0.004719126911368221, + "epoch": 0.08917494519845157, + "grad_norm": 0.06886447221040726, + "kl": 0.019866943359375, + "learning_rate": 6.037963058665266e-06, + "loss": -0.0111, + "step": 2390 + }, + { + "clip_ratio": 0.0044773566769436, + "epoch": 0.08921225689100322, + "grad_norm": 0.06891751289367676, + "kl": 0.0200653076171875, + "learning_rate": 6.035068970284367e-06, + "loss": -0.0113, + "step": 2391 + }, + { + "clip_ratio": 0.004239662433974445, + "completion_length": 632.982177734375, + "epoch": 0.08924956858355487, + "grad_norm": 0.07952767610549927, + "kl": 0.016998291015625, + "learning_rate": 6.032174657398467e-06, + "loss": -0.0017, + "num_tokens": 56416643.0, + "reward": 0.6247899271547794, + "reward_std": 0.24942180514335632, + "rewards/code_reward": 0.47478991746902466, + "rewards/format_reward": 1.5, + "step": 2392 + }, + { + "clip_ratio": 0.004364089923910797, + "epoch": 0.08928688027610653, + "grad_norm": 0.07580561935901642, + "kl": 0.0169525146484375, + "learning_rate": 6.029280121221968e-06, + "loss": -0.0017, + "step": 2393 + }, + { + "clip_ratio": 0.004034843586850911, + "epoch": 0.08932419196865818, + "grad_norm": 0.07747010141611099, + "kl": 0.0169219970703125, + "learning_rate": 6.026385362969359e-06, + "loss": -0.0021, + "step": 2394 + }, + { + "clip_ratio": 0.00443776382599026, + "completion_length": 694.8036041259766, + "epoch": 0.08936150366120983, + "grad_norm": 0.0723671019077301, + "kl": 0.017669677734375, + "learning_rate": 6.023490383855231e-06, + "loss": -0.0069, + "num_tokens": 56486800.0, + "reward": 0.5287507958710194, + "reward_std": 0.18956531770527363, + "rewards/code_reward": 0.37875078059732914, + "rewards/format_reward": 1.5, + "step": 2395 + }, + { + "clip_ratio": 0.004023256537038833, + "epoch": 0.08939881535376148, + "grad_norm": 0.07269974797964096, + "kl": 0.0173797607421875, + "learning_rate": 6.02059518509426e-06, + "loss": -0.0072, + "step": 2396 + }, + { + "clip_ratio": 0.004132636182475835, + "epoch": 0.08943612704631314, + "grad_norm": 0.0694381594657898, + "kl": 0.0175628662109375, + "learning_rate": 6.017699767901222e-06, + "loss": -0.0074, + "step": 2397 + }, + { + "clip_ratio": 0.004096146789379418, + "completion_length": 697.1607437133789, + "epoch": 0.08947343873886479, + "grad_norm": 0.08040903508663177, + "kl": 0.0228271484375, + "learning_rate": 6.014804133490979e-06, + "loss": 0.0076, + "num_tokens": 56555829.0, + "reward": 0.6925632953643799, + "reward_std": 0.2621272597461939, + "rewards/code_reward": 0.5425633043050766, + "rewards/format_reward": 1.5, + "step": 2398 + }, + { + "clip_ratio": 0.0037148811388760805, + "epoch": 0.08951075043141644, + "grad_norm": 0.07912945002317429, + "kl": 0.0234375, + "learning_rate": 6.011908283078484e-06, + "loss": 0.0073, + "step": 2399 + }, + { + "clip_ratio": 0.004092624818440527, + "epoch": 0.0895480621239681, + "grad_norm": 0.07669002562761307, + "kl": 0.02349853515625, + "learning_rate": 6.00901221787878e-06, + "loss": 0.0071, + "step": 2400 + }, + { + "clip_ratio": 0.004115177434869111, + "completion_length": 767.6786041259766, + "epoch": 0.08958537381651975, + "grad_norm": 0.07285567373037338, + "kl": 0.0260467529296875, + "learning_rate": 6.006115939107004e-06, + "loss": 0.0503, + "num_tokens": 56627743.0, + "reward": 0.4938296861946583, + "reward_std": 0.1509673639666289, + "rewards/code_reward": 0.3465082524053287, + "rewards/format_reward": 1.4732142984867096, + "step": 2401 + }, + { + "clip_ratio": 0.004054382152389735, + "epoch": 0.0896226855090714, + "grad_norm": 0.06855867058038712, + "kl": 0.0251007080078125, + "learning_rate": 6.003219447978386e-06, + "loss": 0.0502, + "step": 2402 + }, + { + "clip_ratio": 0.0038356948061846197, + "epoch": 0.08965999720162306, + "grad_norm": 0.06771949678659439, + "kl": 0.02618408203125, + "learning_rate": 6.000322745708231e-06, + "loss": 0.0498, + "step": 2403 + }, + { + "clip_ratio": 0.005284042912535369, + "completion_length": 874.357177734375, + "epoch": 0.08969730889417471, + "grad_norm": 0.10887879878282547, + "kl": 0.02752685546875, + "learning_rate": 5.997425833511947e-06, + "loss": -0.0017, + "num_tokens": 56725161.0, + "reward": 0.20544876903295517, + "reward_std": 0.07491175085306168, + "rewards/code_reward": 0.055448753759264946, + "rewards/format_reward": 1.5, + "step": 2404 + }, + { + "clip_ratio": 0.004840801004320383, + "epoch": 0.08973462058672636, + "grad_norm": 0.060662832111120224, + "kl": 0.027557373046875, + "learning_rate": 5.994528712605024e-06, + "loss": -0.002, + "step": 2405 + }, + { + "clip_ratio": 0.005182246677577496, + "epoch": 0.08977193227927802, + "grad_norm": 0.07548099011182785, + "kl": 0.026397705078125, + "learning_rate": 5.99163138420304e-06, + "loss": -0.0021, + "step": 2406 + }, + { + "clip_ratio": 0.0038517649518325925, + "completion_length": 652.3393173217773, + "epoch": 0.08980924397182967, + "grad_norm": 0.07207289338111877, + "kl": 0.02001953125, + "learning_rate": 5.988733849521657e-06, + "loss": 0.0072, + "num_tokens": 56792282.0, + "reward": 0.5826583839952946, + "reward_std": 0.16385483532212675, + "rewards/code_reward": 0.432658328441903, + "rewards/format_reward": 1.5, + "step": 2407 + }, + { + "clip_ratio": 0.003556829149601981, + "epoch": 0.08984655566438132, + "grad_norm": 0.0699944719672203, + "kl": 0.020965576171875, + "learning_rate": 5.985836109776632e-06, + "loss": 0.0072, + "step": 2408 + }, + { + "clip_ratio": 0.003716158214956522, + "epoch": 0.08988386735693298, + "grad_norm": 0.06904672831296921, + "kl": 0.020538330078125, + "learning_rate": 5.982938166183797e-06, + "loss": 0.0068, + "step": 2409 + }, + { + "clip_ratio": 0.00193812619545497, + "completion_length": 730.7321701049805, + "epoch": 0.08992117904948463, + "grad_norm": 0.05816531181335449, + "kl": 0.017242431640625, + "learning_rate": 5.980040019959079e-06, + "loss": 0.0536, + "num_tokens": 56859593.0, + "reward": 0.6817401275038719, + "reward_std": 0.09804199449717999, + "rewards/code_reward": 0.5342401145026088, + "rewards/format_reward": 1.4749999940395355, + "step": 2410 + }, + { + "clip_ratio": 0.002029270981438458, + "epoch": 0.08995849074203628, + "grad_norm": 0.05795542523264885, + "kl": 0.0172119140625, + "learning_rate": 5.977141672318487e-06, + "loss": 0.0535, + "step": 2411 + }, + { + "clip_ratio": 0.0016590971499681473, + "epoch": 0.08999580243458793, + "grad_norm": 0.051196854561567307, + "kl": 0.017333984375, + "learning_rate": 5.974243124478113e-06, + "loss": 0.0531, + "step": 2412 + }, + { + "clip_ratio": 0.0029117793310433626, + "completion_length": 674.1071624755859, + "epoch": 0.09003311412713959, + "grad_norm": 0.06154574826359749, + "kl": 0.02294921875, + "learning_rate": 5.971344377654133e-06, + "loss": -0.0087, + "num_tokens": 56927051.0, + "reward": 0.7082884088158607, + "reward_std": 0.22651870548725128, + "rewards/code_reward": 0.5582884084433317, + "rewards/format_reward": 1.5, + "step": 2413 + }, + { + "clip_ratio": 0.002880409127101302, + "epoch": 0.09007042581969124, + "grad_norm": 0.06308973580598831, + "kl": 0.02264404296875, + "learning_rate": 5.968445433062806e-06, + "loss": -0.0087, + "step": 2414 + }, + { + "clip_ratio": 0.003094626939855516, + "epoch": 0.0901077375122429, + "grad_norm": 0.06085924804210663, + "kl": 0.021820068359375, + "learning_rate": 5.965546291920478e-06, + "loss": -0.009, + "step": 2415 + }, + { + "clip_ratio": 0.006130753899924457, + "completion_length": 704.7321701049805, + "epoch": 0.09014504920479455, + "grad_norm": 0.07468144595623016, + "kl": 0.03204345703125, + "learning_rate": 5.962646955443573e-06, + "loss": -0.0004, + "num_tokens": 57009412.0, + "reward": 0.5185150727629662, + "reward_std": 0.22228866815567017, + "rewards/code_reward": 0.36851503420621157, + "rewards/format_reward": 1.5, + "step": 2416 + }, + { + "clip_ratio": 0.00639288459206, + "epoch": 0.0901823608973462, + "grad_norm": 0.07103579491376877, + "kl": 0.031158447265625, + "learning_rate": 5.959747424848601e-06, + "loss": -0.0004, + "step": 2417 + }, + { + "clip_ratio": 0.006613160250708461, + "epoch": 0.09021967258989785, + "grad_norm": 0.06995949894189835, + "kl": 0.0308837890625, + "learning_rate": 5.956847701352148e-06, + "loss": -0.0011, + "step": 2418 + }, + { + "clip_ratio": 0.003015765512827784, + "completion_length": 674.0000381469727, + "epoch": 0.0902569842824495, + "grad_norm": 0.059450067579746246, + "kl": 0.0201416015625, + "learning_rate": 5.953947786170886e-06, + "loss": 0.0034, + "num_tokens": 57076794.0, + "reward": 0.7362851947546005, + "reward_std": 0.1804158166050911, + "rewards/code_reward": 0.5862852279096842, + "rewards/format_reward": 1.5, + "step": 2419 + }, + { + "clip_ratio": 0.003265434643253684, + "epoch": 0.09029429597500116, + "grad_norm": 0.062262602150440216, + "kl": 0.02032470703125, + "learning_rate": 5.951047680521565e-06, + "loss": 0.0035, + "step": 2420 + }, + { + "clip_ratio": 0.0030290308059193194, + "epoch": 0.09033160766755281, + "grad_norm": 0.07472164183855057, + "kl": 0.020050048828125, + "learning_rate": 5.948147385621018e-06, + "loss": 0.0031, + "step": 2421 + }, + { + "clip_ratio": 0.0034190656733699143, + "completion_length": 732.6428680419922, + "epoch": 0.09036891936010447, + "grad_norm": 0.060164984315633774, + "kl": 0.0196533203125, + "learning_rate": 5.9452469026861496e-06, + "loss": 0.0161, + "num_tokens": 57159626.0, + "reward": 0.4686509743332863, + "reward_std": 0.1952003873884678, + "rewards/code_reward": 0.31865095905959606, + "rewards/format_reward": 1.5, + "step": 2422 + }, + { + "clip_ratio": 0.0036599706509150565, + "epoch": 0.09040623105265613, + "grad_norm": 0.07891426980495453, + "kl": 0.020233154296875, + "learning_rate": 5.942346232933953e-06, + "loss": 0.0161, + "step": 2423 + }, + { + "clip_ratio": 0.0032311484683305025, + "epoch": 0.09044354274520779, + "grad_norm": 0.06169736385345459, + "kl": 0.019866943359375, + "learning_rate": 5.939445377581494e-06, + "loss": 0.0157, + "step": 2424 + }, + { + "clip_ratio": 0.0028565427055582404, + "completion_length": 870.3214569091797, + "epoch": 0.09048085443775944, + "grad_norm": 0.06472548097372055, + "kl": 0.0193328857421875, + "learning_rate": 5.9365443378459174e-06, + "loss": 0.0291, + "num_tokens": 57241180.0, + "reward": 0.49790993705391884, + "reward_std": 0.06384428439196199, + "rewards/code_reward": 0.3498742141528055, + "rewards/format_reward": 1.480357140302658, + "step": 2425 + }, + { + "clip_ratio": 0.002930486691184342, + "epoch": 0.09051816613031109, + "grad_norm": 0.06631328165531158, + "kl": 0.018890380859375, + "learning_rate": 5.9336431149444465e-06, + "loss": 0.029, + "step": 2426 + }, + { + "clip_ratio": 0.002818013250362128, + "epoch": 0.09055547782286275, + "grad_norm": 0.062412362545728683, + "kl": 0.0191802978515625, + "learning_rate": 5.930741710094378e-06, + "loss": 0.0288, + "step": 2427 + }, + { + "clip_ratio": 0.004171465872786939, + "completion_length": 690.803596496582, + "epoch": 0.0905927895154144, + "grad_norm": 0.08387692272663116, + "kl": 0.027130126953125, + "learning_rate": 5.92784012451309e-06, + "loss": 0.0072, + "num_tokens": 57315551.0, + "reward": 0.6829031892120838, + "reward_std": 0.13950107619166374, + "rewards/code_reward": 0.5355817470699549, + "rewards/format_reward": 1.4732142984867096, + "step": 2428 + }, + { + "clip_ratio": 0.003559093631338328, + "epoch": 0.09063010120796605, + "grad_norm": 0.07680433243513107, + "kl": 0.025299072265625, + "learning_rate": 5.9249383594180325e-06, + "loss": 0.007, + "step": 2429 + }, + { + "clip_ratio": 0.003562906349543482, + "epoch": 0.0906674129005177, + "grad_norm": 0.07574930787086487, + "kl": 0.0255584716796875, + "learning_rate": 5.922036416026734e-06, + "loss": 0.0066, + "step": 2430 + }, + { + "clip_ratio": 0.0033905653399415314, + "completion_length": 821.5714492797852, + "epoch": 0.09070472459306936, + "grad_norm": 0.07669815421104431, + "kl": 0.027862548828125, + "learning_rate": 5.919134295556794e-06, + "loss": 0.0546, + "num_tokens": 57400409.0, + "reward": 0.5509677529335022, + "reward_std": 0.3145232852548361, + "rewards/code_reward": 0.4063248746097088, + "rewards/format_reward": 1.4464285671710968, + "step": 2431 + }, + { + "clip_ratio": 0.0034155510365962982, + "epoch": 0.09074203628562101, + "grad_norm": 0.07636000961065292, + "kl": 0.0277099609375, + "learning_rate": 5.91623199922589e-06, + "loss": 0.0545, + "step": 2432 + }, + { + "clip_ratio": 0.003315607027616352, + "epoch": 0.09077934797817266, + "grad_norm": 0.0704566165804863, + "kl": 0.027740478515625, + "learning_rate": 5.91332952825177e-06, + "loss": 0.0543, + "step": 2433 + }, + { + "clip_ratio": 0.00411979213822633, + "completion_length": 748.9643096923828, + "epoch": 0.09081665967072432, + "grad_norm": 0.08215408027172089, + "kl": 0.0205535888671875, + "learning_rate": 5.910426883852258e-06, + "loss": 0.0099, + "num_tokens": 57470407.0, + "reward": 0.6609669327735901, + "reward_std": 0.3365785740315914, + "rewards/code_reward": 0.5109669268131256, + "rewards/format_reward": 1.5, + "step": 2434 + }, + { + "clip_ratio": 0.003521828737575561, + "epoch": 0.09085397136327597, + "grad_norm": 0.07827519625425339, + "kl": 0.0206451416015625, + "learning_rate": 5.9075240672452515e-06, + "loss": 0.0098, + "step": 2435 + }, + { + "clip_ratio": 0.0030083194142207503, + "epoch": 0.09089128305582762, + "grad_norm": 0.07716581225395203, + "kl": 0.0201873779296875, + "learning_rate": 5.904621079648714e-06, + "loss": 0.0094, + "step": 2436 + }, + { + "clip_ratio": 0.004692959482781589, + "completion_length": 667.5893096923828, + "epoch": 0.09092859474837928, + "grad_norm": 0.07034944742918015, + "kl": 0.02252197265625, + "learning_rate": 5.901717922280687e-06, + "loss": 0.0068, + "num_tokens": 57532180.0, + "reward": 0.5667346566915512, + "reward_std": 0.17281370679847896, + "rewards/code_reward": 0.4194132246193476, + "rewards/format_reward": 1.4732142984867096, + "step": 2437 + }, + { + "clip_ratio": 0.004199192626401782, + "epoch": 0.09096590644093093, + "grad_norm": 0.07524409890174866, + "kl": 0.022491455078125, + "learning_rate": 5.898814596359284e-06, + "loss": 0.0068, + "step": 2438 + }, + { + "clip_ratio": 0.004185911966487765, + "epoch": 0.09100321813348258, + "grad_norm": 0.06751879304647446, + "kl": 0.02239990234375, + "learning_rate": 5.895911103102685e-06, + "loss": 0.0062, + "step": 2439 + }, + { + "clip_ratio": 0.0032189186895266175, + "completion_length": 779.607177734375, + "epoch": 0.09104052982603424, + "grad_norm": 0.05162349343299866, + "kl": 0.01751708984375, + "learning_rate": 5.893007443729139e-06, + "loss": 0.0026, + "num_tokens": 57606414.0, + "reward": 0.4595238193869591, + "reward_std": 0.0759952012449503, + "rewards/code_reward": 0.309523805975914, + "rewards/format_reward": 1.5, + "step": 2440 + }, + { + "clip_ratio": 0.003889802610501647, + "epoch": 0.09107784151858589, + "grad_norm": 0.06033369526267052, + "kl": 0.017913818359375, + "learning_rate": 5.890103619456972e-06, + "loss": 0.0027, + "step": 2441 + }, + { + "clip_ratio": 0.003530879213940352, + "epoch": 0.09111515321113754, + "grad_norm": 0.045578841120004654, + "kl": 0.017669677734375, + "learning_rate": 5.88719963150457e-06, + "loss": 0.0024, + "step": 2442 + }, + { + "clip_ratio": 0.005762151617091149, + "completion_length": 717.0714569091797, + "epoch": 0.0911524649036892, + "grad_norm": 0.10003982484340668, + "kl": 0.022674560546875, + "learning_rate": 5.884295481090398e-06, + "loss": 0.002, + "num_tokens": 57673730.0, + "reward": 0.35803183913230896, + "reward_std": 0.1295857923105359, + "rewards/code_reward": 0.20803180895745754, + "rewards/format_reward": 1.5, + "step": 2443 + }, + { + "clip_ratio": 0.004863987269345671, + "epoch": 0.09118977659624085, + "grad_norm": 0.09281841665506363, + "kl": 0.023040771484375, + "learning_rate": 5.881391169432979e-06, + "loss": 0.0016, + "step": 2444 + }, + { + "clip_ratio": 0.004675672622397542, + "epoch": 0.0912270882887925, + "grad_norm": 0.08775908499956131, + "kl": 0.023101806640625, + "learning_rate": 5.87848669775091e-06, + "loss": 0.001, + "step": 2445 + }, + { + "clip_ratio": 0.0034323910367675126, + "completion_length": 534.5714645385742, + "epoch": 0.09126439998134415, + "grad_norm": 0.04444044828414917, + "kl": 0.0229034423828125, + "learning_rate": 5.875582067262855e-06, + "loss": 0.0028, + "num_tokens": 57726724.0, + "reward": 0.4948243126273155, + "reward_std": 0.04581746831536293, + "rewards/code_reward": 0.34482429921627045, + "rewards/format_reward": 1.5, + "step": 2446 + }, + { + "clip_ratio": 0.0033040529815480113, + "epoch": 0.09130171167389581, + "grad_norm": 0.04488781467080116, + "kl": 0.022247314453125, + "learning_rate": 5.872677279187539e-06, + "loss": 0.0028, + "step": 2447 + }, + { + "clip_ratio": 0.0037152108852751553, + "epoch": 0.09133902336644746, + "grad_norm": 0.04191496968269348, + "kl": 0.02239990234375, + "learning_rate": 5.869772334743761e-06, + "loss": 0.0026, + "step": 2448 + }, + { + "clip_ratio": 0.003926266159396619, + "completion_length": 686.1607513427734, + "epoch": 0.09137633505899911, + "grad_norm": 0.09730564057826996, + "kl": 0.034515380859375, + "learning_rate": 5.8668672351503775e-06, + "loss": 0.0064, + "num_tokens": 57803215.0, + "reward": 0.6412012353539467, + "reward_std": 0.3451300598680973, + "rewards/code_reward": 0.4912012182176113, + "rewards/format_reward": 1.5, + "step": 2449 + }, + { + "clip_ratio": 0.0038160697440616786, + "epoch": 0.09141364675155077, + "grad_norm": 0.09168151021003723, + "kl": 0.033538818359375, + "learning_rate": 5.863961981626321e-06, + "loss": 0.006, + "step": 2450 + }, + { + "clip_ratio": 0.0038467179983854294, + "epoch": 0.09145095844410242, + "grad_norm": 0.08659102767705917, + "kl": 0.03265380859375, + "learning_rate": 5.861056575390574e-06, + "loss": 0.0055, + "step": 2451 + }, + { + "clip_ratio": 0.004704380175098777, + "completion_length": 501.82144927978516, + "epoch": 0.09148827013665407, + "grad_norm": 0.0940679982304573, + "kl": 0.025299072265625, + "learning_rate": 5.858151017662196e-06, + "loss": 0.0083, + "num_tokens": 57855831.0, + "reward": 0.4366275630891323, + "reward_std": 0.12386870197951794, + "rewards/code_reward": 0.28662754502147436, + "rewards/format_reward": 1.5, + "step": 2452 + }, + { + "clip_ratio": 0.004822761984542012, + "epoch": 0.09152558182920573, + "grad_norm": 0.0892845094203949, + "kl": 0.025238037109375, + "learning_rate": 5.855245309660306e-06, + "loss": 0.0081, + "step": 2453 + }, + { + "clip_ratio": 0.0042871839832514524, + "epoch": 0.09156289352175738, + "grad_norm": 0.08163424581289291, + "kl": 0.02532958984375, + "learning_rate": 5.852339452604079e-06, + "loss": 0.0074, + "step": 2454 + }, + { + "clip_ratio": 0.005432485952042043, + "completion_length": 620.7500305175781, + "epoch": 0.09160020521430903, + "grad_norm": 0.07597526162862778, + "kl": 0.0233154296875, + "learning_rate": 5.849433447712766e-06, + "loss": 0.0018, + "num_tokens": 57919713.0, + "reward": 0.40956634283065796, + "reward_std": 0.1938980519771576, + "rewards/code_reward": 0.2595663480460644, + "rewards/format_reward": 1.5, + "step": 2455 + }, + { + "clip_ratio": 0.005228355992585421, + "epoch": 0.09163751690686069, + "grad_norm": 0.0728253424167633, + "kl": 0.0226593017578125, + "learning_rate": 5.846527296205667e-06, + "loss": 0.0015, + "step": 2456 + }, + { + "clip_ratio": 0.004745923331938684, + "epoch": 0.09167482859941234, + "grad_norm": 0.07066506147384644, + "kl": 0.0230560302734375, + "learning_rate": 5.8436209993021535e-06, + "loss": 0.0012, + "step": 2457 + }, + { + "clip_ratio": 0.003873375535476953, + "completion_length": 930.6607513427734, + "epoch": 0.09171214029196399, + "grad_norm": 0.12104762345552444, + "kl": 0.022491455078125, + "learning_rate": 5.840714558221649e-06, + "loss": -0.0101, + "num_tokens": 58005808.0, + "reward": 0.40752609074115753, + "reward_std": 0.28307574428617954, + "rewards/code_reward": 0.2628832384943962, + "rewards/format_reward": 1.4464285671710968, + "step": 2458 + }, + { + "clip_ratio": 0.003912592277629301, + "epoch": 0.09174945198451565, + "grad_norm": 0.08720389008522034, + "kl": 0.0218963623046875, + "learning_rate": 5.837807974183647e-06, + "loss": -0.0102, + "step": 2459 + }, + { + "clip_ratio": 0.00346011872170493, + "epoch": 0.0917867636770673, + "grad_norm": 0.08207841217517853, + "kl": 0.0216217041015625, + "learning_rate": 5.834901248407694e-06, + "loss": -0.0109, + "step": 2460 + }, + { + "clip_ratio": 0.003790146205574274, + "completion_length": 712.7143096923828, + "epoch": 0.09182407536961895, + "grad_norm": 0.08791033178567886, + "kl": 0.018798828125, + "learning_rate": 5.831994382113396e-06, + "loss": 0.0006, + "num_tokens": 58078858.0, + "reward": 0.5087256729602814, + "reward_std": 0.24472086504101753, + "rewards/code_reward": 0.358725655823946, + "rewards/format_reward": 1.5, + "step": 2461 + }, + { + "clip_ratio": 0.00371861137682572, + "epoch": 0.0918613870621706, + "grad_norm": 0.08310466259717941, + "kl": 0.020751953125, + "learning_rate": 5.829087376520424e-06, + "loss": 0.0002, + "step": 2462 + }, + { + "clip_ratio": 0.0034971291315741837, + "epoch": 0.09189869875472226, + "grad_norm": 0.08231912553310394, + "kl": 0.019561767578125, + "learning_rate": 5.826180232848501e-06, + "loss": -0.0001, + "step": 2463 + }, + { + "clip_ratio": 0.0033478899858891964, + "completion_length": 793.8928909301758, + "epoch": 0.09193601044727391, + "grad_norm": 0.0561375729739666, + "kl": 0.024078369140625, + "learning_rate": 5.823272952317414e-06, + "loss": -0.0276, + "num_tokens": 58164462.0, + "reward": 0.7105716094374657, + "reward_std": 0.17491288855671883, + "rewards/code_reward": 0.5632501728832722, + "rewards/format_reward": 1.4732142984867096, + "step": 2464 + }, + { + "clip_ratio": 0.003305978316348046, + "epoch": 0.09197332213982556, + "grad_norm": 0.055980078876018524, + "kl": 0.024383544921875, + "learning_rate": 5.820365536146998e-06, + "loss": -0.0275, + "step": 2465 + }, + { + "clip_ratio": 0.0030529905925504863, + "epoch": 0.09201063383237722, + "grad_norm": 0.06791609525680542, + "kl": 0.02447509765625, + "learning_rate": 5.8174579855571556e-06, + "loss": -0.0279, + "step": 2466 + }, + { + "clip_ratio": 0.0018725795089267194, + "completion_length": 528.178581237793, + "epoch": 0.09204794552492887, + "grad_norm": 0.073087178170681, + "kl": 0.022735595703125, + "learning_rate": 5.814550301767837e-06, + "loss": 0.0063, + "num_tokens": 58228522.0, + "reward": 0.9344245046377182, + "reward_std": 0.059521205723285675, + "rewards/code_reward": 0.7844244912266731, + "rewards/format_reward": 1.5, + "step": 2467 + }, + { + "clip_ratio": 0.0022050414700061083, + "epoch": 0.09208525721748052, + "grad_norm": 0.07329849898815155, + "kl": 0.022705078125, + "learning_rate": 5.811642485999059e-06, + "loss": 0.0064, + "step": 2468 + }, + { + "clip_ratio": 0.0020692694233730435, + "epoch": 0.09212256891003218, + "grad_norm": 0.06279691308736801, + "kl": 0.022552490234375, + "learning_rate": 5.808734539470881e-06, + "loss": 0.006, + "step": 2469 + }, + { + "clip_ratio": 0.0049384013982489705, + "completion_length": 689.7678909301758, + "epoch": 0.09215988060258383, + "grad_norm": 0.08609913289546967, + "kl": 0.0224609375, + "learning_rate": 5.8058264634034235e-06, + "loss": 0.0144, + "num_tokens": 58294603.0, + "reward": 0.5108818635344505, + "reward_std": 0.23408175818622112, + "rewards/code_reward": 0.36088184639811516, + "rewards/format_reward": 1.5, + "step": 2470 + }, + { + "clip_ratio": 0.0044400483020581305, + "epoch": 0.09219719229513548, + "grad_norm": 0.09951025247573853, + "kl": 0.022552490234375, + "learning_rate": 5.802918259016865e-06, + "loss": 0.0142, + "step": 2471 + }, + { + "clip_ratio": 0.004515219770837575, + "epoch": 0.09223450398768714, + "grad_norm": 0.08188478648662567, + "kl": 0.0225830078125, + "learning_rate": 5.800009927531429e-06, + "loss": 0.0138, + "step": 2472 + }, + { + "clip_ratio": 0.003953386418288574, + "completion_length": 723.2857513427734, + "epoch": 0.09227181568023879, + "grad_norm": 0.07968772202730179, + "kl": 0.0207977294921875, + "learning_rate": 5.797101470167402e-06, + "loss": 0.0116, + "num_tokens": 58375217.0, + "reward": 0.8820137232542038, + "reward_std": 0.26076686661690474, + "rewards/code_reward": 0.7320136949419975, + "rewards/format_reward": 1.5, + "step": 2473 + }, + { + "clip_ratio": 0.0038825825322419405, + "epoch": 0.09230912737279044, + "grad_norm": 0.07634370028972626, + "kl": 0.020294189453125, + "learning_rate": 5.794192888145118e-06, + "loss": 0.0114, + "step": 2474 + }, + { + "clip_ratio": 0.003425609669648111, + "epoch": 0.0923464390653421, + "grad_norm": 0.0930873304605484, + "kl": 0.0199127197265625, + "learning_rate": 5.791284182684962e-06, + "loss": 0.0111, + "step": 2475 + }, + { + "clip_ratio": 0.005738442181609571, + "completion_length": 738.1250381469727, + "epoch": 0.09238375075789375, + "grad_norm": 0.0950004830956459, + "kl": 0.0213623046875, + "learning_rate": 5.7883753550073726e-06, + "loss": 0.0049, + "num_tokens": 58451496.0, + "reward": 0.44077838957309723, + "reward_std": 0.32781414501369, + "rewards/code_reward": 0.2907783957198262, + "rewards/format_reward": 1.5, + "step": 2476 + }, + { + "clip_ratio": 0.0049352741334587336, + "epoch": 0.0924210624504454, + "grad_norm": 0.0913325697183609, + "kl": 0.021148681640625, + "learning_rate": 5.785466406332843e-06, + "loss": 0.0045, + "step": 2477 + }, + { + "clip_ratio": 0.004889239557087421, + "epoch": 0.09245837414299707, + "grad_norm": 0.08639533072710037, + "kl": 0.021484375, + "learning_rate": 5.782557337881911e-06, + "loss": 0.0041, + "step": 2478 + }, + { + "clip_ratio": 0.004362954758107662, + "completion_length": 712.0535888671875, + "epoch": 0.09249568583554872, + "grad_norm": 0.06817273795604706, + "kl": 0.021575927734375, + "learning_rate": 5.779648150875169e-06, + "loss": -0.0065, + "num_tokens": 58533595.0, + "reward": 0.6639610789716244, + "reward_std": 0.2524045705795288, + "rewards/code_reward": 0.5139610394835472, + "rewards/format_reward": 1.5, + "step": 2479 + }, + { + "clip_ratio": 0.0045604167389683425, + "epoch": 0.09253299752810037, + "grad_norm": 0.06490964442491531, + "kl": 0.021392822265625, + "learning_rate": 5.776738846533258e-06, + "loss": -0.0067, + "step": 2480 + }, + { + "clip_ratio": 0.004399084427859634, + "epoch": 0.09257030922065203, + "grad_norm": 0.06464801728725433, + "kl": 0.0220947265625, + "learning_rate": 5.773829426076866e-06, + "loss": -0.007, + "step": 2481 + }, + { + "clip_ratio": 0.0030981972813606262, + "completion_length": 851.5178985595703, + "epoch": 0.09260762091320368, + "grad_norm": 0.023062406107783318, + "kl": 0.0182342529296875, + "learning_rate": 5.770919890726735e-06, + "loss": -0.0002, + "num_tokens": 58616644.0, + "reward": 0.49094686657190323, + "reward_std": 0.09137651324272156, + "rewards/code_reward": 0.34094684571027756, + "rewards/format_reward": 1.5, + "step": 2482 + }, + { + "clip_ratio": 0.002946728956885636, + "epoch": 0.09264493260575533, + "grad_norm": 0.021716300398111343, + "kl": 0.0190887451171875, + "learning_rate": 5.76801024170365e-06, + "loss": -0.0002, + "step": 2483 + }, + { + "clip_ratio": 0.003229746362194419, + "epoch": 0.09268224429830699, + "grad_norm": 0.024505458772182465, + "kl": 0.0180206298828125, + "learning_rate": 5.765100480228445e-06, + "loss": -0.0002, + "step": 2484 + }, + { + "clip_ratio": 0.004766064579598606, + "completion_length": 724.8928833007812, + "epoch": 0.09271955599085864, + "grad_norm": 0.08709855377674103, + "kl": 0.023834228515625, + "learning_rate": 5.762190607522003e-06, + "loss": -0.0173, + "num_tokens": 58685812.0, + "reward": 0.6569401025772095, + "reward_std": 0.43048742413520813, + "rewards/code_reward": 0.506940096616745, + "rewards/format_reward": 1.5, + "step": 2485 + }, + { + "clip_ratio": 0.004183047567494214, + "epoch": 0.0927568676834103, + "grad_norm": 0.08099531382322311, + "kl": 0.02337646484375, + "learning_rate": 5.759280624805253e-06, + "loss": -0.0177, + "step": 2486 + }, + { + "clip_ratio": 0.004128795408178121, + "epoch": 0.09279417937596195, + "grad_norm": 0.07681553810834885, + "kl": 0.023895263671875, + "learning_rate": 5.756370533299169e-06, + "loss": -0.018, + "step": 2487 + }, + { + "clip_ratio": 0.005087859462946653, + "completion_length": 900.0714569091797, + "epoch": 0.0928314910685136, + "grad_norm": 0.08933138847351074, + "kl": 0.0333251953125, + "learning_rate": 5.753460334224772e-06, + "loss": 0.0022, + "num_tokens": 58784236.0, + "reward": 0.2785550132393837, + "reward_std": 0.20846342737786472, + "rewards/code_reward": 0.13123357738368213, + "rewards/format_reward": 1.4732142984867096, + "step": 2488 + }, + { + "clip_ratio": 0.004663388186600059, + "epoch": 0.09286880276106525, + "grad_norm": 0.0806945189833641, + "kl": 0.031219482421875, + "learning_rate": 5.750550028803125e-06, + "loss": 0.002, + "step": 2489 + }, + { + "clip_ratio": 0.004889415868092328, + "epoch": 0.0929061144536169, + "grad_norm": 0.07968205958604813, + "kl": 0.03131103515625, + "learning_rate": 5.7476396182553425e-06, + "loss": 0.0015, + "step": 2490 + }, + { + "clip_ratio": 0.003865789796691388, + "completion_length": 747.9643096923828, + "epoch": 0.09294342614616856, + "grad_norm": 0.09559236466884613, + "kl": 0.0240631103515625, + "learning_rate": 5.744729103802575e-06, + "loss": 0.0039, + "num_tokens": 58852778.0, + "reward": 0.6507254019379616, + "reward_std": 0.3810408152639866, + "rewards/code_reward": 0.5026896819472313, + "rewards/format_reward": 1.480357140302658, + "step": 2491 + }, + { + "clip_ratio": 0.0039829189772717655, + "epoch": 0.09298073783872021, + "grad_norm": 0.0920788049697876, + "kl": 0.0262298583984375, + "learning_rate": 5.741818486666023e-06, + "loss": 0.0037, + "step": 2492 + }, + { + "clip_ratio": 0.0037550481501966715, + "epoch": 0.09301804953127187, + "grad_norm": 0.08488605916500092, + "kl": 0.026336669921875, + "learning_rate": 5.738907768066924e-06, + "loss": 0.0034, + "step": 2493 + }, + { + "clip_ratio": 0.0042239949107170105, + "completion_length": 586.5178833007812, + "epoch": 0.09305536122382352, + "grad_norm": 0.07121109217405319, + "kl": 0.0280914306640625, + "learning_rate": 5.735996949226563e-06, + "loss": 0.0032, + "num_tokens": 58920063.0, + "reward": 0.7533779703080654, + "reward_std": 0.19233045540750027, + "rewards/code_reward": 0.6033779233694077, + "rewards/format_reward": 1.5, + "step": 2494 + }, + { + "clip_ratio": 0.00402268732432276, + "epoch": 0.09309267291637517, + "grad_norm": 0.06911643594503403, + "kl": 0.0280609130859375, + "learning_rate": 5.733086031366267e-06, + "loss": 0.0031, + "step": 2495 + }, + { + "clip_ratio": 0.0043832072406075895, + "epoch": 0.09312998460892682, + "grad_norm": 0.06446634232997894, + "kl": 0.027862548828125, + "learning_rate": 5.730175015707403e-06, + "loss": 0.0029, + "step": 2496 + }, + { + "clip_ratio": 0.0028812273521907628, + "completion_length": 620.7500305175781, + "epoch": 0.09316729630147848, + "grad_norm": 0.06153006851673126, + "kl": 0.0215301513671875, + "learning_rate": 5.7272639034713775e-06, + "loss": 0.0091, + "num_tokens": 58982741.0, + "reward": 0.7895502708852291, + "reward_std": 0.16527026146650314, + "rewards/code_reward": 0.6395502835512161, + "rewards/format_reward": 1.5, + "step": 2497 + }, + { + "clip_ratio": 0.0028478566091507673, + "epoch": 0.09320460799403013, + "grad_norm": 0.06097452715039253, + "kl": 0.0215911865234375, + "learning_rate": 5.724352695879637e-06, + "loss": 0.0091, + "step": 2498 + }, + { + "clip_ratio": 0.0030481924768537283, + "epoch": 0.09324191968658178, + "grad_norm": 0.05676069110631943, + "kl": 0.0215301513671875, + "learning_rate": 5.721441394153675e-06, + "loss": 0.0089, + "step": 2499 + }, + { + "clip_ratio": 0.004791238636244088, + "completion_length": 747.1607360839844, + "epoch": 0.09327923137913344, + "grad_norm": 0.0669904574751854, + "kl": 0.02008056640625, + "learning_rate": 5.718529999515018e-06, + "loss": 0.003, + "num_tokens": 59057482.0, + "reward": 0.43125002458691597, + "reward_std": 0.15336021967232227, + "rewards/code_reward": 0.28124999441206455, + "rewards/format_reward": 1.5, + "step": 2500 + }, + { + "clip_ratio": 0.004641495819669217, + "epoch": 0.09331654307168509, + "grad_norm": 0.06677450239658356, + "kl": 0.020416259765625, + "learning_rate": 5.715618513185231e-06, + "loss": 0.003, + "step": 2501 + }, + { + "clip_ratio": 0.004924814682453871, + "epoch": 0.09335385476423674, + "grad_norm": 0.061532162129879, + "kl": 0.020263671875, + "learning_rate": 5.712706936385924e-06, + "loss": 0.0026, + "step": 2502 + }, + { + "clip_ratio": 0.0042644673376344144, + "completion_length": 870.8393249511719, + "epoch": 0.0933911664567884, + "grad_norm": 0.06547567993402481, + "kl": 0.0156097412109375, + "learning_rate": 5.7097952703387335e-06, + "loss": 0.0251, + "num_tokens": 59141011.0, + "reward": 0.45160656049847603, + "reward_std": 0.22472301870584488, + "rewards/code_reward": 0.30339224822819233, + "rewards/format_reward": 1.4821428656578064, + "step": 2503 + }, + { + "clip_ratio": 0.003611812775488943, + "epoch": 0.09342847814934005, + "grad_norm": 0.06508633494377136, + "kl": 0.016082763671875, + "learning_rate": 5.706883516265347e-06, + "loss": 0.0249, + "step": 2504 + }, + { + "clip_ratio": 0.0035827967803925276, + "epoch": 0.0934657898418917, + "grad_norm": 0.0627167671918869, + "kl": 0.0159759521484375, + "learning_rate": 5.703971675387481e-06, + "loss": 0.0247, + "step": 2505 + }, + { + "clip_ratio": 0.0020952573977410793, + "completion_length": 533.8928985595703, + "epoch": 0.09350310153444336, + "grad_norm": 0.05500645190477371, + "kl": 0.0174407958984375, + "learning_rate": 5.7010597489268874e-06, + "loss": 0.0027, + "num_tokens": 59197089.0, + "reward": 0.9241596460342407, + "reward_std": 0.24441765248775482, + "rewards/code_reward": 0.7741596698760986, + "rewards/format_reward": 1.5, + "step": 2506 + }, + { + "clip_ratio": 0.002401251927949488, + "epoch": 0.09354041322699501, + "grad_norm": 0.05401879549026489, + "kl": 0.0172119140625, + "learning_rate": 5.698147738105356e-06, + "loss": 0.0028, + "step": 2507 + }, + { + "clip_ratio": 0.0020979027613066137, + "epoch": 0.09357772491954666, + "grad_norm": 0.05560065805912018, + "kl": 0.017486572265625, + "learning_rate": 5.6952356441447175e-06, + "loss": 0.0025, + "step": 2508 + }, + { + "clip_ratio": 0.001679882116150111, + "completion_length": 665.6785888671875, + "epoch": 0.09361503661209832, + "grad_norm": 0.05507194623351097, + "kl": 0.020233154296875, + "learning_rate": 5.6923234682668295e-06, + "loss": 0.0197, + "num_tokens": 59264537.0, + "reward": 0.8364685624837875, + "reward_std": 0.15023242309689522, + "rewards/code_reward": 0.6864685602486134, + "rewards/format_reward": 1.5, + "step": 2509 + }, + { + "clip_ratio": 0.0016802415484562516, + "epoch": 0.09365234830464997, + "grad_norm": 0.05187588557600975, + "kl": 0.0196685791015625, + "learning_rate": 5.6894112116935855e-06, + "loss": 0.0197, + "step": 2510 + }, + { + "clip_ratio": 0.001894388406071812, + "epoch": 0.09368965999720162, + "grad_norm": 0.06782905012369156, + "kl": 0.0205841064453125, + "learning_rate": 5.6864988756469154e-06, + "loss": 0.0196, + "step": 2511 + }, + { + "clip_ratio": 0.0031458083540201187, + "completion_length": 729.5535888671875, + "epoch": 0.09372697168975327, + "grad_norm": 0.07534654438495636, + "kl": 0.020721435546875, + "learning_rate": 5.68358646134878e-06, + "loss": 0.0196, + "num_tokens": 59334582.0, + "reward": 0.7840401828289032, + "reward_std": 0.13251067698001862, + "rewards/code_reward": 0.6340401694178581, + "rewards/format_reward": 1.5, + "step": 2512 + }, + { + "clip_ratio": 0.002897675964049995, + "epoch": 0.09376428338230493, + "grad_norm": 0.0734388455748558, + "kl": 0.020355224609375, + "learning_rate": 5.680673970021177e-06, + "loss": 0.0194, + "step": 2513 + }, + { + "clip_ratio": 0.002845797105692327, + "epoch": 0.09380159507485658, + "grad_norm": 0.07825148850679398, + "kl": 0.020233154296875, + "learning_rate": 5.677761402886134e-06, + "loss": 0.019, + "step": 2514 + }, + { + "clip_ratio": 0.0016516426112502813, + "completion_length": 525.5357284545898, + "epoch": 0.09383890676740823, + "grad_norm": 0.0588788166642189, + "kl": 0.0236358642578125, + "learning_rate": 5.674848761165708e-06, + "loss": -0.0058, + "num_tokens": 59393090.0, + "reward": 1.0625099390745163, + "reward_std": 0.09148638974875212, + "rewards/code_reward": 0.9144742339849472, + "rewards/format_reward": 1.480357140302658, + "step": 2515 + }, + { + "clip_ratio": 0.0016096404287964106, + "epoch": 0.09387621845995989, + "grad_norm": 0.059472303837537766, + "kl": 0.0237884521484375, + "learning_rate": 5.671936046081988e-06, + "loss": -0.006, + "step": 2516 + }, + { + "clip_ratio": 0.0017216509440913796, + "epoch": 0.09391353015251154, + "grad_norm": 0.05521026626229286, + "kl": 0.0228424072265625, + "learning_rate": 5.669023258857097e-06, + "loss": -0.0061, + "step": 2517 + }, + { + "clip_ratio": 0.004490124876610935, + "completion_length": 741.1250305175781, + "epoch": 0.0939508418450632, + "grad_norm": 0.04077683389186859, + "kl": 0.01873779296875, + "learning_rate": 5.666110400713188e-06, + "loss": -0.0015, + "num_tokens": 59469117.0, + "reward": 0.3821428753435612, + "reward_std": 0.06681530922651291, + "rewards/code_reward": 0.2321428507566452, + "rewards/format_reward": 1.5, + "step": 2518 + }, + { + "clip_ratio": 0.0043629087740555406, + "epoch": 0.09398815353761485, + "grad_norm": 0.04689043387770653, + "kl": 0.0187225341796875, + "learning_rate": 5.663197472872439e-06, + "loss": -0.0017, + "step": 2519 + }, + { + "clip_ratio": 0.00443962006829679, + "epoch": 0.0940254652301665, + "grad_norm": 0.03546852618455887, + "kl": 0.01849365234375, + "learning_rate": 5.660284476557062e-06, + "loss": -0.0018, + "step": 2520 + }, + { + "clip_ratio": 0.004483272321522236, + "completion_length": 705.3571701049805, + "epoch": 0.09406277692271815, + "grad_norm": 0.06394895911216736, + "kl": 0.03009033203125, + "learning_rate": 5.657371412989297e-06, + "loss": 0.0264, + "num_tokens": 59543211.0, + "reward": 0.6407332979142666, + "reward_std": 0.27836451679468155, + "rewards/code_reward": 0.4926975667476654, + "rewards/format_reward": 1.480357140302658, + "step": 2521 + }, + { + "clip_ratio": 0.004512997984420508, + "epoch": 0.0941000886152698, + "grad_norm": 0.0647144615650177, + "kl": 0.029815673828125, + "learning_rate": 5.654458283391408e-06, + "loss": 0.0263, + "step": 2522 + }, + { + "clip_ratio": 0.003719974891282618, + "epoch": 0.09413740030782146, + "grad_norm": 0.05911439284682274, + "kl": 0.030303955078125, + "learning_rate": 5.651545088985695e-06, + "loss": 0.0259, + "step": 2523 + }, + { + "clip_ratio": 0.002253580605611205, + "completion_length": 508.7143096923828, + "epoch": 0.09417471200037311, + "grad_norm": 0.051667891442775726, + "kl": 0.0238189697265625, + "learning_rate": 5.648631830994477e-06, + "loss": -0.002, + "num_tokens": 59600833.0, + "reward": 0.8512986861169338, + "reward_std": 0.07819133996963501, + "rewards/code_reward": 0.701298713684082, + "rewards/format_reward": 1.5, + "step": 2524 + }, + { + "clip_ratio": 0.002205378084909171, + "epoch": 0.09421202369292477, + "grad_norm": 0.0483679361641407, + "kl": 0.0236968994140625, + "learning_rate": 5.645718510640102e-06, + "loss": -0.0022, + "step": 2525 + }, + { + "clip_ratio": 0.0024220781051553786, + "epoch": 0.09424933538547642, + "grad_norm": 0.049033477902412415, + "kl": 0.0237274169921875, + "learning_rate": 5.642805129144948e-06, + "loss": -0.0024, + "step": 2526 + }, + { + "clip_ratio": 0.004446829785592854, + "completion_length": 460.73217010498047, + "epoch": 0.09428664707802807, + "grad_norm": 0.0641283169388771, + "kl": 0.019775390625, + "learning_rate": 5.6398916877314155e-06, + "loss": 0.0017, + "num_tokens": 59649118.0, + "reward": 0.5451465584337711, + "reward_std": 0.09696119511500001, + "rewards/code_reward": 0.39514651522040367, + "rewards/format_reward": 1.5, + "step": 2527 + }, + { + "clip_ratio": 0.004798488924279809, + "epoch": 0.09432395877057972, + "grad_norm": 0.06881428509950638, + "kl": 0.0196990966796875, + "learning_rate": 5.636978187621927e-06, + "loss": 0.0017, + "step": 2528 + }, + { + "clip_ratio": 0.004252248792909086, + "epoch": 0.09436127046313138, + "grad_norm": 0.05867312103509903, + "kl": 0.01947021484375, + "learning_rate": 5.634064630038938e-06, + "loss": 0.0014, + "step": 2529 + }, + { + "clip_ratio": 0.0038436478935182095, + "completion_length": 759.0714569091797, + "epoch": 0.09439858215568303, + "grad_norm": 0.06703542917966843, + "kl": 0.0184478759765625, + "learning_rate": 5.63115101620492e-06, + "loss": 0.0026, + "num_tokens": 59723868.0, + "reward": 0.6122171767055988, + "reward_std": 0.09319057752145454, + "rewards/code_reward": 0.46221717141452245, + "rewards/format_reward": 1.5, + "step": 2530 + }, + { + "clip_ratio": 0.003527498396579176, + "epoch": 0.09443589384823468, + "grad_norm": 0.07522506266832352, + "kl": 0.0187835693359375, + "learning_rate": 5.6282373473423705e-06, + "loss": 0.0024, + "step": 2531 + }, + { + "clip_ratio": 0.003460004983935505, + "epoch": 0.09447320554078635, + "grad_norm": 0.06540660560131073, + "kl": 0.0186004638671875, + "learning_rate": 5.625323624673813e-06, + "loss": 0.0023, + "step": 2532 + }, + { + "clip_ratio": 0.004618048958946019, + "completion_length": 767.0714721679688, + "epoch": 0.094510517233338, + "grad_norm": 0.06098216027021408, + "kl": 0.0150909423828125, + "learning_rate": 5.6224098494217904e-06, + "loss": -0.0146, + "num_tokens": 59791408.0, + "reward": 0.39661991223692894, + "reward_std": 0.09738228376954794, + "rewards/code_reward": 0.2466198916081339, + "rewards/format_reward": 1.5, + "step": 2533 + }, + { + "clip_ratio": 0.004487057158257812, + "epoch": 0.09454782892588966, + "grad_norm": 0.06192595884203911, + "kl": 0.0155029296875, + "learning_rate": 5.619496022808869e-06, + "loss": -0.0147, + "step": 2534 + }, + { + "clip_ratio": 0.003739049017895013, + "epoch": 0.09458514061844131, + "grad_norm": 0.06105285510420799, + "kl": 0.014862060546875, + "learning_rate": 5.616582146057636e-06, + "loss": -0.0149, + "step": 2535 + }, + { + "clip_ratio": 0.004092914052307606, + "completion_length": 727.4821624755859, + "epoch": 0.09462245231099296, + "grad_norm": 0.06996490061283112, + "kl": 0.019073486328125, + "learning_rate": 5.6136682203907e-06, + "loss": 0.0056, + "num_tokens": 59859335.0, + "reward": 0.29881956055760384, + "reward_std": 0.15333550423383713, + "rewards/code_reward": 0.14881953969597816, + "rewards/format_reward": 1.5, + "step": 2536 + }, + { + "clip_ratio": 0.003943029209040105, + "epoch": 0.09465976400354462, + "grad_norm": 0.08511065691709518, + "kl": 0.018951416015625, + "learning_rate": 5.610754247030691e-06, + "loss": 0.0053, + "step": 2537 + }, + { + "clip_ratio": 0.0038547880249097943, + "epoch": 0.09469707569609627, + "grad_norm": 0.06388656795024872, + "kl": 0.019866943359375, + "learning_rate": 5.607840227200255e-06, + "loss": 0.0052, + "step": 2538 + }, + { + "clip_ratio": 0.003274607239291072, + "completion_length": 645.0536003112793, + "epoch": 0.09473438738864792, + "grad_norm": 0.06787942349910736, + "kl": 0.022705078125, + "learning_rate": 5.604926162122065e-06, + "loss": 0.0185, + "num_tokens": 59931648.0, + "reward": 0.7243172377347946, + "reward_std": 0.2419685684144497, + "rewards/code_reward": 0.5787815321236849, + "rewards/format_reward": 1.455357164144516, + "step": 2539 + }, + { + "clip_ratio": 0.0030350253800861537, + "epoch": 0.09477169908119958, + "grad_norm": 0.0651756301522255, + "kl": 0.02313232421875, + "learning_rate": 5.602012053018805e-06, + "loss": 0.0184, + "step": 2540 + }, + { + "clip_ratio": 0.0027321037487126887, + "epoch": 0.09480901077375123, + "grad_norm": 0.06368955224752426, + "kl": 0.0222930908203125, + "learning_rate": 5.599097901113184e-06, + "loss": 0.0181, + "step": 2541 + }, + { + "clip_ratio": 0.004023969231639057, + "completion_length": 594.5536041259766, + "epoch": 0.09484632246630288, + "grad_norm": 0.0920414924621582, + "kl": 0.020477294921875, + "learning_rate": 5.5961837076279235e-06, + "loss": -0.0053, + "num_tokens": 59989597.0, + "reward": 0.8005553185939789, + "reward_std": 0.2371932975947857, + "rewards/code_reward": 0.6505552530288696, + "rewards/format_reward": 1.5, + "step": 2542 + }, + { + "clip_ratio": 0.0037733427598141134, + "epoch": 0.09488363415885454, + "grad_norm": 0.08817088603973389, + "kl": 0.01995849609375, + "learning_rate": 5.593269473785767e-06, + "loss": -0.0056, + "step": 2543 + }, + { + "clip_ratio": 0.0038926698616705835, + "epoch": 0.09492094585140619, + "grad_norm": 0.08524233847856522, + "kl": 0.0201873779296875, + "learning_rate": 5.59035520080947e-06, + "loss": -0.006, + "step": 2544 + }, + { + "clip_ratio": 0.003149803029373288, + "completion_length": 694.0178833007812, + "epoch": 0.09495825754395784, + "grad_norm": 0.0732717216014862, + "kl": 0.02984619140625, + "learning_rate": 5.587440889921811e-06, + "loss": 0.0118, + "num_tokens": 60062328.0, + "reward": 0.516168586909771, + "reward_std": 0.17941023944877088, + "rewards/code_reward": 0.3661685831611976, + "rewards/format_reward": 1.5, + "step": 2545 + }, + { + "clip_ratio": 0.003500345512293279, + "epoch": 0.0949955692365095, + "grad_norm": 0.07488613575696945, + "kl": 0.0262298583984375, + "learning_rate": 5.584526542345579e-06, + "loss": 0.0117, + "step": 2546 + }, + { + "clip_ratio": 0.0028026390355080366, + "epoch": 0.09503288092906115, + "grad_norm": 0.06477291136980057, + "kl": 0.0259552001953125, + "learning_rate": 5.581612159303579e-06, + "loss": 0.0114, + "step": 2547 + }, + { + "clip_ratio": 0.003667204699013382, + "completion_length": 659.928596496582, + "epoch": 0.0950701926216128, + "grad_norm": 0.055724840611219406, + "kl": 0.018341064453125, + "learning_rate": 5.578697742018633e-06, + "loss": 0.0041, + "num_tokens": 60125086.0, + "reward": 0.6142857186496258, + "reward_std": 0.1732691377401352, + "rewards/code_reward": 0.46428571082651615, + "rewards/format_reward": 1.5, + "step": 2548 + }, + { + "clip_ratio": 0.003281062876340002, + "epoch": 0.09510750431416445, + "grad_norm": 0.058500200510025024, + "kl": 0.0184783935546875, + "learning_rate": 5.575783291713576e-06, + "loss": 0.0039, + "step": 2549 + }, + { + "clip_ratio": 0.003116194624453783, + "epoch": 0.09514481600671611, + "grad_norm": 0.05641228333115578, + "kl": 0.0182647705078125, + "learning_rate": 5.572868809611258e-06, + "loss": 0.0038, + "step": 2550 + }, + { + "clip_ratio": 0.004437344963662326, + "completion_length": 755.0357284545898, + "epoch": 0.09518212769926776, + "grad_norm": 0.06826705485582352, + "kl": 0.017333984375, + "learning_rate": 5.56995429693454e-06, + "loss": 0.0074, + "num_tokens": 60201794.0, + "reward": 0.38526832684874535, + "reward_std": 0.2050119899213314, + "rewards/code_reward": 0.23526830971240997, + "rewards/format_reward": 1.5, + "step": 2551 + }, + { + "clip_ratio": 0.004248217737767845, + "epoch": 0.09521943939181941, + "grad_norm": 0.06507550179958344, + "kl": 0.017578125, + "learning_rate": 5.567039754906299e-06, + "loss": 0.0071, + "step": 2552 + }, + { + "clip_ratio": 0.0042238017776981, + "epoch": 0.09525675108437107, + "grad_norm": 0.06231199577450752, + "kl": 0.0172576904296875, + "learning_rate": 5.564125184749424e-06, + "loss": 0.0071, + "step": 2553 + }, + { + "clip_ratio": 0.004238336929120123, + "completion_length": 598.1250305175781, + "epoch": 0.09529406277692272, + "grad_norm": 0.06770618259906769, + "kl": 0.0216522216796875, + "learning_rate": 5.5612105876868105e-06, + "loss": 0.004, + "num_tokens": 60259299.0, + "reward": 0.56324627622962, + "reward_std": 0.14444052055478096, + "rewards/code_reward": 0.41324626468122005, + "rewards/format_reward": 1.5, + "step": 2554 + }, + { + "clip_ratio": 0.0038616714300587773, + "epoch": 0.09533137446947437, + "grad_norm": 0.07154923677444458, + "kl": 0.021697998046875, + "learning_rate": 5.558295964941373e-06, + "loss": 0.0038, + "step": 2555 + }, + { + "clip_ratio": 0.0033607856021262705, + "epoch": 0.09536868616202603, + "grad_norm": 0.05921977385878563, + "kl": 0.02105712890625, + "learning_rate": 5.555381317736029e-06, + "loss": 0.0036, + "step": 2556 + }, + { + "clip_ratio": 0.0029170497437007725, + "completion_length": 538.3393020629883, + "epoch": 0.09540599785457768, + "grad_norm": 0.06328117847442627, + "kl": 0.0208587646484375, + "learning_rate": 5.552466647293715e-06, + "loss": 0.0004, + "num_tokens": 60314422.0, + "reward": 0.6232143007218838, + "reward_std": 0.18917061388492584, + "rewards/code_reward": 0.4732142835855484, + "rewards/format_reward": 1.5, + "step": 2557 + }, + { + "clip_ratio": 0.002408205531537533, + "epoch": 0.09544330954712933, + "grad_norm": 0.07567989826202393, + "kl": 0.020355224609375, + "learning_rate": 5.54955195483737e-06, + "loss": 0.0, + "step": 2558 + }, + { + "clip_ratio": 0.0027985460765194148, + "epoch": 0.09548062123968099, + "grad_norm": 0.05985887721180916, + "kl": 0.020355224609375, + "learning_rate": 5.546637241589945e-06, + "loss": -0.0001, + "step": 2559 + }, + { + "clip_ratio": 0.005639491602778435, + "completion_length": 818.6428985595703, + "epoch": 0.09551793293223264, + "grad_norm": 45.82278060913086, + "kl": 22.513824462890625, + "learning_rate": 5.543722508774398e-06, + "loss": 0.2309, + "num_tokens": 60399200.0, + "reward": 0.6261735409498215, + "reward_std": 0.284734146669507, + "rewards/code_reward": 0.4761735387146473, + "rewards/format_reward": 1.5, + "step": 2560 + }, + { + "clip_ratio": 0.006764327874407172, + "epoch": 0.09555524462478429, + "grad_norm": 0.7391661405563354, + "kl": 0.4393310546875, + "learning_rate": 5.540807757613702e-06, + "loss": 0.011, + "step": 2561 + }, + { + "clip_ratio": 0.007384157623164356, + "epoch": 0.09559255631733594, + "grad_norm": 1161.9521484375, + "kl": 0.0288848876953125, + "learning_rate": 5.537892989330826e-06, + "loss": 10.9114, + "step": 2562 + }, + { + "clip_ratio": 0.001852182438597083, + "completion_length": 455.6964569091797, + "epoch": 0.0956298680098876, + "grad_norm": 0.08550983667373657, + "kl": 0.020477294921875, + "learning_rate": 5.5349782051487584e-06, + "loss": 0.0138, + "num_tokens": 60444221.0, + "reward": 1.015546202659607, + "reward_std": 0.03537281462922692, + "rewards/code_reward": 0.8655462190508842, + "rewards/format_reward": 1.5, + "step": 2563 + }, + { + "clip_ratio": 0.0016582509269937873, + "epoch": 0.09566717970243925, + "grad_norm": 0.07087282836437225, + "kl": 0.020751953125, + "learning_rate": 5.5320634062904836e-06, + "loss": 0.0137, + "step": 2564 + }, + { + "clip_ratio": 0.0020394339808262885, + "epoch": 0.0957044913949909, + "grad_norm": 0.0770641341805458, + "kl": 0.020721435546875, + "learning_rate": 5.529148593978998e-06, + "loss": 0.0133, + "step": 2565 + }, + { + "clip_ratio": 0.004351566778495908, + "completion_length": 681.3928985595703, + "epoch": 0.09574180308754256, + "grad_norm": 0.0908159390091896, + "kl": 0.0199432373046875, + "learning_rate": 5.526233769437306e-06, + "loss": -0.0068, + "num_tokens": 60515251.0, + "reward": 0.26169804111123085, + "reward_std": 0.16225570626556873, + "rewards/code_reward": 0.11169804120436311, + "rewards/format_reward": 1.5, + "step": 2566 + }, + { + "clip_ratio": 0.004716740571893752, + "epoch": 0.09577911478009421, + "grad_norm": 0.08753234148025513, + "kl": 0.0197906494140625, + "learning_rate": 5.523318933888412e-06, + "loss": -0.0071, + "step": 2567 + }, + { + "clip_ratio": 0.005447440664283931, + "epoch": 0.09581642647264586, + "grad_norm": 0.07933396846055984, + "kl": 0.0197601318359375, + "learning_rate": 5.520404088555324e-06, + "loss": -0.0074, + "step": 2568 + }, + { + "clip_ratio": 0.004193619359284639, + "completion_length": 616.1964569091797, + "epoch": 0.09585373816519752, + "grad_norm": 0.0804300457239151, + "kl": 0.02337646484375, + "learning_rate": 5.517489234661061e-06, + "loss": -0.0121, + "num_tokens": 60586256.0, + "reward": 0.5923025906085968, + "reward_std": 0.3422291576862335, + "rewards/code_reward": 0.442302580922842, + "rewards/format_reward": 1.5, + "step": 2569 + }, + { + "clip_ratio": 0.0034991296124644578, + "epoch": 0.09589104985774917, + "grad_norm": 0.07712285965681076, + "kl": 0.023284912109375, + "learning_rate": 5.514574373428641e-06, + "loss": -0.0126, + "step": 2570 + }, + { + "clip_ratio": 0.004251076898071915, + "epoch": 0.09592836155030082, + "grad_norm": 0.0742771178483963, + "kl": 0.023468017578125, + "learning_rate": 5.511659506081083e-06, + "loss": -0.0128, + "step": 2571 + }, + { + "clip_ratio": 0.003107711672782898, + "completion_length": 626.4643173217773, + "epoch": 0.09596567324285248, + "grad_norm": 0.07588566839694977, + "kl": 0.016448974609375, + "learning_rate": 5.508744633841413e-06, + "loss": -0.0013, + "num_tokens": 60655582.0, + "reward": 0.6579365096986294, + "reward_std": 0.07532273232936859, + "rewards/code_reward": 0.507936516776681, + "rewards/format_reward": 1.5, + "step": 2572 + }, + { + "clip_ratio": 0.0030908474000170827, + "epoch": 0.09600298493540413, + "grad_norm": 0.06625714153051376, + "kl": 0.01629638671875, + "learning_rate": 5.505829757932658e-06, + "loss": -0.0013, + "step": 2573 + }, + { + "clip_ratio": 0.0033907496836036444, + "epoch": 0.09604029662795578, + "grad_norm": 0.06269905716180801, + "kl": 0.0164031982421875, + "learning_rate": 5.502914879577843e-06, + "loss": -0.0016, + "step": 2574 + }, + { + "clip_ratio": 0.003673651604913175, + "completion_length": 665.7143325805664, + "epoch": 0.09607760832050743, + "grad_norm": 0.03458446264266968, + "kl": 0.02764892578125, + "learning_rate": 5.500000000000001e-06, + "loss": 0.0042, + "num_tokens": 60727778.0, + "reward": 0.40033693611621857, + "reward_std": 0.0012606660602614284, + "rewards/code_reward": 0.2503369272162672, + "rewards/format_reward": 1.5, + "step": 2575 + }, + { + "clip_ratio": 0.0038637849502265453, + "epoch": 0.09611492001305909, + "grad_norm": 0.034603822976350784, + "kl": 0.0276947021484375, + "learning_rate": 5.4970851204221575e-06, + "loss": 0.0041, + "step": 2576 + }, + { + "clip_ratio": 0.0037350229686126113, + "epoch": 0.09615223170561074, + "grad_norm": 0.03264646232128143, + "kl": 0.0276336669921875, + "learning_rate": 5.4941702420673435e-06, + "loss": 0.0041, + "step": 2577 + }, + { + "clip_ratio": 0.003307066101115197, + "completion_length": 827.5536193847656, + "epoch": 0.0961895433981624, + "grad_norm": 0.06294307857751846, + "kl": 0.01715087890625, + "learning_rate": 5.491255366158588e-06, + "loss": 0.0069, + "num_tokens": 60811039.0, + "reward": 0.6322336606681347, + "reward_std": 0.1657109558582306, + "rewards/code_reward": 0.4822336509823799, + "rewards/format_reward": 1.5, + "step": 2578 + }, + { + "clip_ratio": 0.0029594370862469077, + "epoch": 0.09622685509071405, + "grad_norm": 0.05652841180562973, + "kl": 0.017059326171875, + "learning_rate": 5.48834049391892e-06, + "loss": 0.0068, + "step": 2579 + }, + { + "clip_ratio": 0.003070243517868221, + "epoch": 0.0962641667832657, + "grad_norm": 0.05281670391559601, + "kl": 0.017181396484375, + "learning_rate": 5.485425626571362e-06, + "loss": 0.0066, + "step": 2580 + }, + { + "clip_ratio": 0.004969057627022266, + "completion_length": 792.2321624755859, + "epoch": 0.09630147847581735, + "grad_norm": 0.06955058127641678, + "kl": 0.026519775390625, + "learning_rate": 5.48251076533894e-06, + "loss": -0.0017, + "num_tokens": 60892182.0, + "reward": 0.31091008707880974, + "reward_std": 0.26172947883605957, + "rewards/code_reward": 0.16537437215447426, + "rewards/format_reward": 1.4553571343421936, + "step": 2581 + }, + { + "clip_ratio": 0.004218439687974751, + "epoch": 0.096338790168369, + "grad_norm": 0.06624611467123032, + "kl": 0.026763916015625, + "learning_rate": 5.479595911444677e-06, + "loss": -0.0021, + "step": 2582 + }, + { + "clip_ratio": 0.004543870862107724, + "epoch": 0.09637610186092066, + "grad_norm": 0.06527870893478394, + "kl": 0.026641845703125, + "learning_rate": 5.47668106611159e-06, + "loss": -0.0022, + "step": 2583 + }, + { + "clip_ratio": 0.005175412574317306, + "completion_length": 773.4286041259766, + "epoch": 0.09641341355347231, + "grad_norm": 0.06251053512096405, + "kl": 0.027313232421875, + "learning_rate": 5.473766230562697e-06, + "loss": -0.0067, + "num_tokens": 60968296.0, + "reward": 0.26545261964201927, + "reward_std": 0.15650848485529423, + "rewards/code_reward": 0.11545262020081282, + "rewards/format_reward": 1.5, + "step": 2584 + }, + { + "clip_ratio": 0.005021945107728243, + "epoch": 0.09645072524602397, + "grad_norm": 0.06300018727779388, + "kl": 0.027069091796875, + "learning_rate": 5.4708514060210036e-06, + "loss": -0.0067, + "step": 2585 + }, + { + "clip_ratio": 0.004794915963429958, + "epoch": 0.09648803693857562, + "grad_norm": 0.06099100783467293, + "kl": 0.02783203125, + "learning_rate": 5.467936593709519e-06, + "loss": -0.007, + "step": 2586 + }, + { + "clip_ratio": 0.002990720560774207, + "completion_length": 715.1071624755859, + "epoch": 0.09652534863112729, + "grad_norm": 0.06852275878190994, + "kl": 0.0153961181640625, + "learning_rate": 5.4650217948512455e-06, + "loss": -0.0049, + "num_tokens": 61042054.0, + "reward": 0.6531888097524643, + "reward_std": 0.27442085929214954, + "rewards/code_reward": 0.5031887926161289, + "rewards/format_reward": 1.5, + "step": 2587 + }, + { + "clip_ratio": 0.0032862373627722263, + "epoch": 0.09656266032367894, + "grad_norm": 0.06591558456420898, + "kl": 0.01476287841796875, + "learning_rate": 5.462107010669174e-06, + "loss": -0.005, + "step": 2588 + }, + { + "clip_ratio": 0.0032365123624913394, + "epoch": 0.09659997201623059, + "grad_norm": 0.06523305177688599, + "kl": 0.0147857666015625, + "learning_rate": 5.459192242386301e-06, + "loss": -0.0055, + "step": 2589 + }, + { + "clip_ratio": 0.003421015804633498, + "completion_length": 720.7500305175781, + "epoch": 0.09663728370878225, + "grad_norm": 0.08124828338623047, + "kl": 0.0194549560546875, + "learning_rate": 5.456277491225603e-06, + "loss": 0.0045, + "num_tokens": 61111592.0, + "reward": 0.6353251561522484, + "reward_std": 0.19843216985464096, + "rewards/code_reward": 0.4853251315653324, + "rewards/format_reward": 1.5, + "step": 2590 + }, + { + "clip_ratio": 0.0034886974026449025, + "epoch": 0.0966745954013339, + "grad_norm": 0.08225724846124649, + "kl": 0.01947021484375, + "learning_rate": 5.453362758410057e-06, + "loss": 0.0045, + "step": 2591 + }, + { + "clip_ratio": 0.003461558138951659, + "epoch": 0.09671190709388555, + "grad_norm": 0.07803784310817719, + "kl": 0.0190277099609375, + "learning_rate": 5.450448045162633e-06, + "loss": 0.0041, + "step": 2592 + }, + { + "clip_ratio": 0.00260710169095546, + "completion_length": 703.2857513427734, + "epoch": 0.0967492187864372, + "grad_norm": 0.04728667438030243, + "kl": 0.018768310546875, + "learning_rate": 5.447533352706287e-06, + "loss": -0.0017, + "num_tokens": 61183120.0, + "reward": 0.4892857186496258, + "reward_std": 0.17378021404147148, + "rewards/code_reward": 0.3392857164144516, + "rewards/format_reward": 1.5, + "step": 2593 + }, + { + "clip_ratio": 0.002681913145352155, + "epoch": 0.09678653047898886, + "grad_norm": 0.04691126570105553, + "kl": 0.0186004638671875, + "learning_rate": 5.444618682263971e-06, + "loss": -0.0018, + "step": 2594 + }, + { + "clip_ratio": 0.002763597934972495, + "epoch": 0.09682384217154051, + "grad_norm": 0.04552368447184563, + "kl": 0.018341064453125, + "learning_rate": 5.441704035058629e-06, + "loss": -0.0019, + "step": 2595 + }, + { + "clip_ratio": 0.0041326009668409824, + "completion_length": 774.9286193847656, + "epoch": 0.09686115386409216, + "grad_norm": 0.06673534959554672, + "kl": 0.022796630859375, + "learning_rate": 5.43878941231319e-06, + "loss": 0.0222, + "num_tokens": 61259206.0, + "reward": 0.442714411765337, + "reward_std": 0.15643083024770021, + "rewards/code_reward": 0.2927143760025501, + "rewards/format_reward": 1.5, + "step": 2596 + }, + { + "clip_ratio": 0.0038535729399882257, + "epoch": 0.09689846555664382, + "grad_norm": 0.06902862340211868, + "kl": 0.022552490234375, + "learning_rate": 5.435874815250579e-06, + "loss": 0.0221, + "step": 2597 + }, + { + "clip_ratio": 0.003455133701208979, + "epoch": 0.09693577724919547, + "grad_norm": 2.5982308387756348, + "kl": 0.0235595703125, + "learning_rate": 5.432960245093701e-06, + "loss": 0.0233, + "step": 2598 + }, + { + "clip_ratio": 0.005219157552346587, + "completion_length": 647.1607513427734, + "epoch": 0.09697308894174712, + "grad_norm": 0.09260625392198563, + "kl": 0.019439697265625, + "learning_rate": 5.43004570306546e-06, + "loss": -0.0149, + "num_tokens": 61323013.0, + "reward": 0.6622950211167336, + "reward_std": 0.36199953593313694, + "rewards/code_reward": 0.5122950077056885, + "rewards/format_reward": 1.5, + "step": 2599 + }, + { + "clip_ratio": 0.004474675981327891, + "epoch": 0.09701040063429878, + "grad_norm": 0.08527153730392456, + "kl": 0.018951416015625, + "learning_rate": 5.427131190388743e-06, + "loss": -0.0155, + "step": 2600 + }, + { + "clip_ratio": 0.004268511140253395, + "epoch": 0.09704771232685043, + "grad_norm": 0.2630155086517334, + "kl": 0.019805908203125, + "learning_rate": 5.424216708286425e-06, + "loss": -0.0158, + "step": 2601 + }, + { + "clip_ratio": 0.003088909143116325, + "completion_length": 604.1250152587891, + "epoch": 0.09708502401940208, + "grad_norm": 0.046994924545288086, + "kl": 0.0169830322265625, + "learning_rate": 5.42130225798137e-06, + "loss": 0.0051, + "num_tokens": 61384032.0, + "reward": 0.7150510214269161, + "reward_std": 0.09963326156139374, + "rewards/code_reward": 0.5650510042905807, + "rewards/format_reward": 1.5, + "step": 2602 + }, + { + "clip_ratio": 0.00309372425545007, + "epoch": 0.09712233571195374, + "grad_norm": 0.04867083951830864, + "kl": 0.0167999267578125, + "learning_rate": 5.418387840696424e-06, + "loss": 0.0052, + "step": 2603 + }, + { + "clip_ratio": 0.0026249439106322825, + "epoch": 0.09715964740450539, + "grad_norm": 0.0935879498720169, + "kl": 0.01641845703125, + "learning_rate": 5.415473457654423e-06, + "loss": 0.0049, + "step": 2604 + }, + { + "clip_ratio": 0.00484274368500337, + "completion_length": 762.1428985595703, + "epoch": 0.09719695909705704, + "grad_norm": 0.07833705097436905, + "kl": 0.0250244140625, + "learning_rate": 5.41255911007819e-06, + "loss": -0.0136, + "num_tokens": 61465226.0, + "reward": 0.39428794011473656, + "reward_std": 0.33167316764593124, + "rewards/code_reward": 0.25053794868290424, + "rewards/format_reward": 1.4375, + "step": 2605 + }, + { + "clip_ratio": 0.004477651615161449, + "epoch": 0.0972342707896087, + "grad_norm": 0.07784047722816467, + "kl": 0.02520751953125, + "learning_rate": 5.40964479919053e-06, + "loss": -0.014, + "step": 2606 + }, + { + "clip_ratio": 0.0043655497138388455, + "epoch": 0.09727158248216035, + "grad_norm": 0.08012665808200836, + "kl": 0.025054931640625, + "learning_rate": 5.406730526214235e-06, + "loss": -0.0139, + "step": 2607 + }, + { + "clip_ratio": 0.004795259330421686, + "completion_length": 686.9107513427734, + "epoch": 0.097308894174712, + "grad_norm": 0.05714574083685875, + "kl": 0.023681640625, + "learning_rate": 5.403816292372078e-06, + "loss": 0.0038, + "num_tokens": 61533051.0, + "reward": 0.4132537469267845, + "reward_std": 0.08685114979743958, + "rewards/code_reward": 0.26325371116399765, + "rewards/format_reward": 1.5, + "step": 2608 + }, + { + "clip_ratio": 0.004870397795457393, + "epoch": 0.09734620586726365, + "grad_norm": 0.05514908581972122, + "kl": 0.02288818359375, + "learning_rate": 5.400902098886816e-06, + "loss": 0.004, + "step": 2609 + }, + { + "clip_ratio": 0.004933802061714232, + "epoch": 0.09738351755981531, + "grad_norm": 0.05578560009598732, + "kl": 0.023040771484375, + "learning_rate": 5.397987946981196e-06, + "loss": 0.0037, + "step": 2610 + }, + { + "clip_ratio": 0.004007871611975133, + "completion_length": 761.5893249511719, + "epoch": 0.09742082925236696, + "grad_norm": 0.07178189605474472, + "kl": 0.024169921875, + "learning_rate": 5.395073837877936e-06, + "loss": 0.0403, + "num_tokens": 61611932.0, + "reward": 0.6249721385538578, + "reward_std": 0.06474696798250079, + "rewards/code_reward": 0.47765071620233357, + "rewards/format_reward": 1.4732142984867096, + "step": 2611 + }, + { + "clip_ratio": 0.0036426689475774765, + "epoch": 0.09745814094491861, + "grad_norm": 0.07209452986717224, + "kl": 0.0225067138671875, + "learning_rate": 5.392159772799746e-06, + "loss": 0.0402, + "step": 2612 + }, + { + "clip_ratio": 0.003778169513680041, + "epoch": 0.09749545263747027, + "grad_norm": 0.07013744115829468, + "kl": 0.0233612060546875, + "learning_rate": 5.3892457529693106e-06, + "loss": 0.0401, + "step": 2613 + }, + { + "clip_ratio": 0.004678394994698465, + "completion_length": 659.9107437133789, + "epoch": 0.09753276433002192, + "grad_norm": 0.07940584421157837, + "kl": 0.018035888671875, + "learning_rate": 5.386331779609301e-06, + "loss": 0.0006, + "num_tokens": 61673597.0, + "reward": 0.5014022588729858, + "reward_std": 0.24647657782770693, + "rewards/code_reward": 0.3514022642048076, + "rewards/format_reward": 1.5, + "step": 2614 + }, + { + "clip_ratio": 0.0050468898843973875, + "epoch": 0.09757007602257357, + "grad_norm": 0.07795996218919754, + "kl": 0.018524169921875, + "learning_rate": 5.383417853942367e-06, + "loss": 0.0007, + "step": 2615 + }, + { + "clip_ratio": 0.004247483273502439, + "epoch": 0.09760738771512523, + "grad_norm": 0.07488095015287399, + "kl": 0.018035888671875, + "learning_rate": 5.380503977191133e-06, + "loss": 0.0, + "step": 2616 + }, + { + "clip_ratio": 0.0026789590483531356, + "completion_length": 578.1786041259766, + "epoch": 0.09764469940767688, + "grad_norm": 0.040925610810518265, + "kl": 0.023956298828125, + "learning_rate": 5.377590150578213e-06, + "loss": -0.0043, + "num_tokens": 61729523.0, + "reward": 0.6504761874675751, + "reward_std": 0.0014916217187419534, + "rewards/code_reward": 0.5004761905001942, + "rewards/format_reward": 1.5, + "step": 2617 + }, + { + "clip_ratio": 0.0025869213277474046, + "epoch": 0.09768201110022853, + "grad_norm": 0.04155148193240166, + "kl": 0.023651123046875, + "learning_rate": 5.374676375326189e-06, + "loss": -0.0044, + "step": 2618 + }, + { + "clip_ratio": 0.0025107688270509243, + "epoch": 0.09771932279278019, + "grad_norm": 0.03891070559620857, + "kl": 0.02362060546875, + "learning_rate": 5.371762652657631e-06, + "loss": -0.0044, + "step": 2619 + }, + { + "clip_ratio": 0.002847955038305372, + "completion_length": 640.6428985595703, + "epoch": 0.09775663448533184, + "grad_norm": 0.06176310032606125, + "kl": 0.0225982666015625, + "learning_rate": 5.368848983795083e-06, + "loss": 0.0011, + "num_tokens": 61800455.0, + "reward": 0.7003409117460251, + "reward_std": 0.17025966942310333, + "rewards/code_reward": 0.5503409206867218, + "rewards/format_reward": 1.5, + "step": 2620 + }, + { + "clip_ratio": 0.0027905122260563076, + "epoch": 0.09779394617788349, + "grad_norm": 0.061631761491298676, + "kl": 0.022979736328125, + "learning_rate": 5.3659353699610635e-06, + "loss": 0.001, + "step": 2621 + }, + { + "clip_ratio": 0.0027350384916644543, + "epoch": 0.09783125787043515, + "grad_norm": 0.0648251473903656, + "kl": 0.023162841796875, + "learning_rate": 5.363021812378073e-06, + "loss": 0.0008, + "step": 2622 + }, + { + "clip_ratio": 0.002542132802773267, + "completion_length": 781.8571624755859, + "epoch": 0.0978685695629868, + "grad_norm": 0.06897781789302826, + "kl": 0.017333984375, + "learning_rate": 5.360108312268587e-06, + "loss": -0.0051, + "num_tokens": 61879141.0, + "reward": 0.7615011781454086, + "reward_std": 0.2117786556482315, + "rewards/code_reward": 0.6115011908113956, + "rewards/format_reward": 1.5, + "step": 2623 + }, + { + "clip_ratio": 0.0025746383471414447, + "epoch": 0.09790588125553845, + "grad_norm": 0.07888873666524887, + "kl": 0.0172119140625, + "learning_rate": 5.357194870855052e-06, + "loss": -0.0051, + "step": 2624 + }, + { + "clip_ratio": 0.0029626466566696763, + "epoch": 0.0979431929480901, + "grad_norm": 0.0658683255314827, + "kl": 0.017425537109375, + "learning_rate": 5.354281489359901e-06, + "loss": -0.0052, + "step": 2625 + }, + { + "clip_ratio": 0.004184057645034045, + "completion_length": 552.303596496582, + "epoch": 0.09798050464064176, + "grad_norm": 0.08931013196706772, + "kl": 0.0179595947265625, + "learning_rate": 5.351368169005526e-06, + "loss": 0.0211, + "num_tokens": 61943096.0, + "reward": 0.4984084442257881, + "reward_std": 0.33088623359799385, + "rewards/code_reward": 0.3484084401279688, + "rewards/format_reward": 1.5, + "step": 2626 + }, + { + "clip_ratio": 0.0042849486344493926, + "epoch": 0.09801781633319341, + "grad_norm": 0.08636275678873062, + "kl": 0.0182037353515625, + "learning_rate": 5.348454911014306e-06, + "loss": 0.021, + "step": 2627 + }, + { + "clip_ratio": 0.0038989357417449355, + "epoch": 0.09805512802574506, + "grad_norm": 0.08390521258115768, + "kl": 0.0179901123046875, + "learning_rate": 5.345541716608593e-06, + "loss": 0.0205, + "step": 2628 + }, + { + "clip_ratio": 0.00388418190414086, + "completion_length": 606.1071701049805, + "epoch": 0.09809243971829672, + "grad_norm": 0.08388470113277435, + "kl": 0.01953125, + "learning_rate": 5.342628587010706e-06, + "loss": -0.0218, + "num_tokens": 62015902.0, + "reward": 0.6675711274147034, + "reward_std": 0.17787615803536028, + "rewards/code_reward": 0.5175711290212348, + "rewards/format_reward": 1.5, + "step": 2629 + }, + { + "clip_ratio": 0.004332472977694124, + "epoch": 0.09812975141084837, + "grad_norm": 0.094846211373806, + "kl": 0.020172119140625, + "learning_rate": 5.33971552344294e-06, + "loss": -0.0215, + "step": 2630 + }, + { + "clip_ratio": 0.004085538908839226, + "epoch": 0.09816706310340002, + "grad_norm": 0.08135131001472473, + "kl": 0.020263671875, + "learning_rate": 5.336802527127562e-06, + "loss": -0.0218, + "step": 2631 + }, + { + "clip_ratio": 0.0017033398034982383, + "completion_length": 687.3928833007812, + "epoch": 0.09820437479595168, + "grad_norm": 0.06627709418535233, + "kl": 0.0214996337890625, + "learning_rate": 5.333889599286814e-06, + "loss": 0.0573, + "num_tokens": 62077958.0, + "reward": 0.8946428410708904, + "reward_std": 0.020044592209160328, + "rewards/code_reward": 0.75, + "rewards/format_reward": 1.4464285969734192, + "step": 2632 + }, + { + "clip_ratio": 0.0017057275399565697, + "epoch": 0.09824168648850333, + "grad_norm": 0.06481917947530746, + "kl": 0.02203369140625, + "learning_rate": 5.330976741142905e-06, + "loss": 0.0574, + "step": 2633 + }, + { + "clip_ratio": 0.0016882892232388258, + "epoch": 0.09827899818105498, + "grad_norm": 0.056351032108068466, + "kl": 0.02288818359375, + "learning_rate": 5.328063953918013e-06, + "loss": 0.0571, + "step": 2634 + }, + { + "clip_ratio": 0.00258363428292796, + "completion_length": 605.6250305175781, + "epoch": 0.09831630987360664, + "grad_norm": 0.09587351232767105, + "kl": 0.0260009765625, + "learning_rate": 5.325151238834295e-06, + "loss": 0.0104, + "num_tokens": 62143079.0, + "reward": 0.7168461829423904, + "reward_std": 0.19002775847911835, + "rewards/code_reward": 0.5668461695313454, + "rewards/format_reward": 1.5, + "step": 2635 + }, + { + "clip_ratio": 0.002733082335907966, + "epoch": 0.09835362156615829, + "grad_norm": 0.08044493198394775, + "kl": 0.026458740234375, + "learning_rate": 5.322238597113866e-06, + "loss": 0.0101, + "step": 2636 + }, + { + "clip_ratio": 0.002752492728177458, + "epoch": 0.09839093325870994, + "grad_norm": 0.07300316542387009, + "kl": 0.02691650390625, + "learning_rate": 5.3193260299788214e-06, + "loss": 0.0099, + "step": 2637 + }, + { + "clip_ratio": 0.005029447900597006, + "completion_length": 645.2143249511719, + "epoch": 0.0984282449512616, + "grad_norm": 0.07583007216453552, + "kl": 0.025360107421875, + "learning_rate": 5.31641353865122e-06, + "loss": 0.0007, + "num_tokens": 62204635.0, + "reward": 0.2569979950785637, + "reward_std": 0.07102666120044887, + "rewards/code_reward": 0.1069979767780751, + "rewards/format_reward": 1.5, + "step": 2638 + }, + { + "clip_ratio": 0.004921930842101574, + "epoch": 0.09846555664381325, + "grad_norm": 0.0790710598230362, + "kl": 0.025665283203125, + "learning_rate": 5.313501124353087e-06, + "loss": 0.0004, + "step": 2639 + }, + { + "clip_ratio": 0.004694946575909853, + "epoch": 0.0985028683363649, + "grad_norm": 0.07221037894487381, + "kl": 0.025604248046875, + "learning_rate": 5.3105887883064176e-06, + "loss": 0.0002, + "step": 2640 + }, + { + "clip_ratio": 0.00241747876862064, + "completion_length": 755.7500305175781, + "epoch": 0.09854018002891657, + "grad_norm": 0.055667221546173096, + "kl": 0.01507568359375, + "learning_rate": 5.307676531733173e-06, + "loss": 0.0211, + "num_tokens": 62282895.0, + "reward": 0.6638251394033432, + "reward_std": 0.10327344946563244, + "rewards/code_reward": 0.5165037311380729, + "rewards/format_reward": 1.4732142984867096, + "step": 2641 + }, + { + "clip_ratio": 0.0021605839137919247, + "epoch": 0.09857749172146822, + "grad_norm": 0.08075658977031708, + "kl": 0.0193328857421875, + "learning_rate": 5.304764355855283e-06, + "loss": 0.0211, + "step": 2642 + }, + { + "clip_ratio": 0.0022015630966052413, + "epoch": 0.09861480341401987, + "grad_norm": 0.0627395436167717, + "kl": 0.01824951171875, + "learning_rate": 5.301852261894646e-06, + "loss": 0.0209, + "step": 2643 + }, + { + "clip_ratio": 0.003657354274764657, + "completion_length": 744.482177734375, + "epoch": 0.09865211510657153, + "grad_norm": 0.0853981152176857, + "kl": 0.02288818359375, + "learning_rate": 5.298940251073115e-06, + "loss": -0.0029, + "num_tokens": 62357206.0, + "reward": 0.7143034115433693, + "reward_std": 0.20299778319895267, + "rewards/code_reward": 0.5643033944070339, + "rewards/format_reward": 1.5, + "step": 2644 + }, + { + "clip_ratio": 0.0035560120595619082, + "epoch": 0.09868942679912318, + "grad_norm": 0.08461729437112808, + "kl": 0.023773193359375, + "learning_rate": 5.296028324612522e-06, + "loss": -0.0032, + "step": 2645 + }, + { + "clip_ratio": 0.0033216520096175373, + "epoch": 0.09872673849167483, + "grad_norm": 0.07848004251718521, + "kl": 0.02362060546875, + "learning_rate": 5.2931164837346545e-06, + "loss": -0.0036, + "step": 2646 + }, + { + "clip_ratio": 0.0038750868407078087, + "completion_length": 638.8750381469727, + "epoch": 0.09876405018422649, + "grad_norm": 0.07748715579509735, + "kl": 0.02398681640625, + "learning_rate": 5.290204729661267e-06, + "loss": 0.0739, + "num_tokens": 62418771.0, + "reward": 0.9792293608188629, + "reward_std": 0.24835481867194176, + "rewards/code_reward": 0.8345864713191986, + "rewards/format_reward": 1.4464285671710968, + "step": 2647 + }, + { + "clip_ratio": 0.0036750011204276234, + "epoch": 0.09880136187677814, + "grad_norm": 0.07522040605545044, + "kl": 0.02386474609375, + "learning_rate": 5.2872930636140795e-06, + "loss": 0.0738, + "step": 2648 + }, + { + "clip_ratio": 0.00385529815685004, + "epoch": 0.0988386735693298, + "grad_norm": 0.07757129520177841, + "kl": 0.02349853515625, + "learning_rate": 5.284381486814769e-06, + "loss": 0.0736, + "step": 2649 + }, + { + "clip_ratio": 0.0034513852442614734, + "completion_length": 599.0178985595703, + "epoch": 0.09887598526188145, + "grad_norm": 0.027359383180737495, + "kl": 0.022857666015625, + "learning_rate": 5.281470000484985e-06, + "loss": -0.0105, + "num_tokens": 62473232.0, + "reward": 0.40618812665343285, + "reward_std": 0.015737412497401237, + "rewards/code_reward": 0.2561881192959845, + "rewards/format_reward": 1.5, + "step": 2650 + }, + { + "clip_ratio": 0.0034680768148973584, + "epoch": 0.0989132969544331, + "grad_norm": 0.028518958017230034, + "kl": 0.023193359375, + "learning_rate": 5.278558605846327e-06, + "loss": -0.0106, + "step": 2651 + }, + { + "clip_ratio": 0.003276460396591574, + "epoch": 0.09895060864698475, + "grad_norm": 0.02614886313676834, + "kl": 0.022705078125, + "learning_rate": 5.275647304120364e-06, + "loss": -0.0106, + "step": 2652 + }, + { + "clip_ratio": 0.0046464502229355276, + "completion_length": 671.3035888671875, + "epoch": 0.0989879203395364, + "grad_norm": 0.08213555812835693, + "kl": 0.0178070068359375, + "learning_rate": 5.272736096528626e-06, + "loss": 0.0111, + "num_tokens": 62540001.0, + "reward": 0.4339197427034378, + "reward_std": 0.22415192425251007, + "rewards/code_reward": 0.28659830056130886, + "rewards/format_reward": 1.4732142984867096, + "step": 2653 + }, + { + "clip_ratio": 0.004901550244539976, + "epoch": 0.09902523203208806, + "grad_norm": 0.06896426528692245, + "kl": 0.0183258056640625, + "learning_rate": 5.269824984292599e-06, + "loss": 0.0111, + "step": 2654 + }, + { + "clip_ratio": 0.004404248611535877, + "epoch": 0.09906254372463971, + "grad_norm": 0.07151543349027634, + "kl": 0.0180816650390625, + "learning_rate": 5.266913968633733e-06, + "loss": 0.0108, + "step": 2655 + }, + { + "clip_ratio": 0.003840220917481929, + "completion_length": 831.8036193847656, + "epoch": 0.09909985541719137, + "grad_norm": 0.07914257049560547, + "kl": 0.021148681640625, + "learning_rate": 5.264003050773438e-06, + "loss": 0.0592, + "num_tokens": 62614736.0, + "reward": 0.5601803623139858, + "reward_std": 0.20243677217513323, + "rewards/code_reward": 0.4128588940948248, + "rewards/format_reward": 1.4732142984867096, + "step": 2656 + }, + { + "clip_ratio": 0.0036641404149122536, + "epoch": 0.09913716710974302, + "grad_norm": 0.07785462588071823, + "kl": 0.021575927734375, + "learning_rate": 5.2610922319330786e-06, + "loss": 0.059, + "step": 2657 + }, + { + "clip_ratio": 0.0033806615974754095, + "epoch": 0.09917447880229467, + "grad_norm": 0.07094573229551315, + "kl": 0.021575927734375, + "learning_rate": 5.25818151333398e-06, + "loss": 0.0586, + "step": 2658 + }, + { + "clip_ratio": 0.003802258986979723, + "completion_length": 823.0357513427734, + "epoch": 0.09921179049484632, + "grad_norm": 0.0963192954659462, + "kl": 0.019287109375, + "learning_rate": 5.255270896197426e-06, + "loss": -0.009, + "num_tokens": 62693068.0, + "reward": 0.45158088207244873, + "reward_std": 0.2050174114992842, + "rewards/code_reward": 0.30425944179296494, + "rewards/format_reward": 1.4732142984867096, + "step": 2659 + }, + { + "clip_ratio": 0.0038189615588635206, + "epoch": 0.09924910218739798, + "grad_norm": 0.07829467207193375, + "kl": 0.019134521484375, + "learning_rate": 5.252360381744658e-06, + "loss": -0.0094, + "step": 2660 + }, + { + "clip_ratio": 0.0038053180905990303, + "epoch": 0.09928641387994963, + "grad_norm": 0.07962943613529205, + "kl": 0.019561767578125, + "learning_rate": 5.249449971196875e-06, + "loss": -0.0093, + "step": 2661 + }, + { + "clip_ratio": 0.0019756039255298674, + "completion_length": 553.6428756713867, + "epoch": 0.09932372557250128, + "grad_norm": 0.049917418509721756, + "kl": 0.0208892822265625, + "learning_rate": 5.24653966577523e-06, + "loss": -0.0015, + "num_tokens": 62747606.0, + "reward": 0.7254192307591438, + "reward_std": 0.11958577111363411, + "rewards/code_reward": 0.5772049520164728, + "rewards/format_reward": 1.4821428656578064, + "step": 2662 + }, + { + "clip_ratio": 0.0018370641046203673, + "epoch": 0.09936103726505294, + "grad_norm": 0.0538790188729763, + "kl": 0.0206146240234375, + "learning_rate": 5.243629466700833e-06, + "loss": -0.0015, + "step": 2663 + }, + { + "clip_ratio": 0.0020474593620747328, + "epoch": 0.09939834895760459, + "grad_norm": 0.04641196131706238, + "kl": 0.0206756591796875, + "learning_rate": 5.240719375194748e-06, + "loss": -0.0015, + "step": 2664 + }, + { + "clip_ratio": 0.00261801719898358, + "completion_length": 611.7321624755859, + "epoch": 0.09943566065015624, + "grad_norm": 0.08050066232681274, + "kl": 0.017181396484375, + "learning_rate": 5.237809392477998e-06, + "loss": 0.0139, + "num_tokens": 62811893.0, + "reward": 1.006150797009468, + "reward_std": 0.25901833921670914, + "rewards/code_reward": 0.8561507910490036, + "rewards/format_reward": 1.5, + "step": 2665 + }, + { + "clip_ratio": 0.0024156117578968406, + "epoch": 0.0994729723427079, + "grad_norm": 0.08000346273183823, + "kl": 0.0166473388671875, + "learning_rate": 5.2348995197715565e-06, + "loss": 0.0136, + "step": 2666 + }, + { + "clip_ratio": 0.00249588571023196, + "epoch": 0.09951028403525955, + "grad_norm": 0.07336444407701492, + "kl": 0.017120361328125, + "learning_rate": 5.231989758296352e-06, + "loss": 0.0133, + "step": 2667 + }, + { + "clip_ratio": 0.0036112541565671563, + "completion_length": 645.9464721679688, + "epoch": 0.0995475957278112, + "grad_norm": 0.08030203729867935, + "kl": 0.02471923828125, + "learning_rate": 5.229080109273268e-06, + "loss": 0.0029, + "num_tokens": 62881722.0, + "reward": 0.5959451198577881, + "reward_std": 0.14160800422541797, + "rewards/code_reward": 0.4459450919239316, + "rewards/format_reward": 1.5, + "step": 2668 + }, + { + "clip_ratio": 0.0036722126533277333, + "epoch": 0.09958490742036286, + "grad_norm": 0.07881763577461243, + "kl": 0.024078369140625, + "learning_rate": 5.226170573923136e-06, + "loss": 0.0029, + "step": 2669 + }, + { + "clip_ratio": 0.0037088695098645985, + "epoch": 0.09962221911291451, + "grad_norm": 0.07644681632518768, + "kl": 0.024169921875, + "learning_rate": 5.223261153466744e-06, + "loss": 0.0027, + "step": 2670 + }, + { + "clip_ratio": 0.0030278577469289303, + "completion_length": 613.3393249511719, + "epoch": 0.09965953080546616, + "grad_norm": 0.07210074365139008, + "kl": 0.0230712890625, + "learning_rate": 5.2203518491248325e-06, + "loss": 0.0008, + "num_tokens": 62942123.0, + "reward": 0.7578625790774822, + "reward_std": 0.0980701893568039, + "rewards/code_reward": 0.6078625470399857, + "rewards/format_reward": 1.5, + "step": 2671 + }, + { + "clip_ratio": 0.0032845194218680263, + "epoch": 0.09969684249801782, + "grad_norm": 0.05029135197401047, + "kl": 0.02398681640625, + "learning_rate": 5.217442662118091e-06, + "loss": 0.0008, + "step": 2672 + }, + { + "clip_ratio": 0.002816210617311299, + "epoch": 0.09973415419056947, + "grad_norm": 0.039281196892261505, + "kl": 0.02362060546875, + "learning_rate": 5.2145335936671596e-06, + "loss": 0.0008, + "step": 2673 + }, + { + "clip_ratio": 0.0016251910710707307, + "completion_length": 612.4107360839844, + "epoch": 0.09977146588312112, + "grad_norm": 0.05236806347966194, + "kl": 0.0209808349609375, + "learning_rate": 5.211624644992629e-06, + "loss": 0.038, + "num_tokens": 63001702.0, + "reward": 0.8701618127524853, + "reward_std": 0.027504166588187218, + "rewards/code_reward": 0.7228403858898673, + "rewards/format_reward": 1.4732142984867096, + "step": 2674 + }, + { + "clip_ratio": 0.0017036927165463567, + "epoch": 0.09980877757567277, + "grad_norm": 0.047621238976716995, + "kl": 0.020843505859375, + "learning_rate": 5.20871581731504e-06, + "loss": 0.038, + "step": 2675 + }, + { + "clip_ratio": 0.0012721040984615684, + "epoch": 0.09984608926822443, + "grad_norm": 0.044708508998155594, + "kl": 0.0203857421875, + "learning_rate": 5.2058071118548835e-06, + "loss": 0.0378, + "step": 2676 + }, + { + "clip_ratio": 0.004611779935657978, + "completion_length": 837.7143096923828, + "epoch": 0.09988340096077608, + "grad_norm": 0.13388440012931824, + "kl": 0.0424346923828125, + "learning_rate": 5.2028985298325985e-06, + "loss": -0.0091, + "num_tokens": 63082396.0, + "reward": 0.703006811439991, + "reward_std": 0.2295708768069744, + "rewards/code_reward": 0.553006786853075, + "rewards/format_reward": 1.5, + "step": 2677 + }, + { + "clip_ratio": 0.003941935952752829, + "epoch": 0.09992071265332773, + "grad_norm": 0.0873541384935379, + "kl": 0.0297088623046875, + "learning_rate": 5.199990072468573e-06, + "loss": -0.0094, + "step": 2678 + }, + { + "clip_ratio": 0.003981965652201325, + "epoch": 0.09995802434587939, + "grad_norm": 0.08329115062952042, + "kl": 0.029449462890625, + "learning_rate": 5.197081740983138e-06, + "loss": -0.0097, + "step": 2679 + }, + { + "clip_ratio": 0.003933412488549948, + "completion_length": 757.6607513427734, + "epoch": 0.09999533603843104, + "grad_norm": 0.067487433552742, + "kl": 0.0196533203125, + "learning_rate": 5.194173536596577e-06, + "loss": -0.0037, + "num_tokens": 63161169.0, + "reward": 0.7562889419496059, + "reward_std": 0.15713631734251976, + "rewards/code_reward": 0.6062889248132706, + "rewards/format_reward": 1.5, + "step": 2680 + }, + { + "clip_ratio": 0.0038617680547758937, + "epoch": 0.1000326477309827, + "grad_norm": 0.07005442678928375, + "kl": 0.01971435546875, + "learning_rate": 5.191265460529122e-06, + "loss": -0.0037, + "step": 2681 + }, + { + "clip_ratio": 0.0033484692103229463, + "epoch": 0.10006995942353435, + "grad_norm": 0.06474132835865021, + "kl": 0.0198211669921875, + "learning_rate": 5.188357514000944e-06, + "loss": -0.0041, + "step": 2682 + }, + { + "clip_ratio": 0.0011270211543887854, + "completion_length": 549.6428756713867, + "epoch": 0.100107271116086, + "grad_norm": 0.041950568556785583, + "kl": 0.0219268798828125, + "learning_rate": 5.1854496982321624e-06, + "loss": 0.0002, + "num_tokens": 63224513.0, + "reward": 0.9017421528697014, + "reward_std": 0.06419406831264496, + "rewards/code_reward": 0.7517421655356884, + "rewards/format_reward": 1.5, + "step": 2683 + }, + { + "clip_ratio": 0.0011309011606499553, + "epoch": 0.10014458280863765, + "grad_norm": 0.04456527158617973, + "kl": 0.0221710205078125, + "learning_rate": 5.182542014442846e-06, + "loss": 0.0004, + "step": 2684 + }, + { + "clip_ratio": 0.0010828948579728603, + "epoch": 0.1001818945011893, + "grad_norm": 0.03989411145448685, + "kl": 0.0220489501953125, + "learning_rate": 5.1796344638530025e-06, + "loss": 0.0002, + "step": 2685 + }, + { + "clip_ratio": 0.0013319432036951184, + "completion_length": 687.8750457763672, + "epoch": 0.10021920619374096, + "grad_norm": 0.04200589284300804, + "kl": 0.0186767578125, + "learning_rate": 5.1767270476825894e-06, + "loss": 0.0766, + "num_tokens": 63304110.0, + "reward": 0.8041015528142452, + "reward_std": 0.1369577357545495, + "rewards/code_reward": 0.6594587117433548, + "rewards/format_reward": 1.4464285671710968, + "step": 2686 + }, + { + "clip_ratio": 0.0016511001158505678, + "epoch": 0.10025651788629261, + "grad_norm": 0.04537477344274521, + "kl": 0.01910400390625, + "learning_rate": 5.1738197671515e-06, + "loss": 0.0768, + "step": 2687 + }, + { + "clip_ratio": 0.00135240942472592, + "epoch": 0.10029382957884427, + "grad_norm": 0.0432913564145565, + "kl": 0.0189056396484375, + "learning_rate": 5.170912623479577e-06, + "loss": 0.0765, + "step": 2688 + }, + { + "clip_ratio": 0.0037190100410953164, + "completion_length": 641.803596496582, + "epoch": 0.10033114127139592, + "grad_norm": 0.04298630729317665, + "kl": 0.0196075439453125, + "learning_rate": 5.1680056178866045e-06, + "loss": -0.001, + "num_tokens": 63373805.0, + "reward": 0.5607142820954323, + "reward_std": 0.12431129068136215, + "rewards/code_reward": 0.4107142835855484, + "rewards/format_reward": 1.5, + "step": 2689 + }, + { + "clip_ratio": 0.0036812001490034163, + "epoch": 0.10036845296394757, + "grad_norm": 0.04487678036093712, + "kl": 0.02008056640625, + "learning_rate": 5.165098751592308e-06, + "loss": -0.0009, + "step": 2690 + }, + { + "clip_ratio": 0.0030185228097252548, + "epoch": 0.10040576465649922, + "grad_norm": 0.040565866976976395, + "kl": 0.01971435546875, + "learning_rate": 5.1621920258163565e-06, + "loss": -0.0011, + "step": 2691 + }, + { + "clip_ratio": 0.0020141872810199857, + "completion_length": 577.4464416503906, + "epoch": 0.10044307634905088, + "grad_norm": 0.04514557868242264, + "kl": 0.020263671875, + "learning_rate": 5.159285441778352e-06, + "loss": 0.0075, + "num_tokens": 63435486.0, + "reward": 0.6767857111990452, + "reward_std": 0.053226910531520844, + "rewards/code_reward": 0.5267857145518064, + "rewards/format_reward": 1.5, + "step": 2692 + }, + { + "clip_ratio": 0.002330393355805427, + "epoch": 0.10048038804160253, + "grad_norm": 0.04501254856586456, + "kl": 0.020263671875, + "learning_rate": 5.156379000697849e-06, + "loss": 0.0074, + "step": 2693 + }, + { + "clip_ratio": 0.002365942928008735, + "epoch": 0.10051769973415418, + "grad_norm": 0.0439479760825634, + "kl": 0.02001953125, + "learning_rate": 5.153472703794335e-06, + "loss": 0.0074, + "step": 2694 + }, + { + "clip_ratio": 0.002904502092860639, + "completion_length": 648.3571701049805, + "epoch": 0.10055501142670584, + "grad_norm": 0.03708072006702423, + "kl": 0.022003173828125, + "learning_rate": 5.150566552287236e-06, + "loss": 0.0132, + "num_tokens": 63502832.0, + "reward": 0.6178571507334709, + "reward_std": 0.08229204267263412, + "rewards/code_reward": 0.46785713732242584, + "rewards/format_reward": 1.5, + "step": 2695 + }, + { + "clip_ratio": 0.0029059029184281826, + "epoch": 0.1005923231192575, + "grad_norm": 0.04044182226061821, + "kl": 0.02227783203125, + "learning_rate": 5.147660547395923e-06, + "loss": 0.0133, + "step": 2696 + }, + { + "clip_ratio": 0.002993398520629853, + "epoch": 0.10062963481180916, + "grad_norm": 0.040453795343637466, + "kl": 0.022430419921875, + "learning_rate": 5.1447546903396975e-06, + "loss": 0.0132, + "step": 2697 + }, + { + "clip_ratio": 0.00300467578927055, + "completion_length": 728.1428909301758, + "epoch": 0.10066694650436081, + "grad_norm": 0.06593628227710724, + "kl": 0.0202789306640625, + "learning_rate": 5.141848982337805e-06, + "loss": -0.0227, + "num_tokens": 63567786.0, + "reward": 0.6154258921742439, + "reward_std": 0.2460191212594509, + "rewards/code_reward": 0.46542586013674736, + "rewards/format_reward": 1.5, + "step": 2698 + }, + { + "clip_ratio": 0.0026794408913701773, + "epoch": 0.10070425819691246, + "grad_norm": 0.0647796168923378, + "kl": 0.0198974609375, + "learning_rate": 5.138943424609428e-06, + "loss": -0.0226, + "step": 2699 + }, + { + "clip_ratio": 0.0029788849933538586, + "epoch": 0.10074156988946412, + "grad_norm": 0.06410941481590271, + "kl": 0.01953125, + "learning_rate": 5.136038018373682e-06, + "loss": -0.0228, + "step": 2700 + }, + { + "clip_ratio": 0.004626414447557181, + "completion_length": 637.1964569091797, + "epoch": 0.10077888158201577, + "grad_norm": 0.09590167552232742, + "kl": 0.0194244384765625, + "learning_rate": 5.133132764849623e-06, + "loss": 0.0196, + "num_tokens": 63626597.0, + "reward": 0.5207449160516262, + "reward_std": 0.24582261219620705, + "rewards/code_reward": 0.3707449147477746, + "rewards/format_reward": 1.5, + "step": 2701 + }, + { + "clip_ratio": 0.004241666116286069, + "epoch": 0.10081619327456742, + "grad_norm": 0.08656599372625351, + "kl": 0.0193328857421875, + "learning_rate": 5.130227665256241e-06, + "loss": 0.0193, + "step": 2702 + }, + { + "clip_ratio": 0.0036073621013201773, + "epoch": 0.10085350496711908, + "grad_norm": 0.08662063628435135, + "kl": 0.0198974609375, + "learning_rate": 5.127322720812462e-06, + "loss": 0.0189, + "step": 2703 + }, + { + "clip_ratio": 0.003437722392845899, + "completion_length": 717.6250228881836, + "epoch": 0.10089081665967073, + "grad_norm": 0.11764491349458694, + "kl": 0.018951416015625, + "learning_rate": 5.124417932737148e-06, + "loss": 0.0442, + "num_tokens": 63698508.0, + "reward": 0.5580357350409031, + "reward_std": 0.2412269813939929, + "rewards/code_reward": 0.4107142835855484, + "rewards/format_reward": 1.4732142984867096, + "step": 2704 + }, + { + "clip_ratio": 0.003253320057410747, + "epoch": 0.10092812835222238, + "grad_norm": 0.08070947974920273, + "kl": 0.01922607421875, + "learning_rate": 5.121513302249091e-06, + "loss": 0.044, + "step": 2705 + }, + { + "clip_ratio": 0.003196197794750333, + "epoch": 0.10096544004477404, + "grad_norm": 0.07177864015102386, + "kl": 0.019378662109375, + "learning_rate": 5.118608830567024e-06, + "loss": 0.0436, + "step": 2706 + }, + { + "clip_ratio": 0.0033972591627389193, + "completion_length": 696.9464721679688, + "epoch": 0.10100275173732569, + "grad_norm": 0.08653799444437027, + "kl": 0.0220489501953125, + "learning_rate": 5.115704518909605e-06, + "loss": 0.0039, + "num_tokens": 63777045.0, + "reward": 0.6555734239518642, + "reward_std": 0.03490676824003458, + "rewards/code_reward": 0.5055734282359481, + "rewards/format_reward": 1.5, + "step": 2707 + }, + { + "clip_ratio": 0.003335279645398259, + "epoch": 0.10104006342987734, + "grad_norm": 0.11033808439970016, + "kl": 0.0222015380859375, + "learning_rate": 5.11280036849543e-06, + "loss": 0.0034, + "step": 2708 + }, + { + "clip_ratio": 0.002954955445602536, + "epoch": 0.101077375122429, + "grad_norm": 0.0679280161857605, + "kl": 0.023040771484375, + "learning_rate": 5.109896380543031e-06, + "loss": 0.0033, + "step": 2709 + }, + { + "clip_ratio": 0.0031782840378582478, + "completion_length": 668.1250381469727, + "epoch": 0.10111468681498065, + "grad_norm": 0.058298639953136444, + "kl": 0.028411865234375, + "learning_rate": 5.1069925562708634e-06, + "loss": -0.0018, + "num_tokens": 63855172.0, + "reward": 0.6327283419668674, + "reward_std": 0.06830357271246612, + "rewards/code_reward": 0.48272833082592115, + "rewards/format_reward": 1.5, + "step": 2710 + }, + { + "clip_ratio": 0.004020522348582745, + "epoch": 0.1011519985075323, + "grad_norm": 0.05352403596043587, + "kl": 0.0280609130859375, + "learning_rate": 5.104088896897319e-06, + "loss": -0.0018, + "step": 2711 + }, + { + "clip_ratio": 0.003837636671960354, + "epoch": 0.10118931020008395, + "grad_norm": 0.05192873999476433, + "kl": 0.027435302734375, + "learning_rate": 5.101185403640717e-06, + "loss": -0.0021, + "step": 2712 + }, + { + "clip_ratio": 0.0029478889191523194, + "completion_length": 576.0893173217773, + "epoch": 0.10122662189263561, + "grad_norm": 0.06368640065193176, + "kl": 0.020538330078125, + "learning_rate": 5.098282077719313e-06, + "loss": -0.0039, + "num_tokens": 63922853.0, + "reward": 0.7697479240596294, + "reward_std": 0.16881820559501648, + "rewards/code_reward": 0.6197478920221329, + "rewards/format_reward": 1.5, + "step": 2713 + }, + { + "clip_ratio": 0.0028146691038273275, + "epoch": 0.10126393358518726, + "grad_norm": 0.06350359320640564, + "kl": 0.020355224609375, + "learning_rate": 5.095378920351288e-06, + "loss": -0.0041, + "step": 2714 + }, + { + "clip_ratio": 0.0027597080916166306, + "epoch": 0.10130124527773891, + "grad_norm": 0.05863846465945244, + "kl": 0.02032470703125, + "learning_rate": 5.09247593275475e-06, + "loss": -0.0043, + "step": 2715 + }, + { + "clip_ratio": 0.004130390239879489, + "completion_length": 716.357177734375, + "epoch": 0.10133855697029057, + "grad_norm": 0.056541092693805695, + "kl": 0.02264404296875, + "learning_rate": 5.089573116147743e-06, + "loss": -0.0096, + "num_tokens": 63989779.0, + "reward": 0.4061063416302204, + "reward_std": 0.13483741506934166, + "rewards/code_reward": 0.2561063254252076, + "rewards/format_reward": 1.5, + "step": 2716 + }, + { + "clip_ratio": 0.00394246936775744, + "epoch": 0.10137586866284222, + "grad_norm": 0.05462857335805893, + "kl": 0.0223388671875, + "learning_rate": 5.086670471748232e-06, + "loss": -0.01, + "step": 2717 + }, + { + "clip_ratio": 0.00403546157758683, + "epoch": 0.10141318035539387, + "grad_norm": 0.055590949952602386, + "kl": 0.02288818359375, + "learning_rate": 5.083768000774112e-06, + "loss": -0.01, + "step": 2718 + }, + { + "clip_ratio": 0.0039025036385282874, + "completion_length": 794.5714569091797, + "epoch": 0.10145049204794553, + "grad_norm": 0.08068642020225525, + "kl": 0.016876220703125, + "learning_rate": 5.0808657044432075e-06, + "loss": 0.0071, + "num_tokens": 64064431.0, + "reward": 0.4631805121898651, + "reward_std": 0.1595055777579546, + "rewards/code_reward": 0.31318049039691687, + "rewards/format_reward": 1.5, + "step": 2719 + }, + { + "clip_ratio": 0.0035227860789746046, + "epoch": 0.10148780374049718, + "grad_norm": 0.08063631504774094, + "kl": 0.0167694091796875, + "learning_rate": 5.077963583973268e-06, + "loss": 0.0067, + "step": 2720 + }, + { + "clip_ratio": 0.003576119721401483, + "epoch": 0.10152511543304883, + "grad_norm": 0.07378783077001572, + "kl": 0.01727294921875, + "learning_rate": 5.075061640581967e-06, + "loss": 0.0064, + "step": 2721 + }, + { + "clip_ratio": 0.005062172363977879, + "completion_length": 685.6428756713867, + "epoch": 0.10156242712560049, + "grad_norm": 0.07747272402048111, + "kl": 0.025390625, + "learning_rate": 5.072159875486912e-06, + "loss": -0.0209, + "num_tokens": 64135741.0, + "reward": 0.21055517345666885, + "reward_std": 0.11213696049526334, + "rewards/code_reward": 0.060555160976946354, + "rewards/format_reward": 1.5, + "step": 2722 + }, + { + "clip_ratio": 0.0054923962452448905, + "epoch": 0.10159973881815214, + "grad_norm": 0.0760573297739029, + "kl": 0.0255126953125, + "learning_rate": 5.0692582899056244e-06, + "loss": -0.021, + "step": 2723 + }, + { + "clip_ratio": 0.005616007256321609, + "epoch": 0.10163705051070379, + "grad_norm": 0.07438289374113083, + "kl": 0.025299072265625, + "learning_rate": 5.0663568850555566e-06, + "loss": -0.0211, + "step": 2724 + }, + { + "clip_ratio": 0.003271907742600888, + "completion_length": 729.4107513427734, + "epoch": 0.10167436220325544, + "grad_norm": 0.07991227507591248, + "kl": 0.019683837890625, + "learning_rate": 5.063455662154084e-06, + "loss": -0.0099, + "num_tokens": 64208200.0, + "reward": 0.5396681129932404, + "reward_std": 0.16214095056056976, + "rewards/code_reward": 0.38966810423880816, + "rewards/format_reward": 1.5, + "step": 2725 + }, + { + "clip_ratio": 0.0031201091478578746, + "epoch": 0.1017116738958071, + "grad_norm": 0.07260597497224808, + "kl": 0.01971435546875, + "learning_rate": 5.060554622418507e-06, + "loss": -0.0101, + "step": 2726 + }, + { + "clip_ratio": 0.003385394928045571, + "epoch": 0.10174898558835875, + "grad_norm": 0.09174107760190964, + "kl": 0.020416259765625, + "learning_rate": 5.0576537670660495e-06, + "loss": -0.0104, + "step": 2727 + }, + { + "clip_ratio": 0.0020915628410875797, + "completion_length": 578.0357360839844, + "epoch": 0.1017862972809104, + "grad_norm": 0.0688282921910286, + "kl": 0.026580810546875, + "learning_rate": 5.054753097313852e-06, + "loss": 0.0054, + "num_tokens": 64269970.0, + "reward": 0.8910448998212814, + "reward_std": 0.16989681869745255, + "rewards/code_reward": 0.7410449087619781, + "rewards/format_reward": 1.5, + "step": 2728 + }, + { + "clip_ratio": 0.0019390748930163682, + "epoch": 0.10182360897346206, + "grad_norm": 0.06952356547117233, + "kl": 0.02587890625, + "learning_rate": 5.051852614378985e-06, + "loss": 0.0054, + "step": 2729 + }, + { + "clip_ratio": 0.0020554884686134756, + "epoch": 0.10186092066601371, + "grad_norm": 0.06322529166936874, + "kl": 0.027191162109375, + "learning_rate": 5.048952319478436e-06, + "loss": 0.0052, + "step": 2730 + }, + { + "clip_ratio": 0.0030996655696071684, + "completion_length": 814.1429061889648, + "epoch": 0.10189823235856536, + "grad_norm": 0.03587973117828369, + "kl": 0.024078369140625, + "learning_rate": 5.046052213829116e-06, + "loss": 0.002, + "num_tokens": 64349358.0, + "reward": 0.4412946552038193, + "reward_std": 0.014512230642139912, + "rewards/code_reward": 0.2912946417927742, + "rewards/format_reward": 1.5, + "step": 2731 + }, + { + "clip_ratio": 0.0030609495006501675, + "epoch": 0.10193554405111702, + "grad_norm": 0.03342793881893158, + "kl": 0.021697998046875, + "learning_rate": 5.043152298647855e-06, + "loss": 0.002, + "step": 2732 + }, + { + "clip_ratio": 0.003087480494286865, + "epoch": 0.10197285574366867, + "grad_norm": 0.041356224566698074, + "kl": 0.02520751953125, + "learning_rate": 5.040252575151401e-06, + "loss": 0.0021, + "step": 2733 + }, + { + "clip_ratio": 0.0032601410639472306, + "completion_length": 804.6607513427734, + "epoch": 0.10201016743622032, + "grad_norm": 0.07591377198696136, + "kl": 0.01522064208984375, + "learning_rate": 5.037353044556429e-06, + "loss": -0.0001, + "num_tokens": 64432583.0, + "reward": 0.5878948122262955, + "reward_std": 0.3692503944039345, + "rewards/code_reward": 0.4378948099911213, + "rewards/format_reward": 1.5, + "step": 2734 + }, + { + "clip_ratio": 0.003314079949632287, + "epoch": 0.10204747912877198, + "grad_norm": 0.07410301268100739, + "kl": 0.0153961181640625, + "learning_rate": 5.0344537080795246e-06, + "loss": -0.0004, + "step": 2735 + }, + { + "clip_ratio": 0.0032557156519033015, + "epoch": 0.10208479082132363, + "grad_norm": 0.07010893523693085, + "kl": 0.0152587890625, + "learning_rate": 5.031554566937195e-06, + "loss": -0.0005, + "step": 2736 + }, + { + "clip_ratio": 0.0031178451608866453, + "completion_length": 697.8036041259766, + "epoch": 0.10212210251387528, + "grad_norm": 0.061817340552806854, + "kl": 0.022125244140625, + "learning_rate": 5.028655622345871e-06, + "loss": 0.0009, + "num_tokens": 64497042.0, + "reward": 0.4890275374054909, + "reward_std": 0.15809911489486694, + "rewards/code_reward": 0.3390275314450264, + "rewards/format_reward": 1.5, + "step": 2737 + }, + { + "clip_ratio": 0.00344135012710467, + "epoch": 0.10215941420642694, + "grad_norm": 0.060621581971645355, + "kl": 0.0223388671875, + "learning_rate": 5.025756875521889e-06, + "loss": 0.0007, + "step": 2738 + }, + { + "clip_ratio": 0.00310094328597188, + "epoch": 0.10219672589897859, + "grad_norm": 0.059722863137722015, + "kl": 0.0224609375, + "learning_rate": 5.022858327681515e-06, + "loss": 0.0006, + "step": 2739 + }, + { + "clip_ratio": 0.003380353969987482, + "completion_length": 821.8036117553711, + "epoch": 0.10223403759153024, + "grad_norm": 0.0921674594283104, + "kl": 0.020904541015625, + "learning_rate": 5.019959980040921e-06, + "loss": -0.0002, + "num_tokens": 64579631.0, + "reward": 0.3746810257434845, + "reward_std": 0.16532692674081773, + "rewards/code_reward": 0.22468098971876316, + "rewards/format_reward": 1.5, + "step": 2740 + }, + { + "clip_ratio": 0.003185894340276718, + "epoch": 0.1022713492840819, + "grad_norm": 0.09493301808834076, + "kl": 0.020355224609375, + "learning_rate": 5.017061833816203e-06, + "loss": -0.0001, + "step": 2741 + }, + { + "clip_ratio": 0.003083643037825823, + "epoch": 0.10230866097663355, + "grad_norm": 0.07996547967195511, + "kl": 0.021026611328125, + "learning_rate": 5.014163890223371e-06, + "loss": -0.0007, + "step": 2742 + }, + { + "clip_ratio": 0.0028766169562004507, + "completion_length": 486.35717010498047, + "epoch": 0.1023459726691852, + "grad_norm": 0.06799623370170593, + "kl": 0.02520751953125, + "learning_rate": 5.011266150478344e-06, + "loss": -0.0052, + "num_tokens": 64635693.0, + "reward": 0.6485486216843128, + "reward_std": 0.06099618785083294, + "rewards/code_reward": 0.49854861106723547, + "rewards/format_reward": 1.5, + "step": 2743 + }, + { + "clip_ratio": 0.0027044740272685885, + "epoch": 0.10238328436173685, + "grad_norm": 0.06729774922132492, + "kl": 0.025421142578125, + "learning_rate": 5.008368615796963e-06, + "loss": -0.0055, + "step": 2744 + }, + { + "clip_ratio": 0.0029199872515164316, + "epoch": 0.1024205960542885, + "grad_norm": 0.06270746141672134, + "kl": 0.025115966796875, + "learning_rate": 5.005471287394978e-06, + "loss": -0.0057, + "step": 2745 + }, + { + "clip_ratio": 0.00228181789861992, + "completion_length": 613.9464492797852, + "epoch": 0.10245790774684016, + "grad_norm": 0.07471325993537903, + "kl": 0.025665283203125, + "learning_rate": 5.002574166488053e-06, + "loss": 0.0027, + "num_tokens": 64702680.0, + "reward": 0.9429391324520111, + "reward_std": 0.1501077264547348, + "rewards/code_reward": 0.792939119040966, + "rewards/format_reward": 1.5, + "step": 2746 + }, + { + "clip_ratio": 0.0023983444552868605, + "epoch": 0.10249521943939181, + "grad_norm": 0.09709218889474869, + "kl": 0.026031494140625, + "learning_rate": 4.99967725429177e-06, + "loss": 0.0026, + "step": 2747 + }, + { + "clip_ratio": 0.001992570178117603, + "epoch": 0.10253253113194347, + "grad_norm": 0.06752099096775055, + "kl": 0.025604248046875, + "learning_rate": 4.996780552021616e-06, + "loss": 0.0021, + "step": 2748 + }, + { + "clip_ratio": 0.004134607093874365, + "completion_length": 736.0714416503906, + "epoch": 0.10256984282449512, + "grad_norm": 0.08214205503463745, + "kl": 0.029876708984375, + "learning_rate": 4.9938840608929954e-06, + "loss": 0.0083, + "num_tokens": 64777318.0, + "reward": 0.6114679612219334, + "reward_std": 0.19957662434899248, + "rewards/code_reward": 0.46146794464584673, + "rewards/format_reward": 1.5, + "step": 2749 + }, + { + "clip_ratio": 0.0035788804525509477, + "epoch": 0.10260715451704679, + "grad_norm": 0.07799065858125687, + "kl": 0.02606201171875, + "learning_rate": 4.9909877821212215e-06, + "loss": 0.0079, + "step": 2750 + }, + { + "clip_ratio": 0.003875419613905251, + "epoch": 0.10264446620959844, + "grad_norm": 0.07595808058977127, + "kl": 0.0266571044921875, + "learning_rate": 4.988091716921518e-06, + "loss": 0.0078, + "step": 2751 + }, + { + "clip_ratio": 0.004320425039622933, + "completion_length": 563.3571701049805, + "epoch": 0.10268177790215009, + "grad_norm": 0.09090859442949295, + "kl": 0.02923583984375, + "learning_rate": 4.985195866509024e-06, + "loss": 0.0053, + "num_tokens": 64841934.0, + "reward": 0.3819549009203911, + "reward_std": 0.3020145818591118, + "rewards/code_reward": 0.2319548800587654, + "rewards/format_reward": 1.5, + "step": 2752 + }, + { + "clip_ratio": 0.004133278853259981, + "epoch": 0.10271908959470175, + "grad_norm": 0.0879095271229744, + "kl": 0.02813720703125, + "learning_rate": 4.982300232098779e-06, + "loss": 0.0051, + "step": 2753 + }, + { + "clip_ratio": 0.004474422545172274, + "epoch": 0.1027564012872534, + "grad_norm": 0.0746108666062355, + "kl": 0.02886962890625, + "learning_rate": 4.979404814905739e-06, + "loss": 0.0048, + "step": 2754 + }, + { + "clip_ratio": 0.00499927275814116, + "completion_length": 679.0714569091797, + "epoch": 0.10279371297980505, + "grad_norm": 0.11571905016899109, + "kl": 0.02496337890625, + "learning_rate": 4.976509616144771e-06, + "loss": -0.0005, + "num_tokens": 64913352.0, + "reward": 0.4229273237287998, + "reward_std": 0.19101178529672325, + "rewards/code_reward": 0.27292729972396046, + "rewards/format_reward": 1.5, + "step": 2755 + }, + { + "clip_ratio": 0.005337247741408646, + "epoch": 0.1028310246723567, + "grad_norm": 0.10058558732271194, + "kl": 0.02508544921875, + "learning_rate": 4.973614637030643e-06, + "loss": -0.0004, + "step": 2756 + }, + { + "clip_ratio": 0.0045281078782863915, + "epoch": 0.10286833636490836, + "grad_norm": 0.10022944211959839, + "kl": 0.0255126953125, + "learning_rate": 4.970719878778036e-06, + "loss": -0.001, + "step": 2757 + }, + { + "clip_ratio": 0.004468523955438286, + "completion_length": 610.8036041259766, + "epoch": 0.10290564805746001, + "grad_norm": 0.09185821563005447, + "kl": 0.022308349609375, + "learning_rate": 4.967825342601535e-06, + "loss": -0.0061, + "num_tokens": 64970657.0, + "reward": 0.6567306742072105, + "reward_std": 0.21808872371912003, + "rewards/code_reward": 0.5067306272685528, + "rewards/format_reward": 1.5, + "step": 2758 + }, + { + "clip_ratio": 0.003909737046342343, + "epoch": 0.10294295975001166, + "grad_norm": 0.09002004563808441, + "kl": 0.02276611328125, + "learning_rate": 4.964931029715634e-06, + "loss": -0.0065, + "step": 2759 + }, + { + "clip_ratio": 0.0035905138356611133, + "epoch": 0.10298027144256332, + "grad_norm": 0.08401764184236526, + "kl": 0.022216796875, + "learning_rate": 4.962036941334736e-06, + "loss": -0.0069, + "step": 2760 + }, + { + "clip_ratio": 0.002345920482184738, + "completion_length": 629.7678833007812, + "epoch": 0.10301758313511497, + "grad_norm": 0.05641459301114082, + "kl": 0.024078369140625, + "learning_rate": 4.9591430786731425e-06, + "loss": -0.001, + "num_tokens": 65035898.0, + "reward": 0.6300595290958881, + "reward_std": 0.09350547916255891, + "rewards/code_reward": 0.48005952878156677, + "rewards/format_reward": 1.5, + "step": 2761 + }, + { + "clip_ratio": 0.0021465522586368024, + "epoch": 0.10305489482766662, + "grad_norm": 0.05428861826658249, + "kl": 0.0243988037109375, + "learning_rate": 4.956249442945069e-06, + "loss": -0.0012, + "step": 2762 + }, + { + "clip_ratio": 0.0019386591156944633, + "epoch": 0.10309220652021828, + "grad_norm": 0.05388074740767479, + "kl": 0.02459716796875, + "learning_rate": 4.953356035364626e-06, + "loss": -0.0012, + "step": 2763 + }, + { + "clip_ratio": 0.002483668620698154, + "completion_length": 672.785758972168, + "epoch": 0.10312951821276993, + "grad_norm": 0.08185752481222153, + "kl": 0.026580810546875, + "learning_rate": 4.950462857145836e-06, + "loss": -0.0054, + "num_tokens": 65100664.0, + "reward": 0.633196022361517, + "reward_std": 0.06957547646015882, + "rewards/code_reward": 0.48319601075490937, + "rewards/format_reward": 1.5, + "step": 2764 + }, + { + "clip_ratio": 0.0023436845222022384, + "epoch": 0.10316682990532158, + "grad_norm": 0.07536359131336212, + "kl": 0.025787353515625, + "learning_rate": 4.947569909502627e-06, + "loss": -0.0058, + "step": 2765 + }, + { + "clip_ratio": 0.0022592025343328714, + "epoch": 0.10320414159787324, + "grad_norm": 0.08166281878948212, + "kl": 0.025146484375, + "learning_rate": 4.944677193648818e-06, + "loss": -0.0061, + "step": 2766 + }, + { + "clip_ratio": 0.0040046717040240765, + "completion_length": 642.5714416503906, + "epoch": 0.10324145329042489, + "grad_norm": 0.07041557133197784, + "kl": 0.02301025390625, + "learning_rate": 4.941784710798145e-06, + "loss": 0.0, + "num_tokens": 65172474.0, + "reward": 0.5675476677715778, + "reward_std": 0.06237761024385691, + "rewards/code_reward": 0.41754765901714563, + "rewards/format_reward": 1.5, + "step": 2767 + }, + { + "clip_ratio": 0.0035233726957812905, + "epoch": 0.10327876498297654, + "grad_norm": 0.06784336268901825, + "kl": 0.023895263671875, + "learning_rate": 4.938892462164239e-06, + "loss": -0.0, + "step": 2768 + }, + { + "clip_ratio": 0.003226140106562525, + "epoch": 0.1033160766755282, + "grad_norm": 0.07112893462181091, + "kl": 0.023681640625, + "learning_rate": 4.936000448960631e-06, + "loss": -0.0002, + "step": 2769 + }, + { + "clip_ratio": 0.003941901260986924, + "completion_length": 667.6607437133789, + "epoch": 0.10335338836807985, + "grad_norm": 0.07097378373146057, + "kl": 0.020263671875, + "learning_rate": 4.9331086724007605e-06, + "loss": -0.0025, + "num_tokens": 65234587.0, + "reward": 0.5531379207968712, + "reward_std": 0.1856401450932026, + "rewards/code_reward": 0.4031379222869873, + "rewards/format_reward": 1.5, + "step": 2770 + }, + { + "clip_ratio": 0.0037226082640700042, + "epoch": 0.1033907000606315, + "grad_norm": 0.06935189664363861, + "kl": 0.020660400390625, + "learning_rate": 4.9302171336979586e-06, + "loss": -0.0025, + "step": 2771 + }, + { + "clip_ratio": 0.0038457351038232446, + "epoch": 0.10342801175318315, + "grad_norm": 0.06914988905191422, + "kl": 0.02032470703125, + "learning_rate": 4.927325834065465e-06, + "loss": -0.0029, + "step": 2772 + }, + { + "clip_ratio": 0.0050870736595243216, + "completion_length": 680.5000305175781, + "epoch": 0.10346532344573481, + "grad_norm": 0.0766826719045639, + "kl": 0.02374267578125, + "learning_rate": 4.924434774716412e-06, + "loss": -0.0128, + "num_tokens": 65306701.0, + "reward": 0.42171379923820496, + "reward_std": 0.12946279253810644, + "rewards/code_reward": 0.27171377860940993, + "rewards/format_reward": 1.5, + "step": 2773 + }, + { + "clip_ratio": 0.0045543378801085055, + "epoch": 0.10350263513828646, + "grad_norm": 0.0768994614481926, + "kl": 0.023834228515625, + "learning_rate": 4.9215439568638356e-06, + "loss": -0.013, + "step": 2774 + }, + { + "clip_ratio": 0.004437538504134864, + "epoch": 0.10353994683083811, + "grad_norm": 0.07394496351480484, + "kl": 0.023590087890625, + "learning_rate": 4.918653381720673e-06, + "loss": -0.0133, + "step": 2775 + }, + { + "clip_ratio": 0.003757162019610405, + "completion_length": 561.3036041259766, + "epoch": 0.10357725852338977, + "grad_norm": 0.06381680816411972, + "kl": 0.0321044921875, + "learning_rate": 4.91576305049975e-06, + "loss": 0.0154, + "num_tokens": 65374800.0, + "reward": 0.5099225081503391, + "reward_std": 0.15486245602369308, + "rewards/code_reward": 0.35992250964045525, + "rewards/format_reward": 1.5, + "step": 2776 + }, + { + "clip_ratio": 0.0034628238063305616, + "epoch": 0.10361457021594142, + "grad_norm": 0.06295543164014816, + "kl": 0.031890869140625, + "learning_rate": 4.912872964413798e-06, + "loss": 0.0153, + "step": 2777 + }, + { + "clip_ratio": 0.0034605401451699436, + "epoch": 0.10365188190849307, + "grad_norm": 0.06160398945212364, + "kl": 0.031951904296875, + "learning_rate": 4.909983124675446e-06, + "loss": 0.0151, + "step": 2778 + }, + { + "clip_ratio": 0.0035692513920366764, + "completion_length": 617.3214721679688, + "epoch": 0.10368919360104473, + "grad_norm": 0.06692250818014145, + "kl": 0.023193359375, + "learning_rate": 4.907093532497212e-06, + "loss": 0.0018, + "num_tokens": 65438070.0, + "reward": 0.7233711034059525, + "reward_std": 0.28436706215143204, + "rewards/code_reward": 0.5733710825443268, + "rewards/format_reward": 1.5, + "step": 2779 + }, + { + "clip_ratio": 0.0032368205138482153, + "epoch": 0.10372650529359638, + "grad_norm": 0.06766758114099503, + "kl": 0.022491455078125, + "learning_rate": 4.904204189091523e-06, + "loss": 0.0016, + "step": 2780 + }, + { + "clip_ratio": 0.0034045412321574986, + "epoch": 0.10376381698614803, + "grad_norm": 0.06541267037391663, + "kl": 0.022064208984375, + "learning_rate": 4.901315095670684e-06, + "loss": 0.0013, + "step": 2781 + }, + { + "clip_ratio": 0.004225602431688458, + "completion_length": 682.2857360839844, + "epoch": 0.10380112867869969, + "grad_norm": 0.08531343191862106, + "kl": 0.02728271484375, + "learning_rate": 4.898426253446912e-06, + "loss": -0.0067, + "num_tokens": 65513714.0, + "reward": 0.6305071711540222, + "reward_std": 0.2600112333893776, + "rewards/code_reward": 0.48050716891884804, + "rewards/format_reward": 1.5, + "step": 2782 + }, + { + "clip_ratio": 0.0044872843427583575, + "epoch": 0.10383844037125134, + "grad_norm": 0.09100160747766495, + "kl": 0.02752685546875, + "learning_rate": 4.895537663632311e-06, + "loss": -0.0067, + "step": 2783 + }, + { + "clip_ratio": 0.003900522831827402, + "epoch": 0.10387575206380299, + "grad_norm": 0.0749603882431984, + "kl": 0.027801513671875, + "learning_rate": 4.892649327438875e-06, + "loss": -0.0072, + "step": 2784 + }, + { + "clip_ratio": 0.002844082424417138, + "completion_length": 803.0893402099609, + "epoch": 0.10391306375635465, + "grad_norm": 0.27002424001693726, + "kl": 0.024078369140625, + "learning_rate": 4.889761246078502e-06, + "loss": -0.0272, + "num_tokens": 65592147.0, + "reward": 0.7392592094838619, + "reward_std": 0.22752676159143448, + "rewards/code_reward": 0.591937780380249, + "rewards/format_reward": 1.4732142984867096, + "step": 2785 + }, + { + "clip_ratio": 0.002915595076046884, + "epoch": 0.1039503754489063, + "grad_norm": 0.05236536264419556, + "kl": 0.023651123046875, + "learning_rate": 4.886873420762973e-06, + "loss": -0.0273, + "step": 2786 + }, + { + "clip_ratio": 0.0030878030229359865, + "epoch": 0.10398768714145795, + "grad_norm": 0.04660627990961075, + "kl": 0.023956298828125, + "learning_rate": 4.8839858527039675e-06, + "loss": -0.0273, + "step": 2787 + }, + { + "clip_ratio": 0.0038959927624091506, + "completion_length": 646.0893325805664, + "epoch": 0.1040249988340096, + "grad_norm": 0.0687435120344162, + "kl": 0.0228271484375, + "learning_rate": 4.881098543113058e-06, + "loss": 0.0079, + "num_tokens": 65659450.0, + "reward": 0.578026544302702, + "reward_std": 0.11984615167602897, + "rewards/code_reward": 0.43070510867983103, + "rewards/format_reward": 1.4732142984867096, + "step": 2788 + }, + { + "clip_ratio": 0.0038992742775008082, + "epoch": 0.10406231052656126, + "grad_norm": 0.07307133823633194, + "kl": 0.023101806640625, + "learning_rate": 4.8782114932017e-06, + "loss": 0.0078, + "step": 2789 + }, + { + "clip_ratio": 0.0035239397548139095, + "epoch": 0.10409962221911291, + "grad_norm": 0.07166165858507156, + "kl": 0.0230712890625, + "learning_rate": 4.875324704181251e-06, + "loss": 0.0075, + "step": 2790 + }, + { + "clip_ratio": 0.004776031651999801, + "completion_length": 720.4464416503906, + "epoch": 0.10413693391166456, + "grad_norm": 0.11588522046804428, + "kl": 0.023284912109375, + "learning_rate": 4.872438177262951e-06, + "loss": -0.0084, + "num_tokens": 65728705.0, + "reward": 0.45062321797013283, + "reward_std": 0.20498154126107693, + "rewards/code_reward": 0.3006232138723135, + "rewards/format_reward": 1.5, + "step": 2791 + }, + { + "clip_ratio": 0.004181686206720769, + "epoch": 0.10417424560421622, + "grad_norm": 0.10558339953422546, + "kl": 0.023712158203125, + "learning_rate": 4.8695519136579365e-06, + "loss": -0.0088, + "step": 2792 + }, + { + "clip_ratio": 0.003973010461777449, + "epoch": 0.10421155729676787, + "grad_norm": 0.08806930482387543, + "kl": 0.02337646484375, + "learning_rate": 4.866665914577228e-06, + "loss": -0.0093, + "step": 2793 + }, + { + "clip_ratio": 0.00335399410687387, + "completion_length": 604.4286041259766, + "epoch": 0.10424886898931952, + "grad_norm": 0.06177133321762085, + "kl": 0.0267333984375, + "learning_rate": 4.8637801812317354e-06, + "loss": -0.0005, + "num_tokens": 65785793.0, + "reward": 0.41428573802113533, + "reward_std": 0.1532504130154848, + "rewards/code_reward": 0.26428571343421936, + "rewards/format_reward": 1.5, + "step": 2794 + }, + { + "clip_ratio": 0.002621666470076889, + "epoch": 0.10428618068187118, + "grad_norm": 0.05562605708837509, + "kl": 0.02740478515625, + "learning_rate": 4.860894714832263e-06, + "loss": -0.0007, + "step": 2795 + }, + { + "clip_ratio": 0.0028732227510772645, + "epoch": 0.10432349237442283, + "grad_norm": 0.058892954140901566, + "kl": 0.027069091796875, + "learning_rate": 4.8580095165894935e-06, + "loss": -0.001, + "step": 2796 + }, + { + "clip_ratio": 0.0034200535155832767, + "completion_length": 677.107177734375, + "epoch": 0.10436080406697448, + "grad_norm": 0.08695392310619354, + "kl": 0.0212249755859375, + "learning_rate": 4.855124587714007e-06, + "loss": 0.0136, + "num_tokens": 65858865.0, + "reward": 0.7286560833454132, + "reward_std": 0.30379515141248703, + "rewards/code_reward": 0.5786560252308846, + "rewards/format_reward": 1.5, + "step": 2797 + }, + { + "clip_ratio": 0.0035693591344170272, + "epoch": 0.10439811575952614, + "grad_norm": 0.0863238126039505, + "kl": 0.021148681640625, + "learning_rate": 4.852239929416264e-06, + "loss": 0.0134, + "step": 2798 + }, + { + "clip_ratio": 0.0036988818319514394, + "epoch": 0.10443542745207779, + "grad_norm": 0.08211600035429001, + "kl": 0.02099609375, + "learning_rate": 4.849355542906615e-06, + "loss": 0.0132, + "step": 2799 + }, + { + "clip_ratio": 0.0033984369947575033, + "completion_length": 745.7500305175781, + "epoch": 0.10447273914462944, + "grad_norm": 0.06335268169641495, + "kl": 0.030426025390625, + "learning_rate": 4.8464714293952956e-06, + "loss": 0.0463, + "num_tokens": 65931161.0, + "reward": 0.49787620827555656, + "reward_std": 0.161510000936687, + "rewards/code_reward": 0.3505547810345888, + "rewards/format_reward": 1.4732142984867096, + "step": 2800 + }, + { + "clip_ratio": 0.0035153607022948563, + "epoch": 0.1045100508371811, + "grad_norm": 0.06453349441289902, + "kl": 0.032562255859375, + "learning_rate": 4.843587590092422e-06, + "loss": 0.0465, + "step": 2801 + }, + { + "clip_ratio": 0.0030214753933250904, + "epoch": 0.10454736252973275, + "grad_norm": 0.06270863115787506, + "kl": 0.031219482421875, + "learning_rate": 4.8407040262080026e-06, + "loss": 0.0461, + "step": 2802 + }, + { + "clip_ratio": 0.004197804373688996, + "completion_length": 698.2678985595703, + "epoch": 0.1045846742222844, + "grad_norm": 0.08195656538009644, + "kl": 0.030242919921875, + "learning_rate": 4.837820738951928e-06, + "loss": -0.0126, + "num_tokens": 66001092.0, + "reward": 0.5391048900783062, + "reward_std": 0.35782057233154774, + "rewards/code_reward": 0.3891048775985837, + "rewards/format_reward": 1.5, + "step": 2803 + }, + { + "clip_ratio": 0.003934749402105808, + "epoch": 0.10462198591483607, + "grad_norm": 0.0832839235663414, + "kl": 0.030731201171875, + "learning_rate": 4.8349377295339705e-06, + "loss": -0.0128, + "step": 2804 + }, + { + "clip_ratio": 0.0039554653922095895, + "epoch": 0.10465929760738772, + "grad_norm": 0.07737763226032257, + "kl": 0.029541015625, + "learning_rate": 4.832054999163788e-06, + "loss": -0.0132, + "step": 2805 + }, + { + "clip_ratio": 0.0033734756289049983, + "completion_length": 668.6607360839844, + "epoch": 0.10469660929993937, + "grad_norm": 0.06779024004936218, + "kl": 0.02130126953125, + "learning_rate": 4.829172549050921e-06, + "loss": -0.0007, + "num_tokens": 66069673.0, + "reward": 0.6567122638225555, + "reward_std": 0.12439422309398651, + "rewards/code_reward": 0.5067122541368008, + "rewards/format_reward": 1.5, + "step": 2806 + }, + { + "clip_ratio": 0.003691651043482125, + "epoch": 0.10473392099249103, + "grad_norm": 0.08419918268918991, + "kl": 0.022003173828125, + "learning_rate": 4.82629038040479e-06, + "loss": -0.0007, + "step": 2807 + }, + { + "clip_ratio": 0.0031506233499385417, + "epoch": 0.10477123268504268, + "grad_norm": 0.06131649762392044, + "kl": 0.023590087890625, + "learning_rate": 4.823408494434702e-06, + "loss": -0.0008, + "step": 2808 + }, + { + "clip_ratio": 0.004452297522220761, + "completion_length": 607.7678985595703, + "epoch": 0.10480854437759433, + "grad_norm": 0.08796443045139313, + "kl": 0.024688720703125, + "learning_rate": 4.820526892349842e-06, + "loss": 0.0149, + "num_tokens": 66134914.0, + "reward": 0.8813629224896431, + "reward_std": 0.20181067660450935, + "rewards/code_reward": 0.7313628755509853, + "rewards/format_reward": 1.5, + "step": 2809 + }, + { + "clip_ratio": 0.00404670974239707, + "epoch": 0.10484585607014599, + "grad_norm": 0.08404731750488281, + "kl": 0.02532958984375, + "learning_rate": 4.817645575359274e-06, + "loss": 0.0143, + "step": 2810 + }, + { + "clip_ratio": 0.004061497224029154, + "epoch": 0.10488316776269764, + "grad_norm": 0.08684574067592621, + "kl": 0.02545166015625, + "learning_rate": 4.814764544671951e-06, + "loss": 0.014, + "step": 2811 + }, + { + "clip_ratio": 0.003959791618399322, + "completion_length": 728.6607666015625, + "epoch": 0.1049204794552493, + "grad_norm": 0.04602961242198944, + "kl": 0.03314208984375, + "learning_rate": 4.811883801496694e-06, + "loss": -0.0002, + "num_tokens": 66212819.0, + "reward": 0.29378999024629593, + "reward_std": 0.062155645340681076, + "rewards/code_reward": 0.14378997683525085, + "rewards/format_reward": 1.5, + "step": 2812 + }, + { + "clip_ratio": 0.0038412706344388425, + "epoch": 0.10495779114780095, + "grad_norm": 0.046213291585445404, + "kl": 0.034393310546875, + "learning_rate": 4.809003347042212e-06, + "loss": -0.0001, + "step": 2813 + }, + { + "clip_ratio": 0.004434080678038299, + "epoch": 0.1049951028403526, + "grad_norm": 0.047786932438611984, + "kl": 0.03741455078125, + "learning_rate": 4.80612318251709e-06, + "loss": -0.0002, + "step": 2814 + }, + { + "clip_ratio": 0.001801978563889861, + "completion_length": 775.8036193847656, + "epoch": 0.10503241453290425, + "grad_norm": 0.053912654519081116, + "kl": 0.018646240234375, + "learning_rate": 4.803243309129791e-06, + "loss": 0.0041, + "num_tokens": 66295954.0, + "reward": 0.8642857074737549, + "reward_std": 0.1732691377401352, + "rewards/code_reward": 0.7142857108265162, + "rewards/format_reward": 1.5, + "step": 2815 + }, + { + "clip_ratio": 0.0017128672334365547, + "epoch": 0.1050697262254559, + "grad_norm": 0.05219351127743721, + "kl": 0.01873779296875, + "learning_rate": 4.8003637280886585e-06, + "loss": 0.0039, + "step": 2816 + }, + { + "clip_ratio": 0.001423641457222402, + "epoch": 0.10510703791800756, + "grad_norm": 0.0506034716963768, + "kl": 0.018798828125, + "learning_rate": 4.797484440601907e-06, + "loss": 0.0038, + "step": 2817 + }, + { + "clip_ratio": 0.0017418201896362007, + "completion_length": 605.5714492797852, + "epoch": 0.10514434961055921, + "grad_norm": 0.05212535336613655, + "kl": 0.0252685546875, + "learning_rate": 4.7946054478776385e-06, + "loss": 0.0056, + "num_tokens": 66354254.0, + "reward": 0.6427606046199799, + "reward_std": 0.037096603540703654, + "rewards/code_reward": 0.49276062194257975, + "rewards/format_reward": 1.5, + "step": 2818 + }, + { + "clip_ratio": 0.001723958645015955, + "epoch": 0.10518166130311087, + "grad_norm": 0.047691963613033295, + "kl": 0.025848388671875, + "learning_rate": 4.791726751123817e-06, + "loss": 0.0057, + "step": 2819 + }, + { + "clip_ratio": 0.0015425533638335764, + "epoch": 0.10521897299566252, + "grad_norm": 0.053018756210803986, + "kl": 0.025482177734375, + "learning_rate": 4.788848351548295e-06, + "loss": 0.0055, + "step": 2820 + }, + { + "clip_ratio": 0.0030298330821096897, + "completion_length": 540.0357360839844, + "epoch": 0.10525628468821417, + "grad_norm": 0.05390223488211632, + "kl": 0.02508544921875, + "learning_rate": 4.785970250358797e-06, + "loss": 0.0054, + "num_tokens": 66420644.0, + "reward": 0.7048909179866314, + "reward_std": 0.10523472726345062, + "rewards/code_reward": 0.5548909232020378, + "rewards/format_reward": 1.5, + "step": 2821 + }, + { + "clip_ratio": 0.0022859573364257812, + "epoch": 0.10529359638076582, + "grad_norm": 0.050855718553066254, + "kl": 0.02496337890625, + "learning_rate": 4.783092448762916e-06, + "loss": 0.0053, + "step": 2822 + }, + { + "clip_ratio": 0.002645326196216047, + "epoch": 0.10533090807331748, + "grad_norm": 0.045873258262872696, + "kl": 0.024383544921875, + "learning_rate": 4.780214947968128e-06, + "loss": 0.0052, + "step": 2823 + }, + { + "clip_ratio": 0.0019142588716931641, + "completion_length": 539.9821701049805, + "epoch": 0.10536821976586913, + "grad_norm": 0.03826499357819557, + "kl": 0.021270751953125, + "learning_rate": 4.777337749181776e-06, + "loss": -0.0033, + "num_tokens": 66479261.0, + "reward": 0.8702381029725075, + "reward_std": 0.10645382851362228, + "rewards/code_reward": 0.7202381007373333, + "rewards/format_reward": 1.5, + "step": 2824 + }, + { + "clip_ratio": 0.0021099275909364223, + "epoch": 0.10540553145842078, + "grad_norm": 0.03715531527996063, + "kl": 0.021881103515625, + "learning_rate": 4.774460853611082e-06, + "loss": -0.0032, + "step": 2825 + }, + { + "clip_ratio": 0.0021872390061616898, + "epoch": 0.10544284315097244, + "grad_norm": 0.03794325888156891, + "kl": 0.0210113525390625, + "learning_rate": 4.771584262463136e-06, + "loss": -0.0032, + "step": 2826 + }, + { + "clip_ratio": 0.0038689528009854257, + "completion_length": 526.1607360839844, + "epoch": 0.10548015484352409, + "grad_norm": 0.0735984519124031, + "kl": 0.02227783203125, + "learning_rate": 4.7687079769449016e-06, + "loss": 0.0009, + "num_tokens": 66542424.0, + "reward": 0.694112978875637, + "reward_std": 0.16697635874152184, + "rewards/code_reward": 0.544112928211689, + "rewards/format_reward": 1.5, + "step": 2827 + }, + { + "clip_ratio": 0.003419851651415229, + "epoch": 0.10551746653607574, + "grad_norm": 0.07756109535694122, + "kl": 0.021026611328125, + "learning_rate": 4.7658319982632184e-06, + "loss": 0.0008, + "step": 2828 + }, + { + "clip_ratio": 0.0029423528467305005, + "epoch": 0.1055547782286274, + "grad_norm": 0.06668975204229355, + "kl": 0.0213623046875, + "learning_rate": 4.762956327624789e-06, + "loss": 0.0005, + "step": 2829 + }, + { + "clip_ratio": 0.00281747046392411, + "completion_length": 679.3036041259766, + "epoch": 0.10559208992117905, + "grad_norm": 0.05857645720243454, + "kl": 0.021026611328125, + "learning_rate": 4.760080966236194e-06, + "loss": 0.0142, + "num_tokens": 66607051.0, + "reward": 0.6775227785110474, + "reward_std": 0.17809577356092632, + "rewards/code_reward": 0.5275227659149095, + "rewards/format_reward": 1.5, + "step": 2830 + }, + { + "clip_ratio": 0.0024626855738461018, + "epoch": 0.1056294016137307, + "grad_norm": 0.05717366188764572, + "kl": 0.020904541015625, + "learning_rate": 4.757205915303881e-06, + "loss": 0.014, + "step": 2831 + }, + { + "clip_ratio": 0.0029935463680885732, + "epoch": 0.10566671330628236, + "grad_norm": 0.055868037045001984, + "kl": 0.020660400390625, + "learning_rate": 4.754331176034168e-06, + "loss": 0.0141, + "step": 2832 + }, + { + "clip_ratio": 0.0023389438865706325, + "completion_length": 524.8928680419922, + "epoch": 0.10570402499883401, + "grad_norm": 0.3357444703578949, + "kl": 0.044586181640625, + "learning_rate": 4.751456749633242e-06, + "loss": 0.0028, + "num_tokens": 66668687.0, + "reward": 0.6925875097513199, + "reward_std": 0.0119604857172817, + "rewards/code_reward": 0.5425875160872238, + "rewards/format_reward": 1.5, + "step": 2833 + }, + { + "clip_ratio": 0.002075344091281295, + "epoch": 0.10574133669138566, + "grad_norm": 0.23444516956806183, + "kl": 0.02587890625, + "learning_rate": 4.7485826373071606e-06, + "loss": 0.0027, + "step": 2834 + }, + { + "clip_ratio": 0.0023628423223271966, + "epoch": 0.10577864838393732, + "grad_norm": 0.3915504813194275, + "kl": 0.026275634765625, + "learning_rate": 4.745708840261844e-06, + "loss": 0.003, + "step": 2835 + }, + { + "clip_ratio": 0.0033375584753230214, + "completion_length": 761.0000534057617, + "epoch": 0.10581596007648897, + "grad_norm": 0.06048738956451416, + "kl": 0.017578125, + "learning_rate": 4.7428353597030905e-06, + "loss": 0.0556, + "num_tokens": 66742117.0, + "reward": 0.25898872315883636, + "reward_std": 0.0193362336140126, + "rewards/code_reward": 0.11166727589443326, + "rewards/format_reward": 1.4732142984867096, + "step": 2836 + }, + { + "clip_ratio": 0.0035312542458996177, + "epoch": 0.10585327176904062, + "grad_norm": 0.06027422100305557, + "kl": 0.01739501953125, + "learning_rate": 4.739962196836553e-06, + "loss": 0.0554, + "step": 2837 + }, + { + "clip_ratio": 0.0033011852647177875, + "epoch": 0.10589058346159227, + "grad_norm": 0.06504299491643906, + "kl": 0.017791748046875, + "learning_rate": 4.73708935286776e-06, + "loss": 0.0553, + "step": 2838 + }, + { + "clip_ratio": 0.0021810639882460237, + "completion_length": 663.6071853637695, + "epoch": 0.10592789515414393, + "grad_norm": 0.0695134699344635, + "kl": 0.02874755859375, + "learning_rate": 4.734216829002103e-06, + "loss": 0.011, + "num_tokens": 66809155.0, + "reward": 0.8762064874172211, + "reward_std": 0.1875390037894249, + "rewards/code_reward": 0.7262064591050148, + "rewards/format_reward": 1.5, + "step": 2839 + }, + { + "clip_ratio": 0.002174199791625142, + "epoch": 0.10596520684669558, + "grad_norm": 0.06839777529239655, + "kl": 0.028228759765625, + "learning_rate": 4.731344626444838e-06, + "loss": 0.0106, + "step": 2840 + }, + { + "clip_ratio": 0.0019491256098262966, + "epoch": 0.10600251853924723, + "grad_norm": 0.06354353576898575, + "kl": 0.0284423828125, + "learning_rate": 4.728472746401093e-06, + "loss": 0.0106, + "step": 2841 + }, + { + "clip_ratio": 0.0029220636351965368, + "completion_length": 788.4464721679688, + "epoch": 0.10603983023179889, + "grad_norm": 0.05837205424904823, + "kl": 0.016571044921875, + "learning_rate": 4.7256011900758505e-06, + "loss": -0.0104, + "num_tokens": 66890478.0, + "reward": 0.5125533603131771, + "reward_std": 0.0885538486763835, + "rewards/code_reward": 0.3625533632002771, + "rewards/format_reward": 1.5, + "step": 2842 + }, + { + "clip_ratio": 0.003378460882231593, + "epoch": 0.10607714192435054, + "grad_norm": 0.09556526690721512, + "kl": 0.01616668701171875, + "learning_rate": 4.722729958673963e-06, + "loss": -0.0106, + "step": 2843 + }, + { + "clip_ratio": 0.0029596309759654105, + "epoch": 0.1061144536169022, + "grad_norm": 0.062308479100465775, + "kl": 0.016845703125, + "learning_rate": 4.719859053400147e-06, + "loss": -0.0106, + "step": 2844 + }, + { + "clip_ratio": 0.003082855371758342, + "completion_length": 686.6250305175781, + "epoch": 0.10615176530945385, + "grad_norm": 0.09704096615314484, + "kl": 0.017120361328125, + "learning_rate": 4.716988475458979e-06, + "loss": 0.016, + "num_tokens": 66956887.0, + "reward": 0.6762220449745655, + "reward_std": 0.2528739635599777, + "rewards/code_reward": 0.5262220515287481, + "rewards/format_reward": 1.5, + "step": 2845 + }, + { + "clip_ratio": 0.003140424203593284, + "epoch": 0.1061890770020055, + "grad_norm": 0.0789634957909584, + "kl": 0.0178375244140625, + "learning_rate": 4.714118226054906e-06, + "loss": 0.0158, + "step": 2846 + }, + { + "clip_ratio": 0.002963137230835855, + "epoch": 0.10622638869455715, + "grad_norm": 0.07745378464460373, + "kl": 0.0182037353515625, + "learning_rate": 4.711248306392223e-06, + "loss": 0.0157, + "step": 2847 + }, + { + "clip_ratio": 0.0034354457748122513, + "completion_length": 877.6428985595703, + "epoch": 0.1062637003871088, + "grad_norm": 0.07154670357704163, + "kl": 0.0178985595703125, + "learning_rate": 4.708378717675099e-06, + "loss": -0.0122, + "num_tokens": 67035869.0, + "reward": 0.4316401779651642, + "reward_std": 0.35886181704699993, + "rewards/code_reward": 0.2816401831805706, + "rewards/format_reward": 1.5, + "step": 2848 + }, + { + "clip_ratio": 0.0029470815788954496, + "epoch": 0.10630101207966046, + "grad_norm": 0.07420879602432251, + "kl": 0.018951416015625, + "learning_rate": 4.7055094611075634e-06, + "loss": -0.0121, + "step": 2849 + }, + { + "clip_ratio": 0.003252878232160583, + "epoch": 0.10633832377221211, + "grad_norm": 0.06677235662937164, + "kl": 0.019073486328125, + "learning_rate": 4.702640537893498e-06, + "loss": -0.0124, + "step": 2850 + }, + { + "clip_ratio": 0.004211212450172752, + "completion_length": 562.5714492797852, + "epoch": 0.10637563546476377, + "grad_norm": 0.05816718563437462, + "kl": 0.025177001953125, + "learning_rate": 4.699771949236653e-06, + "loss": -0.0045, + "num_tokens": 67101883.0, + "reward": 0.7504258416593075, + "reward_std": 0.11776290601119399, + "rewards/code_reward": 0.6004258245229721, + "rewards/format_reward": 1.5, + "step": 2851 + }, + { + "clip_ratio": 0.0037908205413259566, + "epoch": 0.10641294715731542, + "grad_norm": 0.05697030574083328, + "kl": 0.024871826171875, + "learning_rate": 4.6969036963406315e-06, + "loss": -0.0047, + "step": 2852 + }, + { + "clip_ratio": 0.003730171883944422, + "epoch": 0.10645025884986707, + "grad_norm": 0.05321313068270683, + "kl": 0.0251922607421875, + "learning_rate": 4.694035780408901e-06, + "loss": -0.005, + "step": 2853 + }, + { + "clip_ratio": 0.004626776266377419, + "completion_length": 647.8571701049805, + "epoch": 0.10648757054241872, + "grad_norm": 0.0524170808494091, + "kl": 0.02935791015625, + "learning_rate": 4.691168202644788e-06, + "loss": 0.007, + "num_tokens": 67172981.0, + "reward": 0.34017858654260635, + "reward_std": 0.16970033943653107, + "rewards/code_reward": 0.1928571406751871, + "rewards/format_reward": 1.4732142984867096, + "step": 2854 + }, + { + "clip_ratio": 0.004152342211455107, + "epoch": 0.10652488223497038, + "grad_norm": 0.05996140092611313, + "kl": 0.0303955078125, + "learning_rate": 4.68830096425147e-06, + "loss": 0.0071, + "step": 2855 + }, + { + "clip_ratio": 0.00429740030085668, + "epoch": 0.10656219392752203, + "grad_norm": 0.05401941388845444, + "kl": 0.028839111328125, + "learning_rate": 4.685434066431992e-06, + "loss": 0.007, + "step": 2856 + }, + { + "clip_ratio": 0.0023338523460552096, + "completion_length": 687.5714569091797, + "epoch": 0.10659950562007368, + "grad_norm": 0.033312976360321045, + "kl": 0.02471923828125, + "learning_rate": 4.682567510389246e-06, + "loss": 0.003, + "num_tokens": 67242677.0, + "reward": 0.7874999903142452, + "reward_std": 0.08370110392570496, + "rewards/code_reward": 0.6375000029802322, + "rewards/format_reward": 1.5, + "step": 2857 + }, + { + "clip_ratio": 0.0020219485741108656, + "epoch": 0.10663681731262534, + "grad_norm": 0.03467462956905365, + "kl": 0.023834228515625, + "learning_rate": 4.67970129732599e-06, + "loss": 0.003, + "step": 2858 + }, + { + "clip_ratio": 0.002090051071718335, + "epoch": 0.106674129005177, + "grad_norm": 0.03320775553584099, + "kl": 0.02410888671875, + "learning_rate": 4.676835428444834e-06, + "loss": 0.0029, + "step": 2859 + }, + { + "clip_ratio": 0.0022136904881335795, + "completion_length": 601.482177734375, + "epoch": 0.10671144069772866, + "grad_norm": 0.08118690550327301, + "kl": 0.0222625732421875, + "learning_rate": 4.673969904948238e-06, + "loss": 0.0, + "num_tokens": 67306950.0, + "reward": 0.8755505532026291, + "reward_std": 0.24827508628368378, + "rewards/code_reward": 0.725550539791584, + "rewards/format_reward": 1.5, + "step": 2860 + }, + { + "clip_ratio": 0.0021657617180608213, + "epoch": 0.10674875239028031, + "grad_norm": 0.07892066240310669, + "kl": 0.022552490234375, + "learning_rate": 4.671104728038528e-06, + "loss": -0.0002, + "step": 2861 + }, + { + "clip_ratio": 0.002166268997825682, + "epoch": 0.10678606408283196, + "grad_norm": 0.0690675750374794, + "kl": 0.0234375, + "learning_rate": 4.6682398989178755e-06, + "loss": -0.0003, + "step": 2862 + }, + { + "clip_ratio": 0.002660572063177824, + "completion_length": 704.2143173217773, + "epoch": 0.10682337577538362, + "grad_norm": 0.07925183326005936, + "kl": 0.041534423828125, + "learning_rate": 4.665375418788311e-06, + "loss": 0.0302, + "num_tokens": 67380266.0, + "reward": 0.712845578789711, + "reward_std": 0.15447527542710304, + "rewards/code_reward": 0.5628455951809883, + "rewards/format_reward": 1.5, + "step": 2863 + }, + { + "clip_ratio": 0.0024264942039735615, + "epoch": 0.10686068746793527, + "grad_norm": 0.07503212243318558, + "kl": 0.03924560546875, + "learning_rate": 4.66251128885172e-06, + "loss": 0.0301, + "step": 2864 + }, + { + "clip_ratio": 0.002661491569597274, + "epoch": 0.10689799916048692, + "grad_norm": 0.06870328634977341, + "kl": 0.0303955078125, + "learning_rate": 4.659647510309833e-06, + "loss": 0.0297, + "step": 2865 + }, + { + "clip_ratio": 0.004435028706211597, + "completion_length": 682.4107513427734, + "epoch": 0.10693531085303858, + "grad_norm": 0.08103528618812561, + "kl": 0.029266357421875, + "learning_rate": 4.656784084364239e-06, + "loss": 0.0079, + "num_tokens": 67456701.0, + "reward": 0.5164361894130707, + "reward_std": 0.17268051847349852, + "rewards/code_reward": 0.3664361475384794, + "rewards/format_reward": 1.5, + "step": 2866 + }, + { + "clip_ratio": 0.004901942971628159, + "epoch": 0.10697262254559023, + "grad_norm": 0.09674033522605896, + "kl": 0.028472900390625, + "learning_rate": 4.653921012216382e-06, + "loss": 0.008, + "step": 2867 + }, + { + "clip_ratio": 0.0048389962175861, + "epoch": 0.10700993423814188, + "grad_norm": 0.07377389818429947, + "kl": 0.0292816162109375, + "learning_rate": 4.651058295067552e-06, + "loss": 0.0076, + "step": 2868 + }, + { + "clip_ratio": 0.0024256351753138006, + "completion_length": 779.2857666015625, + "epoch": 0.10704724593069354, + "grad_norm": 0.029668312519788742, + "kl": 0.02410888671875, + "learning_rate": 4.648195934118892e-06, + "loss": 0.0033, + "num_tokens": 67534207.0, + "reward": 0.685714278370142, + "reward_std": 0.09078412503004074, + "rewards/code_reward": 0.5357142873108387, + "rewards/format_reward": 1.5, + "step": 2869 + }, + { + "clip_ratio": 0.002446855593007058, + "epoch": 0.10708455762324519, + "grad_norm": 0.02822536788880825, + "kl": 0.023284912109375, + "learning_rate": 4.645333930571394e-06, + "loss": 0.0033, + "step": 2870 + }, + { + "clip_ratio": 0.0024426766321994364, + "epoch": 0.10712186931579684, + "grad_norm": 0.028309013694524765, + "kl": 0.02301025390625, + "learning_rate": 4.642472285625904e-06, + "loss": 0.0033, + "step": 2871 + }, + { + "clip_ratio": 0.002880412503145635, + "completion_length": 446.64288330078125, + "epoch": 0.1071591810083485, + "grad_norm": 0.0901409387588501, + "kl": 0.023651123046875, + "learning_rate": 4.639611000483115e-06, + "loss": 0.0025, + "num_tokens": 67580457.0, + "reward": 0.6887199319899082, + "reward_std": 0.09276873781345785, + "rewards/code_reward": 0.5387199448887259, + "rewards/format_reward": 1.5, + "step": 2872 + }, + { + "clip_ratio": 0.002865334623493254, + "epoch": 0.10719649270090015, + "grad_norm": 0.08127837628126144, + "kl": 0.023834228515625, + "learning_rate": 4.636750076343566e-06, + "loss": 0.0022, + "step": 2873 + }, + { + "clip_ratio": 0.00252921343781054, + "epoch": 0.1072338043934518, + "grad_norm": 0.07783037424087524, + "kl": 0.024505615234375, + "learning_rate": 4.63388951440765e-06, + "loss": 0.002, + "step": 2874 + }, + { + "clip_ratio": 0.002755283028818667, + "completion_length": 577.0357513427734, + "epoch": 0.10727111608600345, + "grad_norm": 0.0745367556810379, + "kl": 0.031585693359375, + "learning_rate": 4.631029315875605e-06, + "loss": -0.0202, + "num_tokens": 67655289.0, + "reward": 0.9060297571122646, + "reward_std": 0.030185394920408726, + "rewards/code_reward": 0.757994057610631, + "rewards/format_reward": 1.480357140302658, + "step": 2875 + }, + { + "clip_ratio": 0.0025230066967196763, + "epoch": 0.10730842777855511, + "grad_norm": 0.07820922136306763, + "kl": 0.03277587890625, + "learning_rate": 4.628169481947518e-06, + "loss": -0.0201, + "step": 2876 + }, + { + "clip_ratio": 0.0024998500011861324, + "epoch": 0.10734573947110676, + "grad_norm": 0.08376701921224594, + "kl": 0.031463623046875, + "learning_rate": 4.625310013823322e-06, + "loss": -0.0204, + "step": 2877 + }, + { + "clip_ratio": 0.005173479614313692, + "completion_length": 661.8571624755859, + "epoch": 0.10738305116365841, + "grad_norm": 0.0982036143541336, + "kl": 0.066131591796875, + "learning_rate": 4.622450912702795e-06, + "loss": -0.0126, + "num_tokens": 67729467.0, + "reward": 0.692801047116518, + "reward_std": 0.2511601597070694, + "rewards/code_reward": 0.5428010430186987, + "rewards/format_reward": 1.5, + "step": 2878 + }, + { + "clip_ratio": 0.004471839056350291, + "epoch": 0.10742036285621007, + "grad_norm": 0.09334321320056915, + "kl": 0.05938720703125, + "learning_rate": 4.619592179785565e-06, + "loss": -0.0132, + "step": 2879 + }, + { + "clip_ratio": 0.005263536993879825, + "epoch": 0.10745767454876172, + "grad_norm": 0.09028090536594391, + "kl": 0.053955078125, + "learning_rate": 4.616733816271101e-06, + "loss": -0.0133, + "step": 2880 + }, + { + "clip_ratio": 0.002222307026386261, + "completion_length": 459.1607360839844, + "epoch": 0.10749498624131337, + "grad_norm": 0.062296923249959946, + "kl": 0.026275634765625, + "learning_rate": 4.613875823358722e-06, + "loss": 0.0141, + "num_tokens": 67781024.0, + "reward": 0.9063775688409805, + "reward_std": 0.08962683938443661, + "rewards/code_reward": 0.7563775423914194, + "rewards/format_reward": 1.5, + "step": 2881 + }, + { + "clip_ratio": 0.0027190435212105513, + "epoch": 0.10753229793386503, + "grad_norm": 0.07435976713895798, + "kl": 0.0263671875, + "learning_rate": 4.611018202247588e-06, + "loss": 0.0142, + "step": 2882 + }, + { + "clip_ratio": 0.0022023514029569924, + "epoch": 0.10756960962641668, + "grad_norm": 0.055763352662324905, + "kl": 0.02667236328125, + "learning_rate": 4.608160954136701e-06, + "loss": 0.014, + "step": 2883 + }, + { + "clip_ratio": 0.003034283989109099, + "completion_length": 710.9107437133789, + "epoch": 0.10760692131896833, + "grad_norm": 0.05413472279906273, + "kl": 0.022125244140625, + "learning_rate": 4.605304080224913e-06, + "loss": 0.0353, + "num_tokens": 67852825.0, + "reward": 0.5513638146221638, + "reward_std": 0.10733718459960073, + "rewards/code_reward": 0.40404237250913866, + "rewards/format_reward": 1.4732142984867096, + "step": 2884 + }, + { + "clip_ratio": 0.00303437263937667, + "epoch": 0.10764423301151999, + "grad_norm": 0.054014384746551514, + "kl": 0.02197265625, + "learning_rate": 4.602447581710911e-06, + "loss": 0.0352, + "step": 2885 + }, + { + "clip_ratio": 0.003238213248550892, + "epoch": 0.10768154470407164, + "grad_norm": 0.05090701952576637, + "kl": 0.021881103515625, + "learning_rate": 4.599591459793233e-06, + "loss": 0.0352, + "step": 2886 + }, + { + "clip_ratio": 0.004249512974638492, + "completion_length": 636.6964492797852, + "epoch": 0.10771885639662329, + "grad_norm": 0.08097701519727707, + "kl": 0.02801513671875, + "learning_rate": 4.596735715670253e-06, + "loss": 0.0043, + "num_tokens": 67914576.0, + "reward": 0.4624999985098839, + "reward_std": 0.265747606754303, + "rewards/code_reward": 0.3125000046566129, + "rewards/format_reward": 1.5, + "step": 2887 + }, + { + "clip_ratio": 0.004297313513234258, + "epoch": 0.10775616808917494, + "grad_norm": 0.08434752374887466, + "kl": 0.026641845703125, + "learning_rate": 4.593880350540187e-06, + "loss": 0.0043, + "step": 2888 + }, + { + "clip_ratio": 0.004056296136695892, + "epoch": 0.1077934797817266, + "grad_norm": 0.07474866509437561, + "kl": 0.026641845703125, + "learning_rate": 4.591025365601096e-06, + "loss": 0.0038, + "step": 2889 + }, + { + "clip_ratio": 0.0033892703359015286, + "completion_length": 648.9107360839844, + "epoch": 0.10783079147427825, + "grad_norm": 0.06224140152335167, + "kl": 0.0206298828125, + "learning_rate": 4.588170762050875e-06, + "loss": 0.0529, + "num_tokens": 67974813.0, + "reward": 0.8875000029802322, + "reward_std": 0.12592121213674545, + "rewards/code_reward": 0.740178570151329, + "rewards/format_reward": 1.4732142984867096, + "step": 2890 + }, + { + "clip_ratio": 0.003263829741626978, + "epoch": 0.1078681031668299, + "grad_norm": 0.0617978535592556, + "kl": 0.0213165283203125, + "learning_rate": 4.5853165410872644e-06, + "loss": 0.0527, + "step": 2891 + }, + { + "clip_ratio": 0.0029192882357165217, + "epoch": 0.10790541485938156, + "grad_norm": 0.06164131686091423, + "kl": 0.020904541015625, + "learning_rate": 4.582462703907844e-06, + "loss": 0.0525, + "step": 2892 + }, + { + "clip_ratio": 0.005075126246083528, + "completion_length": 828.3035888671875, + "epoch": 0.10794272655193321, + "grad_norm": 0.09047847241163254, + "kl": 0.026275634765625, + "learning_rate": 4.579609251710028e-06, + "loss": 0.0038, + "num_tokens": 68067636.0, + "reward": 0.4824270159006119, + "reward_std": 0.4040183871984482, + "rewards/code_reward": 0.3324270211160183, + "rewards/format_reward": 1.5, + "step": 2893 + }, + { + "clip_ratio": 0.004666253691539168, + "epoch": 0.10798003824448486, + "grad_norm": 0.09062961488962173, + "kl": 0.026092529296875, + "learning_rate": 4.576756185691073e-06, + "loss": 0.0035, + "step": 2894 + }, + { + "clip_ratio": 0.004503996926359832, + "epoch": 0.10801734993703652, + "grad_norm": 0.08683731406927109, + "kl": 0.026275634765625, + "learning_rate": 4.573903507048074e-06, + "loss": 0.0033, + "step": 2895 + }, + { + "clip_ratio": 0.002240188419818878, + "completion_length": 578.1250228881836, + "epoch": 0.10805466162958817, + "grad_norm": 0.06659582257270813, + "kl": 0.02093505859375, + "learning_rate": 4.57105121697796e-06, + "loss": 0.0188, + "num_tokens": 68121025.0, + "reward": 0.8382211402058601, + "reward_std": 0.13789259642362595, + "rewards/code_reward": 0.6882211416959763, + "rewards/format_reward": 1.5, + "step": 2896 + }, + { + "clip_ratio": 0.0020573852816596627, + "epoch": 0.10809197332213982, + "grad_norm": 0.06413477659225464, + "kl": 0.0205078125, + "learning_rate": 4.5681993166774995e-06, + "loss": 0.0187, + "step": 2897 + }, + { + "clip_ratio": 0.0020661947783082724, + "epoch": 0.10812928501469148, + "grad_norm": 0.05807995796203613, + "kl": 0.020660400390625, + "learning_rate": 4.565347807343297e-06, + "loss": 0.0186, + "step": 2898 + }, + { + "clip_ratio": 0.004050434043165296, + "completion_length": 646.2500228881836, + "epoch": 0.10816659670724313, + "grad_norm": 0.0708700492978096, + "kl": 0.0199432373046875, + "learning_rate": 4.562496690171792e-06, + "loss": 0.001, + "num_tokens": 68195855.0, + "reward": 0.38895437121391296, + "reward_std": 0.1604393869638443, + "rewards/code_reward": 0.238954346626997, + "rewards/format_reward": 1.5, + "step": 2899 + }, + { + "clip_ratio": 0.005148619064129889, + "epoch": 0.10820390839979478, + "grad_norm": 0.057983119040727615, + "kl": 0.0204620361328125, + "learning_rate": 4.559645966359263e-06, + "loss": 0.0011, + "step": 2900 + }, + { + "clip_ratio": 0.004496183479204774, + "epoch": 0.10824122009234644, + "grad_norm": 0.058068741112947464, + "kl": 0.020843505859375, + "learning_rate": 4.556795637101816e-06, + "loss": 0.0009, + "step": 2901 + }, + { + "clip_ratio": 0.0031530720880255103, + "completion_length": 684.7857437133789, + "epoch": 0.10827853178489809, + "grad_norm": 0.06315457075834274, + "kl": 0.03228759765625, + "learning_rate": 4.553945703595404e-06, + "loss": 0.0026, + "num_tokens": 68267915.0, + "reward": 0.6270408257842064, + "reward_std": 0.2537750229239464, + "rewards/code_reward": 0.47704082913696766, + "rewards/format_reward": 1.5, + "step": 2902 + }, + { + "clip_ratio": 0.002906934591010213, + "epoch": 0.10831584347744974, + "grad_norm": 0.06032988801598549, + "kl": 0.033172607421875, + "learning_rate": 4.551096167035797e-06, + "loss": 0.0025, + "step": 2903 + }, + { + "clip_ratio": 0.003001705277711153, + "epoch": 0.1083531551700014, + "grad_norm": 0.05743594840168953, + "kl": 0.03228759765625, + "learning_rate": 4.548247028618612e-06, + "loss": 0.0024, + "step": 2904 + }, + { + "clip_ratio": 0.004239371046423912, + "completion_length": 844.1607513427734, + "epoch": 0.10839046686255305, + "grad_norm": 0.08376529067754745, + "kl": 0.024322509765625, + "learning_rate": 4.545398289539297e-06, + "loss": 0.0205, + "num_tokens": 68350636.0, + "reward": 0.5219215452671051, + "reward_std": 0.3752354606986046, + "rewards/code_reward": 0.37192152440547943, + "rewards/format_reward": 1.5, + "step": 2905 + }, + { + "clip_ratio": 0.004537703294772655, + "epoch": 0.1084277785551047, + "grad_norm": 0.08362260460853577, + "kl": 0.0244140625, + "learning_rate": 4.542549950993126e-06, + "loss": 0.0206, + "step": 2906 + }, + { + "clip_ratio": 0.004202176118269563, + "epoch": 0.10846509024765635, + "grad_norm": 0.08093149214982986, + "kl": 0.023651123046875, + "learning_rate": 4.539702014175211e-06, + "loss": 0.02, + "step": 2907 + }, + { + "clip_ratio": 0.0044503105455078185, + "completion_length": 770.3393096923828, + "epoch": 0.10850240194020801, + "grad_norm": 0.08391819894313812, + "kl": 0.029083251953125, + "learning_rate": 4.536854480280491e-06, + "loss": 0.0229, + "num_tokens": 68427613.0, + "reward": 0.4920261837542057, + "reward_std": 0.22873065061867237, + "rewards/code_reward": 0.34202617034316063, + "rewards/format_reward": 1.5, + "step": 2908 + }, + { + "clip_ratio": 0.004332451731897891, + "epoch": 0.10853971363275966, + "grad_norm": 0.0829925537109375, + "kl": 0.028533935546875, + "learning_rate": 4.5340073505037406e-06, + "loss": 0.0229, + "step": 2909 + }, + { + "clip_ratio": 0.004011714190710336, + "epoch": 0.10857702532531131, + "grad_norm": 0.0770534798502922, + "kl": 0.029083251953125, + "learning_rate": 4.531160626039563e-06, + "loss": 0.0225, + "step": 2910 + }, + { + "clip_ratio": 0.0021515830885618925, + "completion_length": 609.6964569091797, + "epoch": 0.10861433701786297, + "grad_norm": 0.11751538515090942, + "kl": 0.09808349609375, + "learning_rate": 4.52831430808239e-06, + "loss": -0.0038, + "num_tokens": 68493074.0, + "reward": 0.9009418934583664, + "reward_std": 0.19556614384055138, + "rewards/code_reward": 0.7509418874979019, + "rewards/format_reward": 1.5, + "step": 2911 + }, + { + "clip_ratio": 0.0020400236535351723, + "epoch": 0.10865164871041462, + "grad_norm": 0.06949575245380402, + "kl": 0.059478759765625, + "learning_rate": 4.5254683978264845e-06, + "loss": -0.0041, + "step": 2912 + }, + { + "clip_ratio": 0.0018620941264089197, + "epoch": 0.10868896040296629, + "grad_norm": 0.07994043827056885, + "kl": 0.054931640625, + "learning_rate": 4.522622896465936e-06, + "loss": -0.0045, + "step": 2913 + }, + { + "clip_ratio": 0.003557386517059058, + "completion_length": 721.4464721679688, + "epoch": 0.10872627209551794, + "grad_norm": 0.048885639756917953, + "kl": 0.02252197265625, + "learning_rate": 4.519777805194663e-06, + "loss": 0.0318, + "num_tokens": 68563211.0, + "reward": 0.4188833720982075, + "reward_std": 0.058294459246098995, + "rewards/code_reward": 0.27156194509007037, + "rewards/format_reward": 1.4732142984867096, + "step": 2914 + }, + { + "clip_ratio": 0.0038049404975026846, + "epoch": 0.10876358378806959, + "grad_norm": 0.046423494815826416, + "kl": 0.0223388671875, + "learning_rate": 4.5169331252064165e-06, + "loss": 0.0319, + "step": 2915 + }, + { + "clip_ratio": 0.003413545957300812, + "epoch": 0.10880089548062125, + "grad_norm": 0.04834475740790367, + "kl": 0.022674560546875, + "learning_rate": 4.51408885769477e-06, + "loss": 0.0317, + "step": 2916 + }, + { + "clip_ratio": 0.003522803308442235, + "completion_length": 528.6607360839844, + "epoch": 0.1088382071731729, + "grad_norm": 0.10663507133722305, + "kl": 0.0238037109375, + "learning_rate": 4.511245003853124e-06, + "loss": -0.0158, + "num_tokens": 68622384.0, + "reward": 0.9572660475969315, + "reward_std": 0.353719562292099, + "rewards/code_reward": 0.8072660118341446, + "rewards/format_reward": 1.5, + "step": 2917 + }, + { + "clip_ratio": 0.0033784015104174614, + "epoch": 0.10887551886572455, + "grad_norm": 0.14286872744560242, + "kl": 0.02423095703125, + "learning_rate": 4.508401564874707e-06, + "loss": -0.0158, + "step": 2918 + }, + { + "clip_ratio": 0.003610509738791734, + "epoch": 0.1089128305582762, + "grad_norm": 0.09604928642511368, + "kl": 0.024749755859375, + "learning_rate": 4.505558541952575e-06, + "loss": -0.0164, + "step": 2919 + }, + { + "clip_ratio": 0.004136213916353881, + "completion_length": 764.5178833007812, + "epoch": 0.10895014225082786, + "grad_norm": 0.09148366749286652, + "kl": 0.02734375, + "learning_rate": 4.502715936279607e-06, + "loss": 0.0039, + "num_tokens": 68700113.0, + "reward": 0.5397534146904945, + "reward_std": 0.22754384577274323, + "rewards/code_reward": 0.3897533994168043, + "rewards/format_reward": 1.5, + "step": 2920 + }, + { + "clip_ratio": 0.004048747010529041, + "epoch": 0.10898745394337951, + "grad_norm": 0.09178873151540756, + "kl": 0.026947021484375, + "learning_rate": 4.499873749048506e-06, + "loss": 0.0038, + "step": 2921 + }, + { + "clip_ratio": 0.003949135891161859, + "epoch": 0.10902476563593116, + "grad_norm": 0.08186564594507217, + "kl": 0.027069091796875, + "learning_rate": 4.4970319814518025e-06, + "loss": 0.0033, + "step": 2922 + }, + { + "clip_ratio": 0.0016173476469703019, + "completion_length": 401.2857360839844, + "epoch": 0.10906207732848282, + "grad_norm": 0.06264238059520721, + "kl": 0.02032470703125, + "learning_rate": 4.494190634681848e-06, + "loss": -0.0021, + "num_tokens": 68744537.0, + "reward": 1.1170691549777985, + "reward_std": 0.10490889474749565, + "rewards/code_reward": 0.9670691192150116, + "rewards/format_reward": 1.5, + "step": 2923 + }, + { + "clip_ratio": 0.0016054417355917394, + "epoch": 0.10909938902103447, + "grad_norm": 0.06068703904747963, + "kl": 0.020599365234375, + "learning_rate": 4.49134970993082e-06, + "loss": -0.0021, + "step": 2924 + }, + { + "clip_ratio": 0.0015618923935107887, + "epoch": 0.10913670071358612, + "grad_norm": 0.058256275951862335, + "kl": 0.020782470703125, + "learning_rate": 4.48850920839072e-06, + "loss": -0.0024, + "step": 2925 + }, + { + "clip_ratio": 0.004172130953520536, + "completion_length": 707.8214721679688, + "epoch": 0.10917401240613778, + "grad_norm": 0.08598192781209946, + "kl": 0.0218353271484375, + "learning_rate": 4.485669131253364e-06, + "loss": -0.0024, + "num_tokens": 68827127.0, + "reward": 0.6562663242220879, + "reward_std": 0.29893494374118745, + "rewards/code_reward": 0.5062663094140589, + "rewards/format_reward": 1.5, + "step": 2926 + }, + { + "clip_ratio": 0.00384328979998827, + "epoch": 0.10921132409868943, + "grad_norm": 0.0854567140340805, + "kl": 0.0213775634765625, + "learning_rate": 4.4828294797103985e-06, + "loss": -0.0025, + "step": 2927 + }, + { + "clip_ratio": 0.0040493455599062145, + "epoch": 0.10924863579124108, + "grad_norm": 0.07795149832963943, + "kl": 0.022186279296875, + "learning_rate": 4.479990254953291e-06, + "loss": -0.0028, + "step": 2928 + }, + { + "clip_ratio": 0.005330996587872505, + "completion_length": 499.57144927978516, + "epoch": 0.10928594748379274, + "grad_norm": 0.09404879063367844, + "kl": 0.026824951171875, + "learning_rate": 4.477151458173323e-06, + "loss": 0.0197, + "num_tokens": 68880163.0, + "reward": 0.6308687888085842, + "reward_std": 0.18038197979331017, + "rewards/code_reward": 0.4808687809854746, + "rewards/format_reward": 1.5, + "step": 2929 + }, + { + "clip_ratio": 0.004621168947778642, + "epoch": 0.10932325917634439, + "grad_norm": 0.09182540327310562, + "kl": 0.0261383056640625, + "learning_rate": 4.474313090561606e-06, + "loss": 0.0195, + "step": 2930 + }, + { + "clip_ratio": 0.004962392849847674, + "epoch": 0.10936057086889604, + "grad_norm": 0.08962463587522507, + "kl": 0.0267791748046875, + "learning_rate": 4.471475153309061e-06, + "loss": 0.0191, + "step": 2931 + }, + { + "clip_ratio": 0.0028362928424030542, + "completion_length": 769.3750305175781, + "epoch": 0.1093978825614477, + "grad_norm": 0.07728756964206696, + "kl": 0.022979736328125, + "learning_rate": 4.468637647606437e-06, + "loss": 0.0268, + "num_tokens": 68959438.0, + "reward": 0.6330687887966633, + "reward_std": 0.24238169565796852, + "rewards/code_reward": 0.4884259160608053, + "rewards/format_reward": 1.4464285671710968, + "step": 2932 + }, + { + "clip_ratio": 0.0027135724667459726, + "epoch": 0.10943519425399935, + "grad_norm": 0.07221934199333191, + "kl": 0.022979736328125, + "learning_rate": 4.465800574644299e-06, + "loss": 0.0266, + "step": 2933 + }, + { + "clip_ratio": 0.0030113989487290382, + "epoch": 0.109472505946551, + "grad_norm": 0.07270821183919907, + "kl": 0.023040771484375, + "learning_rate": 4.462963935613027e-06, + "loss": 0.0267, + "step": 2934 + }, + { + "clip_ratio": 0.003622763848397881, + "completion_length": 679.785758972168, + "epoch": 0.10950981763910266, + "grad_norm": 0.12248256057500839, + "kl": 0.0201568603515625, + "learning_rate": 4.460127731702825e-06, + "loss": -0.0129, + "num_tokens": 69027242.0, + "reward": 0.5375035405158997, + "reward_std": 0.2740888763219118, + "rewards/code_reward": 0.3875035345554352, + "rewards/format_reward": 1.5, + "step": 2935 + }, + { + "clip_ratio": 0.0033785777632147074, + "epoch": 0.10954712933165431, + "grad_norm": 0.09558549523353577, + "kl": 0.024383544921875, + "learning_rate": 4.4572919641037104e-06, + "loss": -0.0135, + "step": 2936 + }, + { + "clip_ratio": 0.004297375737223774, + "epoch": 0.10958444102420596, + "grad_norm": 0.09491196274757385, + "kl": 0.022186279296875, + "learning_rate": 4.454456634005519e-06, + "loss": -0.0136, + "step": 2937 + }, + { + "clip_ratio": 0.0018670640420168638, + "completion_length": 516.9464569091797, + "epoch": 0.10962175271675761, + "grad_norm": 0.0672241747379303, + "kl": 0.018402099609375, + "learning_rate": 4.451621742597904e-06, + "loss": -0.0079, + "num_tokens": 69086669.0, + "reward": 0.9416666701436043, + "reward_std": 0.11305591464042664, + "rewards/code_reward": 0.7916666716337204, + "rewards/format_reward": 1.5, + "step": 2938 + }, + { + "clip_ratio": 0.0016208466258831322, + "epoch": 0.10965906440930927, + "grad_norm": 0.06621387600898743, + "kl": 0.0185546875, + "learning_rate": 4.4487872910703285e-06, + "loss": -0.0079, + "step": 2939 + }, + { + "clip_ratio": 0.0017721927724778652, + "epoch": 0.10969637610186092, + "grad_norm": 0.06673406809568405, + "kl": 0.0185699462890625, + "learning_rate": 4.445953280612081e-06, + "loss": -0.0078, + "step": 2940 + }, + { + "clip_ratio": 0.0032709165825508535, + "completion_length": 647.4464492797852, + "epoch": 0.10973368779441257, + "grad_norm": 0.0934496745467186, + "kl": 0.0281524658203125, + "learning_rate": 4.443119712412256e-06, + "loss": -0.0023, + "num_tokens": 69165248.0, + "reward": 0.885576918721199, + "reward_std": 0.15924466773867607, + "rewards/code_reward": 0.7355769071727991, + "rewards/format_reward": 1.5, + "step": 2941 + }, + { + "clip_ratio": 0.0032091091270558536, + "epoch": 0.10977099948696423, + "grad_norm": 0.08463118970394135, + "kl": 0.028106689453125, + "learning_rate": 4.440286587659768e-06, + "loss": -0.0023, + "step": 2942 + }, + { + "clip_ratio": 0.0029491817695088685, + "epoch": 0.10980831117951588, + "grad_norm": 0.3200570344924927, + "kl": 0.0263214111328125, + "learning_rate": 4.4374539075433454e-06, + "loss": -0.0027, + "step": 2943 + }, + { + "clip_ratio": 0.003523615130688995, + "completion_length": 544.6250305175781, + "epoch": 0.10984562287206753, + "grad_norm": 0.1282445788383484, + "kl": 0.024383544921875, + "learning_rate": 4.434621673251522e-06, + "loss": 0.0058, + "num_tokens": 69221633.0, + "reward": 0.7655087262392044, + "reward_std": 0.22565694898366928, + "rewards/code_reward": 0.6155087221413851, + "rewards/format_reward": 1.5, + "step": 2944 + }, + { + "clip_ratio": 0.003741673950571567, + "epoch": 0.10988293456461919, + "grad_norm": 0.09422527253627777, + "kl": 0.02484130859375, + "learning_rate": 4.431789885972657e-06, + "loss": 0.0055, + "step": 2945 + }, + { + "clip_ratio": 0.003189508803188801, + "epoch": 0.10992024625717084, + "grad_norm": 0.08765909075737, + "kl": 0.02374267578125, + "learning_rate": 4.428958546894911e-06, + "loss": 0.005, + "step": 2946 + }, + { + "clip_ratio": 0.0037156579783186316, + "completion_length": 582.0000305175781, + "epoch": 0.10995755794972249, + "grad_norm": 0.08346793055534363, + "kl": 0.024261474609375, + "learning_rate": 4.426127657206265e-06, + "loss": 0.0078, + "num_tokens": 69281273.0, + "reward": 0.3613095246255398, + "reward_std": 0.21403182856738567, + "rewards/code_reward": 0.21130952471867204, + "rewards/format_reward": 1.5, + "step": 2947 + }, + { + "clip_ratio": 0.004560734145343304, + "epoch": 0.10999486964227415, + "grad_norm": 0.08248219639062881, + "kl": 0.02410888671875, + "learning_rate": 4.423297218094508e-06, + "loss": 0.0078, + "step": 2948 + }, + { + "clip_ratio": 0.004130157991312444, + "epoch": 0.1100321813348258, + "grad_norm": 0.08055084943771362, + "kl": 0.0245361328125, + "learning_rate": 4.420467230747234e-06, + "loss": 0.0076, + "step": 2949 + }, + { + "clip_ratio": 0.0034289007890038192, + "completion_length": 617.3571624755859, + "epoch": 0.11006949302737745, + "grad_norm": 0.05857350304722786, + "kl": 0.02227783203125, + "learning_rate": 4.417637696351861e-06, + "loss": -0.001, + "num_tokens": 69350243.0, + "reward": 0.6708723790943623, + "reward_std": 0.12339125573635101, + "rewards/code_reward": 0.5208723666146398, + "rewards/format_reward": 1.5, + "step": 2950 + }, + { + "clip_ratio": 0.00257763231638819, + "epoch": 0.1101068047199291, + "grad_norm": 0.05773884430527687, + "kl": 0.0222015380859375, + "learning_rate": 4.414808616095602e-06, + "loss": -0.0009, + "step": 2951 + }, + { + "clip_ratio": 0.002656281809322536, + "epoch": 0.11014411641248076, + "grad_norm": 0.05242821201682091, + "kl": 0.0220489501953125, + "learning_rate": 4.41197999116549e-06, + "loss": -0.0013, + "step": 2952 + }, + { + "clip_ratio": 0.002027360606007278, + "completion_length": 608.2143173217773, + "epoch": 0.11018142810503241, + "grad_norm": 0.06210771203041077, + "kl": 0.0176849365234375, + "learning_rate": 4.409151822748365e-06, + "loss": -0.0021, + "num_tokens": 69413353.0, + "reward": 0.7343879975378513, + "reward_std": 0.08607378415763378, + "rewards/code_reward": 0.5843879976309836, + "rewards/format_reward": 1.5, + "step": 2953 + }, + { + "clip_ratio": 0.0020036973292008042, + "epoch": 0.11021873979758406, + "grad_norm": 0.05830490216612816, + "kl": 0.017608642578125, + "learning_rate": 4.40632411203087e-06, + "loss": -0.0023, + "step": 2954 + }, + { + "clip_ratio": 0.0020503528066910803, + "epoch": 0.11025605149013572, + "grad_norm": 0.05405324324965477, + "kl": 0.0181121826171875, + "learning_rate": 4.403496860199462e-06, + "loss": -0.0024, + "step": 2955 + }, + { + "clip_ratio": 0.0030272369040176272, + "completion_length": 632.2678909301758, + "epoch": 0.11029336318268737, + "grad_norm": 0.07044254243373871, + "kl": 0.02532958984375, + "learning_rate": 4.400670068440406e-06, + "loss": 0.0099, + "num_tokens": 69492440.0, + "reward": 0.7394303977489471, + "reward_std": 0.30022473633289337, + "rewards/code_reward": 0.5894303694367409, + "rewards/format_reward": 1.5, + "step": 2956 + }, + { + "clip_ratio": 0.002899000304751098, + "epoch": 0.11033067487523902, + "grad_norm": 0.07264624536037445, + "kl": 0.02520751953125, + "learning_rate": 4.397843737939766e-06, + "loss": 0.0098, + "step": 2957 + }, + { + "clip_ratio": 0.002813757280819118, + "epoch": 0.11036798656779068, + "grad_norm": 0.06744924932718277, + "kl": 0.02490234375, + "learning_rate": 4.395017869883421e-06, + "loss": 0.0094, + "step": 2958 + }, + { + "clip_ratio": 0.004077081161085516, + "completion_length": 672.0893249511719, + "epoch": 0.11040529826034233, + "grad_norm": 0.08540528267621994, + "kl": 0.027374267578125, + "learning_rate": 4.3921924654570515e-06, + "loss": -0.0014, + "num_tokens": 69567121.0, + "reward": 0.6760487332940102, + "reward_std": 0.4420721158385277, + "rewards/code_reward": 0.5260487161576748, + "rewards/format_reward": 1.5, + "step": 2959 + }, + { + "clip_ratio": 0.0040788460173644125, + "epoch": 0.11044260995289398, + "grad_norm": 0.07732630521059036, + "kl": 0.02679443359375, + "learning_rate": 4.389367525846146e-06, + "loss": -0.0016, + "step": 2960 + }, + { + "clip_ratio": 0.003831312176771462, + "epoch": 0.11047992164544564, + "grad_norm": 0.0824175775051117, + "kl": 0.027374267578125, + "learning_rate": 4.386543052235998e-06, + "loss": -0.0019, + "step": 2961 + }, + { + "clip_ratio": 0.004258432716596872, + "completion_length": 673.2678985595703, + "epoch": 0.11051723333799729, + "grad_norm": 0.09142318367958069, + "kl": 0.021453857421875, + "learning_rate": 4.383719045811699e-06, + "loss": -0.006, + "num_tokens": 69646376.0, + "reward": 0.8823220133781433, + "reward_std": 0.22496878076344728, + "rewards/code_reward": 0.7323219776153564, + "rewards/format_reward": 1.5, + "step": 2962 + }, + { + "clip_ratio": 0.004094799805898219, + "epoch": 0.11055454503054894, + "grad_norm": 0.09351271390914917, + "kl": 0.0227203369140625, + "learning_rate": 4.380895507758155e-06, + "loss": -0.006, + "step": 2963 + }, + { + "clip_ratio": 0.003544554812833667, + "epoch": 0.1105918567231006, + "grad_norm": 0.08562523871660233, + "kl": 0.0221405029296875, + "learning_rate": 4.378072439260066e-06, + "loss": -0.0065, + "step": 2964 + }, + { + "clip_ratio": 0.003892105945851654, + "completion_length": 960.9464874267578, + "epoch": 0.11062916841565225, + "grad_norm": 0.07384706288576126, + "kl": 0.0181732177734375, + "learning_rate": 4.3752498415019415e-06, + "loss": 0.0005, + "num_tokens": 69736225.0, + "reward": 0.28664596751332283, + "reward_std": 0.24167287349700928, + "rewards/code_reward": 0.13664596620947123, + "rewards/format_reward": 1.5, + "step": 2965 + }, + { + "clip_ratio": 0.0037096236483193934, + "epoch": 0.1106664801082039, + "grad_norm": 0.08283380419015884, + "kl": 0.018035888671875, + "learning_rate": 4.372427715668093e-06, + "loss": 0.0002, + "step": 2966 + }, + { + "clip_ratio": 0.0036383914994075894, + "epoch": 0.11070379180075555, + "grad_norm": 0.0823279395699501, + "kl": 0.0181732177734375, + "learning_rate": 4.369606062942627e-06, + "loss": 0.0002, + "step": 2967 + }, + { + "clip_ratio": 0.003851635497994721, + "completion_length": 491.25001525878906, + "epoch": 0.11074110349330722, + "grad_norm": 0.0942634791135788, + "kl": 0.029022216796875, + "learning_rate": 4.366784884509461e-06, + "loss": 0.0302, + "num_tokens": 69785155.0, + "reward": 0.7446525916457176, + "reward_std": 0.2094252146780491, + "rewards/code_reward": 0.5973311848938465, + "rewards/format_reward": 1.4732142984867096, + "step": 2968 + }, + { + "clip_ratio": 0.003619480994530022, + "epoch": 0.11077841518585888, + "grad_norm": 0.08488435298204422, + "kl": 0.0296630859375, + "learning_rate": 4.363964181552306e-06, + "loss": 0.03, + "step": 2969 + }, + { + "clip_ratio": 0.0035381666966713965, + "epoch": 0.11081572687841053, + "grad_norm": 0.0846874788403511, + "kl": 0.028045654296875, + "learning_rate": 4.361143955254677e-06, + "loss": 0.0297, + "step": 2970 + }, + { + "clip_ratio": 0.0033654365106485784, + "completion_length": 583.6607437133789, + "epoch": 0.11085303857096218, + "grad_norm": 0.06867488473653793, + "kl": 0.023529052734375, + "learning_rate": 4.358324206799892e-06, + "loss": 0.0387, + "num_tokens": 69842438.0, + "reward": 0.49383974075317383, + "reward_std": 0.16386864241212606, + "rewards/code_reward": 0.3484826012281701, + "rewards/format_reward": 1.4535714089870453, + "step": 2971 + }, + { + "clip_ratio": 0.0026822987128980458, + "epoch": 0.11089035026351383, + "grad_norm": 0.06699175387620926, + "kl": 0.02423095703125, + "learning_rate": 4.35550493737106e-06, + "loss": 0.0385, + "step": 2972 + }, + { + "clip_ratio": 0.0028905303915962577, + "epoch": 0.11092766195606549, + "grad_norm": 0.06549711525440216, + "kl": 0.02447509765625, + "learning_rate": 4.3526861481511e-06, + "loss": 0.0384, + "step": 2973 + }, + { + "clip_ratio": 0.0033400754327885807, + "completion_length": 608.0000228881836, + "epoch": 0.11096497364861714, + "grad_norm": 0.06865067034959793, + "kl": 0.02691650390625, + "learning_rate": 4.349867840322717e-06, + "loss": -0.0061, + "num_tokens": 69902862.0, + "reward": 0.6955952756106853, + "reward_std": 0.04865932837128639, + "rewards/code_reward": 0.5455952361226082, + "rewards/format_reward": 1.5, + "step": 2974 + }, + { + "clip_ratio": 0.0030678636976517737, + "epoch": 0.1110022853411688, + "grad_norm": 0.05943227559328079, + "kl": 0.0269775390625, + "learning_rate": 4.347050015068425e-06, + "loss": -0.0061, + "step": 2975 + }, + { + "clip_ratio": 0.0030711182043887675, + "epoch": 0.11103959703372045, + "grad_norm": 0.05627545714378357, + "kl": 0.02728271484375, + "learning_rate": 4.34423267357053e-06, + "loss": -0.0063, + "step": 2976 + }, + { + "clip_ratio": 0.0024751779856160283, + "completion_length": 747.553596496582, + "epoch": 0.1110769087262721, + "grad_norm": 0.07031029462814331, + "kl": 0.0192718505859375, + "learning_rate": 4.3414158170111365e-06, + "loss": 0.0175, + "num_tokens": 69972991.0, + "reward": 0.6784560903906822, + "reward_std": 0.11999355629086494, + "rewards/code_reward": 0.5284560779109597, + "rewards/format_reward": 1.5, + "step": 2977 + }, + { + "clip_ratio": 0.0028321509016677737, + "epoch": 0.11111422041882375, + "grad_norm": 0.07928816229104996, + "kl": 0.01959228515625, + "learning_rate": 4.338599446572145e-06, + "loss": 0.0176, + "step": 2978 + }, + { + "clip_ratio": 0.0028959542978554964, + "epoch": 0.1111515321113754, + "grad_norm": 0.06614828109741211, + "kl": 0.0192108154296875, + "learning_rate": 4.335783563435251e-06, + "loss": 0.0175, + "step": 2979 + }, + { + "clip_ratio": 0.003921608382370323, + "completion_length": 802.9107513427734, + "epoch": 0.11118884380392706, + "grad_norm": 0.07165351510047913, + "kl": 0.0196075439453125, + "learning_rate": 4.332968168781948e-06, + "loss": -0.0174, + "num_tokens": 70053598.0, + "reward": 0.29675455018877983, + "reward_std": 0.20220846310257912, + "rewards/code_reward": 0.14675454050302505, + "rewards/format_reward": 1.5, + "step": 2980 + }, + { + "clip_ratio": 0.0037033851840533316, + "epoch": 0.11122615549647871, + "grad_norm": 0.06882625818252563, + "kl": 0.01995849609375, + "learning_rate": 4.330153263793524e-06, + "loss": -0.0176, + "step": 2981 + }, + { + "clip_ratio": 0.003863225574605167, + "epoch": 0.11126346718903037, + "grad_norm": 0.06462590396404266, + "kl": 0.0197296142578125, + "learning_rate": 4.327338849651058e-06, + "loss": -0.0177, + "step": 2982 + }, + { + "clip_ratio": 0.0012818727409467101, + "completion_length": 563.1071701049805, + "epoch": 0.11130077888158202, + "grad_norm": 0.014953583478927612, + "kl": 0.024810791015625, + "learning_rate": 4.324524927535432e-06, + "loss": 0.0003, + "num_tokens": 70109520.0, + "reward": 0.899999987334013, + "reward_std": 0.0, + "rewards/code_reward": 0.75, + "rewards/format_reward": 1.5, + "step": 2983 + }, + { + "clip_ratio": 0.0013800527667626739, + "epoch": 0.11133809057413367, + "grad_norm": 0.009075930342078209, + "kl": 0.022247314453125, + "learning_rate": 4.321711498627307e-06, + "loss": 0.0003, + "step": 2984 + }, + { + "clip_ratio": 0.0011141457362100482, + "epoch": 0.11137540226668532, + "grad_norm": 0.014828338287770748, + "kl": 0.024505615234375, + "learning_rate": 4.318898564107149e-06, + "loss": 0.0003, + "step": 2985 + }, + { + "clip_ratio": 0.0034581819199956954, + "completion_length": 929.357177734375, + "epoch": 0.11141271395923698, + "grad_norm": 0.06872215867042542, + "kl": 0.0190582275390625, + "learning_rate": 4.3160861251552175e-06, + "loss": -0.0087, + "num_tokens": 70194142.0, + "reward": 0.709020584821701, + "reward_std": 0.24667158350348473, + "rewards/code_reward": 0.5590205639600754, + "rewards/format_reward": 1.5, + "step": 2986 + }, + { + "clip_ratio": 0.003405319293960929, + "epoch": 0.11145002565178863, + "grad_norm": 0.06825317442417145, + "kl": 0.01885986328125, + "learning_rate": 4.313274182951555e-06, + "loss": -0.0088, + "step": 2987 + }, + { + "clip_ratio": 0.003637082816567272, + "epoch": 0.11148733734434028, + "grad_norm": 0.06702792644500732, + "kl": 0.019073486328125, + "learning_rate": 4.310462738675999e-06, + "loss": -0.0089, + "step": 2988 + }, + { + "clip_ratio": 0.003061146999243647, + "completion_length": 675.0893249511719, + "epoch": 0.11152464903689194, + "grad_norm": 0.07139483094215393, + "kl": 0.018707275390625, + "learning_rate": 4.307651793508184e-06, + "loss": 0.0064, + "num_tokens": 70257945.0, + "reward": 0.8005877062678337, + "reward_std": 0.17437280039303005, + "rewards/code_reward": 0.6541591576533392, + "rewards/format_reward": 1.4642857015132904, + "step": 2989 + }, + { + "clip_ratio": 0.002432606997899711, + "epoch": 0.11156196072944359, + "grad_norm": 0.06962662935256958, + "kl": 0.018524169921875, + "learning_rate": 4.304841348627527e-06, + "loss": 0.006, + "step": 2990 + }, + { + "clip_ratio": 0.0030320139485411346, + "epoch": 0.11159927242199524, + "grad_norm": 0.07072708010673523, + "kl": 0.019134521484375, + "learning_rate": 4.3020314052132405e-06, + "loss": 0.0059, + "step": 2991 + }, + { + "clip_ratio": 0.002050451294053346, + "completion_length": 726.2678985595703, + "epoch": 0.1116365841145469, + "grad_norm": 0.12934108078479767, + "kl": 0.029144287109375, + "learning_rate": 4.299221964444322e-06, + "loss": 0.0075, + "num_tokens": 70331054.0, + "reward": 1.0669319927692413, + "reward_std": 0.12876926735043526, + "rewards/code_reward": 0.9196105599403381, + "rewards/format_reward": 1.4732142984867096, + "step": 2992 + }, + { + "clip_ratio": 0.0018871183274313807, + "epoch": 0.11167389580709855, + "grad_norm": 0.054675519466400146, + "kl": 0.02886962890625, + "learning_rate": 4.296413027499562e-06, + "loss": 0.0073, + "step": 2993 + }, + { + "clip_ratio": 0.001741601328831166, + "epoch": 0.1117112074996502, + "grad_norm": 0.052578896284103394, + "kl": 0.028411865234375, + "learning_rate": 4.293604595557539e-06, + "loss": 0.0073, + "step": 2994 + }, + { + "clip_ratio": 0.0033088134368881583, + "completion_length": 664.6071624755859, + "epoch": 0.11174851919220186, + "grad_norm": 0.05912527069449425, + "kl": 0.021484375, + "learning_rate": 4.290796669796618e-06, + "loss": -0.0037, + "num_tokens": 70404826.0, + "reward": 0.5315417312085629, + "reward_std": 0.15949298441410065, + "rewards/code_reward": 0.38154173269867897, + "rewards/format_reward": 1.5, + "step": 2995 + }, + { + "clip_ratio": 0.003196065837983042, + "epoch": 0.11178583088475351, + "grad_norm": 0.06042885407805443, + "kl": 0.021026611328125, + "learning_rate": 4.287989251394954e-06, + "loss": -0.0037, + "step": 2996 + }, + { + "clip_ratio": 0.0029202717123553157, + "epoch": 0.11182314257730516, + "grad_norm": 0.051448725163936615, + "kl": 0.020965576171875, + "learning_rate": 4.285182341530485e-06, + "loss": -0.004, + "step": 2997 + }, + { + "clip_ratio": 0.002599485742393881, + "completion_length": 681.8214569091797, + "epoch": 0.11186045426985682, + "grad_norm": 0.048376522958278656, + "kl": 0.0205230712890625, + "learning_rate": 4.282375941380938e-06, + "loss": -0.0099, + "num_tokens": 70469678.0, + "reward": 0.8285714499652386, + "reward_std": 0.18156826496124268, + "rewards/code_reward": 0.6785714328289032, + "rewards/format_reward": 1.5, + "step": 2998 + }, + { + "clip_ratio": 0.002881379914470017, + "epoch": 0.11189776596240847, + "grad_norm": 0.0477842278778553, + "kl": 0.0197906494140625, + "learning_rate": 4.279570052123832e-06, + "loss": -0.0098, + "step": 2999 + }, + { + "clip_ratio": 0.0025465008802711964, + "epoch": 0.11193507765496012, + "grad_norm": 0.0473262183368206, + "kl": 0.019989013671875, + "learning_rate": 4.2767646749364574e-06, + "loss": -0.0099, + "step": 3000 + }, + { + "clip_ratio": 0.0025254194624722004, + "completion_length": 503.9107360839844, + "epoch": 0.11197238934751177, + "grad_norm": 0.07656988501548767, + "kl": 0.02935791015625, + "learning_rate": 4.273959810995907e-06, + "loss": -0.0133, + "num_tokens": 70524567.0, + "reward": 0.9892857074737549, + "reward_std": 0.29758043587207794, + "rewards/code_reward": 0.8392857015132904, + "rewards/format_reward": 1.5, + "step": 3001 + }, + { + "clip_ratio": 0.0024601526092737913, + "epoch": 0.11200970104006343, + "grad_norm": 0.09326240420341492, + "kl": 0.030181884765625, + "learning_rate": 4.271155461479042e-06, + "loss": -0.0135, + "step": 3002 + }, + { + "clip_ratio": 0.0020561309647746384, + "epoch": 0.11204701273261508, + "grad_norm": 0.07567526400089264, + "kl": 0.029083251953125, + "learning_rate": 4.268351627562519e-06, + "loss": -0.0138, + "step": 3003 + }, + { + "clip_ratio": 0.0036708847619593143, + "completion_length": 600.2678833007812, + "epoch": 0.11208432442516673, + "grad_norm": 0.07572196424007416, + "kl": 0.03021240234375, + "learning_rate": 4.265548310422778e-06, + "loss": 0.0008, + "num_tokens": 70589752.0, + "reward": 0.6031578779220581, + "reward_std": 0.1865740902721882, + "rewards/code_reward": 0.4551221802830696, + "rewards/format_reward": 1.480357140302658, + "step": 3004 + }, + { + "clip_ratio": 0.0028506204835139215, + "epoch": 0.11212163611771839, + "grad_norm": 0.07397118210792542, + "kl": 0.02838134765625, + "learning_rate": 4.262745511236032e-06, + "loss": 0.0006, + "step": 3005 + }, + { + "clip_ratio": 0.002507304190658033, + "epoch": 0.11215894781027004, + "grad_norm": 0.07148142904043198, + "kl": 0.030059814453125, + "learning_rate": 4.25994323117829e-06, + "loss": 0.0004, + "step": 3006 + }, + { + "clip_ratio": 0.005121662165038288, + "completion_length": 769.732177734375, + "epoch": 0.1121962595028217, + "grad_norm": 0.08421896398067474, + "kl": 0.027374267578125, + "learning_rate": 4.257141471425329e-06, + "loss": 0.0073, + "num_tokens": 70666569.0, + "reward": 0.4911053776741028, + "reward_std": 0.313351646065712, + "rewards/code_reward": 0.3411053828895092, + "rewards/format_reward": 1.5, + "step": 3007 + }, + { + "clip_ratio": 0.004378562793135643, + "epoch": 0.11223357119537335, + "grad_norm": 0.08445079624652863, + "kl": 0.027374267578125, + "learning_rate": 4.2543402331527215e-06, + "loss": 0.007, + "step": 3008 + }, + { + "clip_ratio": 0.004152984591200948, + "epoch": 0.112270882887925, + "grad_norm": 0.07885698229074478, + "kl": 0.027984619140625, + "learning_rate": 4.251539517535815e-06, + "loss": 0.0067, + "step": 3009 + }, + { + "clip_ratio": 0.003310724685434252, + "completion_length": 874.8571929931641, + "epoch": 0.11230819458047665, + "grad_norm": 0.057700395584106445, + "kl": 0.018798828125, + "learning_rate": 4.248739325749736e-06, + "loss": 0.0158, + "num_tokens": 70752591.0, + "reward": 0.5047187358140945, + "reward_std": 0.06491262465715408, + "rewards/code_reward": 0.3573972964659333, + "rewards/format_reward": 1.4732142984867096, + "step": 3010 + }, + { + "clip_ratio": 0.0028742238646373153, + "epoch": 0.1123455062730283, + "grad_norm": 0.0561402253806591, + "kl": 0.0186767578125, + "learning_rate": 4.245939658969391e-06, + "loss": 0.0157, + "step": 3011 + }, + { + "clip_ratio": 0.0033147084177471697, + "epoch": 0.11238281796557996, + "grad_norm": 0.0573740191757679, + "kl": 0.01873779296875, + "learning_rate": 4.2431405183694714e-06, + "loss": 0.0156, + "step": 3012 + }, + { + "clip_ratio": 0.0047564058331772685, + "completion_length": 635.1250305175781, + "epoch": 0.11242012965813161, + "grad_norm": 0.06800897419452667, + "kl": 0.023162841796875, + "learning_rate": 4.240341905124443e-06, + "loss": 0.0035, + "num_tokens": 70821240.0, + "reward": 0.5962256565690041, + "reward_std": 0.0568082258105278, + "rewards/code_reward": 0.446225643157959, + "rewards/format_reward": 1.5, + "step": 3013 + }, + { + "clip_ratio": 0.004576051025651395, + "epoch": 0.11245744135068327, + "grad_norm": 0.0664023607969284, + "kl": 0.02264404296875, + "learning_rate": 4.237543820408554e-06, + "loss": 0.0032, + "step": 3014 + }, + { + "clip_ratio": 0.0043716830550692976, + "epoch": 0.11249475304323492, + "grad_norm": 0.0701705664396286, + "kl": 0.022674560546875, + "learning_rate": 4.234746265395825e-06, + "loss": 0.0032, + "step": 3015 + }, + { + "clip_ratio": 0.002510272286599502, + "completion_length": 646.0714645385742, + "epoch": 0.11253206473578657, + "grad_norm": 0.05173566937446594, + "kl": 0.02459716796875, + "learning_rate": 4.231949241260061e-06, + "loss": 0.0045, + "num_tokens": 70884986.0, + "reward": 0.6539318673312664, + "reward_std": 0.12047728896141052, + "rewards/code_reward": 0.5039318464696407, + "rewards/format_reward": 1.5, + "step": 3016 + }, + { + "clip_ratio": 0.0028835536795668304, + "epoch": 0.11256937642833822, + "grad_norm": 0.04928164929151535, + "kl": 0.02447509765625, + "learning_rate": 4.22915274917484e-06, + "loss": 0.0048, + "step": 3017 + }, + { + "clip_ratio": 0.0028227324364706874, + "epoch": 0.11260668812088988, + "grad_norm": 0.059352051466703415, + "kl": 0.0247344970703125, + "learning_rate": 4.226356790313519e-06, + "loss": 0.0045, + "step": 3018 + }, + { + "clip_ratio": 0.0037241256213746965, + "completion_length": 685.2500305175781, + "epoch": 0.11264399981344153, + "grad_norm": 0.06900251656770706, + "kl": 0.027099609375, + "learning_rate": 4.223561365849231e-06, + "loss": -0.0055, + "num_tokens": 70952200.0, + "reward": 0.5498309098184109, + "reward_std": 0.22028922010213137, + "rewards/code_reward": 0.3998308628797531, + "rewards/format_reward": 1.5, + "step": 3019 + }, + { + "clip_ratio": 0.00354582944419235, + "epoch": 0.11268131150599318, + "grad_norm": 0.06811651587486267, + "kl": 0.02679443359375, + "learning_rate": 4.220766476954882e-06, + "loss": -0.0056, + "step": 3020 + }, + { + "clip_ratio": 0.003778191574383527, + "epoch": 0.11271862319854484, + "grad_norm": 0.07143990695476532, + "kl": 0.026458740234375, + "learning_rate": 4.217972124803157e-06, + "loss": -0.0057, + "step": 3021 + }, + { + "clip_ratio": 0.003015047637745738, + "completion_length": 696.1964492797852, + "epoch": 0.1127559348910965, + "grad_norm": 0.08888174593448639, + "kl": 0.0240478515625, + "learning_rate": 4.215178310566515e-06, + "loss": -0.0011, + "num_tokens": 71031395.0, + "reward": 0.7223469279706478, + "reward_std": 0.2320151999592781, + "rewards/code_reward": 0.5723469406366348, + "rewards/format_reward": 1.5, + "step": 3022 + }, + { + "clip_ratio": 0.0030442160204984248, + "epoch": 0.11279324658364816, + "grad_norm": 0.07431123405694962, + "kl": 0.02386474609375, + "learning_rate": 4.212385035417188e-06, + "loss": -0.0013, + "step": 3023 + }, + { + "clip_ratio": 0.003158289473503828, + "epoch": 0.11283055827619981, + "grad_norm": 0.08487582951784134, + "kl": 0.024505615234375, + "learning_rate": 4.209592300527181e-06, + "loss": -0.0015, + "step": 3024 + }, + { + "clip_ratio": 0.00274714914849028, + "completion_length": 595.9107437133789, + "epoch": 0.11286786996875146, + "grad_norm": 0.07766856998205185, + "kl": 0.025115966796875, + "learning_rate": 4.206800107068274e-06, + "loss": 0.0431, + "num_tokens": 71090330.0, + "reward": 0.6411814726889133, + "reward_std": 0.21824651211500168, + "rewards/code_reward": 0.4938600603491068, + "rewards/format_reward": 1.4732142984867096, + "step": 3025 + }, + { + "clip_ratio": 0.002766778750810772, + "epoch": 0.11290518166130312, + "grad_norm": 0.07735113799571991, + "kl": 0.025787353515625, + "learning_rate": 4.20400845621202e-06, + "loss": 0.043, + "step": 3026 + }, + { + "clip_ratio": 0.002327118389075622, + "epoch": 0.11294249335385477, + "grad_norm": 0.0737922191619873, + "kl": 0.0247802734375, + "learning_rate": 4.201217349129746e-06, + "loss": 0.0428, + "step": 3027 + }, + { + "clip_ratio": 0.0015893708332441747, + "completion_length": 517.8393020629883, + "epoch": 0.11297980504640642, + "grad_norm": 0.06774024665355682, + "kl": 0.02777099609375, + "learning_rate": 4.198426786992545e-06, + "loss": 0.0015, + "num_tokens": 71151627.0, + "reward": 1.1147893965244293, + "reward_std": 0.09164615860208869, + "rewards/code_reward": 0.966575101017952, + "rewards/format_reward": 1.4821428656578064, + "step": 3028 + }, + { + "clip_ratio": 0.00135347421746701, + "epoch": 0.11301711673895808, + "grad_norm": 0.05836808681488037, + "kl": 0.029449462890625, + "learning_rate": 4.195636770971288e-06, + "loss": 0.0013, + "step": 3029 + }, + { + "clip_ratio": 0.0016863202909007668, + "epoch": 0.11305442843150973, + "grad_norm": 0.060341499745845795, + "kl": 0.028594970703125, + "learning_rate": 4.19284730223661e-06, + "loss": 0.0014, + "step": 3030 + }, + { + "clip_ratio": 0.00317970389733091, + "completion_length": 496.6071548461914, + "epoch": 0.11309174012406138, + "grad_norm": 0.07134660333395004, + "kl": 0.0186767578125, + "learning_rate": 4.190058381958925e-06, + "loss": 0.0055, + "num_tokens": 71203979.0, + "reward": 0.8832582980394363, + "reward_std": 0.07778435945510864, + "rewards/code_reward": 0.7332582622766495, + "rewards/format_reward": 1.5, + "step": 3031 + }, + { + "clip_ratio": 0.0022724865120835602, + "epoch": 0.11312905181661304, + "grad_norm": 0.06887272000312805, + "kl": 0.018524169921875, + "learning_rate": 4.187270011308411e-06, + "loss": 0.0054, + "step": 3032 + }, + { + "clip_ratio": 0.002570230746641755, + "epoch": 0.11316636350916469, + "grad_norm": 0.06947717070579529, + "kl": 0.018585205078125, + "learning_rate": 4.184482191455014e-06, + "loss": 0.005, + "step": 3033 + }, + { + "clip_ratio": 0.003842545673251152, + "completion_length": 629.9464569091797, + "epoch": 0.11320367520171634, + "grad_norm": 0.08588096499443054, + "kl": 0.0244598388671875, + "learning_rate": 4.181694923568454e-06, + "loss": -0.0016, + "num_tokens": 71269482.0, + "reward": 0.4721444323658943, + "reward_std": 0.092388212447986, + "rewards/code_reward": 0.3221444240771234, + "rewards/format_reward": 1.5, + "step": 3034 + }, + { + "clip_ratio": 0.003811804694123566, + "epoch": 0.113240986894268, + "grad_norm": 0.08326318114995956, + "kl": 0.024810791015625, + "learning_rate": 4.1789082088182135e-06, + "loss": -0.0017, + "step": 3035 + }, + { + "clip_ratio": 0.0035835584276355803, + "epoch": 0.11327829858681965, + "grad_norm": 0.08203893899917603, + "kl": 0.02496337890625, + "learning_rate": 4.17612204837355e-06, + "loss": -0.0018, + "step": 3036 + }, + { + "clip_ratio": 0.0025600182125344872, + "completion_length": 693.5536041259766, + "epoch": 0.1133156102793713, + "grad_norm": 0.053259797394275665, + "kl": 0.018951416015625, + "learning_rate": 4.173336443403483e-06, + "loss": -0.0101, + "num_tokens": 71349103.0, + "reward": 0.7749999910593033, + "reward_std": 0.19520344585180283, + "rewards/code_reward": 0.6250000018626451, + "rewards/format_reward": 1.5, + "step": 3037 + }, + { + "clip_ratio": 0.0024038004921749234, + "epoch": 0.11335292197192295, + "grad_norm": 0.0515618659555912, + "kl": 0.0183868408203125, + "learning_rate": 4.170551395076801e-06, + "loss": -0.0102, + "step": 3038 + }, + { + "clip_ratio": 0.0018357158405706286, + "epoch": 0.11339023366447461, + "grad_norm": 0.048076219856739044, + "kl": 0.019012451171875, + "learning_rate": 4.167766904562058e-06, + "loss": -0.0103, + "step": 3039 + }, + { + "clip_ratio": 0.003080607973970473, + "completion_length": 763.8036041259766, + "epoch": 0.11342754535702626, + "grad_norm": 0.06787385046482086, + "kl": 0.0240478515625, + "learning_rate": 4.164982973027576e-06, + "loss": -0.0011, + "num_tokens": 71425070.0, + "reward": 0.6367346979677677, + "reward_std": 0.15867429599165916, + "rewards/code_reward": 0.48673469573259354, + "rewards/format_reward": 1.5, + "step": 3040 + }, + { + "clip_ratio": 0.0028518802719190717, + "epoch": 0.11346485704957791, + "grad_norm": 0.06597809493541718, + "kl": 0.02423095703125, + "learning_rate": 4.162199601641438e-06, + "loss": -0.0013, + "step": 3041 + }, + { + "clip_ratio": 0.003038820519577712, + "epoch": 0.11350216874212957, + "grad_norm": 0.06204788386821747, + "kl": 0.02362060546875, + "learning_rate": 4.159416791571498e-06, + "loss": -0.0015, + "step": 3042 + }, + { + "clip_ratio": 0.0029866424156352878, + "completion_length": 715.232177734375, + "epoch": 0.11353948043468122, + "grad_norm": 0.07095342129468918, + "kl": 0.0204010009765625, + "learning_rate": 4.156634543985369e-06, + "loss": 0.001, + "num_tokens": 71496357.0, + "reward": 0.48164985701441765, + "reward_std": 0.10413644928485155, + "rewards/code_reward": 0.33164984825998545, + "rewards/format_reward": 1.5, + "step": 3043 + }, + { + "clip_ratio": 0.002923233318142593, + "epoch": 0.11357679212723287, + "grad_norm": 0.06864243745803833, + "kl": 0.020233154296875, + "learning_rate": 4.153852860050434e-06, + "loss": 0.0008, + "step": 3044 + }, + { + "clip_ratio": 0.002570969343651086, + "epoch": 0.11361410381978453, + "grad_norm": 0.06705319136381149, + "kl": 0.020294189453125, + "learning_rate": 4.151071740933832e-06, + "loss": 0.0005, + "step": 3045 + }, + { + "clip_ratio": 0.0030825440189801157, + "completion_length": 745.8214569091797, + "epoch": 0.11365141551233618, + "grad_norm": 0.08692969381809235, + "kl": 0.021484375, + "learning_rate": 4.148291187802469e-06, + "loss": 0.0021, + "num_tokens": 71584085.0, + "reward": 0.9215300977230072, + "reward_std": 0.13193211262114346, + "rewards/code_reward": 0.7715300917625427, + "rewards/format_reward": 1.5, + "step": 3046 + }, + { + "clip_ratio": 0.0032580666593275964, + "epoch": 0.11368872720488783, + "grad_norm": 0.08537603914737701, + "kl": 0.021514892578125, + "learning_rate": 4.145511201823016e-06, + "loss": 0.0022, + "step": 3047 + }, + { + "clip_ratio": 0.0028965890523977578, + "epoch": 0.11372603889743949, + "grad_norm": 0.0754326656460762, + "kl": 0.02178955078125, + "learning_rate": 4.142731784161902e-06, + "loss": 0.0016, + "step": 3048 + }, + { + "clip_ratio": 0.004240205336827785, + "completion_length": 772.7143096923828, + "epoch": 0.11376335058999114, + "grad_norm": 0.04997457191348076, + "kl": 0.01593017578125, + "learning_rate": 4.13995293598532e-06, + "loss": -0.0086, + "num_tokens": 71664541.0, + "reward": 0.3828231766819954, + "reward_std": 0.06865153042599559, + "rewards/code_reward": 0.23282312287483364, + "rewards/format_reward": 1.5, + "step": 3049 + }, + { + "clip_ratio": 0.0036333634052425623, + "epoch": 0.11380066228254279, + "grad_norm": 0.046446360647678375, + "kl": 0.0157318115234375, + "learning_rate": 4.137174658459223e-06, + "loss": -0.0087, + "step": 3050 + }, + { + "clip_ratio": 0.00418688275385648, + "epoch": 0.11383797397509444, + "grad_norm": 0.047518469393253326, + "kl": 0.015838623046875, + "learning_rate": 4.134396952749322e-06, + "loss": -0.0088, + "step": 3051 + }, + { + "clip_ratio": 0.004142048128414899, + "completion_length": 719.3214569091797, + "epoch": 0.1138752856676461, + "grad_norm": 0.08792991191148758, + "kl": 0.018524169921875, + "learning_rate": 4.131619820021095e-06, + "loss": 0.0322, + "num_tokens": 71735259.0, + "reward": 0.6254500634968281, + "reward_std": 0.26747364550828934, + "rewards/code_reward": 0.47545008547604084, + "rewards/format_reward": 1.5, + "step": 3052 + }, + { + "clip_ratio": 0.003890021878760308, + "epoch": 0.11391259736019775, + "grad_norm": 0.08239000290632248, + "kl": 0.01898193359375, + "learning_rate": 4.128843261439771e-06, + "loss": 0.0323, + "step": 3053 + }, + { + "clip_ratio": 0.0035623597213998437, + "epoch": 0.1139499090527494, + "grad_norm": 0.08649556338787079, + "kl": 0.0177001953125, + "learning_rate": 4.1260672781703445e-06, + "loss": 0.0316, + "step": 3054 + }, + { + "clip_ratio": 0.002817876695189625, + "completion_length": 677.2678833007812, + "epoch": 0.11398722074530106, + "grad_norm": 0.07337877154350281, + "kl": 0.02581787109375, + "learning_rate": 4.123291871377568e-06, + "loss": 0.0009, + "num_tokens": 71802866.0, + "reward": 0.5301136448979378, + "reward_std": 0.2052411250770092, + "rewards/code_reward": 0.38279221765697, + "rewards/format_reward": 1.4732142984867096, + "step": 3055 + }, + { + "clip_ratio": 0.002899770741350949, + "epoch": 0.11402453243785271, + "grad_norm": 0.07430095225572586, + "kl": 0.025634765625, + "learning_rate": 4.120517042225949e-06, + "loss": 0.0008, + "step": 3056 + }, + { + "clip_ratio": 0.0024142153561115265, + "epoch": 0.11406184413040436, + "grad_norm": 0.07219666242599487, + "kl": 0.025299072265625, + "learning_rate": 4.117742791879755e-06, + "loss": 0.0006, + "step": 3057 + }, + { + "clip_ratio": 0.003446823393460363, + "completion_length": 593.1786041259766, + "epoch": 0.11409915582295602, + "grad_norm": 0.08581529557704926, + "kl": 0.036376953125, + "learning_rate": 4.114969121503008e-06, + "loss": 0.0019, + "num_tokens": 71868704.0, + "reward": 0.5779558643698692, + "reward_std": 0.25972137600183487, + "rewards/code_reward": 0.42795586958527565, + "rewards/format_reward": 1.5, + "step": 3058 + }, + { + "clip_ratio": 0.0033673688303679228, + "epoch": 0.11413646751550767, + "grad_norm": 0.08499018847942352, + "kl": 0.03668212890625, + "learning_rate": 4.112196032259489e-06, + "loss": 0.0018, + "step": 3059 + }, + { + "clip_ratio": 0.0033152876421809196, + "epoch": 0.11417377920805932, + "grad_norm": 0.07976987212896347, + "kl": 0.035736083984375, + "learning_rate": 4.109423525312738e-06, + "loss": 0.0016, + "step": 3060 + }, + { + "clip_ratio": 0.0026780300540849566, + "completion_length": 742.6964721679688, + "epoch": 0.11421109090061098, + "grad_norm": 0.07603408396244049, + "kl": 0.019561767578125, + "learning_rate": 4.106651601826043e-06, + "loss": 0.0216, + "num_tokens": 71940829.0, + "reward": 0.6323196664452553, + "reward_std": 0.2174740768969059, + "rewards/code_reward": 0.48231966234743595, + "rewards/format_reward": 1.5, + "step": 3061 + }, + { + "clip_ratio": 0.0026795920566655695, + "epoch": 0.11424840259316263, + "grad_norm": 0.07278896868228912, + "kl": 0.0198974609375, + "learning_rate": 4.103880262962455e-06, + "loss": 0.0216, + "step": 3062 + }, + { + "clip_ratio": 0.0027679649647325277, + "epoch": 0.11428571428571428, + "grad_norm": 0.06827753037214279, + "kl": 0.01995849609375, + "learning_rate": 4.101109509884774e-06, + "loss": 0.0214, + "step": 3063 + }, + { + "clip_ratio": 0.0023933431366458535, + "completion_length": 548.0536041259766, + "epoch": 0.11432302597826594, + "grad_norm": 0.06705852597951889, + "kl": 0.018341064453125, + "learning_rate": 4.098339343755557e-06, + "loss": 0.0057, + "num_tokens": 72003678.0, + "reward": 0.6843406595289707, + "reward_std": 0.06665830314159393, + "rewards/code_reward": 0.5343406572937965, + "rewards/format_reward": 1.5, + "step": 3064 + }, + { + "clip_ratio": 0.0028647974831983447, + "epoch": 0.11436033767081759, + "grad_norm": 0.046338651329278946, + "kl": 0.018829345703125, + "learning_rate": 4.095569765737117e-06, + "loss": 0.0056, + "step": 3065 + }, + { + "clip_ratio": 0.002772614825516939, + "epoch": 0.11439764936336924, + "grad_norm": 0.04562043771147728, + "kl": 0.018951416015625, + "learning_rate": 4.0928007769915125e-06, + "loss": 0.0056, + "step": 3066 + }, + { + "clip_ratio": 0.005359673872590065, + "completion_length": 735.8214569091797, + "epoch": 0.1144349610559209, + "grad_norm": 0.062233906239271164, + "kl": 0.024993896484375, + "learning_rate": 4.090032378680565e-06, + "loss": 0.0082, + "num_tokens": 72078288.0, + "reward": 0.36064494401216507, + "reward_std": 0.19071801006793976, + "rewards/code_reward": 0.2106449455022812, + "rewards/format_reward": 1.5, + "step": 3067 + }, + { + "clip_ratio": 0.004875645041465759, + "epoch": 0.11447227274847255, + "grad_norm": 0.0599636435508728, + "kl": 0.02496337890625, + "learning_rate": 4.087264571965839e-06, + "loss": 0.008, + "step": 3068 + }, + { + "clip_ratio": 0.004769790917634964, + "epoch": 0.1145095844410242, + "grad_norm": 0.05860403925180435, + "kl": 0.025299072265625, + "learning_rate": 4.084497358008655e-06, + "loss": 0.0079, + "step": 3069 + }, + { + "clip_ratio": 0.0026052811881527305, + "completion_length": 554.0893173217773, + "epoch": 0.11454689613357585, + "grad_norm": 0.07320836186408997, + "kl": 0.02569580078125, + "learning_rate": 4.081730737970088e-06, + "loss": -0.0096, + "num_tokens": 72133607.0, + "reward": 0.8877320885658264, + "reward_std": 0.24338745698332787, + "rewards/code_reward": 0.7377321124076843, + "rewards/format_reward": 1.5, + "step": 3070 + }, + { + "clip_ratio": 0.002597702026832849, + "epoch": 0.11458420782612751, + "grad_norm": 0.06974682956933975, + "kl": 0.02593994140625, + "learning_rate": 4.078964713010955e-06, + "loss": -0.0097, + "step": 3071 + }, + { + "clip_ratio": 0.002238288114313036, + "epoch": 0.11462151951867916, + "grad_norm": 0.06836219877004623, + "kl": 0.02642822265625, + "learning_rate": 4.0761992842918354e-06, + "loss": -0.01, + "step": 3072 + }, + { + "clip_ratio": 0.0031879227608442307, + "completion_length": 581.5178909301758, + "epoch": 0.11465883121123081, + "grad_norm": 0.07696779072284698, + "kl": 0.021392822265625, + "learning_rate": 4.073434452973046e-06, + "loss": 0.0053, + "num_tokens": 72192142.0, + "reward": 0.7246909402310848, + "reward_std": 0.14178798347711563, + "rewards/code_reward": 0.57469093054533, + "rewards/format_reward": 1.5, + "step": 3073 + }, + { + "clip_ratio": 0.0029702525353059173, + "epoch": 0.11469614290378247, + "grad_norm": 0.07201526314020157, + "kl": 0.021331787109375, + "learning_rate": 4.070670220214662e-06, + "loss": 0.0051, + "step": 3074 + }, + { + "clip_ratio": 0.002913308038841933, + "epoch": 0.11473345459633412, + "grad_norm": 0.06800186634063721, + "kl": 0.02117919921875, + "learning_rate": 4.067906587176505e-06, + "loss": 0.0049, + "step": 3075 + }, + { + "clip_ratio": 0.0015932642854750156, + "completion_length": 597.9285888671875, + "epoch": 0.11477076628888577, + "grad_norm": 0.047900181263685226, + "kl": 0.0298614501953125, + "learning_rate": 4.065143555018138e-06, + "loss": 0.0045, + "num_tokens": 72258638.0, + "reward": 0.9175035208463669, + "reward_std": 0.03479549288749695, + "rewards/code_reward": 0.767503535374999, + "rewards/format_reward": 1.5, + "step": 3076 + }, + { + "clip_ratio": 0.001844295533373952, + "epoch": 0.11480807798143744, + "grad_norm": 0.0549008771777153, + "kl": 0.0296630859375, + "learning_rate": 4.062381124898884e-06, + "loss": 0.0046, + "step": 3077 + }, + { + "clip_ratio": 0.0017547780880704522, + "epoch": 0.11484538967398909, + "grad_norm": 0.04575654864311218, + "kl": 0.0268402099609375, + "learning_rate": 4.059619297977805e-06, + "loss": 0.0045, + "step": 3078 + }, + { + "clip_ratio": 0.0035365551593713462, + "completion_length": 673.6607360839844, + "epoch": 0.11488270136654075, + "grad_norm": 0.15675508975982666, + "kl": 0.0198516845703125, + "learning_rate": 4.056858075413712e-06, + "loss": 0.0098, + "num_tokens": 72330971.0, + "reward": 0.6450164802372456, + "reward_std": 0.0870803608559072, + "rewards/code_reward": 0.4950164703768678, + "rewards/format_reward": 1.5, + "step": 3079 + }, + { + "clip_ratio": 0.0035369893303141, + "epoch": 0.1149200130590924, + "grad_norm": 0.07927655428647995, + "kl": 0.0196533203125, + "learning_rate": 4.054097458365166e-06, + "loss": 0.0099, + "step": 3080 + }, + { + "clip_ratio": 0.0028820562874898314, + "epoch": 0.11495732475164405, + "grad_norm": 0.07247942686080933, + "kl": 0.019775390625, + "learning_rate": 4.051337447990466e-06, + "loss": 0.0092, + "step": 3081 + }, + { + "clip_ratio": 0.003241895465180278, + "completion_length": 774.4464797973633, + "epoch": 0.1149946364441957, + "grad_norm": 0.07892172038555145, + "kl": 0.0178375244140625, + "learning_rate": 4.048578045447662e-06, + "loss": 0.0037, + "num_tokens": 72405994.0, + "reward": 0.8014452904462814, + "reward_std": 0.2584223039448261, + "rewards/code_reward": 0.6514453217387199, + "rewards/format_reward": 1.5, + "step": 3082 + }, + { + "clip_ratio": 0.003381099901162088, + "epoch": 0.11503194813674736, + "grad_norm": 0.08620434254407883, + "kl": 0.017242431640625, + "learning_rate": 4.0458192518945514e-06, + "loss": 0.0036, + "step": 3083 + }, + { + "clip_ratio": 0.0027119721635244787, + "epoch": 0.11506925982929901, + "grad_norm": 0.07394978404045105, + "kl": 0.017242431640625, + "learning_rate": 4.0430610684886695e-06, + "loss": 0.0033, + "step": 3084 + }, + { + "clip_ratio": 0.0035133027122355998, + "completion_length": 638.9107360839844, + "epoch": 0.11510657152185066, + "grad_norm": 0.0699625238776207, + "kl": 0.0184173583984375, + "learning_rate": 4.040303496387301e-06, + "loss": -0.0091, + "num_tokens": 72465027.0, + "reward": 0.5596455335617065, + "reward_std": 0.14689847454428673, + "rewards/code_reward": 0.4096455327235162, + "rewards/format_reward": 1.5, + "step": 3085 + }, + { + "clip_ratio": 0.0035923649556934834, + "epoch": 0.11514388321440232, + "grad_norm": 0.068097785115242, + "kl": 0.0184326171875, + "learning_rate": 4.037546536747469e-06, + "loss": -0.0092, + "step": 3086 + }, + { + "clip_ratio": 0.003553078742697835, + "epoch": 0.11518119490695397, + "grad_norm": 0.06774682551622391, + "kl": 0.018646240234375, + "learning_rate": 4.034790190725944e-06, + "loss": -0.0093, + "step": 3087 + }, + { + "clip_ratio": 0.004287823103368282, + "completion_length": 843.6428833007812, + "epoch": 0.11521850659950562, + "grad_norm": 0.07276341319084167, + "kl": 0.018890380859375, + "learning_rate": 4.032034459479238e-06, + "loss": -0.0021, + "num_tokens": 72555349.0, + "reward": 0.6226444132626057, + "reward_std": 0.08299578516744077, + "rewards/code_reward": 0.4726443514227867, + "rewards/format_reward": 1.5, + "step": 3088 + }, + { + "clip_ratio": 0.004362944921012968, + "epoch": 0.11525581829205728, + "grad_norm": 0.07131991535425186, + "kl": 0.019073486328125, + "learning_rate": 4.029279344163605e-06, + "loss": -0.002, + "step": 3089 + }, + { + "clip_ratio": 0.0036331198643893003, + "epoch": 0.11529312998460893, + "grad_norm": 0.07262702286243439, + "kl": 0.019256591796875, + "learning_rate": 4.026524845935041e-06, + "loss": -0.0023, + "step": 3090 + }, + { + "clip_ratio": 0.001961186178959906, + "completion_length": 640.1428833007812, + "epoch": 0.11533044167716058, + "grad_norm": 0.07209241390228271, + "kl": 0.01641845703125, + "learning_rate": 4.02377096594928e-06, + "loss": 0.0058, + "num_tokens": 72619001.0, + "reward": 1.0833868384361267, + "reward_std": 0.13690322265028954, + "rewards/code_reward": 0.9333868324756622, + "rewards/format_reward": 1.5, + "step": 3091 + }, + { + "clip_ratio": 0.0013779195433016866, + "epoch": 0.11536775336971224, + "grad_norm": 0.06914730370044708, + "kl": 0.01654052734375, + "learning_rate": 4.0210177053617986e-06, + "loss": 0.0055, + "step": 3092 + }, + { + "clip_ratio": 0.0017436977941542864, + "epoch": 0.11540506506226389, + "grad_norm": 0.06654515117406845, + "kl": 0.0169830322265625, + "learning_rate": 4.018265065327818e-06, + "loss": 0.0055, + "step": 3093 + }, + { + "clip_ratio": 0.003048429382033646, + "completion_length": 679.3036041259766, + "epoch": 0.11544237675481554, + "grad_norm": 0.0662471130490303, + "kl": 0.02215576171875, + "learning_rate": 4.015513047002288e-06, + "loss": 0.0093, + "num_tokens": 72693638.0, + "reward": 0.8584369868040085, + "reward_std": 0.04175075946841389, + "rewards/code_reward": 0.7084369704243727, + "rewards/format_reward": 1.5, + "step": 3094 + }, + { + "clip_ratio": 0.0027887807809747756, + "epoch": 0.1154796884473672, + "grad_norm": 0.06384135782718658, + "kl": 0.02215576171875, + "learning_rate": 4.012761651539911e-06, + "loss": 0.0092, + "step": 3095 + }, + { + "clip_ratio": 0.0022713723592460155, + "epoch": 0.11551700013991885, + "grad_norm": 0.06349638849496841, + "kl": 0.021636962890625, + "learning_rate": 4.010010880095119e-06, + "loss": 0.0089, + "step": 3096 + }, + { + "clip_ratio": 0.004485226818360388, + "completion_length": 708.107177734375, + "epoch": 0.1155543118324705, + "grad_norm": 0.09302335977554321, + "kl": 0.024505615234375, + "learning_rate": 4.007260733822081e-06, + "loss": 0.0138, + "num_tokens": 72770028.0, + "reward": 0.45450514554977417, + "reward_std": 0.28768477961421013, + "rewards/code_reward": 0.30450513772666454, + "rewards/format_reward": 1.5, + "step": 3097 + }, + { + "clip_ratio": 0.004355655226390809, + "epoch": 0.11559162352502216, + "grad_norm": 0.08866193890571594, + "kl": 0.0244140625, + "learning_rate": 4.004511213874715e-06, + "loss": 0.0134, + "step": 3098 + }, + { + "clip_ratio": 0.0039815971395000815, + "epoch": 0.11562893521757381, + "grad_norm": 0.08813921362161636, + "kl": 0.024078369140625, + "learning_rate": 4.001762321406663e-06, + "loss": 0.0132, + "step": 3099 + }, + { + "clip_ratio": 0.004271191137377173, + "completion_length": 683.982177734375, + "epoch": 0.11566624691012546, + "grad_norm": 0.08123603463172913, + "kl": 0.022003173828125, + "learning_rate": 3.999014057571309e-06, + "loss": -0.0183, + "num_tokens": 72844815.0, + "reward": 0.5465111248195171, + "reward_std": 0.19092470407485962, + "rewards/code_reward": 0.3965110722929239, + "rewards/format_reward": 1.5, + "step": 3100 + }, + { + "clip_ratio": 0.004382693208754063, + "epoch": 0.11570355860267711, + "grad_norm": 0.0760425478219986, + "kl": 0.02166748046875, + "learning_rate": 3.996266423521774e-06, + "loss": -0.0184, + "step": 3101 + }, + { + "clip_ratio": 0.0039124725153669715, + "epoch": 0.11574087029522877, + "grad_norm": 0.08194892853498459, + "kl": 0.021697998046875, + "learning_rate": 3.993519420410915e-06, + "loss": -0.0187, + "step": 3102 + }, + { + "clip_ratio": 0.004498971393331885, + "completion_length": 779.5893249511719, + "epoch": 0.11577818198778042, + "grad_norm": 0.0336659774184227, + "kl": 0.0162811279296875, + "learning_rate": 3.990773049391327e-06, + "loss": 0.0014, + "num_tokens": 72918026.0, + "reward": 0.17335166782140732, + "reward_std": 0.03636370226740837, + "rewards/code_reward": 0.023351648822426796, + "rewards/format_reward": 1.5, + "step": 3103 + }, + { + "clip_ratio": 0.004272716119885445, + "epoch": 0.11581549368033207, + "grad_norm": 0.03529425337910652, + "kl": 0.0162353515625, + "learning_rate": 3.988027311615329e-06, + "loss": 0.0014, + "step": 3104 + }, + { + "clip_ratio": 0.004347110108938068, + "epoch": 0.11585280537288373, + "grad_norm": 0.03600297123193741, + "kl": 0.015960693359375, + "learning_rate": 3.985282208234988e-06, + "loss": 0.0012, + "step": 3105 + }, + { + "clip_ratio": 0.004262383037712425, + "completion_length": 703.9285888671875, + "epoch": 0.11589011706543538, + "grad_norm": 0.08675108104944229, + "kl": 0.024627685546875, + "learning_rate": 3.982537740402093e-06, + "loss": 0.0065, + "num_tokens": 72986286.0, + "reward": 0.8016390204429626, + "reward_std": 0.14406917616724968, + "rewards/code_reward": 0.6516389735043049, + "rewards/format_reward": 1.5, + "step": 3106 + }, + { + "clip_ratio": 0.004166271770372987, + "epoch": 0.11592742875798703, + "grad_norm": 0.08556212484836578, + "kl": 0.02447509765625, + "learning_rate": 3.979793909268176e-06, + "loss": 0.0063, + "step": 3107 + }, + { + "clip_ratio": 0.003950608428567648, + "epoch": 0.11596474045053869, + "grad_norm": 0.08247672766447067, + "kl": 0.02398681640625, + "learning_rate": 3.977050715984495e-06, + "loss": 0.006, + "step": 3108 + }, + { + "clip_ratio": 0.004049533512443304, + "completion_length": 759.7857513427734, + "epoch": 0.11600205214309034, + "grad_norm": 0.16103848814964294, + "kl": 0.044403076171875, + "learning_rate": 3.974308161702044e-06, + "loss": 0.0285, + "num_tokens": 73072878.0, + "reward": 0.5401528589427471, + "reward_std": 0.2184888795018196, + "rewards/code_reward": 0.39283140003681183, + "rewards/format_reward": 1.4732142984867096, + "step": 3109 + }, + { + "clip_ratio": 0.003980644454713911, + "epoch": 0.11603936383564199, + "grad_norm": 0.08524198830127716, + "kl": 0.0316009521484375, + "learning_rate": 3.971566247571548e-06, + "loss": 0.0284, + "step": 3110 + }, + { + "clip_ratio": 0.004123181453906, + "epoch": 0.11607667552819365, + "grad_norm": 0.5277365446090698, + "kl": 0.026580810546875, + "learning_rate": 3.968824974743465e-06, + "loss": 0.0287, + "step": 3111 + }, + { + "clip_ratio": 0.0014393999590538442, + "completion_length": 592.4643020629883, + "epoch": 0.1161139872207453, + "grad_norm": 0.04474326968193054, + "kl": 0.0172119140625, + "learning_rate": 3.966084344367977e-06, + "loss": 0.0125, + "num_tokens": 73136156.0, + "reward": 1.0629766434431076, + "reward_std": 0.11804236099123955, + "rewards/code_reward": 0.9129766523838043, + "rewards/format_reward": 1.5, + "step": 3112 + }, + { + "clip_ratio": 0.0013206902658566833, + "epoch": 0.11615129891329695, + "grad_norm": 0.04254365339875221, + "kl": 0.0174560546875, + "learning_rate": 3.963344357595007e-06, + "loss": 0.0124, + "step": 3113 + }, + { + "clip_ratio": 0.0013047896209172904, + "epoch": 0.1161886106058486, + "grad_norm": 0.04392736777663231, + "kl": 0.016754150390625, + "learning_rate": 3.9606050155741985e-06, + "loss": 0.0123, + "step": 3114 + }, + { + "clip_ratio": 0.0024323131947312504, + "completion_length": 572.2857437133789, + "epoch": 0.11622592229840026, + "grad_norm": 0.05632540583610535, + "kl": 0.0208892822265625, + "learning_rate": 3.9578663194549305e-06, + "loss": 0.0046, + "num_tokens": 73189246.0, + "reward": 0.828252486884594, + "reward_std": 0.235872533172369, + "rewards/code_reward": 0.6782524548470974, + "rewards/format_reward": 1.5, + "step": 3115 + }, + { + "clip_ratio": 0.002540474059060216, + "epoch": 0.11626323399095191, + "grad_norm": 0.0527803860604763, + "kl": 0.0212860107421875, + "learning_rate": 3.955128270386311e-06, + "loss": 0.0046, + "step": 3116 + }, + { + "clip_ratio": 0.0020329186518210918, + "epoch": 0.11630054568350356, + "grad_norm": 0.05142677202820778, + "kl": 0.0212860107421875, + "learning_rate": 3.952390869517171e-06, + "loss": 0.0045, + "step": 3117 + }, + { + "clip_ratio": 0.004098658915609121, + "completion_length": 670.0714721679688, + "epoch": 0.11633785737605522, + "grad_norm": 0.07766994088888168, + "kl": 0.0206298828125, + "learning_rate": 3.949654117996074e-06, + "loss": -0.0039, + "num_tokens": 73260174.0, + "reward": 0.5565769374370575, + "reward_std": 0.3267693519592285, + "rewards/code_reward": 0.40657692588865757, + "rewards/format_reward": 1.5, + "step": 3118 + }, + { + "clip_ratio": 0.004013161233160645, + "epoch": 0.11637516906860687, + "grad_norm": 0.079647958278656, + "kl": 0.0205078125, + "learning_rate": 3.946918016971311e-06, + "loss": -0.004, + "step": 3119 + }, + { + "clip_ratio": 0.003641595016233623, + "epoch": 0.11641248076115852, + "grad_norm": 0.08069349080324173, + "kl": 0.02069091796875, + "learning_rate": 3.944182567590897e-06, + "loss": -0.0042, + "step": 3120 + }, + { + "clip_ratio": 0.004284569207811728, + "completion_length": 658.3571701049805, + "epoch": 0.11644979245371018, + "grad_norm": 0.07129634916782379, + "kl": 0.02215576171875, + "learning_rate": 3.941447771002581e-06, + "loss": 0.0009, + "num_tokens": 73321626.0, + "reward": 0.5425821617245674, + "reward_std": 0.10583068197593093, + "rewards/code_reward": 0.39258212223649025, + "rewards/format_reward": 1.5, + "step": 3121 + }, + { + "clip_ratio": 0.004576375999022275, + "epoch": 0.11648710414626183, + "grad_norm": 0.06886506080627441, + "kl": 0.022369384765625, + "learning_rate": 3.938713628353825e-06, + "loss": 0.0008, + "step": 3122 + }, + { + "clip_ratio": 0.004260918241925538, + "epoch": 0.11652441583881348, + "grad_norm": 0.06677033752202988, + "kl": 0.022674560546875, + "learning_rate": 3.935980140791833e-06, + "loss": 0.0005, + "step": 3123 + }, + { + "clip_ratio": 0.00511650979751721, + "completion_length": 617.9821701049805, + "epoch": 0.11656172753136514, + "grad_norm": 0.0917869508266449, + "kl": 0.02911376953125, + "learning_rate": 3.933247309463518e-06, + "loss": 0.0099, + "num_tokens": 73387313.0, + "reward": 0.4023868329823017, + "reward_std": 0.21424176171422005, + "rewards/code_reward": 0.25238684192299843, + "rewards/format_reward": 1.5, + "step": 3124 + }, + { + "clip_ratio": 0.0045646374928765, + "epoch": 0.11659903922391679, + "grad_norm": 0.08557558804750443, + "kl": 0.029449462890625, + "learning_rate": 3.93051513551553e-06, + "loss": 0.0097, + "step": 3125 + }, + { + "clip_ratio": 0.004710663168225437, + "epoch": 0.11663635091646844, + "grad_norm": 0.0763154998421669, + "kl": 0.028228759765625, + "learning_rate": 3.927783620094238e-06, + "loss": 0.0095, + "step": 3126 + }, + { + "clip_ratio": 0.002660051337443292, + "completion_length": 646.7678833007812, + "epoch": 0.1166736626090201, + "grad_norm": 0.07518567144870758, + "kl": 0.020751953125, + "learning_rate": 3.925052764345733e-06, + "loss": 0.0036, + "num_tokens": 73454374.0, + "reward": 0.9302621930837631, + "reward_std": 0.1473453901708126, + "rewards/code_reward": 0.7802621722221375, + "rewards/format_reward": 1.5, + "step": 3127 + }, + { + "clip_ratio": 0.0022066174424253404, + "epoch": 0.11671097430157175, + "grad_norm": 0.07511213421821594, + "kl": 0.02001953125, + "learning_rate": 3.922322569415834e-06, + "loss": 0.0034, + "step": 3128 + }, + { + "clip_ratio": 0.0023551505291834474, + "epoch": 0.1167482859941234, + "grad_norm": 0.07292918115854263, + "kl": 0.020660400390625, + "learning_rate": 3.9195930364500755e-06, + "loss": 0.0033, + "step": 3129 + }, + { + "clip_ratio": 0.004818943212740123, + "completion_length": 667.0000152587891, + "epoch": 0.11678559768667506, + "grad_norm": 0.09753982722759247, + "kl": 0.02081298828125, + "learning_rate": 3.916864166593724e-06, + "loss": -0.0008, + "num_tokens": 73519474.0, + "reward": 0.7502859607338905, + "reward_std": 0.21077083982527256, + "rewards/code_reward": 0.600285965949297, + "rewards/format_reward": 1.5, + "step": 3130 + }, + { + "clip_ratio": 0.004404923529364169, + "epoch": 0.11682290937922672, + "grad_norm": 0.08686208724975586, + "kl": 0.0209503173828125, + "learning_rate": 3.914135960991761e-06, + "loss": -0.0009, + "step": 3131 + }, + { + "clip_ratio": 0.004518921137787402, + "epoch": 0.11686022107177838, + "grad_norm": 0.0872860699892044, + "kl": 0.0218048095703125, + "learning_rate": 3.911408420788888e-06, + "loss": -0.0012, + "step": 3132 + }, + { + "clip_ratio": 0.003520070225931704, + "completion_length": 664.9464569091797, + "epoch": 0.11689753276433003, + "grad_norm": 0.08086175471544266, + "kl": 0.022857666015625, + "learning_rate": 3.908681547129537e-06, + "loss": 0.0076, + "num_tokens": 73598935.0, + "reward": 0.7428002879023552, + "reward_std": 0.16912661120295525, + "rewards/code_reward": 0.5928002819418907, + "rewards/format_reward": 1.5, + "step": 3133 + }, + { + "clip_ratio": 0.004127569613046944, + "epoch": 0.11693484445688168, + "grad_norm": 0.07718481868505478, + "kl": 0.022552490234375, + "learning_rate": 3.9059553411578455e-06, + "loss": 0.0075, + "step": 3134 + }, + { + "clip_ratio": 0.0036211302503943443, + "epoch": 0.11697215614943333, + "grad_norm": 0.07484066486358643, + "kl": 0.02239990234375, + "learning_rate": 3.903229804017683e-06, + "loss": 0.0072, + "step": 3135 + }, + { + "clip_ratio": 0.0015760365640744567, + "completion_length": 503.46431732177734, + "epoch": 0.11700946784198499, + "grad_norm": 0.04872216284275055, + "kl": 0.0238037109375, + "learning_rate": 3.900504936852635e-06, + "loss": 0.0027, + "num_tokens": 73651251.0, + "reward": 0.8107142597436905, + "reward_std": 0.05860090255737305, + "rewards/code_reward": 0.6607142873108387, + "rewards/format_reward": 1.5, + "step": 3136 + }, + { + "clip_ratio": 0.0013038375182077289, + "epoch": 0.11704677953453664, + "grad_norm": 0.04975006356835365, + "kl": 0.0238494873046875, + "learning_rate": 3.897780740806002e-06, + "loss": 0.0027, + "step": 3137 + }, + { + "clip_ratio": 0.0012936923885717988, + "epoch": 0.1170840912270883, + "grad_norm": 0.04598410055041313, + "kl": 0.0244598388671875, + "learning_rate": 3.895057217020809e-06, + "loss": 0.0024, + "step": 3138 + }, + { + "clip_ratio": 0.0036260567139834166, + "completion_length": 752.3928985595703, + "epoch": 0.11712140291963995, + "grad_norm": 0.07815928757190704, + "kl": 0.02313232421875, + "learning_rate": 3.8923343666397965e-06, + "loss": 0.0047, + "num_tokens": 73728091.0, + "reward": 0.5746649131178856, + "reward_std": 0.20857674349099398, + "rewards/code_reward": 0.4246648964472115, + "rewards/format_reward": 1.5, + "step": 3139 + }, + { + "clip_ratio": 0.0038656608667224646, + "epoch": 0.1171587146121916, + "grad_norm": 0.07799101620912552, + "kl": 0.022796630859375, + "learning_rate": 3.889612190805419e-06, + "loss": 0.0048, + "step": 3140 + }, + { + "clip_ratio": 0.0033889905898831785, + "epoch": 0.11719602630474325, + "grad_norm": 0.07826808840036392, + "kl": 0.022705078125, + "learning_rate": 3.886890690659854e-06, + "loss": 0.0043, + "step": 3141 + }, + { + "clip_ratio": 0.004006878240033984, + "completion_length": 649.6250228881836, + "epoch": 0.1172333379972949, + "grad_norm": 0.07376465201377869, + "kl": 0.021759033203125, + "learning_rate": 3.884169867344988e-06, + "loss": -0.002, + "num_tokens": 73793184.0, + "reward": 0.7665006630122662, + "reward_std": 0.11075379326939583, + "rewards/code_reward": 0.6165006458759308, + "rewards/format_reward": 1.5, + "step": 3142 + }, + { + "clip_ratio": 0.003065898607019335, + "epoch": 0.11727064968984656, + "grad_norm": 0.07570933550596237, + "kl": 0.019317626953125, + "learning_rate": 3.881449722002431e-06, + "loss": -0.0022, + "step": 3143 + }, + { + "clip_ratio": 0.0033084986498579383, + "epoch": 0.11730796138239821, + "grad_norm": 0.0668923631310463, + "kl": 0.01971435546875, + "learning_rate": 3.878730255773507e-06, + "loss": -0.0025, + "step": 3144 + }, + { + "clip_ratio": 0.003673262894153595, + "completion_length": 677.3928833007812, + "epoch": 0.11734527307494987, + "grad_norm": 0.05958327278494835, + "kl": 0.0191497802734375, + "learning_rate": 3.87601146979925e-06, + "loss": -0.0046, + "num_tokens": 73856556.0, + "reward": 0.2715539075434208, + "reward_std": 0.19332346692681313, + "rewards/code_reward": 0.12155388854444027, + "rewards/format_reward": 1.5, + "step": 3145 + }, + { + "clip_ratio": 0.003445886541157961, + "epoch": 0.11738258476750152, + "grad_norm": 0.05819453299045563, + "kl": 0.018585205078125, + "learning_rate": 3.873293365220416e-06, + "loss": -0.0049, + "step": 3146 + }, + { + "clip_ratio": 0.003009026753716171, + "epoch": 0.11741989646005317, + "grad_norm": 0.05515163391828537, + "kl": 0.01837158203125, + "learning_rate": 3.870575943177467e-06, + "loss": -0.0049, + "step": 3147 + }, + { + "clip_ratio": 0.0018761554965749383, + "completion_length": 637.0178833007812, + "epoch": 0.11745720815260483, + "grad_norm": 0.07975072413682938, + "kl": 0.027679443359375, + "learning_rate": 3.867859204810586e-06, + "loss": 0.0055, + "num_tokens": 73918577.0, + "reward": 0.7685942724347115, + "reward_std": 0.14607293158769608, + "rewards/code_reward": 0.6185942869633436, + "rewards/format_reward": 1.5, + "step": 3148 + }, + { + "clip_ratio": 0.0021206961246207356, + "epoch": 0.11749451984515648, + "grad_norm": 0.07499407976865768, + "kl": 0.027191162109375, + "learning_rate": 3.865143151259664e-06, + "loss": 0.0054, + "step": 3149 + }, + { + "clip_ratio": 0.002004773123189807, + "epoch": 0.11753183153770813, + "grad_norm": 0.06723622977733612, + "kl": 0.02764892578125, + "learning_rate": 3.862427783664306e-06, + "loss": 0.0053, + "step": 3150 + }, + { + "clip_ratio": 0.003956971631851047, + "completion_length": 737.0893096923828, + "epoch": 0.11756914323025978, + "grad_norm": 0.07505204528570175, + "kl": 0.02154541015625, + "learning_rate": 3.859713103163834e-06, + "loss": -0.0063, + "num_tokens": 73993104.0, + "reward": 0.4098086729645729, + "reward_std": 0.05503605003468692, + "rewards/code_reward": 0.2598086494981544, + "rewards/format_reward": 1.5, + "step": 3151 + }, + { + "clip_ratio": 0.004617932194378227, + "epoch": 0.11760645492281144, + "grad_norm": 0.08557739108800888, + "kl": 0.0213165283203125, + "learning_rate": 3.856999110897273e-06, + "loss": -0.0063, + "step": 3152 + }, + { + "clip_ratio": 0.004432236193679273, + "epoch": 0.11764376661536309, + "grad_norm": 0.06232628971338272, + "kl": 0.0217437744140625, + "learning_rate": 3.854285808003365e-06, + "loss": -0.0066, + "step": 3153 + }, + { + "clip_ratio": 0.003217117046006024, + "completion_length": 673.1428909301758, + "epoch": 0.11768107830791474, + "grad_norm": 0.06902199238538742, + "kl": 0.0186309814453125, + "learning_rate": 3.8515731956205626e-06, + "loss": -0.0039, + "num_tokens": 74064442.0, + "reward": 0.6380769312381744, + "reward_std": 0.15834076888859272, + "rewards/code_reward": 0.4880769243463874, + "rewards/format_reward": 1.5, + "step": 3154 + }, + { + "clip_ratio": 0.0033148761722259223, + "epoch": 0.1177183900004664, + "grad_norm": 0.07173483818769455, + "kl": 0.01873779296875, + "learning_rate": 3.848861274887026e-06, + "loss": -0.0039, + "step": 3155 + }, + { + "clip_ratio": 0.00299039640231058, + "epoch": 0.11775570169301805, + "grad_norm": 0.06658875197172165, + "kl": 0.01861572265625, + "learning_rate": 3.84615004694063e-06, + "loss": -0.004, + "step": 3156 + }, + { + "clip_ratio": 0.003946274926420301, + "completion_length": 842.8214569091797, + "epoch": 0.1177930133855697, + "grad_norm": 0.08234892040491104, + "kl": 0.020233154296875, + "learning_rate": 3.843439512918949e-06, + "loss": 0.0022, + "num_tokens": 74143476.0, + "reward": 0.4408017434179783, + "reward_std": 0.24007617309689522, + "rewards/code_reward": 0.2908017234876752, + "rewards/format_reward": 1.5, + "step": 3157 + }, + { + "clip_ratio": 0.003569665481336415, + "epoch": 0.11783032507812136, + "grad_norm": 0.09544827044010162, + "kl": 0.020782470703125, + "learning_rate": 3.840729673959279e-06, + "loss": 0.0021, + "step": 3158 + }, + { + "clip_ratio": 0.003715096798259765, + "epoch": 0.11786763677067301, + "grad_norm": 0.07640023529529572, + "kl": 0.020111083984375, + "learning_rate": 3.838020531198618e-06, + "loss": 0.0018, + "step": 3159 + }, + { + "clip_ratio": 0.002355264557991177, + "completion_length": 491.53572845458984, + "epoch": 0.11790494846322466, + "grad_norm": 0.0681726261973381, + "kl": 0.018218994140625, + "learning_rate": 3.835312085773667e-06, + "loss": 0.0003, + "num_tokens": 74200318.0, + "reward": 0.6774045191705227, + "reward_std": 0.13260414451360703, + "rewards/code_reward": 0.5274045336991549, + "rewards/format_reward": 1.5, + "step": 3160 + }, + { + "clip_ratio": 0.0021043798187747598, + "epoch": 0.11794226015577632, + "grad_norm": 0.07246432453393936, + "kl": 0.018524169921875, + "learning_rate": 3.832604338820843e-06, + "loss": 0.0, + "step": 3161 + }, + { + "clip_ratio": 0.0019592674798332155, + "epoch": 0.11797957184832797, + "grad_norm": 0.06524486839771271, + "kl": 0.01885986328125, + "learning_rate": 3.8298972914762654e-06, + "loss": -0.0001, + "step": 3162 + }, + { + "clip_ratio": 0.004983360297046602, + "completion_length": 907.428596496582, + "epoch": 0.11801688354087962, + "grad_norm": 0.06335091590881348, + "kl": 0.01983642578125, + "learning_rate": 3.8271909448757615e-06, + "loss": 0.0132, + "num_tokens": 74283166.0, + "reward": 0.2849184051156044, + "reward_std": 0.18785777501761913, + "rewards/code_reward": 0.1349183921702206, + "rewards/format_reward": 1.5, + "step": 3163 + }, + { + "clip_ratio": 0.004787419515196234, + "epoch": 0.11805419523343127, + "grad_norm": 0.06151396781206131, + "kl": 0.020172119140625, + "learning_rate": 3.824485300154866e-06, + "loss": 0.013, + "step": 3164 + }, + { + "clip_ratio": 0.004084488609805703, + "epoch": 0.11809150692598293, + "grad_norm": 0.06608529388904572, + "kl": 0.02032470703125, + "learning_rate": 3.821780358448811e-06, + "loss": 0.0129, + "step": 3165 + }, + { + "clip_ratio": 0.004509448248427361, + "completion_length": 904.357177734375, + "epoch": 0.11812881861853458, + "grad_norm": 0.06404298543930054, + "kl": 0.021881103515625, + "learning_rate": 3.819076120892545e-06, + "loss": 0.0186, + "num_tokens": 74371778.0, + "reward": 0.4875524081289768, + "reward_std": 0.118827267549932, + "rewards/code_reward": 0.33755238726735115, + "rewards/format_reward": 1.5, + "step": 3166 + }, + { + "clip_ratio": 0.004349616996478289, + "epoch": 0.11816613031108623, + "grad_norm": 0.06417249143123627, + "kl": 0.021514892578125, + "learning_rate": 3.816372588620715e-06, + "loss": 0.0183, + "step": 3167 + }, + { + "clip_ratio": 0.00429290346801281, + "epoch": 0.11820344200363789, + "grad_norm": 0.06272221356630325, + "kl": 0.021331787109375, + "learning_rate": 3.813669762767672e-06, + "loss": 0.0184, + "step": 3168 + }, + { + "clip_ratio": 0.004206355835776776, + "completion_length": 836.3928680419922, + "epoch": 0.11824075369618954, + "grad_norm": 0.07112158834934235, + "kl": 0.0423583984375, + "learning_rate": 3.8109676444674736e-06, + "loss": 0.0685, + "num_tokens": 74454596.0, + "reward": 0.5562239848077297, + "reward_std": 0.13946610444691032, + "rewards/code_reward": 0.4115811352385208, + "rewards/format_reward": 1.4464285671710968, + "step": 3169 + }, + { + "clip_ratio": 0.003994468075688928, + "epoch": 0.1182780653887412, + "grad_norm": 0.06652302294969559, + "kl": 0.041717529296875, + "learning_rate": 3.8082662348538746e-06, + "loss": 0.0683, + "step": 3170 + }, + { + "clip_ratio": 0.0036025020526722074, + "epoch": 0.11831537708129285, + "grad_norm": 0.06565985828638077, + "kl": 0.039306640625, + "learning_rate": 3.8055655350603393e-06, + "loss": 0.0681, + "step": 3171 + }, + { + "clip_ratio": 0.002754130575340241, + "completion_length": 746.0178833007812, + "epoch": 0.1183526887738445, + "grad_norm": 0.08800854533910751, + "kl": 0.02655029296875, + "learning_rate": 3.8028655462200314e-06, + "loss": 0.0041, + "num_tokens": 74542489.0, + "reward": 0.631341889500618, + "reward_std": 0.228589229285717, + "rewards/code_reward": 0.4813418984413147, + "rewards/format_reward": 1.5, + "step": 3172 + }, + { + "clip_ratio": 0.0035296493442729115, + "epoch": 0.11839000046639615, + "grad_norm": 0.0844016820192337, + "kl": 0.02691650390625, + "learning_rate": 3.8001662694658135e-06, + "loss": 0.0042, + "step": 3173 + }, + { + "clip_ratio": 0.0036625126958824694, + "epoch": 0.1184273121589478, + "grad_norm": 0.06932277232408524, + "kl": 0.027191162109375, + "learning_rate": 3.7974677059302545e-06, + "loss": 0.0042, + "step": 3174 + }, + { + "clip_ratio": 0.003626051067840308, + "completion_length": 644.9464569091797, + "epoch": 0.11846462385149946, + "grad_norm": 0.08629102259874344, + "kl": 0.0196380615234375, + "learning_rate": 3.7947698567456202e-06, + "loss": 0.0164, + "num_tokens": 74619038.0, + "reward": 0.8075092099606991, + "reward_std": 0.18941552378237247, + "rewards/code_reward": 0.6575091481208801, + "rewards/format_reward": 1.5, + "step": 3175 + }, + { + "clip_ratio": 0.003257309435866773, + "epoch": 0.11850193554405111, + "grad_norm": 0.08270081132650375, + "kl": 0.01934814453125, + "learning_rate": 3.792072723043878e-06, + "loss": 0.0163, + "step": 3176 + }, + { + "clip_ratio": 0.00298681078129448, + "epoch": 0.11853924723660277, + "grad_norm": 0.08334255963563919, + "kl": 0.019134521484375, + "learning_rate": 3.789376305956698e-06, + "loss": 0.0162, + "step": 3177 + }, + { + "clip_ratio": 0.002545383875258267, + "completion_length": 589.428596496582, + "epoch": 0.11857655892915442, + "grad_norm": 0.05327135697007179, + "kl": 0.0205841064453125, + "learning_rate": 3.7866806066154417e-06, + "loss": -0.025, + "num_tokens": 74678660.0, + "reward": 0.6689179539680481, + "reward_std": 0.046663232147693634, + "rewards/code_reward": 0.5189179639564827, + "rewards/format_reward": 1.5, + "step": 3178 + }, + { + "clip_ratio": 0.002592555363662541, + "epoch": 0.11861387062170607, + "grad_norm": 0.058085847645998, + "kl": 0.0203399658203125, + "learning_rate": 3.7839856261511774e-06, + "loss": -0.0251, + "step": 3179 + }, + { + "clip_ratio": 0.0024124328047037125, + "epoch": 0.11865118231425772, + "grad_norm": 0.05482613667845726, + "kl": 0.0200958251953125, + "learning_rate": 3.7812913656946683e-06, + "loss": -0.025, + "step": 3180 + }, + { + "clip_ratio": 0.004426781903021038, + "completion_length": 739.482177734375, + "epoch": 0.11868849400680938, + "grad_norm": 0.06949705630540848, + "kl": 0.017974853515625, + "learning_rate": 3.7785978263763758e-06, + "loss": 0.0035, + "num_tokens": 74756709.0, + "reward": 0.36645640432834625, + "reward_std": 0.12626174977049232, + "rewards/code_reward": 0.21645640581846237, + "rewards/format_reward": 1.5, + "step": 3181 + }, + { + "clip_ratio": 0.004318740742746741, + "epoch": 0.11872580569936103, + "grad_norm": 0.10428254306316376, + "kl": 0.0184173583984375, + "learning_rate": 3.7759050093264617e-06, + "loss": 0.0033, + "step": 3182 + }, + { + "clip_ratio": 0.004133711219765246, + "epoch": 0.11876311739191268, + "grad_norm": 0.06715582311153412, + "kl": 0.018157958984375, + "learning_rate": 3.7732129156747766e-06, + "loss": 0.0033, + "step": 3183 + }, + { + "clip_ratio": 0.0022417415166273713, + "completion_length": 682.8214416503906, + "epoch": 0.11880042908446434, + "grad_norm": 0.057495009154081345, + "kl": 0.0216064453125, + "learning_rate": 3.770521546550877e-06, + "loss": 0.0069, + "num_tokens": 74825425.0, + "reward": 0.897321417927742, + "reward_std": 0.2595449239015579, + "rewards/code_reward": 0.7500000074505806, + "rewards/format_reward": 1.4732142984867096, + "step": 3184 + }, + { + "clip_ratio": 0.0021442438010126352, + "epoch": 0.11883774077701599, + "grad_norm": 0.05508112162351608, + "kl": 0.02178955078125, + "learning_rate": 3.767830903084011e-06, + "loss": 0.0068, + "step": 3185 + }, + { + "clip_ratio": 0.001606310484930873, + "epoch": 0.11887505246956766, + "grad_norm": 0.054474350064992905, + "kl": 0.0211029052734375, + "learning_rate": 3.7651409864031207e-06, + "loss": 0.0064, + "step": 3186 + }, + { + "clip_ratio": 0.002914567681727931, + "completion_length": 642.8928833007812, + "epoch": 0.11891236416211931, + "grad_norm": 0.049391139298677444, + "kl": 0.0179901123046875, + "learning_rate": 3.7624517976368493e-06, + "loss": -0.0171, + "num_tokens": 74897827.0, + "reward": 0.6940240524709225, + "reward_std": 0.1302274614572525, + "rewards/code_reward": 0.5440240353345871, + "rewards/format_reward": 1.5, + "step": 3187 + }, + { + "clip_ratio": 0.0029331016121432185, + "epoch": 0.11894967585467096, + "grad_norm": 0.04947115108370781, + "kl": 0.0177001953125, + "learning_rate": 3.7597633379135245e-06, + "loss": -0.0168, + "step": 3188 + }, + { + "clip_ratio": 0.003177687874995172, + "epoch": 0.11898698754722262, + "grad_norm": 0.0495181567966938, + "kl": 0.0179443359375, + "learning_rate": 3.757075608361179e-06, + "loss": -0.017, + "step": 3189 + }, + { + "clip_ratio": 0.004301096778362989, + "completion_length": 653.4107666015625, + "epoch": 0.11902429923977427, + "grad_norm": 0.0681157335639, + "kl": 0.016876220703125, + "learning_rate": 3.7543886101075312e-06, + "loss": 0.0018, + "num_tokens": 74961452.0, + "reward": 0.2904075048863888, + "reward_std": 0.1773951854556799, + "rewards/code_reward": 0.14040748868137598, + "rewards/format_reward": 1.5, + "step": 3190 + }, + { + "clip_ratio": 0.00415386725217104, + "epoch": 0.11906161093232592, + "grad_norm": 0.06071795895695686, + "kl": 0.0166473388671875, + "learning_rate": 3.751702344279997e-06, + "loss": 0.0014, + "step": 3191 + }, + { + "clip_ratio": 0.0037940923939459026, + "epoch": 0.11909892262487758, + "grad_norm": 0.06309258192777634, + "kl": 0.016876220703125, + "learning_rate": 3.749016812005685e-06, + "loss": 0.0013, + "step": 3192 + }, + { + "clip_ratio": 0.0027335549821145833, + "completion_length": 595.5535888671875, + "epoch": 0.11913623431742923, + "grad_norm": 0.06088685244321823, + "kl": 0.016845703125, + "learning_rate": 3.746332014411391e-06, + "loss": -0.0027, + "num_tokens": 75023629.0, + "reward": 0.8773274533450603, + "reward_std": 0.08483291789889336, + "rewards/code_reward": 0.727327436208725, + "rewards/format_reward": 1.5, + "step": 3193 + }, + { + "clip_ratio": 0.002602566732093692, + "epoch": 0.11917354600998088, + "grad_norm": 0.06352593004703522, + "kl": 0.016571044921875, + "learning_rate": 3.7436479526236114e-06, + "loss": -0.0027, + "step": 3194 + }, + { + "clip_ratio": 0.002713323978241533, + "epoch": 0.11921085770253254, + "grad_norm": 0.05922922119498253, + "kl": 0.016387939453125, + "learning_rate": 3.740964627768524e-06, + "loss": -0.0028, + "step": 3195 + }, + { + "clip_ratio": 0.002964951447211206, + "completion_length": 688.2143249511719, + "epoch": 0.11924816939508419, + "grad_norm": 0.05179457738995552, + "kl": 0.0240478515625, + "learning_rate": 3.738282040972002e-06, + "loss": -0.0125, + "num_tokens": 75087593.0, + "reward": 0.5966521762311459, + "reward_std": 0.1447286196053028, + "rewards/code_reward": 0.4466521665453911, + "rewards/format_reward": 1.5, + "step": 3196 + }, + { + "clip_ratio": 0.002713327470701188, + "epoch": 0.11928548108763584, + "grad_norm": 0.04943661764264107, + "kl": 0.02459716796875, + "learning_rate": 3.735600193359613e-06, + "loss": -0.0128, + "step": 3197 + }, + { + "clip_ratio": 0.003202046558726579, + "epoch": 0.1193227927801875, + "grad_norm": 0.04942714795470238, + "kl": 0.02459716796875, + "learning_rate": 3.7329190860566066e-06, + "loss": -0.0127, + "step": 3198 + }, + { + "clip_ratio": 0.004320526437368244, + "completion_length": 773.8214721679688, + "epoch": 0.11936010447273915, + "grad_norm": 0.09604209661483765, + "kl": 0.02569580078125, + "learning_rate": 3.7302387201879275e-06, + "loss": 0.0002, + "num_tokens": 75160761.0, + "reward": 0.3625575564801693, + "reward_std": 0.1897135111503303, + "rewards/code_reward": 0.2125575270038098, + "rewards/format_reward": 1.5, + "step": 3199 + }, + { + "clip_ratio": 0.004052879812661558, + "epoch": 0.1193974161652908, + "grad_norm": 0.0882202535867691, + "kl": 0.02587890625, + "learning_rate": 3.7275590968782092e-06, + "loss": 0.0001, + "step": 3200 + }, + { + "clip_ratio": 0.003514446783810854, + "epoch": 0.11943472785784245, + "grad_norm": 0.08643096685409546, + "kl": 0.025482177734375, + "learning_rate": 3.7248802172517672e-06, + "loss": -0.0003, + "step": 3201 + }, + { + "clip_ratio": 0.003762095875572413, + "completion_length": 834.8750534057617, + "epoch": 0.11947203955039411, + "grad_norm": 0.07721506804227829, + "kl": 0.022064208984375, + "learning_rate": 3.7222020824326134e-06, + "loss": 0.0164, + "num_tokens": 75247694.0, + "reward": 0.6425919234752655, + "reward_std": 0.2716106250882149, + "rewards/code_reward": 0.4952704459428787, + "rewards/format_reward": 1.4732142984867096, + "step": 3202 + }, + { + "clip_ratio": 0.0038527342840097845, + "epoch": 0.11950935124294576, + "grad_norm": 0.07679919898509979, + "kl": 0.021697998046875, + "learning_rate": 3.719524693544441e-06, + "loss": 0.0165, + "step": 3203 + }, + { + "clip_ratio": 0.0032838783226907253, + "epoch": 0.11954666293549741, + "grad_norm": 0.07743402570486069, + "kl": 0.0222015380859375, + "learning_rate": 3.716848051710634e-06, + "loss": 0.0163, + "step": 3204 + }, + { + "clip_ratio": 0.004765747231431305, + "completion_length": 549.1250305175781, + "epoch": 0.11958397462804907, + "grad_norm": 0.09555885195732117, + "kl": 0.023162841796875, + "learning_rate": 3.7141721580542634e-06, + "loss": -0.0029, + "num_tokens": 75311765.0, + "reward": 0.6036401093006134, + "reward_std": 0.2577563300728798, + "rewards/code_reward": 0.4536401052027941, + "rewards/format_reward": 1.5, + "step": 3205 + }, + { + "clip_ratio": 0.004653690557461232, + "epoch": 0.11962128632060072, + "grad_norm": 0.09684383869171143, + "kl": 0.0230712890625, + "learning_rate": 3.711497013698081e-06, + "loss": -0.003, + "step": 3206 + }, + { + "clip_ratio": 0.004854105180129409, + "epoch": 0.11965859801315237, + "grad_norm": 0.074730783700943, + "kl": 0.023406982421875, + "learning_rate": 3.7088226197645294e-06, + "loss": -0.0033, + "step": 3207 + }, + { + "clip_ratio": 0.004197440081043169, + "completion_length": 640.1250305175781, + "epoch": 0.11969590970570403, + "grad_norm": 0.06597737222909927, + "kl": 0.02044677734375, + "learning_rate": 3.706148977375734e-06, + "loss": -0.0024, + "num_tokens": 75375780.0, + "reward": 0.5844742469489574, + "reward_std": 0.14293115586042404, + "rewards/code_reward": 0.43447423726320267, + "rewards/format_reward": 1.5, + "step": 3208 + }, + { + "clip_ratio": 0.003810590977082029, + "epoch": 0.11973322139825568, + "grad_norm": 0.07029876857995987, + "kl": 0.02081298828125, + "learning_rate": 3.703476087653505e-06, + "loss": -0.0025, + "step": 3209 + }, + { + "clip_ratio": 0.0037640680675394833, + "epoch": 0.11977053309080733, + "grad_norm": 0.0709041953086853, + "kl": 0.02093505859375, + "learning_rate": 3.7008039517193396e-06, + "loss": -0.0026, + "step": 3210 + }, + { + "clip_ratio": 0.003343959106132388, + "completion_length": 797.6071701049805, + "epoch": 0.11980784478335899, + "grad_norm": 0.07458386570215225, + "kl": 0.0309906005859375, + "learning_rate": 3.698132570694415e-06, + "loss": -0.0002, + "num_tokens": 75460262.0, + "reward": 0.6724326089024544, + "reward_std": 0.27871936559677124, + "rewards/code_reward": 0.5224325880408287, + "rewards/format_reward": 1.5, + "step": 3211 + }, + { + "clip_ratio": 0.003255473158787936, + "epoch": 0.11984515647591064, + "grad_norm": 0.07005707174539566, + "kl": 0.031341552734375, + "learning_rate": 3.6954619456995933e-06, + "loss": -0.0001, + "step": 3212 + }, + { + "clip_ratio": 0.0032814466394484043, + "epoch": 0.11988246816846229, + "grad_norm": 0.06756866723299026, + "kl": 0.0304718017578125, + "learning_rate": 3.692792077855418e-06, + "loss": -0.0002, + "step": 3213 + }, + { + "clip_ratio": 0.0033860268886201084, + "completion_length": 755.1607513427734, + "epoch": 0.11991977986101394, + "grad_norm": 0.06750387698411942, + "kl": 0.0159912109375, + "learning_rate": 3.690122968282116e-06, + "loss": 0.0033, + "num_tokens": 75533463.0, + "reward": 0.4683639854192734, + "reward_std": 0.11913722287863493, + "rewards/code_reward": 0.3183639799244702, + "rewards/format_reward": 1.5, + "step": 3214 + }, + { + "clip_ratio": 0.0033050094498321414, + "epoch": 0.1199570915535656, + "grad_norm": 0.050453606992959976, + "kl": 0.015869140625, + "learning_rate": 3.6874546180995994e-06, + "loss": 0.0033, + "step": 3215 + }, + { + "clip_ratio": 0.0032834933954291046, + "epoch": 0.11999440324611725, + "grad_norm": 0.05137243866920471, + "kl": 0.015960693359375, + "learning_rate": 3.6847870284274533e-06, + "loss": 0.0032, + "step": 3216 + }, + { + "clip_ratio": 0.002060184080619365, + "completion_length": 592.1428756713867, + "epoch": 0.1200317149386689, + "grad_norm": 0.06570267677307129, + "kl": 0.018310546875, + "learning_rate": 3.6821202003849543e-06, + "loss": 0.0099, + "num_tokens": 75586251.0, + "reward": 0.7936462722718716, + "reward_std": 0.10773773735854775, + "rewards/code_reward": 0.6436462728888728, + "rewards/format_reward": 1.5, + "step": 3217 + }, + { + "clip_ratio": 0.0016134431934915483, + "epoch": 0.12006902663122056, + "grad_norm": 0.054239191114902496, + "kl": 0.01806640625, + "learning_rate": 3.6794541350910495e-06, + "loss": 0.0097, + "step": 3218 + }, + { + "clip_ratio": 0.0022031995467841625, + "epoch": 0.12010633832377221, + "grad_norm": 0.05522192269563675, + "kl": 0.01824951171875, + "learning_rate": 3.676788833664372e-06, + "loss": 0.0098, + "step": 3219 + }, + { + "clip_ratio": 0.003688161028549075, + "completion_length": 780.1071929931641, + "epoch": 0.12014365001632386, + "grad_norm": 0.07658540457487106, + "kl": 0.0175933837890625, + "learning_rate": 3.6741242972232326e-06, + "loss": -0.0045, + "num_tokens": 75669809.0, + "reward": 0.7806410454213619, + "reward_std": 0.16399390972219408, + "rewards/code_reward": 0.6306410320394207, + "rewards/format_reward": 1.5, + "step": 3220 + }, + { + "clip_ratio": 0.0034067304513882846, + "epoch": 0.12018096170887552, + "grad_norm": 0.07508627325296402, + "kl": 0.0179901123046875, + "learning_rate": 3.671460526885621e-06, + "loss": -0.0047, + "step": 3221 + }, + { + "clip_ratio": 0.003485659050056711, + "epoch": 0.12021827340142717, + "grad_norm": 0.07490460574626923, + "kl": 0.017608642578125, + "learning_rate": 3.6687975237692075e-06, + "loss": -0.0049, + "step": 3222 + }, + { + "clip_ratio": 0.003566127037629485, + "completion_length": 801.8571929931641, + "epoch": 0.12025558509397882, + "grad_norm": 0.06919880956411362, + "kl": 0.02362060546875, + "learning_rate": 3.666135288991336e-06, + "loss": 0.0137, + "num_tokens": 75745919.0, + "reward": 0.523039273917675, + "reward_std": 0.08673156052827835, + "rewards/code_reward": 0.37571780756115913, + "rewards/format_reward": 1.4732142984867096, + "step": 3223 + }, + { + "clip_ratio": 0.003832756308838725, + "epoch": 0.12029289678653048, + "grad_norm": 0.06627404689788818, + "kl": 0.0233154296875, + "learning_rate": 3.6634738236690317e-06, + "loss": 0.0136, + "step": 3224 + }, + { + "clip_ratio": 0.003607424965593964, + "epoch": 0.12033020847908213, + "grad_norm": 0.06442967802286148, + "kl": 0.0233917236328125, + "learning_rate": 3.6608131289189985e-06, + "loss": 0.0133, + "step": 3225 + }, + { + "clip_ratio": 0.004447162908036262, + "completion_length": 594.2857437133789, + "epoch": 0.12036752017163378, + "grad_norm": 0.08318357914686203, + "kl": 0.022064208984375, + "learning_rate": 3.6581532058576106e-06, + "loss": 0.018, + "num_tokens": 75808431.0, + "reward": 0.610676746815443, + "reward_std": 0.1728401891887188, + "rewards/code_reward": 0.46067674458026886, + "rewards/format_reward": 1.5, + "step": 3226 + }, + { + "clip_ratio": 0.004240899521391839, + "epoch": 0.12040483186418544, + "grad_norm": 0.08089282363653183, + "kl": 0.021881103515625, + "learning_rate": 3.6554940556009243e-06, + "loss": 0.0177, + "step": 3227 + }, + { + "clip_ratio": 0.004253811959642917, + "epoch": 0.12044214355673709, + "grad_norm": 0.07218444347381592, + "kl": 0.02239990234375, + "learning_rate": 3.6528356792646715e-06, + "loss": 0.0174, + "step": 3228 + }, + { + "clip_ratio": 0.0034291197080165148, + "completion_length": 660.2143173217773, + "epoch": 0.12047945524928874, + "grad_norm": 0.06068615987896919, + "kl": 0.022369384765625, + "learning_rate": 3.6501780779642528e-06, + "loss": -0.0058, + "num_tokens": 75874425.0, + "reward": 0.5385714247822762, + "reward_std": 0.146863779053092, + "rewards/code_reward": 0.3885714281350374, + "rewards/format_reward": 1.5, + "step": 3229 + }, + { + "clip_ratio": 0.003138350381050259, + "epoch": 0.1205167669418404, + "grad_norm": 0.05720178410410881, + "kl": 0.02239990234375, + "learning_rate": 3.647521252814754e-06, + "loss": -0.006, + "step": 3230 + }, + { + "clip_ratio": 0.00325335148954764, + "epoch": 0.12055407863439205, + "grad_norm": 0.05627739056944847, + "kl": 0.02178955078125, + "learning_rate": 3.644865204930924e-06, + "loss": -0.0061, + "step": 3231 + }, + { + "clip_ratio": 0.0028940493357367814, + "completion_length": 758.2678985595703, + "epoch": 0.1205913903269437, + "grad_norm": 0.07078709453344345, + "kl": 0.020904541015625, + "learning_rate": 3.6422099354271945e-06, + "loss": -0.0043, + "num_tokens": 75956942.0, + "reward": 0.721881989389658, + "reward_std": 0.11814757669344544, + "rewards/code_reward": 0.5718819983303547, + "rewards/format_reward": 1.5, + "step": 3232 + }, + { + "clip_ratio": 0.0029169777990318835, + "epoch": 0.12062870201949535, + "grad_norm": 0.06927255541086197, + "kl": 0.0207672119140625, + "learning_rate": 3.639555445417667e-06, + "loss": -0.0044, + "step": 3233 + }, + { + "clip_ratio": 0.0027394936769269407, + "epoch": 0.12066601371204701, + "grad_norm": 0.06617403030395508, + "kl": 0.0208282470703125, + "learning_rate": 3.6369017360161163e-06, + "loss": -0.0046, + "step": 3234 + }, + { + "clip_ratio": 0.003886696242261678, + "completion_length": 841.0536193847656, + "epoch": 0.12070332540459866, + "grad_norm": 0.05987152084708214, + "kl": 0.0182342529296875, + "learning_rate": 3.6342488083359895e-06, + "loss": 0.0085, + "num_tokens": 76046069.0, + "reward": 0.3810250870883465, + "reward_std": 0.2917453497648239, + "rewards/code_reward": 0.23102505691349506, + "rewards/format_reward": 1.5, + "step": 3235 + }, + { + "clip_ratio": 0.00383836985565722, + "epoch": 0.12074063709715031, + "grad_norm": 0.058344628661870956, + "kl": 0.0189056396484375, + "learning_rate": 3.6315966634904045e-06, + "loss": 0.0086, + "step": 3236 + }, + { + "clip_ratio": 0.0035272433306090534, + "epoch": 0.12077794878970197, + "grad_norm": 0.055790677666664124, + "kl": 0.01824951171875, + "learning_rate": 3.6289453025921517e-06, + "loss": 0.0082, + "step": 3237 + }, + { + "clip_ratio": 0.001377586624585092, + "completion_length": 546.178596496582, + "epoch": 0.12081526048225362, + "grad_norm": 0.0025819840375334024, + "kl": 0.021148681640625, + "learning_rate": 3.626294726753695e-06, + "loss": 0.0002, + "num_tokens": 76112523.0, + "reward": 0.899999987334013, + "reward_std": 0.0, + "rewards/code_reward": 0.75, + "rewards/format_reward": 1.5, + "step": 3238 + }, + { + "clip_ratio": 0.0015507623320445418, + "epoch": 0.12085257217480527, + "grad_norm": 0.0028007281944155693, + "kl": 0.021636962890625, + "learning_rate": 3.623644937087165e-06, + "loss": 0.0003, + "step": 3239 + }, + { + "clip_ratio": 0.001763948705047369, + "epoch": 0.12088988386735694, + "grad_norm": 0.0026020838413387537, + "kl": 0.021026611328125, + "learning_rate": 3.6209959347043676e-06, + "loss": 0.0002, + "step": 3240 + }, + { + "clip_ratio": 0.004040351137518883, + "completion_length": 738.5893096923828, + "epoch": 0.12092719555990859, + "grad_norm": 0.07141219079494476, + "kl": 0.03289794921875, + "learning_rate": 3.6183477207167703e-06, + "loss": 0.0202, + "num_tokens": 76188870.0, + "reward": 0.6092776395380497, + "reward_std": 0.10633524414151907, + "rewards/code_reward": 0.4612419307231903, + "rewards/format_reward": 1.480357140302658, + "step": 3241 + }, + { + "clip_ratio": 0.003753526514628902, + "epoch": 0.12096450725246025, + "grad_norm": 0.07148023694753647, + "kl": 0.0326385498046875, + "learning_rate": 3.615700296235516e-06, + "loss": 0.0202, + "step": 3242 + }, + { + "clip_ratio": 0.004002752888482064, + "epoch": 0.1210018189450119, + "grad_norm": 0.07255565375089645, + "kl": 0.0316162109375, + "learning_rate": 3.6130536623714163e-06, + "loss": 0.0203, + "step": 3243 + }, + { + "clip_ratio": 0.0029526565922424197, + "completion_length": 662.5178833007812, + "epoch": 0.12103913063756355, + "grad_norm": 0.08008284866809845, + "kl": 0.02166748046875, + "learning_rate": 3.610407820234948e-06, + "loss": 0.0065, + "num_tokens": 76249407.0, + "reward": 0.7969825640320778, + "reward_std": 0.15823099203407764, + "rewards/code_reward": 0.6469825580716133, + "rewards/format_reward": 1.5, + "step": 3244 + }, + { + "clip_ratio": 0.003102195740211755, + "epoch": 0.1210764423301152, + "grad_norm": 0.07470151036977768, + "kl": 0.02154541015625, + "learning_rate": 3.6077627709362597e-06, + "loss": 0.0064, + "step": 3245 + }, + { + "clip_ratio": 0.0023765236837789416, + "epoch": 0.12111375402266686, + "grad_norm": 0.06947454810142517, + "kl": 0.0213623046875, + "learning_rate": 3.6051185155851633e-06, + "loss": 0.006, + "step": 3246 + }, + { + "clip_ratio": 0.004035108664538711, + "completion_length": 760.6785888671875, + "epoch": 0.12115106571521851, + "grad_norm": 0.08201508969068527, + "kl": 0.0213623046875, + "learning_rate": 3.602475055291137e-06, + "loss": 0.0149, + "num_tokens": 76312865.0, + "reward": 0.46703530475497246, + "reward_std": 0.17167866602540016, + "rewards/code_reward": 0.31971385097131133, + "rewards/format_reward": 1.4732142984867096, + "step": 3247 + }, + { + "clip_ratio": 0.004309201496653259, + "epoch": 0.12118837740777016, + "grad_norm": 0.08041948080062866, + "kl": 0.0206451416015625, + "learning_rate": 3.5998323911633326e-06, + "loss": 0.0148, + "step": 3248 + }, + { + "clip_ratio": 0.003958480490837246, + "epoch": 0.12122568910032182, + "grad_norm": 0.07862448692321777, + "kl": 0.02008056640625, + "learning_rate": 3.5971905243105587e-06, + "loss": 0.0148, + "step": 3249 + }, + { + "clip_ratio": 0.004058787249960005, + "completion_length": 573.857177734375, + "epoch": 0.12126300079287347, + "grad_norm": 0.0723070576786995, + "kl": 0.022796630859375, + "learning_rate": 3.5945494558412964e-06, + "loss": 0.0014, + "num_tokens": 76366141.0, + "reward": 0.5321778990328312, + "reward_std": 0.2857184559106827, + "rewards/code_reward": 0.3821778725832701, + "rewards/format_reward": 1.5, + "step": 3250 + }, + { + "clip_ratio": 0.003561908844858408, + "epoch": 0.12130031248542512, + "grad_norm": 0.071421779692173, + "kl": 0.023193359375, + "learning_rate": 3.5919091868636856e-06, + "loss": 0.0015, + "step": 3251 + }, + { + "clip_ratio": 0.0032809998956508934, + "epoch": 0.12133762417797678, + "grad_norm": 0.0668526366353035, + "kl": 0.023162841796875, + "learning_rate": 3.5892697184855364e-06, + "loss": 0.0012, + "step": 3252 + }, + { + "clip_ratio": 0.004225993063300848, + "completion_length": 707.1428985595703, + "epoch": 0.12137493587052843, + "grad_norm": 0.05574675649404526, + "kl": 0.025543212890625, + "learning_rate": 3.58663105181432e-06, + "loss": 0.0029, + "num_tokens": 76442225.0, + "reward": 0.7314935140311718, + "reward_std": 0.21436453610658646, + "rewards/code_reward": 0.5814934968948364, + "rewards/format_reward": 1.5, + "step": 3253 + }, + { + "clip_ratio": 0.0038781487965025008, + "epoch": 0.12141224756308008, + "grad_norm": 0.054938189685344696, + "kl": 0.0252685546875, + "learning_rate": 3.5839931879571733e-06, + "loss": 0.0027, + "step": 3254 + }, + { + "clip_ratio": 0.003505534608848393, + "epoch": 0.12144955925563174, + "grad_norm": 0.05515574291348457, + "kl": 0.025634765625, + "learning_rate": 3.5813561280208953e-06, + "loss": 0.0027, + "step": 3255 + }, + { + "clip_ratio": 0.0030415590154007077, + "completion_length": 615.8571548461914, + "epoch": 0.12148687094818339, + "grad_norm": 0.09136798977851868, + "kl": 0.01714324951171875, + "learning_rate": 3.578719873111943e-06, + "loss": -0.0011, + "num_tokens": 76495291.0, + "reward": 0.920036643743515, + "reward_std": 0.24683740735054016, + "rewards/code_reward": 0.7700366377830505, + "rewards/format_reward": 1.5, + "step": 3256 + }, + { + "clip_ratio": 0.0025369951617904007, + "epoch": 0.12152418264073504, + "grad_norm": 0.07468993961811066, + "kl": 0.016448974609375, + "learning_rate": 3.5760844243364446e-06, + "loss": -0.0011, + "step": 3257 + }, + { + "clip_ratio": 0.002826703537721187, + "epoch": 0.1215614943332867, + "grad_norm": 0.06724870949983597, + "kl": 0.01628875732421875, + "learning_rate": 3.5734497828001856e-06, + "loss": -0.0012, + "step": 3258 + }, + { + "clip_ratio": 0.005322706769220531, + "completion_length": 836.8750305175781, + "epoch": 0.12159880602583835, + "grad_norm": 0.08485773205757141, + "kl": 0.0408935546875, + "learning_rate": 3.570815949608609e-06, + "loss": -0.0235, + "num_tokens": 76588454.0, + "reward": 0.5942209661006927, + "reward_std": 0.2884673457592726, + "rewards/code_reward": 0.44422096642665565, + "rewards/format_reward": 1.5, + "step": 3259 + }, + { + "clip_ratio": 0.004957864410243928, + "epoch": 0.12163611771839, + "grad_norm": 0.08455481380224228, + "kl": 0.046173095703125, + "learning_rate": 3.5681829258668245e-06, + "loss": -0.0236, + "step": 3260 + }, + { + "clip_ratio": 0.004613128665369004, + "epoch": 0.12167342941094166, + "grad_norm": 0.08481277525424957, + "kl": 0.0426177978515625, + "learning_rate": 3.5655507126795997e-06, + "loss": -0.024, + "step": 3261 + }, + { + "clip_ratio": 0.0037475941935554147, + "completion_length": 873.982177734375, + "epoch": 0.12171074110349331, + "grad_norm": 0.07377642393112183, + "kl": 0.0178985595703125, + "learning_rate": 3.5629193111513615e-06, + "loss": 0.0251, + "num_tokens": 76670997.0, + "reward": 0.42942189797759056, + "reward_std": 0.18555491976439953, + "rewards/code_reward": 0.2794218680355698, + "rewards/format_reward": 1.5, + "step": 3262 + }, + { + "clip_ratio": 0.0032056656782515347, + "epoch": 0.12174805279604496, + "grad_norm": 0.07309293746948242, + "kl": 0.018341064453125, + "learning_rate": 3.5602887223862004e-06, + "loss": 0.0249, + "step": 3263 + }, + { + "clip_ratio": 0.0033068565535359085, + "epoch": 0.12178536448859661, + "grad_norm": 0.07060471177101135, + "kl": 0.0175933837890625, + "learning_rate": 3.5576589474878585e-06, + "loss": 0.0248, + "step": 3264 + }, + { + "clip_ratio": 0.0035007541300728917, + "completion_length": 684.2143249511719, + "epoch": 0.12182267618114827, + "grad_norm": 0.07712812721729279, + "kl": 0.0153961181640625, + "learning_rate": 3.5550299875597404e-06, + "loss": 0.0063, + "num_tokens": 76737369.0, + "reward": 0.5111156366765499, + "reward_std": 0.15494662220589817, + "rewards/code_reward": 0.3611155841499567, + "rewards/format_reward": 1.5, + "step": 3265 + }, + { + "clip_ratio": 0.0036638486781157553, + "epoch": 0.12185998787369992, + "grad_norm": 0.07535664737224579, + "kl": 0.015411376953125, + "learning_rate": 3.5524018437049145e-06, + "loss": 0.0062, + "step": 3266 + }, + { + "clip_ratio": 0.0041164371650666, + "epoch": 0.12189729956625157, + "grad_norm": 0.06284154206514359, + "kl": 0.015411376953125, + "learning_rate": 3.5497745170260955e-06, + "loss": 0.0062, + "step": 3267 + }, + { + "clip_ratio": 0.0017029159353114665, + "completion_length": 516.9286041259766, + "epoch": 0.12193461125880323, + "grad_norm": 0.05670943856239319, + "kl": 0.0186767578125, + "learning_rate": 3.5471480086256626e-06, + "loss": 0.0113, + "num_tokens": 76787863.0, + "reward": 1.0815724581480026, + "reward_std": 0.12620496563613415, + "rewards/code_reward": 0.931572437286377, + "rewards/format_reward": 1.5, + "step": 3268 + }, + { + "clip_ratio": 0.002210859442129731, + "epoch": 0.12197192295135488, + "grad_norm": 0.055106330662965775, + "kl": 0.0186309814453125, + "learning_rate": 3.5445223196056488e-06, + "loss": 0.0112, + "step": 3269 + }, + { + "clip_ratio": 0.0019299584673717618, + "epoch": 0.12200923464390653, + "grad_norm": 0.05487102270126343, + "kl": 0.0185699462890625, + "learning_rate": 3.5418974510677462e-06, + "loss": 0.011, + "step": 3270 + }, + { + "clip_ratio": 0.003533620503731072, + "completion_length": 588.6428680419922, + "epoch": 0.12204654633645819, + "grad_norm": 0.060120176523923874, + "kl": 0.0201873779296875, + "learning_rate": 3.5392734041133e-06, + "loss": 0.0053, + "num_tokens": 76855267.0, + "reward": 0.6814365088939667, + "reward_std": 0.16880172491073608, + "rewards/code_reward": 0.531436488032341, + "rewards/format_reward": 1.5, + "step": 3271 + }, + { + "clip_ratio": 0.0032274973927997053, + "epoch": 0.12208385802900984, + "grad_norm": 0.05637926608324051, + "kl": 0.019927978515625, + "learning_rate": 3.5366501798433117e-06, + "loss": 0.0051, + "step": 3272 + }, + { + "clip_ratio": 0.003472518175840378, + "epoch": 0.12212116972156149, + "grad_norm": 0.05780617147684097, + "kl": 0.02001953125, + "learning_rate": 3.53402777935844e-06, + "loss": 0.0051, + "step": 3273 + }, + { + "clip_ratio": 0.0033248720574192703, + "completion_length": 585.5536041259766, + "epoch": 0.12215848141411315, + "grad_norm": 0.06939540058374405, + "kl": 0.0244140625, + "learning_rate": 3.53140620375899e-06, + "loss": 0.0004, + "num_tokens": 76919782.0, + "reward": 0.8345938213169575, + "reward_std": 0.1603264548466541, + "rewards/code_reward": 0.6845938069163822, + "rewards/format_reward": 1.5, + "step": 3274 + }, + { + "clip_ratio": 0.0025622062967158854, + "epoch": 0.1221957931066648, + "grad_norm": 0.06845883280038834, + "kl": 0.023895263671875, + "learning_rate": 3.5287854541449294e-06, + "loss": 0.0001, + "step": 3275 + }, + { + "clip_ratio": 0.002570124459452927, + "epoch": 0.12223310479921645, + "grad_norm": 0.06768900156021118, + "kl": 0.0244140625, + "learning_rate": 3.5261655316158783e-06, + "loss": -0.0002, + "step": 3276 + }, + { + "clip_ratio": 0.003401731955818832, + "completion_length": 1085.8750457763672, + "epoch": 0.1222704164917681, + "grad_norm": 0.0662391260266304, + "kl": 0.0144805908203125, + "learning_rate": 3.5235464372711027e-06, + "loss": 0.0018, + "num_tokens": 77024285.0, + "reward": 0.4268491417169571, + "reward_std": 0.1663036230020225, + "rewards/code_reward": 0.27684911666437984, + "rewards/format_reward": 1.5, + "step": 3277 + }, + { + "clip_ratio": 0.0032447707490064204, + "epoch": 0.12230772818431976, + "grad_norm": 0.08063413202762604, + "kl": 0.014678955078125, + "learning_rate": 3.520928172209529e-06, + "loss": 0.0017, + "step": 3278 + }, + { + "clip_ratio": 0.003390219120774418, + "epoch": 0.12234503987687141, + "grad_norm": 0.05832769721746445, + "kl": 0.0149078369140625, + "learning_rate": 3.518310737529731e-06, + "loss": 0.0017, + "step": 3279 + }, + { + "clip_ratio": 0.0015975040150806308, + "completion_length": 624.8928909301758, + "epoch": 0.12238235156942306, + "grad_norm": 0.0016363946488127112, + "kl": 0.015411376953125, + "learning_rate": 3.515694134329937e-06, + "loss": 0.0002, + "num_tokens": 77088903.0, + "reward": 0.774999987334013, + "reward_std": 0.0, + "rewards/code_reward": 0.625, + "rewards/format_reward": 1.5, + "step": 3280 + }, + { + "clip_ratio": 0.0014343614457175136, + "epoch": 0.12241966326197472, + "grad_norm": 0.0015349301975220442, + "kl": 0.0152587890625, + "learning_rate": 3.5130783637080258e-06, + "loss": 0.0002, + "step": 3281 + }, + { + "clip_ratio": 0.001204386935569346, + "epoch": 0.12245697495452637, + "grad_norm": 0.001526908134110272, + "kl": 0.015472412109375, + "learning_rate": 3.5104634267615235e-06, + "loss": 0.0002, + "step": 3282 + }, + { + "clip_ratio": 0.004074029566254467, + "completion_length": 666.8393249511719, + "epoch": 0.12249428664707802, + "grad_norm": 0.07031568884849548, + "kl": 0.016815185546875, + "learning_rate": 3.507849324587612e-06, + "loss": 0.0187, + "num_tokens": 77157704.0, + "reward": 0.7204805724322796, + "reward_std": 0.16535314545035362, + "rewards/code_reward": 0.5704805254936218, + "rewards/format_reward": 1.5, + "step": 3283 + }, + { + "clip_ratio": 0.00396865454968065, + "epoch": 0.12253159833962968, + "grad_norm": 0.06848049908876419, + "kl": 0.0171356201171875, + "learning_rate": 3.505236058283118e-06, + "loss": 0.0188, + "step": 3284 + }, + { + "clip_ratio": 0.004069296322995797, + "epoch": 0.12256891003218133, + "grad_norm": 0.06266361474990845, + "kl": 0.0170135498046875, + "learning_rate": 3.5026236289445183e-06, + "loss": 0.0185, + "step": 3285 + }, + { + "clip_ratio": 0.0037691364414058626, + "completion_length": 585.7678833007812, + "epoch": 0.12260622172473298, + "grad_norm": 0.05924214795231819, + "kl": 0.01387786865234375, + "learning_rate": 3.5000120376679447e-06, + "loss": -0.0026, + "num_tokens": 77220859.0, + "reward": 0.5383928790688515, + "reward_std": 0.1918054074048996, + "rewards/code_reward": 0.3883928554132581, + "rewards/format_reward": 1.5, + "step": 3286 + }, + { + "clip_ratio": 0.004091858281753957, + "epoch": 0.12264353341728464, + "grad_norm": 0.05928516387939453, + "kl": 0.01404571533203125, + "learning_rate": 3.4974012855491667e-06, + "loss": -0.0024, + "step": 3287 + }, + { + "clip_ratio": 0.00398349529132247, + "epoch": 0.12268084510983629, + "grad_norm": 0.05902867764234543, + "kl": 0.01412200927734375, + "learning_rate": 3.49479137368361e-06, + "loss": -0.0024, + "step": 3288 + }, + { + "clip_ratio": 0.0034677974181249738, + "completion_length": 605.1964569091797, + "epoch": 0.12271815680238794, + "grad_norm": 0.054390840232372284, + "kl": 0.02459716796875, + "learning_rate": 3.4921823031663455e-06, + "loss": 0.0007, + "num_tokens": 77296232.0, + "reward": 0.6113613769412041, + "reward_std": 0.01545453816652298, + "rewards/code_reward": 0.4613613784313202, + "rewards/format_reward": 1.5, + "step": 3289 + }, + { + "clip_ratio": 0.003377836139407009, + "epoch": 0.1227554684949396, + "grad_norm": 0.05340440943837166, + "kl": 0.025054931640625, + "learning_rate": 3.48957407509209e-06, + "loss": 0.0006, + "step": 3290 + }, + { + "clip_ratio": 0.0034759296104311943, + "epoch": 0.12279278018749125, + "grad_norm": 0.05179211497306824, + "kl": 0.024871826171875, + "learning_rate": 3.4869666905552092e-06, + "loss": 0.0003, + "step": 3291 + }, + { + "clip_ratio": 0.004842805152293295, + "completion_length": 626.5536041259766, + "epoch": 0.1228300918800429, + "grad_norm": 0.07208505272865295, + "kl": 0.0170135498046875, + "learning_rate": 3.4843601506497095e-06, + "loss": -0.0026, + "num_tokens": 77361713.0, + "reward": 0.4421435035765171, + "reward_std": 0.19636291172355413, + "rewards/code_reward": 0.2921434924355708, + "rewards/format_reward": 1.5, + "step": 3292 + }, + { + "clip_ratio": 0.004686621658038348, + "epoch": 0.12286740357259456, + "grad_norm": 0.07251010835170746, + "kl": 0.0168914794921875, + "learning_rate": 3.4817544564692497e-06, + "loss": -0.0029, + "step": 3293 + }, + { + "clip_ratio": 0.004787354846484959, + "epoch": 0.12290471526514621, + "grad_norm": 0.07153011113405228, + "kl": 0.0174560546875, + "learning_rate": 3.479149609107132e-06, + "loss": -0.0028, + "step": 3294 + }, + { + "clip_ratio": 0.0030682009528391063, + "completion_length": 703.4285888671875, + "epoch": 0.12294202695769788, + "grad_norm": 0.07508868724107742, + "kl": 0.0198974609375, + "learning_rate": 3.476545609656297e-06, + "loss": -0.0025, + "num_tokens": 77436945.0, + "reward": 1.0273491889238358, + "reward_std": 0.1556708049029112, + "rewards/code_reward": 0.8773491382598877, + "rewards/format_reward": 1.5, + "step": 3295 + }, + { + "clip_ratio": 0.0028823568136431277, + "epoch": 0.12297933865024953, + "grad_norm": 0.07396262884140015, + "kl": 0.02093505859375, + "learning_rate": 3.473942459209338e-06, + "loss": -0.0026, + "step": 3296 + }, + { + "clip_ratio": 0.0029606830212287605, + "epoch": 0.12301665034280118, + "grad_norm": 0.07111619412899017, + "kl": 0.019500732421875, + "learning_rate": 3.471340158858488e-06, + "loss": -0.0028, + "step": 3297 + }, + { + "clip_ratio": 0.005490940064191818, + "completion_length": 606.5357360839844, + "epoch": 0.12305396203535283, + "grad_norm": 0.045634347945451736, + "kl": 0.03948974609375, + "learning_rate": 3.4687387096956233e-06, + "loss": -0.0022, + "num_tokens": 77503377.0, + "reward": 0.5058991238474846, + "reward_std": 0.1283881515264511, + "rewards/code_reward": 0.35589905828237534, + "rewards/format_reward": 1.5, + "step": 3298 + }, + { + "clip_ratio": 0.004363291896879673, + "epoch": 0.12309127372790449, + "grad_norm": 0.04302666336297989, + "kl": 0.037322998046875, + "learning_rate": 3.4661381128122664e-06, + "loss": -0.0023, + "step": 3299 + }, + { + "clip_ratio": 0.004430459870491177, + "epoch": 0.12312858542045614, + "grad_norm": 0.043095912784338, + "kl": 0.040283203125, + "learning_rate": 3.463538369299576e-06, + "loss": -0.0024, + "step": 3300 + }, + { + "clip_ratio": 0.004429796128533781, + "completion_length": 761.9107513427734, + "epoch": 0.1231658971130078, + "grad_norm": 0.07730039954185486, + "kl": 0.034576416015625, + "learning_rate": 3.4609394802483575e-06, + "loss": -0.0097, + "num_tokens": 77590284.0, + "reward": 0.6667914539575577, + "reward_std": 0.21735519170761108, + "rewards/code_reward": 0.5167914647608995, + "rewards/format_reward": 1.5, + "step": 3301 + }, + { + "clip_ratio": 0.004063522559590638, + "epoch": 0.12320320880555945, + "grad_norm": 0.11184808611869812, + "kl": 0.034423828125, + "learning_rate": 3.458341446749056e-06, + "loss": -0.0098, + "step": 3302 + }, + { + "clip_ratio": 0.004189736559055746, + "epoch": 0.1232405204981111, + "grad_norm": 0.07400871813297272, + "kl": 0.035369873046875, + "learning_rate": 3.4557442698917577e-06, + "loss": -0.0099, + "step": 3303 + }, + { + "clip_ratio": 0.002822626556735486, + "completion_length": 860.1964569091797, + "epoch": 0.12327783219066275, + "grad_norm": 0.06245522201061249, + "kl": 0.023651123046875, + "learning_rate": 3.4531479507661927e-06, + "loss": 0.0001, + "num_tokens": 77677291.0, + "reward": 0.5065724849700928, + "reward_std": 0.056664278730750084, + "rewards/code_reward": 0.35657248366624117, + "rewards/format_reward": 1.5, + "step": 3304 + }, + { + "clip_ratio": 0.002845659095328301, + "epoch": 0.1233151438832144, + "grad_norm": 0.058470942080020905, + "kl": 0.0234222412109375, + "learning_rate": 3.4505524904617247e-06, + "loss": 0.0, + "step": 3305 + }, + { + "clip_ratio": 0.0028319748234935105, + "epoch": 0.12335245557576606, + "grad_norm": 0.05214754119515419, + "kl": 0.0231781005859375, + "learning_rate": 3.4479578900673618e-06, + "loss": -0.0, + "step": 3306 + }, + { + "clip_ratio": 0.0030590295209549367, + "completion_length": 689.5714569091797, + "epoch": 0.12338976726831771, + "grad_norm": 0.09298142045736313, + "kl": 0.01361083984375, + "learning_rate": 3.4453641506717495e-06, + "loss": 0.0211, + "num_tokens": 77751705.0, + "reward": 0.6509724296629429, + "reward_std": 0.25592645443975925, + "rewards/code_reward": 0.500972418114543, + "rewards/format_reward": 1.5, + "step": 3307 + }, + { + "clip_ratio": 0.003117046901024878, + "epoch": 0.12342707896086937, + "grad_norm": 0.08075343072414398, + "kl": 0.01377105712890625, + "learning_rate": 3.442771273363174e-06, + "loss": 0.0209, + "step": 3308 + }, + { + "clip_ratio": 0.0029915018239989877, + "epoch": 0.12346439065342102, + "grad_norm": 0.08338242024183273, + "kl": 0.01409912109375, + "learning_rate": 3.4401792592295603e-06, + "loss": 0.0207, + "step": 3309 + }, + { + "clip_ratio": 0.0013979755458422005, + "completion_length": 767.9464569091797, + "epoch": 0.12350170234597267, + "grad_norm": 0.043157100677490234, + "kl": 0.0169830322265625, + "learning_rate": 3.437588109358465e-06, + "loss": 0.0308, + "num_tokens": 77828884.0, + "reward": 0.9328440129756927, + "reward_std": 0.22041639685630798, + "rewards/code_reward": 0.7855226099491119, + "rewards/format_reward": 1.4732142984867096, + "step": 3310 + }, + { + "clip_ratio": 0.0014479966484941542, + "epoch": 0.12353901403852433, + "grad_norm": 0.04301004484295845, + "kl": 0.016815185546875, + "learning_rate": 3.4349978248370908e-06, + "loss": 0.0308, + "step": 3311 + }, + { + "clip_ratio": 0.0014608438359573483, + "epoch": 0.12357632573107598, + "grad_norm": 0.04433252662420273, + "kl": 0.016754150390625, + "learning_rate": 3.432408406752268e-06, + "loss": 0.0308, + "step": 3312 + }, + { + "clip_ratio": 0.002191282110288739, + "completion_length": 599.5000228881836, + "epoch": 0.12361363742362763, + "grad_norm": 0.037467923015356064, + "kl": 0.0206451416015625, + "learning_rate": 3.4298198561904712e-06, + "loss": -0.0041, + "num_tokens": 77894572.0, + "reward": 0.8285714201629162, + "reward_std": 0.06419407576322556, + "rewards/code_reward": 0.6785714328289032, + "rewards/format_reward": 1.5, + "step": 3313 + }, + { + "clip_ratio": 0.001816570875234902, + "epoch": 0.12365094911617928, + "grad_norm": 0.038937535136938095, + "kl": 0.020599365234375, + "learning_rate": 3.427232174237808e-06, + "loss": -0.0041, + "step": 3314 + }, + { + "clip_ratio": 0.0018137689912691712, + "epoch": 0.12368826080873094, + "grad_norm": 0.05264589190483093, + "kl": 0.0211334228515625, + "learning_rate": 3.4246453619800225e-06, + "loss": -0.0042, + "step": 3315 + }, + { + "clip_ratio": 0.0036609291564673185, + "completion_length": 630.9286117553711, + "epoch": 0.12372557250128259, + "grad_norm": 0.08005592226982117, + "kl": 0.0247802734375, + "learning_rate": 3.4220594205024925e-06, + "loss": -0.0052, + "num_tokens": 77964708.0, + "reward": 0.6659830138087273, + "reward_std": 0.3666919395327568, + "rewards/code_reward": 0.5177687481045723, + "rewards/format_reward": 1.4821428656578064, + "step": 3316 + }, + { + "clip_ratio": 0.0035102064721286297, + "epoch": 0.12376288419383424, + "grad_norm": 0.07968675345182419, + "kl": 0.0250244140625, + "learning_rate": 3.4194743508902295e-06, + "loss": -0.0053, + "step": 3317 + }, + { + "clip_ratio": 0.0032432895386591554, + "epoch": 0.1238001958863859, + "grad_norm": 0.0918649360537529, + "kl": 0.02447509765625, + "learning_rate": 3.4168901542278827e-06, + "loss": -0.0054, + "step": 3318 + }, + { + "clip_ratio": 0.004554424260277301, + "completion_length": 743.8393096923828, + "epoch": 0.12383750757893755, + "grad_norm": 0.10511420667171478, + "kl": 0.017486572265625, + "learning_rate": 3.4143068315997336e-06, + "loss": 0.014, + "num_tokens": 78044793.0, + "reward": 0.39104075357317924, + "reward_std": 0.2670020340010524, + "rewards/code_reward": 0.24104075180366635, + "rewards/format_reward": 1.5, + "step": 3319 + }, + { + "clip_ratio": 0.004351576615590602, + "epoch": 0.1238748192714892, + "grad_norm": 0.08055247366428375, + "kl": 0.017547607421875, + "learning_rate": 3.411724384089693e-06, + "loss": 0.0137, + "step": 3320 + }, + { + "clip_ratio": 0.004381479462608695, + "epoch": 0.12391213096404086, + "grad_norm": 0.08115466684103012, + "kl": 0.0174713134765625, + "learning_rate": 3.40914281278131e-06, + "loss": 0.0136, + "step": 3321 + }, + { + "clip_ratio": 0.004627706948667765, + "completion_length": 638.0536041259766, + "epoch": 0.12394944265659251, + "grad_norm": 0.0965382382273674, + "kl": 0.027313232421875, + "learning_rate": 3.406562118757765e-06, + "loss": -0.0074, + "num_tokens": 78113872.0, + "reward": 0.6515977755188942, + "reward_std": 0.321767121553421, + "rewards/code_reward": 0.5015977695584297, + "rewards/format_reward": 1.5, + "step": 3322 + }, + { + "clip_ratio": 0.005131187033839524, + "epoch": 0.12398675434914416, + "grad_norm": 0.0971384197473526, + "kl": 0.028045654296875, + "learning_rate": 3.403982303101867e-06, + "loss": -0.0071, + "step": 3323 + }, + { + "clip_ratio": 0.003799003316089511, + "epoch": 0.12402406604169582, + "grad_norm": 0.09381041675806046, + "kl": 0.026519775390625, + "learning_rate": 3.40140336689606e-06, + "loss": -0.0079, + "step": 3324 + }, + { + "clip_ratio": 0.0005318945040926337, + "completion_length": 538.2500228881836, + "epoch": 0.12406137773424747, + "grad_norm": 0.03751419112086296, + "kl": 0.025177001953125, + "learning_rate": 3.3988253112224177e-06, + "loss": 0.0045, + "num_tokens": 78171202.0, + "reward": 1.095770239830017, + "reward_std": 0.07077732682228088, + "rewards/code_reward": 0.9457702487707138, + "rewards/format_reward": 1.5, + "step": 3325 + }, + { + "clip_ratio": 0.0007042965735308826, + "epoch": 0.12409868942679912, + "grad_norm": 0.03521256148815155, + "kl": 0.02490234375, + "learning_rate": 3.3962481371626437e-06, + "loss": 0.0045, + "step": 3326 + }, + { + "clip_ratio": 0.0005146123003214598, + "epoch": 0.12413600111935078, + "grad_norm": 0.03659931197762489, + "kl": 0.024810791015625, + "learning_rate": 3.3936718457980753e-06, + "loss": 0.0043, + "step": 3327 + }, + { + "clip_ratio": 0.0029862368246540427, + "completion_length": 653.928596496582, + "epoch": 0.12417331281190243, + "grad_norm": 0.11210646480321884, + "kl": 0.017425537109375, + "learning_rate": 3.3910964382096716e-06, + "loss": -0.0012, + "num_tokens": 78240456.0, + "reward": 0.4281592033803463, + "reward_std": 0.019548336043953896, + "rewards/code_reward": 0.27994491532444954, + "rewards/format_reward": 1.4821428656578064, + "step": 3328 + }, + { + "clip_ratio": 0.003263341379351914, + "epoch": 0.12421062450445408, + "grad_norm": 0.057482291013002396, + "kl": 0.016510009765625, + "learning_rate": 3.388521915478031e-06, + "loss": -0.0012, + "step": 3329 + }, + { + "clip_ratio": 0.003266888321377337, + "epoch": 0.12424793619700573, + "grad_norm": 0.05109044909477234, + "kl": 0.016357421875, + "learning_rate": 3.385948278683371e-06, + "loss": -0.0014, + "step": 3330 + }, + { + "clip_ratio": 0.0049037462449632585, + "completion_length": 759.8214569091797, + "epoch": 0.12428524788955739, + "grad_norm": 0.08249744027853012, + "kl": 0.023651123046875, + "learning_rate": 3.3833755289055424e-06, + "loss": -0.0097, + "num_tokens": 78321548.0, + "reward": 0.46432989090681076, + "reward_std": 0.20602131634950638, + "rewards/code_reward": 0.314329881221056, + "rewards/format_reward": 1.5, + "step": 3331 + }, + { + "clip_ratio": 0.004988397122360766, + "epoch": 0.12432255958210904, + "grad_norm": 0.08009625226259232, + "kl": 0.024322509765625, + "learning_rate": 3.380803667224025e-06, + "loss": -0.0098, + "step": 3332 + }, + { + "clip_ratio": 0.004613926168531179, + "epoch": 0.1243598712746607, + "grad_norm": 0.09435795247554779, + "kl": 0.024139404296875, + "learning_rate": 3.378232694717924e-06, + "loss": -0.0099, + "step": 3333 + }, + { + "clip_ratio": 0.00372976076323539, + "completion_length": 674.5357513427734, + "epoch": 0.12439718296721235, + "grad_norm": 0.08286946266889572, + "kl": 0.02447509765625, + "learning_rate": 3.375662612465973e-06, + "loss": 0.0509, + "num_tokens": 78391830.0, + "reward": 0.5067719221115112, + "reward_std": 0.1701037036255002, + "rewards/code_reward": 0.3594504618085921, + "rewards/format_reward": 1.4732142984867096, + "step": 3334 + }, + { + "clip_ratio": 0.0034701653057709336, + "epoch": 0.124434494659764, + "grad_norm": 0.07841577380895615, + "kl": 0.024688720703125, + "learning_rate": 3.3730934215465265e-06, + "loss": 0.0507, + "step": 3335 + }, + { + "clip_ratio": 0.002965990570373833, + "epoch": 0.12447180635231565, + "grad_norm": 0.07435742020606995, + "kl": 0.024749755859375, + "learning_rate": 3.370525123037572e-06, + "loss": 0.0504, + "step": 3336 + }, + { + "clip_ratio": 0.004052021307870746, + "completion_length": 696.2321624755859, + "epoch": 0.1245091180448673, + "grad_norm": 0.0714157372713089, + "kl": 0.01715087890625, + "learning_rate": 3.367957718016721e-06, + "loss": 0.0041, + "num_tokens": 78468577.0, + "reward": 0.8306447863578796, + "reward_std": 0.1826060521416366, + "rewards/code_reward": 0.6806447207927704, + "rewards/format_reward": 1.5, + "step": 3337 + }, + { + "clip_ratio": 0.003936028049793094, + "epoch": 0.12454642973741896, + "grad_norm": 0.06588342040777206, + "kl": 0.017822265625, + "learning_rate": 3.365391207561207e-06, + "loss": 0.004, + "step": 3338 + }, + { + "clip_ratio": 0.004207409336231649, + "epoch": 0.12458374142997061, + "grad_norm": 0.07706573605537415, + "kl": 0.017578125, + "learning_rate": 3.362825592747892e-06, + "loss": 0.004, + "step": 3339 + }, + { + "clip_ratio": 0.0024714371538721025, + "completion_length": 880.3214721679688, + "epoch": 0.12462105312252227, + "grad_norm": 0.07017571479082108, + "kl": 0.0279541015625, + "learning_rate": 3.360260874653257e-06, + "loss": 0.0009, + "num_tokens": 78550271.0, + "reward": 0.6522224918007851, + "reward_std": 0.022162443958222866, + "rewards/code_reward": 0.502222481649369, + "rewards/format_reward": 1.5, + "step": 3340 + }, + { + "clip_ratio": 0.002872168435715139, + "epoch": 0.12465836481507392, + "grad_norm": 0.3157333731651306, + "kl": 0.022918701171875, + "learning_rate": 3.357697054353413e-06, + "loss": 0.0007, + "step": 3341 + }, + { + "clip_ratio": 0.002697100688237697, + "epoch": 0.12469567650762557, + "grad_norm": 0.04818720370531082, + "kl": 0.0259246826171875, + "learning_rate": 3.3551341329240892e-06, + "loss": 0.0005, + "step": 3342 + }, + { + "clip_ratio": 0.004747291561216116, + "completion_length": 574.1964569091797, + "epoch": 0.12473298820017722, + "grad_norm": 0.0735696330666542, + "kl": 0.0213470458984375, + "learning_rate": 3.352572111440642e-06, + "loss": 0.0057, + "num_tokens": 78620492.0, + "reward": 0.4357143044471741, + "reward_std": 0.17580271884799004, + "rewards/code_reward": 0.285714291036129, + "rewards/format_reward": 1.5, + "step": 3343 + }, + { + "clip_ratio": 0.00447832205099985, + "epoch": 0.12477029989272888, + "grad_norm": 0.07344018667936325, + "kl": 0.0213623046875, + "learning_rate": 3.3500109909780486e-06, + "loss": 0.0055, + "step": 3344 + }, + { + "clip_ratio": 0.004542232898529619, + "epoch": 0.12480761158528053, + "grad_norm": 0.07040713727474213, + "kl": 0.021331787109375, + "learning_rate": 3.347450772610904e-06, + "loss": 0.0052, + "step": 3345 + }, + { + "clip_ratio": 0.004253425693605095, + "completion_length": 668.1964569091797, + "epoch": 0.12484492327783218, + "grad_norm": 0.08209454268217087, + "kl": 0.02508544921875, + "learning_rate": 3.3448914574134306e-06, + "loss": 0.0297, + "num_tokens": 78688711.0, + "reward": 0.40568340569734573, + "reward_std": 0.23745206370949745, + "rewards/code_reward": 0.25568340765312314, + "rewards/format_reward": 1.5, + "step": 3346 + }, + { + "clip_ratio": 0.003972113016061485, + "epoch": 0.12488223497038384, + "grad_norm": 0.07388640195131302, + "kl": 0.02490234375, + "learning_rate": 3.3423330464594717e-06, + "loss": 0.0292, + "step": 3347 + }, + { + "clip_ratio": 0.003948348166886717, + "epoch": 0.12491954666293549, + "grad_norm": 0.08473552763462067, + "kl": 0.025054931640625, + "learning_rate": 3.3397755408224842e-06, + "loss": 0.0291, + "step": 3348 + }, + { + "clip_ratio": 0.00569697271566838, + "completion_length": 689.4107513427734, + "epoch": 0.12495685835548716, + "grad_norm": 0.08285471051931381, + "kl": 0.05145263671875, + "learning_rate": 3.3372189415755517e-06, + "loss": 0.0064, + "num_tokens": 78759854.0, + "reward": 0.3995814509689808, + "reward_std": 0.2902437150478363, + "rewards/code_reward": 0.24958143010735512, + "rewards/format_reward": 1.5, + "step": 3349 + }, + { + "clip_ratio": 0.00524793443037197, + "epoch": 0.12499417004803881, + "grad_norm": 0.1798548400402069, + "kl": 0.045379638671875, + "learning_rate": 3.334663249791378e-06, + "loss": 0.0063, + "step": 3350 + }, + { + "clip_ratio": 0.004750925349071622, + "epoch": 0.12503148174059045, + "grad_norm": 0.07595983147621155, + "kl": 0.04522705078125, + "learning_rate": 3.332108466542281e-06, + "loss": 0.0059, + "step": 3351 + }, + { + "clip_ratio": 0.00207934231730178, + "completion_length": 611.5000457763672, + "epoch": 0.12506879343314212, + "grad_norm": 0.0739937275648117, + "kl": 0.0187835693359375, + "learning_rate": 3.3295545929002036e-06, + "loss": 0.0069, + "num_tokens": 78831478.0, + "reward": 0.8821428567171097, + "reward_std": 0.15759944915771484, + "rewards/code_reward": 0.732142860069871, + "rewards/format_reward": 1.5, + "step": 3352 + }, + { + "clip_ratio": 0.0018243178492411971, + "epoch": 0.12510610512569376, + "grad_norm": 0.05303655564785004, + "kl": 0.0183868408203125, + "learning_rate": 3.3270016299367002e-06, + "loss": 0.0067, + "step": 3353 + }, + { + "clip_ratio": 0.001929588324856013, + "epoch": 0.12514341681824542, + "grad_norm": 0.05824446305632591, + "kl": 0.0189056396484375, + "learning_rate": 3.324449578722948e-06, + "loss": 0.0068, + "step": 3354 + }, + { + "clip_ratio": 0.003963553055655211, + "completion_length": 594.0714569091797, + "epoch": 0.12518072851079706, + "grad_norm": 0.07356854528188705, + "kl": 0.025848388671875, + "learning_rate": 3.321898440329743e-06, + "loss": -0.0033, + "num_tokens": 78898024.0, + "reward": 0.7277310937643051, + "reward_std": 0.309846393764019, + "rewards/code_reward": 0.5777310989797115, + "rewards/format_reward": 1.5, + "step": 3355 + }, + { + "clip_ratio": 0.003732511540874839, + "epoch": 0.12521804020334873, + "grad_norm": 0.07482507824897766, + "kl": 0.0264892578125, + "learning_rate": 3.3193482158274904e-06, + "loss": -0.0037, + "step": 3356 + }, + { + "clip_ratio": 0.003265902050770819, + "epoch": 0.12525535189590037, + "grad_norm": 0.0734267607331276, + "kl": 0.0264892578125, + "learning_rate": 3.3167989062862215e-06, + "loss": -0.0037, + "step": 3357 + }, + { + "clip_ratio": 0.0030938066774979234, + "completion_length": 648.3571624755859, + "epoch": 0.12529266358845204, + "grad_norm": 0.05814874917268753, + "kl": 0.03216552734375, + "learning_rate": 3.3142505127755754e-06, + "loss": -0.0189, + "num_tokens": 78960876.0, + "reward": 0.3129516802728176, + "reward_std": 0.10292697884142399, + "rewards/code_reward": 0.1649159644730389, + "rewards/format_reward": 1.480357140302658, + "step": 3358 + }, + { + "clip_ratio": 0.0030807810835540295, + "epoch": 0.12532997528100367, + "grad_norm": 0.055284518748521805, + "kl": 0.028717041015625, + "learning_rate": 3.311703036364813e-06, + "loss": -0.0189, + "step": 3359 + }, + { + "clip_ratio": 0.0026326186198275536, + "epoch": 0.12536728697355534, + "grad_norm": 0.05503236502408981, + "kl": 0.027252197265625, + "learning_rate": 3.3091564781228074e-06, + "loss": -0.0192, + "step": 3360 + }, + { + "clip_ratio": 0.001996835577301681, + "completion_length": 604.8035888671875, + "epoch": 0.12540459866610698, + "grad_norm": 0.05136578157544136, + "kl": 0.025146484375, + "learning_rate": 3.3066108391180474e-06, + "loss": 0.0102, + "num_tokens": 79017481.0, + "reward": 0.585786908864975, + "reward_std": 0.13161606900393963, + "rewards/code_reward": 0.4357869168743491, + "rewards/format_reward": 1.5, + "step": 3361 + }, + { + "clip_ratio": 0.00209666951559484, + "epoch": 0.12544191035865865, + "grad_norm": 0.06089303269982338, + "kl": 0.0257568359375, + "learning_rate": 3.3040661204186373e-06, + "loss": 0.0103, + "step": 3362 + }, + { + "clip_ratio": 0.001954790437594056, + "epoch": 0.1254792220512103, + "grad_norm": 0.053125955164432526, + "kl": 0.025146484375, + "learning_rate": 3.301522323092291e-06, + "loss": 0.0102, + "step": 3363 + }, + { + "clip_ratio": 0.003910862084012479, + "completion_length": 643.2857437133789, + "epoch": 0.12551653374376195, + "grad_norm": 0.08371138572692871, + "kl": 0.02105712890625, + "learning_rate": 3.29897944820634e-06, + "loss": -0.0121, + "num_tokens": 79089289.0, + "reward": 0.7533773742616177, + "reward_std": 0.25934954173862934, + "rewards/code_reward": 0.6033773431554437, + "rewards/format_reward": 1.5, + "step": 3364 + }, + { + "clip_ratio": 0.003784747823374346, + "epoch": 0.1255538454363136, + "grad_norm": 0.08304328471422195, + "kl": 0.021240234375, + "learning_rate": 3.2964374968277286e-06, + "loss": -0.0123, + "step": 3365 + }, + { + "clip_ratio": 0.003536797739798203, + "epoch": 0.12559115712886526, + "grad_norm": 0.07731577754020691, + "kl": 0.02117919921875, + "learning_rate": 3.2938964700230093e-06, + "loss": -0.0123, + "step": 3366 + }, + { + "clip_ratio": 0.003593349305447191, + "completion_length": 493.28572845458984, + "epoch": 0.1256284688214169, + "grad_norm": 0.08944926410913467, + "kl": 0.021270751953125, + "learning_rate": 3.2913563688583523e-06, + "loss": 0.0188, + "num_tokens": 79144815.0, + "reward": 0.4000757709145546, + "reward_std": 0.23246114701032639, + "rewards/code_reward": 0.25007578171789646, + "rewards/format_reward": 1.5, + "step": 3367 + }, + { + "clip_ratio": 0.002957322809379548, + "epoch": 0.12566578051396857, + "grad_norm": 0.08468563854694366, + "kl": 0.021759033203125, + "learning_rate": 3.288817194399535e-06, + "loss": 0.0185, + "step": 3368 + }, + { + "clip_ratio": 0.0031053893617354333, + "epoch": 0.1257030922065202, + "grad_norm": 0.0782952830195427, + "kl": 0.02105712890625, + "learning_rate": 3.28627894771195e-06, + "loss": 0.0184, + "step": 3369 + }, + { + "clip_ratio": 0.001166472677141428, + "completion_length": 585.6607437133789, + "epoch": 0.12574040389907187, + "grad_norm": 0.043655216693878174, + "kl": 0.0185394287109375, + "learning_rate": 3.2837416298605983e-06, + "loss": 0.0024, + "num_tokens": 79205346.0, + "reward": 0.8107142820954323, + "reward_std": 0.20798593759536743, + "rewards/code_reward": 0.660714291036129, + "rewards/format_reward": 1.5, + "step": 3370 + }, + { + "clip_ratio": 0.0012771678157150745, + "epoch": 0.1257777155916235, + "grad_norm": 0.044834576547145844, + "kl": 0.0193328857421875, + "learning_rate": 3.2812052419100893e-06, + "loss": 0.0024, + "step": 3371 + }, + { + "clip_ratio": 0.0012313828920014203, + "epoch": 0.12581502728417518, + "grad_norm": 0.04528504237532616, + "kl": 0.0186614990234375, + "learning_rate": 3.2786697849246467e-06, + "loss": 0.0023, + "step": 3372 + }, + { + "clip_ratio": 0.0005924319848418236, + "completion_length": 570.2678833007812, + "epoch": 0.12585233897672682, + "grad_norm": 0.025960827246308327, + "kl": 0.0161590576171875, + "learning_rate": 3.2761352599680985e-06, + "loss": -0.0004, + "num_tokens": 79270153.0, + "reward": 1.1355769038200378, + "reward_std": 0.053966209292411804, + "rewards/code_reward": 0.9855769127607346, + "rewards/format_reward": 1.5, + "step": 3373 + }, + { + "clip_ratio": 0.0007174111669883132, + "epoch": 0.12588965066927849, + "grad_norm": 0.025578703731298447, + "kl": 0.016326904296875, + "learning_rate": 3.2736016681038856e-06, + "loss": -0.0004, + "step": 3374 + }, + { + "clip_ratio": 0.00046299552195705473, + "epoch": 0.12592696236183012, + "grad_norm": 0.026633186265826225, + "kl": 0.016021728515625, + "learning_rate": 3.2710690103950582e-06, + "loss": -0.0004, + "step": 3375 + }, + { + "clip_ratio": 0.004422121855895966, + "completion_length": 599.6786041259766, + "epoch": 0.1259642740543818, + "grad_norm": 0.07829828560352325, + "kl": 0.0401611328125, + "learning_rate": 3.2685372879042678e-06, + "loss": -0.0022, + "num_tokens": 79332253.0, + "reward": 0.5584188476204872, + "reward_std": 0.3518487177789211, + "rewards/code_reward": 0.40841884911060333, + "rewards/format_reward": 1.5, + "step": 3376 + }, + { + "clip_ratio": 0.0037490068934857845, + "epoch": 0.12600158574693346, + "grad_norm": 0.07992813736200333, + "kl": 0.034027099609375, + "learning_rate": 3.2660065016937813e-06, + "loss": -0.0023, + "step": 3377 + }, + { + "clip_ratio": 0.003892851818818599, + "epoch": 0.1260388974394851, + "grad_norm": 0.07250382751226425, + "kl": 0.040191650390625, + "learning_rate": 3.263476652825469e-06, + "loss": -0.0025, + "step": 3378 + }, + { + "clip_ratio": 0.002680430479813367, + "completion_length": 524.7321548461914, + "epoch": 0.12607620913203677, + "grad_norm": 0.07398314774036407, + "kl": 0.0173187255859375, + "learning_rate": 3.260947742360808e-06, + "loss": 0.005, + "num_tokens": 79394398.0, + "reward": 0.6668841578066349, + "reward_std": 0.24011776223778725, + "rewards/code_reward": 0.5168841565027833, + "rewards/format_reward": 1.5, + "step": 3379 + }, + { + "clip_ratio": 0.002395987161435187, + "epoch": 0.1261135208245884, + "grad_norm": 0.07295361906290054, + "kl": 0.0185546875, + "learning_rate": 3.2584197713608846e-06, + "loss": 0.0046, + "step": 3380 + }, + { + "clip_ratio": 0.0027321703964844346, + "epoch": 0.12615083251714007, + "grad_norm": 0.0717867910861969, + "kl": 0.0174560546875, + "learning_rate": 3.2558927408863848e-06, + "loss": 0.0045, + "step": 3381 + }, + { + "clip_ratio": 0.00259765621740371, + "completion_length": 758.5893249511719, + "epoch": 0.1261881442096917, + "grad_norm": 0.058966197073459625, + "kl": 0.021453857421875, + "learning_rate": 3.253366651997605e-06, + "loss": -0.0122, + "num_tokens": 79464305.0, + "reward": 0.9245669096708298, + "reward_std": 0.2684727720916271, + "rewards/code_reward": 0.7818882912397385, + "rewards/format_reward": 1.4267857074737549, + "step": 3382 + }, + { + "clip_ratio": 0.002161594806239009, + "epoch": 0.12622545590224338, + "grad_norm": 0.059662114828825, + "kl": 0.020721435546875, + "learning_rate": 3.2508415057544476e-06, + "loss": -0.0124, + "step": 3383 + }, + { + "clip_ratio": 0.0022677548113279045, + "epoch": 0.12626276759479502, + "grad_norm": 0.055666714906692505, + "kl": 0.020751953125, + "learning_rate": 3.2483173032164128e-06, + "loss": -0.0125, + "step": 3384 + }, + { + "clip_ratio": 0.003563282545655966, + "completion_length": 593.8571624755859, + "epoch": 0.12630007928734668, + "grad_norm": 0.07478782534599304, + "kl": 0.028472900390625, + "learning_rate": 3.2457940454426118e-06, + "loss": 0.0088, + "num_tokens": 79524525.0, + "reward": 0.6129827424883842, + "reward_std": 0.1509528837632388, + "rewards/code_reward": 0.4629827211610973, + "rewards/format_reward": 1.5, + "step": 3385 + }, + { + "clip_ratio": 0.0034186950069852173, + "epoch": 0.12633739097989832, + "grad_norm": 0.07707253843545914, + "kl": 0.028289794921875, + "learning_rate": 3.243271733491756e-06, + "loss": 0.0086, + "step": 3386 + }, + { + "clip_ratio": 0.0033909076009877026, + "epoch": 0.12637470267245, + "grad_norm": 0.07056356966495514, + "kl": 0.0286865234375, + "learning_rate": 3.2407503684221607e-06, + "loss": 0.0083, + "step": 3387 + }, + { + "clip_ratio": 0.003251238667871803, + "completion_length": 606.5714569091797, + "epoch": 0.12641201436500163, + "grad_norm": 0.06443453580141068, + "kl": 0.02374267578125, + "learning_rate": 3.2382299512917458e-06, + "loss": 0.0112, + "num_tokens": 79581453.0, + "reward": 0.6401575170457363, + "reward_std": 0.1055878633633256, + "rewards/code_reward": 0.49015749990940094, + "rewards/format_reward": 1.5, + "step": 3388 + }, + { + "clip_ratio": 0.0037373522063717246, + "epoch": 0.1264493260575533, + "grad_norm": 0.0604027658700943, + "kl": 0.023590087890625, + "learning_rate": 3.2357104831580278e-06, + "loss": 0.0113, + "step": 3389 + }, + { + "clip_ratio": 0.003453458717558533, + "epoch": 0.12648663775010494, + "grad_norm": 0.057206135243177414, + "kl": 0.02410888671875, + "learning_rate": 3.2331919650781322e-06, + "loss": 0.011, + "step": 3390 + }, + { + "clip_ratio": 0.004231575992889702, + "completion_length": 688.9107437133789, + "epoch": 0.1265239494426566, + "grad_norm": 0.09343158453702927, + "kl": 0.0230865478515625, + "learning_rate": 3.2306743981087797e-06, + "loss": 0.0071, + "num_tokens": 79658014.0, + "reward": 0.6702116020023823, + "reward_std": 0.21113774180412292, + "rewards/code_reward": 0.5202116263099015, + "rewards/format_reward": 1.5, + "step": 3391 + }, + { + "clip_ratio": 0.004657829646021128, + "epoch": 0.12656126113520824, + "grad_norm": 0.09327443689107895, + "kl": 0.0234527587890625, + "learning_rate": 3.228157783306295e-06, + "loss": 0.0069, + "step": 3392 + }, + { + "clip_ratio": 0.004359314450994134, + "epoch": 0.1265985728277599, + "grad_norm": 0.0863880142569542, + "kl": 0.023468017578125, + "learning_rate": 3.2256421217266055e-06, + "loss": 0.0067, + "step": 3393 + }, + { + "clip_ratio": 0.0050183620769530535, + "completion_length": 653.1964569091797, + "epoch": 0.12663588452031155, + "grad_norm": 5.675691604614258, + "kl": 1.0980224609375, + "learning_rate": 3.223127414425233e-06, + "loss": 0.0192, + "num_tokens": 79739015.0, + "reward": 0.6081976145505905, + "reward_std": 0.37604668736457825, + "rewards/code_reward": 0.4608761668205261, + "rewards/format_reward": 1.4732142984867096, + "step": 3394 + }, + { + "clip_ratio": 0.005203619715757668, + "epoch": 0.12667319621286321, + "grad_norm": 0.09902949631214142, + "kl": 0.0308990478515625, + "learning_rate": 3.2206136624573024e-06, + "loss": 0.0086, + "step": 3395 + }, + { + "clip_ratio": 0.005530811147764325, + "epoch": 0.12671050790541485, + "grad_norm": 0.09974535554647446, + "kl": 0.029144287109375, + "learning_rate": 3.2181008668775394e-06, + "loss": 0.0085, + "step": 3396 + }, + { + "clip_ratio": 0.0026186747127212584, + "completion_length": 509.48217010498047, + "epoch": 0.12674781959796652, + "grad_norm": 0.08266033977270126, + "kl": 0.0178680419921875, + "learning_rate": 3.2155890287402634e-06, + "loss": 0.0052, + "num_tokens": 79790590.0, + "reward": 1.1107791364192963, + "reward_std": 0.14462199993431568, + "rewards/code_reward": 0.9607791006565094, + "rewards/format_reward": 1.5, + "step": 3397 + }, + { + "clip_ratio": 0.0022789763752371073, + "epoch": 0.12678513129051816, + "grad_norm": 0.0840848982334137, + "kl": 0.017578125, + "learning_rate": 3.2130781490993993e-06, + "loss": 0.0049, + "step": 3398 + }, + { + "clip_ratio": 0.0020067921141162515, + "epoch": 0.12682244298306983, + "grad_norm": 0.07410623133182526, + "kl": 0.0180206298828125, + "learning_rate": 3.2105682290084615e-06, + "loss": 0.0045, + "step": 3399 + }, + { + "clip_ratio": 0.004499006550759077, + "completion_length": 592.3393096923828, + "epoch": 0.12685975467562147, + "grad_norm": 0.10620264708995819, + "kl": 0.0205078125, + "learning_rate": 3.208059269520568e-06, + "loss": 0.0127, + "num_tokens": 79853119.0, + "reward": 0.8072597831487656, + "reward_std": 0.20186891593039036, + "rewards/code_reward": 0.6572597546037287, + "rewards/format_reward": 1.5, + "step": 3400 + }, + { + "clip_ratio": 0.004531552956905216, + "epoch": 0.12689706636817313, + "grad_norm": 0.10840870440006256, + "kl": 0.020965576171875, + "learning_rate": 3.2055512716884296e-06, + "loss": 0.0126, + "step": 3401 + }, + { + "clip_ratio": 0.004408107546623796, + "epoch": 0.12693437806072477, + "grad_norm": 0.0988849326968193, + "kl": 0.021026611328125, + "learning_rate": 3.2030442365643578e-06, + "loss": 0.0119, + "step": 3402 + }, + { + "clip_ratio": 0.005178260151296854, + "completion_length": 730.7678909301758, + "epoch": 0.12697168975327644, + "grad_norm": 0.08581608533859253, + "kl": 0.021331787109375, + "learning_rate": 3.2005381652002563e-06, + "loss": -0.0037, + "num_tokens": 79928104.0, + "reward": 0.45612310245633125, + "reward_std": 0.10623059095814824, + "rewards/code_reward": 0.3061230785679072, + "rewards/format_reward": 1.5, + "step": 3403 + }, + { + "clip_ratio": 0.00432837480911985, + "epoch": 0.12700900144582808, + "grad_norm": 0.07972996681928635, + "kl": 0.021484375, + "learning_rate": 3.198033058647627e-06, + "loss": -0.0041, + "step": 3404 + }, + { + "clip_ratio": 0.004884684807620943, + "epoch": 0.12704631313837975, + "grad_norm": 0.08005839586257935, + "kl": 0.021514892578125, + "learning_rate": 3.1955289179575677e-06, + "loss": -0.0042, + "step": 3405 + }, + { + "clip_ratio": 0.0017654167022556067, + "completion_length": 502.1428756713867, + "epoch": 0.12708362483093139, + "grad_norm": 0.04346148297190666, + "kl": 0.020599365234375, + "learning_rate": 3.1930257441807657e-06, + "loss": 0.0018, + "num_tokens": 79976944.0, + "reward": 0.7392857186496258, + "reward_std": 0.09078413993120193, + "rewards/code_reward": 0.5892857164144516, + "rewards/format_reward": 1.5, + "step": 3406 + }, + { + "clip_ratio": 0.0020683412440121174, + "epoch": 0.12712093652348305, + "grad_norm": 0.04117473587393761, + "kl": 0.0207977294921875, + "learning_rate": 3.1905235383675084e-06, + "loss": 0.0017, + "step": 3407 + }, + { + "clip_ratio": 0.001938605448231101, + "epoch": 0.1271582482160347, + "grad_norm": 0.04310844466090202, + "kl": 0.020843505859375, + "learning_rate": 3.1880223015676753e-06, + "loss": 0.0017, + "step": 3408 + }, + { + "clip_ratio": 0.00303281401284039, + "completion_length": 690.053581237793, + "epoch": 0.12719555990858636, + "grad_norm": 0.07243321090936661, + "kl": 0.034576416015625, + "learning_rate": 3.185522034830737e-06, + "loss": 0.0043, + "num_tokens": 80057041.0, + "reward": 0.7044871784746647, + "reward_std": 0.18511445657350123, + "rewards/code_reward": 0.5544871763559058, + "rewards/format_reward": 1.5, + "step": 3409 + }, + { + "clip_ratio": 0.003045249730348587, + "epoch": 0.127232871601138, + "grad_norm": 0.06750199943780899, + "kl": 0.03387451171875, + "learning_rate": 3.18302273920576e-06, + "loss": 0.0045, + "step": 3410 + }, + { + "clip_ratio": 0.0031179130310192704, + "epoch": 0.12727018329368966, + "grad_norm": 0.0834713950753212, + "kl": 0.034088134765625, + "learning_rate": 3.1805244157414046e-06, + "loss": 0.0045, + "step": 3411 + }, + { + "clip_ratio": 0.0030988127109594643, + "completion_length": 660.0178680419922, + "epoch": 0.1273074949862413, + "grad_norm": 0.0656685084104538, + "kl": 0.0240325927734375, + "learning_rate": 3.1780270654859146e-06, + "loss": 0.0064, + "num_tokens": 80137080.0, + "reward": 0.7284356355667114, + "reward_std": 0.11184091679751873, + "rewards/code_reward": 0.578435592353344, + "rewards/format_reward": 1.5, + "step": 3412 + }, + { + "clip_ratio": 0.0035235705436207354, + "epoch": 0.12734480667879297, + "grad_norm": 0.06715866923332214, + "kl": 0.022247314453125, + "learning_rate": 3.1755306894871394e-06, + "loss": 0.0066, + "step": 3413 + }, + { + "clip_ratio": 0.002851430093869567, + "epoch": 0.1273821183713446, + "grad_norm": 0.0636068806052208, + "kl": 0.0238189697265625, + "learning_rate": 3.1730352887925082e-06, + "loss": 0.0063, + "step": 3414 + }, + { + "clip_ratio": 0.0044363614288158715, + "completion_length": 772.5178833007812, + "epoch": 0.12741943006389628, + "grad_norm": 0.07218831032514572, + "kl": 0.0189361572265625, + "learning_rate": 3.170540864449044e-06, + "loss": 0.0052, + "num_tokens": 80216125.0, + "reward": 0.5179685316979885, + "reward_std": 0.3438460035249591, + "rewards/code_reward": 0.3679685096722096, + "rewards/format_reward": 1.5, + "step": 3415 + }, + { + "clip_ratio": 0.0036959698772989213, + "epoch": 0.12745674175644792, + "grad_norm": 0.0745856910943985, + "kl": 0.0194854736328125, + "learning_rate": 3.1680474175033637e-06, + "loss": 0.0047, + "step": 3416 + }, + { + "clip_ratio": 0.0041877091862261295, + "epoch": 0.12749405344899958, + "grad_norm": 0.06987186521291733, + "kl": 0.0189666748046875, + "learning_rate": 3.165554949001668e-06, + "loss": 0.005, + "step": 3417 + }, + { + "clip_ratio": 0.0046643882524222136, + "completion_length": 670.3036041259766, + "epoch": 0.12753136514155122, + "grad_norm": 0.09368038922548294, + "kl": 0.02423095703125, + "learning_rate": 3.163063459989754e-06, + "loss": -0.0055, + "num_tokens": 80280472.0, + "reward": 0.5043689422309399, + "reward_std": 0.28028638660907745, + "rewards/code_reward": 0.3543689288198948, + "rewards/format_reward": 1.5, + "step": 3418 + }, + { + "clip_ratio": 0.0043929898529313505, + "epoch": 0.1275686768341029, + "grad_norm": 0.09594806283712387, + "kl": 0.024078369140625, + "learning_rate": 3.160572951513e-06, + "loss": -0.0058, + "step": 3419 + }, + { + "clip_ratio": 0.004110051901079714, + "epoch": 0.12760598852665453, + "grad_norm": 0.08066297322511673, + "kl": 0.02386474609375, + "learning_rate": 3.1580834246163807e-06, + "loss": -0.0061, + "step": 3420 + }, + { + "clip_ratio": 0.006834129220806062, + "completion_length": 510.64288330078125, + "epoch": 0.1276433002192062, + "grad_norm": 0.13516126573085785, + "kl": 0.027130126953125, + "learning_rate": 3.1555948803444527e-06, + "loss": 0.0104, + "num_tokens": 80351866.0, + "reward": 0.6040368489921093, + "reward_std": 0.3378795739263296, + "rewards/code_reward": 0.45403683721087873, + "rewards/format_reward": 1.5, + "step": 3421 + }, + { + "clip_ratio": 0.00580055802129209, + "epoch": 0.12768061191175784, + "grad_norm": 0.18186207115650177, + "kl": 0.031707763671875, + "learning_rate": 3.1531073197413642e-06, + "loss": 0.0098, + "step": 3422 + }, + { + "clip_ratio": 0.005455761915072799, + "epoch": 0.1277179236043095, + "grad_norm": 0.1041942685842514, + "kl": 0.030364990234375, + "learning_rate": 3.1506207438508517e-06, + "loss": 0.0092, + "step": 3423 + }, + { + "clip_ratio": 0.003769582835957408, + "completion_length": 727.982177734375, + "epoch": 0.12775523529686114, + "grad_norm": 0.09134254604578018, + "kl": 0.01995849609375, + "learning_rate": 3.148135153716232e-06, + "loss": -0.005, + "num_tokens": 80428375.0, + "reward": 0.7130699753761292, + "reward_std": 0.21833198703825474, + "rewards/code_reward": 0.5630699517205358, + "rewards/format_reward": 1.5, + "step": 3424 + }, + { + "clip_ratio": 0.0037937580491416156, + "epoch": 0.1277925469894128, + "grad_norm": 0.09768296778202057, + "kl": 0.02044677734375, + "learning_rate": 3.1456505503804146e-06, + "loss": -0.0051, + "step": 3425 + }, + { + "clip_ratio": 0.003653095103800297, + "epoch": 0.12782985868196445, + "grad_norm": 0.08660121262073517, + "kl": 0.022430419921875, + "learning_rate": 3.143166934885895e-06, + "loss": -0.0055, + "step": 3426 + }, + { + "clip_ratio": 0.0030335576157085598, + "completion_length": 516.3928756713867, + "epoch": 0.12786717037451611, + "grad_norm": 0.07301302254199982, + "kl": 0.0179290771484375, + "learning_rate": 3.1406843082747475e-06, + "loss": -0.0129, + "num_tokens": 80481163.0, + "reward": 0.6446366645395756, + "reward_std": 0.07313124532811344, + "rewards/code_reward": 0.4946366375661455, + "rewards/format_reward": 1.5, + "step": 3427 + }, + { + "clip_ratio": 0.003323323093354702, + "epoch": 0.12790448206706775, + "grad_norm": 0.07314284145832062, + "kl": 0.0178375244140625, + "learning_rate": 3.1382026715886417e-06, + "loss": -0.0126, + "step": 3428 + }, + { + "clip_ratio": 0.0026594570954330266, + "epoch": 0.12794179375961942, + "grad_norm": 0.06892766803503036, + "kl": 0.017791748046875, + "learning_rate": 3.1357220258688183e-06, + "loss": -0.013, + "step": 3429 + }, + { + "clip_ratio": 0.0032629005145281553, + "completion_length": 759.2143325805664, + "epoch": 0.12797910545217106, + "grad_norm": 0.037930168211460114, + "kl": 0.0252685546875, + "learning_rate": 3.1332423721561165e-06, + "loss": -0.0039, + "num_tokens": 80555599.0, + "reward": 0.685714278370142, + "reward_std": 0.06822971999645233, + "rewards/code_reward": 0.5357142873108387, + "rewards/format_reward": 1.5, + "step": 3430 + }, + { + "clip_ratio": 0.0029744856292381883, + "epoch": 0.12801641714472273, + "grad_norm": 0.03326747938990593, + "kl": 0.02435302734375, + "learning_rate": 3.1307637114909538e-06, + "loss": -0.0041, + "step": 3431 + }, + { + "clip_ratio": 0.002725106489378959, + "epoch": 0.1280537288372744, + "grad_norm": 0.03803994134068489, + "kl": 0.024627685546875, + "learning_rate": 3.1282860449133268e-06, + "loss": -0.0041, + "step": 3432 + }, + { + "clip_ratio": 0.004329490242525935, + "completion_length": 805.5714569091797, + "epoch": 0.12809104052982603, + "grad_norm": 0.06777684390544891, + "kl": 0.024566650390625, + "learning_rate": 3.1258093734628203e-06, + "loss": 0.0084, + "num_tokens": 80637909.0, + "reward": 0.32337184622883797, + "reward_std": 0.14342845231294632, + "rewards/code_reward": 0.1733718253672123, + "rewards/format_reward": 1.5, + "step": 3433 + }, + { + "clip_ratio": 0.0037029460072517395, + "epoch": 0.1281283522223777, + "grad_norm": 0.06586232036352158, + "kl": 0.02447509765625, + "learning_rate": 3.123333698178597e-06, + "loss": 0.0082, + "step": 3434 + }, + { + "clip_ratio": 0.0040067065274342895, + "epoch": 0.12816566391492934, + "grad_norm": 0.06397905945777893, + "kl": 0.024932861328125, + "learning_rate": 3.1208590200994066e-06, + "loss": 0.0082, + "step": 3435 + }, + { + "clip_ratio": 0.004194217792246491, + "completion_length": 547.6250305175781, + "epoch": 0.128202975607481, + "grad_norm": 0.0770845115184784, + "kl": 0.0225982666015625, + "learning_rate": 3.1183853402635787e-06, + "loss": 0.0058, + "num_tokens": 80694518.0, + "reward": 0.4372163191437721, + "reward_std": 0.17933660000562668, + "rewards/code_reward": 0.28721628338098526, + "rewards/format_reward": 1.5, + "step": 3436 + }, + { + "clip_ratio": 0.004420380922965705, + "epoch": 0.12824028730003265, + "grad_norm": 0.0777827799320221, + "kl": 0.02294921875, + "learning_rate": 3.1159126597090208e-06, + "loss": 0.006, + "step": 3437 + }, + { + "clip_ratio": 0.004338039027061313, + "epoch": 0.1282775989925843, + "grad_norm": 0.07834675908088684, + "kl": 0.02294921875, + "learning_rate": 3.1134409794732244e-06, + "loss": 0.0057, + "step": 3438 + }, + { + "clip_ratio": 0.002990516019053757, + "completion_length": 798.5893249511719, + "epoch": 0.12831491068513595, + "grad_norm": 0.07023254781961441, + "kl": 0.017913818359375, + "learning_rate": 3.1109703005932612e-06, + "loss": -0.0147, + "num_tokens": 80768975.0, + "reward": 0.7549601197242737, + "reward_std": 0.21829366218298674, + "rewards/code_reward": 0.6049601137638092, + "rewards/format_reward": 1.5, + "step": 3439 + }, + { + "clip_ratio": 0.003152798512019217, + "epoch": 0.12835222237768762, + "grad_norm": 0.06941839307546616, + "kl": 0.01824951171875, + "learning_rate": 3.1085006241057835e-06, + "loss": -0.0146, + "step": 3440 + }, + { + "clip_ratio": 0.003006549261044711, + "epoch": 0.12838953407023926, + "grad_norm": 0.06809034198522568, + "kl": 0.018402099609375, + "learning_rate": 3.1060319510470217e-06, + "loss": -0.0147, + "step": 3441 + }, + { + "clip_ratio": 0.003824072307907045, + "completion_length": 596.0000152587891, + "epoch": 0.12842684576279093, + "grad_norm": 0.08776353299617767, + "kl": 0.02642822265625, + "learning_rate": 3.103564282452781e-06, + "loss": 0.0199, + "num_tokens": 80836217.0, + "reward": 0.6289167404174805, + "reward_std": 0.21223894134163857, + "rewards/code_reward": 0.48159530758857727, + "rewards/format_reward": 1.4732142984867096, + "step": 3442 + }, + { + "clip_ratio": 0.004164690210018307, + "epoch": 0.12846415745534256, + "grad_norm": 0.10174693912267685, + "kl": 0.025390625, + "learning_rate": 3.101097619358454e-06, + "loss": 0.0199, + "step": 3443 + }, + { + "clip_ratio": 0.0036465045413933694, + "epoch": 0.12850146914789423, + "grad_norm": 0.08482629805803299, + "kl": 0.025543212890625, + "learning_rate": 3.0986319627990053e-06, + "loss": 0.0195, + "step": 3444 + }, + { + "clip_ratio": 0.005746832583099604, + "completion_length": 894.2143096923828, + "epoch": 0.12853878084044587, + "grad_norm": 0.7878042459487915, + "kl": 0.08587646484375, + "learning_rate": 3.0961673138089766e-06, + "loss": 0.0491, + "num_tokens": 80922959.0, + "reward": 0.18789010122418404, + "reward_std": 0.08396447356790304, + "rewards/code_reward": 0.040568653494119644, + "rewards/format_reward": 1.4732142984867096, + "step": 3445 + }, + { + "clip_ratio": 0.005911943037062883, + "epoch": 0.12857609253299754, + "grad_norm": 0.05930749699473381, + "kl": 0.034515380859375, + "learning_rate": 3.093703673422493e-06, + "loss": 0.0485, + "step": 3446 + }, + { + "clip_ratio": 0.005948698031716049, + "epoch": 0.12861340422554918, + "grad_norm": 1.609792947769165, + "kl": 0.031524658203125, + "learning_rate": 3.0912410426732452e-06, + "loss": 0.0494, + "step": 3447 + }, + { + "clip_ratio": 0.003815157455392182, + "completion_length": 766.1250228881836, + "epoch": 0.12865071591810084, + "grad_norm": 0.04514912888407707, + "kl": 0.017181396484375, + "learning_rate": 3.0887794225945143e-06, + "loss": 0.0041, + "num_tokens": 81001576.0, + "reward": 0.4714285805821419, + "reward_std": 0.23617246001958847, + "rewards/code_reward": 0.3214285671710968, + "rewards/format_reward": 1.5, + "step": 3448 + }, + { + "clip_ratio": 0.004380838596262038, + "epoch": 0.12868802761065248, + "grad_norm": 0.04566343128681183, + "kl": 0.016845703125, + "learning_rate": 3.0863188142191497e-06, + "loss": 0.0041, + "step": 3449 + }, + { + "clip_ratio": 0.00418086361605674, + "epoch": 0.12872533930320415, + "grad_norm": 0.04756531864404678, + "kl": 0.0164794921875, + "learning_rate": 3.0838592185795733e-06, + "loss": 0.004, + "step": 3450 + }, + { + "clip_ratio": 0.004181711585260928, + "completion_length": 788.0178985595703, + "epoch": 0.1287626509957558, + "grad_norm": 0.08282531052827835, + "kl": 0.0176239013671875, + "learning_rate": 3.081400636707789e-06, + "loss": 0.0005, + "num_tokens": 81072473.0, + "reward": 0.5877562016248703, + "reward_std": 0.2671590391546488, + "rewards/code_reward": 0.4377561993896961, + "rewards/format_reward": 1.5, + "step": 3451 + }, + { + "clip_ratio": 0.004145927086938173, + "epoch": 0.12879996268830746, + "grad_norm": 0.08787272125482559, + "kl": 0.017822265625, + "learning_rate": 3.0789430696353684e-06, + "loss": 0.0004, + "step": 3452 + }, + { + "clip_ratio": 0.004463764373213053, + "epoch": 0.1288372743808591, + "grad_norm": 0.07806310057640076, + "kl": 0.0178985595703125, + "learning_rate": 3.0764865183934643e-06, + "loss": 0.0004, + "step": 3453 + }, + { + "clip_ratio": 0.003218497382476926, + "completion_length": 655.053596496582, + "epoch": 0.12887458607341076, + "grad_norm": 0.08986973762512207, + "kl": 0.016448974609375, + "learning_rate": 3.0740309840128002e-06, + "loss": -0.0054, + "num_tokens": 81141286.0, + "reward": 0.7615892365574837, + "reward_std": 0.17423483729362488, + "rewards/code_reward": 0.6115892194211483, + "rewards/format_reward": 1.5, + "step": 3454 + }, + { + "clip_ratio": 0.0035070215817540884, + "epoch": 0.1289118977659624, + "grad_norm": 0.08113344758749008, + "kl": 0.0166015625, + "learning_rate": 3.071576467523669e-06, + "loss": -0.0055, + "step": 3455 + }, + { + "clip_ratio": 0.00324380851816386, + "epoch": 0.12894920945851407, + "grad_norm": 0.07063103467226028, + "kl": 0.016326904296875, + "learning_rate": 3.069122969955944e-06, + "loss": -0.0057, + "step": 3456 + }, + { + "clip_ratio": 0.004021251457743347, + "completion_length": 623.1785888671875, + "epoch": 0.1289865211510657, + "grad_norm": 0.0726507380604744, + "kl": 0.031402587890625, + "learning_rate": 3.0666704923390655e-06, + "loss": 0.0052, + "num_tokens": 81201622.0, + "reward": 0.5715557560324669, + "reward_std": 0.24709703773260117, + "rewards/code_reward": 0.42423432134091854, + "rewards/format_reward": 1.4732142984867096, + "step": 3457 + }, + { + "clip_ratio": 0.0039026702288538218, + "epoch": 0.12902383284361738, + "grad_norm": 0.06736726313829422, + "kl": 0.030609130859375, + "learning_rate": 3.064219035702048e-06, + "loss": 0.0053, + "step": 3458 + }, + { + "clip_ratio": 0.003860432654619217, + "epoch": 0.12906114453616901, + "grad_norm": 0.06502830237150192, + "kl": 0.03192138671875, + "learning_rate": 3.061768601073478e-06, + "loss": 0.0049, + "step": 3459 + }, + { + "clip_ratio": 0.0035817804164253175, + "completion_length": 819.3214721679688, + "epoch": 0.12909845622872068, + "grad_norm": 0.07350064814090729, + "kl": 0.017364501953125, + "learning_rate": 3.05931918948151e-06, + "loss": 0.0062, + "num_tokens": 81280894.0, + "reward": 0.5328441485762596, + "reward_std": 0.2776500806212425, + "rewards/code_reward": 0.38552269898355007, + "rewards/format_reward": 1.4732142984867096, + "step": 3460 + }, + { + "clip_ratio": 0.003832671296549961, + "epoch": 0.12913576792127232, + "grad_norm": 0.07548890262842178, + "kl": 0.0175018310546875, + "learning_rate": 3.056870801953875e-06, + "loss": 0.0065, + "step": 3461 + }, + { + "clip_ratio": 0.0036182936164550483, + "epoch": 0.129173079613824, + "grad_norm": 0.07020501792430878, + "kl": 0.017730712890625, + "learning_rate": 3.0544234395178674e-06, + "loss": 0.0062, + "step": 3462 + }, + { + "clip_ratio": 0.0020241813617758453, + "completion_length": 654.0893173217773, + "epoch": 0.12921039130637563, + "grad_norm": 0.060084421187639236, + "kl": 0.0175933837890625, + "learning_rate": 3.0519771032003566e-06, + "loss": -0.0023, + "num_tokens": 81347875.0, + "reward": 0.6528344638645649, + "reward_std": 0.010605606250464916, + "rewards/code_reward": 0.5028344672173262, + "rewards/format_reward": 1.5, + "step": 3463 + }, + { + "clip_ratio": 0.0016829388332553208, + "epoch": 0.1292477029989273, + "grad_norm": 0.0319971963763237, + "kl": 0.0178070068359375, + "learning_rate": 3.0495317940277803e-06, + "loss": -0.0024, + "step": 3464 + }, + { + "clip_ratio": 0.001991675700992346, + "epoch": 0.12928501469147893, + "grad_norm": 0.03301620110869408, + "kl": 0.0176544189453125, + "learning_rate": 3.0470875130261436e-06, + "loss": -0.0025, + "step": 3465 + }, + { + "clip_ratio": 0.002502810297301039, + "completion_length": 749.5357513427734, + "epoch": 0.1293223263840306, + "grad_norm": 0.0704604983329773, + "kl": 0.0200653076171875, + "learning_rate": 3.0446442612210235e-06, + "loss": 0.017, + "num_tokens": 81423099.0, + "reward": 1.0065319538116455, + "reward_std": 0.33395916223526, + "rewards/code_reward": 0.8592105209827423, + "rewards/format_reward": 1.4732142984867096, + "step": 3466 + }, + { + "clip_ratio": 0.002808246004860848, + "epoch": 0.12935963807658224, + "grad_norm": 0.07051033526659012, + "kl": 0.019805908203125, + "learning_rate": 3.0422020396375606e-06, + "loss": 0.0172, + "step": 3467 + }, + { + "clip_ratio": 0.0027578416629694402, + "epoch": 0.1293969497691339, + "grad_norm": 0.07099718600511551, + "kl": 0.020111083984375, + "learning_rate": 3.039760849300467e-06, + "loss": 0.0171, + "step": 3468 + }, + { + "clip_ratio": 0.0031489396933466196, + "completion_length": 720.0714569091797, + "epoch": 0.12943426146168555, + "grad_norm": 0.05809873342514038, + "kl": 0.02532958984375, + "learning_rate": 3.037320691234022e-06, + "loss": -0.0014, + "num_tokens": 81495623.0, + "reward": 0.5175595618784428, + "reward_std": 0.13429265283048153, + "rewards/code_reward": 0.3675595335662365, + "rewards/format_reward": 1.5, + "step": 3469 + }, + { + "clip_ratio": 0.0032803233480080962, + "epoch": 0.1294715731542372, + "grad_norm": 0.05645911023020744, + "kl": 0.025665283203125, + "learning_rate": 3.0348815664620677e-06, + "loss": -0.0017, + "step": 3470 + }, + { + "clip_ratio": 0.0028810526710003614, + "epoch": 0.12950888484678885, + "grad_norm": 0.055142246186733246, + "kl": 0.02532958984375, + "learning_rate": 3.032443476008017e-06, + "loss": -0.0019, + "step": 3471 + }, + { + "clip_ratio": 0.003080841910559684, + "completion_length": 612.4821701049805, + "epoch": 0.12954619653934052, + "grad_norm": 0.0770808532834053, + "kl": 0.0245361328125, + "learning_rate": 3.0300064208948494e-06, + "loss": -0.0102, + "num_tokens": 81561898.0, + "reward": 0.8311392888426781, + "reward_std": 0.15287571912631392, + "rewards/code_reward": 0.6811392873642035, + "rewards/format_reward": 1.5, + "step": 3472 + }, + { + "clip_ratio": 0.003079304355196655, + "epoch": 0.12958350823189216, + "grad_norm": 0.07687104493379593, + "kl": 0.024810791015625, + "learning_rate": 3.027570402145104e-06, + "loss": -0.0104, + "step": 3473 + }, + { + "clip_ratio": 0.003139568434562534, + "epoch": 0.12962081992444383, + "grad_norm": 0.06887542456388474, + "kl": 0.024688720703125, + "learning_rate": 3.0251354207808913e-06, + "loss": -0.0104, + "step": 3474 + }, + { + "clip_ratio": 0.0033179630408994853, + "completion_length": 555.9464569091797, + "epoch": 0.12965813161699546, + "grad_norm": 0.07329566776752472, + "kl": 0.018707275390625, + "learning_rate": 3.0227014778238844e-06, + "loss": -0.0062, + "num_tokens": 81617979.0, + "reward": 0.7804730832576752, + "reward_std": 0.2538032829761505, + "rewards/code_reward": 0.6304730549454689, + "rewards/format_reward": 1.5, + "step": 3475 + }, + { + "clip_ratio": 0.0028987342957407236, + "epoch": 0.12969544330954713, + "grad_norm": 0.07234786450862885, + "kl": 0.019195556640625, + "learning_rate": 3.0202685742953196e-06, + "loss": -0.0062, + "step": 3476 + }, + { + "clip_ratio": 0.0036029372131451964, + "epoch": 0.12973275500209877, + "grad_norm": 0.06420297920703888, + "kl": 0.018218994140625, + "learning_rate": 3.0178367112160006e-06, + "loss": -0.0064, + "step": 3477 + }, + { + "clip_ratio": 0.003755459561944008, + "completion_length": 687.9107513427734, + "epoch": 0.12977006669465044, + "grad_norm": 0.07424639910459518, + "kl": 0.037841796875, + "learning_rate": 3.0154058896062876e-06, + "loss": 0.0263, + "num_tokens": 81694064.0, + "reward": 0.8445071578025818, + "reward_std": 0.26811383664608, + "rewards/code_reward": 0.6945071443915367, + "rewards/format_reward": 1.5, + "step": 3478 + }, + { + "clip_ratio": 0.003787883324548602, + "epoch": 0.12980737838720208, + "grad_norm": 0.07760385423898697, + "kl": 0.037109375, + "learning_rate": 3.012976110486113e-06, + "loss": 0.0264, + "step": 3479 + }, + { + "clip_ratio": 0.0035188066540285945, + "epoch": 0.12984469007975374, + "grad_norm": 0.07327106595039368, + "kl": 0.038909912109375, + "learning_rate": 3.010547374874963e-06, + "loss": 0.0262, + "step": 3480 + }, + { + "clip_ratio": 0.002198777918238193, + "completion_length": 674.1785888671875, + "epoch": 0.12988200177230538, + "grad_norm": 0.06915807723999023, + "kl": 0.028289794921875, + "learning_rate": 3.0081196837918907e-06, + "loss": -0.0161, + "num_tokens": 81767252.0, + "reward": 0.9091631025075912, + "reward_std": 0.2039211392402649, + "rewards/code_reward": 0.7591631039977074, + "rewards/format_reward": 1.5, + "step": 3481 + }, + { + "clip_ratio": 0.002060238446574658, + "epoch": 0.12991931346485705, + "grad_norm": 0.05905427038669586, + "kl": 0.02410888671875, + "learning_rate": 3.0056930382555115e-06, + "loss": -0.0162, + "step": 3482 + }, + { + "clip_ratio": 0.0019936601165682077, + "epoch": 0.1299566251574087, + "grad_norm": 0.057517096400260925, + "kl": 0.028350830078125, + "learning_rate": 3.0032674392840007e-06, + "loss": -0.0163, + "step": 3483 + }, + { + "clip_ratio": 0.0034204550902359188, + "completion_length": 862.9643402099609, + "epoch": 0.12999393684996036, + "grad_norm": 0.05157710239291191, + "kl": 0.0177001953125, + "learning_rate": 3.0008428878950944e-06, + "loss": -0.0043, + "num_tokens": 81858194.0, + "reward": 0.5381293669342995, + "reward_std": 0.05330860335379839, + "rewards/code_reward": 0.38812935817986727, + "rewards/format_reward": 1.5, + "step": 3484 + }, + { + "clip_ratio": 0.0031656441860832274, + "epoch": 0.13003124854251202, + "grad_norm": 0.04958818107843399, + "kl": 0.01776123046875, + "learning_rate": 2.998419385106088e-06, + "loss": -0.0044, + "step": 3485 + }, + { + "clip_ratio": 0.003170272975694388, + "epoch": 0.13006856023506366, + "grad_norm": 0.046001456677913666, + "kl": 0.017730712890625, + "learning_rate": 2.99599693193384e-06, + "loss": -0.0046, + "step": 3486 + }, + { + "clip_ratio": 0.004274695937056094, + "completion_length": 556.2678756713867, + "epoch": 0.13010587192761533, + "grad_norm": 0.0804232656955719, + "kl": 0.024871826171875, + "learning_rate": 2.993575529394768e-06, + "loss": -0.0061, + "num_tokens": 81922511.0, + "reward": 0.662699531763792, + "reward_std": 0.23936041817069054, + "rewards/code_reward": 0.5126994848251343, + "rewards/format_reward": 1.5, + "step": 3487 + }, + { + "clip_ratio": 0.0037959173787385225, + "epoch": 0.13014318362016697, + "grad_norm": 0.0828535333275795, + "kl": 0.0250244140625, + "learning_rate": 2.9911551785048427e-06, + "loss": -0.006, + "step": 3488 + }, + { + "clip_ratio": 0.003625009849201888, + "epoch": 0.13018049531271864, + "grad_norm": 0.07790300250053406, + "kl": 0.024658203125, + "learning_rate": 2.988735880279603e-06, + "loss": -0.0064, + "step": 3489 + }, + { + "clip_ratio": 0.0032192361541092396, + "completion_length": 599.7678909301758, + "epoch": 0.13021780700527028, + "grad_norm": 0.104183129966259, + "kl": 0.0234375, + "learning_rate": 2.986317635734139e-06, + "loss": -0.0037, + "num_tokens": 81986786.0, + "reward": 1.0634994059801102, + "reward_std": 0.18531054351478815, + "rewards/code_reward": 0.9134993553161621, + "rewards/format_reward": 1.5, + "step": 3490 + }, + { + "clip_ratio": 0.00321804714621976, + "epoch": 0.13025511869782194, + "grad_norm": 0.08622181415557861, + "kl": 0.023101806640625, + "learning_rate": 2.983900445883101e-06, + "loss": -0.0038, + "step": 3491 + }, + { + "clip_ratio": 0.0034591160947456956, + "epoch": 0.13029243039037358, + "grad_norm": 0.11360566318035126, + "kl": 0.02398681640625, + "learning_rate": 2.9814843117406982e-06, + "loss": -0.004, + "step": 3492 + }, + { + "clip_ratio": 0.0025106286630034447, + "completion_length": 597.4464569091797, + "epoch": 0.13032974208292525, + "grad_norm": 0.0689665749669075, + "kl": 0.02410888671875, + "learning_rate": 2.9790692343206955e-06, + "loss": 0.0098, + "num_tokens": 82047837.0, + "reward": 0.7297810912132263, + "reward_std": 0.12381128873676062, + "rewards/code_reward": 0.5797811094671488, + "rewards/format_reward": 1.5, + "step": 3493 + }, + { + "clip_ratio": 0.0026312759146094322, + "epoch": 0.1303670537754769, + "grad_norm": 0.06478147208690643, + "kl": 0.02362060546875, + "learning_rate": 2.9766552146364163e-06, + "loss": 0.0097, + "step": 3494 + }, + { + "clip_ratio": 0.002167206723242998, + "epoch": 0.13040436546802855, + "grad_norm": 0.06553415954113007, + "kl": 0.02447509765625, + "learning_rate": 2.9742422537007344e-06, + "loss": 0.0096, + "step": 3495 + }, + { + "clip_ratio": 0.004890649230219424, + "completion_length": 555.7321853637695, + "epoch": 0.1304416771605802, + "grad_norm": 0.09974033385515213, + "kl": 0.0247802734375, + "learning_rate": 2.971830352526085e-06, + "loss": 0.004, + "num_tokens": 82109786.0, + "reward": 0.675715483725071, + "reward_std": 0.3950365036725998, + "rewards/code_reward": 0.5257154572755098, + "rewards/format_reward": 1.5, + "step": 3496 + }, + { + "clip_ratio": 0.004594653903041035, + "epoch": 0.13047898885313186, + "grad_norm": 0.11086884140968323, + "kl": 0.02630615234375, + "learning_rate": 2.969419512124461e-06, + "loss": 0.0037, + "step": 3497 + }, + { + "clip_ratio": 0.004821206908673048, + "epoch": 0.1305163005456835, + "grad_norm": 0.09989157319068909, + "kl": 0.025177001953125, + "learning_rate": 2.9670097335073988e-06, + "loss": 0.0039, + "step": 3498 + }, + { + "clip_ratio": 0.0029650519136339426, + "completion_length": 516.5178833007812, + "epoch": 0.13055361223823517, + "grad_norm": 0.06443404406309128, + "kl": 0.02642822265625, + "learning_rate": 2.964601017686002e-06, + "loss": 0.0086, + "num_tokens": 82164219.0, + "reward": 0.8767927698791027, + "reward_std": 0.047496894374489784, + "rewards/code_reward": 0.7267927527427673, + "rewards/format_reward": 1.5, + "step": 3499 + }, + { + "clip_ratio": 0.0029779976466670632, + "epoch": 0.1305909239307868, + "grad_norm": 0.06545324623584747, + "kl": 0.02685546875, + "learning_rate": 2.962193365670921e-06, + "loss": 0.0083, + "step": 3500 + }, + { + "clip_ratio": 0.0031405948102474213, + "epoch": 0.13062823562333847, + "grad_norm": 0.06216578558087349, + "kl": 0.02587890625, + "learning_rate": 2.959786778472362e-06, + "loss": 0.0084, + "step": 3501 + }, + { + "clip_ratio": 0.0028962064534425735, + "completion_length": 713.7857513427734, + "epoch": 0.1306655473158901, + "grad_norm": 0.0749368742108345, + "kl": 0.01824951171875, + "learning_rate": 2.9573812571000883e-06, + "loss": 0.0502, + "num_tokens": 82232555.0, + "reward": 1.0696956813335419, + "reward_std": 0.17699581291526556, + "rewards/code_reward": 0.9223742336034775, + "rewards/format_reward": 1.4732142984867096, + "step": 3502 + }, + { + "clip_ratio": 0.0029402636573649943, + "epoch": 0.13070285900844178, + "grad_norm": 0.07142822444438934, + "kl": 0.0185089111328125, + "learning_rate": 2.9549768025634052e-06, + "loss": 0.0502, + "step": 3503 + }, + { + "clip_ratio": 0.0027169162640348077, + "epoch": 0.13074017070099342, + "grad_norm": 0.07644833624362946, + "kl": 0.0186309814453125, + "learning_rate": 2.952573415871181e-06, + "loss": 0.0501, + "step": 3504 + }, + { + "clip_ratio": 0.004266089585144073, + "completion_length": 560.4286041259766, + "epoch": 0.13077748239354509, + "grad_norm": 0.11701870709657669, + "kl": 0.032135009765625, + "learning_rate": 2.9501710980318333e-06, + "loss": 0.0151, + "num_tokens": 82292597.0, + "reward": 0.8074524737894535, + "reward_std": 0.10660505417035893, + "rewards/code_reward": 0.6574524752941215, + "rewards/format_reward": 1.5, + "step": 3505 + }, + { + "clip_ratio": 0.004020628111902624, + "epoch": 0.13081479408609673, + "grad_norm": 0.10688840597867966, + "kl": 0.032379150390625, + "learning_rate": 2.947769850053327e-06, + "loss": 0.0149, + "step": 3506 + }, + { + "clip_ratio": 0.0037484102649614215, + "epoch": 0.1308521057786484, + "grad_norm": 0.09837973862886429, + "kl": 0.033233642578125, + "learning_rate": 2.9453696729431834e-06, + "loss": 0.0147, + "step": 3507 + }, + { + "clip_ratio": 0.0031032765400595963, + "completion_length": 645.7500305175781, + "epoch": 0.13088941747120003, + "grad_norm": 0.07748618721961975, + "kl": 0.0205230712890625, + "learning_rate": 2.9429705677084707e-06, + "loss": -0.0047, + "num_tokens": 82357543.0, + "reward": 0.6472522541880608, + "reward_std": 0.21086566150188446, + "rewards/code_reward": 0.49725223146378994, + "rewards/format_reward": 1.5, + "step": 3508 + }, + { + "clip_ratio": 0.0025575823383405805, + "epoch": 0.1309267291637517, + "grad_norm": 0.0990574061870575, + "kl": 0.021026611328125, + "learning_rate": 2.9405725353558083e-06, + "loss": -0.0048, + "step": 3509 + }, + { + "clip_ratio": 0.002671308582648635, + "epoch": 0.13096404085630334, + "grad_norm": 0.06893111020326614, + "kl": 0.0220794677734375, + "learning_rate": 2.938175576891368e-06, + "loss": -0.0049, + "step": 3510 + }, + { + "clip_ratio": 0.0036321221268735826, + "completion_length": 843.0536041259766, + "epoch": 0.131001352548855, + "grad_norm": 0.08850356191396713, + "kl": 0.019561767578125, + "learning_rate": 2.935779693320868e-06, + "loss": 0.0083, + "num_tokens": 82436982.0, + "reward": 0.5365060679614544, + "reward_std": 0.20896525564603508, + "rewards/code_reward": 0.38650606595911086, + "rewards/format_reward": 1.5, + "step": 3511 + }, + { + "clip_ratio": 0.003566293802578002, + "epoch": 0.13103866424140664, + "grad_norm": 0.08527820557355881, + "kl": 0.020294189453125, + "learning_rate": 2.9333848856495784e-06, + "loss": 0.0083, + "step": 3512 + }, + { + "clip_ratio": 0.003609861305449158, + "epoch": 0.1310759759339583, + "grad_norm": 0.07492704689502716, + "kl": 0.02008056640625, + "learning_rate": 2.9309911548823134e-06, + "loss": 0.0079, + "step": 3513 + }, + { + "clip_ratio": 0.0033549059880897403, + "completion_length": 675.0357437133789, + "epoch": 0.13111328762650995, + "grad_norm": 0.14622744917869568, + "kl": 0.0236358642578125, + "learning_rate": 2.9285985020234375e-06, + "loss": -0.0027, + "num_tokens": 82499116.0, + "reward": 0.8406484723091125, + "reward_std": 0.2569923549890518, + "rewards/code_reward": 0.6906484588980675, + "rewards/format_reward": 1.5, + "step": 3514 + }, + { + "clip_ratio": 0.0030834702774882317, + "epoch": 0.13115059931906162, + "grad_norm": 0.0807621106505394, + "kl": 0.0214080810546875, + "learning_rate": 2.926206928076868e-06, + "loss": -0.003, + "step": 3515 + }, + { + "clip_ratio": 0.00307976602925919, + "epoch": 0.13118791101161326, + "grad_norm": 0.07737798243761063, + "kl": 0.020538330078125, + "learning_rate": 2.9238164340460595e-06, + "loss": -0.0031, + "step": 3516 + }, + { + "clip_ratio": 0.004011853830888867, + "completion_length": 764.4286193847656, + "epoch": 0.13122522270416492, + "grad_norm": 0.0837055966258049, + "kl": 0.017578125, + "learning_rate": 2.921427020934022e-06, + "loss": -0.0069, + "num_tokens": 82570260.0, + "reward": 0.8510679006576538, + "reward_std": 0.2651101350784302, + "rewards/code_reward": 0.7010678648948669, + "rewards/format_reward": 1.5, + "step": 3517 + }, + { + "clip_ratio": 0.003428841824643314, + "epoch": 0.13126253439671656, + "grad_norm": 0.08075080066919327, + "kl": 0.018463134765625, + "learning_rate": 2.9190386897433065e-06, + "loss": -0.0071, + "step": 3518 + }, + { + "clip_ratio": 0.0031102343345992267, + "epoch": 0.13129984608926823, + "grad_norm": 0.07774423807859421, + "kl": 0.0181884765625, + "learning_rate": 2.916651441476015e-06, + "loss": -0.0076, + "step": 3519 + }, + { + "clip_ratio": 0.0032907186541706324, + "completion_length": 585.928596496582, + "epoch": 0.13133715778181987, + "grad_norm": 0.07021703571081161, + "kl": 0.0360107421875, + "learning_rate": 2.914265277133793e-06, + "loss": -0.0236, + "num_tokens": 82631212.0, + "reward": 0.5607142858207226, + "reward_std": 0.16259387508034706, + "rewards/code_reward": 0.410714291036129, + "rewards/format_reward": 1.5, + "step": 3520 + }, + { + "clip_ratio": 0.003237767261452973, + "epoch": 0.13137446947437154, + "grad_norm": 0.0660955086350441, + "kl": 0.03607177734375, + "learning_rate": 2.911880197717828e-06, + "loss": -0.0237, + "step": 3521 + }, + { + "clip_ratio": 0.0026531466864980757, + "epoch": 0.13141178116692318, + "grad_norm": 0.06492302566766739, + "kl": 0.035888671875, + "learning_rate": 2.9094962042288565e-06, + "loss": -0.024, + "step": 3522 + }, + { + "clip_ratio": 0.003817280288785696, + "completion_length": 636.5178833007812, + "epoch": 0.13144909285947484, + "grad_norm": 22.003578186035156, + "kl": 5.517364501953125, + "learning_rate": 2.907113297667157e-06, + "loss": 0.0775, + "num_tokens": 82698315.0, + "reward": 0.5592998526990414, + "reward_std": 0.017632946372032166, + "rewards/code_reward": 0.4092998579144478, + "rewards/format_reward": 1.5, + "step": 3523 + }, + { + "clip_ratio": 0.004135191556997597, + "epoch": 0.13148640455202648, + "grad_norm": 1.298048496246338, + "kl": 0.560089111328125, + "learning_rate": 2.904731479032553e-06, + "loss": 0.0278, + "step": 3524 + }, + { + "clip_ratio": 0.005238955549430102, + "epoch": 0.13152371624457815, + "grad_norm": 0.08007758110761642, + "kl": 0.066741943359375, + "learning_rate": 2.902350749324414e-06, + "loss": 0.0231, + "step": 3525 + }, + { + "clip_ratio": 0.0007482484797947109, + "completion_length": 468.08931732177734, + "epoch": 0.1315610279371298, + "grad_norm": 0.04663233458995819, + "kl": 0.0176239013671875, + "learning_rate": 2.899971109541646e-06, + "loss": -0.0038, + "num_tokens": 82750078.0, + "reward": 1.0725510120391846, + "reward_std": 0.04197321832180023, + "rewards/code_reward": 0.9225510209798813, + "rewards/format_reward": 1.5, + "step": 3526 + }, + { + "clip_ratio": 0.0008330881828442216, + "epoch": 0.13159833962968145, + "grad_norm": 0.04196196794509888, + "kl": 0.0170440673828125, + "learning_rate": 2.897592560682703e-06, + "loss": -0.0038, + "step": 3527 + }, + { + "clip_ratio": 0.0010893815197050571, + "epoch": 0.1316356513222331, + "grad_norm": 0.04077760875225067, + "kl": 0.016510009765625, + "learning_rate": 2.89521510374558e-06, + "loss": -0.0038, + "step": 3528 + }, + { + "clip_ratio": 0.004860185435973108, + "completion_length": 697.857177734375, + "epoch": 0.13167296301478476, + "grad_norm": 0.0770225003361702, + "kl": 0.022613525390625, + "learning_rate": 2.8928387397278153e-06, + "loss": -0.0051, + "num_tokens": 82820686.0, + "reward": 0.3817034624516964, + "reward_std": 0.24525565654039383, + "rewards/code_reward": 0.23170345090329647, + "rewards/format_reward": 1.5, + "step": 3529 + }, + { + "clip_ratio": 0.0049458498833701015, + "epoch": 0.1317102747073364, + "grad_norm": 0.07489622384309769, + "kl": 0.0228729248046875, + "learning_rate": 2.890463469626487e-06, + "loss": -0.0051, + "step": 3530 + }, + { + "clip_ratio": 0.006128316628746688, + "epoch": 0.13174758639988807, + "grad_norm": 0.07335397601127625, + "kl": 0.0222930908203125, + "learning_rate": 2.888089294438211e-06, + "loss": -0.0052, + "step": 3531 + }, + { + "clip_ratio": 0.003523295803461224, + "completion_length": 588.3214569091797, + "epoch": 0.1317848980924397, + "grad_norm": 0.08226107060909271, + "kl": 0.0196685791015625, + "learning_rate": 2.88571621515915e-06, + "loss": 0.0065, + "num_tokens": 82897116.0, + "reward": 0.6675462238490582, + "reward_std": 0.10350478813052177, + "rewards/code_reward": 0.5175461955368519, + "rewards/format_reward": 1.5, + "step": 3532 + }, + { + "clip_ratio": 0.003969377896282822, + "epoch": 0.13182220978499137, + "grad_norm": 0.08208386600017548, + "kl": 0.02001953125, + "learning_rate": 2.8833442327850047e-06, + "loss": 0.0065, + "step": 3533 + }, + { + "clip_ratio": 0.0039075519307516515, + "epoch": 0.131859521477543, + "grad_norm": 0.07564550638198853, + "kl": 0.0198822021484375, + "learning_rate": 2.8809733483110114e-06, + "loss": 0.0065, + "step": 3534 + }, + { + "clip_ratio": 0.003867521067149937, + "completion_length": 505.75001525878906, + "epoch": 0.13189683317009468, + "grad_norm": 0.08791828155517578, + "kl": 0.023193359375, + "learning_rate": 2.8786035627319507e-06, + "loss": -0.0124, + "num_tokens": 82949434.0, + "reward": 0.6254464276134968, + "reward_std": 0.17014771979302168, + "rewards/code_reward": 0.4754464265424758, + "rewards/format_reward": 1.5, + "step": 3535 + }, + { + "clip_ratio": 0.0035761152394115925, + "epoch": 0.13193414486264632, + "grad_norm": 0.10204048454761505, + "kl": 0.0236968994140625, + "learning_rate": 2.8762348770421423e-06, + "loss": -0.0125, + "step": 3536 + }, + { + "clip_ratio": 0.003769928007386625, + "epoch": 0.13197145655519799, + "grad_norm": 0.07378154993057251, + "kl": 0.0233001708984375, + "learning_rate": 2.8738672922354418e-06, + "loss": -0.0125, + "step": 3537 + }, + { + "clip_ratio": 0.004634531505871564, + "completion_length": 591.1250381469727, + "epoch": 0.13200876824774962, + "grad_norm": 0.07912573218345642, + "kl": 0.024261474609375, + "learning_rate": 2.8715008093052442e-06, + "loss": -0.0002, + "num_tokens": 83022955.0, + "reward": 0.4535714387893677, + "reward_std": 0.1840171217918396, + "rewards/code_reward": 0.3035714253783226, + "rewards/format_reward": 1.5, + "step": 3538 + }, + { + "clip_ratio": 0.004296736209653318, + "epoch": 0.1320460799403013, + "grad_norm": 0.0878211259841919, + "kl": 0.02410888671875, + "learning_rate": 2.8691354292444794e-06, + "loss": -0.0005, + "step": 3539 + }, + { + "clip_ratio": 0.003808994428254664, + "epoch": 0.13208339163285296, + "grad_norm": 0.06810954958200455, + "kl": 0.024810791015625, + "learning_rate": 2.866771153045621e-06, + "loss": -0.0007, + "step": 3540 + }, + { + "clip_ratio": 0.0020408204291015863, + "completion_length": 536.5893096923828, + "epoch": 0.1321207033254046, + "grad_norm": 0.038554348051548004, + "kl": 0.0303955078125, + "learning_rate": 2.8644079817006713e-06, + "loss": 0.0053, + "num_tokens": 83079842.0, + "reward": 0.6730952374637127, + "reward_std": 0.04617811366915703, + "rewards/code_reward": 0.5230952389538288, + "rewards/format_reward": 1.5, + "step": 3541 + }, + { + "clip_ratio": 0.002139866992365569, + "epoch": 0.13215801501795627, + "grad_norm": 0.0485025979578495, + "kl": 0.029693603515625, + "learning_rate": 2.862045916201175e-06, + "loss": 0.0053, + "step": 3542 + }, + { + "clip_ratio": 0.0022064667427912354, + "epoch": 0.1321953267105079, + "grad_norm": 0.04589433968067169, + "kl": 0.02880859375, + "learning_rate": 2.8596849575382114e-06, + "loss": 0.0052, + "step": 3543 + }, + { + "clip_ratio": 0.0038561466499231756, + "completion_length": 723.8036041259766, + "epoch": 0.13223263840305957, + "grad_norm": 0.09144093841314316, + "kl": 0.02056884765625, + "learning_rate": 2.857325106702394e-06, + "loss": 0.0074, + "num_tokens": 83157223.0, + "reward": 0.6634418964385986, + "reward_std": 0.2735026925802231, + "rewards/code_reward": 0.5134418569505215, + "rewards/format_reward": 1.5, + "step": 3544 + }, + { + "clip_ratio": 0.0039005122962407768, + "epoch": 0.1322699500956112, + "grad_norm": 0.09021914750337601, + "kl": 0.02093505859375, + "learning_rate": 2.854966364683872e-06, + "loss": 0.0075, + "step": 3545 + }, + { + "clip_ratio": 0.0035043751704506576, + "epoch": 0.13230726178816288, + "grad_norm": 0.0819854810833931, + "kl": 0.0208740234375, + "learning_rate": 2.85260873247233e-06, + "loss": 0.0071, + "step": 3546 + }, + { + "clip_ratio": 0.002391802379861474, + "completion_length": 400.41072845458984, + "epoch": 0.13234457348071452, + "grad_norm": 0.06583145260810852, + "kl": 0.02874755859375, + "learning_rate": 2.8502522110569867e-06, + "loss": 0.0004, + "num_tokens": 83208136.0, + "reward": 0.6321428678929806, + "reward_std": 0.1283881515264511, + "rewards/code_reward": 0.4821428582072258, + "rewards/format_reward": 1.5, + "step": 3547 + }, + { + "clip_ratio": 0.002226690645329654, + "epoch": 0.13238188517326618, + "grad_norm": 0.055193983018398285, + "kl": 0.028106689453125, + "learning_rate": 2.847896801426598e-06, + "loss": 0.0005, + "step": 3548 + }, + { + "clip_ratio": 0.00246850261464715, + "epoch": 0.13241919686581782, + "grad_norm": 0.08290238678455353, + "kl": 0.02703857421875, + "learning_rate": 2.845542504569444e-06, + "loss": 0.0003, + "step": 3549 + }, + { + "clip_ratio": 0.0019354841788299382, + "completion_length": 498.1785888671875, + "epoch": 0.1324565085583695, + "grad_norm": 0.05784974247217178, + "kl": 0.0179595947265625, + "learning_rate": 2.84318932147335e-06, + "loss": 0.0077, + "num_tokens": 83258602.0, + "reward": 1.0831065773963928, + "reward_std": 0.1481529325246811, + "rewards/code_reward": 0.9331065714359283, + "rewards/format_reward": 1.5, + "step": 3550 + }, + { + "clip_ratio": 0.002021121035795659, + "epoch": 0.13249382025092113, + "grad_norm": 0.09211176633834839, + "kl": 0.0179901123046875, + "learning_rate": 2.8408372531256623e-06, + "loss": 0.0079, + "step": 3551 + }, + { + "clip_ratio": 0.001720740576274693, + "epoch": 0.1325311319434728, + "grad_norm": 0.06162381172180176, + "kl": 0.017669677734375, + "learning_rate": 2.838486300513268e-06, + "loss": 0.0078, + "step": 3552 + }, + { + "clip_ratio": 0.0014757753815501928, + "completion_length": 570.1786041259766, + "epoch": 0.13256844363602444, + "grad_norm": 0.051412537693977356, + "kl": 0.0216827392578125, + "learning_rate": 2.836136464622583e-06, + "loss": 0.0049, + "num_tokens": 83324160.0, + "reward": 0.959006205201149, + "reward_std": 0.1640089452266693, + "rewards/code_reward": 0.8090062141418457, + "rewards/format_reward": 1.5, + "step": 3553 + }, + { + "clip_ratio": 0.0012442024890333414, + "epoch": 0.1326057553285761, + "grad_norm": 0.04847884923219681, + "kl": 0.0220947265625, + "learning_rate": 2.8337877464395547e-06, + "loss": 0.005, + "step": 3554 + }, + { + "clip_ratio": 0.0013388962834142148, + "epoch": 0.13264306702112774, + "grad_norm": 0.050818149000406265, + "kl": 0.0211181640625, + "learning_rate": 2.831440146949663e-06, + "loss": 0.005, + "step": 3555 + }, + { + "clip_ratio": 0.0014280524337664247, + "completion_length": 488.3750228881836, + "epoch": 0.1326803787136794, + "grad_norm": 0.05500400811433792, + "kl": 0.0180206298828125, + "learning_rate": 2.8290936671379145e-06, + "loss": -0.0046, + "num_tokens": 83377003.0, + "reward": 0.8695898056030273, + "reward_std": 0.16123632341623306, + "rewards/code_reward": 0.7195898145437241, + "rewards/format_reward": 1.5, + "step": 3556 + }, + { + "clip_ratio": 0.0018079131841659546, + "epoch": 0.13271769040623105, + "grad_norm": 0.061562079936265945, + "kl": 0.01788330078125, + "learning_rate": 2.8267483079888487e-06, + "loss": -0.0045, + "step": 3557 + }, + { + "clip_ratio": 0.0016155801713466644, + "epoch": 0.13275500209878272, + "grad_norm": 0.06175525486469269, + "kl": 0.0184783935546875, + "learning_rate": 2.8244040704865376e-06, + "loss": -0.0046, + "step": 3558 + }, + { + "clip_ratio": 0.003411505836993456, + "completion_length": 609.6785888671875, + "epoch": 0.13279231379133435, + "grad_norm": 0.05883019044995308, + "kl": 0.0179290771484375, + "learning_rate": 2.822060955614576e-06, + "loss": -0.0014, + "num_tokens": 83443919.0, + "reward": 0.6442601941525936, + "reward_std": 0.17860103398561478, + "rewards/code_reward": 0.49426019936800003, + "rewards/format_reward": 1.5, + "step": 3559 + }, + { + "clip_ratio": 0.0036424134159460664, + "epoch": 0.13282962548388602, + "grad_norm": 0.0574563592672348, + "kl": 0.0184478759765625, + "learning_rate": 2.8197189643560952e-06, + "loss": -0.0012, + "step": 3560 + }, + { + "clip_ratio": 0.0032601613202132285, + "epoch": 0.13286693717643766, + "grad_norm": 0.05623394250869751, + "kl": 0.0177154541015625, + "learning_rate": 2.817378097693747e-06, + "loss": -0.0014, + "step": 3561 + }, + { + "clip_ratio": 0.004072299983818084, + "completion_length": 738.4464721679688, + "epoch": 0.13290424886898933, + "grad_norm": 0.055262915790081024, + "kl": 0.02288818359375, + "learning_rate": 2.8150383566097184e-06, + "loss": 0.0039, + "num_tokens": 83521266.0, + "reward": 0.6240207776427269, + "reward_std": 0.09720554202795029, + "rewards/code_reward": 0.47402074933052063, + "rewards/format_reward": 1.5, + "step": 3562 + }, + { + "clip_ratio": 0.004051994706969708, + "epoch": 0.13294156056154097, + "grad_norm": 0.053980328142642975, + "kl": 0.0228271484375, + "learning_rate": 2.8126997420857205e-06, + "loss": 0.0038, + "step": 3563 + }, + { + "clip_ratio": 0.00418720010202378, + "epoch": 0.13297887225409263, + "grad_norm": 0.0524073950946331, + "kl": 0.022674560546875, + "learning_rate": 2.810362255102993e-06, + "loss": 0.0038, + "step": 3564 + }, + { + "clip_ratio": 0.004211962164845318, + "completion_length": 614.9464492797852, + "epoch": 0.13301618394664427, + "grad_norm": 0.08834511041641235, + "kl": 0.022369384765625, + "learning_rate": 2.8080258966423012e-06, + "loss": -0.0007, + "num_tokens": 83584977.0, + "reward": 0.7636167630553246, + "reward_std": 0.2245138566941023, + "rewards/code_reward": 0.6136166974902153, + "rewards/format_reward": 1.5, + "step": 3565 + }, + { + "clip_ratio": 0.004142293939366937, + "epoch": 0.13305349563919594, + "grad_norm": 0.08888770639896393, + "kl": 0.022552490234375, + "learning_rate": 2.8056906676839404e-06, + "loss": -0.0005, + "step": 3566 + }, + { + "clip_ratio": 0.0032840545172803104, + "epoch": 0.13309080733174758, + "grad_norm": 0.0868401825428009, + "kl": 0.022430419921875, + "learning_rate": 2.8033565692077247e-06, + "loss": -0.0009, + "step": 3567 + }, + { + "clip_ratio": 0.0037896839203312993, + "completion_length": 693.4821701049805, + "epoch": 0.13312811902429925, + "grad_norm": 0.1352643072605133, + "kl": 0.0154876708984375, + "learning_rate": 2.8010236021930026e-06, + "loss": -0.009, + "num_tokens": 83654808.0, + "reward": 0.8710926622152328, + "reward_std": 0.32181379944086075, + "rewards/code_reward": 0.7210926413536072, + "rewards/format_reward": 1.5, + "step": 3568 + }, + { + "clip_ratio": 0.003750298754312098, + "epoch": 0.13316543071685089, + "grad_norm": 0.08461742848157883, + "kl": 0.015625, + "learning_rate": 2.798691767618641e-06, + "loss": -0.009, + "step": 3569 + }, + { + "clip_ratio": 0.003528170578647405, + "epoch": 0.13320274240940255, + "grad_norm": 0.08222979307174683, + "kl": 0.0157318115234375, + "learning_rate": 2.796361066463035e-06, + "loss": -0.0092, + "step": 3570 + }, + { + "clip_ratio": 0.0026651895605027676, + "completion_length": 622.7857437133789, + "epoch": 0.1332400541019542, + "grad_norm": 0.03739483281970024, + "kl": 0.02001953125, + "learning_rate": 2.7940314997041028e-06, + "loss": -0.0012, + "num_tokens": 83721548.0, + "reward": 0.7277934961020947, + "reward_std": 0.09705667197704315, + "rewards/code_reward": 0.5777934938669205, + "rewards/format_reward": 1.5, + "step": 3571 + }, + { + "clip_ratio": 0.0032111051259562373, + "epoch": 0.13327736579450586, + "grad_norm": 0.036735229194164276, + "kl": 0.020263671875, + "learning_rate": 2.7917030683192903e-06, + "loss": -0.0012, + "step": 3572 + }, + { + "clip_ratio": 0.0029488365398719907, + "epoch": 0.1333146774870575, + "grad_norm": 0.03856858238577843, + "kl": 0.01995849609375, + "learning_rate": 2.789375773285562e-06, + "loss": -0.0014, + "step": 3573 + }, + { + "clip_ratio": 0.00416969700017944, + "completion_length": 847.8571624755859, + "epoch": 0.13335198917960916, + "grad_norm": 0.08933959156274796, + "kl": 0.0149993896484375, + "learning_rate": 2.787049615579408e-06, + "loss": -0.0034, + "num_tokens": 83804224.0, + "reward": 0.5187110900878906, + "reward_std": 0.13481803750619292, + "rewards/code_reward": 0.36871105059981346, + "rewards/format_reward": 1.5, + "step": 3574 + }, + { + "clip_ratio": 0.003954813058953732, + "epoch": 0.1333893008721608, + "grad_norm": 0.07929647713899612, + "kl": 0.01483154296875, + "learning_rate": 2.7847245961768394e-06, + "loss": -0.0034, + "step": 3575 + }, + { + "clip_ratio": 0.003902718599420041, + "epoch": 0.13342661256471247, + "grad_norm": 0.06973214447498322, + "kl": 0.014923095703125, + "learning_rate": 2.782400716053394e-06, + "loss": -0.0037, + "step": 3576 + }, + { + "clip_ratio": 0.004745518788695335, + "completion_length": 733.2143020629883, + "epoch": 0.1334639242572641, + "grad_norm": 0.09033196419477463, + "kl": 0.0182037353515625, + "learning_rate": 2.7800779761841255e-06, + "loss": 0.008, + "num_tokens": 83879318.0, + "reward": 0.6010768078267574, + "reward_std": 0.2500266656279564, + "rewards/code_reward": 0.45107680559158325, + "rewards/format_reward": 1.5, + "step": 3577 + }, + { + "clip_ratio": 0.004713008413091302, + "epoch": 0.13350123594981578, + "grad_norm": 0.0887424498796463, + "kl": 0.0183258056640625, + "learning_rate": 2.777756377543617e-06, + "loss": 0.0079, + "step": 3578 + }, + { + "clip_ratio": 0.00445955473696813, + "epoch": 0.13353854764236742, + "grad_norm": 0.0844259262084961, + "kl": 0.01837158203125, + "learning_rate": 2.7754359211059624e-06, + "loss": 0.0079, + "step": 3579 + }, + { + "clip_ratio": 0.0037349553895182908, + "completion_length": 626.3750305175781, + "epoch": 0.13357585933491908, + "grad_norm": 0.08709623664617538, + "kl": 0.0176239013671875, + "learning_rate": 2.7731166078447857e-06, + "loss": 0.0103, + "num_tokens": 83936473.0, + "reward": 0.41182373464107513, + "reward_std": 0.27579185273498297, + "rewards/code_reward": 0.26182373240590096, + "rewards/format_reward": 1.5, + "step": 3580 + }, + { + "clip_ratio": 0.00393938587512821, + "epoch": 0.13361317102747072, + "grad_norm": 0.07387709617614746, + "kl": 0.0179443359375, + "learning_rate": 2.7707984387332267e-06, + "loss": 0.0102, + "step": 3581 + }, + { + "clip_ratio": 0.003653263411251828, + "epoch": 0.1336504827200224, + "grad_norm": 0.07331935316324234, + "kl": 0.0177764892578125, + "learning_rate": 2.768481414743945e-06, + "loss": 0.01, + "step": 3582 + }, + { + "clip_ratio": 0.0028348598862066865, + "completion_length": 748.4821701049805, + "epoch": 0.13368779441257403, + "grad_norm": 0.057466067373752594, + "kl": 0.016387939453125, + "learning_rate": 2.7661655368491253e-06, + "loss": -0.0114, + "num_tokens": 84013318.0, + "reward": 0.7791764959692955, + "reward_std": 0.17278393264859915, + "rewards/code_reward": 0.629176510614343, + "rewards/format_reward": 1.5, + "step": 3583 + }, + { + "clip_ratio": 0.003264137892983854, + "epoch": 0.1337251061051257, + "grad_norm": 0.057627491652965546, + "kl": 0.0166778564453125, + "learning_rate": 2.7638508060204618e-06, + "loss": -0.0115, + "step": 3584 + }, + { + "clip_ratio": 0.002842540794517845, + "epoch": 0.13376241779767734, + "grad_norm": 0.05615536868572235, + "kl": 0.0163421630859375, + "learning_rate": 2.7615372232291747e-06, + "loss": -0.0116, + "step": 3585 + }, + { + "clip_ratio": 0.004153543501161039, + "completion_length": 651.8214492797852, + "epoch": 0.133799729490229, + "grad_norm": 0.05209331586956978, + "kl": 0.01861572265625, + "learning_rate": 2.759224789446001e-06, + "loss": 0.0051, + "num_tokens": 84079430.0, + "reward": 0.4192577116191387, + "reward_std": 0.026131443679332733, + "rewards/code_reward": 0.2692577037960291, + "rewards/format_reward": 1.5, + "step": 3586 + }, + { + "clip_ratio": 0.0040053605334833264, + "epoch": 0.13383704118278064, + "grad_norm": 0.05249059200286865, + "kl": 0.018218994140625, + "learning_rate": 2.7569135056411938e-06, + "loss": 0.0051, + "step": 3587 + }, + { + "clip_ratio": 0.0042339463252574205, + "epoch": 0.1338743528753323, + "grad_norm": 0.050957515835762024, + "kl": 0.0178375244140625, + "learning_rate": 2.7546033727845247e-06, + "loss": 0.0051, + "step": 3588 + }, + { + "clip_ratio": 0.00387702853186056, + "completion_length": 594.7857360839844, + "epoch": 0.13391166456788395, + "grad_norm": 0.07779718935489655, + "kl": 0.0186614990234375, + "learning_rate": 2.7522943918452825e-06, + "loss": 0.0083, + "num_tokens": 84152022.0, + "reward": 0.6145378462970257, + "reward_std": 0.14664420764893293, + "rewards/code_reward": 0.46453780867159367, + "rewards/format_reward": 1.5, + "step": 3589 + }, + { + "clip_ratio": 0.004762668686453253, + "epoch": 0.13394897626043561, + "grad_norm": 0.07830234616994858, + "kl": 0.0187835693359375, + "learning_rate": 2.749986563792274e-06, + "loss": 0.0085, + "step": 3590 + }, + { + "clip_ratio": 0.004444045771379024, + "epoch": 0.13398628795298725, + "grad_norm": 0.07015827298164368, + "kl": 0.0186614990234375, + "learning_rate": 2.7476798895938215e-06, + "loss": 0.0082, + "step": 3591 + }, + { + "clip_ratio": 0.0033880877308547497, + "completion_length": 738.8928680419922, + "epoch": 0.13402359964553892, + "grad_norm": 0.0554010309278965, + "kl": 0.01800537109375, + "learning_rate": 2.74537437021776e-06, + "loss": 0.0052, + "num_tokens": 84241056.0, + "reward": 0.6854166872799397, + "reward_std": 0.14791301637887955, + "rewards/code_reward": 0.5354166477918625, + "rewards/format_reward": 1.5, + "step": 3592 + }, + { + "clip_ratio": 0.003192436881363392, + "epoch": 0.13406091133809056, + "grad_norm": 0.05703911557793617, + "kl": 0.017791748046875, + "learning_rate": 2.7430700066314435e-06, + "loss": 0.0053, + "step": 3593 + }, + { + "clip_ratio": 0.003291718545369804, + "epoch": 0.13409822303064223, + "grad_norm": 0.058273255825042725, + "kl": 0.0177764892578125, + "learning_rate": 2.740766799801743e-06, + "loss": 0.005, + "step": 3594 + }, + { + "clip_ratio": 0.004300842294469476, + "completion_length": 693.4107513427734, + "epoch": 0.1341355347231939, + "grad_norm": 0.08853475749492645, + "kl": 0.0200653076171875, + "learning_rate": 2.7384647506950366e-06, + "loss": -0.007, + "num_tokens": 84312857.0, + "reward": 0.7643497139215469, + "reward_std": 0.30336594954133034, + "rewards/code_reward": 0.6143496930599213, + "rewards/format_reward": 1.5, + "step": 3595 + }, + { + "clip_ratio": 0.00383341294946149, + "epoch": 0.13417284641574553, + "grad_norm": 0.08591283857822418, + "kl": 0.0207061767578125, + "learning_rate": 2.7361638602772258e-06, + "loss": -0.0072, + "step": 3596 + }, + { + "clip_ratio": 0.004443240351974964, + "epoch": 0.1342101581082972, + "grad_norm": 0.0837937593460083, + "kl": 0.021026611328125, + "learning_rate": 2.733864129513718e-06, + "loss": -0.0072, + "step": 3597 + }, + { + "clip_ratio": 0.0025895900325849652, + "completion_length": 450.44646072387695, + "epoch": 0.13424746980084884, + "grad_norm": 0.049940094351768494, + "kl": 0.019622802734375, + "learning_rate": 2.731565559369438e-06, + "loss": 0.0086, + "num_tokens": 84365186.0, + "reward": 0.806966494768858, + "reward_std": 0.055183105170726776, + "rewards/code_reward": 0.6569664776325226, + "rewards/format_reward": 1.5, + "step": 3598 + }, + { + "clip_ratio": 0.0026247105561196804, + "epoch": 0.1342847814934005, + "grad_norm": 0.04776471108198166, + "kl": 0.0184783935546875, + "learning_rate": 2.7292681508088247e-06, + "loss": 0.0085, + "step": 3599 + }, + { + "clip_ratio": 0.002547869924455881, + "epoch": 0.13432209318595215, + "grad_norm": 0.04729441553354263, + "kl": 0.01849365234375, + "learning_rate": 2.726971904795827e-06, + "loss": 0.0086, + "step": 3600 + }, + { + "clip_ratio": 0.0010064765810966492, + "completion_length": 506.66072845458984, + "epoch": 0.1343594048785038, + "grad_norm": 0.04145420342683792, + "kl": 0.020416259765625, + "learning_rate": 2.724676822293909e-06, + "loss": 0.0041, + "num_tokens": 84416197.0, + "reward": 1.011607140302658, + "reward_std": 0.036183394491672516, + "rewards/code_reward": 0.8616071492433548, + "rewards/format_reward": 1.5, + "step": 3601 + }, + { + "clip_ratio": 0.0006836616084910929, + "epoch": 0.13439671657105545, + "grad_norm": 0.04149801656603813, + "kl": 0.02032470703125, + "learning_rate": 2.7223829042660423e-06, + "loss": 0.004, + "step": 3602 + }, + { + "clip_ratio": 0.0006204217206686735, + "epoch": 0.13443402826360712, + "grad_norm": 0.04046453908085823, + "kl": 0.0202789306640625, + "learning_rate": 2.7200901516747125e-06, + "loss": 0.004, + "step": 3603 + }, + { + "clip_ratio": 0.004009978089015931, + "completion_length": 692.6250381469727, + "epoch": 0.13447133995615876, + "grad_norm": 0.09618248045444489, + "kl": 0.0464935302734375, + "learning_rate": 2.7177985654819184e-06, + "loss": -0.0022, + "num_tokens": 84484986.0, + "reward": 0.4634849615395069, + "reward_std": 0.20181113481521606, + "rewards/code_reward": 0.3134849686175585, + "rewards/format_reward": 1.5, + "step": 3604 + }, + { + "clip_ratio": 0.0038485884433612227, + "epoch": 0.13450865164871043, + "grad_norm": 0.09960202872753143, + "kl": 0.047210693359375, + "learning_rate": 2.715508146649164e-06, + "loss": -0.0023, + "step": 3605 + }, + { + "clip_ratio": 0.003988798533100635, + "epoch": 0.13454596334126206, + "grad_norm": 0.0766439214348793, + "kl": 0.0351104736328125, + "learning_rate": 2.713218896137467e-06, + "loss": -0.0026, + "step": 3606 + }, + { + "clip_ratio": 0.0032626629690639675, + "completion_length": 630.4464569091797, + "epoch": 0.13458327503381373, + "grad_norm": 0.1828758418560028, + "kl": 0.0172576904296875, + "learning_rate": 2.7109308149073556e-06, + "loss": 0.0037, + "num_tokens": 84556271.0, + "reward": 0.8471207246184349, + "reward_std": 0.30981554090976715, + "rewards/code_reward": 0.6971207037568092, + "rewards/format_reward": 1.5, + "step": 3607 + }, + { + "clip_ratio": 0.003326795296743512, + "epoch": 0.13462058672636537, + "grad_norm": 0.07792592793703079, + "kl": 0.016632080078125, + "learning_rate": 2.7086439039188655e-06, + "loss": 0.0037, + "step": 3608 + }, + { + "clip_ratio": 0.003240978461690247, + "epoch": 0.13465789841891704, + "grad_norm": 0.08112721145153046, + "kl": 0.016845703125, + "learning_rate": 2.706358164131544e-06, + "loss": 0.0039, + "step": 3609 + }, + { + "clip_ratio": 0.0034081694902852178, + "completion_length": 595.8928756713867, + "epoch": 0.13469521011146868, + "grad_norm": 0.05567813292145729, + "kl": 0.018768310546875, + "learning_rate": 2.7040735965044416e-06, + "loss": 0.0077, + "num_tokens": 84622443.0, + "reward": 0.5988795608282089, + "reward_std": 0.10741518175927922, + "rewards/code_reward": 0.4488795476499945, + "rewards/format_reward": 1.5, + "step": 3610 + }, + { + "clip_ratio": 0.003206342051271349, + "epoch": 0.13473252180402034, + "grad_norm": 0.049137748777866364, + "kl": 0.0184326171875, + "learning_rate": 2.701790201996124e-06, + "loss": 0.0076, + "step": 3611 + }, + { + "clip_ratio": 0.003141351160593331, + "epoch": 0.13476983349657198, + "grad_norm": 0.05677592009305954, + "kl": 0.0183868408203125, + "learning_rate": 2.6995079815646574e-06, + "loss": 0.0076, + "step": 3612 + }, + { + "clip_ratio": 0.004125041363295168, + "completion_length": 722.6607513427734, + "epoch": 0.13480714518912365, + "grad_norm": 0.1848246306180954, + "kl": 0.0164642333984375, + "learning_rate": 2.6972269361676207e-06, + "loss": 0.0253, + "num_tokens": 84689596.0, + "reward": 0.7042113468050957, + "reward_std": 0.24055491667240858, + "rewards/code_reward": 0.5568898889468983, + "rewards/format_reward": 1.4732142984867096, + "step": 3613 + }, + { + "clip_ratio": 0.0036794483312405646, + "epoch": 0.1348444568816753, + "grad_norm": 0.07292698323726654, + "kl": 0.0168304443359375, + "learning_rate": 2.6949470667621e-06, + "loss": 0.0251, + "step": 3614 + }, + { + "clip_ratio": 0.0035567774903029203, + "epoch": 0.13488176857422696, + "grad_norm": 0.08360914885997772, + "kl": 0.0168609619140625, + "learning_rate": 2.692668374304684e-06, + "loss": 0.0248, + "step": 3615 + }, + { + "clip_ratio": 0.005623518256470561, + "completion_length": 710.482177734375, + "epoch": 0.1349190802667786, + "grad_norm": 0.09842535108327866, + "kl": 0.02813720703125, + "learning_rate": 2.6903908597514695e-06, + "loss": 0.0057, + "num_tokens": 84775765.0, + "reward": 0.32011639699339867, + "reward_std": 0.10451779421418905, + "rewards/code_reward": 0.170116376131773, + "rewards/format_reward": 1.5, + "step": 3616 + }, + { + "clip_ratio": 0.005061408795882016, + "epoch": 0.13495639195933026, + "grad_norm": 0.0729314312338829, + "kl": 0.028106689453125, + "learning_rate": 2.688114524058061e-06, + "loss": 0.0057, + "step": 3617 + }, + { + "clip_ratio": 0.005015965085476637, + "epoch": 0.1349937036518819, + "grad_norm": 0.07386484742164612, + "kl": 0.02838134765625, + "learning_rate": 2.685839368179565e-06, + "loss": 0.0054, + "step": 3618 + }, + { + "clip_ratio": 0.002923675870988518, + "completion_length": 672.8035888671875, + "epoch": 0.13503101534443357, + "grad_norm": 0.06477141380310059, + "kl": 0.01806640625, + "learning_rate": 2.6835653930705974e-06, + "loss": 0.0056, + "num_tokens": 84848008.0, + "reward": 0.6619047522544861, + "reward_std": 0.31513702124357224, + "rewards/code_reward": 0.511904776096344, + "rewards/format_reward": 1.5, + "step": 3619 + }, + { + "clip_ratio": 0.0029837460606358945, + "epoch": 0.1350683270369852, + "grad_norm": 0.06686551868915558, + "kl": 0.017486572265625, + "learning_rate": 2.6812925996852717e-06, + "loss": 0.0055, + "step": 3620 + }, + { + "clip_ratio": 0.0029696222627535462, + "epoch": 0.13510563872953688, + "grad_norm": 0.06349237263202667, + "kl": 0.01727294921875, + "learning_rate": 2.679020988977213e-06, + "loss": 0.0056, + "step": 3621 + }, + { + "clip_ratio": 0.002936585573479533, + "completion_length": 707.982177734375, + "epoch": 0.13514295042208851, + "grad_norm": 0.06995640695095062, + "kl": 0.02215576171875, + "learning_rate": 2.676750561899547e-06, + "loss": 0.0162, + "num_tokens": 84920051.0, + "reward": 0.85491131991148, + "reward_std": 0.1574852094054222, + "rewards/code_reward": 0.7049112990498543, + "rewards/format_reward": 1.5, + "step": 3622 + }, + { + "clip_ratio": 0.0031073709833435714, + "epoch": 0.13518026211464018, + "grad_norm": 0.07223553210496902, + "kl": 0.021881103515625, + "learning_rate": 2.6744813194049e-06, + "loss": 0.0164, + "step": 3623 + }, + { + "clip_ratio": 0.0028291670605540276, + "epoch": 0.13521757380719182, + "grad_norm": 0.07274381071329117, + "kl": 0.021575927734375, + "learning_rate": 2.672213262445407e-06, + "loss": 0.0162, + "step": 3624 + }, + { + "clip_ratio": 0.002196493500377983, + "completion_length": 592.1071701049805, + "epoch": 0.1352548854997435, + "grad_norm": 0.06188764423131943, + "kl": 0.0258026123046875, + "learning_rate": 2.6699463919727e-06, + "loss": -0.0056, + "num_tokens": 84978661.0, + "reward": 0.9695684909820557, + "reward_std": 0.12850817386060953, + "rewards/code_reward": 0.8195684477686882, + "rewards/format_reward": 1.5, + "step": 3625 + }, + { + "clip_ratio": 0.0022818653378635645, + "epoch": 0.13529219719229513, + "grad_norm": 0.061966996639966965, + "kl": 0.0270843505859375, + "learning_rate": 2.6676807089379177e-06, + "loss": -0.0056, + "step": 3626 + }, + { + "clip_ratio": 0.0021545723429881036, + "epoch": 0.1353295088848468, + "grad_norm": 0.059368643909692764, + "kl": 0.025543212890625, + "learning_rate": 2.6654162142916983e-06, + "loss": -0.0059, + "step": 3627 + }, + { + "clip_ratio": 0.004891498887445778, + "completion_length": 630.1428833007812, + "epoch": 0.13536682057739843, + "grad_norm": 0.17710331082344055, + "kl": 0.06646728515625, + "learning_rate": 2.663152908984181e-06, + "loss": 0.0035, + "num_tokens": 85055235.0, + "reward": 0.3901258036494255, + "reward_std": 0.28121381998062134, + "rewards/code_reward": 0.24012579582631588, + "rewards/format_reward": 1.5, + "step": 3628 + }, + { + "clip_ratio": 0.005252545262919739, + "epoch": 0.1354041322699501, + "grad_norm": 0.09622170031070709, + "kl": 0.035888671875, + "learning_rate": 2.6608907939650087e-06, + "loss": 0.0036, + "step": 3629 + }, + { + "clip_ratio": 0.004866912378929555, + "epoch": 0.13544144396250174, + "grad_norm": 0.10906222462654114, + "kl": 0.035003662109375, + "learning_rate": 2.658629870183319e-06, + "loss": 0.003, + "step": 3630 + }, + { + "clip_ratio": 0.0027812802873086184, + "completion_length": 507.32144927978516, + "epoch": 0.1354787556550534, + "grad_norm": 0.07021030783653259, + "kl": 0.021453857421875, + "learning_rate": 2.656370138587756e-06, + "loss": -0.0119, + "num_tokens": 85106941.0, + "reward": 0.5704329796135426, + "reward_std": 0.09870181954465806, + "rewards/code_reward": 0.4204329935600981, + "rewards/format_reward": 1.5, + "step": 3631 + }, + { + "clip_ratio": 0.003335206536576152, + "epoch": 0.13551606734760505, + "grad_norm": 0.48210811614990234, + "kl": 0.0211334228515625, + "learning_rate": 2.6541116001264623e-06, + "loss": -0.0118, + "step": 3632 + }, + { + "clip_ratio": 0.0026203475426882505, + "epoch": 0.1355533790401567, + "grad_norm": 0.07603713124990463, + "kl": 0.0205230712890625, + "learning_rate": 2.651854255747076e-06, + "loss": -0.0121, + "step": 3633 + }, + { + "clip_ratio": 0.0029033846221864223, + "completion_length": 684.4464569091797, + "epoch": 0.13559069073270835, + "grad_norm": 0.06789729744195938, + "kl": 0.0191192626953125, + "learning_rate": 2.649598106396737e-06, + "loss": -0.0144, + "num_tokens": 85177848.0, + "reward": 0.5837425142526627, + "reward_std": 0.18921833857893944, + "rewards/code_reward": 0.4337424598634243, + "rewards/format_reward": 1.5, + "step": 3634 + }, + { + "clip_ratio": 0.003546376945450902, + "epoch": 0.13562800242526002, + "grad_norm": 0.06664774566888809, + "kl": 0.0200347900390625, + "learning_rate": 2.6473431530220857e-06, + "loss": -0.0144, + "step": 3635 + }, + { + "clip_ratio": 0.003259311168221757, + "epoch": 0.13566531411781166, + "grad_norm": 0.06660384684801102, + "kl": 0.0193023681640625, + "learning_rate": 2.645089396569257e-06, + "loss": -0.0144, + "step": 3636 + }, + { + "clip_ratio": 0.003671048325486481, + "completion_length": 660.0178756713867, + "epoch": 0.13570262581036333, + "grad_norm": 0.07657307386398315, + "kl": 0.03155517578125, + "learning_rate": 2.642836837983888e-06, + "loss": 0.0183, + "num_tokens": 85239337.0, + "reward": 0.6524350568652153, + "reward_std": 0.0577041320502758, + "rewards/code_reward": 0.5024350779131055, + "rewards/format_reward": 1.5, + "step": 3637 + }, + { + "clip_ratio": 0.0034030753886327147, + "epoch": 0.13573993750291496, + "grad_norm": 0.06911080330610275, + "kl": 0.03131103515625, + "learning_rate": 2.6405854782111063e-06, + "loss": 0.0183, + "step": 3638 + }, + { + "clip_ratio": 0.003618087968789041, + "epoch": 0.13577724919546663, + "grad_norm": 0.058099158108234406, + "kl": 0.0374755859375, + "learning_rate": 2.638335318195544e-06, + "loss": 0.0182, + "step": 3639 + }, + { + "clip_ratio": 0.0032822935027070343, + "completion_length": 649.5000305175781, + "epoch": 0.13581456088801827, + "grad_norm": 0.14156007766723633, + "kl": 0.0164337158203125, + "learning_rate": 2.6360863588813246e-06, + "loss": 0.0205, + "num_tokens": 85304521.0, + "reward": 0.7083264701068401, + "reward_std": 0.19012633943930268, + "rewards/code_reward": 0.5583264529705048, + "rewards/format_reward": 1.5, + "step": 3640 + }, + { + "clip_ratio": 0.0026951085310429335, + "epoch": 0.13585187258056994, + "grad_norm": 0.07444627583026886, + "kl": 0.0168304443359375, + "learning_rate": 2.6338386012120698e-06, + "loss": 0.0202, + "step": 3641 + }, + { + "clip_ratio": 0.0031966560636647046, + "epoch": 0.13588918427312158, + "grad_norm": 0.07396117597818375, + "kl": 0.0163726806640625, + "learning_rate": 2.6315920461308963e-06, + "loss": 0.0201, + "step": 3642 + }, + { + "clip_ratio": 0.004560810048133135, + "completion_length": 822.5178985595703, + "epoch": 0.13592649596567324, + "grad_norm": 0.04988201707601547, + "kl": 0.01751708984375, + "learning_rate": 2.6293466945804193e-06, + "loss": 0.0062, + "num_tokens": 85387276.0, + "reward": 0.34692687541246414, + "reward_std": 0.13658401928842068, + "rewards/code_reward": 0.19692684896290302, + "rewards/format_reward": 1.5, + "step": 3643 + }, + { + "clip_ratio": 0.004641529521904886, + "epoch": 0.13596380765822488, + "grad_norm": 0.049354203045368195, + "kl": 0.017242431640625, + "learning_rate": 2.627102547502746e-06, + "loss": 0.0062, + "step": 3644 + }, + { + "clip_ratio": 0.004498885769862682, + "epoch": 0.13600111935077655, + "grad_norm": 0.047338809818029404, + "kl": 0.017425537109375, + "learning_rate": 2.6248596058394765e-06, + "loss": 0.0062, + "step": 3645 + }, + { + "clip_ratio": 0.002607113856356591, + "completion_length": 706.8750457763672, + "epoch": 0.1360384310433282, + "grad_norm": 0.05061594396829605, + "kl": 0.016998291015625, + "learning_rate": 2.6226178705317096e-06, + "loss": 0.0009, + "num_tokens": 85460363.0, + "reward": 0.7313330136239529, + "reward_std": 0.10563255345914513, + "rewards/code_reward": 0.5813330337405205, + "rewards/format_reward": 1.5, + "step": 3646 + }, + { + "clip_ratio": 0.002032938995398581, + "epoch": 0.13607574273587986, + "grad_norm": 0.04986681044101715, + "kl": 0.01647186279296875, + "learning_rate": 2.6203773425200356e-06, + "loss": 0.0007, + "step": 3647 + }, + { + "clip_ratio": 0.0021855293889530003, + "epoch": 0.13611305442843152, + "grad_norm": 0.05097386986017227, + "kl": 0.01628875732421875, + "learning_rate": 2.6181380227445373e-06, + "loss": 0.0006, + "step": 3648 + }, + { + "clip_ratio": 0.0017896354547701776, + "completion_length": 505.60717010498047, + "epoch": 0.13615036612098316, + "grad_norm": 0.06202619522809982, + "kl": 0.0191192626953125, + "learning_rate": 2.615899912144794e-06, + "loss": 0.0019, + "num_tokens": 85525371.0, + "reward": 1.058079406619072, + "reward_std": 0.1742568388581276, + "rewards/code_reward": 0.9080794006586075, + "rewards/format_reward": 1.5, + "step": 3649 + }, + { + "clip_ratio": 0.001717940904200077, + "epoch": 0.13618767781353483, + "grad_norm": 0.05977705493569374, + "kl": 0.0191192626953125, + "learning_rate": 2.6136630116598715e-06, + "loss": 0.0018, + "step": 3650 + }, + { + "clip_ratio": 0.0014208884676918387, + "epoch": 0.13622498950608647, + "grad_norm": 0.05559810623526573, + "kl": 0.019439697265625, + "learning_rate": 2.6114273222283333e-06, + "loss": 0.0016, + "step": 3651 + }, + { + "clip_ratio": 0.002827271819114685, + "completion_length": 863.0000457763672, + "epoch": 0.13626230119863814, + "grad_norm": 0.05374803766608238, + "kl": 0.0147705078125, + "learning_rate": 2.6091928447882364e-06, + "loss": -0.0061, + "num_tokens": 85600537.0, + "reward": 0.49494045972824097, + "reward_std": 0.1948661357164383, + "rewards/code_reward": 0.346726194024086, + "rewards/format_reward": 1.4821428656578064, + "step": 3652 + }, + { + "clip_ratio": 0.002764219243545085, + "epoch": 0.13629961289118978, + "grad_norm": 0.053275853395462036, + "kl": 0.0144500732421875, + "learning_rate": 2.6069595802771218e-06, + "loss": -0.0062, + "step": 3653 + }, + { + "clip_ratio": 0.0027535412227734923, + "epoch": 0.13633692458374144, + "grad_norm": 0.051347848027944565, + "kl": 0.014404296875, + "learning_rate": 2.6047275296320272e-06, + "loss": -0.0063, + "step": 3654 + }, + { + "clip_ratio": 0.004194783337879926, + "completion_length": 575.6607360839844, + "epoch": 0.13637423627629308, + "grad_norm": 0.09110017120838165, + "kl": 0.019439697265625, + "learning_rate": 2.6024966937894824e-06, + "loss": 0.0001, + "num_tokens": 85664918.0, + "reward": 0.44442858174443245, + "reward_std": 0.3064107373356819, + "rewards/code_reward": 0.2944285776466131, + "rewards/format_reward": 1.5, + "step": 3655 + }, + { + "clip_ratio": 0.004661877523176372, + "epoch": 0.13641154796884475, + "grad_norm": 0.0881665050983429, + "kl": 0.01934814453125, + "learning_rate": 2.6002670736854997e-06, + "loss": 0.0001, + "step": 3656 + }, + { + "clip_ratio": 0.004159072530455887, + "epoch": 0.1364488596613964, + "grad_norm": 0.08312903344631195, + "kl": 0.019561767578125, + "learning_rate": 2.5980386702555903e-06, + "loss": -0.0003, + "step": 3657 + }, + { + "clip_ratio": 0.0049897838034667075, + "completion_length": 715.7678833007812, + "epoch": 0.13648617135394805, + "grad_norm": 0.09390350431203842, + "kl": 0.014495849609375, + "learning_rate": 2.595811484434748e-06, + "loss": 0.0124, + "num_tokens": 85735619.0, + "reward": 0.5644701085984707, + "reward_std": 0.2600076012313366, + "rewards/code_reward": 0.41447008727118373, + "rewards/format_reward": 1.5, + "step": 3658 + }, + { + "clip_ratio": 0.005011786881368607, + "epoch": 0.1365234830464997, + "grad_norm": 0.07747439295053482, + "kl": 0.01446533203125, + "learning_rate": 2.59358551715746e-06, + "loss": 0.0121, + "step": 3659 + }, + { + "clip_ratio": 0.004514387284871191, + "epoch": 0.13656079473905136, + "grad_norm": 0.07063641399145126, + "kl": 0.0145263671875, + "learning_rate": 2.5913607693577005e-06, + "loss": 0.0119, + "step": 3660 + }, + { + "clip_ratio": 0.004060321138240397, + "completion_length": 766.9821701049805, + "epoch": 0.136598106431603, + "grad_norm": 0.04489348828792572, + "kl": 0.016693115234375, + "learning_rate": 2.5891372419689322e-06, + "loss": 0.0453, + "num_tokens": 85811848.0, + "reward": 0.41875002533197403, + "reward_std": 0.11647612508386374, + "rewards/code_reward": 0.27142857015132904, + "rewards/format_reward": 1.4732142984867096, + "step": 3661 + }, + { + "clip_ratio": 0.0036060423008166254, + "epoch": 0.13663541812415467, + "grad_norm": 0.045308757573366165, + "kl": 0.0172119140625, + "learning_rate": 2.5869149359241077e-06, + "loss": 0.0452, + "step": 3662 + }, + { + "clip_ratio": 0.0031990021816454828, + "epoch": 0.1366727298167063, + "grad_norm": 0.043571747839450836, + "kl": 0.016510009765625, + "learning_rate": 2.584693852155662e-06, + "loss": 0.0451, + "step": 3663 + }, + { + "clip_ratio": 0.0027234439039602876, + "completion_length": 623.3214492797852, + "epoch": 0.13671004150925797, + "grad_norm": 0.02520930767059326, + "kl": 0.0182037353515625, + "learning_rate": 2.582473991595523e-06, + "loss": -0.0019, + "num_tokens": 85872514.0, + "reward": 0.5334865674376488, + "reward_std": 0.021013591438531876, + "rewards/code_reward": 0.3834865693934262, + "rewards/format_reward": 1.5, + "step": 3664 + }, + { + "clip_ratio": 0.0025730925262905657, + "epoch": 0.1367473532018096, + "grad_norm": 0.026042265817523003, + "kl": 0.0179595947265625, + "learning_rate": 2.580255355175104e-06, + "loss": -0.0019, + "step": 3665 + }, + { + "clip_ratio": 0.002448912477120757, + "epoch": 0.13678466489436128, + "grad_norm": 0.024910131469368935, + "kl": 0.0180511474609375, + "learning_rate": 2.5780379438253004e-06, + "loss": -0.0019, + "step": 3666 + }, + { + "clip_ratio": 0.0031130541465245187, + "completion_length": 518.0893020629883, + "epoch": 0.13682197658691292, + "grad_norm": 0.07236882299184799, + "kl": 0.016815185546875, + "learning_rate": 2.5758217584765e-06, + "loss": 0.004, + "num_tokens": 85929779.0, + "reward": 0.764601968228817, + "reward_std": 0.1471609827131033, + "rewards/code_reward": 0.6146019399166107, + "rewards/format_reward": 1.5, + "step": 3667 + }, + { + "clip_ratio": 0.0031899893656373024, + "epoch": 0.13685928827946459, + "grad_norm": 0.10484723001718521, + "kl": 0.0166473388671875, + "learning_rate": 2.573606800058568e-06, + "loss": 0.0037, + "step": 3668 + }, + { + "clip_ratio": 0.002927478461060673, + "epoch": 0.13689659997201623, + "grad_norm": 0.14141002297401428, + "kl": 0.0167083740234375, + "learning_rate": 2.5713930695008656e-06, + "loss": 0.0035, + "step": 3669 + }, + { + "clip_ratio": 0.0036581263411790133, + "completion_length": 628.6964492797852, + "epoch": 0.1369339116645679, + "grad_norm": 0.06824374198913574, + "kl": 0.02099609375, + "learning_rate": 2.569180567732232e-06, + "loss": 0.0576, + "num_tokens": 85998068.0, + "reward": 0.6301942989230156, + "reward_std": 0.187460795044899, + "rewards/code_reward": 0.48287286423146725, + "rewards/format_reward": 1.4732142984867096, + "step": 3670 + }, + { + "clip_ratio": 0.003886517835780978, + "epoch": 0.13697122335711953, + "grad_norm": 0.06498869508504868, + "kl": 0.020751953125, + "learning_rate": 2.566969295680989e-06, + "loss": 0.0576, + "step": 3671 + }, + { + "clip_ratio": 0.003425983479246497, + "epoch": 0.1370085350496712, + "grad_norm": 0.06851157546043396, + "kl": 0.021026611328125, + "learning_rate": 2.564759254274948e-06, + "loss": 0.0572, + "step": 3672 + }, + { + "clip_ratio": 0.0028016277065034956, + "completion_length": 742.035758972168, + "epoch": 0.13704584674222284, + "grad_norm": 0.07749434560537338, + "kl": 0.0229034423828125, + "learning_rate": 2.562550444441398e-06, + "loss": -0.0215, + "num_tokens": 86062386.0, + "reward": 0.7177346050739288, + "reward_std": 0.2123325653374195, + "rewards/code_reward": 0.5677345953881741, + "rewards/format_reward": 1.5, + "step": 3673 + }, + { + "clip_ratio": 0.0024498725251760334, + "epoch": 0.1370831584347745, + "grad_norm": 0.06037705019116402, + "kl": 0.019500732421875, + "learning_rate": 2.5603428671071175e-06, + "loss": -0.0218, + "step": 3674 + }, + { + "clip_ratio": 0.002665566105861217, + "epoch": 0.13712047012732614, + "grad_norm": 0.08276189118623734, + "kl": 0.01944732666015625, + "learning_rate": 2.558136523198365e-06, + "loss": -0.0216, + "step": 3675 + }, + { + "clip_ratio": 0.003232435090467334, + "completion_length": 651.232177734375, + "epoch": 0.1371577818198778, + "grad_norm": 0.0799732580780983, + "kl": 0.01348876953125, + "learning_rate": 2.55593141364088e-06, + "loss": 0.008, + "num_tokens": 86130635.0, + "reward": 0.8760869652032852, + "reward_std": 0.22870393097400665, + "rewards/code_reward": 0.7260869517922401, + "rewards/format_reward": 1.5, + "step": 3676 + }, + { + "clip_ratio": 0.00357487186556682, + "epoch": 0.13719509351242945, + "grad_norm": 0.08610161393880844, + "kl": 0.013702392578125, + "learning_rate": 2.553727539359886e-06, + "loss": 0.0079, + "step": 3677 + }, + { + "clip_ratio": 0.0029787044622935355, + "epoch": 0.13723240520498112, + "grad_norm": 0.07851425558328629, + "kl": 0.013641357421875, + "learning_rate": 2.5515249012800868e-06, + "loss": 0.0077, + "step": 3678 + }, + { + "clip_ratio": 0.0040300574619323015, + "completion_length": 649.1428833007812, + "epoch": 0.13726971689753276, + "grad_norm": 0.10693997889757156, + "kl": 0.01666259765625, + "learning_rate": 2.54932350032567e-06, + "loss": -0.0141, + "num_tokens": 86198305.0, + "reward": 0.5827511698007584, + "reward_std": 0.1812298847362399, + "rewards/code_reward": 0.4327511293813586, + "rewards/format_reward": 1.5, + "step": 3679 + }, + { + "clip_ratio": 0.003571757930330932, + "epoch": 0.13730702859008442, + "grad_norm": 0.09848012775182724, + "kl": 0.0166778564453125, + "learning_rate": 2.5471233374203037e-06, + "loss": -0.0141, + "step": 3680 + }, + { + "clip_ratio": 0.004235006170347333, + "epoch": 0.13734434028263606, + "grad_norm": 0.09025132656097412, + "kl": 0.01611328125, + "learning_rate": 2.544924413487131e-06, + "loss": -0.0143, + "step": 3681 + }, + { + "clip_ratio": 0.004822748829610646, + "completion_length": 633.482177734375, + "epoch": 0.13738165197518773, + "grad_norm": 0.05076567828655243, + "kl": 0.020111083984375, + "learning_rate": 2.5427267294487836e-06, + "loss": 0.0056, + "num_tokens": 86260212.0, + "reward": 0.39834053441882133, + "reward_std": 0.046663492918014526, + "rewards/code_reward": 0.24834049819037318, + "rewards/format_reward": 1.5, + "step": 3682 + }, + { + "clip_ratio": 0.004843089525820687, + "epoch": 0.13741896366773937, + "grad_norm": 0.049708228558301926, + "kl": 0.021087646484375, + "learning_rate": 2.5405302862273682e-06, + "loss": 0.0057, + "step": 3683 + }, + { + "clip_ratio": 0.0049142984207719564, + "epoch": 0.13745627536029104, + "grad_norm": 0.05147179588675499, + "kl": 0.02069091796875, + "learning_rate": 2.53833508474447e-06, + "loss": 0.0055, + "step": 3684 + }, + { + "clip_ratio": 0.0019305668829474598, + "completion_length": 702.7857360839844, + "epoch": 0.13749358705284268, + "grad_norm": 0.0600927472114563, + "kl": 0.021209716796875, + "learning_rate": 2.5361411259211565e-06, + "loss": -0.0021, + "num_tokens": 86331502.0, + "reward": 0.8450723402202129, + "reward_std": 0.14296654611825943, + "rewards/code_reward": 0.6950723230838776, + "rewards/format_reward": 1.5, + "step": 3685 + }, + { + "clip_ratio": 0.002546560688642785, + "epoch": 0.13753089874539434, + "grad_norm": 0.060947902500629425, + "kl": 0.0223236083984375, + "learning_rate": 2.5339484106779678e-06, + "loss": -0.002, + "step": 3686 + }, + { + "clip_ratio": 0.002452517976053059, + "epoch": 0.13756821043794598, + "grad_norm": 0.05671258270740509, + "kl": 0.0223388671875, + "learning_rate": 2.531756939934932e-06, + "loss": -0.0022, + "step": 3687 + }, + { + "clip_ratio": 0.003987648175098002, + "completion_length": 757.9286193847656, + "epoch": 0.13760552213049765, + "grad_norm": 0.06368421018123627, + "kl": 0.019744873046875, + "learning_rate": 2.529566714611549e-06, + "loss": -0.0014, + "num_tokens": 86410500.0, + "reward": 0.6243277005851269, + "reward_std": 0.061522409319877625, + "rewards/code_reward": 0.4743276685476303, + "rewards/format_reward": 1.5, + "step": 3688 + }, + { + "clip_ratio": 0.0035783814964815974, + "epoch": 0.1376428338230493, + "grad_norm": 0.06454025208950043, + "kl": 0.01971435546875, + "learning_rate": 2.5273777356267933e-06, + "loss": -0.0015, + "step": 3689 + }, + { + "clip_ratio": 0.003717927262187004, + "epoch": 0.13768014551560095, + "grad_norm": 0.06381350010633469, + "kl": 0.0197296142578125, + "learning_rate": 2.5251900038991228e-06, + "loss": -0.0016, + "step": 3690 + }, + { + "clip_ratio": 0.003398414934054017, + "completion_length": 758.8928833007812, + "epoch": 0.1377174572081526, + "grad_norm": 0.072980597615242, + "kl": 0.0203094482421875, + "learning_rate": 2.5230035203464664e-06, + "loss": -0.0059, + "num_tokens": 86484372.0, + "reward": 0.6037264764308929, + "reward_std": 0.1873280517756939, + "rewards/code_reward": 0.4537264611572027, + "rewards/format_reward": 1.5, + "step": 3691 + }, + { + "clip_ratio": 0.0033382315887138247, + "epoch": 0.13775476890070426, + "grad_norm": 0.07338722795248032, + "kl": 0.0201568603515625, + "learning_rate": 2.5208182858862334e-06, + "loss": -0.0059, + "step": 3692 + }, + { + "clip_ratio": 0.0032192791113629937, + "epoch": 0.1377920805932559, + "grad_norm": 0.07467687129974365, + "kl": 0.0205535888671875, + "learning_rate": 2.51863430143531e-06, + "loss": -0.0061, + "step": 3693 + }, + { + "clip_ratio": 0.003982852533226833, + "completion_length": 646.2321624755859, + "epoch": 0.13782939228580757, + "grad_norm": 0.09061095863580704, + "kl": 0.0204925537109375, + "learning_rate": 2.516451567910051e-06, + "loss": 0.0056, + "num_tokens": 86552747.0, + "reward": 0.7736203223466873, + "reward_std": 0.18076338386163116, + "rewards/code_reward": 0.623620331287384, + "rewards/format_reward": 1.5, + "step": 3694 + }, + { + "clip_ratio": 0.0037623876123689115, + "epoch": 0.1378667039783592, + "grad_norm": 0.08912339061498642, + "kl": 0.0206298828125, + "learning_rate": 2.5142700862262924e-06, + "loss": 0.0057, + "step": 3695 + }, + { + "clip_ratio": 0.0038302132161334157, + "epoch": 0.13790401567091087, + "grad_norm": 0.08439730107784271, + "kl": 0.0207061767578125, + "learning_rate": 2.512089857299345e-06, + "loss": 0.0056, + "step": 3696 + }, + { + "clip_ratio": 0.0024355824571102858, + "completion_length": 579.3571853637695, + "epoch": 0.1379413273634625, + "grad_norm": 0.0754682868719101, + "kl": 0.02685546875, + "learning_rate": 2.509910882043991e-06, + "loss": -0.0057, + "num_tokens": 86608403.0, + "reward": 0.9676020443439484, + "reward_std": 0.29932140558958054, + "rewards/code_reward": 0.8176020532846451, + "rewards/format_reward": 1.5, + "step": 3697 + }, + { + "clip_ratio": 0.002621069026645273, + "epoch": 0.13797863905601418, + "grad_norm": 0.07555479556322098, + "kl": 0.02691650390625, + "learning_rate": 2.5077331613744894e-06, + "loss": -0.0058, + "step": 3698 + }, + { + "clip_ratio": 0.0023408369161188602, + "epoch": 0.13801595074856582, + "grad_norm": 0.07384031265974045, + "kl": 0.027130126953125, + "learning_rate": 2.5055566962045684e-06, + "loss": -0.006, + "step": 3699 + }, + { + "clip_ratio": 0.0028468641685321927, + "completion_length": 646.7500305175781, + "epoch": 0.13805326244111749, + "grad_norm": 0.04810914397239685, + "kl": 0.025604248046875, + "learning_rate": 2.503381487447436e-06, + "loss": 0.0073, + "num_tokens": 86674343.0, + "reward": 0.6821253560483456, + "reward_std": 0.06500019878149033, + "rewards/code_reward": 0.5321253500878811, + "rewards/format_reward": 1.5, + "step": 3700 + }, + { + "clip_ratio": 0.0028102241922169924, + "epoch": 0.13809057413366913, + "grad_norm": 0.05365170165896416, + "kl": 0.02459716796875, + "learning_rate": 2.5012075360157668e-06, + "loss": 0.0074, + "step": 3701 + }, + { + "clip_ratio": 0.0027309523429721594, + "epoch": 0.1381278858262208, + "grad_norm": 0.04548284411430359, + "kl": 0.02398681640625, + "learning_rate": 2.4990348428217106e-06, + "loss": 0.0072, + "step": 3702 + }, + { + "clip_ratio": 0.0032176561653614044, + "completion_length": 526.9286041259766, + "epoch": 0.13816519751877246, + "grad_norm": 0.09237069636583328, + "kl": 0.02862548828125, + "learning_rate": 2.4968634087768897e-06, + "loss": -0.0133, + "num_tokens": 86733713.0, + "reward": 0.9585565626621246, + "reward_std": 0.24617700278759003, + "rewards/code_reward": 0.8085565343499184, + "rewards/format_reward": 1.5, + "step": 3703 + }, + { + "clip_ratio": 0.0032558991806581616, + "epoch": 0.1382025092113241, + "grad_norm": 0.09073960036039352, + "kl": 0.026824951171875, + "learning_rate": 2.494693234792398e-06, + "loss": -0.0135, + "step": 3704 + }, + { + "clip_ratio": 0.0031563856173306704, + "epoch": 0.13823982090387577, + "grad_norm": 0.09018471837043762, + "kl": 0.02734375, + "learning_rate": 2.4925243217788005e-06, + "loss": -0.0137, + "step": 3705 + }, + { + "clip_ratio": 0.004574928607326001, + "completion_length": 701.0000457763672, + "epoch": 0.1382771325964274, + "grad_norm": 0.06654893606901169, + "kl": 0.021453857421875, + "learning_rate": 2.4903566706461303e-06, + "loss": 0.0422, + "num_tokens": 86808543.0, + "reward": 0.462797649204731, + "reward_std": 0.1494958782568574, + "rewards/code_reward": 0.3154762014746666, + "rewards/format_reward": 1.4732142984867096, + "step": 3706 + }, + { + "clip_ratio": 0.004408457083627582, + "epoch": 0.13831444428897907, + "grad_norm": 0.05896024405956268, + "kl": 0.0208282470703125, + "learning_rate": 2.4881902823038954e-06, + "loss": 0.0423, + "step": 3707 + }, + { + "clip_ratio": 0.004542519338428974, + "epoch": 0.1383517559815307, + "grad_norm": 0.06641047447919846, + "kl": 0.020599365234375, + "learning_rate": 2.4860251576610737e-06, + "loss": 0.0422, + "step": 3708 + }, + { + "clip_ratio": 0.002973677939735353, + "completion_length": 712.2678833007812, + "epoch": 0.13838906767408238, + "grad_norm": 0.07595048099756241, + "kl": 0.0164031982421875, + "learning_rate": 2.483861297626108e-06, + "loss": -0.0099, + "num_tokens": 86882548.0, + "reward": 0.6030424498021603, + "reward_std": 0.22963805682957172, + "rewards/code_reward": 0.45304241264238954, + "rewards/format_reward": 1.5, + "step": 3709 + }, + { + "clip_ratio": 0.003175445832312107, + "epoch": 0.13842637936663402, + "grad_norm": 0.07356392592191696, + "kl": 0.01678466796875, + "learning_rate": 2.4816987031069163e-06, + "loss": -0.0098, + "step": 3710 + }, + { + "clip_ratio": 0.003326202800963074, + "epoch": 0.13846369105918568, + "grad_norm": 0.07287082076072693, + "kl": 0.016632080078125, + "learning_rate": 2.4795373750108813e-06, + "loss": -0.0098, + "step": 3711 + }, + { + "clip_ratio": 0.003325973404571414, + "completion_length": 578.1964569091797, + "epoch": 0.13850100275173732, + "grad_norm": 0.07602003216743469, + "kl": 0.018035888671875, + "learning_rate": 2.4773773142448568e-06, + "loss": 0.0138, + "num_tokens": 86938111.0, + "reward": 0.6313441880047321, + "reward_std": 0.1501044314354658, + "rewards/code_reward": 0.48134420439600945, + "rewards/format_reward": 1.5, + "step": 3712 + }, + { + "clip_ratio": 0.00292623761924915, + "epoch": 0.138538314444289, + "grad_norm": 0.08326305449008942, + "kl": 0.017730712890625, + "learning_rate": 2.475218521715164e-06, + "loss": 0.0137, + "step": 3713 + }, + { + "clip_ratio": 0.0029792555142194033, + "epoch": 0.13857562613684063, + "grad_norm": 0.06993255019187927, + "kl": 0.0180816650390625, + "learning_rate": 2.4730609983275924e-06, + "loss": 0.0136, + "step": 3714 + }, + { + "clip_ratio": 0.003136372717563063, + "completion_length": 704.553596496582, + "epoch": 0.1386129378293923, + "grad_norm": 0.08046768605709076, + "kl": 0.0191802978515625, + "learning_rate": 2.470904744987399e-06, + "loss": 0.0037, + "num_tokens": 87010858.0, + "reward": 0.7269230894744396, + "reward_std": 0.12220939621329308, + "rewards/code_reward": 0.5769230797886848, + "rewards/format_reward": 1.5, + "step": 3715 + }, + { + "clip_ratio": 0.003020562930032611, + "epoch": 0.13865024952194394, + "grad_norm": 0.07115000486373901, + "kl": 0.0193634033203125, + "learning_rate": 2.468749762599309e-06, + "loss": 0.0037, + "step": 3716 + }, + { + "clip_ratio": 0.003157034399919212, + "epoch": 0.1386875612144956, + "grad_norm": 0.06633812189102173, + "kl": 0.019805908203125, + "learning_rate": 2.4665960520675093e-06, + "loss": 0.0036, + "step": 3717 + }, + { + "clip_ratio": 0.0035826709354296327, + "completion_length": 601.5536041259766, + "epoch": 0.13872487290704724, + "grad_norm": 0.08520348370075226, + "kl": 0.0192108154296875, + "learning_rate": 2.464443614295661e-06, + "loss": 0.0085, + "num_tokens": 87080861.0, + "reward": 0.8422807566821575, + "reward_std": 0.0255051429849118, + "rewards/code_reward": 0.6922807693481445, + "rewards/format_reward": 1.5, + "step": 3718 + }, + { + "clip_ratio": 0.00361569388769567, + "epoch": 0.1387621845995989, + "grad_norm": 0.08764816075563431, + "kl": 0.018707275390625, + "learning_rate": 2.4622924501868835e-06, + "loss": 0.0084, + "step": 3719 + }, + { + "clip_ratio": 0.0036194262793287635, + "epoch": 0.13879949629215055, + "grad_norm": 0.06264755874872208, + "kl": 0.0184173583984375, + "learning_rate": 2.460142560643767e-06, + "loss": 0.0082, + "step": 3720 + }, + { + "clip_ratio": 0.0035561880213208497, + "completion_length": 643.6964721679688, + "epoch": 0.13883680798470222, + "grad_norm": 0.07407388836145401, + "kl": 0.026275634765625, + "learning_rate": 2.457993946568364e-06, + "loss": -0.0306, + "num_tokens": 87146336.0, + "reward": 0.6748802624642849, + "reward_std": 0.14197126775979996, + "rewards/code_reward": 0.5275588035583496, + "rewards/format_reward": 1.4732142984867096, + "step": 3721 + }, + { + "clip_ratio": 0.0035535998176783323, + "epoch": 0.13887411967725385, + "grad_norm": 0.0779465064406395, + "kl": 0.0263671875, + "learning_rate": 2.4558466088621943e-06, + "loss": -0.0306, + "step": 3722 + }, + { + "clip_ratio": 0.003625290119089186, + "epoch": 0.13891143136980552, + "grad_norm": 0.07504525035619736, + "kl": 0.0261383056640625, + "learning_rate": 2.4537005484262424e-06, + "loss": -0.0304, + "step": 3723 + }, + { + "clip_ratio": 0.004003097536042333, + "completion_length": 772.3393096923828, + "epoch": 0.13894874306235716, + "grad_norm": 0.05690696835517883, + "kl": 0.0193634033203125, + "learning_rate": 2.451555766160952e-06, + "loss": -0.0047, + "num_tokens": 87216691.0, + "reward": 0.4196428656578064, + "reward_std": 0.16275831125676632, + "rewards/code_reward": 0.26964285236317664, + "rewards/format_reward": 1.5, + "step": 3724 + }, + { + "clip_ratio": 0.0032981830299831927, + "epoch": 0.13898605475490883, + "grad_norm": 0.05728308856487274, + "kl": 0.0193939208984375, + "learning_rate": 2.4494122629662355e-06, + "loss": -0.0049, + "step": 3725 + }, + { + "clip_ratio": 0.003766013018321246, + "epoch": 0.13902336644746047, + "grad_norm": 0.05670643597841263, + "kl": 0.019683837890625, + "learning_rate": 2.447270039741469e-06, + "loss": -0.0048, + "step": 3726 + }, + { + "clip_ratio": 0.0036524300230666995, + "completion_length": 543.7500152587891, + "epoch": 0.13906067814001213, + "grad_norm": 0.059275899082422256, + "kl": 0.021575927734375, + "learning_rate": 2.445129097385485e-06, + "loss": 0.0127, + "num_tokens": 87279453.0, + "reward": 0.7809524051845074, + "reward_std": 0.20863528549671173, + "rewards/code_reward": 0.630952388048172, + "rewards/format_reward": 1.5, + "step": 3727 + }, + { + "clip_ratio": 0.003553830727469176, + "epoch": 0.13909798983256377, + "grad_norm": 0.062114547938108444, + "kl": 0.0215606689453125, + "learning_rate": 2.4429894367965883e-06, + "loss": 0.0128, + "step": 3728 + }, + { + "clip_ratio": 0.0035646797623485327, + "epoch": 0.13913530152511544, + "grad_norm": 0.06181694567203522, + "kl": 0.021453857421875, + "learning_rate": 2.440851058872536e-06, + "loss": 0.0126, + "step": 3729 + }, + { + "clip_ratio": 0.0038970786845311522, + "completion_length": 600.3928756713867, + "epoch": 0.13917261321766708, + "grad_norm": 0.08288724720478058, + "kl": 0.02392578125, + "learning_rate": 2.4387139645105545e-06, + "loss": 0.0397, + "num_tokens": 87356195.0, + "reward": 0.6988670527935028, + "reward_std": 0.3960409536957741, + "rewards/code_reward": 0.5515456087887287, + "rewards/format_reward": 1.4732142984867096, + "step": 3730 + }, + { + "clip_ratio": 0.0038852907018736005, + "epoch": 0.13920992491021875, + "grad_norm": 0.08148004114627838, + "kl": 0.023712158203125, + "learning_rate": 2.436578154607328e-06, + "loss": 0.0399, + "step": 3731 + }, + { + "clip_ratio": 0.003890659660100937, + "epoch": 0.13924723660277039, + "grad_norm": 0.0764705091714859, + "kl": 0.023040771484375, + "learning_rate": 2.434443630059003e-06, + "loss": 0.0395, + "step": 3732 + }, + { + "clip_ratio": 0.0035873879387509078, + "completion_length": 634.1964492797852, + "epoch": 0.13928454829532205, + "grad_norm": 0.07310814410448074, + "kl": 0.01462554931640625, + "learning_rate": 2.4323103917611875e-06, + "loss": -0.0062, + "num_tokens": 87414684.0, + "reward": 0.7054348103702068, + "reward_std": 0.32088954746723175, + "rewards/code_reward": 0.5554347783327103, + "rewards/format_reward": 1.5, + "step": 3733 + }, + { + "clip_ratio": 0.003693299164297059, + "epoch": 0.1393218599878737, + "grad_norm": 0.07419713586568832, + "kl": 0.01470947265625, + "learning_rate": 2.430178440608947e-06, + "loss": -0.0063, + "step": 3734 + }, + { + "clip_ratio": 0.003850404522381723, + "epoch": 0.13935917168042536, + "grad_norm": 0.07074322551488876, + "kl": 0.0152435302734375, + "learning_rate": 2.4280477774968084e-06, + "loss": -0.0063, + "step": 3735 + }, + { + "clip_ratio": 0.005018203519284725, + "completion_length": 594.8928756713867, + "epoch": 0.139396483372977, + "grad_norm": 0.07594149559736252, + "kl": 0.020172119140625, + "learning_rate": 2.4259184033187604e-06, + "loss": -0.0014, + "num_tokens": 87478336.0, + "reward": 0.4063025414943695, + "reward_std": 0.10227619111537933, + "rewards/code_reward": 0.25630251690745354, + "rewards/format_reward": 1.5, + "step": 3736 + }, + { + "clip_ratio": 0.005232736584730446, + "epoch": 0.13943379506552867, + "grad_norm": 0.07093417644500732, + "kl": 0.020263671875, + "learning_rate": 2.4237903189682466e-06, + "loss": -0.0013, + "step": 3737 + }, + { + "clip_ratio": 0.005491426563821733, + "epoch": 0.1394711067580803, + "grad_norm": 0.06214594468474388, + "kl": 0.01995849609375, + "learning_rate": 2.421663525338172e-06, + "loss": -0.0016, + "step": 3738 + }, + { + "clip_ratio": 0.0030540915904566646, + "completion_length": 805.8929138183594, + "epoch": 0.13950841845063197, + "grad_norm": 0.041789665818214417, + "kl": 0.017730712890625, + "learning_rate": 2.419538023320901e-06, + "loss": 0.0023, + "num_tokens": 87553160.0, + "reward": 0.5440127588808537, + "reward_std": 0.0916517460718751, + "rewards/code_reward": 0.39401271753013134, + "rewards/format_reward": 1.5, + "step": 3739 + }, + { + "clip_ratio": 0.0035296431742608547, + "epoch": 0.1395457301431836, + "grad_norm": 0.04207374155521393, + "kl": 0.018829345703125, + "learning_rate": 2.4174138138082536e-06, + "loss": 0.0022, + "step": 3740 + }, + { + "clip_ratio": 0.003225556341931224, + "epoch": 0.13958304183573528, + "grad_norm": 0.04192071780562401, + "kl": 0.017425537109375, + "learning_rate": 2.415290897691511e-06, + "loss": 0.0022, + "step": 3741 + }, + { + "clip_ratio": 0.004162753641139716, + "completion_length": 805.6786193847656, + "epoch": 0.13962035352828692, + "grad_norm": 0.09281422197818756, + "kl": 0.019866943359375, + "learning_rate": 2.4131692758614055e-06, + "loss": 0.0101, + "num_tokens": 87642924.0, + "reward": 0.6211612969636917, + "reward_std": 0.3574531748890877, + "rewards/code_reward": 0.4711612854152918, + "rewards/format_reward": 1.5, + "step": 3742 + }, + { + "clip_ratio": 0.004158424912020564, + "epoch": 0.13965766522083858, + "grad_norm": 0.08358944952487946, + "kl": 0.019439697265625, + "learning_rate": 2.4110489492081324e-06, + "loss": 0.0099, + "step": 3743 + }, + { + "clip_ratio": 0.004045524401590228, + "epoch": 0.13969497691339022, + "grad_norm": 0.08031395077705383, + "kl": 0.0196380615234375, + "learning_rate": 2.4089299186213425e-06, + "loss": 0.0096, + "step": 3744 + }, + { + "clip_ratio": 0.0033489534398540854, + "completion_length": 580.803596496582, + "epoch": 0.1397322886059419, + "grad_norm": 0.08395101875066757, + "kl": 0.019317626953125, + "learning_rate": 2.406812184990139e-06, + "loss": 0.0029, + "num_tokens": 87701587.0, + "reward": 0.41513318195939064, + "reward_std": 0.2455899640917778, + "rewards/code_reward": 0.26513317599892616, + "rewards/format_reward": 1.5, + "step": 3745 + }, + { + "clip_ratio": 0.0030440265545621514, + "epoch": 0.13976960029849353, + "grad_norm": 0.13524049520492554, + "kl": 0.01947021484375, + "learning_rate": 2.404695749203086e-06, + "loss": 0.0029, + "step": 3746 + }, + { + "clip_ratio": 0.0032670843065716326, + "epoch": 0.1398069119910452, + "grad_norm": 0.05711490288376808, + "kl": 0.018890380859375, + "learning_rate": 2.402580612148199e-06, + "loss": 0.0027, + "step": 3747 + }, + { + "clip_ratio": 0.0020338864997029305, + "completion_length": 684.107177734375, + "epoch": 0.13984422368359684, + "grad_norm": 0.12229106575250626, + "kl": 0.0218353271484375, + "learning_rate": 2.4004667747129503e-06, + "loss": 0.0025, + "num_tokens": 87766889.0, + "reward": 0.7974442839622498, + "reward_std": 0.13194452598690987, + "rewards/code_reward": 0.6474443003535271, + "rewards/format_reward": 1.5, + "step": 3748 + }, + { + "clip_ratio": 0.001845705322921276, + "epoch": 0.1398815353761485, + "grad_norm": 0.10607393831014633, + "kl": 0.021759033203125, + "learning_rate": 2.398354237784267e-06, + "loss": 0.0024, + "step": 3749 + }, + { + "clip_ratio": 0.0020594343659467995, + "epoch": 0.13991884706870014, + "grad_norm": 0.06499645113945007, + "kl": 0.0217132568359375, + "learning_rate": 2.396243002248531e-06, + "loss": 0.0023, + "step": 3750 + }, + { + "clip_ratio": 0.004254895029589534, + "completion_length": 657.1964569091797, + "epoch": 0.1399561587612518, + "grad_norm": 0.08539664000272751, + "kl": 0.0321197509765625, + "learning_rate": 2.394133068991579e-06, + "loss": -0.0078, + "num_tokens": 87845258.0, + "reward": 0.5596120096743107, + "reward_std": 0.29566051810979843, + "rewards/code_reward": 0.4096120111644268, + "rewards/format_reward": 1.5, + "step": 3751 + }, + { + "clip_ratio": 0.004489297745749354, + "epoch": 0.13999347045380345, + "grad_norm": 0.09625202417373657, + "kl": 0.0326385498046875, + "learning_rate": 2.3920244388986957e-06, + "loss": -0.0077, + "step": 3752 + }, + { + "clip_ratio": 0.004054405610077083, + "epoch": 0.14003078214635512, + "grad_norm": 0.1310921162366867, + "kl": 0.032318115234375, + "learning_rate": 2.3899171128546258e-06, + "loss": -0.0079, + "step": 3753 + }, + { + "clip_ratio": 0.0031912093400023878, + "completion_length": 558.6964416503906, + "epoch": 0.14006809383890675, + "grad_norm": 0.05655590817332268, + "kl": 0.0190582275390625, + "learning_rate": 2.3878110917435653e-06, + "loss": 0.0003, + "num_tokens": 87909851.0, + "reward": 0.7284589521586895, + "reward_std": 0.2002679482102394, + "rewards/code_reward": 0.5784589424729347, + "rewards/format_reward": 1.5, + "step": 3754 + }, + { + "clip_ratio": 0.003443208639509976, + "epoch": 0.14010540553145842, + "grad_norm": 0.056685417890548706, + "kl": 0.01849365234375, + "learning_rate": 2.3857063764491584e-06, + "loss": 0.0005, + "step": 3755 + }, + { + "clip_ratio": 0.0031690422911196947, + "epoch": 0.14014271722401006, + "grad_norm": 0.056050855666399, + "kl": 0.019805908203125, + "learning_rate": 2.3836029678545065e-06, + "loss": 0.0002, + "step": 3756 + }, + { + "clip_ratio": 0.003175009915139526, + "completion_length": 671.9107513427734, + "epoch": 0.14018002891656173, + "grad_norm": 0.06710396707057953, + "kl": 0.01447296142578125, + "learning_rate": 2.3815008668421603e-06, + "loss": 0.0492, + "num_tokens": 87981890.0, + "reward": 0.6976575180888176, + "reward_std": 0.0742548443377018, + "rewards/code_reward": 0.5503360964357853, + "rewards/format_reward": 1.4732142984867096, + "step": 3757 + }, + { + "clip_ratio": 0.0030478014959953725, + "epoch": 0.1402173406091134, + "grad_norm": 0.06730298697948456, + "kl": 0.0147705078125, + "learning_rate": 2.3794000742941217e-06, + "loss": 0.0491, + "step": 3758 + }, + { + "clip_ratio": 0.0028803330496884882, + "epoch": 0.14025465230166503, + "grad_norm": 0.06428273022174835, + "kl": 0.01528167724609375, + "learning_rate": 2.377300591091847e-06, + "loss": 0.0493, + "step": 3759 + }, + { + "clip_ratio": 0.003634637512732297, + "completion_length": 536.5892944335938, + "epoch": 0.1402919639942167, + "grad_norm": 0.04906739667057991, + "kl": 0.0252532958984375, + "learning_rate": 2.375202418116237e-06, + "loss": 0.0002, + "num_tokens": 88039813.0, + "reward": 0.714009452611208, + "reward_std": 0.08346777968108654, + "rewards/code_reward": 0.5640094205737114, + "rewards/format_reward": 1.5, + "step": 3760 + }, + { + "clip_ratio": 0.00331899308366701, + "epoch": 0.14032927568676834, + "grad_norm": 0.05156927928328514, + "kl": 0.0251312255859375, + "learning_rate": 2.373105556247649e-06, + "loss": 0.0002, + "step": 3761 + }, + { + "clip_ratio": 0.003166818816680461, + "epoch": 0.14036658737932, + "grad_norm": 0.05320104956626892, + "kl": 0.0253143310546875, + "learning_rate": 2.3710100063658843e-06, + "loss": 0.0001, + "step": 3762 + }, + { + "clip_ratio": 0.003673737810458988, + "completion_length": 709.1607360839844, + "epoch": 0.14040389907187165, + "grad_norm": 0.07726054638624191, + "kl": 0.0161590576171875, + "learning_rate": 2.3689157693501975e-06, + "loss": -0.0025, + "num_tokens": 88124354.0, + "reward": 0.7390284761786461, + "reward_std": 0.24372103763744235, + "rewards/code_reward": 0.5890284776687622, + "rewards/format_reward": 1.5, + "step": 3763 + }, + { + "clip_ratio": 0.0037675126222893596, + "epoch": 0.1404412107644233, + "grad_norm": 0.07925332337617874, + "kl": 0.01654052734375, + "learning_rate": 2.366822846079294e-06, + "loss": -0.0028, + "step": 3764 + }, + { + "clip_ratio": 0.0034054634161293507, + "epoch": 0.14047852245697495, + "grad_norm": 0.07293885946273804, + "kl": 0.016143798828125, + "learning_rate": 2.3647312374313224e-06, + "loss": -0.0029, + "step": 3765 + }, + { + "clip_ratio": 0.0032007520203478634, + "completion_length": 566.6607360839844, + "epoch": 0.14051583414952662, + "grad_norm": 0.06155906617641449, + "kl": 0.0168609619140625, + "learning_rate": 2.3626409442838843e-06, + "loss": -0.0004, + "num_tokens": 88181133.0, + "reward": 0.6909798793494701, + "reward_std": 0.23726535588502884, + "rewards/code_reward": 0.5409798696637154, + "rewards/format_reward": 1.5, + "step": 3766 + }, + { + "clip_ratio": 0.0032336642034351826, + "epoch": 0.14055314584207826, + "grad_norm": 0.05885324254631996, + "kl": 0.016357421875, + "learning_rate": 2.3605519675140273e-06, + "loss": -0.0004, + "step": 3767 + }, + { + "clip_ratio": 0.002925833803601563, + "epoch": 0.14059045753462993, + "grad_norm": 0.06446241587400436, + "kl": 0.0165252685546875, + "learning_rate": 2.3584643079982476e-06, + "loss": -0.0008, + "step": 3768 + }, + { + "clip_ratio": 0.0021536509739235044, + "completion_length": 595.9464492797852, + "epoch": 0.14062776922718156, + "grad_norm": 0.06467161327600479, + "kl": 0.0147552490234375, + "learning_rate": 2.3563779666124907e-06, + "loss": -0.0156, + "num_tokens": 88239706.0, + "reward": 0.555563185364008, + "reward_std": 0.19566639885306358, + "rewards/code_reward": 0.40556318522430956, + "rewards/format_reward": 1.5, + "step": 3769 + }, + { + "clip_ratio": 0.0022899473551660776, + "epoch": 0.14066508091973323, + "grad_norm": 0.06621062010526657, + "kl": 0.014312744140625, + "learning_rate": 2.354292944232142e-06, + "loss": -0.0157, + "step": 3770 + }, + { + "clip_ratio": 0.002059508056845516, + "epoch": 0.14070239261228487, + "grad_norm": 0.05795692652463913, + "kl": 0.0146636962890625, + "learning_rate": 2.3522092417320424e-06, + "loss": -0.0156, + "step": 3771 + }, + { + "clip_ratio": 0.004015860438812524, + "completion_length": 509.6071548461914, + "epoch": 0.14073970430483654, + "grad_norm": 0.08819280564785004, + "kl": 0.03790283203125, + "learning_rate": 2.3501268599864714e-06, + "loss": 0.0123, + "num_tokens": 88301396.0, + "reward": 0.6331842504441738, + "reward_std": 0.08541113324463367, + "rewards/code_reward": 0.4851485083345324, + "rewards/format_reward": 1.480357140302658, + "step": 3772 + }, + { + "clip_ratio": 0.004580662469379604, + "epoch": 0.14077701599738818, + "grad_norm": 0.08090195804834366, + "kl": 0.0355224609375, + "learning_rate": 2.3480457998691596e-06, + "loss": 0.0119, + "step": 3773 + }, + { + "clip_ratio": 0.003950597194489092, + "epoch": 0.14081432768993984, + "grad_norm": 0.0763573870062828, + "kl": 0.038055419921875, + "learning_rate": 2.34596606225328e-06, + "loss": 0.0117, + "step": 3774 + }, + { + "clip_ratio": 0.0038088979199528694, + "completion_length": 681.6964721679688, + "epoch": 0.14085163938249148, + "grad_norm": 0.09425757080316544, + "kl": 0.022796630859375, + "learning_rate": 2.343887648011454e-06, + "loss": 0.0197, + "num_tokens": 88376909.0, + "reward": 0.6047594584524632, + "reward_std": 0.1590766425943002, + "rewards/code_reward": 0.4547594782052329, + "rewards/format_reward": 1.5, + "step": 3775 + }, + { + "clip_ratio": 0.0036904365988448262, + "epoch": 0.14088895107504315, + "grad_norm": 0.06760646402835846, + "kl": 0.0225830078125, + "learning_rate": 2.3418105580157435e-06, + "loss": 0.0195, + "step": 3776 + }, + { + "clip_ratio": 0.0035454753087833524, + "epoch": 0.1409262627675948, + "grad_norm": 0.09272400289773941, + "kl": 0.02325439453125, + "learning_rate": 2.339734793137659e-06, + "loss": 0.0195, + "step": 3777 + }, + { + "clip_ratio": 0.0032966912258416414, + "completion_length": 682.7678985595703, + "epoch": 0.14096357446014646, + "grad_norm": 0.07508864253759384, + "kl": 0.0145111083984375, + "learning_rate": 2.3376603542481506e-06, + "loss": 0.015, + "num_tokens": 88449822.0, + "reward": 0.7957530319690704, + "reward_std": 0.11531881988048553, + "rewards/code_reward": 0.6457530180923641, + "rewards/format_reward": 1.5, + "step": 3778 + }, + { + "clip_ratio": 0.003248010005336255, + "epoch": 0.1410008861526981, + "grad_norm": 0.0753571167588234, + "kl": 0.0147247314453125, + "learning_rate": 2.335587242217616e-06, + "loss": 0.0149, + "step": 3779 + }, + { + "clip_ratio": 0.002759930386673659, + "epoch": 0.14103819784524976, + "grad_norm": 0.07468689978122711, + "kl": 0.0144805908203125, + "learning_rate": 2.3335154579158923e-06, + "loss": 0.0149, + "step": 3780 + }, + { + "clip_ratio": 0.0022411373793147504, + "completion_length": 580.7143096923828, + "epoch": 0.1410755095378014, + "grad_norm": 0.0668846070766449, + "kl": 0.022003173828125, + "learning_rate": 2.331445002212264e-06, + "loss": -0.0001, + "num_tokens": 88512526.0, + "reward": 0.7035714201629162, + "reward_std": 0.11720181256532669, + "rewards/code_reward": 0.5535714328289032, + "rewards/format_reward": 1.5, + "step": 3781 + }, + { + "clip_ratio": 0.0023881113156676292, + "epoch": 0.14111282123035307, + "grad_norm": 0.042773373425006866, + "kl": 0.0226898193359375, + "learning_rate": 2.3293758759754558e-06, + "loss": -0.0001, + "step": 3782 + }, + { + "clip_ratio": 0.002326209971215576, + "epoch": 0.1411501329229047, + "grad_norm": 0.03788042068481445, + "kl": 0.022613525390625, + "learning_rate": 2.327308080073633e-06, + "loss": -0.0001, + "step": 3783 + }, + { + "clip_ratio": 0.003380439826287329, + "completion_length": 779.0000305175781, + "epoch": 0.14118744461545638, + "grad_norm": 0.08172817528247833, + "kl": 0.0249786376953125, + "learning_rate": 2.3252416153744047e-06, + "loss": 0.0186, + "num_tokens": 88592830.0, + "reward": 0.7354417033493519, + "reward_std": 0.28905901685357094, + "rewards/code_reward": 0.5881202463060617, + "rewards/format_reward": 1.4732142984867096, + "step": 3784 + }, + { + "clip_ratio": 0.002893168362788856, + "epoch": 0.14122475630800801, + "grad_norm": 0.08848490566015244, + "kl": 0.02508544921875, + "learning_rate": 2.323176482744822e-06, + "loss": 0.0185, + "step": 3785 + }, + { + "clip_ratio": 0.0029961687396280468, + "epoch": 0.14126206800055968, + "grad_norm": 0.21353687345981598, + "kl": 0.0267791748046875, + "learning_rate": 2.3211126830513756e-06, + "loss": 0.0182, + "step": 3786 + }, + { + "clip_ratio": 0.004715017101261765, + "completion_length": 855.8036041259766, + "epoch": 0.14129937969311132, + "grad_norm": 0.07340002804994583, + "kl": 0.022735595703125, + "learning_rate": 2.319050217159999e-06, + "loss": 0.0008, + "num_tokens": 88684379.0, + "reward": 0.462178785353899, + "reward_std": 0.33657393604516983, + "rewards/code_reward": 0.3121787775307894, + "rewards/format_reward": 1.5, + "step": 3787 + }, + { + "clip_ratio": 0.004026586655527353, + "epoch": 0.141336691385663, + "grad_norm": 0.0777389258146286, + "kl": 0.0223388671875, + "learning_rate": 2.3169890859360626e-06, + "loss": 0.0008, + "step": 3788 + }, + { + "clip_ratio": 0.004209309117868543, + "epoch": 0.14137400307821463, + "grad_norm": 0.07346566021442413, + "kl": 0.02337646484375, + "learning_rate": 2.3149292902443802e-06, + "loss": 0.0006, + "step": 3789 + }, + { + "clip_ratio": 0.0041873938171193, + "completion_length": 876.7321929931641, + "epoch": 0.1414113147707663, + "grad_norm": 0.0603862889111042, + "kl": 0.01861572265625, + "learning_rate": 2.3128708309492014e-06, + "loss": -0.0165, + "num_tokens": 88776106.0, + "reward": 0.3514682911336422, + "reward_std": 0.1316909696906805, + "rewards/code_reward": 0.20146827772259712, + "rewards/format_reward": 1.5, + "step": 3790 + }, + { + "clip_ratio": 0.0040728902094997466, + "epoch": 0.14144862646331793, + "grad_norm": 0.05740904062986374, + "kl": 0.018798828125, + "learning_rate": 2.3108137089142197e-06, + "loss": -0.0165, + "step": 3791 + }, + { + "clip_ratio": 0.0047998446971178055, + "epoch": 0.1414859381558696, + "grad_norm": 0.060246095061302185, + "kl": 0.0183868408203125, + "learning_rate": 2.308757925002565e-06, + "loss": -0.0164, + "step": 3792 + }, + { + "clip_ratio": 0.003472534939646721, + "completion_length": 635.357177734375, + "epoch": 0.14152324984842124, + "grad_norm": 0.07642021775245667, + "kl": 0.02783203125, + "learning_rate": 2.3067034800768053e-06, + "loss": 0.0048, + "num_tokens": 88840218.0, + "reward": 0.8675637394189835, + "reward_std": 0.16520486399531364, + "rewards/code_reward": 0.7175637260079384, + "rewards/format_reward": 1.5, + "step": 3793 + }, + { + "clip_ratio": 0.003238371224142611, + "epoch": 0.1415605615409729, + "grad_norm": 0.07572026550769806, + "kl": 0.0276336669921875, + "learning_rate": 2.3046503749989495e-06, + "loss": 0.0047, + "step": 3794 + }, + { + "clip_ratio": 0.0034777592518366873, + "epoch": 0.14159787323352455, + "grad_norm": 0.09195621311664581, + "kl": 0.0279693603515625, + "learning_rate": 2.30259861063044e-06, + "loss": 0.0046, + "step": 3795 + }, + { + "clip_ratio": 0.0017173197120428085, + "completion_length": 593.8214721679688, + "epoch": 0.1416351849260762, + "grad_norm": 0.003889427287504077, + "kl": 0.0189208984375, + "learning_rate": 2.3005481878321593e-06, + "loss": 0.0002, + "num_tokens": 88897412.0, + "reward": 0.899999987334013, + "reward_std": 0.0, + "rewards/code_reward": 0.75, + "rewards/format_reward": 1.5, + "step": 3796 + }, + { + "clip_ratio": 0.0018567187944427133, + "epoch": 0.14167249661862785, + "grad_norm": 0.003243987215682864, + "kl": 0.0184326171875, + "learning_rate": 2.298499107464429e-06, + "loss": 0.0002, + "step": 3797 + }, + { + "clip_ratio": 0.001918352791108191, + "epoch": 0.14170980831117952, + "grad_norm": 0.00317405560053885, + "kl": 0.01837158203125, + "learning_rate": 2.2964513703870017e-06, + "loss": 0.0002, + "step": 3798 + }, + { + "clip_ratio": 0.0033022479619830847, + "completion_length": 560.0178756713867, + "epoch": 0.14174712000373116, + "grad_norm": 0.04453970864415169, + "kl": 0.0234375, + "learning_rate": 2.294404977459074e-06, + "loss": -0.0008, + "num_tokens": 88953385.0, + "reward": 0.7497171573340893, + "reward_std": 0.06681530922651291, + "rewards/code_reward": 0.5997170954942703, + "rewards/format_reward": 1.5, + "step": 3799 + }, + { + "clip_ratio": 0.0031686118454672396, + "epoch": 0.14178443169628283, + "grad_norm": 0.044116247445344925, + "kl": 0.023193359375, + "learning_rate": 2.29235992953927e-06, + "loss": -0.001, + "step": 3800 + }, + { + "clip_ratio": 0.0034101320197805762, + "epoch": 0.14182174338883446, + "grad_norm": 0.0410187654197216, + "kl": 0.023406982421875, + "learning_rate": 2.2903162274856552e-06, + "loss": -0.0009, + "step": 3801 + }, + { + "clip_ratio": 0.003977985179517418, + "completion_length": 628.0357437133789, + "epoch": 0.14185905508138613, + "grad_norm": 0.08846084028482437, + "kl": 0.0206298828125, + "learning_rate": 2.2882738721557302e-06, + "loss": 0.0057, + "num_tokens": 89026201.0, + "reward": 0.9023037403821945, + "reward_std": 0.27521441504359245, + "rewards/code_reward": 0.7523037046194077, + "rewards/format_reward": 1.5, + "step": 3802 + }, + { + "clip_ratio": 0.0035853697918355465, + "epoch": 0.14189636677393777, + "grad_norm": 0.09082643687725067, + "kl": 0.020843505859375, + "learning_rate": 2.286232864406429e-06, + "loss": 0.0057, + "step": 3803 + }, + { + "clip_ratio": 0.0035008866107091308, + "epoch": 0.14193367846648944, + "grad_norm": 0.08404255658388138, + "kl": 0.021270751953125, + "learning_rate": 2.2841932050941197e-06, + "loss": 0.0056, + "step": 3804 + }, + { + "clip_ratio": 0.0018185455701313913, + "completion_length": 562.6607437133789, + "epoch": 0.14197099015904108, + "grad_norm": 0.06476955860853195, + "kl": 0.020904541015625, + "learning_rate": 2.282154895074608e-06, + "loss": 0.0088, + "num_tokens": 89093928.0, + "reward": 0.8751400411128998, + "reward_std": 0.24741605669260025, + "rewards/code_reward": 0.7251400649547577, + "rewards/format_reward": 1.5, + "step": 3805 + }, + { + "clip_ratio": 0.0014849267899990082, + "epoch": 0.14200830185159274, + "grad_norm": 0.06212881579995155, + "kl": 0.020721435546875, + "learning_rate": 2.2801179352031273e-06, + "loss": 0.0085, + "step": 3806 + }, + { + "clip_ratio": 0.0018793675699271262, + "epoch": 0.14204561354414438, + "grad_norm": 0.04818537086248398, + "kl": 0.020965576171875, + "learning_rate": 2.2780823263343517e-06, + "loss": 0.0086, + "step": 3807 + }, + { + "clip_ratio": 0.004106286913156509, + "completion_length": 684.357177734375, + "epoch": 0.14208292523669605, + "grad_norm": 0.09581701457500458, + "kl": 0.019805908203125, + "learning_rate": 2.276048069322381e-06, + "loss": 0.0136, + "num_tokens": 89159202.0, + "reward": 0.4605785235762596, + "reward_std": 0.06330188881838694, + "rewards/code_reward": 0.31057849805802107, + "rewards/format_reward": 1.5, + "step": 3808 + }, + { + "clip_ratio": 0.003571941691916436, + "epoch": 0.1421202369292477, + "grad_norm": 0.09112443774938583, + "kl": 0.0204620361328125, + "learning_rate": 2.274015165020754e-06, + "loss": 0.0136, + "step": 3809 + }, + { + "clip_ratio": 0.0033085976901929826, + "epoch": 0.14215754862179936, + "grad_norm": 0.0963180884718895, + "kl": 0.020782470703125, + "learning_rate": 2.271983614282439e-06, + "loss": 0.0133, + "step": 3810 + }, + { + "clip_ratio": 0.0033292804146185517, + "completion_length": 628.2678756713867, + "epoch": 0.142194860314351, + "grad_norm": 0.08394035696983337, + "kl": 0.0223388671875, + "learning_rate": 2.269953417959837e-06, + "loss": -0.0004, + "num_tokens": 89228433.0, + "reward": 0.6142857111990452, + "reward_std": 0.24821119010448456, + "rewards/code_reward": 0.4642857126891613, + "rewards/format_reward": 1.5, + "step": 3811 + }, + { + "clip_ratio": 0.0038353840354830027, + "epoch": 0.14223217200690266, + "grad_norm": 0.07423177361488342, + "kl": 0.022552490234375, + "learning_rate": 2.267924576904783e-06, + "loss": -0.0004, + "step": 3812 + }, + { + "clip_ratio": 0.0033992304233834147, + "epoch": 0.14226948369945433, + "grad_norm": 0.0727456584572792, + "kl": 0.02264404296875, + "learning_rate": 2.2658970919685368e-06, + "loss": -0.0006, + "step": 3813 + }, + { + "clip_ratio": 0.0016554566100239754, + "completion_length": 849.4285888671875, + "epoch": 0.14230679539200597, + "grad_norm": 0.030754022300243378, + "kl": 0.01605224609375, + "learning_rate": 2.2638709640017955e-06, + "loss": 0.011, + "num_tokens": 89316377.0, + "reward": 0.8642857186496258, + "reward_std": 0.09078412503004074, + "rewards/code_reward": 0.7142857164144516, + "rewards/format_reward": 1.5, + "step": 3814 + }, + { + "clip_ratio": 0.001871948828920722, + "epoch": 0.14234410708455764, + "grad_norm": 0.03198840841650963, + "kl": 0.0165252685546875, + "learning_rate": 2.2618461938546866e-06, + "loss": 0.011, + "step": 3815 + }, + { + "clip_ratio": 0.0015176908345893025, + "epoch": 0.14238141877710928, + "grad_norm": 0.03183954954147339, + "kl": 0.0165252685546875, + "learning_rate": 2.259822782376762e-06, + "loss": 0.011, + "step": 3816 + }, + { + "clip_ratio": 0.002933660871349275, + "completion_length": 656.9643249511719, + "epoch": 0.14241873046966094, + "grad_norm": 0.07810778170824051, + "kl": 0.021453857421875, + "learning_rate": 2.257800730417012e-06, + "loss": 0.0475, + "num_tokens": 89379039.0, + "reward": 0.6519652530550957, + "reward_std": 0.17756003700196743, + "rewards/code_reward": 0.5046438183635473, + "rewards/format_reward": 1.4732142984867096, + "step": 3817 + }, + { + "clip_ratio": 0.0028626919956877828, + "epoch": 0.14245604216221258, + "grad_norm": 0.07401280850172043, + "kl": 0.021759033203125, + "learning_rate": 2.2557800388238487e-06, + "loss": 0.0475, + "step": 3818 + }, + { + "clip_ratio": 0.002730539534240961, + "epoch": 0.14249335385476425, + "grad_norm": 0.0729617029428482, + "kl": 0.022430419921875, + "learning_rate": 2.2537607084451168e-06, + "loss": 0.0474, + "step": 3819 + }, + { + "clip_ratio": 0.002900220511946827, + "completion_length": 526.5893173217773, + "epoch": 0.1425306655473159, + "grad_norm": 0.08295802772045135, + "kl": 0.019287109375, + "learning_rate": 2.2517427401280913e-06, + "loss": 0.0217, + "num_tokens": 89437228.0, + "reward": 0.6608752198517323, + "reward_std": 0.23262453265488148, + "rewards/code_reward": 0.5108752157539129, + "rewards/format_reward": 1.5, + "step": 3820 + }, + { + "clip_ratio": 0.003003880614414811, + "epoch": 0.14256797723986755, + "grad_norm": 0.08268946409225464, + "kl": 0.01953125, + "learning_rate": 2.249726134719473e-06, + "loss": 0.0219, + "step": 3821 + }, + { + "clip_ratio": 0.003629721235483885, + "epoch": 0.1426052889324192, + "grad_norm": 0.105606809258461, + "kl": 0.019622802734375, + "learning_rate": 2.2477108930653953e-06, + "loss": 0.0218, + "step": 3822 + }, + { + "clip_ratio": 0.0013645464787259698, + "completion_length": 638.9107513427734, + "epoch": 0.14264260062497086, + "grad_norm": 0.0023178388364613056, + "kl": 0.017822265625, + "learning_rate": 2.245697016011412e-06, + "loss": 0.0002, + "num_tokens": 89504325.0, + "reward": 0.899999987334013, + "reward_std": 0.0, + "rewards/code_reward": 0.75, + "rewards/format_reward": 1.5, + "step": 3823 + }, + { + "clip_ratio": 0.0013223455753177404, + "epoch": 0.1426799123175225, + "grad_norm": 0.0024449352640658617, + "kl": 0.0182342529296875, + "learning_rate": 2.243684504402511e-06, + "loss": 0.0002, + "step": 3824 + }, + { + "clip_ratio": 0.0010122215608134866, + "epoch": 0.14271722401007417, + "grad_norm": 0.0023986082524061203, + "kl": 0.0182342529296875, + "learning_rate": 2.2416733590831046e-06, + "loss": 0.0002, + "step": 3825 + }, + { + "clip_ratio": 0.003296162001788616, + "completion_length": 743.6428985595703, + "epoch": 0.1427545357026258, + "grad_norm": 0.07034673541784286, + "kl": 0.0223846435546875, + "learning_rate": 2.23966358089703e-06, + "loss": -0.0118, + "num_tokens": 89574683.0, + "reward": 0.6579731404781342, + "reward_std": 0.1874421490356326, + "rewards/code_reward": 0.5099374204874039, + "rewards/format_reward": 1.480357140302658, + "step": 3826 + }, + { + "clip_ratio": 0.002904806286096573, + "epoch": 0.14279184739517747, + "grad_norm": 0.0646098181605339, + "kl": 0.02301025390625, + "learning_rate": 2.2376551706875544e-06, + "loss": -0.012, + "step": 3827 + }, + { + "clip_ratio": 0.0030541997984983027, + "epoch": 0.1428291590877291, + "grad_norm": 0.06626158207654953, + "kl": 0.023101806640625, + "learning_rate": 2.235648129297369e-06, + "loss": -0.0121, + "step": 3828 + }, + { + "clip_ratio": 0.0038026062538847327, + "completion_length": 636.6964569091797, + "epoch": 0.14286647078028078, + "grad_norm": 0.08472999930381775, + "kl": 0.0186004638671875, + "learning_rate": 2.2336424575685915e-06, + "loss": 0.0111, + "num_tokens": 89636252.0, + "reward": 0.7054826319217682, + "reward_std": 0.3648286685347557, + "rewards/code_reward": 0.5554825998842716, + "rewards/format_reward": 1.5, + "step": 3829 + }, + { + "clip_ratio": 0.0040165462414734066, + "epoch": 0.14290378247283242, + "grad_norm": 0.08613456785678864, + "kl": 0.018768310546875, + "learning_rate": 2.231638156342766e-06, + "loss": 0.0111, + "step": 3830 + }, + { + "clip_ratio": 0.0034963715297635645, + "epoch": 0.14294109416538409, + "grad_norm": 0.08263339847326279, + "kl": 0.019378662109375, + "learning_rate": 2.2296352264608565e-06, + "loss": 0.0112, + "step": 3831 + }, + { + "clip_ratio": 0.0035815017763525248, + "completion_length": 635.6071624755859, + "epoch": 0.14297840585793573, + "grad_norm": 0.09052610397338867, + "kl": 0.023681640625, + "learning_rate": 2.2276336687632587e-06, + "loss": 0.022, + "num_tokens": 89717150.0, + "reward": 0.678724680095911, + "reward_std": 0.20065140398219228, + "rewards/code_reward": 0.5305103808641434, + "rewards/format_reward": 1.4821428656578064, + "step": 3832 + }, + { + "clip_ratio": 0.0031983958033379167, + "epoch": 0.1430157175504874, + "grad_norm": 0.09248040616512299, + "kl": 0.0234375, + "learning_rate": 2.2256334840897866e-06, + "loss": 0.0222, + "step": 3833 + }, + { + "clip_ratio": 0.003394772473257035, + "epoch": 0.14305302924303903, + "grad_norm": 0.0869109183549881, + "kl": 0.023895263671875, + "learning_rate": 2.22363467327968e-06, + "loss": 0.022, + "step": 3834 + }, + { + "clip_ratio": 0.0029272648971527815, + "completion_length": 648.6250381469727, + "epoch": 0.1430903409355907, + "grad_norm": 0.06879252940416336, + "kl": 0.0244140625, + "learning_rate": 2.2216372371716067e-06, + "loss": 0.0077, + "num_tokens": 89787395.0, + "reward": 0.8524041697382927, + "reward_std": 0.22315070778131485, + "rewards/code_reward": 0.7024041190743446, + "rewards/format_reward": 1.5, + "step": 3835 + }, + { + "clip_ratio": 0.002875381935155019, + "epoch": 0.14312765262814234, + "grad_norm": 0.06775635480880737, + "kl": 0.0247802734375, + "learning_rate": 2.2196411766036492e-06, + "loss": 0.0077, + "step": 3836 + }, + { + "clip_ratio": 0.0032291539828293025, + "epoch": 0.143164964320694, + "grad_norm": 0.07134551554918289, + "kl": 0.0245513916015625, + "learning_rate": 2.2176464924133197e-06, + "loss": 0.0078, + "step": 3837 + }, + { + "clip_ratio": 0.0020266450592316687, + "completion_length": 550.2143020629883, + "epoch": 0.14320227601324564, + "grad_norm": 0.08005255460739136, + "kl": 0.0191192626953125, + "learning_rate": 2.2156531854375503e-06, + "loss": 0.0124, + "num_tokens": 89846333.0, + "reward": 0.8815115541219711, + "reward_std": 0.17769143730401993, + "rewards/code_reward": 0.7315115481615067, + "rewards/format_reward": 1.5, + "step": 3838 + }, + { + "clip_ratio": 0.0020412663579918444, + "epoch": 0.1432395877057973, + "grad_norm": 0.08080358058214188, + "kl": 0.018951416015625, + "learning_rate": 2.2136612565126952e-06, + "loss": 0.0123, + "step": 3839 + }, + { + "clip_ratio": 0.0020974058425053954, + "epoch": 0.14327689939834895, + "grad_norm": 0.06842617690563202, + "kl": 0.01861572265625, + "learning_rate": 2.2116707064745335e-06, + "loss": 0.0123, + "step": 3840 + }, + { + "clip_ratio": 0.0021670308196917176, + "completion_length": 536.7321624755859, + "epoch": 0.14331421109090062, + "grad_norm": 0.04233347997069359, + "kl": 0.022979736328125, + "learning_rate": 2.209681536158259e-06, + "loss": -0.0046, + "num_tokens": 89895542.0, + "reward": 0.7042717151343822, + "reward_std": 0.1160527691245079, + "rewards/code_reward": 0.5542716979980469, + "rewards/format_reward": 1.5, + "step": 3841 + }, + { + "clip_ratio": 0.0020009499276056886, + "epoch": 0.14335152278345226, + "grad_norm": 0.04567364603281021, + "kl": 0.0230712890625, + "learning_rate": 2.207693746398492e-06, + "loss": -0.0045, + "step": 3842 + }, + { + "clip_ratio": 0.0018242966034449637, + "epoch": 0.14338883447600392, + "grad_norm": 0.04671306163072586, + "kl": 0.0232086181640625, + "learning_rate": 2.2057073380292744e-06, + "loss": -0.0048, + "step": 3843 + }, + { + "clip_ratio": 0.0032599865226075053, + "completion_length": 749.7500457763672, + "epoch": 0.14342614616855556, + "grad_norm": 0.06832034885883331, + "kl": 0.0186309814453125, + "learning_rate": 2.2037223118840626e-06, + "loss": 0.0073, + "num_tokens": 89977064.0, + "reward": 0.49635810777544975, + "reward_std": 0.20512861479073763, + "rewards/code_reward": 0.34635808411985636, + "rewards/format_reward": 1.5, + "step": 3844 + }, + { + "clip_ratio": 0.00361137674190104, + "epoch": 0.14346345786110723, + "grad_norm": 0.06847482174634933, + "kl": 0.0185546875, + "learning_rate": 2.201738668795739e-06, + "loss": 0.0074, + "step": 3845 + }, + { + "clip_ratio": 0.003798432822804898, + "epoch": 0.14350076955365887, + "grad_norm": 0.06776740401983261, + "kl": 0.0192108154296875, + "learning_rate": 2.1997564095966024e-06, + "loss": 0.0073, + "step": 3846 + }, + { + "clip_ratio": 0.005293550901114941, + "completion_length": 641.0536041259766, + "epoch": 0.14353808124621054, + "grad_norm": 0.09254269301891327, + "kl": 0.0236358642578125, + "learning_rate": 2.1977755351183727e-06, + "loss": 0.0179, + "num_tokens": 90044477.0, + "reward": 0.6412617638707161, + "reward_std": 0.28339451365172863, + "rewards/code_reward": 0.4912617653608322, + "rewards/format_reward": 1.5, + "step": 3847 + }, + { + "clip_ratio": 0.004603929934091866, + "epoch": 0.14357539293876218, + "grad_norm": 0.08815179020166397, + "kl": 0.0232391357421875, + "learning_rate": 2.195796046192189e-06, + "loss": 0.0178, + "step": 3848 + }, + { + "clip_ratio": 0.004619318526238203, + "epoch": 0.14361270463131384, + "grad_norm": 0.08609887957572937, + "kl": 0.02294921875, + "learning_rate": 2.193817943648605e-06, + "loss": 0.0176, + "step": 3849 + }, + { + "clip_ratio": 0.001718131243251264, + "completion_length": 549.8928833007812, + "epoch": 0.14365001632386548, + "grad_norm": 0.06530912220478058, + "kl": 0.0228271484375, + "learning_rate": 2.1918412283175996e-06, + "loss": 0.0071, + "num_tokens": 90103103.0, + "reward": 1.0437070429325104, + "reward_std": 0.15521597117185593, + "rewards/code_reward": 0.8937070816755295, + "rewards/format_reward": 1.5, + "step": 3850 + }, + { + "clip_ratio": 0.0023199199931696057, + "epoch": 0.14368732801641715, + "grad_norm": 0.0648551881313324, + "kl": 0.023101806640625, + "learning_rate": 2.1898659010285622e-06, + "loss": 0.0072, + "step": 3851 + }, + { + "clip_ratio": 0.0018572589033283293, + "epoch": 0.1437246397089688, + "grad_norm": 0.06454605609178543, + "kl": 0.023040771484375, + "learning_rate": 2.1878919626103047e-06, + "loss": 0.0071, + "step": 3852 + }, + { + "clip_ratio": 0.004230451711919159, + "completion_length": 496.0357208251953, + "epoch": 0.14376195140152045, + "grad_norm": 0.06971196085214615, + "kl": 0.0209808349609375, + "learning_rate": 2.185919413891057e-06, + "loss": -0.0021, + "num_tokens": 90157915.0, + "reward": 0.3654128462076187, + "reward_std": 0.15884003974497318, + "rewards/code_reward": 0.21541284024715424, + "rewards/format_reward": 1.5, + "step": 3853 + }, + { + "clip_ratio": 0.0046571792918257415, + "epoch": 0.1437992630940721, + "grad_norm": 0.06925683468580246, + "kl": 0.0207366943359375, + "learning_rate": 2.1839482556984615e-06, + "loss": -0.0023, + "step": 3854 + }, + { + "clip_ratio": 0.004236088541802019, + "epoch": 0.14383657478662376, + "grad_norm": 0.06703845411539078, + "kl": 0.0207672119140625, + "learning_rate": 2.1819784888595804e-06, + "loss": -0.0025, + "step": 3855 + }, + { + "clip_ratio": 0.003721292072441429, + "completion_length": 566.3393020629883, + "epoch": 0.1438738864791754, + "grad_norm": 0.06931711733341217, + "kl": 0.020263671875, + "learning_rate": 2.18001011420089e-06, + "loss": 0.0076, + "num_tokens": 90221228.0, + "reward": 0.7875298075377941, + "reward_std": 0.20315571129322052, + "rewards/code_reward": 0.6375297904014587, + "rewards/format_reward": 1.5, + "step": 3856 + }, + { + "clip_ratio": 0.003272645582910627, + "epoch": 0.14391119817172707, + "grad_norm": 0.07156321406364441, + "kl": 0.019989013671875, + "learning_rate": 2.178043132548286e-06, + "loss": 0.0075, + "step": 3857 + }, + { + "clip_ratio": 0.0035384977818466723, + "epoch": 0.1439485098642787, + "grad_norm": 0.06418399512767792, + "kl": 0.0199432373046875, + "learning_rate": 2.176077544727077e-06, + "loss": 0.0075, + "step": 3858 + }, + { + "clip_ratio": 0.003138196130748838, + "completion_length": 583.1786041259766, + "epoch": 0.14398582155683037, + "grad_norm": 0.07537002116441727, + "kl": 0.0220184326171875, + "learning_rate": 2.1741133515619855e-06, + "loss": 0.0001, + "num_tokens": 90288690.0, + "reward": 0.6685401014983654, + "reward_std": 0.10773396212607622, + "rewards/code_reward": 0.5185400866903365, + "rewards/format_reward": 1.5, + "step": 3859 + }, + { + "clip_ratio": 0.003067357640247792, + "epoch": 0.144023133249382, + "grad_norm": 0.06911881268024445, + "kl": 0.0218505859375, + "learning_rate": 2.1721505538771533e-06, + "loss": 0.0003, + "step": 3860 + }, + { + "clip_ratio": 0.0033854033099487424, + "epoch": 0.14406044494193368, + "grad_norm": 0.07246226817369461, + "kl": 0.0216217041015625, + "learning_rate": 2.17018915249613e-06, + "loss": 0.0001, + "step": 3861 + }, + { + "clip_ratio": 0.003580053220503032, + "completion_length": 724.0714645385742, + "epoch": 0.14409775663448532, + "grad_norm": 0.041061241179704666, + "kl": 0.0209197998046875, + "learning_rate": 2.168229148241885e-06, + "loss": -0.0001, + "num_tokens": 90360066.0, + "reward": 0.6483082994818687, + "reward_std": 0.0063298651948571205, + "rewards/code_reward": 0.4983082711696625, + "rewards/format_reward": 1.5, + "step": 3862 + }, + { + "clip_ratio": 0.0033373170881532133, + "epoch": 0.14413506832703699, + "grad_norm": 0.03958608955144882, + "kl": 0.021209716796875, + "learning_rate": 2.1662705419367995e-06, + "loss": -0.0001, + "step": 3863 + }, + { + "clip_ratio": 0.003360941424034536, + "epoch": 0.14417238001958863, + "grad_norm": 0.04278671741485596, + "kl": 0.021087646484375, + "learning_rate": 2.1643133344026694e-06, + "loss": 0.0, + "step": 3864 + }, + { + "clip_ratio": 0.003464230860117823, + "completion_length": 695.607177734375, + "epoch": 0.1442096917121403, + "grad_norm": 0.07545942068099976, + "kl": 0.0202178955078125, + "learning_rate": 2.1623575264607018e-06, + "loss": 0.0278, + "num_tokens": 90427104.0, + "reward": 0.6594960764050484, + "reward_std": 0.2698672264814377, + "rewards/code_reward": 0.5094960276037455, + "rewards/format_reward": 1.5, + "step": 3865 + }, + { + "clip_ratio": 0.003105582785792649, + "epoch": 0.14424700340469196, + "grad_norm": 0.07975174486637115, + "kl": 0.0207061767578125, + "learning_rate": 2.1604031189315187e-06, + "loss": 0.0278, + "step": 3866 + }, + { + "clip_ratio": 0.003578498086426407, + "epoch": 0.1442843150972436, + "grad_norm": 0.0740511417388916, + "kl": 0.020599365234375, + "learning_rate": 2.158450112635151e-06, + "loss": 0.0277, + "step": 3867 + }, + { + "clip_ratio": 0.003303675795905292, + "completion_length": 819.7857513427734, + "epoch": 0.14432162678979527, + "grad_norm": 0.06353114545345306, + "kl": 0.02099609375, + "learning_rate": 2.1564985083910456e-06, + "loss": 0.0082, + "num_tokens": 90517938.0, + "reward": 0.5647742226719856, + "reward_std": 0.17220482975244522, + "rewards/code_reward": 0.4147742073982954, + "rewards/format_reward": 1.5, + "step": 3868 + }, + { + "clip_ratio": 0.0028463067719712853, + "epoch": 0.1443589384823469, + "grad_norm": 0.060613129287958145, + "kl": 0.020843505859375, + "learning_rate": 2.1545483070180573e-06, + "loss": 0.0079, + "step": 3869 + }, + { + "clip_ratio": 0.0033742449013516307, + "epoch": 0.14439625017489857, + "grad_norm": 0.06149478629231453, + "kl": 0.020294189453125, + "learning_rate": 2.1525995093344558e-06, + "loss": 0.0081, + "step": 3870 + }, + { + "clip_ratio": 0.0039981326553970575, + "completion_length": 593.053596496582, + "epoch": 0.1444335618674502, + "grad_norm": 0.06541875749826431, + "kl": 0.0244903564453125, + "learning_rate": 2.150652116157922e-06, + "loss": 0.0064, + "num_tokens": 90582229.0, + "reward": 0.6332298219203949, + "reward_std": 0.15132849290966988, + "rewards/code_reward": 0.48322981037199497, + "rewards/format_reward": 1.5, + "step": 3871 + }, + { + "clip_ratio": 0.003647824691142887, + "epoch": 0.14447087356000188, + "grad_norm": 0.0637548491358757, + "kl": 0.0244293212890625, + "learning_rate": 2.1487061283055422e-06, + "loss": 0.0063, + "step": 3872 + }, + { + "clip_ratio": 0.0035196112003177404, + "epoch": 0.14450818525255352, + "grad_norm": 0.058373548090457916, + "kl": 0.024444580078125, + "learning_rate": 2.146761546593819e-06, + "loss": 0.0062, + "step": 3873 + }, + { + "clip_ratio": 0.004725866951048374, + "completion_length": 714.5714721679688, + "epoch": 0.14454549694510518, + "grad_norm": 0.08335880190134048, + "kl": 0.01654052734375, + "learning_rate": 2.144818371838662e-06, + "loss": 0.0054, + "num_tokens": 90656965.0, + "reward": 0.4357142969965935, + "reward_std": 0.2455899640917778, + "rewards/code_reward": 0.285714291036129, + "rewards/format_reward": 1.5, + "step": 3874 + }, + { + "clip_ratio": 0.004018135892692953, + "epoch": 0.14458280863765682, + "grad_norm": 0.05527481064200401, + "kl": 0.016632080078125, + "learning_rate": 2.142876604855393e-06, + "loss": 0.0053, + "step": 3875 + }, + { + "clip_ratio": 0.004288573283702135, + "epoch": 0.1446201203302085, + "grad_norm": 0.07189877331256866, + "kl": 0.01678466796875, + "learning_rate": 2.1409362464587404e-06, + "loss": 0.0055, + "step": 3876 + }, + { + "clip_ratio": 0.003820535377599299, + "completion_length": 769.8214416503906, + "epoch": 0.14465743202276013, + "grad_norm": 0.1101001724600792, + "kl": 0.018463134765625, + "learning_rate": 2.1389972974628413e-06, + "loss": -0.0153, + "num_tokens": 90739807.0, + "reward": 0.5033834837377071, + "reward_std": 0.24843795597553253, + "rewards/code_reward": 0.3533834610134363, + "rewards/format_reward": 1.5, + "step": 3877 + }, + { + "clip_ratio": 0.004254997300449759, + "epoch": 0.1446947437153118, + "grad_norm": 0.06451532989740372, + "kl": 0.018402099609375, + "learning_rate": 2.1370597586812446e-06, + "loss": -0.0153, + "step": 3878 + }, + { + "clip_ratio": 0.004083727952092886, + "epoch": 0.14473205540786344, + "grad_norm": 0.0981401577591896, + "kl": 0.0181427001953125, + "learning_rate": 2.1351236309269036e-06, + "loss": -0.0153, + "step": 3879 + }, + { + "clip_ratio": 0.0029495496419258416, + "completion_length": 748.857177734375, + "epoch": 0.1447693671004151, + "grad_norm": 0.0365581177175045, + "kl": 0.0166778564453125, + "learning_rate": 2.1331889150121828e-06, + "loss": 0.0101, + "num_tokens": 90809939.0, + "reward": 0.6328717395663261, + "reward_std": 0.030912132933735847, + "rewards/code_reward": 0.48287174105644226, + "rewards/format_reward": 1.5, + "step": 3880 + }, + { + "clip_ratio": 0.00305310235125944, + "epoch": 0.14480667879296674, + "grad_norm": 0.0354996956884861, + "kl": 0.017120361328125, + "learning_rate": 2.1312556117488524e-06, + "loss": 0.0102, + "step": 3881 + }, + { + "clip_ratio": 0.0030789338634349406, + "epoch": 0.1448439904855184, + "grad_norm": 0.03300361707806587, + "kl": 0.01666259765625, + "learning_rate": 2.1293237219480912e-06, + "loss": 0.0101, + "step": 3882 + }, + { + "clip_ratio": 0.0039404521230608225, + "completion_length": 720.2678985595703, + "epoch": 0.14488130217807005, + "grad_norm": 0.08044559508562088, + "kl": 0.021728515625, + "learning_rate": 2.1273932464204854e-06, + "loss": -0.0103, + "num_tokens": 90884224.0, + "reward": 0.6126061305403709, + "reward_std": 0.2958842422813177, + "rewards/code_reward": 0.4626060966402292, + "rewards/format_reward": 1.5, + "step": 3883 + }, + { + "clip_ratio": 0.004146636987570673, + "epoch": 0.14491861387062172, + "grad_norm": 0.07577157020568848, + "kl": 0.0217437744140625, + "learning_rate": 2.1254641859760236e-06, + "loss": -0.0101, + "step": 3884 + }, + { + "clip_ratio": 0.003738857281859964, + "epoch": 0.14495592556317335, + "grad_norm": 0.08211416006088257, + "kl": 0.0218505859375, + "learning_rate": 2.1235365414241056e-06, + "loss": -0.0105, + "step": 3885 + }, + { + "clip_ratio": 0.003421722096391022, + "completion_length": 537.178596496582, + "epoch": 0.14499323725572502, + "grad_norm": 0.07500457763671875, + "kl": 0.0188446044921875, + "learning_rate": 2.121610313573536e-06, + "loss": -0.005, + "num_tokens": 90954696.0, + "reward": 0.6524752452969551, + "reward_std": 0.1428921464830637, + "rewards/code_reward": 0.5024752418976277, + "rewards/format_reward": 1.5, + "step": 3886 + }, + { + "clip_ratio": 0.003364729811437428, + "epoch": 0.14503054894827666, + "grad_norm": 0.07051274925470352, + "kl": 0.0189208984375, + "learning_rate": 2.1196855032325227e-06, + "loss": -0.005, + "step": 3887 + }, + { + "clip_ratio": 0.0033355341874994338, + "epoch": 0.14506786064082833, + "grad_norm": 0.06805061548948288, + "kl": 0.0186004638671875, + "learning_rate": 2.1177621112086816e-06, + "loss": -0.0051, + "step": 3888 + }, + { + "clip_ratio": 0.0032594744698144495, + "completion_length": 820.7678833007812, + "epoch": 0.14510517233337997, + "grad_norm": 0.06123776361346245, + "kl": 0.0139007568359375, + "learning_rate": 2.1158401383090305e-06, + "loss": 0.0005, + "num_tokens": 91034091.0, + "reward": 0.4716918580234051, + "reward_std": 0.29020557552576065, + "rewards/code_reward": 0.3216918557882309, + "rewards/format_reward": 1.5, + "step": 3889 + }, + { + "clip_ratio": 0.0033918453264050186, + "epoch": 0.14514248402593163, + "grad_norm": 0.06073663383722305, + "kl": 0.01416015625, + "learning_rate": 2.113919585339994e-06, + "loss": 0.0005, + "step": 3890 + }, + { + "clip_ratio": 0.003389657649677247, + "epoch": 0.14517979571848327, + "grad_norm": 0.063315249979496, + "kl": 0.0142364501953125, + "learning_rate": 2.1120004531074026e-06, + "loss": 0.0005, + "step": 3891 + }, + { + "clip_ratio": 0.0012919171713292599, + "completion_length": 793.2857208251953, + "epoch": 0.14521710741103494, + "grad_norm": 0.04823717474937439, + "kl": 0.0125732421875, + "learning_rate": 2.1100827424164863e-06, + "loss": 0.0075, + "num_tokens": 91108209.0, + "reward": 0.8950139135122299, + "reward_std": 0.11379000917077065, + "rewards/code_reward": 0.7450139224529266, + "rewards/format_reward": 1.5, + "step": 3892 + }, + { + "clip_ratio": 0.001272858469747007, + "epoch": 0.14525441910358658, + "grad_norm": 0.045498549938201904, + "kl": 0.0127105712890625, + "learning_rate": 2.1081664540718827e-06, + "loss": 0.0074, + "step": 3893 + }, + { + "clip_ratio": 0.0011174690444022417, + "epoch": 0.14529173079613825, + "grad_norm": 0.052743539214134216, + "kl": 0.0124969482421875, + "learning_rate": 2.1062515888776284e-06, + "loss": 0.0073, + "step": 3894 + }, + { + "clip_ratio": 0.004017905448563397, + "completion_length": 724.2678833007812, + "epoch": 0.14532904248868989, + "grad_norm": 0.09244243055582047, + "kl": 0.022857666015625, + "learning_rate": 2.1043381476371673e-06, + "loss": 0.0111, + "num_tokens": 91175368.0, + "reward": 0.4883401319384575, + "reward_std": 0.16792783699929714, + "rewards/code_reward": 0.34030441008508205, + "rewards/format_reward": 1.480357140302658, + "step": 3895 + }, + { + "clip_ratio": 0.004136166302487254, + "epoch": 0.14536635418124155, + "grad_norm": 0.10167179256677628, + "kl": 0.022125244140625, + "learning_rate": 2.102426131153344e-06, + "loss": 0.0111, + "step": 3896 + }, + { + "clip_ratio": 0.004094040195923299, + "epoch": 0.1454036658737932, + "grad_norm": 0.27349692583084106, + "kl": 0.02215576171875, + "learning_rate": 2.100515540228402e-06, + "loss": 0.0113, + "step": 3897 + }, + { + "clip_ratio": 0.003393103019334376, + "completion_length": 543.6250305175781, + "epoch": 0.14544097756634486, + "grad_norm": 0.07338178157806396, + "kl": 0.01824951171875, + "learning_rate": 2.098606375663993e-06, + "loss": -0.0173, + "num_tokens": 91236681.0, + "reward": 0.7445037066936493, + "reward_std": 0.2667912906035781, + "rewards/code_reward": 0.5945037100464106, + "rewards/format_reward": 1.5, + "step": 3898 + }, + { + "clip_ratio": 0.0034133917652070522, + "epoch": 0.1454782892588965, + "grad_norm": 0.07742492109537125, + "kl": 0.017913818359375, + "learning_rate": 2.0966986382611654e-06, + "loss": -0.0171, + "step": 3899 + }, + { + "clip_ratio": 0.0035159512772224844, + "epoch": 0.14551560095144817, + "grad_norm": 0.0736151784658432, + "kl": 0.0180816650390625, + "learning_rate": 2.0947923288203713e-06, + "loss": -0.0174, + "step": 3900 + }, + { + "clip_ratio": 0.0025500780902802944, + "completion_length": 692.3214797973633, + "epoch": 0.1455529126439998, + "grad_norm": 0.05828431248664856, + "kl": 0.02294921875, + "learning_rate": 2.092887448141463e-06, + "loss": 0.0428, + "num_tokens": 91311053.0, + "reward": 0.626797329634428, + "reward_std": 0.14838935621082783, + "rewards/code_reward": 0.47947587817907333, + "rewards/format_reward": 1.4732142984867096, + "step": 3901 + }, + { + "clip_ratio": 0.0026450997102074325, + "epoch": 0.14559022433655147, + "grad_norm": 0.05561436712741852, + "kl": 0.0233154296875, + "learning_rate": 2.0909839970236918e-06, + "loss": 0.0428, + "step": 3902 + }, + { + "clip_ratio": 0.002811238693539053, + "epoch": 0.1456275360291031, + "grad_norm": 0.053102150559425354, + "kl": 0.022979736328125, + "learning_rate": 2.0890819762657107e-06, + "loss": 0.0426, + "step": 3903 + }, + { + "clip_ratio": 0.004711119283456355, + "completion_length": 645.5357360839844, + "epoch": 0.14566484772165478, + "grad_norm": 0.08250270783901215, + "kl": 0.023040771484375, + "learning_rate": 2.0871813866655745e-06, + "loss": -0.0036, + "num_tokens": 91382777.0, + "reward": 0.5671445280313492, + "reward_std": 0.29142579808831215, + "rewards/code_reward": 0.4171445108950138, + "rewards/format_reward": 1.5, + "step": 3904 + }, + { + "clip_ratio": 0.004302107670810074, + "epoch": 0.14570215941420642, + "grad_norm": 0.08532547205686569, + "kl": 0.0226898193359375, + "learning_rate": 2.0852822290207307e-06, + "loss": -0.0039, + "step": 3905 + }, + { + "clip_ratio": 0.0038030585274100304, + "epoch": 0.14573947110675808, + "grad_norm": 0.08396761119365692, + "kl": 0.022125244140625, + "learning_rate": 2.0833845041280353e-06, + "loss": -0.0041, + "step": 3906 + }, + { + "clip_ratio": 0.0030621368205174804, + "completion_length": 659.303596496582, + "epoch": 0.14577678279930972, + "grad_norm": 0.07354544848203659, + "kl": 0.035552978515625, + "learning_rate": 2.081488212783735e-06, + "loss": -0.0016, + "num_tokens": 91455426.0, + "reward": 0.9271969944238663, + "reward_std": 0.24524822924286127, + "rewards/code_reward": 0.7791612148284912, + "rewards/format_reward": 1.480357140302658, + "step": 3907 + }, + { + "clip_ratio": 0.0030272912699729204, + "epoch": 0.1458140944918614, + "grad_norm": 0.07422751188278198, + "kl": 0.035888671875, + "learning_rate": 2.0795933557834785e-06, + "loss": -0.0018, + "step": 3908 + }, + { + "clip_ratio": 0.003060771618038416, + "epoch": 0.14585140618441303, + "grad_norm": 0.07618630677461624, + "kl": 0.04046630859375, + "learning_rate": 2.077699933922317e-06, + "loss": -0.0018, + "step": 3909 + }, + { + "clip_ratio": 0.0031309796031564474, + "completion_length": 634.2500228881836, + "epoch": 0.1458887178769647, + "grad_norm": 0.1981993168592453, + "kl": 0.02203369140625, + "learning_rate": 2.07580794799469e-06, + "loss": -0.0012, + "num_tokens": 91521030.0, + "reward": 0.9657025188207626, + "reward_std": 0.2435181364417076, + "rewards/code_reward": 0.8157025128602982, + "rewards/format_reward": 1.5, + "step": 3910 + }, + { + "clip_ratio": 0.0027121230377815664, + "epoch": 0.14592602956951634, + "grad_norm": 0.08704012632369995, + "kl": 0.0224761962890625, + "learning_rate": 2.073917398794443e-06, + "loss": -0.0014, + "step": 3911 + }, + { + "clip_ratio": 0.0026445388793945312, + "epoch": 0.145963341262068, + "grad_norm": 0.08596073091030121, + "kl": 0.0223388671875, + "learning_rate": 2.0720282871148132e-06, + "loss": -0.0017, + "step": 3912 + }, + { + "clip_ratio": 0.0030410593026317656, + "completion_length": 842.6607360839844, + "epoch": 0.14600065295461964, + "grad_norm": 0.055559299886226654, + "kl": 0.01377105712890625, + "learning_rate": 2.0701406137484376e-06, + "loss": 0.0105, + "num_tokens": 91600811.0, + "reward": 0.655251283198595, + "reward_std": 0.10139788687229156, + "rewards/code_reward": 0.5052512213587761, + "rewards/format_reward": 1.5, + "step": 3913 + }, + { + "clip_ratio": 0.0031193341710604727, + "epoch": 0.1460379646471713, + "grad_norm": 0.053860437124967575, + "kl": 0.01360321044921875, + "learning_rate": 2.06825437948735e-06, + "loss": 0.0105, + "step": 3914 + }, + { + "clip_ratio": 0.0029562487034127116, + "epoch": 0.14607527633972295, + "grad_norm": 0.053857795894145966, + "kl": 0.0137786865234375, + "learning_rate": 2.0663695851229764e-06, + "loss": 0.0106, + "step": 3915 + }, + { + "clip_ratio": 0.004904755100142211, + "completion_length": 607.9286041259766, + "epoch": 0.14611258803227462, + "grad_norm": 0.08076325058937073, + "kl": 0.02215576171875, + "learning_rate": 2.0644862314461433e-06, + "loss": -0.0095, + "num_tokens": 91669967.0, + "reward": 0.6888682954013348, + "reward_std": 0.2801191322505474, + "rewards/code_reward": 0.5388682782649994, + "rewards/format_reward": 1.5, + "step": 3916 + }, + { + "clip_ratio": 0.004619924526195973, + "epoch": 0.14614989972482625, + "grad_norm": 0.07754591107368469, + "kl": 0.0224609375, + "learning_rate": 2.062604319247071e-06, + "loss": -0.0097, + "step": 3917 + }, + { + "clip_ratio": 0.0045085587771609426, + "epoch": 0.14618721141737792, + "grad_norm": 0.07647739350795746, + "kl": 0.0213623046875, + "learning_rate": 2.0607238493153754e-06, + "loss": -0.0097, + "step": 3918 + }, + { + "clip_ratio": 0.001816131523810327, + "completion_length": 503.67859649658203, + "epoch": 0.14622452310992956, + "grad_norm": 0.04736844822764397, + "kl": 0.016387939453125, + "learning_rate": 2.058844822440067e-06, + "loss": 0.0005, + "num_tokens": 91718683.0, + "reward": 1.0071428418159485, + "reward_std": 0.12838813662528992, + "rewards/code_reward": 0.8571428582072258, + "rewards/format_reward": 1.5, + "step": 3919 + }, + { + "clip_ratio": 0.0015640448546037078, + "epoch": 0.14626183480248123, + "grad_norm": 0.04730404540896416, + "kl": 0.016326904296875, + "learning_rate": 2.0569672394095474e-06, + "loss": 0.0005, + "step": 3920 + }, + { + "clip_ratio": 0.0014725134242326021, + "epoch": 0.1462991464950329, + "grad_norm": 0.04720151051878929, + "kl": 0.0163116455078125, + "learning_rate": 2.0550911010116203e-06, + "loss": 0.0004, + "step": 3921 + }, + { + "clip_ratio": 0.005286343104671687, + "completion_length": 691.2857437133789, + "epoch": 0.14633645818758453, + "grad_norm": 0.06454858183860779, + "kl": 0.023651123046875, + "learning_rate": 2.053216408033474e-06, + "loss": 0.0042, + "num_tokens": 91804421.0, + "reward": 0.3832341134548187, + "reward_std": 0.08756693731993437, + "rewards/code_reward": 0.2332341026631184, + "rewards/format_reward": 1.5, + "step": 3922 + }, + { + "clip_ratio": 0.004801828588824719, + "epoch": 0.1463737698801362, + "grad_norm": 0.06574150919914246, + "kl": 0.02410888671875, + "learning_rate": 2.0513431612616975e-06, + "loss": 0.0044, + "step": 3923 + }, + { + "clip_ratio": 0.004491302592214197, + "epoch": 0.14641108157268784, + "grad_norm": 0.06127043813467026, + "kl": 0.024017333984375, + "learning_rate": 2.0494713614822703e-06, + "loss": 0.0042, + "step": 3924 + }, + { + "clip_ratio": 0.004104515246581286, + "completion_length": 750.6786041259766, + "epoch": 0.1464483932652395, + "grad_norm": 0.07785362005233765, + "kl": 0.0213775634765625, + "learning_rate": 2.0476010094805627e-06, + "loss": 0.0283, + "num_tokens": 91878701.0, + "reward": 0.831712156534195, + "reward_std": 0.19916533480864018, + "rewards/code_reward": 0.6817121282219887, + "rewards/format_reward": 1.5, + "step": 3925 + }, + { + "clip_ratio": 0.004158890573307872, + "epoch": 0.14648570495779115, + "grad_norm": 0.08044275641441345, + "kl": 0.0214996337890625, + "learning_rate": 2.045732106041342e-06, + "loss": 0.0285, + "step": 3926 + }, + { + "clip_ratio": 0.003926556033547968, + "epoch": 0.1465230166503428, + "grad_norm": 0.07543592154979706, + "kl": 0.0211639404296875, + "learning_rate": 2.0438646519487652e-06, + "loss": 0.0282, + "step": 3927 + }, + { + "clip_ratio": 0.003979977569542825, + "completion_length": 828.8750610351562, + "epoch": 0.14656032834289445, + "grad_norm": 0.0770588293671608, + "kl": 0.0216217041015625, + "learning_rate": 2.041998647986381e-06, + "loss": 0.014, + "num_tokens": 91956856.0, + "reward": 0.37542326003313065, + "reward_std": 0.22441309480927885, + "rewards/code_reward": 0.22810181765817106, + "rewards/format_reward": 1.4732142984867096, + "step": 3928 + }, + { + "clip_ratio": 0.00365048018284142, + "epoch": 0.14659764003544612, + "grad_norm": 0.08162719756364822, + "kl": 0.02130126953125, + "learning_rate": 2.0401340949371297e-06, + "loss": 0.014, + "step": 3929 + }, + { + "clip_ratio": 0.003929362166672945, + "epoch": 0.14663495172799776, + "grad_norm": 0.07493540644645691, + "kl": 0.022003173828125, + "learning_rate": 2.0382709935833424e-06, + "loss": 0.0139, + "step": 3930 + }, + { + "clip_ratio": 0.0036317461635917425, + "completion_length": 522.6071701049805, + "epoch": 0.14667226342054943, + "grad_norm": 0.06618764251470566, + "kl": 0.02117919921875, + "learning_rate": 2.0364093447067424e-06, + "loss": -0.0112, + "num_tokens": 92011168.0, + "reward": 0.3994607664644718, + "reward_std": 0.20787282288074493, + "rewards/code_reward": 0.24946074979379773, + "rewards/format_reward": 1.5, + "step": 3931 + }, + { + "clip_ratio": 0.00398313719779253, + "epoch": 0.14670957511310107, + "grad_norm": 0.06591051071882248, + "kl": 0.021331787109375, + "learning_rate": 2.034549149088444e-06, + "loss": -0.0112, + "step": 3932 + }, + { + "clip_ratio": 0.003447895054705441, + "epoch": 0.14674688680565273, + "grad_norm": 0.06700775027275085, + "kl": 0.020751953125, + "learning_rate": 2.032690407508949e-06, + "loss": -0.0113, + "step": 3933 + }, + { + "clip_ratio": 0.0006905851187184453, + "completion_length": 546.6428833007812, + "epoch": 0.14678419849820437, + "grad_norm": 0.02994932234287262, + "kl": 0.03399658203125, + "learning_rate": 2.0308331207481524e-06, + "loss": -0.0032, + "num_tokens": 92062710.0, + "reward": 0.8073129132390022, + "reward_std": 0.05906205624341965, + "rewards/code_reward": 0.6573129259049892, + "rewards/format_reward": 1.5, + "step": 3934 + }, + { + "clip_ratio": 0.0007580694509670138, + "epoch": 0.14682151019075604, + "grad_norm": 0.02526039443910122, + "kl": 0.034088134765625, + "learning_rate": 2.0289772895853354e-06, + "loss": -0.0032, + "step": 3935 + }, + { + "clip_ratio": 0.0008882270194590092, + "epoch": 0.14685882188330768, + "grad_norm": 0.03344748914241791, + "kl": 0.034088134765625, + "learning_rate": 2.0271229147991726e-06, + "loss": -0.0032, + "step": 3936 + }, + { + "clip_ratio": 0.004468613187782466, + "completion_length": 954.7857666015625, + "epoch": 0.14689613357585934, + "grad_norm": 0.051484428346157074, + "kl": 0.0171356201171875, + "learning_rate": 2.0252699971677246e-06, + "loss": 0.0407, + "num_tokens": 92156064.0, + "reward": 0.39203042536973953, + "reward_std": 0.19662753492593765, + "rewards/code_reward": 0.2447089971974492, + "rewards/format_reward": 1.4732142984867096, + "step": 3937 + }, + { + "clip_ratio": 0.004584223730489612, + "epoch": 0.14693344526841098, + "grad_norm": 0.059601880609989166, + "kl": 0.0174407958984375, + "learning_rate": 2.0234185374684395e-06, + "loss": 0.0408, + "step": 3938 + }, + { + "clip_ratio": 0.004148234031163156, + "epoch": 0.14697075696096265, + "grad_norm": 0.05329522490501404, + "kl": 0.0172119140625, + "learning_rate": 2.0215685364781578e-06, + "loss": 0.0407, + "step": 3939 + }, + { + "clip_ratio": 0.003670826437883079, + "completion_length": 633.2857437133789, + "epoch": 0.1470080686535143, + "grad_norm": 0.09075573086738586, + "kl": 0.0167236328125, + "learning_rate": 2.019719994973103e-06, + "loss": 0.014, + "num_tokens": 92225296.0, + "reward": 0.550131943076849, + "reward_std": 0.07921394845470786, + "rewards/code_reward": 0.4001319380477071, + "rewards/format_reward": 1.5, + "step": 3940 + }, + { + "clip_ratio": 0.003499888873193413, + "epoch": 0.14704538034606596, + "grad_norm": 0.08646620810031891, + "kl": 0.0171051025390625, + "learning_rate": 2.0178729137288906e-06, + "loss": 0.0137, + "step": 3941 + }, + { + "clip_ratio": 0.00348064920399338, + "epoch": 0.1470826920386176, + "grad_norm": 0.09058276563882828, + "kl": 0.0167388916015625, + "learning_rate": 2.016027293520521e-06, + "loss": 0.0137, + "step": 3942 + }, + { + "clip_ratio": 0.004795813467353582, + "completion_length": 775.8214721679688, + "epoch": 0.14712000373116926, + "grad_norm": 0.07738854736089706, + "kl": 0.020782470703125, + "learning_rate": 2.014183135122382e-06, + "loss": -0.0151, + "num_tokens": 92302986.0, + "reward": 0.318377286195755, + "reward_std": 0.08835185167845339, + "rewards/code_reward": 0.168377258698456, + "rewards/format_reward": 1.5, + "step": 3943 + }, + { + "clip_ratio": 0.004929931368678808, + "epoch": 0.1471573154237209, + "grad_norm": 0.07893036305904388, + "kl": 0.019775390625, + "learning_rate": 2.0123404393082493e-06, + "loss": -0.0152, + "step": 3944 + }, + { + "clip_ratio": 0.00510617985855788, + "epoch": 0.14719462711627257, + "grad_norm": 0.07934248447418213, + "kl": 0.0199432373046875, + "learning_rate": 2.0104992068512824e-06, + "loss": -0.0151, + "step": 3945 + }, + { + "clip_ratio": 0.005073447595350444, + "completion_length": 718.4285888671875, + "epoch": 0.1472319388088242, + "grad_norm": 0.07902862131595612, + "kl": 0.02362060546875, + "learning_rate": 2.0086594385240283e-06, + "loss": -0.0168, + "num_tokens": 92371022.0, + "reward": 0.39815831929445267, + "reward_std": 0.27394941076636314, + "rewards/code_reward": 0.24815830402076244, + "rewards/format_reward": 1.5, + "step": 3946 + }, + { + "clip_ratio": 0.005680040922015905, + "epoch": 0.14726925050137588, + "grad_norm": 0.08192262798547745, + "kl": 0.0234375, + "learning_rate": 2.006821135098422e-06, + "loss": -0.0169, + "step": 3947 + }, + { + "clip_ratio": 0.005198057042434812, + "epoch": 0.14730656219392751, + "grad_norm": 0.07973082363605499, + "kl": 0.023773193359375, + "learning_rate": 2.0049842973457775e-06, + "loss": -0.0172, + "step": 3948 + }, + { + "clip_ratio": 0.002814611652866006, + "completion_length": 878.2678985595703, + "epoch": 0.14734387388647918, + "grad_norm": 1603610368.0, + "kl": 616562688.0135651, + "learning_rate": 2.0031489260368016e-06, + "loss": 6166972.5, + "num_tokens": 92457095.0, + "reward": 0.428465373814106, + "reward_std": 0.09685748722404242, + "rewards/code_reward": 0.2811439037322998, + "rewards/format_reward": 1.4732142984867096, + "step": 3949 + }, + { + "clip_ratio": 0.0031315513188019395, + "epoch": 0.14738118557903082, + "grad_norm": 311984320.0, + "kl": 121110528.01348877, + "learning_rate": 2.0013150219415796e-06, + "loss": 1213046.75, + "step": 3950 + }, + { + "clip_ratio": 0.0033322179224342108, + "epoch": 0.1474184972715825, + "grad_norm": 8771390.0, + "kl": 2228224.0133361816, + "learning_rate": 1.999482585829583e-06, + "loss": 22255.5332, + "step": 3951 + }, + { + "clip_ratio": 0.0024677393375895917, + "completion_length": 627.5536041259766, + "epoch": 0.14745580896413413, + "grad_norm": 0.07292172312736511, + "kl": 0.0183868408203125, + "learning_rate": 1.9976516184696704e-06, + "loss": 0.0032, + "num_tokens": 92533188.0, + "reward": 0.9942495226860046, + "reward_std": 0.08232991071417928, + "rewards/code_reward": 0.844249501824379, + "rewards/format_reward": 1.5, + "step": 3952 + }, + { + "clip_ratio": 0.002471443614922464, + "epoch": 0.1474931206566858, + "grad_norm": 0.07197274267673492, + "kl": 0.0178985595703125, + "learning_rate": 1.9958221206300805e-06, + "loss": 0.0033, + "step": 3953 + }, + { + "clip_ratio": 0.0031995090539567173, + "epoch": 0.14753043234923743, + "grad_norm": 0.07277244329452515, + "kl": 0.018157958984375, + "learning_rate": 1.993994093078437e-06, + "loss": 0.0034, + "step": 3954 + }, + { + "clip_ratio": 0.003201571642421186, + "completion_length": 602.8750305175781, + "epoch": 0.1475677440417891, + "grad_norm": 0.06032898277044296, + "kl": 0.02618408203125, + "learning_rate": 1.9921675365817458e-06, + "loss": 0.0098, + "num_tokens": 92603685.0, + "reward": 0.7672836780548096, + "reward_std": 0.12050987780094147, + "rewards/code_reward": 0.6172836869955063, + "rewards/format_reward": 1.5, + "step": 3955 + }, + { + "clip_ratio": 0.003094985382631421, + "epoch": 0.14760505573434074, + "grad_norm": 0.05894092470407486, + "kl": 0.026092529296875, + "learning_rate": 1.9903424519063953e-06, + "loss": 0.0098, + "step": 3956 + }, + { + "clip_ratio": 0.0037705327849835157, + "epoch": 0.1476423674268924, + "grad_norm": 0.0558314323425293, + "kl": 0.026092529296875, + "learning_rate": 1.9885188398181597e-06, + "loss": 0.0097, + "step": 3957 + }, + { + "clip_ratio": 0.0030977383139543235, + "completion_length": 666.1071853637695, + "epoch": 0.14767967911944405, + "grad_norm": 0.04904712364077568, + "kl": 0.026885986328125, + "learning_rate": 1.9866967010821903e-06, + "loss": -0.0108, + "num_tokens": 92670989.0, + "reward": 0.5785714127123356, + "reward_std": 0.11538255959749222, + "rewards/code_reward": 0.4285714253783226, + "rewards/format_reward": 1.5, + "step": 3958 + }, + { + "clip_ratio": 0.0031081921188160777, + "epoch": 0.1477169908119957, + "grad_norm": 0.04739425703883171, + "kl": 0.027618408203125, + "learning_rate": 1.984876036463023e-06, + "loss": -0.0108, + "step": 3959 + }, + { + "clip_ratio": 0.0032783743808977306, + "epoch": 0.14775430250454735, + "grad_norm": 0.04711928218603134, + "kl": 0.026092529296875, + "learning_rate": 1.9830568467245753e-06, + "loss": -0.011, + "step": 3960 + }, + { + "clip_ratio": 0.0043797967955470085, + "completion_length": 656.803596496582, + "epoch": 0.14779161419709902, + "grad_norm": 0.07830507308244705, + "kl": 0.01922607421875, + "learning_rate": 1.9812391326301458e-06, + "loss": -0.003, + "num_tokens": 92735686.0, + "reward": 0.3726806603372097, + "reward_std": 0.17973280232399702, + "rewards/code_reward": 0.22268061712384224, + "rewards/format_reward": 1.5, + "step": 3961 + }, + { + "clip_ratio": 0.003963778028264642, + "epoch": 0.14782892588965066, + "grad_norm": 0.06769536435604095, + "kl": 0.018829345703125, + "learning_rate": 1.9794228949424142e-06, + "loss": -0.003, + "step": 3962 + }, + { + "clip_ratio": 0.004377734090667218, + "epoch": 0.14786623758220233, + "grad_norm": 0.06683240085840225, + "kl": 0.019012451171875, + "learning_rate": 1.977608134423438e-06, + "loss": -0.0032, + "step": 3963 + }, + { + "clip_ratio": 0.002447206585202366, + "completion_length": 543.0535888671875, + "epoch": 0.14790354927475396, + "grad_norm": 0.06884130090475082, + "kl": 0.0190887451171875, + "learning_rate": 1.9757948518346585e-06, + "loss": 0.0202, + "num_tokens": 92799123.0, + "reward": 0.8541645966470242, + "reward_std": 0.11640492454171181, + "rewards/code_reward": 0.7041645366698503, + "rewards/format_reward": 1.5, + "step": 3964 + }, + { + "clip_ratio": 0.0024120197631418705, + "epoch": 0.14794086096730563, + "grad_norm": 0.07513725012540817, + "kl": 0.018768310546875, + "learning_rate": 1.9739830479368956e-06, + "loss": 0.02, + "step": 3965 + }, + { + "clip_ratio": 0.00267921038903296, + "epoch": 0.14797817265985727, + "grad_norm": 0.06758132576942444, + "kl": 0.0198211669921875, + "learning_rate": 1.972172723490347e-06, + "loss": 0.0201, + "step": 3966 + }, + { + "clip_ratio": 0.00460785033646971, + "completion_length": 881.8750457763672, + "epoch": 0.14801548435240894, + "grad_norm": 0.07627296447753906, + "kl": 0.019378662109375, + "learning_rate": 1.9703638792545933e-06, + "loss": 0.0341, + "num_tokens": 92882542.0, + "reward": 0.38098228722810745, + "reward_std": 0.136549006216228, + "rewards/code_reward": 0.23366083623841405, + "rewards/format_reward": 1.4732142984867096, + "step": 3967 + }, + { + "clip_ratio": 0.003911683219484985, + "epoch": 0.14805279604496058, + "grad_norm": 0.06941252946853638, + "kl": 0.018890380859375, + "learning_rate": 1.96855651598859e-06, + "loss": 0.034, + "step": 3968 + }, + { + "clip_ratio": 0.004776505287736654, + "epoch": 0.14809010773751224, + "grad_norm": 0.06621438264846802, + "kl": 0.01873779296875, + "learning_rate": 1.966750634450674e-06, + "loss": 0.0342, + "step": 3969 + }, + { + "clip_ratio": 0.005258074612356722, + "completion_length": 767.0000228881836, + "epoch": 0.14812741943006388, + "grad_norm": 0.0311006810516119, + "kl": 0.0226898193359375, + "learning_rate": 1.964946235398559e-06, + "loss": 0.0049, + "num_tokens": 92955944.0, + "reward": 0.20212166756391525, + "reward_std": 0.00410268222913146, + "rewards/code_reward": 0.052121645072475076, + "rewards/format_reward": 1.5, + "step": 3970 + }, + { + "clip_ratio": 0.005490544193889946, + "epoch": 0.14816473112261555, + "grad_norm": 0.03103722631931305, + "kl": 0.0220489501953125, + "learning_rate": 1.963143319589339e-06, + "loss": 0.0049, + "step": 3971 + }, + { + "clip_ratio": 0.005629284889437258, + "epoch": 0.1482020428151672, + "grad_norm": 0.02887379378080368, + "kl": 0.021087646484375, + "learning_rate": 1.9613418877794835e-06, + "loss": 0.0049, + "step": 3972 + }, + { + "clip_ratio": 0.0038493716274388134, + "completion_length": 679.8750228881836, + "epoch": 0.14823935450771886, + "grad_norm": 0.1683398336172104, + "kl": 0.0196685791015625, + "learning_rate": 1.959541940724838e-06, + "loss": -0.0077, + "num_tokens": 93023137.0, + "reward": 0.4758523032069206, + "reward_std": 0.27354632318019867, + "rewards/code_reward": 0.32585229352116585, + "rewards/format_reward": 1.5, + "step": 3973 + }, + { + "clip_ratio": 0.004209749400615692, + "epoch": 0.1482766662002705, + "grad_norm": 0.1074850857257843, + "kl": 0.019561767578125, + "learning_rate": 1.9577434791806283e-06, + "loss": -0.0077, + "step": 3974 + }, + { + "clip_ratio": 0.004108972032554448, + "epoch": 0.14831397789282216, + "grad_norm": 0.09626840054988861, + "kl": 0.019378662109375, + "learning_rate": 1.9559465039014565e-06, + "loss": -0.008, + "step": 3975 + }, + { + "clip_ratio": 0.004669969785027206, + "completion_length": 803.8750305175781, + "epoch": 0.14835128958537383, + "grad_norm": 0.0946834459900856, + "kl": 0.018707275390625, + "learning_rate": 1.9541510156412973e-06, + "loss": 0.0087, + "num_tokens": 93101362.0, + "reward": 0.23989814892411232, + "reward_std": 0.17171266861259937, + "rewards/code_reward": 0.09257669188082218, + "rewards/format_reward": 1.4732142984867096, + "step": 3976 + }, + { + "clip_ratio": 0.004352353513240814, + "epoch": 0.14838860127792547, + "grad_norm": 0.06697800755500793, + "kl": 0.01806640625, + "learning_rate": 1.952357015153506e-06, + "loss": 0.0086, + "step": 3977 + }, + { + "clip_ratio": 0.004590030410327017, + "epoch": 0.14842591297047714, + "grad_norm": 0.06708981096744537, + "kl": 0.01898193359375, + "learning_rate": 1.9505645031908116e-06, + "loss": 0.0085, + "step": 3978 + }, + { + "clip_ratio": 0.0042596517014317214, + "completion_length": 711.8036041259766, + "epoch": 0.14846322466302878, + "grad_norm": 0.08062521368265152, + "kl": 0.01788330078125, + "learning_rate": 1.948773480505319e-06, + "loss": 0.0058, + "num_tokens": 93173661.0, + "reward": 0.43850360438227654, + "reward_std": 0.2667864318937063, + "rewards/code_reward": 0.28850357234477997, + "rewards/format_reward": 1.5, + "step": 3979 + }, + { + "clip_ratio": 0.004517586319707334, + "epoch": 0.14850053635558044, + "grad_norm": 0.08214481174945831, + "kl": 0.0172271728515625, + "learning_rate": 1.9469839478485085e-06, + "loss": 0.006, + "step": 3980 + }, + { + "clip_ratio": 0.004067154950462282, + "epoch": 0.14853784804813208, + "grad_norm": 0.07693324238061905, + "kl": 0.017303466796875, + "learning_rate": 1.9451959059712337e-06, + "loss": 0.0058, + "step": 3981 + }, + { + "clip_ratio": 0.003178951039444655, + "completion_length": 644.6250305175781, + "epoch": 0.14857515974068375, + "grad_norm": 0.08307460695505142, + "kl": 0.0169830322265625, + "learning_rate": 1.9434093556237255e-06, + "loss": 0.0061, + "num_tokens": 93242086.0, + "reward": 0.5928689353168011, + "reward_std": 0.22252137400209904, + "rewards/code_reward": 0.4428689330816269, + "rewards/format_reward": 1.5, + "step": 3982 + }, + { + "clip_ratio": 0.0035062338574789464, + "epoch": 0.1486124714332354, + "grad_norm": 0.08169068396091461, + "kl": 0.019256591796875, + "learning_rate": 1.9416242975555843e-06, + "loss": 0.0062, + "step": 3983 + }, + { + "clip_ratio": 0.0032830663258209825, + "epoch": 0.14864978312578705, + "grad_norm": 0.07650884985923767, + "kl": 0.0189361572265625, + "learning_rate": 1.939840732515789e-06, + "loss": 0.0059, + "step": 3984 + }, + { + "clip_ratio": 0.0029784546932205558, + "completion_length": 797.4286041259766, + "epoch": 0.1486870948183387, + "grad_norm": 0.06764723360538483, + "kl": 0.017852783203125, + "learning_rate": 1.9380586612526915e-06, + "loss": -0.0181, + "num_tokens": 93314784.0, + "reward": 0.6185807064175606, + "reward_std": 0.27473757416009903, + "rewards/code_reward": 0.4685806892812252, + "rewards/format_reward": 1.5, + "step": 3985 + }, + { + "clip_ratio": 0.0032050879090093076, + "epoch": 0.14872440651089036, + "grad_norm": 0.06990326941013336, + "kl": 0.017822265625, + "learning_rate": 1.9362780845140136e-06, + "loss": -0.0182, + "step": 3986 + }, + { + "clip_ratio": 0.00330150593072176, + "epoch": 0.148761718203442, + "grad_norm": 0.06427579373121262, + "kl": 0.01806640625, + "learning_rate": 1.934499003046853e-06, + "loss": -0.0182, + "step": 3987 + }, + { + "clip_ratio": 0.001577143499162048, + "completion_length": 569.5178833007812, + "epoch": 0.14879902989599367, + "grad_norm": 0.060806918889284134, + "kl": 0.0196533203125, + "learning_rate": 1.93272141759768e-06, + "loss": -0.0112, + "num_tokens": 93378555.0, + "reward": 1.0391321331262589, + "reward_std": 0.1607881337404251, + "rewards/code_reward": 0.8891321122646332, + "rewards/format_reward": 1.5, + "step": 3988 + }, + { + "clip_ratio": 0.0015943844337016344, + "epoch": 0.1488363415885453, + "grad_norm": 0.0652041956782341, + "kl": 0.0203704833984375, + "learning_rate": 1.9309453289123355e-06, + "loss": -0.0113, + "step": 3989 + }, + { + "clip_ratio": 0.0013747168704867363, + "epoch": 0.14887365328109697, + "grad_norm": 0.0609813891351223, + "kl": 0.019195556640625, + "learning_rate": 1.9291707377360354e-06, + "loss": -0.0114, + "step": 3990 + }, + { + "clip_ratio": 0.004382300132419914, + "completion_length": 620.8214569091797, + "epoch": 0.1489109649736486, + "grad_norm": 0.10501217842102051, + "kl": 0.0189208984375, + "learning_rate": 1.927397644813361e-06, + "loss": 0.0044, + "num_tokens": 93447719.0, + "reward": 0.8562550917267799, + "reward_std": 0.252030860632658, + "rewards/code_reward": 0.7062550485134125, + "rewards/format_reward": 1.5, + "step": 3991 + }, + { + "clip_ratio": 0.003907244885340333, + "epoch": 0.14894827666620028, + "grad_norm": 0.1057393029332161, + "kl": 0.019134521484375, + "learning_rate": 1.925626050888273e-06, + "loss": 0.0041, + "step": 3992 + }, + { + "clip_ratio": 0.003941576462239027, + "epoch": 0.14898558835875192, + "grad_norm": 0.09805825352668762, + "kl": 0.01910400390625, + "learning_rate": 1.923855956704099e-06, + "loss": 0.0039, + "step": 3993 + }, + { + "clip_ratio": 0.003713191021233797, + "completion_length": 746.0000610351562, + "epoch": 0.1490229000513036, + "grad_norm": 0.059565991163253784, + "kl": 0.01245880126953125, + "learning_rate": 1.9220873630035353e-06, + "loss": 0.0204, + "num_tokens": 93523837.0, + "reward": 0.5673598684370518, + "reward_std": 0.20269012916833162, + "rewards/code_reward": 0.41735984408296645, + "rewards/format_reward": 1.5, + "step": 3994 + }, + { + "clip_ratio": 0.003166793379932642, + "epoch": 0.14906021174385523, + "grad_norm": 0.05980939790606499, + "kl": 0.0124359130859375, + "learning_rate": 1.920320270528652e-06, + "loss": 0.0201, + "step": 3995 + }, + { + "clip_ratio": 0.0032305470085702837, + "epoch": 0.1490975234364069, + "grad_norm": 0.060419633984565735, + "kl": 0.01262664794921875, + "learning_rate": 1.918554680020889e-06, + "loss": 0.02, + "step": 3996 + }, + { + "clip_ratio": 0.0032099546515382826, + "completion_length": 551.7500305175781, + "epoch": 0.14913483512895853, + "grad_norm": 0.06285125017166138, + "kl": 0.0188446044921875, + "learning_rate": 1.9167905922210546e-06, + "loss": 0.0049, + "num_tokens": 93584569.0, + "reward": 0.5060581006109715, + "reward_std": 0.1538875438272953, + "rewards/code_reward": 0.35605807788670063, + "rewards/format_reward": 1.5, + "step": 3997 + }, + { + "clip_ratio": 0.0033415677025914192, + "epoch": 0.1491721468215102, + "grad_norm": 0.0631854236125946, + "kl": 0.019287109375, + "learning_rate": 1.9150280078693285e-06, + "loss": 0.0049, + "step": 3998 + }, + { + "clip_ratio": 0.0029685100307688117, + "epoch": 0.14920945851406184, + "grad_norm": 0.06127069517970085, + "kl": 0.019927978515625, + "learning_rate": 1.913266927705257e-06, + "loss": 0.0049, + "step": 3999 + }, + { + "clip_ratio": 0.004705242346972227, + "completion_length": 694.2143096923828, + "epoch": 0.1492467702066135, + "grad_norm": 0.08748254179954529, + "kl": 0.02105712890625, + "learning_rate": 1.9115073524677572e-06, + "loss": 0.0076, + "num_tokens": 93652537.0, + "reward": 0.6440439485013485, + "reward_std": 0.180202417075634, + "rewards/code_reward": 0.49404391972348094, + "rewards/format_reward": 1.5, + "step": 4000 + }, + { + "clip_ratio": 0.004660602891817689, + "epoch": 0.14928408189916514, + "grad_norm": 0.08627671748399734, + "kl": 0.02069091796875, + "learning_rate": 1.9097492828951134e-06, + "loss": 0.0078, + "step": 4001 + }, + { + "clip_ratio": 0.004203109419904649, + "epoch": 0.1493213935917168, + "grad_norm": 0.08581000566482544, + "kl": 0.020843505859375, + "learning_rate": 1.907992719724979e-06, + "loss": 0.0074, + "step": 4002 + }, + { + "clip_ratio": 0.004215803404804319, + "completion_length": 668.1428985595703, + "epoch": 0.14935870528426845, + "grad_norm": 0.07738114893436432, + "kl": 0.021575927734375, + "learning_rate": 1.906237663694378e-06, + "loss": 0.0155, + "num_tokens": 93730791.0, + "reward": 0.48399563878774643, + "reward_std": 0.25003697723150253, + "rewards/code_reward": 0.33399561792612076, + "rewards/format_reward": 1.5, + "step": 4003 + }, + { + "clip_ratio": 0.004028263327199966, + "epoch": 0.14939601697682012, + "grad_norm": 0.07907000184059143, + "kl": 0.022003173828125, + "learning_rate": 1.9044841155396949e-06, + "loss": 0.0154, + "step": 4004 + }, + { + "clip_ratio": 0.003859634685795754, + "epoch": 0.14943332866937176, + "grad_norm": 0.07921607792377472, + "kl": 0.0218505859375, + "learning_rate": 1.9027320759966882e-06, + "loss": 0.0154, + "step": 4005 + }, + { + "clip_ratio": 0.0037813230883330107, + "completion_length": 625.8750305175781, + "epoch": 0.14947064036192342, + "grad_norm": 0.07782827317714691, + "kl": 0.0174713134765625, + "learning_rate": 1.900981545800482e-06, + "loss": -0.0007, + "num_tokens": 93804984.0, + "reward": 0.6611976027488708, + "reward_std": 0.140300257015042, + "rewards/code_reward": 0.5111975409090519, + "rewards/format_reward": 1.5, + "step": 4006 + }, + { + "clip_ratio": 0.003348255471792072, + "epoch": 0.14950795205447506, + "grad_norm": 0.07268171012401581, + "kl": 0.017822265625, + "learning_rate": 1.8992325256855649e-06, + "loss": -0.0009, + "step": 4007 + }, + { + "clip_ratio": 0.003119231725577265, + "epoch": 0.14954526374702673, + "grad_norm": 0.07647408545017242, + "kl": 0.0175323486328125, + "learning_rate": 1.8974850163857944e-06, + "loss": -0.0009, + "step": 4008 + }, + { + "clip_ratio": 0.0026701720198616385, + "completion_length": 709.3750457763672, + "epoch": 0.14958257543957837, + "grad_norm": 0.06796012818813324, + "kl": 0.016815185546875, + "learning_rate": 1.8957390186343912e-06, + "loss": 0.029, + "num_tokens": 93872863.0, + "reward": 0.8399707525968552, + "reward_std": 0.24065281450748444, + "rewards/code_reward": 0.6899707354605198, + "rewards/format_reward": 1.5, + "step": 4009 + }, + { + "clip_ratio": 0.0024435570230707526, + "epoch": 0.14961988713213004, + "grad_norm": 0.06666920334100723, + "kl": 0.0165252685546875, + "learning_rate": 1.893994533163946e-06, + "loss": 0.0287, + "step": 4010 + }, + { + "clip_ratio": 0.002403818885795772, + "epoch": 0.14965719882468168, + "grad_norm": 0.06610488891601562, + "kl": 0.0164642333984375, + "learning_rate": 1.89225156070641e-06, + "loss": 0.0287, + "step": 4011 + }, + { + "clip_ratio": 0.003963901719544083, + "completion_length": 622.2500305175781, + "epoch": 0.14969451051723334, + "grad_norm": 0.0798511654138565, + "kl": 0.01641845703125, + "learning_rate": 1.8905101019931028e-06, + "loss": -0.003, + "num_tokens": 93942163.0, + "reward": 0.5900910496711731, + "reward_std": 0.23845097329467535, + "rewards/code_reward": 0.4400910325348377, + "rewards/format_reward": 1.5, + "step": 4012 + }, + { + "clip_ratio": 0.004504726442974061, + "epoch": 0.14973182220978498, + "grad_norm": 0.08273596316576004, + "kl": 0.015533447265625, + "learning_rate": 1.8887701577547086e-06, + "loss": -0.0027, + "step": 4013 + }, + { + "clip_ratio": 0.004209313367027789, + "epoch": 0.14976913390233665, + "grad_norm": 0.08220329880714417, + "kl": 0.01596832275390625, + "learning_rate": 1.8870317287212752e-06, + "loss": -0.0028, + "step": 4014 + }, + { + "clip_ratio": 0.002835775085259229, + "completion_length": 523.5535888671875, + "epoch": 0.1498064455948883, + "grad_norm": 0.06259723752737045, + "kl": 0.01781463623046875, + "learning_rate": 1.8852948156222161e-06, + "loss": 0.0145, + "num_tokens": 93995828.0, + "reward": 0.8483805172145367, + "reward_std": 0.1256755255162716, + "rewards/code_reward": 0.6983805000782013, + "rewards/format_reward": 1.5, + "step": 4015 + }, + { + "clip_ratio": 0.002965266816318035, + "epoch": 0.14984375728743995, + "grad_norm": 0.0634375810623169, + "kl": 0.01740264892578125, + "learning_rate": 1.8835594191863078e-06, + "loss": 0.0146, + "step": 4016 + }, + { + "clip_ratio": 0.0023455178015865386, + "epoch": 0.1498810689799916, + "grad_norm": 0.06362677365541458, + "kl": 0.01727294921875, + "learning_rate": 1.8818255401416887e-06, + "loss": 0.0144, + "step": 4017 + }, + { + "clip_ratio": 0.0025614940095692873, + "completion_length": 725.8750305175781, + "epoch": 0.14991838067254326, + "grad_norm": 0.044429995119571686, + "kl": 0.014251708984375, + "learning_rate": 1.8800931792158644e-06, + "loss": 0.0009, + "num_tokens": 94064981.0, + "reward": 0.8179835937917233, + "reward_std": 0.14721234142780304, + "rewards/code_reward": 0.6679835617542267, + "rewards/format_reward": 1.5, + "step": 4018 + }, + { + "clip_ratio": 0.002788447367493063, + "epoch": 0.1499556923650949, + "grad_norm": 0.04461495205760002, + "kl": 0.014434814453125, + "learning_rate": 1.878362337135699e-06, + "loss": 0.0011, + "step": 4019 + }, + { + "clip_ratio": 0.0024074516259133816, + "epoch": 0.14999300405764657, + "grad_norm": 0.0434550903737545, + "kl": 0.0142059326171875, + "learning_rate": 1.876633014627423e-06, + "loss": 0.0009, + "step": 4020 + }, + { + "clip_ratio": 0.004698403121437877, + "completion_length": 720.1250228881836, + "epoch": 0.1500303157501982, + "grad_norm": 0.07658740133047104, + "kl": 0.019683837890625, + "learning_rate": 1.8749052124166296e-06, + "loss": -0.0306, + "num_tokens": 94129150.0, + "reward": 0.5758495144546032, + "reward_std": 0.27250834181904793, + "rewards/code_reward": 0.4285280928015709, + "rewards/format_reward": 1.4732142984867096, + "step": 4021 + }, + { + "clip_ratio": 0.004904812551103532, + "epoch": 0.15006762744274987, + "grad_norm": 0.09863916039466858, + "kl": 0.0199127197265625, + "learning_rate": 1.8731789312282688e-06, + "loss": -0.0304, + "step": 4022 + }, + { + "clip_ratio": 0.004921187995932996, + "epoch": 0.1501049391353015, + "grad_norm": 0.08056825399398804, + "kl": 0.0201416015625, + "learning_rate": 1.8714541717866586e-06, + "loss": -0.0305, + "step": 4023 + }, + { + "clip_ratio": 0.004006708913948387, + "completion_length": 680.1428833007812, + "epoch": 0.15014225082785318, + "grad_norm": 0.06331422179937363, + "kl": 0.0139312744140625, + "learning_rate": 1.8697309348154757e-06, + "loss": 0.0129, + "num_tokens": 94209840.0, + "reward": 0.3890169821679592, + "reward_std": 0.18366621807217598, + "rewards/code_reward": 0.23901697620749474, + "rewards/format_reward": 1.5, + "step": 4024 + }, + { + "clip_ratio": 0.004097026016097516, + "epoch": 0.15017956252040482, + "grad_norm": 0.06402117758989334, + "kl": 0.0137481689453125, + "learning_rate": 1.8680092210377582e-06, + "loss": 0.0128, + "step": 4025 + }, + { + "clip_ratio": 0.00403916125651449, + "epoch": 0.15021687421295649, + "grad_norm": 0.06552883982658386, + "kl": 0.0134124755859375, + "learning_rate": 1.866289031175906e-06, + "loss": 0.0126, + "step": 4026 + }, + { + "clip_ratio": 0.0024125311174429953, + "completion_length": 636.2500228881836, + "epoch": 0.15025418590550813, + "grad_norm": 0.04700860381126404, + "kl": 0.016021728515625, + "learning_rate": 1.8645703659516774e-06, + "loss": 0.0045, + "num_tokens": 94267506.0, + "reward": 0.5298076830804348, + "reward_std": 0.09998643398284912, + "rewards/code_reward": 0.37980769015848637, + "rewards/format_reward": 1.5, + "step": 4027 + }, + { + "clip_ratio": 0.002808008692227304, + "epoch": 0.1502914975980598, + "grad_norm": 0.047337617725133896, + "kl": 0.0156402587890625, + "learning_rate": 1.8628532260861936e-06, + "loss": 0.0045, + "step": 4028 + }, + { + "clip_ratio": 0.0029026827542111278, + "epoch": 0.15032880929061143, + "grad_norm": 0.045254047960042953, + "kl": 0.0154571533203125, + "learning_rate": 1.8611376122999336e-06, + "loss": 0.0044, + "step": 4029 + }, + { + "clip_ratio": 0.0036196550354361534, + "completion_length": 753.7321853637695, + "epoch": 0.1503661209831631, + "grad_norm": 0.10439632833003998, + "kl": 0.0169219970703125, + "learning_rate": 1.8594235253127373e-06, + "loss": -0.0182, + "num_tokens": 94341985.0, + "reward": 0.667224857956171, + "reward_std": 0.2271574977785349, + "rewards/code_reward": 0.5172248035669327, + "rewards/format_reward": 1.5, + "step": 4030 + }, + { + "clip_ratio": 0.0036729072453454137, + "epoch": 0.15040343267571477, + "grad_norm": 0.10476329177618027, + "kl": 0.01576995849609375, + "learning_rate": 1.8577109658438035e-06, + "loss": -0.0179, + "step": 4031 + }, + { + "clip_ratio": 0.003559533681254834, + "epoch": 0.1504407443682664, + "grad_norm": 0.09237037599086761, + "kl": 0.01568603515625, + "learning_rate": 1.8559999346116913e-06, + "loss": -0.0184, + "step": 4032 + }, + { + "clip_ratio": 0.004892462166026235, + "completion_length": 727.6607513427734, + "epoch": 0.15047805606081807, + "grad_norm": 0.08080349117517471, + "kl": 0.01531982421875, + "learning_rate": 1.854290432334318e-06, + "loss": 0.0068, + "num_tokens": 94424048.0, + "reward": 0.5293720029294491, + "reward_std": 0.28037151077296585, + "rewards/code_reward": 0.37937202263856307, + "rewards/format_reward": 1.5, + "step": 4033 + }, + { + "clip_ratio": 0.004446330829523504, + "epoch": 0.1505153677533697, + "grad_norm": 0.07808221131563187, + "kl": 0.015655517578125, + "learning_rate": 1.852582459728957e-06, + "loss": 0.0066, + "step": 4034 + }, + { + "clip_ratio": 0.004876351507846266, + "epoch": 0.15055267944592138, + "grad_norm": 0.07788339257240295, + "kl": 0.0154571533203125, + "learning_rate": 1.8508760175122435e-06, + "loss": 0.0065, + "step": 4035 + }, + { + "clip_ratio": 0.0028729282785207033, + "completion_length": 502.12500762939453, + "epoch": 0.15058999113847302, + "grad_norm": 0.08534721285104752, + "kl": 0.020355224609375, + "learning_rate": 1.8491711064001693e-06, + "loss": 0.0091, + "num_tokens": 94480167.0, + "reward": 0.8677983731031418, + "reward_std": 0.10167924594134092, + "rewards/code_reward": 0.7177983466535807, + "rewards/format_reward": 1.5, + "step": 4036 + }, + { + "clip_ratio": 0.0027327663265168667, + "epoch": 0.15062730283102468, + "grad_norm": 0.08188679814338684, + "kl": 0.01995849609375, + "learning_rate": 1.847467727108081e-06, + "loss": 0.009, + "step": 4037 + }, + { + "clip_ratio": 0.002875867357943207, + "epoch": 0.15066461452357632, + "grad_norm": 0.08522690087556839, + "kl": 0.020172119140625, + "learning_rate": 1.8457658803506884e-06, + "loss": 0.0092, + "step": 4038 + }, + { + "clip_ratio": 0.0036091088550165296, + "completion_length": 713.1786041259766, + "epoch": 0.150701926216128, + "grad_norm": 0.06990039348602295, + "kl": 0.0180816650390625, + "learning_rate": 1.844065566842051e-06, + "loss": 0.0104, + "num_tokens": 94548511.0, + "reward": 0.844200000166893, + "reward_std": 0.16020464524626732, + "rewards/code_reward": 0.6941999811679125, + "rewards/format_reward": 1.5, + "step": 4039 + }, + { + "clip_ratio": 0.003756970341783017, + "epoch": 0.15073923790867963, + "grad_norm": 0.07216975837945938, + "kl": 0.0181121826171875, + "learning_rate": 1.8423667872955915e-06, + "loss": 0.0105, + "step": 4040 + }, + { + "clip_ratio": 0.0032989188912324607, + "epoch": 0.1507765496012313, + "grad_norm": 0.11023581027984619, + "kl": 0.018524169921875, + "learning_rate": 1.8406695424240851e-06, + "loss": 0.0104, + "step": 4041 + }, + { + "clip_ratio": 0.0033316475455649197, + "completion_length": 750.0000305175781, + "epoch": 0.15081386129378294, + "grad_norm": 0.06278635561466217, + "kl": 0.013641357421875, + "learning_rate": 1.8389738329396639e-06, + "loss": -0.0067, + "num_tokens": 94622375.0, + "reward": 0.5352095440030098, + "reward_std": 0.3267304450273514, + "rewards/code_reward": 0.3852095417678356, + "rewards/format_reward": 1.5, + "step": 4042 + }, + { + "clip_ratio": 0.0032334823918063194, + "epoch": 0.1508511729863346, + "grad_norm": 0.06571146845817566, + "kl": 0.013824462890625, + "learning_rate": 1.8372796595538183e-06, + "loss": -0.0068, + "step": 4043 + }, + { + "clip_ratio": 0.0034581219078972936, + "epoch": 0.15088848467888624, + "grad_norm": 0.0769183561205864, + "kl": 0.013824462890625, + "learning_rate": 1.8355870229773898e-06, + "loss": -0.0066, + "step": 4044 + }, + { + "clip_ratio": 0.004806887358427048, + "completion_length": 702.1785888671875, + "epoch": 0.1509257963714379, + "grad_norm": 0.0650140717625618, + "kl": 0.01654052734375, + "learning_rate": 1.8338959239205773e-06, + "loss": -0.0027, + "num_tokens": 94696753.0, + "reward": 0.2549387291073799, + "reward_std": 0.02803594060242176, + "rewards/code_reward": 0.10493870871141553, + "rewards/format_reward": 1.5, + "step": 4045 + }, + { + "clip_ratio": 0.004786718636751175, + "epoch": 0.15096310806398955, + "grad_norm": 0.06168833747506142, + "kl": 0.016357421875, + "learning_rate": 1.832206363092936e-06, + "loss": -0.0027, + "step": 4046 + }, + { + "clip_ratio": 0.004521949915215373, + "epoch": 0.15100041975654122, + "grad_norm": 0.059622906148433685, + "kl": 0.016326904296875, + "learning_rate": 1.8305183412033728e-06, + "loss": -0.0027, + "step": 4047 + }, + { + "clip_ratio": 0.003494888893328607, + "completion_length": 736.8214569091797, + "epoch": 0.15103773144909285, + "grad_norm": 0.10277701169252396, + "kl": 0.0471649169921875, + "learning_rate": 1.8288318589601505e-06, + "loss": 0.0078, + "num_tokens": 94769627.0, + "reward": 0.660748977214098, + "reward_std": 0.23823311924934387, + "rewards/code_reward": 0.5107489638030529, + "rewards/format_reward": 1.5, + "step": 4048 + }, + { + "clip_ratio": 0.003225523163564503, + "epoch": 0.15107504314164452, + "grad_norm": 0.06439051777124405, + "kl": 0.0400848388671875, + "learning_rate": 1.8271469170708864e-06, + "loss": 0.0077, + "step": 4049 + }, + { + "clip_ratio": 0.0032581795239821076, + "epoch": 0.15111235483419616, + "grad_norm": 0.08737075328826904, + "kl": 0.040069580078125, + "learning_rate": 1.8254635162425506e-06, + "loss": 0.0075, + "step": 4050 + }, + { + "clip_ratio": 0.0023838410852476954, + "completion_length": 793.9643402099609, + "epoch": 0.15114966652674783, + "grad_norm": 0.06947948783636093, + "kl": 0.0156402587890625, + "learning_rate": 1.8237816571814678e-06, + "loss": -0.005, + "num_tokens": 94849229.0, + "reward": 0.8597211912274361, + "reward_std": 0.24203574657440186, + "rewards/code_reward": 0.7097212001681328, + "rewards/format_reward": 1.5, + "step": 4051 + }, + { + "clip_ratio": 0.002153387787984684, + "epoch": 0.15118697821929947, + "grad_norm": 0.07031185179948807, + "kl": 0.015838623046875, + "learning_rate": 1.8221013405933124e-06, + "loss": -0.005, + "step": 4052 + }, + { + "clip_ratio": 0.002667787251994014, + "epoch": 0.15122428991185113, + "grad_norm": 0.06524248421192169, + "kl": 0.01580810546875, + "learning_rate": 1.820422567183116e-06, + "loss": -0.0052, + "step": 4053 + }, + { + "clip_ratio": 0.0038723602774553, + "completion_length": 555.5357437133789, + "epoch": 0.15126160160440277, + "grad_norm": 0.08465789258480072, + "kl": 0.0228118896484375, + "learning_rate": 1.8187453376552605e-06, + "loss": -0.0095, + "num_tokens": 94919091.0, + "reward": 1.0366344153881073, + "reward_std": 0.18389954790472984, + "rewards/code_reward": 0.886634349822998, + "rewards/format_reward": 1.5, + "step": 4054 + }, + { + "clip_ratio": 0.00393825676292181, + "epoch": 0.15129891329695444, + "grad_norm": 0.08804000914096832, + "kl": 0.023040771484375, + "learning_rate": 1.8170696527134793e-06, + "loss": -0.0093, + "step": 4055 + }, + { + "clip_ratio": 0.0031426813220605254, + "epoch": 0.15133622498950608, + "grad_norm": 0.08633329719305038, + "kl": 0.0228118896484375, + "learning_rate": 1.8153955130608603e-06, + "loss": -0.0098, + "step": 4056 + }, + { + "clip_ratio": 0.0039490575436502695, + "completion_length": 575.357177734375, + "epoch": 0.15137353668205775, + "grad_norm": 0.06835346668958664, + "kl": 0.026153564453125, + "learning_rate": 1.8137229193998386e-06, + "loss": 0.0226, + "num_tokens": 94980641.0, + "reward": 0.6586734652519226, + "reward_std": 0.02903678361326456, + "rewards/code_reward": 0.5086734807118773, + "rewards/format_reward": 1.5, + "step": 4057 + }, + { + "clip_ratio": 0.003909139370080084, + "epoch": 0.15141084837460939, + "grad_norm": 0.06440743058919907, + "kl": 0.02496337890625, + "learning_rate": 1.812051872432206e-06, + "loss": 0.0226, + "step": 4058 + }, + { + "clip_ratio": 0.0035249319043941796, + "epoch": 0.15144816006716105, + "grad_norm": 0.061298660933971405, + "kl": 0.02490234375, + "learning_rate": 1.8103823728591015e-06, + "loss": 0.0223, + "step": 4059 + }, + { + "clip_ratio": 0.0018684524111449718, + "completion_length": 564.3571548461914, + "epoch": 0.1514854717597127, + "grad_norm": 0.06377794593572617, + "kl": 0.0199737548828125, + "learning_rate": 1.8087144213810169e-06, + "loss": 0.0056, + "num_tokens": 95040959.0, + "reward": 0.8195854499936104, + "reward_std": 0.16811832040548325, + "rewards/code_reward": 0.6695854589343071, + "rewards/format_reward": 1.5, + "step": 4060 + }, + { + "clip_ratio": 0.0019142708042636514, + "epoch": 0.15152278345226436, + "grad_norm": 0.06274618208408356, + "kl": 0.01861572265625, + "learning_rate": 1.8070480186977944e-06, + "loss": 0.0057, + "step": 4061 + }, + { + "clip_ratio": 0.0019543685484677553, + "epoch": 0.151560095144816, + "grad_norm": 0.05996251106262207, + "kl": 0.0194091796875, + "learning_rate": 1.8053831655086235e-06, + "loss": 0.0056, + "step": 4062 + }, + { + "clip_ratio": 0.0019463779754005373, + "completion_length": 457.73216247558594, + "epoch": 0.15159740683736767, + "grad_norm": 0.04849989339709282, + "kl": 0.02276611328125, + "learning_rate": 1.803719862512046e-06, + "loss": 0.0067, + "num_tokens": 95088712.0, + "reward": 0.8752747289836407, + "reward_std": 0.06285055726766586, + "rewards/code_reward": 0.7252747267484665, + "rewards/format_reward": 1.5, + "step": 4063 + }, + { + "clip_ratio": 0.002428429783321917, + "epoch": 0.1516347185299193, + "grad_norm": 0.05019763857126236, + "kl": 0.021484375, + "learning_rate": 1.8020581104059555e-06, + "loss": 0.0068, + "step": 4064 + }, + { + "clip_ratio": 0.0022159615182317793, + "epoch": 0.15167203022247097, + "grad_norm": 0.048938605934381485, + "kl": 0.02081298828125, + "learning_rate": 1.8003979098875896e-06, + "loss": 0.0068, + "step": 4065 + }, + { + "clip_ratio": 0.003235133015550673, + "completion_length": 680.6428985595703, + "epoch": 0.1517093419150226, + "grad_norm": 0.07376420497894287, + "kl": 0.01690673828125, + "learning_rate": 1.7987392616535376e-06, + "loss": 0.0096, + "num_tokens": 95158636.0, + "reward": 0.9395878314971924, + "reward_std": 0.2005494385957718, + "rewards/code_reward": 0.7895878106355667, + "rewards/format_reward": 1.5, + "step": 4066 + }, + { + "clip_ratio": 0.0028534168959595263, + "epoch": 0.15174665360757428, + "grad_norm": 0.08352267742156982, + "kl": 0.016693115234375, + "learning_rate": 1.797082166399739e-06, + "loss": 0.0093, + "step": 4067 + }, + { + "clip_ratio": 0.0027701081708073616, + "epoch": 0.15178396530012592, + "grad_norm": 0.06594755500555038, + "kl": 0.0168304443359375, + "learning_rate": 1.795426624821479e-06, + "loss": 0.0095, + "step": 4068 + }, + { + "clip_ratio": 0.004281431203708053, + "completion_length": 677.9464721679688, + "epoch": 0.15182127699267758, + "grad_norm": 0.05627885088324547, + "kl": 0.0168914794921875, + "learning_rate": 1.793772637613394e-06, + "loss": 0.0007, + "num_tokens": 95229893.0, + "reward": 0.45178648084402084, + "reward_std": 0.030716873472556472, + "rewards/code_reward": 0.301786453317618, + "rewards/format_reward": 1.5, + "step": 4069 + }, + { + "clip_ratio": 0.0038971573812887073, + "epoch": 0.15185858868522922, + "grad_norm": 0.05433022603392601, + "kl": 0.0168304443359375, + "learning_rate": 1.7921202054694643e-06, + "loss": 0.0005, + "step": 4070 + }, + { + "clip_ratio": 0.0036369271692819893, + "epoch": 0.1518959003777809, + "grad_norm": 0.05790727958083153, + "kl": 0.016876220703125, + "learning_rate": 1.7904693290830216e-06, + "loss": 0.0004, + "step": 4071 + }, + { + "clip_ratio": 0.003154009871650487, + "completion_length": 789.6607666015625, + "epoch": 0.15193321207033253, + "grad_norm": 0.06214753910899162, + "kl": 0.0175933837890625, + "learning_rate": 1.78882000914674e-06, + "loss": 0.0056, + "num_tokens": 95310500.0, + "reward": 0.6702255085110664, + "reward_std": 0.2564242444932461, + "rewards/code_reward": 0.5229040794074535, + "rewards/format_reward": 1.4732142984867096, + "step": 4072 + }, + { + "clip_ratio": 0.003088046971242875, + "epoch": 0.1519705237628842, + "grad_norm": 0.0640522912144661, + "kl": 0.01812744140625, + "learning_rate": 1.7871722463526454e-06, + "loss": 0.0056, + "step": 4073 + }, + { + "clip_ratio": 0.003189256298355758, + "epoch": 0.15200783545543584, + "grad_norm": 0.06230806186795235, + "kl": 0.0177154541015625, + "learning_rate": 1.785526041392109e-06, + "loss": 0.0057, + "step": 4074 + }, + { + "clip_ratio": 0.003227527136914432, + "completion_length": 768.9821701049805, + "epoch": 0.1520451471479875, + "grad_norm": 0.04199837148189545, + "kl": 0.018402099609375, + "learning_rate": 1.7838813949558464e-06, + "loss": 0.0142, + "num_tokens": 95388227.0, + "reward": 0.4195908457040787, + "reward_std": 0.06922912085428834, + "rewards/code_reward": 0.26959084696136415, + "rewards/format_reward": 1.5, + "step": 4075 + }, + { + "clip_ratio": 0.003328690363559872, + "epoch": 0.15208245884053914, + "grad_norm": 0.04176390916109085, + "kl": 0.018310546875, + "learning_rate": 1.7822383077339217e-06, + "loss": 0.0143, + "step": 4076 + }, + { + "clip_ratio": 0.0034970208653248847, + "epoch": 0.1521197705330908, + "grad_norm": 0.04016958922147751, + "kl": 0.0182647705078125, + "learning_rate": 1.7805967804157436e-06, + "loss": 0.0142, + "step": 4077 + }, + { + "clip_ratio": 0.0016549015417695045, + "completion_length": 671.6785888671875, + "epoch": 0.15215708222564245, + "grad_norm": 0.04430824890732765, + "kl": 0.013519287109375, + "learning_rate": 1.7789568136900659e-06, + "loss": 0.0055, + "num_tokens": 95457775.0, + "reward": 0.8285714201629162, + "reward_std": 0.11720181256532669, + "rewards/code_reward": 0.6785714328289032, + "rewards/format_reward": 1.5, + "step": 4078 + }, + { + "clip_ratio": 0.001918272813782096, + "epoch": 0.15219439391819412, + "grad_norm": 0.0456777885556221, + "kl": 0.01311492919921875, + "learning_rate": 1.7773184082449899e-06, + "loss": 0.0057, + "step": 4079 + }, + { + "clip_ratio": 0.001538525684736669, + "epoch": 0.15223170561074575, + "grad_norm": 0.0453985370695591, + "kl": 0.0136260986328125, + "learning_rate": 1.7756815647679576e-06, + "loss": 0.0056, + "step": 4080 + }, + { + "clip_ratio": 0.004293997713830322, + "completion_length": 655.607177734375, + "epoch": 0.15226901730329742, + "grad_norm": 0.08328623324632645, + "kl": 0.0178070068359375, + "learning_rate": 1.77404628394576e-06, + "loss": -0.0294, + "num_tokens": 95524589.0, + "reward": 0.6390882134437561, + "reward_std": 0.2222528774291277, + "rewards/code_reward": 0.48908818513154984, + "rewards/format_reward": 1.5, + "step": 4081 + }, + { + "clip_ratio": 0.00418506097048521, + "epoch": 0.15230632899584906, + "grad_norm": 0.09323514997959137, + "kl": 0.0179290771484375, + "learning_rate": 1.7724125664645303e-06, + "loss": -0.0293, + "step": 4082 + }, + { + "clip_ratio": 0.004495500528719276, + "epoch": 0.15234364068840073, + "grad_norm": 0.08334168791770935, + "kl": 0.017822265625, + "learning_rate": 1.7707804130097447e-06, + "loss": -0.0293, + "step": 4083 + }, + { + "clip_ratio": 0.005133077851496637, + "completion_length": 821.3750457763672, + "epoch": 0.1523809523809524, + "grad_norm": 0.087307870388031, + "kl": 0.023651123046875, + "learning_rate": 1.7691498242662255e-06, + "loss": 0.0213, + "num_tokens": 95612768.0, + "reward": 0.7161701992154121, + "reward_std": 0.24555892124772072, + "rewards/code_reward": 0.5688487514853477, + "rewards/format_reward": 1.4732142984867096, + "step": 4084 + }, + { + "clip_ratio": 0.005128298536874354, + "epoch": 0.15241826407350403, + "grad_norm": 0.08461406826972961, + "kl": 0.024658203125, + "learning_rate": 1.7675208009181372e-06, + "loss": 0.0211, + "step": 4085 + }, + { + "clip_ratio": 0.0051128523773513734, + "epoch": 0.1524555757660557, + "grad_norm": 0.08506777882575989, + "kl": 0.02398681640625, + "learning_rate": 1.7658933436489877e-06, + "loss": 0.0212, + "step": 4086 + }, + { + "clip_ratio": 0.004497839428950101, + "completion_length": 655.4286041259766, + "epoch": 0.15249288745860734, + "grad_norm": 0.0940258651971817, + "kl": 0.015899658203125, + "learning_rate": 1.7642674531416298e-06, + "loss": -0.0131, + "num_tokens": 95677962.0, + "reward": 0.9172366410493851, + "reward_std": 0.31046992633491755, + "rewards/code_reward": 0.767236664891243, + "rewards/format_reward": 1.5, + "step": 4087 + }, + { + "clip_ratio": 0.004572460951749235, + "epoch": 0.152530199151159, + "grad_norm": 0.09344897419214249, + "kl": 0.0157623291015625, + "learning_rate": 1.7626431300782538e-06, + "loss": -0.0131, + "step": 4088 + }, + { + "clip_ratio": 0.004626894951798022, + "epoch": 0.15256751084371065, + "grad_norm": 0.09505636990070343, + "kl": 0.0161895751953125, + "learning_rate": 1.7610203751403977e-06, + "loss": -0.0132, + "step": 4089 + }, + { + "clip_ratio": 0.003685915027745068, + "completion_length": 534.0178871154785, + "epoch": 0.1526048225362623, + "grad_norm": 0.06029903516173363, + "kl": 0.0178070068359375, + "learning_rate": 1.7593991890089379e-06, + "loss": -0.0033, + "num_tokens": 95742741.0, + "reward": 0.5387116931378841, + "reward_std": 0.054518332704901695, + "rewards/code_reward": 0.3887116778641939, + "rewards/format_reward": 1.5, + "step": 4090 + }, + { + "clip_ratio": 0.003472817945294082, + "epoch": 0.15264213422881395, + "grad_norm": 0.061074867844581604, + "kl": 0.017547607421875, + "learning_rate": 1.7577795723640939e-06, + "loss": -0.0033, + "step": 4091 + }, + { + "clip_ratio": 0.003447483526542783, + "epoch": 0.15267944592136562, + "grad_norm": 0.057306788861751556, + "kl": 0.0176239013671875, + "learning_rate": 1.7561615258854288e-06, + "loss": -0.0033, + "step": 4092 + }, + { + "clip_ratio": 0.002230495447292924, + "completion_length": 722.3750305175781, + "epoch": 0.15271675761391726, + "grad_norm": 0.038260605186223984, + "kl": 0.014251708984375, + "learning_rate": 1.7545450502518424e-06, + "loss": 0.0133, + "num_tokens": 95813280.0, + "reward": 0.7991596721112728, + "reward_std": 0.10311993956565857, + "rewards/code_reward": 0.6491596698760986, + "rewards/format_reward": 1.5, + "step": 4093 + }, + { + "clip_ratio": 0.0022826282074674964, + "epoch": 0.15275406930646893, + "grad_norm": 0.03685878962278366, + "kl": 0.01422882080078125, + "learning_rate": 1.7529301461415793e-06, + "loss": 0.0134, + "step": 4094 + }, + { + "clip_ratio": 0.002026498899795115, + "epoch": 0.15279138099902057, + "grad_norm": 0.03690643981099129, + "kl": 0.0143585205078125, + "learning_rate": 1.7513168142322224e-06, + "loss": 0.0133, + "step": 4095 + }, + { + "clip_ratio": 0.0031715218210592866, + "completion_length": 577.5714492797852, + "epoch": 0.15282869269157223, + "grad_norm": 0.07653654366731644, + "kl": 0.0204925537109375, + "learning_rate": 1.7497050552006966e-06, + "loss": 0.0028, + "num_tokens": 95867730.0, + "reward": 1.0007519125938416, + "reward_std": 0.24641069769859314, + "rewards/code_reward": 0.8507518768310547, + "rewards/format_reward": 1.5, + "step": 4096 + }, + { + "clip_ratio": 0.0028393184184096754, + "epoch": 0.15286600438412387, + "grad_norm": 0.07509982585906982, + "kl": 0.019805908203125, + "learning_rate": 1.7480948697232666e-06, + "loss": 0.0026, + "step": 4097 + }, + { + "clip_ratio": 0.002450478496029973, + "epoch": 0.15290331607667554, + "grad_norm": 0.07677613943815231, + "kl": 0.0203857421875, + "learning_rate": 1.7464862584755343e-06, + "loss": 0.0025, + "step": 4098 + }, + { + "clip_ratio": 0.002725782571360469, + "completion_length": 658.232177734375, + "epoch": 0.15294062776922718, + "grad_norm": 0.06660380214452744, + "kl": 0.016082763671875, + "learning_rate": 1.7448792221324457e-06, + "loss": -0.0157, + "num_tokens": 95933711.0, + "reward": 0.6226329877972603, + "reward_std": 0.16153405793011189, + "rewards/code_reward": 0.4726329818367958, + "rewards/format_reward": 1.5, + "step": 4099 + }, + { + "clip_ratio": 0.0023766043595969677, + "epoch": 0.15297793946177884, + "grad_norm": 0.07160631567239761, + "kl": 0.016143798828125, + "learning_rate": 1.743273761368281e-06, + "loss": -0.0157, + "step": 4100 + }, + { + "clip_ratio": 0.0026091230974998325, + "epoch": 0.15301525115433048, + "grad_norm": 0.06607470661401749, + "kl": 0.0160369873046875, + "learning_rate": 1.741669876856662e-06, + "loss": -0.0159, + "step": 4101 + }, + { + "clip_ratio": 0.001996713050175458, + "completion_length": 522.2678833007812, + "epoch": 0.15305256284688215, + "grad_norm": 0.10288728773593903, + "kl": 0.0223846435546875, + "learning_rate": 1.7400675692705498e-06, + "loss": 0.0071, + "num_tokens": 95986408.0, + "reward": 0.8037414997816086, + "reward_std": 0.0801687017083168, + "rewards/code_reward": 0.6537414975464344, + "rewards/format_reward": 1.5, + "step": 4102 + }, + { + "clip_ratio": 0.002317958860658109, + "epoch": 0.1530898745394338, + "grad_norm": 0.06857869029045105, + "kl": 0.0213775634765625, + "learning_rate": 1.7384668392822429e-06, + "loss": 0.0072, + "step": 4103 + }, + { + "clip_ratio": 0.001804991450626403, + "epoch": 0.15312718623198546, + "grad_norm": 0.07638493925333023, + "kl": 0.0216827392578125, + "learning_rate": 1.7368676875633784e-06, + "loss": 0.0068, + "step": 4104 + }, + { + "clip_ratio": 0.0036732094013132155, + "completion_length": 758.9286193847656, + "epoch": 0.1531644979245371, + "grad_norm": 0.07448741793632507, + "kl": 0.017486572265625, + "learning_rate": 1.7352701147849284e-06, + "loss": -0.0074, + "num_tokens": 96063754.0, + "reward": 0.5826298706233501, + "reward_std": 0.25912606716156006, + "rewards/code_reward": 0.43262986093759537, + "rewards/format_reward": 1.5, + "step": 4105 + }, + { + "clip_ratio": 0.003734889905899763, + "epoch": 0.15320180961708876, + "grad_norm": 0.0723259374499321, + "kl": 0.016815185546875, + "learning_rate": 1.7336741216172071e-06, + "loss": -0.0076, + "step": 4106 + }, + { + "clip_ratio": 0.003642427211161703, + "epoch": 0.1532391213096404, + "grad_norm": 0.07068640738725662, + "kl": 0.0170745849609375, + "learning_rate": 1.732079708729863e-06, + "loss": -0.0076, + "step": 4107 + }, + { + "clip_ratio": 0.002498058951459825, + "completion_length": 523.3928985595703, + "epoch": 0.15327643300219207, + "grad_norm": 0.16857337951660156, + "kl": 0.015655517578125, + "learning_rate": 1.7304868767918814e-06, + "loss": 0.0136, + "num_tokens": 96121036.0, + "reward": 1.0793841779232025, + "reward_std": 0.16781965643167496, + "rewards/code_reward": 0.9293841272592545, + "rewards/format_reward": 1.5, + "step": 4108 + }, + { + "clip_ratio": 0.0029294524574652314, + "epoch": 0.1533137446947437, + "grad_norm": 0.08913716673851013, + "kl": 0.0157318115234375, + "learning_rate": 1.7288956264715848e-06, + "loss": 0.0137, + "step": 4109 + }, + { + "clip_ratio": 0.0029189212364144623, + "epoch": 0.15335105638729538, + "grad_norm": 0.08549756556749344, + "kl": 0.0160369873046875, + "learning_rate": 1.7273059584366342e-06, + "loss": 0.0138, + "step": 4110 + }, + { + "clip_ratio": 0.0030570728704333305, + "completion_length": 626.1786041259766, + "epoch": 0.15338836807984702, + "grad_norm": 0.0683617815375328, + "kl": 0.0188751220703125, + "learning_rate": 1.7257178733540225e-06, + "loss": 0.0013, + "num_tokens": 96190234.0, + "reward": 0.5910578332841396, + "reward_std": 0.07742626219987869, + "rewards/code_reward": 0.44105782359838486, + "rewards/format_reward": 1.5, + "step": 4111 + }, + { + "clip_ratio": 0.00320763629861176, + "epoch": 0.15342567977239868, + "grad_norm": 0.0670735090970993, + "kl": 0.01885986328125, + "learning_rate": 1.724131371890082e-06, + "loss": 0.0012, + "step": 4112 + }, + { + "clip_ratio": 0.0032353433198295534, + "epoch": 0.15346299146495032, + "grad_norm": 0.06900418549776077, + "kl": 0.0188446044921875, + "learning_rate": 1.7225464547104787e-06, + "loss": 0.0012, + "step": 4113 + }, + { + "clip_ratio": 0.0018082296010106802, + "completion_length": 501.5893020629883, + "epoch": 0.153500303157502, + "grad_norm": 0.06903253495693207, + "kl": 0.01727294921875, + "learning_rate": 1.7209631224802143e-06, + "loss": 0.0009, + "num_tokens": 96246323.0, + "reward": 0.9186157733201981, + "reward_std": 0.041122482158243656, + "rewards/code_reward": 0.7686157915741205, + "rewards/format_reward": 1.5, + "step": 4114 + }, + { + "clip_ratio": 0.0016617428627796471, + "epoch": 0.15353761485005363, + "grad_norm": 0.0710916817188263, + "kl": 0.0173492431640625, + "learning_rate": 1.7193813758636268e-06, + "loss": 0.001, + "step": 4115 + }, + { + "clip_ratio": 0.001767114968970418, + "epoch": 0.1535749265426053, + "grad_norm": 0.06927190721035004, + "kl": 0.017333984375, + "learning_rate": 1.7178012155243862e-06, + "loss": 0.0008, + "step": 4116 + }, + { + "clip_ratio": 0.0010944762034341693, + "completion_length": 540.8571548461914, + "epoch": 0.15361223823515693, + "grad_norm": 0.039712224155664444, + "kl": 0.0183258056640625, + "learning_rate": 1.7162226421255002e-06, + "loss": 0.0051, + "num_tokens": 96302399.0, + "reward": 0.9912698268890381, + "reward_std": 0.06732101738452911, + "rewards/code_reward": 0.8412698358297348, + "rewards/format_reward": 1.5, + "step": 4117 + }, + { + "clip_ratio": 0.0009745044517330825, + "epoch": 0.1536495499277086, + "grad_norm": 0.039926379919052124, + "kl": 0.0183868408203125, + "learning_rate": 1.7146456563293068e-06, + "loss": 0.005, + "step": 4118 + }, + { + "clip_ratio": 0.0008251804392784834, + "epoch": 0.15368686162026024, + "grad_norm": 0.039585672318935394, + "kl": 0.0183563232421875, + "learning_rate": 1.7130702587974803e-06, + "loss": 0.0049, + "step": 4119 + }, + { + "clip_ratio": 0.0029584780568256974, + "completion_length": 660.1071701049805, + "epoch": 0.1537241733128119, + "grad_norm": 0.07187850773334503, + "kl": 0.0208282470703125, + "learning_rate": 1.7114964501910289e-06, + "loss": -0.0214, + "num_tokens": 96372601.0, + "reward": 0.7440168112516403, + "reward_std": 0.17437602020800114, + "rewards/code_reward": 0.5940168090164661, + "rewards/format_reward": 1.5, + "step": 4120 + }, + { + "clip_ratio": 0.0029542535194195807, + "epoch": 0.15376148500536355, + "grad_norm": 0.06706704199314117, + "kl": 0.0206451416015625, + "learning_rate": 1.7099242311702935e-06, + "loss": -0.0212, + "step": 4121 + }, + { + "clip_ratio": 0.0031306484597735107, + "epoch": 0.1537987966979152, + "grad_norm": 0.06649968028068542, + "kl": 0.0205841064453125, + "learning_rate": 1.7083536023949487e-06, + "loss": -0.0213, + "step": 4122 + }, + { + "clip_ratio": 0.004092606483027339, + "completion_length": 654.3036041259766, + "epoch": 0.15383610839046685, + "grad_norm": 0.08302795141935349, + "kl": 0.0184173583984375, + "learning_rate": 1.7067845645239994e-06, + "loss": -0.0023, + "num_tokens": 96442926.0, + "reward": 0.7599595636129379, + "reward_std": 0.2927747033536434, + "rewards/code_reward": 0.6099595353007317, + "rewards/format_reward": 1.5, + "step": 4123 + }, + { + "clip_ratio": 0.0036259391345083714, + "epoch": 0.15387342008301852, + "grad_norm": 0.07770251482725143, + "kl": 0.0188751220703125, + "learning_rate": 1.7052171182157861e-06, + "loss": -0.0024, + "step": 4124 + }, + { + "clip_ratio": 0.004071315110195428, + "epoch": 0.15391073177557016, + "grad_norm": 0.08054489642381668, + "kl": 0.0181427001953125, + "learning_rate": 1.7036512641279807e-06, + "loss": -0.0023, + "step": 4125 + }, + { + "clip_ratio": 0.0031019056914374232, + "completion_length": 586.8571701049805, + "epoch": 0.15394804346812183, + "grad_norm": 0.0585300587117672, + "kl": 0.0173797607421875, + "learning_rate": 1.7020870029175836e-06, + "loss": 0.0056, + "num_tokens": 96512610.0, + "reward": 0.7147321365773678, + "reward_std": 0.13680752366781235, + "rewards/code_reward": 0.5647321343421936, + "rewards/format_reward": 1.5, + "step": 4126 + }, + { + "clip_ratio": 0.0026417429908178747, + "epoch": 0.15398535516067346, + "grad_norm": 0.05739627406001091, + "kl": 0.0168609619140625, + "learning_rate": 1.7005243352409334e-06, + "loss": 0.0054, + "step": 4127 + }, + { + "clip_ratio": 0.0028120552888140082, + "epoch": 0.15402266685322513, + "grad_norm": 0.07685583084821701, + "kl": 0.0170745849609375, + "learning_rate": 1.6989632617536935e-06, + "loss": 0.0055, + "step": 4128 + }, + { + "clip_ratio": 0.0035222192527726293, + "completion_length": 704.482177734375, + "epoch": 0.15405997854577677, + "grad_norm": 0.06477408111095428, + "kl": 0.0184783935546875, + "learning_rate": 1.6974037831108615e-06, + "loss": 0.0041, + "num_tokens": 96586529.0, + "reward": 0.48167721182107925, + "reward_std": 0.09615428280085325, + "rewards/code_reward": 0.3316772016696632, + "rewards/format_reward": 1.5, + "step": 4129 + }, + { + "clip_ratio": 0.003935446497052908, + "epoch": 0.15409729023832844, + "grad_norm": 0.06336659938097, + "kl": 0.0187835693359375, + "learning_rate": 1.6958458999667688e-06, + "loss": 0.004, + "step": 4130 + }, + { + "clip_ratio": 0.003410621080547571, + "epoch": 0.15413460193088008, + "grad_norm": 0.06138784438371658, + "kl": 0.0186309814453125, + "learning_rate": 1.6942896129750713e-06, + "loss": 0.0039, + "step": 4131 + }, + { + "clip_ratio": 0.0032161845010705292, + "completion_length": 620.7500457763672, + "epoch": 0.15417191362343174, + "grad_norm": 0.06455332785844803, + "kl": 0.0137481689453125, + "learning_rate": 1.6927349227887592e-06, + "loss": -0.0004, + "num_tokens": 96646281.0, + "reward": 0.6446255296468735, + "reward_std": 0.10310285538434982, + "rewards/code_reward": 0.4946255153045058, + "rewards/format_reward": 1.5, + "step": 4132 + }, + { + "clip_ratio": 0.0029186902102082968, + "epoch": 0.15420922531598338, + "grad_norm": 0.06059269607067108, + "kl": 0.013763427734375, + "learning_rate": 1.6911818300601507e-06, + "loss": -0.0005, + "step": 4133 + }, + { + "clip_ratio": 0.0031489981338381767, + "epoch": 0.15424653700853505, + "grad_norm": 0.060412630438804626, + "kl": 0.0134735107421875, + "learning_rate": 1.6896303354408958e-06, + "loss": -0.0005, + "step": 4134 + }, + { + "clip_ratio": 0.0031302605057135224, + "completion_length": 554.7857360839844, + "epoch": 0.1542838487010867, + "grad_norm": 0.06315844506025314, + "kl": 0.030120849609375, + "learning_rate": 1.6880804395819727e-06, + "loss": -0.0018, + "num_tokens": 96699707.0, + "reward": 0.747222226113081, + "reward_std": 0.10258935391902924, + "rewards/code_reward": 0.5972222238779068, + "rewards/format_reward": 1.5, + "step": 4135 + }, + { + "clip_ratio": 0.0030240810592658818, + "epoch": 0.15432116039363836, + "grad_norm": 0.060925327241420746, + "kl": 0.027984619140625, + "learning_rate": 1.686532143133688e-06, + "loss": -0.0018, + "step": 4136 + }, + { + "clip_ratio": 0.0028738741821143776, + "epoch": 0.15435847208619, + "grad_norm": 0.058947935700416565, + "kl": 0.029388427734375, + "learning_rate": 1.6849854467456784e-06, + "loss": -0.0019, + "step": 4137 + }, + { + "clip_ratio": 0.0025034597201738507, + "completion_length": 653.3928833007812, + "epoch": 0.15439578377874166, + "grad_norm": 0.07855196297168732, + "kl": 0.01611328125, + "learning_rate": 1.6834403510669092e-06, + "loss": 0.0327, + "num_tokens": 96758291.0, + "reward": 0.8242512159049511, + "reward_std": 0.13306625303812325, + "rewards/code_reward": 0.6742512171622366, + "rewards/format_reward": 1.5, + "step": 4138 + }, + { + "clip_ratio": 0.0023201713629532605, + "epoch": 0.15443309547129333, + "grad_norm": 0.06327469646930695, + "kl": 0.015716552734375, + "learning_rate": 1.6818968567456742e-06, + "loss": 0.0326, + "step": 4139 + }, + { + "clip_ratio": 0.0025812929961830378, + "epoch": 0.15447040716384497, + "grad_norm": 0.06476325541734695, + "kl": 0.01629638671875, + "learning_rate": 1.6803549644295951e-06, + "loss": 0.0328, + "step": 4140 + }, + { + "clip_ratio": 0.00276425841730088, + "completion_length": 582.0536041259766, + "epoch": 0.15450771885639664, + "grad_norm": 0.07477007061243057, + "kl": 0.0166778564453125, + "learning_rate": 1.6788146747656204e-06, + "loss": -0.01, + "num_tokens": 96810864.0, + "reward": 0.6798045709729195, + "reward_std": 0.2046244628727436, + "rewards/code_reward": 0.5298045761883259, + "rewards/format_reward": 1.5, + "step": 4141 + }, + { + "clip_ratio": 0.002712163550313562, + "epoch": 0.15454503054894828, + "grad_norm": 0.079165980219841, + "kl": 0.0165863037109375, + "learning_rate": 1.6772759884000276e-06, + "loss": -0.01, + "step": 4142 + }, + { + "clip_ratio": 0.0030617309384979308, + "epoch": 0.15458234224149994, + "grad_norm": 0.07458940893411636, + "kl": 0.0164947509765625, + "learning_rate": 1.6757389059784212e-06, + "loss": -0.0099, + "step": 4143 + }, + { + "clip_ratio": 0.0037455311976373196, + "completion_length": 688.0000228881836, + "epoch": 0.15461965393405158, + "grad_norm": 0.07972555607557297, + "kl": 0.021209716796875, + "learning_rate": 1.6742034281457315e-06, + "loss": -0.0022, + "num_tokens": 96881340.0, + "reward": 0.6956701762974262, + "reward_std": 0.37464335933327675, + "rewards/code_reward": 0.5456701461225748, + "rewards/format_reward": 1.5, + "step": 4144 + }, + { + "clip_ratio": 0.003918973729014397, + "epoch": 0.15465696562660325, + "grad_norm": 0.07949138432741165, + "kl": 0.02105712890625, + "learning_rate": 1.6726695555462175e-06, + "loss": -0.002, + "step": 4145 + }, + { + "clip_ratio": 0.0035815940937027335, + "epoch": 0.1546942773191549, + "grad_norm": 0.08234457671642303, + "kl": 0.021453857421875, + "learning_rate": 1.6711372888234623e-06, + "loss": -0.002, + "step": 4146 + }, + { + "clip_ratio": 0.0016347038908861578, + "completion_length": 743.7143173217773, + "epoch": 0.15473158901170656, + "grad_norm": 0.045884907245635986, + "kl": 0.0130767822265625, + "learning_rate": 1.669606628620376e-06, + "loss": 0.0021, + "num_tokens": 96956602.0, + "reward": 0.760028850287199, + "reward_std": 0.0040494175627827644, + "rewards/code_reward": 0.610028862953186, + "rewards/format_reward": 1.5, + "step": 4147 + }, + { + "clip_ratio": 0.0018802823033183813, + "epoch": 0.1547689007042582, + "grad_norm": 0.04645939916372299, + "kl": 0.0134735107421875, + "learning_rate": 1.6680775755791995e-06, + "loss": 0.0021, + "step": 4148 + }, + { + "clip_ratio": 0.0016840521129779518, + "epoch": 0.15480621239680986, + "grad_norm": 0.046179816126823425, + "kl": 0.0132293701171875, + "learning_rate": 1.66655013034149e-06, + "loss": 0.002, + "step": 4149 + }, + { + "clip_ratio": 0.002847007126547396, + "completion_length": 783.0000228881836, + "epoch": 0.1548435240893615, + "grad_norm": 0.05272578075528145, + "kl": 0.0124359130859375, + "learning_rate": 1.665024293548139e-06, + "loss": 0.0125, + "num_tokens": 97040000.0, + "reward": 0.6037273444235325, + "reward_std": 0.14422350749373436, + "rewards/code_reward": 0.45372732542455196, + "rewards/format_reward": 1.5, + "step": 4150 + }, + { + "clip_ratio": 0.0032582435524091125, + "epoch": 0.15488083578191317, + "grad_norm": 0.05092893913388252, + "kl": 0.0123138427734375, + "learning_rate": 1.6635000658393562e-06, + "loss": 0.0126, + "step": 4151 + }, + { + "clip_ratio": 0.0025832659448496997, + "epoch": 0.1549181474744648, + "grad_norm": 0.04788275435566902, + "kl": 0.012725830078125, + "learning_rate": 1.6619774478546808e-06, + "loss": 0.0125, + "step": 4152 + }, + { + "clip_ratio": 0.003714182646945119, + "completion_length": 671.4464569091797, + "epoch": 0.15495545916701647, + "grad_norm": 0.061865225434303284, + "kl": 0.015777587890625, + "learning_rate": 1.660456440232976e-06, + "loss": -0.0093, + "num_tokens": 97112111.0, + "reward": 0.584710668772459, + "reward_std": 0.16987569816410542, + "rewards/code_reward": 0.43471065536141396, + "rewards/format_reward": 1.5, + "step": 4153 + }, + { + "clip_ratio": 0.004327302856836468, + "epoch": 0.1549927708595681, + "grad_norm": 0.06414765119552612, + "kl": 0.01581573486328125, + "learning_rate": 1.6589370436124257e-06, + "loss": -0.0092, + "step": 4154 + }, + { + "clip_ratio": 0.0039452966884709895, + "epoch": 0.15503008255211978, + "grad_norm": 0.06327521055936813, + "kl": 0.0158843994140625, + "learning_rate": 1.6574192586305421e-06, + "loss": -0.0094, + "step": 4155 + }, + { + "clip_ratio": 0.003736830723937601, + "completion_length": 767.857177734375, + "epoch": 0.15506739424467142, + "grad_norm": 0.09584072977304459, + "kl": 0.0155029296875, + "learning_rate": 1.65590308592416e-06, + "loss": 0.0194, + "num_tokens": 97187143.0, + "reward": 0.7001588642597198, + "reward_std": 0.344393752515316, + "rewards/code_reward": 0.5501588322222233, + "rewards/format_reward": 1.5, + "step": 4156 + }, + { + "clip_ratio": 0.003633255895692855, + "epoch": 0.1551047059372231, + "grad_norm": 0.0924258828163147, + "kl": 0.01593017578125, + "learning_rate": 1.6543885261294356e-06, + "loss": 0.0192, + "step": 4157 + }, + { + "clip_ratio": 0.0033207208034582436, + "epoch": 0.15514201762977473, + "grad_norm": 0.0782269760966301, + "kl": 0.015625, + "learning_rate": 1.652875579881853e-06, + "loss": 0.0191, + "step": 4158 + }, + { + "clip_ratio": 0.0020613656379282475, + "completion_length": 657.1071624755859, + "epoch": 0.1551793293223264, + "grad_norm": 0.04577987641096115, + "kl": 0.02362060546875, + "learning_rate": 1.6513642478162132e-06, + "loss": -0.0007, + "num_tokens": 97259907.0, + "reward": 1.1468416154384613, + "reward_std": 0.007276914082467556, + "rewards/code_reward": 0.9968415647745132, + "rewards/format_reward": 1.5, + "step": 4159 + }, + { + "clip_ratio": 0.0023152275243774056, + "epoch": 0.15521664101487803, + "grad_norm": 0.04661145806312561, + "kl": 0.02301025390625, + "learning_rate": 1.6498545305666452e-06, + "loss": -0.0006, + "step": 4160 + }, + { + "clip_ratio": 0.0022630589664913714, + "epoch": 0.1552539527074297, + "grad_norm": 0.04772175848484039, + "kl": 0.0229339599609375, + "learning_rate": 1.648346428766597e-06, + "loss": -0.0007, + "step": 4161 + }, + { + "clip_ratio": 0.003933594096451998, + "completion_length": 547.1071548461914, + "epoch": 0.15529126439998134, + "grad_norm": 0.04001963511109352, + "kl": 0.019622802734375, + "learning_rate": 1.6468399430488401e-06, + "loss": 0.0005, + "num_tokens": 97311839.0, + "reward": 0.5071428641676903, + "reward_std": 0.12838813662528992, + "rewards/code_reward": 0.3571428582072258, + "rewards/format_reward": 1.5, + "step": 4162 + }, + { + "clip_ratio": 0.004103720071725547, + "epoch": 0.155328576092533, + "grad_norm": 0.03817526623606682, + "kl": 0.019012451171875, + "learning_rate": 1.64533507404547e-06, + "loss": 0.0004, + "step": 4163 + }, + { + "clip_ratio": 0.004145069047808647, + "epoch": 0.15536588778508464, + "grad_norm": 0.037641964852809906, + "kl": 0.019744873046875, + "learning_rate": 1.643831822387898e-06, + "loss": 0.0005, + "step": 4164 + }, + { + "clip_ratio": 0.0036870843614451587, + "completion_length": 609.3928909301758, + "epoch": 0.1554031994776363, + "grad_norm": 0.07146704196929932, + "kl": 0.017486572265625, + "learning_rate": 1.6423301887068648e-06, + "loss": 0.0053, + "num_tokens": 97368575.0, + "reward": 0.6830855160951614, + "reward_std": 0.12430264643626288, + "rewards/code_reward": 0.5330855081992922, + "rewards/format_reward": 1.5, + "step": 4165 + }, + { + "clip_ratio": 0.003289908228907734, + "epoch": 0.15544051117018795, + "grad_norm": 0.07420887053012848, + "kl": 0.017791748046875, + "learning_rate": 1.6408301736324248e-06, + "loss": 0.0053, + "step": 4166 + }, + { + "clip_ratio": 0.0030968641804065555, + "epoch": 0.15547782286273962, + "grad_norm": 0.07219193875789642, + "kl": 0.01678466796875, + "learning_rate": 1.6393317777939577e-06, + "loss": 0.0052, + "step": 4167 + }, + { + "clip_ratio": 0.0040794837404973805, + "completion_length": 720.3393249511719, + "epoch": 0.15551513455529126, + "grad_norm": 0.046082694083452225, + "kl": 0.013885498046875, + "learning_rate": 1.6378350018201638e-06, + "loss": 0.01, + "num_tokens": 97446336.0, + "reward": 0.3509771153330803, + "reward_std": 0.23829786479473114, + "rewards/code_reward": 0.20097708702087402, + "rewards/format_reward": 1.5, + "step": 4168 + }, + { + "clip_ratio": 0.004420206125359982, + "epoch": 0.15555244624784292, + "grad_norm": 0.04876362159848213, + "kl": 0.0140533447265625, + "learning_rate": 1.6363398463390595e-06, + "loss": 0.0101, + "step": 4169 + }, + { + "clip_ratio": 0.003871642576996237, + "epoch": 0.15558975794039456, + "grad_norm": 0.04872468113899231, + "kl": 0.0140380859375, + "learning_rate": 1.6348463119779861e-06, + "loss": 0.01, + "step": 4170 + }, + { + "clip_ratio": 0.004578151507303119, + "completion_length": 749.8214721679688, + "epoch": 0.15562706963294623, + "grad_norm": 0.08874976634979248, + "kl": 0.0188140869140625, + "learning_rate": 1.6333543993636041e-06, + "loss": 0.0048, + "num_tokens": 97517992.0, + "reward": 0.7012382857501507, + "reward_std": 0.21260358020663261, + "rewards/code_reward": 0.5512382090091705, + "rewards/format_reward": 1.5, + "step": 4171 + }, + { + "clip_ratio": 0.004257742082700133, + "epoch": 0.15566438132549787, + "grad_norm": 0.08114977180957794, + "kl": 0.0194854736328125, + "learning_rate": 1.63186410912189e-06, + "loss": 0.0046, + "step": 4172 + }, + { + "clip_ratio": 0.004315964237321168, + "epoch": 0.15570169301804954, + "grad_norm": 0.08518066257238388, + "kl": 0.0191802978515625, + "learning_rate": 1.6303754418781428e-06, + "loss": 0.0043, + "step": 4173 + }, + { + "clip_ratio": 0.003115071216598153, + "completion_length": 936.7857666015625, + "epoch": 0.15573900471060118, + "grad_norm": 0.04974203184247017, + "kl": 0.016693115234375, + "learning_rate": 1.6288883982569797e-06, + "loss": -0.0045, + "num_tokens": 97614286.0, + "reward": 0.45307986810803413, + "reward_std": 0.10029695462435484, + "rewards/code_reward": 0.30307986633852124, + "rewards/format_reward": 1.5, + "step": 4174 + }, + { + "clip_ratio": 0.003205414453987032, + "epoch": 0.15577631640315284, + "grad_norm": 0.05232011526823044, + "kl": 0.0168304443359375, + "learning_rate": 1.6274029788823365e-06, + "loss": -0.0045, + "step": 4175 + }, + { + "clip_ratio": 0.002978930191602558, + "epoch": 0.15581362809570448, + "grad_norm": 0.04711124673485756, + "kl": 0.0168304443359375, + "learning_rate": 1.6259191843774685e-06, + "loss": -0.0045, + "step": 4176 + }, + { + "clip_ratio": 0.0026902910321950912, + "completion_length": 561.1428833007812, + "epoch": 0.15585093978825615, + "grad_norm": 0.051200758665800095, + "kl": 0.01556396484375, + "learning_rate": 1.6244370153649465e-06, + "loss": -0.0038, + "num_tokens": 97670812.0, + "reward": 0.582539688795805, + "reward_std": 0.02770135924220085, + "rewards/code_reward": 0.4325396828353405, + "rewards/format_reward": 1.5, + "step": 4177 + }, + { + "clip_ratio": 0.0026442554080858827, + "epoch": 0.1558882514808078, + "grad_norm": 0.048353567719459534, + "kl": 0.01544189453125, + "learning_rate": 1.6229564724666633e-06, + "loss": -0.0038, + "step": 4178 + }, + { + "clip_ratio": 0.0024498646380379796, + "epoch": 0.15592556317335945, + "grad_norm": 0.043704185634851456, + "kl": 0.0159759521484375, + "learning_rate": 1.6214775563038245e-06, + "loss": -0.0038, + "step": 4179 + }, + { + "clip_ratio": 0.0028367958730086684, + "completion_length": 688.7857513427734, + "epoch": 0.1559628748659111, + "grad_norm": 0.05691620707511902, + "kl": 0.021636962890625, + "learning_rate": 1.6200002674969574e-06, + "loss": -0.0022, + "num_tokens": 97756452.0, + "reward": 0.6706043928861618, + "reward_std": 0.016429541632533073, + "rewards/code_reward": 0.5206043981015682, + "rewards/format_reward": 1.5, + "step": 4180 + }, + { + "clip_ratio": 0.0027665792731568217, + "epoch": 0.15600018655846276, + "grad_norm": 0.042764727026224136, + "kl": 0.0220489501953125, + "learning_rate": 1.6185246066659046e-06, + "loss": -0.0022, + "step": 4181 + }, + { + "clip_ratio": 0.002860999433323741, + "epoch": 0.1560374982510144, + "grad_norm": 0.050270192325115204, + "kl": 0.022003173828125, + "learning_rate": 1.617050574429827e-06, + "loss": -0.0023, + "step": 4182 + }, + { + "clip_ratio": 0.0036595865967683494, + "completion_length": 680.5536041259766, + "epoch": 0.15607480994356607, + "grad_norm": 0.07619096338748932, + "kl": 0.013946533203125, + "learning_rate": 1.6155781714072004e-06, + "loss": -0.0119, + "num_tokens": 97824929.0, + "reward": 0.6543499529361725, + "reward_std": 0.25333315413445234, + "rewards/code_reward": 0.5043499004095793, + "rewards/format_reward": 1.5, + "step": 4183 + }, + { + "clip_ratio": 0.003475522273220122, + "epoch": 0.1561121216361177, + "grad_norm": 0.06405338644981384, + "kl": 0.0140228271484375, + "learning_rate": 1.6141073982158175e-06, + "loss": -0.0119, + "step": 4184 + }, + { + "clip_ratio": 0.003064706572331488, + "epoch": 0.15614943332866937, + "grad_norm": 0.06486498564481735, + "kl": 0.0141143798828125, + "learning_rate": 1.6126382554727873e-06, + "loss": -0.0123, + "step": 4185 + }, + { + "clip_ratio": 0.004096611344721168, + "completion_length": 844.732177734375, + "epoch": 0.156186745021221, + "grad_norm": 0.06630203127861023, + "kl": 0.0161590576171875, + "learning_rate": 1.6111707437945357e-06, + "loss": 0.0136, + "num_tokens": 97907836.0, + "reward": 0.5677606239914894, + "reward_std": 0.3133349008858204, + "rewards/code_reward": 0.4177606161683798, + "rewards/format_reward": 1.5, + "step": 4186 + }, + { + "clip_ratio": 0.004106380103621632, + "epoch": 0.15622405671377268, + "grad_norm": 0.06545209884643555, + "kl": 0.0160980224609375, + "learning_rate": 1.6097048637968024e-06, + "loss": 0.0135, + "step": 4187 + }, + { + "clip_ratio": 0.0037050495739094913, + "epoch": 0.15626136840632432, + "grad_norm": 0.06600600481033325, + "kl": 0.0164642333984375, + "learning_rate": 1.6082406160946446e-06, + "loss": 0.0135, + "step": 4188 + }, + { + "clip_ratio": 0.005150126584339887, + "completion_length": 829.4464874267578, + "epoch": 0.15629868009887599, + "grad_norm": 0.06584896147251129, + "kl": 0.0177764892578125, + "learning_rate": 1.6067780013024314e-06, + "loss": -0.0042, + "num_tokens": 97989887.0, + "reward": 0.489698588848114, + "reward_std": 0.2411893643438816, + "rewards/code_reward": 0.33969856100156903, + "rewards/format_reward": 1.5, + "step": 4189 + }, + { + "clip_ratio": 0.004517289053183049, + "epoch": 0.15633599179142763, + "grad_norm": 0.06738822162151337, + "kl": 0.017730712890625, + "learning_rate": 1.6053170200338502e-06, + "loss": -0.0044, + "step": 4190 + }, + { + "clip_ratio": 0.004594201222062111, + "epoch": 0.1563733034839793, + "grad_norm": 0.070496566593647, + "kl": 0.01800537109375, + "learning_rate": 1.6038576729019018e-06, + "loss": -0.0043, + "step": 4191 + }, + { + "clip_ratio": 0.0024124188348650932, + "completion_length": 582.5535888671875, + "epoch": 0.15641061517653093, + "grad_norm": 0.0018403447465971112, + "kl": 0.018890380859375, + "learning_rate": 1.6023999605189e-06, + "loss": 0.0003, + "num_tokens": 98045218.0, + "reward": 0.40242718905210495, + "reward_std": 0.0, + "rewards/code_reward": 0.25242718448862433, + "rewards/format_reward": 1.5, + "step": 4192 + }, + { + "clip_ratio": 0.002154019777663052, + "epoch": 0.1564479268690826, + "grad_norm": 0.0018327709985896945, + "kl": 0.0189666748046875, + "learning_rate": 1.6009438834964757e-06, + "loss": 0.0003, + "step": 4193 + }, + { + "clip_ratio": 0.002071451162919402, + "epoch": 0.15648523856163427, + "grad_norm": 0.0018370654433965683, + "kl": 0.019195556640625, + "learning_rate": 1.5994894424455702e-06, + "loss": 0.0003, + "step": 4194 + }, + { + "clip_ratio": 0.004840819397941232, + "completion_length": 636.482177734375, + "epoch": 0.1565225502541859, + "grad_norm": 0.08962289243936539, + "kl": 0.016876220703125, + "learning_rate": 1.5980366379764401e-06, + "loss": 0.0042, + "num_tokens": 98118829.0, + "reward": 0.7236688137054443, + "reward_std": 0.3447258435189724, + "rewards/code_reward": 0.573668833822012, + "rewards/format_reward": 1.5, + "step": 4195 + }, + { + "clip_ratio": 0.004872186342254281, + "epoch": 0.15655986194673757, + "grad_norm": 0.08802531659603119, + "kl": 0.0171661376953125, + "learning_rate": 1.5965854706986562e-06, + "loss": 0.0042, + "step": 4196 + }, + { + "clip_ratio": 0.004473175213206559, + "epoch": 0.1565971736392892, + "grad_norm": 0.09369193762540817, + "kl": 0.017242431640625, + "learning_rate": 1.5951359412211004e-06, + "loss": 0.004, + "step": 4197 + }, + { + "clip_ratio": 0.0024966384517028928, + "completion_length": 521.6607437133789, + "epoch": 0.15663448533184088, + "grad_norm": 0.05583123862743378, + "kl": 0.0139923095703125, + "learning_rate": 1.5936880501519678e-06, + "loss": 0.0031, + "num_tokens": 98169168.0, + "reward": 0.8715368323028088, + "reward_std": 0.07744744420051575, + "rewards/code_reward": 0.7215368151664734, + "rewards/format_reward": 1.5, + "step": 4198 + }, + { + "clip_ratio": 0.002485555305611342, + "epoch": 0.15667179702439252, + "grad_norm": 0.05390317365527153, + "kl": 0.0135345458984375, + "learning_rate": 1.5922417980987676e-06, + "loss": 0.0029, + "step": 4199 + }, + { + "clip_ratio": 0.002764796663541347, + "epoch": 0.15670910871694418, + "grad_norm": 0.061850883066654205, + "kl": 0.0142364501953125, + "learning_rate": 1.5907971856683201e-06, + "loss": 0.0031, + "step": 4200 + }, + { + "clip_ratio": 0.0031665691640228033, + "completion_length": 703.3750228881836, + "epoch": 0.15674642040949582, + "grad_norm": 0.03638725355267525, + "kl": 0.02056884765625, + "learning_rate": 1.5893542134667588e-06, + "loss": -0.0006, + "num_tokens": 98239007.0, + "reward": 0.2928571477532387, + "reward_std": 0.06681530922651291, + "rewards/code_reward": 0.14285714365541935, + "rewards/format_reward": 1.5, + "step": 4201 + }, + { + "clip_ratio": 0.00279585353564471, + "epoch": 0.1567837321020475, + "grad_norm": 0.03540707379579544, + "kl": 0.02032470703125, + "learning_rate": 1.587912882099526e-06, + "loss": -0.0007, + "step": 4202 + }, + { + "clip_ratio": 0.0035086260177195072, + "epoch": 0.15682104379459913, + "grad_norm": 0.037933509796857834, + "kl": 0.02081298828125, + "learning_rate": 1.5864731921713786e-06, + "loss": -0.0006, + "step": 4203 + }, + { + "clip_ratio": 0.003113536396995187, + "completion_length": 656.6786117553711, + "epoch": 0.1568583554871508, + "grad_norm": 0.0677608922123909, + "kl": 0.0178985595703125, + "learning_rate": 1.5850351442863849e-06, + "loss": 0.0048, + "num_tokens": 98307533.0, + "reward": 0.4972490519285202, + "reward_std": 0.09509982075542212, + "rewards/code_reward": 0.34724903665483, + "rewards/format_reward": 1.5, + "step": 4204 + }, + { + "clip_ratio": 0.0030785270500928164, + "epoch": 0.15689566717970244, + "grad_norm": 0.0839429646730423, + "kl": 0.0176544189453125, + "learning_rate": 1.5835987390479206e-06, + "loss": 0.0046, + "step": 4205 + }, + { + "clip_ratio": 0.003434319980442524, + "epoch": 0.1569329788722541, + "grad_norm": 0.08268710970878601, + "kl": 0.018035888671875, + "learning_rate": 1.5821639770586769e-06, + "loss": 0.0046, + "step": 4206 + }, + { + "clip_ratio": 0.0020391446305438876, + "completion_length": 582.6785888671875, + "epoch": 0.15697029056480574, + "grad_norm": 0.03352895379066467, + "kl": 0.016815185546875, + "learning_rate": 1.5807308589206518e-06, + "loss": -0.0066, + "num_tokens": 98374855.0, + "reward": 0.8157245628535748, + "reward_std": 0.11751863360404968, + "rewards/code_reward": 0.6657245457172394, + "rewards/format_reward": 1.5, + "step": 4207 + }, + { + "clip_ratio": 0.002022822853177786, + "epoch": 0.1570076022573574, + "grad_norm": 0.0346827358007431, + "kl": 0.0167694091796875, + "learning_rate": 1.5792993852351555e-06, + "loss": -0.0067, + "step": 4208 + }, + { + "clip_ratio": 0.0020197119447402656, + "epoch": 0.15704491394990905, + "grad_norm": 0.03358834236860275, + "kl": 0.016876220703125, + "learning_rate": 1.577869556602808e-06, + "loss": -0.0067, + "step": 4209 + }, + { + "clip_ratio": 0.003127516887616366, + "completion_length": 492.60716247558594, + "epoch": 0.15708222564246072, + "grad_norm": 0.08302333950996399, + "kl": 0.0155181884765625, + "learning_rate": 1.5764413736235385e-06, + "loss": -0.0012, + "num_tokens": 98426745.0, + "reward": 0.6910835690796375, + "reward_std": 0.10290459188399836, + "rewards/code_reward": 0.5410835509828757, + "rewards/format_reward": 1.5, + "step": 4210 + }, + { + "clip_ratio": 0.003032979730051011, + "epoch": 0.15711953733501235, + "grad_norm": 0.0820062980055809, + "kl": 0.015594482421875, + "learning_rate": 1.5750148368965871e-06, + "loss": -0.0013, + "step": 4211 + }, + { + "clip_ratio": 0.0031957244500517845, + "epoch": 0.15715684902756402, + "grad_norm": 0.07533209025859833, + "kl": 0.0156707763671875, + "learning_rate": 1.5735899470205006e-06, + "loss": -0.0015, + "step": 4212 + }, + { + "clip_ratio": 0.004587343253660947, + "completion_length": 881.8036041259766, + "epoch": 0.15719416072011566, + "grad_norm": 0.05620914697647095, + "kl": 0.0167999267578125, + "learning_rate": 1.5721667045931366e-06, + "loss": 0.004, + "num_tokens": 98518350.0, + "reward": 0.3657738193869591, + "reward_std": 0.17168286815285683, + "rewards/code_reward": 0.2157738097012043, + "rewards/format_reward": 1.5, + "step": 4213 + }, + { + "clip_ratio": 0.0043206114787608385, + "epoch": 0.15723147241266733, + "grad_norm": 0.055890489369630814, + "kl": 0.0171966552734375, + "learning_rate": 1.570745110211663e-06, + "loss": 0.0038, + "step": 4214 + }, + { + "clip_ratio": 0.0044914602767676115, + "epoch": 0.15726878410521897, + "grad_norm": 0.05454843118786812, + "kl": 0.016815185546875, + "learning_rate": 1.569325164472552e-06, + "loss": 0.0037, + "step": 4215 + }, + { + "clip_ratio": 0.0043458808795548975, + "completion_length": 639.4643096923828, + "epoch": 0.15730609579777063, + "grad_norm": 0.08637524396181107, + "kl": 0.0189971923828125, + "learning_rate": 1.567906867971587e-06, + "loss": -0.0016, + "num_tokens": 98585338.0, + "reward": 0.5136010982096195, + "reward_std": 0.10684968531131744, + "rewards/code_reward": 0.36360107734799385, + "rewards/format_reward": 1.5, + "step": 4216 + }, + { + "clip_ratio": 0.004430457251146436, + "epoch": 0.15734340749032227, + "grad_norm": 0.07487939298152924, + "kl": 0.0273895263671875, + "learning_rate": 1.5664902213038597e-06, + "loss": -0.0016, + "step": 4217 + }, + { + "clip_ratio": 0.003999961190856993, + "epoch": 0.15738071918287394, + "grad_norm": 0.07138683646917343, + "kl": 0.0230560302734375, + "learning_rate": 1.5650752250637683e-06, + "loss": -0.0019, + "step": 4218 + }, + { + "clip_ratio": 0.001048062345944345, + "completion_length": 506.30359649658203, + "epoch": 0.15741803087542558, + "grad_norm": 0.04653027653694153, + "kl": 0.035675048828125, + "learning_rate": 1.5636618798450198e-06, + "loss": -0.0052, + "num_tokens": 98647641.0, + "reward": 0.8950745612382889, + "reward_std": 0.07407370954751968, + "rewards/code_reward": 0.7450745888054371, + "rewards/format_reward": 1.5, + "step": 4219 + }, + { + "clip_ratio": 0.0009513592231087387, + "epoch": 0.15745534256797725, + "grad_norm": 0.040911272168159485, + "kl": 0.0320587158203125, + "learning_rate": 1.562250186240626e-06, + "loss": -0.0053, + "step": 4220 + }, + { + "clip_ratio": 0.0010981446830555797, + "epoch": 0.15749265426052889, + "grad_norm": 0.041476327925920486, + "kl": 0.0355072021484375, + "learning_rate": 1.5608401448429084e-06, + "loss": -0.0054, + "step": 4221 + }, + { + "clip_ratio": 0.004447543527930975, + "completion_length": 722.5357513427734, + "epoch": 0.15752996595308055, + "grad_norm": 0.08791866153478622, + "kl": 0.0175323486328125, + "learning_rate": 1.5594317562434939e-06, + "loss": 0.0189, + "num_tokens": 98714059.0, + "reward": 0.5708877071738243, + "reward_std": 0.3446964956820011, + "rewards/code_reward": 0.42088770866394043, + "rewards/format_reward": 1.5, + "step": 4222 + }, + { + "clip_ratio": 0.004352724994532764, + "epoch": 0.1575672776456322, + "grad_norm": 0.08835244923830032, + "kl": 0.0170745849609375, + "learning_rate": 1.5580250210333153e-06, + "loss": 0.0188, + "step": 4223 + }, + { + "clip_ratio": 0.004699610290117562, + "epoch": 0.15760458933818386, + "grad_norm": 0.08753608912229538, + "kl": 0.017333984375, + "learning_rate": 1.556619939802615e-06, + "loss": 0.019, + "step": 4224 + }, + { + "clip_ratio": 0.001910457736812532, + "completion_length": 663.0714569091797, + "epoch": 0.1576419010307355, + "grad_norm": 0.0791865810751915, + "kl": 0.0141143798828125, + "learning_rate": 1.5552165131409361e-06, + "loss": 0.0034, + "num_tokens": 98771869.0, + "reward": 0.8172247149050236, + "reward_std": 0.07273988169617951, + "rewards/code_reward": 0.6672246982343495, + "rewards/format_reward": 1.5, + "step": 4225 + }, + { + "clip_ratio": 0.0018608878017403185, + "epoch": 0.15767921272328717, + "grad_norm": 0.07863335311412811, + "kl": 0.0143585205078125, + "learning_rate": 1.5538147416371317e-06, + "loss": 0.0036, + "step": 4226 + }, + { + "clip_ratio": 0.001700944674666971, + "epoch": 0.1577165244158388, + "grad_norm": 0.07982944697141647, + "kl": 0.0136260986328125, + "learning_rate": 1.552414625879359e-06, + "loss": 0.0033, + "step": 4227 + }, + { + "clip_ratio": 0.0016833693662192672, + "completion_length": 640.0178833007812, + "epoch": 0.15775383610839047, + "grad_norm": 0.0533568374812603, + "kl": 0.017822265625, + "learning_rate": 1.5510161664550806e-06, + "loss": 0.0073, + "num_tokens": 98840624.0, + "reward": 0.7675284296274185, + "reward_std": 0.15696460753679276, + "rewards/code_reward": 0.6175284311175346, + "rewards/format_reward": 1.5, + "step": 4228 + }, + { + "clip_ratio": 0.0016087769763544202, + "epoch": 0.1577911478009421, + "grad_norm": 0.05639702081680298, + "kl": 0.017913818359375, + "learning_rate": 1.5496193639510653e-06, + "loss": 0.0072, + "step": 4229 + }, + { + "clip_ratio": 0.0015045604668557644, + "epoch": 0.15782845949349378, + "grad_norm": 0.054584652185440063, + "kl": 0.0178375244140625, + "learning_rate": 1.5482242189533833e-06, + "loss": 0.0072, + "step": 4230 + }, + { + "clip_ratio": 0.0037505034124478698, + "completion_length": 727.7143249511719, + "epoch": 0.15786577118604542, + "grad_norm": 0.089059017598629, + "kl": 0.02264404296875, + "learning_rate": 1.546830732047412e-06, + "loss": -0.0028, + "num_tokens": 98911084.0, + "reward": 0.5207244232296944, + "reward_std": 0.18586770072579384, + "rewards/code_reward": 0.37072440423071384, + "rewards/format_reward": 1.5, + "step": 4231 + }, + { + "clip_ratio": 0.003533079114276916, + "epoch": 0.15790308287859708, + "grad_norm": 0.08186834305524826, + "kl": 0.0228271484375, + "learning_rate": 1.5454389038178345e-06, + "loss": -0.0028, + "step": 4232 + }, + { + "clip_ratio": 0.0035082019749097526, + "epoch": 0.15794039457114872, + "grad_norm": 0.10513678193092346, + "kl": 0.022308349609375, + "learning_rate": 1.5440487348486335e-06, + "loss": -0.003, + "step": 4233 + }, + { + "clip_ratio": 0.0031429821392521262, + "completion_length": 952.2143402099609, + "epoch": 0.1579777062637004, + "grad_norm": 0.057069312781095505, + "kl": 0.0169219970703125, + "learning_rate": 1.5426602257230998e-06, + "loss": 0.0082, + "num_tokens": 99005946.0, + "reward": 0.624993059784174, + "reward_std": 0.1377062276005745, + "rewards/code_reward": 0.4749930687248707, + "rewards/format_reward": 1.5, + "step": 4234 + }, + { + "clip_ratio": 0.0032345952349714935, + "epoch": 0.15801501795625203, + "grad_norm": 0.058460015803575516, + "kl": 0.01671600341796875, + "learning_rate": 1.541273377023825e-06, + "loss": 0.0082, + "step": 4235 + }, + { + "clip_ratio": 0.003419381915591657, + "epoch": 0.1580523296488037, + "grad_norm": 0.05719854310154915, + "kl": 0.01708984375, + "learning_rate": 1.5398881893327058e-06, + "loss": 0.0082, + "step": 4236 + }, + { + "clip_ratio": 0.004403818573337048, + "completion_length": 754.3928833007812, + "epoch": 0.15808964134135534, + "grad_norm": 0.08058004826307297, + "kl": 0.0242919921875, + "learning_rate": 1.5385046632309423e-06, + "loss": -0.0061, + "num_tokens": 99084340.0, + "reward": 0.262953270226717, + "reward_std": 0.13992928341031075, + "rewards/code_reward": 0.1129532614722848, + "rewards/format_reward": 1.5, + "step": 4237 + }, + { + "clip_ratio": 0.004617123166099191, + "epoch": 0.158126953033907, + "grad_norm": 0.06340429186820984, + "kl": 0.0240478515625, + "learning_rate": 1.5371227992990334e-06, + "loss": -0.0062, + "step": 4238 + }, + { + "clip_ratio": 0.004397748387418687, + "epoch": 0.15816426472645864, + "grad_norm": 0.07258884608745575, + "kl": 0.024322509765625, + "learning_rate": 1.5357425981167865e-06, + "loss": -0.0063, + "step": 4239 + }, + { + "clip_ratio": 0.00246106565464288, + "completion_length": 549.4286041259766, + "epoch": 0.1582015764190103, + "grad_norm": 0.04151083528995514, + "kl": 0.016448974609375, + "learning_rate": 1.5343640602633065e-06, + "loss": -0.0048, + "num_tokens": 99142632.0, + "reward": 0.7232368849217892, + "reward_std": 0.07600875198841095, + "rewards/code_reward": 0.5732368901371956, + "rewards/format_reward": 1.5, + "step": 4240 + }, + { + "clip_ratio": 0.0025206011487171054, + "epoch": 0.15823888811156195, + "grad_norm": 0.040080904960632324, + "kl": 0.01611328125, + "learning_rate": 1.532987186317002e-06, + "loss": -0.0048, + "step": 4241 + }, + { + "clip_ratio": 0.0024037614930421114, + "epoch": 0.15827619980411362, + "grad_norm": 0.04355061054229736, + "kl": 0.0161895751953125, + "learning_rate": 1.531611976855586e-06, + "loss": -0.0048, + "step": 4242 + }, + { + "clip_ratio": 0.0031458704033866525, + "completion_length": 736.1964569091797, + "epoch": 0.15831351149666525, + "grad_norm": 0.06912952661514282, + "kl": 0.0200958251953125, + "learning_rate": 1.5302384324560682e-06, + "loss": 0.0323, + "num_tokens": 99213217.0, + "reward": 0.6531039737164974, + "reward_std": 0.12325534597039223, + "rewards/code_reward": 0.5057825185358524, + "rewards/format_reward": 1.4732142984867096, + "step": 4243 + }, + { + "clip_ratio": 0.002858368621673435, + "epoch": 0.15835082318921692, + "grad_norm": 0.05526183918118477, + "kl": 0.0199432373046875, + "learning_rate": 1.528866553694764e-06, + "loss": 0.0323, + "step": 4244 + }, + { + "clip_ratio": 0.0030627468950115144, + "epoch": 0.15838813488176856, + "grad_norm": 0.06754317879676819, + "kl": 0.019989013671875, + "learning_rate": 1.5274963411472873e-06, + "loss": 0.0323, + "step": 4245 + }, + { + "clip_ratio": 0.004175329173449427, + "completion_length": 731.3750305175781, + "epoch": 0.15842544657432023, + "grad_norm": 0.08494614064693451, + "kl": 0.026123046875, + "learning_rate": 1.5261277953885546e-06, + "loss": 0.0111, + "num_tokens": 99286696.0, + "reward": 0.48540638387203217, + "reward_std": 0.2984174154698849, + "rewards/code_reward": 0.3354063667356968, + "rewards/format_reward": 1.5, + "step": 4246 + }, + { + "clip_ratio": 0.004449267406016588, + "epoch": 0.1584627582668719, + "grad_norm": 0.08559686690568924, + "kl": 0.0260009765625, + "learning_rate": 1.5247609169927828e-06, + "loss": 0.0113, + "step": 4247 + }, + { + "clip_ratio": 0.004222371673677117, + "epoch": 0.15850006995942353, + "grad_norm": 0.08528362959623337, + "kl": 0.026275634765625, + "learning_rate": 1.523395706533487e-06, + "loss": 0.0111, + "step": 4248 + }, + { + "clip_ratio": 0.003511365270242095, + "completion_length": 669.8750152587891, + "epoch": 0.1585373816519752, + "grad_norm": 0.07124966382980347, + "kl": 0.01995849609375, + "learning_rate": 1.5220321645834856e-06, + "loss": -0.0048, + "num_tokens": 99360007.0, + "reward": 0.5507723316550255, + "reward_std": 0.22452562348917127, + "rewards/code_reward": 0.40077234600903466, + "rewards/format_reward": 1.5, + "step": 4249 + }, + { + "clip_ratio": 0.003715359780471772, + "epoch": 0.15857469334452684, + "grad_norm": 0.08230073004961014, + "kl": 0.02044677734375, + "learning_rate": 1.5206702917148948e-06, + "loss": -0.0048, + "step": 4250 + }, + { + "clip_ratio": 0.003515651624184102, + "epoch": 0.1586120050370785, + "grad_norm": 0.07022235542535782, + "kl": 0.020660400390625, + "learning_rate": 1.5193100884991304e-06, + "loss": -0.0049, + "step": 4251 + }, + { + "clip_ratio": 0.004584379494190216, + "completion_length": 598.8214569091797, + "epoch": 0.15864931672963015, + "grad_norm": 0.08495364338159561, + "kl": 0.0179901123046875, + "learning_rate": 1.5179515555069094e-06, + "loss": 0.0007, + "num_tokens": 99423263.0, + "reward": 0.5005610138177872, + "reward_std": 0.20423077419400215, + "rewards/code_reward": 0.35056097432971, + "rewards/format_reward": 1.5, + "step": 4252 + }, + { + "clip_ratio": 0.005004429840482771, + "epoch": 0.1586866284221818, + "grad_norm": 0.08511221408843994, + "kl": 0.018096923828125, + "learning_rate": 1.5165946933082473e-06, + "loss": 0.0007, + "step": 4253 + }, + { + "clip_ratio": 0.0043500736355781555, + "epoch": 0.15872394011473345, + "grad_norm": 0.0814647451043129, + "kl": 0.018157958984375, + "learning_rate": 1.5152395024724587e-06, + "loss": 0.0004, + "step": 4254 + }, + { + "clip_ratio": 0.0032982173725031316, + "completion_length": 657.8214721679688, + "epoch": 0.15876125180728512, + "grad_norm": 0.058922842144966125, + "kl": 0.0164794921875, + "learning_rate": 1.5138859835681544e-06, + "loss": -0.0053, + "num_tokens": 99494807.0, + "reward": 0.5413332283496857, + "reward_std": 0.026084575336426497, + "rewards/code_reward": 0.3913331697694957, + "rewards/format_reward": 1.5, + "step": 4255 + }, + { + "clip_ratio": 0.0033089967910200357, + "epoch": 0.15879856349983676, + "grad_norm": 0.06000510975718498, + "kl": 0.0164337158203125, + "learning_rate": 1.5125341371632472e-06, + "loss": -0.0054, + "step": 4256 + }, + { + "clip_ratio": 0.0034308016183786094, + "epoch": 0.15883587519238843, + "grad_norm": 0.06041426584124565, + "kl": 0.0165252685546875, + "learning_rate": 1.5111839638249476e-06, + "loss": -0.0054, + "step": 4257 + }, + { + "clip_ratio": 0.004808855359442532, + "completion_length": 728.4821853637695, + "epoch": 0.15887318688494007, + "grad_norm": 0.05849829316139221, + "kl": 0.0185394287109375, + "learning_rate": 1.509835464119761e-06, + "loss": 0.0076, + "num_tokens": 99571834.0, + "reward": 0.4224540516734123, + "reward_std": 0.19108930975198746, + "rewards/code_reward": 0.27245403081178665, + "rewards/format_reward": 1.5, + "step": 4258 + }, + { + "clip_ratio": 0.0045886008301749825, + "epoch": 0.15891049857749173, + "grad_norm": 0.05784980207681656, + "kl": 0.01861572265625, + "learning_rate": 1.508488638613494e-06, + "loss": 0.0077, + "step": 4259 + }, + { + "clip_ratio": 0.004389868816360831, + "epoch": 0.15894781027004337, + "grad_norm": 0.061627067625522614, + "kl": 0.018585205078125, + "learning_rate": 1.507143487871251e-06, + "loss": 0.0075, + "step": 4260 + }, + { + "clip_ratio": 0.002596792357508093, + "completion_length": 711.6964645385742, + "epoch": 0.15898512196259504, + "grad_norm": 0.20706702768802643, + "kl": 0.0226898193359375, + "learning_rate": 1.5058000124574296e-06, + "loss": 0.0715, + "num_tokens": 99640207.0, + "reward": 0.7973424196243286, + "reward_std": 0.14961105026304722, + "rewards/code_reward": 0.6500209793448448, + "rewards/format_reward": 1.4732142984867096, + "step": 4261 + }, + { + "clip_ratio": 0.0029163375729694963, + "epoch": 0.15902243365514668, + "grad_norm": 0.06244482100009918, + "kl": 0.02239990234375, + "learning_rate": 1.5044582129357285e-06, + "loss": 0.071, + "step": 4262 + }, + { + "clip_ratio": 0.0024245752720162272, + "epoch": 0.15905974534769834, + "grad_norm": 0.057670529931783676, + "kl": 0.0221710205078125, + "learning_rate": 1.503118089869142e-06, + "loss": 0.0709, + "step": 4263 + }, + { + "clip_ratio": 0.0035201606224291027, + "completion_length": 767.2143249511719, + "epoch": 0.15909705704024998, + "grad_norm": 0.07493014633655548, + "kl": 0.018890380859375, + "learning_rate": 1.50177964381996e-06, + "loss": 0.0168, + "num_tokens": 99714825.0, + "reward": 0.43884041905403137, + "reward_std": 0.2354419466573745, + "rewards/code_reward": 0.2888404028490186, + "rewards/format_reward": 1.5, + "step": 4264 + }, + { + "clip_ratio": 0.0033515318064019084, + "epoch": 0.15913436873280165, + "grad_norm": 0.07801833003759384, + "kl": 0.01763916015625, + "learning_rate": 1.5004428753497715e-06, + "loss": 0.0166, + "step": 4265 + }, + { + "clip_ratio": 0.0034218025975860655, + "epoch": 0.1591716804253533, + "grad_norm": 0.0738137736916542, + "kl": 0.0181884765625, + "learning_rate": 1.4991077850194578e-06, + "loss": 0.0166, + "step": 4266 + }, + { + "clip_ratio": 0.003204001870471984, + "completion_length": 640.8928756713867, + "epoch": 0.15920899211790496, + "grad_norm": 0.04913729801774025, + "kl": 0.0142669677734375, + "learning_rate": 1.4977743733891991e-06, + "loss": -0.004, + "num_tokens": 99773171.0, + "reward": 0.6083995141088963, + "reward_std": 0.17935745790600777, + "rewards/code_reward": 0.4583994671702385, + "rewards/format_reward": 1.5, + "step": 4267 + }, + { + "clip_ratio": 0.0028805450128857046, + "epoch": 0.1592463038104566, + "grad_norm": 0.04766136780381203, + "kl": 0.0140533447265625, + "learning_rate": 1.4964426410184685e-06, + "loss": -0.0041, + "step": 4268 + }, + { + "clip_ratio": 0.0031361831934191287, + "epoch": 0.15928361550300826, + "grad_norm": 0.0569269135594368, + "kl": 0.0137481689453125, + "learning_rate": 1.4951125884660378e-06, + "loss": -0.004, + "step": 4269 + }, + { + "clip_ratio": 0.003297125396784395, + "completion_length": 730.0536193847656, + "epoch": 0.1593209271955599, + "grad_norm": 0.04844774305820465, + "kl": 0.0174713134765625, + "learning_rate": 1.493784216289972e-06, + "loss": 0.0017, + "num_tokens": 99843090.0, + "reward": 0.36938774958252907, + "reward_std": 0.1961793452501297, + "rewards/code_reward": 0.21938775479793549, + "rewards/format_reward": 1.5, + "step": 4270 + }, + { + "clip_ratio": 0.0036188638187013566, + "epoch": 0.15935823888811157, + "grad_norm": 0.053020525723695755, + "kl": 0.01806640625, + "learning_rate": 1.4924575250476309e-06, + "loss": 0.0016, + "step": 4271 + }, + { + "clip_ratio": 0.003354721935465932, + "epoch": 0.1593955505806632, + "grad_norm": 0.04935632273554802, + "kl": 0.0182342529296875, + "learning_rate": 1.491132515295671e-06, + "loss": 0.0018, + "step": 4272 + }, + { + "clip_ratio": 0.0035727331414818764, + "completion_length": 539.7500152587891, + "epoch": 0.15943286227321488, + "grad_norm": 0.052034907042980194, + "kl": 0.021881103515625, + "learning_rate": 1.4898091875900396e-06, + "loss": 0.0081, + "num_tokens": 99895254.0, + "reward": 0.506775937974453, + "reward_std": 0.044623224064707756, + "rewards/code_reward": 0.35677592922002077, + "rewards/format_reward": 1.5, + "step": 4273 + }, + { + "clip_ratio": 0.00363435031613335, + "epoch": 0.15947017396576652, + "grad_norm": 0.052813004702329636, + "kl": 0.0216217041015625, + "learning_rate": 1.488487542485982e-06, + "loss": 0.008, + "step": 4274 + }, + { + "clip_ratio": 0.0032664532773196697, + "epoch": 0.15950748565831818, + "grad_norm": 0.05164273455739021, + "kl": 0.021514892578125, + "learning_rate": 1.4871675805380358e-06, + "loss": 0.0079, + "step": 4275 + }, + { + "clip_ratio": 0.004395827534608543, + "completion_length": 779.6607360839844, + "epoch": 0.15954479735086982, + "grad_norm": 0.07985920459032059, + "kl": 0.0203704833984375, + "learning_rate": 1.4858493023000315e-06, + "loss": 0.0017, + "num_tokens": 99980411.0, + "reward": 0.45919523015618324, + "reward_std": 0.17873779800720513, + "rewards/code_reward": 0.30919521872419864, + "rewards/format_reward": 1.5, + "step": 4276 + }, + { + "clip_ratio": 0.004792773514054716, + "epoch": 0.1595821090434215, + "grad_norm": 0.08000540733337402, + "kl": 0.0204010009765625, + "learning_rate": 1.4845327083250953e-06, + "loss": 0.002, + "step": 4277 + }, + { + "clip_ratio": 0.0045957029215060174, + "epoch": 0.15961942073597313, + "grad_norm": 0.0798782929778099, + "kl": 0.0201568603515625, + "learning_rate": 1.4832177991656446e-06, + "loss": 0.0018, + "step": 4278 + }, + { + "clip_ratio": 0.004300229076761752, + "completion_length": 533.9107360839844, + "epoch": 0.1596567324285248, + "grad_norm": 0.09000969678163528, + "kl": 0.0192108154296875, + "learning_rate": 1.4819045753733909e-06, + "loss": 0.0019, + "num_tokens": 100037050.0, + "reward": 0.4391000345349312, + "reward_std": 0.25099348940420896, + "rewards/code_reward": 0.28910001181066036, + "rewards/format_reward": 1.5, + "step": 4279 + }, + { + "clip_ratio": 0.004316634265705943, + "epoch": 0.15969404412107643, + "grad_norm": 0.09168195724487305, + "kl": 0.019012451171875, + "learning_rate": 1.4805930374993394e-06, + "loss": 0.0021, + "step": 4280 + }, + { + "clip_ratio": 0.004711580055300146, + "epoch": 0.1597313558136281, + "grad_norm": 0.08665557950735092, + "kl": 0.0189361572265625, + "learning_rate": 1.4792831860937864e-06, + "loss": 0.0016, + "step": 4281 + }, + { + "clip_ratio": 0.0027684883098118007, + "completion_length": 679.1071853637695, + "epoch": 0.15976866750617974, + "grad_norm": 0.05620564892888069, + "kl": 0.017852783203125, + "learning_rate": 1.4779750217063221e-06, + "loss": 0.0093, + "num_tokens": 100105278.0, + "reward": 0.8994658291339874, + "reward_std": 0.17836644127964973, + "rewards/code_reward": 0.749465811997652, + "rewards/format_reward": 1.5, + "step": 4282 + }, + { + "clip_ratio": 0.0028267757152207196, + "epoch": 0.1598059791987314, + "grad_norm": 0.05954772233963013, + "kl": 0.0182342529296875, + "learning_rate": 1.4766685448858264e-06, + "loss": 0.0093, + "step": 4283 + }, + { + "clip_ratio": 0.0026951520703732967, + "epoch": 0.15984329089128305, + "grad_norm": 0.057129401713609695, + "kl": 0.017822265625, + "learning_rate": 1.4753637561804734e-06, + "loss": 0.0093, + "step": 4284 + }, + { + "clip_ratio": 0.003329312603455037, + "completion_length": 768.8036193847656, + "epoch": 0.1598806025838347, + "grad_norm": 0.06733833253383636, + "kl": 0.0152435302734375, + "learning_rate": 1.4740606561377299e-06, + "loss": 0.0871, + "num_tokens": 100174301.0, + "reward": 0.5299333743751049, + "reward_std": 0.30722721572965384, + "rewards/code_reward": 0.3826119229197502, + "rewards/format_reward": 1.4732142984867096, + "step": 4285 + }, + { + "clip_ratio": 0.0031844701734371483, + "epoch": 0.15991791427638635, + "grad_norm": 0.06933679431676865, + "kl": 0.0158843994140625, + "learning_rate": 1.4727592453043493e-06, + "loss": 0.0871, + "step": 4286 + }, + { + "clip_ratio": 0.0033201498445123434, + "epoch": 0.15995522596893802, + "grad_norm": 0.06559601426124573, + "kl": 0.0154571533203125, + "learning_rate": 1.4714595242263818e-06, + "loss": 0.0871, + "step": 4287 + }, + { + "clip_ratio": 0.003207281173672527, + "completion_length": 555.1428756713867, + "epoch": 0.15999253766148966, + "grad_norm": 0.05964940786361694, + "kl": 0.0149078369140625, + "learning_rate": 1.4701614934491652e-06, + "loss": 0.0009, + "num_tokens": 100232759.0, + "reward": 0.8630952537059784, + "reward_std": 0.16387485340237617, + "rewards/code_reward": 0.7130952142179012, + "rewards/format_reward": 1.5, + "step": 4288 + }, + { + "clip_ratio": 0.0029709290829487145, + "epoch": 0.16002984935404133, + "grad_norm": 0.057603657245635986, + "kl": 0.014556884765625, + "learning_rate": 1.4688651535173293e-06, + "loss": 0.0009, + "step": 4289 + }, + { + "clip_ratio": 0.0031714283395558596, + "epoch": 0.16006716104659297, + "grad_norm": 0.05503363907337189, + "kl": 0.0152740478515625, + "learning_rate": 1.4675705049747948e-06, + "loss": 0.0008, + "step": 4290 + }, + { + "clip_ratio": 0.003689744509756565, + "completion_length": 558.303596496582, + "epoch": 0.16010447273914463, + "grad_norm": 0.07060585170984268, + "kl": 0.013885498046875, + "learning_rate": 1.4662775483647708e-06, + "loss": -0.0174, + "num_tokens": 100289728.0, + "reward": 0.7369872890412807, + "reward_std": 0.0592624451383017, + "rewards/code_reward": 0.5869872570037842, + "rewards/format_reward": 1.5, + "step": 4291 + }, + { + "clip_ratio": 0.003605636360589415, + "epoch": 0.16014178443169627, + "grad_norm": 0.06306285411119461, + "kl": 0.01373291015625, + "learning_rate": 1.4649862842297593e-06, + "loss": -0.0176, + "step": 4292 + }, + { + "clip_ratio": 0.0029780271579511464, + "epoch": 0.16017909612424794, + "grad_norm": 0.07647627592086792, + "kl": 0.01385498046875, + "learning_rate": 1.4636967131115514e-06, + "loss": -0.0176, + "step": 4293 + }, + { + "clip_ratio": 0.003155490616336465, + "completion_length": 799.0893402099609, + "epoch": 0.16021640781679958, + "grad_norm": 0.07648050785064697, + "kl": 0.01812744140625, + "learning_rate": 1.4624088355512244e-06, + "loss": 0.0039, + "num_tokens": 100371493.0, + "reward": 0.5969742015004158, + "reward_std": 0.1797792725265026, + "rewards/code_reward": 0.44697421323508024, + "rewards/format_reward": 1.5, + "step": 4294 + }, + { + "clip_ratio": 0.0034763081930577755, + "epoch": 0.16025371950935124, + "grad_norm": 0.07748584449291229, + "kl": 0.0183258056640625, + "learning_rate": 1.4611226520891507e-06, + "loss": 0.0037, + "step": 4295 + }, + { + "clip_ratio": 0.0033757358323782682, + "epoch": 0.16029103120190288, + "grad_norm": 0.07757866382598877, + "kl": 0.017608642578125, + "learning_rate": 1.459838163264987e-06, + "loss": 0.0037, + "step": 4296 + }, + { + "clip_ratio": 0.003834189788904041, + "completion_length": 743.0893096923828, + "epoch": 0.16032834289445455, + "grad_norm": 0.07197756320238113, + "kl": 0.021484375, + "learning_rate": 1.4585553696176813e-06, + "loss": 0.0059, + "num_tokens": 100444284.0, + "reward": 0.2917881943285465, + "reward_std": 0.10816663037985563, + "rewards/code_reward": 0.14375247107818723, + "rewards/format_reward": 1.480357140302658, + "step": 4297 + }, + { + "clip_ratio": 0.004348687070887536, + "epoch": 0.1603656545870062, + "grad_norm": 0.07228725403547287, + "kl": 0.0205078125, + "learning_rate": 1.4572742716854707e-06, + "loss": 0.006, + "step": 4298 + }, + { + "clip_ratio": 0.004329268238507211, + "epoch": 0.16040296627955786, + "grad_norm": 0.07215365022420883, + "kl": 0.02191162109375, + "learning_rate": 1.4559948700058796e-06, + "loss": 0.006, + "step": 4299 + }, + { + "clip_ratio": 0.0032027270062826574, + "completion_length": 705.482177734375, + "epoch": 0.1604402779721095, + "grad_norm": 0.08580952882766724, + "kl": 0.0172119140625, + "learning_rate": 1.4547171651157216e-06, + "loss": -0.014, + "num_tokens": 100514367.0, + "reward": 0.7204565405845642, + "reward_std": 0.15302017703652382, + "rewards/code_reward": 0.5704564452171326, + "rewards/format_reward": 1.5, + "step": 4300 + }, + { + "clip_ratio": 0.0031232660985551775, + "epoch": 0.16047758966466116, + "grad_norm": 0.07968419045209885, + "kl": 0.0169219970703125, + "learning_rate": 1.453441157551097e-06, + "loss": -0.0142, + "step": 4301 + }, + { + "clip_ratio": 0.0031818344141356647, + "epoch": 0.16051490135721283, + "grad_norm": 0.08059517294168472, + "kl": 0.0167999267578125, + "learning_rate": 1.4521668478473963e-06, + "loss": -0.0141, + "step": 4302 + }, + { + "clip_ratio": 0.003111906931735575, + "completion_length": 636.1250152587891, + "epoch": 0.16055221304976447, + "grad_norm": 0.07124502956867218, + "kl": 0.0182647705078125, + "learning_rate": 1.450894236539296e-06, + "loss": 0.001, + "num_tokens": 100586586.0, + "reward": 0.83907201141119, + "reward_std": 0.25416504591703415, + "rewards/code_reward": 0.6890719793736935, + "rewards/format_reward": 1.5, + "step": 4303 + }, + { + "clip_ratio": 0.0029244880424812436, + "epoch": 0.16058952474231614, + "grad_norm": 0.07200515270233154, + "kl": 0.0181732177734375, + "learning_rate": 1.4496233241607581e-06, + "loss": 0.0009, + "step": 4304 + }, + { + "clip_ratio": 0.0030211203265935183, + "epoch": 0.16062683643486778, + "grad_norm": 0.07402489334344864, + "kl": 0.01763916015625, + "learning_rate": 1.4483541112450355e-06, + "loss": 0.001, + "step": 4305 + }, + { + "clip_ratio": 0.0027668203692883253, + "completion_length": 612.7857437133789, + "epoch": 0.16066414812741944, + "grad_norm": 0.06532993167638779, + "kl": 0.016265869140625, + "learning_rate": 1.4470865983246664e-06, + "loss": -0.0092, + "num_tokens": 100659220.0, + "reward": 0.8921400606632233, + "reward_std": 0.057761965319514275, + "rewards/code_reward": 0.7421400733292103, + "rewards/format_reward": 1.5, + "step": 4306 + }, + { + "clip_ratio": 0.0018745440756902099, + "epoch": 0.16070145981997108, + "grad_norm": 0.06222628429532051, + "kl": 0.01580810546875, + "learning_rate": 1.4458207859314755e-06, + "loss": -0.0092, + "step": 4307 + }, + { + "clip_ratio": 0.0021955915726721287, + "epoch": 0.16073877151252275, + "grad_norm": 0.06520511955022812, + "kl": 0.016357421875, + "learning_rate": 1.4445566745965747e-06, + "loss": -0.0093, + "step": 4308 + }, + { + "clip_ratio": 0.004717974225059152, + "completion_length": 695.3035888671875, + "epoch": 0.1607760832050744, + "grad_norm": 0.04132429137825966, + "kl": 0.015777587890625, + "learning_rate": 1.4432942648503607e-06, + "loss": 0.0273, + "num_tokens": 100726185.0, + "reward": 0.40134313330054283, + "reward_std": 0.1637575700879097, + "rewards/code_reward": 0.25134310871362686, + "rewards/format_reward": 1.5, + "step": 4309 + }, + { + "clip_ratio": 0.004293839156161994, + "epoch": 0.16081339489762606, + "grad_norm": 0.05101567134261131, + "kl": 0.0162200927734375, + "learning_rate": 1.442033557222518e-06, + "loss": 0.0272, + "step": 4310 + }, + { + "clip_ratio": 0.004613392520695925, + "epoch": 0.1608507065901777, + "grad_norm": 0.04352287948131561, + "kl": 0.015655517578125, + "learning_rate": 1.4407745522420143e-06, + "loss": 0.0272, + "step": 4311 + }, + { + "clip_ratio": 0.004047049675136805, + "completion_length": 710.3928833007812, + "epoch": 0.16088801828272936, + "grad_norm": 0.07356540858745575, + "kl": 0.019073486328125, + "learning_rate": 1.4395172504371065e-06, + "loss": 0.008, + "num_tokens": 100796949.0, + "reward": 0.2824096865952015, + "reward_std": 0.12931846920400858, + "rewards/code_reward": 0.13240967132151127, + "rewards/format_reward": 1.5, + "step": 4312 + }, + { + "clip_ratio": 0.003568220476154238, + "epoch": 0.160925329975281, + "grad_norm": 0.36803925037384033, + "kl": 0.019775390625, + "learning_rate": 1.4382616523353358e-06, + "loss": 0.0081, + "step": 4313 + }, + { + "clip_ratio": 0.004224102245643735, + "epoch": 0.16096264166783267, + "grad_norm": 0.06851167231798172, + "kl": 0.0194244384765625, + "learning_rate": 1.4370077584635256e-06, + "loss": 0.008, + "step": 4314 + }, + { + "clip_ratio": 0.004813161096535623, + "completion_length": 727.8393096923828, + "epoch": 0.1609999533603843, + "grad_norm": 0.08463913202285767, + "kl": 0.02374267578125, + "learning_rate": 1.435755569347787e-06, + "loss": -0.0041, + "num_tokens": 100878770.0, + "reward": 0.6607118621468544, + "reward_std": 0.15741590992547572, + "rewards/code_reward": 0.5107118343003094, + "rewards/format_reward": 1.5, + "step": 4315 + }, + { + "clip_ratio": 0.003961391979828477, + "epoch": 0.16103726505293597, + "grad_norm": 0.09617298096418381, + "kl": 0.024078369140625, + "learning_rate": 1.4345050855135162e-06, + "loss": -0.0041, + "step": 4316 + }, + { + "clip_ratio": 0.004310358373913914, + "epoch": 0.1610745767454876, + "grad_norm": 0.09256730228662491, + "kl": 0.02386474609375, + "learning_rate": 1.4332563074853928e-06, + "loss": -0.0045, + "step": 4317 + }, + { + "clip_ratio": 0.003978108172304928, + "completion_length": 688.9464569091797, + "epoch": 0.16111188843803928, + "grad_norm": 0.07741431146860123, + "kl": 0.0174407958984375, + "learning_rate": 1.4320092357873817e-06, + "loss": 0.0112, + "num_tokens": 100951805.0, + "reward": 0.4012732096016407, + "reward_std": 0.11610875837504864, + "rewards/code_reward": 0.2512731935130432, + "rewards/format_reward": 1.5, + "step": 4318 + }, + { + "clip_ratio": 0.004261363879777491, + "epoch": 0.16114920013059092, + "grad_norm": 0.07211699336767197, + "kl": 0.017120361328125, + "learning_rate": 1.4307638709427281e-06, + "loss": 0.0113, + "step": 4319 + }, + { + "clip_ratio": 0.003843295096885413, + "epoch": 0.1611865118231426, + "grad_norm": 0.07312023639678955, + "kl": 0.017578125, + "learning_rate": 1.429520213473967e-06, + "loss": 0.0113, + "step": 4320 + }, + { + "clip_ratio": 0.004885072878096253, + "completion_length": 690.9821472167969, + "epoch": 0.16122382351569423, + "grad_norm": 0.09589970856904984, + "kl": 0.0184326171875, + "learning_rate": 1.428278263902913e-06, + "loss": -0.0146, + "num_tokens": 101019858.0, + "reward": 0.566999051719904, + "reward_std": 0.180905070155859, + "rewards/code_reward": 0.41699903924018145, + "rewards/format_reward": 1.5, + "step": 4321 + }, + { + "clip_ratio": 0.004500042414292693, + "epoch": 0.1612611352082459, + "grad_norm": 0.0921039804816246, + "kl": 0.01849365234375, + "learning_rate": 1.4270380227506642e-06, + "loss": -0.0147, + "step": 4322 + }, + { + "clip_ratio": 0.004054702876601368, + "epoch": 0.16129844690079753, + "grad_norm": 0.08619450032711029, + "kl": 0.018402099609375, + "learning_rate": 1.4257994905376032e-06, + "loss": -0.0148, + "step": 4323 + }, + { + "clip_ratio": 0.0034546010429039598, + "completion_length": 803.2857513427734, + "epoch": 0.1613357585933492, + "grad_norm": 0.060955267399549484, + "kl": 0.016571044921875, + "learning_rate": 1.4245626677833954e-06, + "loss": 0.0144, + "num_tokens": 101098918.0, + "reward": 0.6212882809340954, + "reward_std": 0.14361895725596696, + "rewards/code_reward": 0.47128825113759376, + "rewards/format_reward": 1.5, + "step": 4324 + }, + { + "clip_ratio": 0.0037060442846268415, + "epoch": 0.16137307028590084, + "grad_norm": 0.06637341529130936, + "kl": 0.0164642333984375, + "learning_rate": 1.4233275550069892e-06, + "loss": 0.0146, + "step": 4325 + }, + { + "clip_ratio": 0.0032076191273517907, + "epoch": 0.1614103819784525, + "grad_norm": 0.061333734542131424, + "kl": 0.01611328125, + "learning_rate": 1.4220941527266146e-06, + "loss": 0.0145, + "step": 4326 + }, + { + "clip_ratio": 0.003123158821836114, + "completion_length": 812.8928985595703, + "epoch": 0.16144769367100414, + "grad_norm": 0.058805350214242935, + "kl": 0.0153350830078125, + "learning_rate": 1.4208624614597836e-06, + "loss": -0.0124, + "num_tokens": 101167652.0, + "reward": 0.5536007396876812, + "reward_std": 0.14276272058486938, + "rewards/code_reward": 0.4036007225513458, + "rewards/format_reward": 1.5, + "step": 4327 + }, + { + "clip_ratio": 0.0032594723161309958, + "epoch": 0.1614850053635558, + "grad_norm": 0.059636782854795456, + "kl": 0.01531982421875, + "learning_rate": 1.4196324817232917e-06, + "loss": -0.0124, + "step": 4328 + }, + { + "clip_ratio": 0.0032422527438029647, + "epoch": 0.16152231705610745, + "grad_norm": 0.05853171646595001, + "kl": 0.0155029296875, + "learning_rate": 1.4184042140332152e-06, + "loss": -0.0124, + "step": 4329 + }, + { + "clip_ratio": 0.005455620936118066, + "completion_length": 639.7500305175781, + "epoch": 0.16155962874865912, + "grad_norm": 0.06265689432621002, + "kl": 0.03204345703125, + "learning_rate": 1.4171776589049122e-06, + "loss": 0.0067, + "num_tokens": 101245352.0, + "reward": 0.4279017969965935, + "reward_std": 0.1348801627755165, + "rewards/code_reward": 0.2779017798602581, + "rewards/format_reward": 1.5, + "step": 4330 + }, + { + "clip_ratio": 0.004997552954591811, + "epoch": 0.16159694044121076, + "grad_norm": 0.06567402184009552, + "kl": 0.0330810546875, + "learning_rate": 1.4159528168530238e-06, + "loss": 0.0067, + "step": 4331 + }, + { + "clip_ratio": 0.005513814045116305, + "epoch": 0.16163425213376242, + "grad_norm": 0.06951781362295151, + "kl": 0.03173828125, + "learning_rate": 1.4147296883914684e-06, + "loss": 0.0067, + "step": 4332 + }, + { + "clip_ratio": 0.0018894788809120655, + "completion_length": 558.6607360839844, + "epoch": 0.16167156382631406, + "grad_norm": 0.052148837596178055, + "kl": 0.0138092041015625, + "learning_rate": 1.4135082740334502e-06, + "loss": -0.0076, + "num_tokens": 101303629.0, + "reward": 0.672749500721693, + "reward_std": 0.07925148960202932, + "rewards/code_reward": 0.5227495115250349, + "rewards/format_reward": 1.5, + "step": 4333 + }, + { + "clip_ratio": 0.001934286323376, + "epoch": 0.16170887551886573, + "grad_norm": 0.05012771487236023, + "kl": 0.0137939453125, + "learning_rate": 1.4122885742914506e-06, + "loss": -0.0078, + "step": 4334 + }, + { + "clip_ratio": 0.001995621540118009, + "epoch": 0.16174618721141737, + "grad_norm": 0.2031993716955185, + "kl": 0.0135040283203125, + "learning_rate": 1.4110705896772345e-06, + "loss": -0.0077, + "step": 4335 + }, + { + "clip_ratio": 0.0031266192090697587, + "completion_length": 601.4821624755859, + "epoch": 0.16178349890396904, + "grad_norm": 0.05888118967413902, + "kl": 0.01702880859375, + "learning_rate": 1.4098543207018447e-06, + "loss": 0.0125, + "num_tokens": 101363072.0, + "reward": 1.0498683899641037, + "reward_std": 0.09224594384431839, + "rewards/code_reward": 0.899868369102478, + "rewards/format_reward": 1.5, + "step": 4336 + }, + { + "clip_ratio": 0.002976719755679369, + "epoch": 0.16182081059652068, + "grad_norm": 0.05741974338889122, + "kl": 0.017181396484375, + "learning_rate": 1.4086397678756047e-06, + "loss": 0.0125, + "step": 4337 + }, + { + "clip_ratio": 0.0031093726865947247, + "epoch": 0.16185812228907234, + "grad_norm": 0.05862647294998169, + "kl": 0.01702880859375, + "learning_rate": 1.4074269317081198e-06, + "loss": 0.0124, + "step": 4338 + }, + { + "clip_ratio": 0.003920277929864824, + "completion_length": 615.5178833007812, + "epoch": 0.16189543398162398, + "grad_norm": 0.06978099048137665, + "kl": 0.021484375, + "learning_rate": 1.4062158127082711e-06, + "loss": 0.0012, + "num_tokens": 101430903.0, + "reward": 0.4208451360464096, + "reward_std": 0.22585285501554608, + "rewards/code_reward": 0.27263082563877106, + "rewards/format_reward": 1.4821428656578064, + "step": 4339 + }, + { + "clip_ratio": 0.0041743002948351204, + "epoch": 0.16193274567417565, + "grad_norm": 0.07090900838375092, + "kl": 0.0218963623046875, + "learning_rate": 1.4050064113842234e-06, + "loss": 0.0013, + "step": 4340 + }, + { + "clip_ratio": 0.0036822398542426527, + "epoch": 0.1619700573667273, + "grad_norm": 0.06979607790708542, + "kl": 0.0211639404296875, + "learning_rate": 1.4037987282434187e-06, + "loss": 0.0011, + "step": 4341 + }, + { + "clip_ratio": 0.001545328792417422, + "completion_length": 567.2857513427734, + "epoch": 0.16200736905927896, + "grad_norm": 0.05728784203529358, + "kl": 0.0182342529296875, + "learning_rate": 1.4025927637925779e-06, + "loss": -0.0083, + "num_tokens": 101497741.0, + "reward": 1.0071428418159485, + "reward_std": 0.23440362513065338, + "rewards/code_reward": 0.8571428656578064, + "rewards/format_reward": 1.5, + "step": 4342 + }, + { + "clip_ratio": 0.0017959363758563995, + "epoch": 0.1620446807518306, + "grad_norm": 0.059026677161455154, + "kl": 0.01800537109375, + "learning_rate": 1.4013885185377023e-06, + "loss": -0.0084, + "step": 4343 + }, + { + "clip_ratio": 0.001976812200155109, + "epoch": 0.16208199244438226, + "grad_norm": 0.058140527456998825, + "kl": 0.0178985595703125, + "learning_rate": 1.4001859929840706e-06, + "loss": -0.0085, + "step": 4344 + }, + { + "clip_ratio": 0.0029285444761626422, + "completion_length": 721.7143402099609, + "epoch": 0.1621193041369339, + "grad_norm": 0.03082297183573246, + "kl": 0.018096923828125, + "learning_rate": 1.3989851876362387e-06, + "loss": 0.0022, + "num_tokens": 101571099.0, + "reward": 0.27517594024538994, + "reward_std": 0.00044721298036165535, + "rewards/code_reward": 0.12517593243683223, + "rewards/format_reward": 1.5, + "step": 4345 + }, + { + "clip_ratio": 0.0035551850451156497, + "epoch": 0.16215661582948557, + "grad_norm": 0.035980723798274994, + "kl": 0.017791748046875, + "learning_rate": 1.3977861029980447e-06, + "loss": 0.0022, + "step": 4346 + }, + { + "clip_ratio": 0.0032490845769643784, + "epoch": 0.1621939275220372, + "grad_norm": 0.031190617009997368, + "kl": 0.0180511474609375, + "learning_rate": 1.3965887395726e-06, + "loss": 0.0022, + "step": 4347 + }, + { + "clip_ratio": 0.0019965480314567685, + "completion_length": 738.4286041259766, + "epoch": 0.16223123921458887, + "grad_norm": 0.03801708668470383, + "kl": 0.014312744140625, + "learning_rate": 1.3953930978622976e-06, + "loss": 0.0016, + "num_tokens": 101645505.0, + "reward": 0.413305327296257, + "reward_std": 0.03144250437617302, + "rewards/code_reward": 0.2633053227327764, + "rewards/format_reward": 1.5, + "step": 4348 + }, + { + "clip_ratio": 0.0019968441338278353, + "epoch": 0.1622685509071405, + "grad_norm": 0.03485913574695587, + "kl": 0.0140838623046875, + "learning_rate": 1.3941991783688053e-06, + "loss": 0.0015, + "step": 4349 + }, + { + "clip_ratio": 0.002018342027440667, + "epoch": 0.16230586259969218, + "grad_norm": 0.0382806770503521, + "kl": 0.0144195556640625, + "learning_rate": 1.3930069815930699e-06, + "loss": 0.0016, + "step": 4350 + }, + { + "clip_ratio": 0.004476660338696092, + "completion_length": 742.2678985595703, + "epoch": 0.16234317429224382, + "grad_norm": 0.10318528860807419, + "kl": 0.026397705078125, + "learning_rate": 1.3918165080353153e-06, + "loss": -0.0057, + "num_tokens": 101724368.0, + "reward": 0.5817417427897453, + "reward_std": 0.2934573017992079, + "rewards/code_reward": 0.43174171878490597, + "rewards/format_reward": 1.5, + "step": 4351 + }, + { + "clip_ratio": 0.004042900167405605, + "epoch": 0.1623804859847955, + "grad_norm": 0.08356702327728271, + "kl": 0.024688720703125, + "learning_rate": 1.3906277581950422e-06, + "loss": -0.0058, + "step": 4352 + }, + { + "clip_ratio": 0.0037809786736033857, + "epoch": 0.16241779767734713, + "grad_norm": 0.08101043105125427, + "kl": 0.0246734619140625, + "learning_rate": 1.389440732571028e-06, + "loss": -0.006, + "step": 4353 + }, + { + "clip_ratio": 0.003772821743041277, + "completion_length": 638.2500228881836, + "epoch": 0.1624551093698988, + "grad_norm": 0.07165667414665222, + "kl": 0.016021728515625, + "learning_rate": 1.388255431661327e-06, + "loss": -0.0174, + "num_tokens": 101796384.0, + "reward": 0.45926716178655624, + "reward_std": 0.3465396426618099, + "rewards/code_reward": 0.3119457308202982, + "rewards/format_reward": 1.4732142984867096, + "step": 4354 + }, + { + "clip_ratio": 0.0040661648963578045, + "epoch": 0.16249242106245043, + "grad_norm": 0.07232223451137543, + "kl": 0.0157012939453125, + "learning_rate": 1.3870718559632676e-06, + "loss": -0.0176, + "step": 4355 + }, + { + "clip_ratio": 0.0038126599974930286, + "epoch": 0.1625297327550021, + "grad_norm": 0.07406057417392731, + "kl": 0.0158538818359375, + "learning_rate": 1.385890005973459e-06, + "loss": -0.0175, + "step": 4356 + }, + { + "clip_ratio": 0.0022794133983552456, + "completion_length": 576.0893173217773, + "epoch": 0.16256704444755377, + "grad_norm": 0.05774098262190819, + "kl": 0.0185546875, + "learning_rate": 1.3847098821877805e-06, + "loss": -0.003, + "num_tokens": 101853341.0, + "reward": 0.9251098930835724, + "reward_std": 0.19046473503112793, + "rewards/code_reward": 0.7751099094748497, + "rewards/format_reward": 1.5, + "step": 4357 + }, + { + "clip_ratio": 0.0024156079161912203, + "epoch": 0.1626043561401054, + "grad_norm": 0.05749962478876114, + "kl": 0.019012451171875, + "learning_rate": 1.3835314851013917e-06, + "loss": -0.003, + "step": 4358 + }, + { + "clip_ratio": 0.002468820894137025, + "epoch": 0.16264166783265707, + "grad_norm": 0.05749843642115593, + "kl": 0.018768310546875, + "learning_rate": 1.3823548152087263e-06, + "loss": -0.0032, + "step": 4359 + }, + { + "clip_ratio": 0.00308584189042449, + "completion_length": 738.0536117553711, + "epoch": 0.1626789795252087, + "grad_norm": 0.0737309604883194, + "kl": 0.018951416015625, + "learning_rate": 1.3811798730034925e-06, + "loss": 0.0097, + "num_tokens": 101935230.0, + "reward": 0.8202326707541943, + "reward_std": 0.11244811397045851, + "rewards/code_reward": 0.6729111820459366, + "rewards/format_reward": 1.4732142984867096, + "step": 4360 + }, + { + "clip_ratio": 0.003449109790381044, + "epoch": 0.16271629121776038, + "grad_norm": 0.07031244784593582, + "kl": 0.0202178955078125, + "learning_rate": 1.3800066589786751e-06, + "loss": 0.0096, + "step": 4361 + }, + { + "clip_ratio": 0.0034084991202689707, + "epoch": 0.16275360291031202, + "grad_norm": 0.07524265348911285, + "kl": 0.0195770263671875, + "learning_rate": 1.378835173626532e-06, + "loss": 0.0096, + "step": 4362 + }, + { + "clip_ratio": 0.0019284298177808523, + "completion_length": 620.1964645385742, + "epoch": 0.16279091460286368, + "grad_norm": 0.047804806381464005, + "kl": 0.0163116455078125, + "learning_rate": 1.377665417438597e-06, + "loss": 0.0045, + "num_tokens": 102002805.0, + "reward": 0.6642803251743317, + "reward_std": 0.1740465983748436, + "rewards/code_reward": 0.5142803341150284, + "rewards/format_reward": 1.5, + "step": 4363 + }, + { + "clip_ratio": 0.0021979036973789334, + "epoch": 0.16282822629541532, + "grad_norm": 0.048906683921813965, + "kl": 0.01641845703125, + "learning_rate": 1.3764973909056787e-06, + "loss": 0.0047, + "step": 4364 + }, + { + "clip_ratio": 0.001816023315768689, + "epoch": 0.162865537987967, + "grad_norm": 0.04732561483979225, + "kl": 0.0164337158203125, + "learning_rate": 1.3753310945178576e-06, + "loss": 0.0045, + "step": 4365 + }, + { + "clip_ratio": 0.0033465372980572283, + "completion_length": 696.053596496582, + "epoch": 0.16290284968051863, + "grad_norm": 0.059140875935554504, + "kl": 0.022979736328125, + "learning_rate": 1.3741665287644928e-06, + "loss": 0.0032, + "num_tokens": 102069812.0, + "reward": 0.484404269605875, + "reward_std": 0.09306867606937885, + "rewards/code_reward": 0.33440425992012024, + "rewards/format_reward": 1.5, + "step": 4366 + }, + { + "clip_ratio": 0.004056474193930626, + "epoch": 0.1629401613730703, + "grad_norm": 0.05972182750701904, + "kl": 0.02288818359375, + "learning_rate": 1.3730036941342118e-06, + "loss": 0.0032, + "step": 4367 + }, + { + "clip_ratio": 0.00374753080541268, + "epoch": 0.16297747306562194, + "grad_norm": 0.05511901155114174, + "kl": 0.022735595703125, + "learning_rate": 1.3718425911149184e-06, + "loss": 0.0032, + "step": 4368 + }, + { + "clip_ratio": 0.003790546383243054, + "completion_length": 623.6428909301758, + "epoch": 0.1630147847581736, + "grad_norm": 0.11309012025594711, + "kl": 0.0173797607421875, + "learning_rate": 1.3706832201937922e-06, + "loss": -0.0001, + "num_tokens": 102141290.0, + "reward": 0.5956026613712311, + "reward_std": 0.2011763583868742, + "rewards/code_reward": 0.44560263119637966, + "rewards/format_reward": 1.5, + "step": 4369 + }, + { + "clip_ratio": 0.003675708139780909, + "epoch": 0.16305209645072524, + "grad_norm": 0.11199542135000229, + "kl": 0.0173187255859375, + "learning_rate": 1.3695255818572817e-06, + "loss": -0.0002, + "step": 4370 + }, + { + "clip_ratio": 0.003755724581424147, + "epoch": 0.1630894081432769, + "grad_norm": 0.10382279753684998, + "kl": 0.01763916015625, + "learning_rate": 1.3683696765911117e-06, + "loss": -0.0003, + "step": 4371 + }, + { + "clip_ratio": 0.0042149966466240585, + "completion_length": 742.0178985595703, + "epoch": 0.16312671983582855, + "grad_norm": 0.08534660190343857, + "kl": 0.018341064453125, + "learning_rate": 1.3672155048802779e-06, + "loss": 0.0079, + "num_tokens": 102213741.0, + "reward": 0.46601777523756027, + "reward_std": 0.2276138998568058, + "rewards/code_reward": 0.3160177255049348, + "rewards/format_reward": 1.5, + "step": 4372 + }, + { + "clip_ratio": 0.004099890124052763, + "epoch": 0.16316403152838022, + "grad_norm": 0.08351980149745941, + "kl": 0.018524169921875, + "learning_rate": 1.3660630672090491e-06, + "loss": 0.0079, + "step": 4373 + }, + { + "clip_ratio": 0.003932144434656948, + "epoch": 0.16320134322093185, + "grad_norm": 0.07969745993614197, + "kl": 0.01861572265625, + "learning_rate": 1.3649123640609671e-06, + "loss": 0.0074, + "step": 4374 + }, + { + "clip_ratio": 0.004216431931126863, + "completion_length": 583.0357360839844, + "epoch": 0.16323865491348352, + "grad_norm": 0.048529524356126785, + "kl": 0.0159149169921875, + "learning_rate": 1.3637633959188457e-06, + "loss": 0.0013, + "num_tokens": 102275133.0, + "reward": 0.4101541079580784, + "reward_std": 0.014017435722053051, + "rewards/code_reward": 0.26015407033264637, + "rewards/format_reward": 1.5, + "step": 4375 + }, + { + "clip_ratio": 0.004598202300257981, + "epoch": 0.16327596660603516, + "grad_norm": 0.04777834564447403, + "kl": 0.0157928466796875, + "learning_rate": 1.3626161632647702e-06, + "loss": 0.0011, + "step": 4376 + }, + { + "clip_ratio": 0.0040639807702973485, + "epoch": 0.16331327829858683, + "grad_norm": 0.07714294642210007, + "kl": 0.015899658203125, + "learning_rate": 1.3614706665800978e-06, + "loss": 0.001, + "step": 4377 + }, + { + "clip_ratio": 0.0021523014875128865, + "completion_length": 703.678596496582, + "epoch": 0.16335058999113847, + "grad_norm": 0.05570599436759949, + "kl": 0.01922607421875, + "learning_rate": 1.3603269063454585e-06, + "loss": 0.0065, + "num_tokens": 102344905.0, + "reward": 0.812660276889801, + "reward_std": 0.14646360278129578, + "rewards/code_reward": 0.6626602523028851, + "rewards/format_reward": 1.5, + "step": 4378 + }, + { + "clip_ratio": 0.0021540315356105566, + "epoch": 0.16338790168369013, + "grad_norm": 0.05453840270638466, + "kl": 0.0193328857421875, + "learning_rate": 1.3591848830407535e-06, + "loss": 0.0064, + "step": 4379 + }, + { + "clip_ratio": 0.002064177766442299, + "epoch": 0.16342521337624177, + "grad_norm": 0.05485059320926666, + "kl": 0.0197296142578125, + "learning_rate": 1.3580445971451523e-06, + "loss": 0.0066, + "step": 4380 + }, + { + "clip_ratio": 0.0023447658168151975, + "completion_length": 690.7500305175781, + "epoch": 0.16346252506879344, + "grad_norm": 0.05807450786232948, + "kl": 0.0213623046875, + "learning_rate": 1.356906049137099e-06, + "loss": -0.005, + "num_tokens": 102416641.0, + "reward": 0.7188530042767525, + "reward_std": 0.23495113104581833, + "rewards/code_reward": 0.5688530281186104, + "rewards/format_reward": 1.5, + "step": 4381 + }, + { + "clip_ratio": 0.001990951714105904, + "epoch": 0.16349983676134508, + "grad_norm": 0.05734914913773537, + "kl": 0.021636962890625, + "learning_rate": 1.3557692394943078e-06, + "loss": -0.0052, + "step": 4382 + }, + { + "clip_ratio": 0.0021244847448542714, + "epoch": 0.16353714845389675, + "grad_norm": 0.058196283876895905, + "kl": 0.02197265625, + "learning_rate": 1.3546341686937619e-06, + "loss": -0.0052, + "step": 4383 + }, + { + "clip_ratio": 0.002955729723908007, + "completion_length": 724.3750228881836, + "epoch": 0.16357446014644839, + "grad_norm": 0.06710544973611832, + "kl": 0.0150604248046875, + "learning_rate": 1.353500837211717e-06, + "loss": -0.0007, + "num_tokens": 102488720.0, + "reward": 0.7674858830869198, + "reward_std": 0.16976212814915925, + "rewards/code_reward": 0.6174858532904182, + "rewards/format_reward": 1.5, + "step": 4384 + }, + { + "clip_ratio": 0.0031479046447202563, + "epoch": 0.16361177183900005, + "grad_norm": 0.08427748084068298, + "kl": 0.0151214599609375, + "learning_rate": 1.3523692455236962e-06, + "loss": -0.0005, + "step": 4385 + }, + { + "clip_ratio": 0.003242897568270564, + "epoch": 0.1636490835315517, + "grad_norm": 0.05891707167029381, + "kl": 0.0150146484375, + "learning_rate": 1.3512393941044943e-06, + "loss": -0.0006, + "step": 4386 + }, + { + "clip_ratio": 0.003557418647687882, + "completion_length": 1015.0000610351562, + "epoch": 0.16368639522410336, + "grad_norm": 0.0712597444653511, + "kl": 0.0142822265625, + "learning_rate": 1.3501112834281787e-06, + "loss": -0.0035, + "num_tokens": 102583276.0, + "reward": 0.590773843228817, + "reward_std": 0.3208548501133919, + "rewards/code_reward": 0.443452388048172, + "rewards/format_reward": 1.4732142984867096, + "step": 4387 + }, + { + "clip_ratio": 0.003291789093054831, + "epoch": 0.163723706916655, + "grad_norm": 0.06999381631612778, + "kl": 0.0143585205078125, + "learning_rate": 1.3489849139680817e-06, + "loss": -0.0038, + "step": 4388 + }, + { + "clip_ratio": 0.0034939894103445113, + "epoch": 0.16376101860920667, + "grad_norm": 0.07020147144794464, + "kl": 0.01422119140625, + "learning_rate": 1.3478602861968077e-06, + "loss": -0.0036, + "step": 4389 + }, + { + "clip_ratio": 0.003441845066845417, + "completion_length": 680.9286117553711, + "epoch": 0.1637983303017583, + "grad_norm": 0.05908428877592087, + "kl": 0.0169677734375, + "learning_rate": 1.3467374005862282e-06, + "loss": -0.0102, + "num_tokens": 102646832.0, + "reward": 0.489325400441885, + "reward_std": 0.08833647146821022, + "rewards/code_reward": 0.3393254023976624, + "rewards/format_reward": 1.5, + "step": 4390 + }, + { + "clip_ratio": 0.003477579972241074, + "epoch": 0.16383564199430997, + "grad_norm": 0.061904292553663254, + "kl": 0.01690673828125, + "learning_rate": 1.3456162576074864e-06, + "loss": -0.0103, + "step": 4391 + }, + { + "clip_ratio": 0.003734475700184703, + "epoch": 0.1638729536868616, + "grad_norm": 0.05785106122493744, + "kl": 0.016876220703125, + "learning_rate": 1.3444968577309938e-06, + "loss": -0.0104, + "step": 4392 + }, + { + "clip_ratio": 0.0038142871344462037, + "completion_length": 986.8750610351562, + "epoch": 0.16391026537941328, + "grad_norm": 0.06752622127532959, + "kl": 0.0243682861328125, + "learning_rate": 1.3433792014264277e-06, + "loss": -0.0007, + "num_tokens": 102742627.0, + "reward": 0.3436383940279484, + "reward_std": 0.19832166656851768, + "rewards/code_reward": 0.1936383955180645, + "rewards/format_reward": 1.5, + "step": 4393 + }, + { + "clip_ratio": 0.0041483252425678074, + "epoch": 0.16394757707196492, + "grad_norm": 0.0664059966802597, + "kl": 0.0237579345703125, + "learning_rate": 1.3422632891627374e-06, + "loss": -0.0007, + "step": 4394 + }, + { + "clip_ratio": 0.00343633332522586, + "epoch": 0.16398488876451658, + "grad_norm": 0.0656871348619461, + "kl": 0.022796630859375, + "learning_rate": 1.3411491214081388e-06, + "loss": -0.0009, + "step": 4395 + }, + { + "clip_ratio": 0.0037715735379606485, + "completion_length": 634.4643249511719, + "epoch": 0.16402220045706822, + "grad_norm": 0.09039798378944397, + "kl": 0.028076171875, + "learning_rate": 1.3400366986301152e-06, + "loss": 0.018, + "num_tokens": 102803931.0, + "reward": 0.48432767763733864, + "reward_std": 0.1965427976101637, + "rewards/code_reward": 0.33432766888290644, + "rewards/format_reward": 1.5, + "step": 4396 + }, + { + "clip_ratio": 0.004005810362286866, + "epoch": 0.1640595121496199, + "grad_norm": 0.07992666214704514, + "kl": 0.02801513671875, + "learning_rate": 1.33892602129542e-06, + "loss": 0.0182, + "step": 4397 + }, + { + "clip_ratio": 0.004071039205882698, + "epoch": 0.16409682384217153, + "grad_norm": 0.07995712012052536, + "kl": 0.0274810791015625, + "learning_rate": 1.3378170898700716e-06, + "loss": 0.0179, + "step": 4398 + }, + { + "clip_ratio": 0.0033525037579238415, + "completion_length": 606.2678909301758, + "epoch": 0.1641341355347232, + "grad_norm": 0.07372643053531647, + "kl": 0.0250701904296875, + "learning_rate": 1.336709904819358e-06, + "loss": -0.0081, + "num_tokens": 102869312.0, + "reward": 0.4680362455546856, + "reward_std": 0.12440896406769753, + "rewards/code_reward": 0.31803623400628567, + "rewards/format_reward": 1.5, + "step": 4399 + }, + { + "clip_ratio": 0.0031080536427907646, + "epoch": 0.16417144722727484, + "grad_norm": 0.06377160549163818, + "kl": 0.0257568359375, + "learning_rate": 1.3356044666078316e-06, + "loss": -0.0082, + "step": 4400 + }, + { + "clip_ratio": 0.003312819520942867, + "epoch": 0.1642087589198265, + "grad_norm": 0.05472346767783165, + "kl": 0.0261993408203125, + "learning_rate": 1.3345007756993155e-06, + "loss": -0.0081, + "step": 4401 + }, + { + "clip_ratio": 0.004404786741361022, + "completion_length": 740.4464569091797, + "epoch": 0.16424607061237814, + "grad_norm": 0.08159511536359787, + "kl": 0.0218658447265625, + "learning_rate": 1.3333988325568981e-06, + "loss": 0.002, + "num_tokens": 102947107.0, + "reward": 0.697027288377285, + "reward_std": 0.2626163884997368, + "rewards/code_reward": 0.5470273047685623, + "rewards/format_reward": 1.5, + "step": 4402 + }, + { + "clip_ratio": 0.00391702726483345, + "epoch": 0.1642833823049298, + "grad_norm": 0.09074235707521439, + "kl": 0.02215576171875, + "learning_rate": 1.3322986376429316e-06, + "loss": 0.0019, + "step": 4403 + }, + { + "clip_ratio": 0.004193397413473576, + "epoch": 0.16432069399748145, + "grad_norm": 0.0982644110918045, + "kl": 0.0221710205078125, + "learning_rate": 1.331200191419041e-06, + "loss": 0.0018, + "step": 4404 + }, + { + "clip_ratio": 0.0038556394283659756, + "completion_length": 694.5178985595703, + "epoch": 0.16435800569003312, + "grad_norm": 0.06539756804704666, + "kl": 0.01556396484375, + "learning_rate": 1.3301034943461114e-06, + "loss": 0.0011, + "num_tokens": 103020398.0, + "reward": 0.5119342654943466, + "reward_std": 0.17138049146160483, + "rewards/code_reward": 0.36193425254896283, + "rewards/format_reward": 1.5, + "step": 4405 + }, + { + "clip_ratio": 0.004019168496597558, + "epoch": 0.16439531738258475, + "grad_norm": 0.06140906363725662, + "kl": 0.01556396484375, + "learning_rate": 1.3290085468842975e-06, + "loss": 0.0009, + "step": 4406 + }, + { + "clip_ratio": 0.003990217344835401, + "epoch": 0.16443262907513642, + "grad_norm": 0.06160224974155426, + "kl": 0.0157012939453125, + "learning_rate": 1.3279153494930192e-06, + "loss": 0.0009, + "step": 4407 + }, + { + "clip_ratio": 0.002924362081103027, + "completion_length": 642.1964569091797, + "epoch": 0.16446994076768806, + "grad_norm": 0.07083138078451157, + "kl": 0.023101806640625, + "learning_rate": 1.326823902630961e-06, + "loss": 0.0638, + "num_tokens": 103088771.0, + "reward": 0.7899618148803711, + "reward_std": 0.193943340331316, + "rewards/code_reward": 0.6453189477324486, + "rewards/format_reward": 1.4464285671710968, + "step": 4408 + }, + { + "clip_ratio": 0.0030060826102271676, + "epoch": 0.16450725246023973, + "grad_norm": 0.07096196711063385, + "kl": 0.0227813720703125, + "learning_rate": 1.3257342067560737e-06, + "loss": 0.0637, + "step": 4409 + }, + { + "clip_ratio": 0.0027291957521811128, + "epoch": 0.16454456415279137, + "grad_norm": 0.07146240025758743, + "kl": 0.0225830078125, + "learning_rate": 1.3246462623255752e-06, + "loss": 0.0634, + "step": 4410 + }, + { + "clip_ratio": 0.005490974406711757, + "completion_length": 666.2143096923828, + "epoch": 0.16458187584534303, + "grad_norm": 0.0880747064948082, + "kl": 0.025360107421875, + "learning_rate": 1.3235600697959442e-06, + "loss": -0.0145, + "num_tokens": 103163067.0, + "reward": 0.6048406884074211, + "reward_std": 0.23899445962160826, + "rewards/code_reward": 0.4548406434478238, + "rewards/format_reward": 1.5, + "step": 4411 + }, + { + "clip_ratio": 0.005034172383602709, + "epoch": 0.1646191875378947, + "grad_norm": 0.08765324205160141, + "kl": 0.024993896484375, + "learning_rate": 1.3224756296229282e-06, + "loss": -0.0146, + "step": 4412 + }, + { + "clip_ratio": 0.004577833809889853, + "epoch": 0.16465649923044634, + "grad_norm": 0.08783625066280365, + "kl": 0.02508544921875, + "learning_rate": 1.3213929422615381e-06, + "loss": -0.015, + "step": 4413 + }, + { + "clip_ratio": 0.0037061796756461263, + "completion_length": 810.2857513427734, + "epoch": 0.164693810922998, + "grad_norm": 0.06601102650165558, + "kl": 0.02707672119140625, + "learning_rate": 1.3203120081660498e-06, + "loss": 0.0778, + "num_tokens": 103237325.0, + "reward": 0.8929282315075397, + "reward_std": 0.09485308639705181, + "rewards/code_reward": 0.745606804266572, + "rewards/format_reward": 1.4732142984867096, + "step": 4414 + }, + { + "clip_ratio": 0.003896380076184869, + "epoch": 0.16473112261554965, + "grad_norm": 0.07975679636001587, + "kl": 0.029937744140625, + "learning_rate": 1.319232827790003e-06, + "loss": 0.0777, + "step": 4415 + }, + { + "clip_ratio": 0.003992722718976438, + "epoch": 0.1647684343081013, + "grad_norm": 0.07549572736024857, + "kl": 0.02773284912109375, + "learning_rate": 1.3181554015862008e-06, + "loss": 0.0777, + "step": 4416 + }, + { + "clip_ratio": 0.003474510507658124, + "completion_length": 759.6964416503906, + "epoch": 0.16480574600065295, + "grad_norm": 0.07179895043373108, + "kl": 0.020233154296875, + "learning_rate": 1.3170797300067127e-06, + "loss": -0.0029, + "num_tokens": 103317094.0, + "reward": 0.4475119970738888, + "reward_std": 0.2038910835981369, + "rewards/code_reward": 0.2975119799375534, + "rewards/format_reward": 1.5, + "step": 4417 + }, + { + "clip_ratio": 0.0042164872866123915, + "epoch": 0.16484305769320462, + "grad_norm": 0.07547718286514282, + "kl": 0.0198516845703125, + "learning_rate": 1.316005813502869e-06, + "loss": -0.0028, + "step": 4418 + }, + { + "clip_ratio": 0.003918158705346286, + "epoch": 0.16488036938575626, + "grad_norm": 0.06783078610897064, + "kl": 0.0202484130859375, + "learning_rate": 1.3149336525252663e-06, + "loss": -0.0029, + "step": 4419 + }, + { + "clip_ratio": 0.005614471971057355, + "completion_length": 700.232177734375, + "epoch": 0.16491768107830793, + "grad_norm": 0.0716719925403595, + "kl": 0.02117919921875, + "learning_rate": 1.3138632475237622e-06, + "loss": -0.0078, + "num_tokens": 103388003.0, + "reward": 0.40229347348213196, + "reward_std": 0.14016077003907412, + "rewards/code_reward": 0.2522934494554647, + "rewards/format_reward": 1.5, + "step": 4420 + }, + { + "clip_ratio": 0.004674654337577522, + "epoch": 0.16495499277085957, + "grad_norm": 0.0703306645154953, + "kl": 0.02117919921875, + "learning_rate": 1.3127945989474803e-06, + "loss": -0.0079, + "step": 4421 + }, + { + "clip_ratio": 0.004912028438411653, + "epoch": 0.16499230446341123, + "grad_norm": 0.07297638803720474, + "kl": 0.0212554931640625, + "learning_rate": 1.3117277072448053e-06, + "loss": -0.008, + "step": 4422 + }, + { + "clip_ratio": 0.003934286127332598, + "completion_length": 670.5714569091797, + "epoch": 0.16502961615596287, + "grad_norm": 0.08031389117240906, + "kl": 0.025299072265625, + "learning_rate": 1.3106625728633836e-06, + "loss": -0.0048, + "num_tokens": 103456369.0, + "reward": 0.4163532480597496, + "reward_std": 0.2436487078666687, + "rewards/code_reward": 0.26903180591762066, + "rewards/format_reward": 1.4732142984867096, + "step": 4423 + }, + { + "clip_ratio": 0.0033249343978241086, + "epoch": 0.16506692784851454, + "grad_norm": 0.08229026198387146, + "kl": 0.02490234375, + "learning_rate": 1.3095991962501273e-06, + "loss": -0.0049, + "step": 4424 + }, + { + "clip_ratio": 0.003246618143748492, + "epoch": 0.16510423954106618, + "grad_norm": 0.08108910918235779, + "kl": 0.024658203125, + "learning_rate": 1.30853757785121e-06, + "loss": -0.0048, + "step": 4425 + }, + { + "clip_ratio": 0.003200853243470192, + "completion_length": 767.9643096923828, + "epoch": 0.16514155123361784, + "grad_norm": 0.0702899843454361, + "kl": 0.016448974609375, + "learning_rate": 1.3074777181120655e-06, + "loss": 0.0027, + "num_tokens": 103529153.0, + "reward": 0.6111483722925186, + "reward_std": 0.2428972211200744, + "rewards/code_reward": 0.4611483757616952, + "rewards/format_reward": 1.5, + "step": 4426 + }, + { + "clip_ratio": 0.0034025824861600995, + "epoch": 0.16517886292616948, + "grad_norm": 0.06946637481451035, + "kl": 0.016845703125, + "learning_rate": 1.3064196174773926e-06, + "loss": 0.0028, + "step": 4427 + }, + { + "clip_ratio": 0.0029172429349273443, + "epoch": 0.16521617461872115, + "grad_norm": 0.07000788301229477, + "kl": 0.0171661376953125, + "learning_rate": 1.3053632763911497e-06, + "loss": 0.0027, + "step": 4428 + }, + { + "clip_ratio": 0.004354913718998432, + "completion_length": 819.5000457763672, + "epoch": 0.1652534863112728, + "grad_norm": 0.08013001084327698, + "kl": 0.022369384765625, + "learning_rate": 1.3043086952965575e-06, + "loss": 0.0074, + "num_tokens": 103605579.0, + "reward": 0.4467243514955044, + "reward_std": 0.2752465680241585, + "rewards/code_reward": 0.2967243306338787, + "rewards/format_reward": 1.5, + "step": 4429 + }, + { + "clip_ratio": 0.004216563538648188, + "epoch": 0.16529079800382446, + "grad_norm": 0.06785256415605545, + "kl": 0.02203369140625, + "learning_rate": 1.3032558746360998e-06, + "loss": 0.0072, + "step": 4430 + }, + { + "clip_ratio": 0.004424224840477109, + "epoch": 0.1653281096963761, + "grad_norm": 0.0685456395149231, + "kl": 0.022674560546875, + "learning_rate": 1.3022048148515202e-06, + "loss": 0.0072, + "step": 4431 + }, + { + "clip_ratio": 0.003931596642360091, + "completion_length": 543.1964492797852, + "epoch": 0.16536542138892776, + "grad_norm": 0.06803177297115326, + "kl": 0.0185546875, + "learning_rate": 1.3011555163838244e-06, + "loss": -0.0013, + "num_tokens": 103670592.0, + "reward": 0.6770270206034184, + "reward_std": 0.12328886985778809, + "rewards/code_reward": 0.5270270295441151, + "rewards/format_reward": 1.5, + "step": 4432 + }, + { + "clip_ratio": 0.0031318649416789412, + "epoch": 0.1654027330814794, + "grad_norm": 0.06413131207227707, + "kl": 0.0185699462890625, + "learning_rate": 1.3001079796732774e-06, + "loss": -0.0015, + "step": 4433 + }, + { + "clip_ratio": 0.003177433740347624, + "epoch": 0.16544004477403107, + "grad_norm": 0.06527669727802277, + "kl": 0.018798828125, + "learning_rate": 1.2990622051594065e-06, + "loss": -0.0016, + "step": 4434 + }, + { + "clip_ratio": 0.0033306237892247736, + "completion_length": 615.3750305175781, + "epoch": 0.1654773564665827, + "grad_norm": 0.08179078251123428, + "kl": 0.0234222412109375, + "learning_rate": 1.2980181932810001e-06, + "loss": 0.0047, + "num_tokens": 103742909.0, + "reward": 0.6493844091892242, + "reward_std": 0.16225039586424828, + "rewards/code_reward": 0.4993843771517277, + "rewards/format_reward": 1.5, + "step": 4435 + }, + { + "clip_ratio": 0.0033062166767194867, + "epoch": 0.16551466815913438, + "grad_norm": 0.08717439323663712, + "kl": 0.0233001708984375, + "learning_rate": 1.2969759444761052e-06, + "loss": 0.0048, + "step": 4436 + }, + { + "clip_ratio": 0.003396505955606699, + "epoch": 0.16555197985168602, + "grad_norm": 0.08145851641893387, + "kl": 0.023895263671875, + "learning_rate": 1.295935459182031e-06, + "loss": 0.0047, + "step": 4437 + }, + { + "clip_ratio": 0.0023330737312790006, + "completion_length": 737.7678680419922, + "epoch": 0.16558929154423768, + "grad_norm": 0.05411041900515556, + "kl": 0.0152587890625, + "learning_rate": 1.294896737835344e-06, + "loss": -0.0004, + "num_tokens": 103814898.0, + "reward": 0.5964285545051098, + "reward_std": 0.10645382478833199, + "rewards/code_reward": 0.4464285671710968, + "rewards/format_reward": 1.5, + "step": 4438 + }, + { + "clip_ratio": 0.002398943208390847, + "epoch": 0.16562660323678932, + "grad_norm": 0.04713457077741623, + "kl": 0.015228271484375, + "learning_rate": 1.2938597808718737e-06, + "loss": -0.0005, + "step": 4439 + }, + { + "clip_ratio": 0.0023853827733546495, + "epoch": 0.165663914929341, + "grad_norm": 0.05519581213593483, + "kl": 0.015289306640625, + "learning_rate": 1.2928245887267085e-06, + "loss": -0.0005, + "step": 4440 + }, + { + "clip_ratio": 0.002880877524148673, + "completion_length": 783.357177734375, + "epoch": 0.16570122662189263, + "grad_norm": 0.062245000153779984, + "kl": 0.0179595947265625, + "learning_rate": 1.2917911618341952e-06, + "loss": 0.0018, + "num_tokens": 103905176.0, + "reward": 0.7269025966525078, + "reward_std": 0.21738190948963165, + "rewards/code_reward": 0.5769025981426239, + "rewards/format_reward": 1.5, + "step": 4441 + }, + { + "clip_ratio": 0.002792412880808115, + "epoch": 0.1657385383144443, + "grad_norm": 0.06197294220328331, + "kl": 0.017669677734375, + "learning_rate": 1.2907595006279397e-06, + "loss": 0.0018, + "step": 4442 + }, + { + "clip_ratio": 0.00269593350822106, + "epoch": 0.16577585000699593, + "grad_norm": 0.05524874106049538, + "kl": 0.0180206298828125, + "learning_rate": 1.2897296055408098e-06, + "loss": 0.0017, + "step": 4443 + }, + { + "clip_ratio": 0.0025179777294397354, + "completion_length": 635.8036041259766, + "epoch": 0.1658131616995476, + "grad_norm": 0.03643682226538658, + "kl": 0.01556396484375, + "learning_rate": 1.2887014770049289e-06, + "loss": 0.0012, + "num_tokens": 103969831.0, + "reward": 0.6642857119441032, + "reward_std": 0.023440364748239517, + "rewards/code_reward": 0.5142857143655419, + "rewards/format_reward": 1.5, + "step": 4444 + }, + { + "clip_ratio": 0.0023959141690284014, + "epoch": 0.16585047339209924, + "grad_norm": 0.03627316653728485, + "kl": 0.0154876708984375, + "learning_rate": 1.287675115451681e-06, + "loss": 0.0012, + "step": 4445 + }, + { + "clip_ratio": 0.0022969887359067798, + "epoch": 0.1658877850846509, + "grad_norm": 0.035963933914899826, + "kl": 0.015625, + "learning_rate": 1.2866505213117078e-06, + "loss": 0.0011, + "step": 4446 + }, + { + "clip_ratio": 0.0027556910063140094, + "completion_length": 649.4107513427734, + "epoch": 0.16592509677720255, + "grad_norm": 0.07532083243131638, + "kl": 0.017547607421875, + "learning_rate": 1.2856276950149105e-06, + "loss": 0.0077, + "num_tokens": 104040960.0, + "reward": 0.9088564142584801, + "reward_std": 0.24452299065887928, + "rewards/code_reward": 0.758856400847435, + "rewards/format_reward": 1.5, + "step": 4447 + }, + { + "clip_ratio": 0.0030389801831915975, + "epoch": 0.1659624084697542, + "grad_norm": 0.07185807824134827, + "kl": 0.0179290771484375, + "learning_rate": 1.2846066369904484e-06, + "loss": 0.0076, + "step": 4448 + }, + { + "clip_ratio": 0.003180583589710295, + "epoch": 0.16599972016230585, + "grad_norm": 0.07018543034791946, + "kl": 0.0186309814453125, + "learning_rate": 1.2835873476667374e-06, + "loss": 0.0075, + "step": 4449 + }, + { + "clip_ratio": 0.0038201510906219482, + "completion_length": 493.1964416503906, + "epoch": 0.16603703185485752, + "grad_norm": 0.07804539799690247, + "kl": 0.0185546875, + "learning_rate": 1.2825698274714542e-06, + "loss": 0.0021, + "num_tokens": 104091231.0, + "reward": 0.7477082163095474, + "reward_std": 0.2342238761484623, + "rewards/code_reward": 0.5977082215249538, + "rewards/format_reward": 1.5, + "step": 4450 + }, + { + "clip_ratio": 0.003578230971470475, + "epoch": 0.16607434354740916, + "grad_norm": 0.078806571662426, + "kl": 0.018707275390625, + "learning_rate": 1.281554076831529e-06, + "loss": 0.0019, + "step": 4451 + }, + { + "clip_ratio": 0.0034436313435435295, + "epoch": 0.16611165523996083, + "grad_norm": 0.07942061126232147, + "kl": 0.019195556640625, + "learning_rate": 1.2805400961731535e-06, + "loss": 0.0019, + "step": 4452 + }, + { + "clip_ratio": 0.00472801917931065, + "completion_length": 746.1964569091797, + "epoch": 0.16614896693251247, + "grad_norm": 0.07020154595375061, + "kl": 0.018524169921875, + "learning_rate": 1.2795278859217756e-06, + "loss": 0.0118, + "num_tokens": 104174486.0, + "reward": 0.49921997636556625, + "reward_std": 0.21625496074557304, + "rewards/code_reward": 0.3492199741303921, + "rewards/format_reward": 1.5, + "step": 4453 + }, + { + "clip_ratio": 0.0049539696192368865, + "epoch": 0.16618627862506413, + "grad_norm": 0.06829983741044998, + "kl": 0.0185699462890625, + "learning_rate": 1.2785174465020977e-06, + "loss": 0.0118, + "step": 4454 + }, + { + "clip_ratio": 0.004772784071974456, + "epoch": 0.16622359031761577, + "grad_norm": 0.06825791299343109, + "kl": 0.0183868408203125, + "learning_rate": 1.2775087783380836e-06, + "loss": 0.0117, + "step": 4455 + }, + { + "clip_ratio": 0.003915629582479596, + "completion_length": 719.2678833007812, + "epoch": 0.16626090201016744, + "grad_norm": 0.052235011011362076, + "kl": 0.0203704833984375, + "learning_rate": 1.27650188185295e-06, + "loss": -0.0058, + "num_tokens": 104238957.0, + "reward": 0.7047413177788258, + "reward_std": 0.06856431998312473, + "rewards/code_reward": 0.5547412931919098, + "rewards/format_reward": 1.5, + "step": 4456 + }, + { + "clip_ratio": 0.0034121001372113824, + "epoch": 0.16629821370271908, + "grad_norm": 0.05076708644628525, + "kl": 0.0204315185546875, + "learning_rate": 1.2754967574691738e-06, + "loss": -0.0058, + "step": 4457 + }, + { + "clip_ratio": 0.003498072794172913, + "epoch": 0.16633552539527074, + "grad_norm": 0.05300191789865494, + "kl": 0.0203857421875, + "learning_rate": 1.2744934056084857e-06, + "loss": -0.0059, + "step": 4458 + }, + { + "clip_ratio": 0.001121370354667306, + "completion_length": 553.6607437133789, + "epoch": 0.16637283708782238, + "grad_norm": 0.04390263929963112, + "kl": 0.030853271484375, + "learning_rate": 1.2734918266918726e-06, + "loss": 0.0046, + "num_tokens": 104301854.0, + "reward": 1.0357850044965744, + "reward_std": 0.10413189232349396, + "rewards/code_reward": 0.8857849985361099, + "rewards/format_reward": 1.5, + "step": 4459 + }, + { + "clip_ratio": 0.0012254699831828475, + "epoch": 0.16641014878037405, + "grad_norm": 0.04449215903878212, + "kl": 0.030731201171875, + "learning_rate": 1.27249202113958e-06, + "loss": 0.0045, + "step": 4460 + }, + { + "clip_ratio": 0.0012000525603070855, + "epoch": 0.1664474604729257, + "grad_norm": 0.08413644134998322, + "kl": 0.028656005859375, + "learning_rate": 1.2714939893711062e-06, + "loss": 0.0045, + "step": 4461 + }, + { + "clip_ratio": 0.000575553101953119, + "completion_length": 418.51788330078125, + "epoch": 0.16648477216547736, + "grad_norm": 0.038373153656721115, + "kl": 0.0155487060546875, + "learning_rate": 1.2704977318052078e-06, + "loss": -0.0045, + "num_tokens": 104348755.0, + "reward": 0.8861944600939751, + "reward_std": 0.047553591430187225, + "rewards/code_reward": 0.7361944769509137, + "rewards/format_reward": 1.5, + "step": 4462 + }, + { + "clip_ratio": 0.0006337894010357559, + "epoch": 0.166522083858029, + "grad_norm": 0.03720249608159065, + "kl": 0.014923095703125, + "learning_rate": 1.2695032488598962e-06, + "loss": -0.0046, + "step": 4463 + }, + { + "clip_ratio": 0.0006019791471771896, + "epoch": 0.16655939555058066, + "grad_norm": 0.03794878348708153, + "kl": 0.01507568359375, + "learning_rate": 1.2685105409524364e-06, + "loss": -0.0046, + "step": 4464 + }, + { + "clip_ratio": 0.002872211567591876, + "completion_length": 664.8571853637695, + "epoch": 0.16659670724313233, + "grad_norm": 0.05180424824357033, + "kl": 0.0183563232421875, + "learning_rate": 1.2675196084993519e-06, + "loss": 0.0098, + "num_tokens": 104409507.0, + "reward": 0.7035714350640774, + "reward_std": 0.15184589102864265, + "rewards/code_reward": 0.5535714253783226, + "rewards/format_reward": 1.5, + "step": 4465 + }, + { + "clip_ratio": 0.002853213285561651, + "epoch": 0.16663401893568397, + "grad_norm": 0.052612707018852234, + "kl": 0.01812744140625, + "learning_rate": 1.266530451916419e-06, + "loss": 0.0098, + "step": 4466 + }, + { + "clip_ratio": 0.0026332888519391418, + "epoch": 0.16667133062823564, + "grad_norm": 0.051136281341314316, + "kl": 0.0180816650390625, + "learning_rate": 1.2655430716186687e-06, + "loss": 0.0096, + "step": 4467 + }, + { + "clip_ratio": 0.004261029418557882, + "completion_length": 900.8036117553711, + "epoch": 0.16670864232078728, + "grad_norm": 0.05155981332063675, + "kl": 0.02447509765625, + "learning_rate": 1.2645574680203892e-06, + "loss": 0.0127, + "num_tokens": 104492530.0, + "reward": 0.38919683173298836, + "reward_std": 0.1562081053853035, + "rewards/code_reward": 0.24455394968390465, + "rewards/format_reward": 1.4464285671710968, + "step": 4468 + }, + { + "clip_ratio": 0.004491911153309047, + "epoch": 0.16674595401333894, + "grad_norm": 0.04804673418402672, + "kl": 0.0252685546875, + "learning_rate": 1.263573641535119e-06, + "loss": 0.0129, + "step": 4469 + }, + { + "clip_ratio": 0.004587475617881864, + "epoch": 0.16678326570589058, + "grad_norm": 0.04671561345458031, + "kl": 0.02471923828125, + "learning_rate": 1.2625915925756556e-06, + "loss": 0.0128, + "step": 4470 + }, + { + "clip_ratio": 0.0029767140513285995, + "completion_length": 662.3214645385742, + "epoch": 0.16682057739844225, + "grad_norm": 0.07279055565595627, + "kl": 0.0162506103515625, + "learning_rate": 1.2616113215540483e-06, + "loss": -0.0013, + "num_tokens": 104560182.0, + "reward": 0.5687675066292286, + "reward_std": 0.14856063574552536, + "rewards/code_reward": 0.4187675043940544, + "rewards/format_reward": 1.5, + "step": 4471 + }, + { + "clip_ratio": 0.0031810608925297856, + "epoch": 0.1668578890909939, + "grad_norm": 0.06631052494049072, + "kl": 0.0160675048828125, + "learning_rate": 1.2606328288815984e-06, + "loss": -0.0013, + "step": 4472 + }, + { + "clip_ratio": 0.003073050291277468, + "epoch": 0.16689520078354556, + "grad_norm": 0.0668003112077713, + "kl": 0.0161895751953125, + "learning_rate": 1.2596561149688651e-06, + "loss": -0.0011, + "step": 4473 + }, + { + "clip_ratio": 0.0033938675769604743, + "completion_length": 753.4107513427734, + "epoch": 0.1669325124760972, + "grad_norm": 0.07562398165464401, + "kl": 0.0213623046875, + "learning_rate": 1.2586811802256598e-06, + "loss": 0.0033, + "num_tokens": 104625403.0, + "reward": 0.40810395404696465, + "reward_std": 0.0974729098379612, + "rewards/code_reward": 0.25810395274311304, + "rewards/format_reward": 1.5, + "step": 4474 + }, + { + "clip_ratio": 0.003363278752658516, + "epoch": 0.16696982416864886, + "grad_norm": 0.07319894433021545, + "kl": 0.021820068359375, + "learning_rate": 1.257708025061046e-06, + "loss": 0.0032, + "step": 4475 + }, + { + "clip_ratio": 0.003308293758891523, + "epoch": 0.1670071358612005, + "grad_norm": 0.06406404823064804, + "kl": 0.02166748046875, + "learning_rate": 1.2567366498833425e-06, + "loss": 0.0031, + "step": 4476 + }, + { + "clip_ratio": 0.0024009408662095666, + "completion_length": 565.6964569091797, + "epoch": 0.16704444755375217, + "grad_norm": 0.06158602237701416, + "kl": 0.0161285400390625, + "learning_rate": 1.2557670551001186e-06, + "loss": 0.0037, + "num_tokens": 104686566.0, + "reward": 0.7318184524774551, + "reward_std": 0.07665149495005608, + "rewards/code_reward": 0.5818184548988938, + "rewards/format_reward": 1.5, + "step": 4477 + }, + { + "clip_ratio": 0.002309908508323133, + "epoch": 0.1670817592463038, + "grad_norm": 0.05907340347766876, + "kl": 0.015411376953125, + "learning_rate": 1.2547992411182002e-06, + "loss": 0.0036, + "step": 4478 + }, + { + "clip_ratio": 0.0022639718372374773, + "epoch": 0.16711907093885547, + "grad_norm": 0.0729338750243187, + "kl": 0.015167236328125, + "learning_rate": 1.253833208343663e-06, + "loss": 0.0036, + "step": 4479 + }, + { + "clip_ratio": 0.004791363375261426, + "completion_length": 653.3035888671875, + "epoch": 0.1671563826314071, + "grad_norm": 0.0818883553147316, + "kl": 0.0209503173828125, + "learning_rate": 1.2528689571818362e-06, + "loss": 0.0039, + "num_tokens": 104755673.0, + "reward": 0.892317958176136, + "reward_std": 0.23096806555986404, + "rewards/code_reward": 0.7423178926110268, + "rewards/format_reward": 1.5, + "step": 4480 + }, + { + "clip_ratio": 0.004286064533516765, + "epoch": 0.16719369432395878, + "grad_norm": 0.08172067254781723, + "kl": 0.0205230712890625, + "learning_rate": 1.2519064880373028e-06, + "loss": 0.0039, + "step": 4481 + }, + { + "clip_ratio": 0.00405416201101616, + "epoch": 0.16723100601651042, + "grad_norm": 0.07906752079725266, + "kl": 0.020721435546875, + "learning_rate": 1.2509458013138954e-06, + "loss": 0.0036, + "step": 4482 + }, + { + "clip_ratio": 0.004330731346271932, + "completion_length": 695.8214569091797, + "epoch": 0.1672683177090621, + "grad_norm": 0.09348166733980179, + "kl": 0.02252197265625, + "learning_rate": 1.249986897414701e-06, + "loss": 0.0084, + "num_tokens": 104837409.0, + "reward": 0.49072396382689476, + "reward_std": 0.2058271011337638, + "rewards/code_reward": 0.3407239429652691, + "rewards/format_reward": 1.5, + "step": 4483 + }, + { + "clip_ratio": 0.004634186101611704, + "epoch": 0.16730562940161373, + "grad_norm": 0.08815918862819672, + "kl": 0.022735595703125, + "learning_rate": 1.2490297767420583e-06, + "loss": 0.0084, + "step": 4484 + }, + { + "clip_ratio": 0.004583500907756388, + "epoch": 0.1673429410941654, + "grad_norm": 0.08518687635660172, + "kl": 0.0226898193359375, + "learning_rate": 1.2480744396975571e-06, + "loss": 0.0086, + "step": 4485 + }, + { + "clip_ratio": 0.004138159798458219, + "completion_length": 716.3750305175781, + "epoch": 0.16738025278671703, + "grad_norm": 0.04393760859966278, + "kl": 0.01904296875, + "learning_rate": 1.2471208866820397e-06, + "loss": 0.0081, + "num_tokens": 104911812.0, + "reward": 0.4773317724466324, + "reward_std": 0.11454074084758759, + "rewards/code_reward": 0.3273317590355873, + "rewards/format_reward": 1.5, + "step": 4486 + }, + { + "clip_ratio": 0.004058493068441749, + "epoch": 0.1674175644792687, + "grad_norm": 0.04535364732146263, + "kl": 0.018402099609375, + "learning_rate": 1.2461691180955976e-06, + "loss": 0.008, + "step": 4487 + }, + { + "clip_ratio": 0.003928259829990566, + "epoch": 0.16745487617182034, + "grad_norm": 0.04485855996608734, + "kl": 0.0190277099609375, + "learning_rate": 1.2452191343375773e-06, + "loss": 0.0081, + "step": 4488 + }, + { + "clip_ratio": 0.0038028220878914, + "completion_length": 541.9643173217773, + "epoch": 0.167492187864372, + "grad_norm": 0.08944117277860641, + "kl": 0.0175018310546875, + "learning_rate": 1.2442709358065719e-06, + "loss": 0.0128, + "num_tokens": 104972612.0, + "reward": 0.6435224115848541, + "reward_std": 0.17008509556762874, + "rewards/code_reward": 0.4935223984066397, + "rewards/format_reward": 1.5, + "step": 4489 + }, + { + "clip_ratio": 0.003376762440893799, + "epoch": 0.16752949955692364, + "grad_norm": 0.08995610475540161, + "kl": 0.0176849365234375, + "learning_rate": 1.2433245229004297e-06, + "loss": 0.0129, + "step": 4490 + }, + { + "clip_ratio": 0.00311944040004164, + "epoch": 0.1675668112494753, + "grad_norm": 0.09224142134189606, + "kl": 0.0170745849609375, + "learning_rate": 1.2423798960162477e-06, + "loss": 0.0129, + "step": 4491 + }, + { + "clip_ratio": 0.004145882499869913, + "completion_length": 765.732177734375, + "epoch": 0.16760412294202695, + "grad_norm": 0.08290258049964905, + "kl": 0.02020263671875, + "learning_rate": 1.241437055550373e-06, + "loss": 0.0139, + "num_tokens": 105047175.0, + "reward": 0.6608617603778839, + "reward_std": 0.23641132935881615, + "rewards/code_reward": 0.5108617562800646, + "rewards/format_reward": 1.5, + "step": 4492 + }, + { + "clip_ratio": 0.004203932825475931, + "epoch": 0.16764143463457862, + "grad_norm": 0.07831406593322754, + "kl": 0.020233154296875, + "learning_rate": 1.2404960018984057e-06, + "loss": 0.014, + "step": 4493 + }, + { + "clip_ratio": 0.003586840000934899, + "epoch": 0.16767874632713026, + "grad_norm": 0.0767676830291748, + "kl": 0.019561767578125, + "learning_rate": 1.2395567354551924e-06, + "loss": 0.0136, + "step": 4494 + }, + { + "clip_ratio": 0.005112466344144195, + "completion_length": 665.5357513427734, + "epoch": 0.16771605801968192, + "grad_norm": 0.062307555228471756, + "kl": 0.023040771484375, + "learning_rate": 1.2386192566148327e-06, + "loss": -0.0033, + "num_tokens": 105117191.0, + "reward": 0.355357151478529, + "reward_std": 0.25162195414304733, + "rewards/code_reward": 0.2053571455180645, + "rewards/format_reward": 1.5, + "step": 4495 + }, + { + "clip_ratio": 0.005159518506843597, + "epoch": 0.16775336971223356, + "grad_norm": 0.06480368971824646, + "kl": 0.022308349609375, + "learning_rate": 1.2376835657706757e-06, + "loss": -0.0031, + "step": 4496 + }, + { + "clip_ratio": 0.005090549821034074, + "epoch": 0.16779068140478523, + "grad_norm": 0.06477876007556915, + "kl": 0.022674560546875, + "learning_rate": 1.2367496633153192e-06, + "loss": -0.0032, + "step": 4497 + }, + { + "clip_ratio": 0.0024950364022515714, + "completion_length": 619.1250228881836, + "epoch": 0.16782799309733687, + "grad_norm": 0.06485455483198166, + "kl": 0.0174102783203125, + "learning_rate": 1.2358175496406124e-06, + "loss": 0.003, + "num_tokens": 105184888.0, + "reward": 0.6491230987012386, + "reward_std": 0.13662854582071304, + "rewards/code_reward": 0.49912309274077415, + "rewards/format_reward": 1.5, + "step": 4498 + }, + { + "clip_ratio": 0.0027301537920720875, + "epoch": 0.16786530478988854, + "grad_norm": 0.06372258812189102, + "kl": 0.017669677734375, + "learning_rate": 1.2348872251376513e-06, + "loss": 0.003, + "step": 4499 + }, + { + "clip_ratio": 0.0030634445138275623, + "epoch": 0.16790261648244018, + "grad_norm": 0.06377812474966049, + "kl": 0.0176849365234375, + "learning_rate": 1.2339586901967831e-06, + "loss": 0.003, + "step": 4500 + }, + { + "clip_ratio": 0.004369587171822786, + "completion_length": 669.3750228881836, + "epoch": 0.16793992817499184, + "grad_norm": 0.07502199709415436, + "kl": 0.0211944580078125, + "learning_rate": 1.233031945207605e-06, + "loss": 0.0084, + "num_tokens": 105254559.0, + "reward": 0.55882353708148, + "reward_std": 0.2755800262093544, + "rewards/code_reward": 0.40882352739572525, + "rewards/format_reward": 1.5, + "step": 4501 + }, + { + "clip_ratio": 0.004389165376778692, + "epoch": 0.16797723986754348, + "grad_norm": 0.0849851593375206, + "kl": 0.02117919921875, + "learning_rate": 1.232106990558961e-06, + "loss": 0.0086, + "step": 4502 + }, + { + "clip_ratio": 0.004276332270819694, + "epoch": 0.16801455156009515, + "grad_norm": 0.07488040626049042, + "kl": 0.0211181640625, + "learning_rate": 1.2311838266389455e-06, + "loss": 0.0083, + "step": 4503 + }, + { + "clip_ratio": 0.003191764175426215, + "completion_length": 584.8928833007812, + "epoch": 0.1680518632526468, + "grad_norm": 0.07917433232069016, + "kl": 0.020111083984375, + "learning_rate": 1.2302624538349013e-06, + "loss": 0.016, + "num_tokens": 105321641.0, + "reward": 0.7079387083649635, + "reward_std": 0.23682337999343872, + "rewards/code_reward": 0.5579386800527573, + "rewards/format_reward": 1.5, + "step": 4504 + }, + { + "clip_ratio": 0.003570818225853145, + "epoch": 0.16808917494519846, + "grad_norm": 0.07655249536037445, + "kl": 0.0202484130859375, + "learning_rate": 1.2293428725334174e-06, + "loss": 0.0161, + "step": 4505 + }, + { + "clip_ratio": 0.0028175492770969868, + "epoch": 0.1681264866377501, + "grad_norm": 0.07812930643558502, + "kl": 0.019805908203125, + "learning_rate": 1.2284250831203354e-06, + "loss": 0.0158, + "step": 4506 + }, + { + "clip_ratio": 0.004936922458000481, + "completion_length": 644.1607513427734, + "epoch": 0.16816379833030176, + "grad_norm": 0.25965017080307007, + "kl": 0.026702880859375, + "learning_rate": 1.2275090859807404e-06, + "loss": -0.0134, + "num_tokens": 105399896.0, + "reward": 0.6513392813503742, + "reward_std": 0.2247637752443552, + "rewards/code_reward": 0.5013392791152, + "rewards/format_reward": 1.5, + "step": 4507 + }, + { + "clip_ratio": 0.005071716092061251, + "epoch": 0.1682011100228534, + "grad_norm": 0.11182613670825958, + "kl": 0.026947021484375, + "learning_rate": 1.2265948814989693e-06, + "loss": -0.0138, + "step": 4508 + }, + { + "clip_ratio": 0.005067757854703814, + "epoch": 0.16823842171540507, + "grad_norm": 0.08594534546136856, + "kl": 0.027801513671875, + "learning_rate": 1.2256824700586048e-06, + "loss": -0.0136, + "step": 4509 + }, + { + "clip_ratio": 0.0020352835417725146, + "completion_length": 530.7857437133789, + "epoch": 0.1682757334079567, + "grad_norm": 0.04349170997738838, + "kl": 0.048248291015625, + "learning_rate": 1.2247718520424786e-06, + "loss": -0.0096, + "num_tokens": 105450928.0, + "reward": 0.8821428529918194, + "reward_std": 0.06681530922651291, + "rewards/code_reward": 0.7321428507566452, + "rewards/format_reward": 1.5, + "step": 4510 + }, + { + "clip_ratio": 0.0023033914621919394, + "epoch": 0.16831304510050837, + "grad_norm": 0.038436777889728546, + "kl": 0.04461669921875, + "learning_rate": 1.2238630278326686e-06, + "loss": -0.0097, + "step": 4511 + }, + { + "clip_ratio": 0.0015289867733372375, + "epoch": 0.16835035679306, + "grad_norm": 0.04807792603969574, + "kl": 0.04815673828125, + "learning_rate": 1.2229559978104997e-06, + "loss": -0.0098, + "step": 4512 + }, + { + "clip_ratio": 0.00472184392856434, + "completion_length": 772.8393249511719, + "epoch": 0.16838766848561168, + "grad_norm": 0.08651792258024216, + "kl": 0.023193359375, + "learning_rate": 1.2220507623565454e-06, + "loss": 0.0234, + "num_tokens": 105525049.0, + "reward": 0.4649951905012131, + "reward_std": 0.24164149910211563, + "rewards/code_reward": 0.3149951919913292, + "rewards/format_reward": 1.5, + "step": 4513 + }, + { + "clip_ratio": 0.0041756650898605585, + "epoch": 0.16842498017816332, + "grad_norm": 0.08973449468612671, + "kl": 0.023712158203125, + "learning_rate": 1.2211473218506268e-06, + "loss": 0.0233, + "step": 4514 + }, + { + "clip_ratio": 0.003822208906058222, + "epoch": 0.168462291870715, + "grad_norm": 0.08583038300275803, + "kl": 0.02386474609375, + "learning_rate": 1.2202456766718092e-06, + "loss": 0.023, + "step": 4515 + }, + { + "clip_ratio": 0.0020528981694951653, + "completion_length": 632.9107360839844, + "epoch": 0.16849960356326663, + "grad_norm": 0.05411574989557266, + "kl": 0.018280029296875, + "learning_rate": 1.2193458271984071e-06, + "loss": 0.0018, + "num_tokens": 105596094.0, + "reward": 0.8288340494036674, + "reward_std": 0.15956582501530647, + "rewards/code_reward": 0.6788340285420418, + "rewards/format_reward": 1.5, + "step": 4516 + }, + { + "clip_ratio": 0.002196506189648062, + "epoch": 0.1685369152558183, + "grad_norm": 0.05481616407632828, + "kl": 0.0180206298828125, + "learning_rate": 1.21844777380798e-06, + "loss": 0.0018, + "step": 4517 + }, + { + "clip_ratio": 0.002084712323267013, + "epoch": 0.16857422694836993, + "grad_norm": 0.05276383459568024, + "kl": 0.01904296875, + "learning_rate": 1.2175515168773348e-06, + "loss": 0.0018, + "step": 4518 + }, + { + "clip_ratio": 0.0044857668108306825, + "completion_length": 776.5357513427734, + "epoch": 0.1686115386409216, + "grad_norm": 0.05492033436894417, + "kl": 0.0185394287109375, + "learning_rate": 1.2166570567825233e-06, + "loss": 0.0088, + "num_tokens": 105669078.0, + "reward": 0.23397012054920197, + "reward_std": 0.14075121469795704, + "rewards/code_reward": 0.08397010015323758, + "rewards/format_reward": 1.5, + "step": 4519 + }, + { + "clip_ratio": 0.004242660652380437, + "epoch": 0.16864885033347327, + "grad_norm": 0.05453726649284363, + "kl": 0.0183563232421875, + "learning_rate": 1.215764393898846e-06, + "loss": 0.0087, + "step": 4520 + }, + { + "clip_ratio": 0.0042436879593878984, + "epoch": 0.1686861620260249, + "grad_norm": 0.05665130168199539, + "kl": 0.018890380859375, + "learning_rate": 1.2148735286008474e-06, + "loss": 0.0087, + "step": 4521 + }, + { + "clip_ratio": 0.004630512383300811, + "completion_length": 745.7143096923828, + "epoch": 0.16872347371857657, + "grad_norm": 0.08346777409315109, + "kl": 0.0204620361328125, + "learning_rate": 1.2139844612623166e-06, + "loss": 0.0086, + "num_tokens": 105748036.0, + "reward": 0.48771611973643303, + "reward_std": 0.3317128922790289, + "rewards/code_reward": 0.3377161156386137, + "rewards/format_reward": 1.5, + "step": 4522 + }, + { + "clip_ratio": 0.004616960475686938, + "epoch": 0.1687607854111282, + "grad_norm": 0.08859588205814362, + "kl": 0.02032470703125, + "learning_rate": 1.2130971922562913e-06, + "loss": 0.0085, + "step": 4523 + }, + { + "clip_ratio": 0.004828026227187365, + "epoch": 0.16879809710367988, + "grad_norm": 0.09469194710254669, + "kl": 0.02008056640625, + "learning_rate": 1.2122117219550526e-06, + "loss": 0.0085, + "step": 4524 + }, + { + "clip_ratio": 0.0027677061734721065, + "completion_length": 596.2500228881836, + "epoch": 0.16883540879623152, + "grad_norm": 0.032192204147577286, + "kl": 0.0204620361328125, + "learning_rate": 1.2113280507301262e-06, + "loss": -0.0029, + "num_tokens": 105813248.0, + "reward": 0.7928571365773678, + "reward_std": 0.10805472731590271, + "rewards/code_reward": 0.6428571492433548, + "rewards/format_reward": 1.5, + "step": 4525 + }, + { + "clip_ratio": 0.00304168404545635, + "epoch": 0.16887272048878318, + "grad_norm": 0.03193075209856033, + "kl": 0.02020263671875, + "learning_rate": 1.2104461789522858e-06, + "loss": -0.0029, + "step": 4526 + }, + { + "clip_ratio": 0.002728578634560108, + "epoch": 0.16891003218133482, + "grad_norm": 0.036073874682188034, + "kl": 0.0200347900390625, + "learning_rate": 1.2095661069915477e-06, + "loss": -0.0029, + "step": 4527 + }, + { + "clip_ratio": 0.00290050933836028, + "completion_length": 741.7857513427734, + "epoch": 0.1689473438738865, + "grad_norm": 0.10140713304281235, + "kl": 0.0174560546875, + "learning_rate": 1.2086878352171733e-06, + "loss": 0.0207, + "num_tokens": 105891692.0, + "reward": 0.7193220369517803, + "reward_std": 0.20722112338989973, + "rewards/code_reward": 0.572000578045845, + "rewards/format_reward": 1.4732142984867096, + "step": 4528 + }, + { + "clip_ratio": 0.0037923271884210408, + "epoch": 0.16898465556643813, + "grad_norm": 0.09736106544733047, + "kl": 0.0176239013671875, + "learning_rate": 1.2078113639976711e-06, + "loss": 0.0207, + "step": 4529 + }, + { + "clip_ratio": 0.003188733651768416, + "epoch": 0.1690219672589898, + "grad_norm": 0.1512734740972519, + "kl": 0.0180206298828125, + "learning_rate": 1.2069366937007896e-06, + "loss": 0.0207, + "step": 4530 + }, + { + "clip_ratio": 0.00279060669708997, + "completion_length": 602.8214492797852, + "epoch": 0.16905927895154144, + "grad_norm": 0.050531551241874695, + "kl": 0.0175628662109375, + "learning_rate": 1.206063824693525e-06, + "loss": 0.0123, + "num_tokens": 105957062.0, + "reward": 0.5657913163304329, + "reward_std": 0.18615923821926117, + "rewards/code_reward": 0.4157913215458393, + "rewards/format_reward": 1.5, + "step": 4531 + }, + { + "clip_ratio": 0.003526871732901782, + "epoch": 0.1690965906440931, + "grad_norm": 0.04957040771842003, + "kl": 0.0179443359375, + "learning_rate": 1.205192757342118e-06, + "loss": 0.0123, + "step": 4532 + }, + { + "clip_ratio": 0.003194837481714785, + "epoch": 0.16913390233664474, + "grad_norm": 0.05107559263706207, + "kl": 0.0180206298828125, + "learning_rate": 1.2043234920120513e-06, + "loss": 0.0123, + "step": 4533 + }, + { + "clip_ratio": 0.0029311164398677647, + "completion_length": 616.8036041259766, + "epoch": 0.1691712140291964, + "grad_norm": 0.059080079197883606, + "kl": 0.0206298828125, + "learning_rate": 1.2034560290680528e-06, + "loss": -0.0045, + "num_tokens": 106027443.0, + "reward": 0.6103379540145397, + "reward_std": 0.0699460506439209, + "rewards/code_reward": 0.46033795922994614, + "rewards/format_reward": 1.5, + "step": 4534 + }, + { + "clip_ratio": 0.003617177775595337, + "epoch": 0.16920852572174805, + "grad_norm": 0.05996927618980408, + "kl": 0.0208740234375, + "learning_rate": 1.2025903688740932e-06, + "loss": -0.0043, + "step": 4535 + }, + { + "clip_ratio": 0.003132225130684674, + "epoch": 0.16924583741429972, + "grad_norm": 0.05736885964870453, + "kl": 0.020721435546875, + "learning_rate": 1.2017265117933882e-06, + "loss": -0.0045, + "step": 4536 + }, + { + "clip_ratio": 0.0037831621011719108, + "completion_length": 570.857177734375, + "epoch": 0.16928314910685135, + "grad_norm": 0.05592634156346321, + "kl": 0.03204345703125, + "learning_rate": 1.200864458188396e-06, + "loss": 0.0023, + "num_tokens": 106088853.0, + "reward": 0.5927878208458424, + "reward_std": 0.11927668075077236, + "rewards/code_reward": 0.4427877888083458, + "rewards/format_reward": 1.5, + "step": 4537 + }, + { + "clip_ratio": 0.0036952124792151153, + "epoch": 0.16932046079940302, + "grad_norm": 0.055148862302303314, + "kl": 0.032806396484375, + "learning_rate": 1.2000042084208178e-06, + "loss": 0.0022, + "step": 4538 + }, + { + "clip_ratio": 0.0038855604943819344, + "epoch": 0.16935777249195466, + "grad_norm": 0.056432534009218216, + "kl": 0.032012939453125, + "learning_rate": 1.1991457628515998e-06, + "loss": 0.0024, + "step": 4539 + }, + { + "clip_ratio": 0.005136184161528945, + "completion_length": 625.1964492797852, + "epoch": 0.16939508418450633, + "grad_norm": 0.0634026899933815, + "kl": 0.0253753662109375, + "learning_rate": 1.1982891218409282e-06, + "loss": -0.0051, + "num_tokens": 106159816.0, + "reward": 0.24210526049137115, + "reward_std": 0.13845158368349075, + "rewards/code_reward": 0.09210526198148727, + "rewards/format_reward": 1.5, + "step": 4540 + }, + { + "clip_ratio": 0.005300175864249468, + "epoch": 0.16943239587705797, + "grad_norm": 0.06429332494735718, + "kl": 0.025787353515625, + "learning_rate": 1.1974342857482343e-06, + "loss": -0.0051, + "step": 4541 + }, + { + "clip_ratio": 0.004915020428597927, + "epoch": 0.16946970756960963, + "grad_norm": 0.06365612894296646, + "kl": 0.0255126953125, + "learning_rate": 1.1965812549321918e-06, + "loss": -0.0051, + "step": 4542 + }, + { + "clip_ratio": 0.0023470570449717343, + "completion_length": 633.8036041259766, + "epoch": 0.16950701926216127, + "grad_norm": 0.05785273015499115, + "kl": 0.0208740234375, + "learning_rate": 1.1957300297507158e-06, + "loss": 0.0035, + "num_tokens": 106222917.0, + "reward": 0.49596867337822914, + "reward_std": 0.14944268018007278, + "rewards/code_reward": 0.3459686771966517, + "rewards/format_reward": 1.5, + "step": 4543 + }, + { + "clip_ratio": 0.0024362479452975094, + "epoch": 0.16954433095471294, + "grad_norm": 0.05619259923696518, + "kl": 0.0201416015625, + "learning_rate": 1.194880610560965e-06, + "loss": 0.0035, + "step": 4544 + }, + { + "clip_ratio": 0.002296158578246832, + "epoch": 0.16958164264726458, + "grad_norm": 0.054541513323783875, + "kl": 0.020965576171875, + "learning_rate": 1.19403299771934e-06, + "loss": 0.0034, + "step": 4545 + }, + { + "clip_ratio": 0.004160825163125992, + "completion_length": 741.2321624755859, + "epoch": 0.16961895433981625, + "grad_norm": 0.08808150142431259, + "kl": 0.02069091796875, + "learning_rate": 1.1931871915814835e-06, + "loss": 0.0001, + "num_tokens": 106300658.0, + "reward": 0.6875163912773132, + "reward_std": 0.33221206441521645, + "rewards/code_reward": 0.5375163778662682, + "rewards/format_reward": 1.5, + "step": 4546 + }, + { + "clip_ratio": 0.004420655779540539, + "epoch": 0.1696562660323679, + "grad_norm": 0.0885004997253418, + "kl": 0.0214080810546875, + "learning_rate": 1.1923431925022801e-06, + "loss": 0.0002, + "step": 4547 + }, + { + "clip_ratio": 0.004561750160064548, + "epoch": 0.16969357772491955, + "grad_norm": 0.1289420872926712, + "kl": 0.0215911865234375, + "learning_rate": 1.1915010008358559e-06, + "loss": 0.0002, + "step": 4548 + }, + { + "clip_ratio": 0.004635521909222007, + "completion_length": 899.732177734375, + "epoch": 0.1697308894174712, + "grad_norm": 0.07489874213933945, + "kl": 0.0163726806640625, + "learning_rate": 1.1906606169355794e-06, + "loss": -0.0032, + "num_tokens": 106384237.0, + "reward": 0.32418226078152657, + "reward_std": 0.1926881056278944, + "rewards/code_reward": 0.17418225156143308, + "rewards/format_reward": 1.5, + "step": 4549 + }, + { + "clip_ratio": 0.004380609898362309, + "epoch": 0.16976820111002286, + "grad_norm": 0.08436152338981628, + "kl": 0.0164947509765625, + "learning_rate": 1.1898220411540584e-06, + "loss": -0.0034, + "step": 4550 + }, + { + "clip_ratio": 0.0042487840401008725, + "epoch": 0.1698055128025745, + "grad_norm": 0.08369635045528412, + "kl": 0.0165863037109375, + "learning_rate": 1.1889852738431453e-06, + "loss": -0.0036, + "step": 4551 + }, + { + "clip_ratio": 0.003896556794643402, + "completion_length": 724.6607513427734, + "epoch": 0.16984282449512617, + "grad_norm": 0.06706634908914566, + "kl": 0.0147705078125, + "learning_rate": 1.1881503153539323e-06, + "loss": 0.0016, + "num_tokens": 106449486.0, + "reward": 0.4547131881117821, + "reward_std": 0.12470301892608404, + "rewards/code_reward": 0.30471319518983364, + "rewards/format_reward": 1.5, + "step": 4552 + }, + { + "clip_ratio": 0.00394583260640502, + "epoch": 0.1698801361876778, + "grad_norm": 0.07154355943202972, + "kl": 0.0145111083984375, + "learning_rate": 1.1873171660367511e-06, + "loss": 0.0014, + "step": 4553 + }, + { + "clip_ratio": 0.003688706026878208, + "epoch": 0.16991744788022947, + "grad_norm": 0.06881438940763474, + "kl": 0.014251708984375, + "learning_rate": 1.1864858262411764e-06, + "loss": 0.0012, + "step": 4554 + }, + { + "clip_ratio": 0.004373001283966005, + "completion_length": 748.4643096923828, + "epoch": 0.1699547595727811, + "grad_norm": 0.07500675320625305, + "kl": 0.0200653076171875, + "learning_rate": 1.1856562963160233e-06, + "loss": -0.0055, + "num_tokens": 106518210.0, + "reward": 0.5796998664736748, + "reward_std": 0.25990896578878164, + "rewards/code_reward": 0.4296998311765492, + "rewards/format_reward": 1.5, + "step": 4555 + }, + { + "clip_ratio": 0.004000176850240678, + "epoch": 0.16999207126533278, + "grad_norm": 0.07376934587955475, + "kl": 0.0194854736328125, + "learning_rate": 1.1848285766093466e-06, + "loss": -0.0056, + "step": 4556 + }, + { + "clip_ratio": 0.004214460728690028, + "epoch": 0.17002938295788442, + "grad_norm": 0.08405973762273788, + "kl": 0.0193939208984375, + "learning_rate": 1.1840026674684432e-06, + "loss": -0.0056, + "step": 4557 + }, + { + "clip_ratio": 0.004187888989690691, + "completion_length": 830.2857513427734, + "epoch": 0.17006669465043608, + "grad_norm": 0.07106459885835648, + "kl": 0.0196990966796875, + "learning_rate": 1.1831785692398478e-06, + "loss": 0.0293, + "num_tokens": 106602856.0, + "reward": 0.39773469790816307, + "reward_std": 0.18331428803503513, + "rewards/code_reward": 0.24773469008505344, + "rewards/format_reward": 1.5, + "step": 4558 + }, + { + "clip_ratio": 0.004244801704771817, + "epoch": 0.17010400634298772, + "grad_norm": 0.0835878923535347, + "kl": 0.0202789306640625, + "learning_rate": 1.1823562822693373e-06, + "loss": 0.0292, + "step": 4559 + }, + { + "clip_ratio": 0.004215382214169949, + "epoch": 0.1701413180355394, + "grad_norm": 0.07732629030942917, + "kl": 0.019805908203125, + "learning_rate": 1.181535806901928e-06, + "loss": 0.0292, + "step": 4560 + }, + { + "clip_ratio": 0.004174297209829092, + "completion_length": 729.0893249511719, + "epoch": 0.17017862972809103, + "grad_norm": 0.07493416219949722, + "kl": 0.019287109375, + "learning_rate": 1.180717143481876e-06, + "loss": 0.0156, + "num_tokens": 106681653.0, + "reward": 0.6418066881597042, + "reward_std": 0.1170416846871376, + "rewards/code_reward": 0.4918066766113043, + "rewards/format_reward": 1.5, + "step": 4561 + }, + { + "clip_ratio": 0.0036942267324775457, + "epoch": 0.1702159414206427, + "grad_norm": 0.07109812647104263, + "kl": 0.01904296875, + "learning_rate": 1.1799002923526778e-06, + "loss": 0.0155, + "step": 4562 + }, + { + "clip_ratio": 0.004226404940709472, + "epoch": 0.17025325311319434, + "grad_norm": 0.07180389761924744, + "kl": 0.01956939697265625, + "learning_rate": 1.1790852538570682e-06, + "loss": 0.0155, + "step": 4563 + }, + { + "clip_ratio": 0.004204121476504952, + "completion_length": 637.9464645385742, + "epoch": 0.170290564805746, + "grad_norm": 0.06856434047222137, + "kl": 0.0154266357421875, + "learning_rate": 1.1782720283370227e-06, + "loss": -0.0172, + "num_tokens": 106745616.0, + "reward": 0.6017857380211353, + "reward_std": 0.14304947666823864, + "rewards/code_reward": 0.45178571017459035, + "rewards/format_reward": 1.5, + "step": 4564 + }, + { + "clip_ratio": 0.004541957983747125, + "epoch": 0.17032787649829764, + "grad_norm": 0.06237967312335968, + "kl": 0.015411376953125, + "learning_rate": 1.1774606161337563e-06, + "loss": -0.0174, + "step": 4565 + }, + { + "clip_ratio": 0.004751783329993486, + "epoch": 0.1703651881908493, + "grad_norm": 0.06795634329319, + "kl": 0.01544189453125, + "learning_rate": 1.1766510175877206e-06, + "loss": -0.0172, + "step": 4566 + }, + { + "clip_ratio": 0.00430522853275761, + "completion_length": 652.7857513427734, + "epoch": 0.17040249988340095, + "grad_norm": 0.045454468578100204, + "kl": 0.017822265625, + "learning_rate": 1.17584323303861e-06, + "loss": -0.0008, + "num_tokens": 106811110.0, + "reward": 0.19433500245213509, + "reward_std": 0.09283886849880219, + "rewards/code_reward": 0.04433497413992882, + "rewards/format_reward": 1.5, + "step": 4567 + }, + { + "clip_ratio": 0.004686467465944588, + "epoch": 0.17043981157595262, + "grad_norm": 0.04586433991789818, + "kl": 0.01763916015625, + "learning_rate": 1.1750372628253549e-06, + "loss": -0.0008, + "step": 4568 + }, + { + "clip_ratio": 0.004662451567128301, + "epoch": 0.17047712326850425, + "grad_norm": 0.04631416127085686, + "kl": 0.017547607421875, + "learning_rate": 1.1742331072861248e-06, + "loss": -0.0008, + "step": 4569 + }, + { + "clip_ratio": 0.0021955417469143867, + "completion_length": 706.1428833007812, + "epoch": 0.17051443496105592, + "grad_norm": 0.056987449526786804, + "kl": 0.0166015625, + "learning_rate": 1.1734307667583305e-06, + "loss": 0.0053, + "num_tokens": 106889504.0, + "reward": 0.7287545651197433, + "reward_std": 0.12274308037012815, + "rewards/code_reward": 0.5787545824423432, + "rewards/format_reward": 1.5, + "step": 4570 + }, + { + "clip_ratio": 0.001992985897231847, + "epoch": 0.17055174665360756, + "grad_norm": 0.05837460607290268, + "kl": 0.0170745849609375, + "learning_rate": 1.1726302415786176e-06, + "loss": 0.0053, + "step": 4571 + }, + { + "clip_ratio": 0.0020200307480990887, + "epoch": 0.17058905834615923, + "grad_norm": 0.057752642780542374, + "kl": 0.016937255859375, + "learning_rate": 1.1718315320828714e-06, + "loss": 0.0053, + "step": 4572 + }, + { + "clip_ratio": 0.0030759034561924636, + "completion_length": 582.5893173217773, + "epoch": 0.17062637003871087, + "grad_norm": 0.09191315621137619, + "kl": 0.02545166015625, + "learning_rate": 1.1710346386062166e-06, + "loss": 0.0097, + "num_tokens": 106948533.0, + "reward": 0.5235573388636112, + "reward_std": 0.08941856399178505, + "rewards/code_reward": 0.37355732917785645, + "rewards/format_reward": 1.5, + "step": 4573 + }, + { + "clip_ratio": 0.002737301925662905, + "epoch": 0.17066368173126253, + "grad_norm": 0.07307116687297821, + "kl": 0.02630615234375, + "learning_rate": 1.170239561483014e-06, + "loss": 0.0098, + "step": 4574 + }, + { + "clip_ratio": 0.0028520713094621897, + "epoch": 0.1707009934238142, + "grad_norm": 0.071467325091362, + "kl": 0.0256500244140625, + "learning_rate": 1.1694463010468643e-06, + "loss": 0.0098, + "step": 4575 + }, + { + "clip_ratio": 0.0027804076089523733, + "completion_length": 620.3571701049805, + "epoch": 0.17073830511636584, + "grad_norm": 0.07035429775714874, + "kl": 0.029510498046875, + "learning_rate": 1.1686548576306037e-06, + "loss": -0.0027, + "num_tokens": 107012421.0, + "reward": 0.7445260435342789, + "reward_std": 0.12210584618151188, + "rewards/code_reward": 0.5945260431617498, + "rewards/format_reward": 1.5, + "step": 4576 + }, + { + "clip_ratio": 0.003084543044678867, + "epoch": 0.1707756168089175, + "grad_norm": 0.06867628544569016, + "kl": 0.029022216796875, + "learning_rate": 1.1678652315663079e-06, + "loss": -0.0027, + "step": 4577 + }, + { + "clip_ratio": 0.002798978239297867, + "epoch": 0.17081292850146915, + "grad_norm": 0.06888876855373383, + "kl": 0.02923583984375, + "learning_rate": 1.167077423185289e-06, + "loss": -0.0029, + "step": 4578 + }, + { + "clip_ratio": 0.002727980143390596, + "completion_length": 578.3928833007812, + "epoch": 0.1708502401940208, + "grad_norm": 0.061004143208265305, + "kl": 0.0210723876953125, + "learning_rate": 1.166291432818096e-06, + "loss": 0.0052, + "num_tokens": 107078775.0, + "reward": 0.6878194026648998, + "reward_std": 0.08518718183040619, + "rewards/code_reward": 0.537819392979145, + "rewards/format_reward": 1.5, + "step": 4579 + }, + { + "clip_ratio": 0.00280043511884287, + "epoch": 0.17088755188657245, + "grad_norm": 0.060706160962581635, + "kl": 0.0204925537109375, + "learning_rate": 1.1655072607945173e-06, + "loss": 0.0051, + "step": 4580 + }, + { + "clip_ratio": 0.00283962459070608, + "epoch": 0.17092486357912412, + "grad_norm": 0.05621176213026047, + "kl": 0.0218963623046875, + "learning_rate": 1.1647249074435762e-06, + "loss": 0.0052, + "step": 4581 + }, + { + "clip_ratio": 0.004732510540634394, + "completion_length": 649.0714569091797, + "epoch": 0.17096217527167576, + "grad_norm": 0.0990590751171112, + "kl": 0.0178985595703125, + "learning_rate": 1.1639443730935337e-06, + "loss": -0.009, + "num_tokens": 107146693.0, + "reward": 0.770991176366806, + "reward_std": 0.2634056508541107, + "rewards/code_reward": 0.6209911610931158, + "rewards/format_reward": 1.5, + "step": 4582 + }, + { + "clip_ratio": 0.00447635562159121, + "epoch": 0.17099948696422743, + "grad_norm": 0.09653246402740479, + "kl": 0.017303466796875, + "learning_rate": 1.163165658071888e-06, + "loss": -0.0093, + "step": 4583 + }, + { + "clip_ratio": 0.004535724932793528, + "epoch": 0.17103679865677907, + "grad_norm": 0.08865941315889359, + "kl": 0.017425537109375, + "learning_rate": 1.162388762705372e-06, + "loss": -0.0094, + "step": 4584 + }, + { + "clip_ratio": 0.0027064071618951857, + "completion_length": 656.3750305175781, + "epoch": 0.17107411034933073, + "grad_norm": 0.09386590868234634, + "kl": 0.07122802734375, + "learning_rate": 1.1616136873199587e-06, + "loss": 0.0128, + "num_tokens": 107212108.0, + "reward": 0.6120255663990974, + "reward_std": 0.22366144508123398, + "rewards/code_reward": 0.46381125412881374, + "rewards/format_reward": 1.4821428656578064, + "step": 4585 + }, + { + "clip_ratio": 0.002742143115028739, + "epoch": 0.17111142204188237, + "grad_norm": 0.08443504571914673, + "kl": 0.070831298828125, + "learning_rate": 1.1608404322408533e-06, + "loss": 0.0126, + "step": 4586 + }, + { + "clip_ratio": 0.002693757531233132, + "epoch": 0.17114873373443404, + "grad_norm": 0.09251810610294342, + "kl": 0.07122802734375, + "learning_rate": 1.1600689977925005e-06, + "loss": 0.0125, + "step": 4587 + }, + { + "clip_ratio": 0.002298771549249068, + "completion_length": 686.982177734375, + "epoch": 0.17118604542698568, + "grad_norm": 0.057785291224718094, + "kl": 0.01361083984375, + "learning_rate": 1.1592993842985793e-06, + "loss": -0.0144, + "num_tokens": 107288183.0, + "reward": 0.8193210028111935, + "reward_std": 0.0855292024789378, + "rewards/code_reward": 0.6693210043595172, + "rewards/format_reward": 1.5, + "step": 4588 + }, + { + "clip_ratio": 0.0023325980873778462, + "epoch": 0.17122335711953734, + "grad_norm": 0.055363621562719345, + "kl": 0.013763427734375, + "learning_rate": 1.1585315920820053e-06, + "loss": -0.0146, + "step": 4589 + }, + { + "clip_ratio": 0.0018707485869526863, + "epoch": 0.17126066881208898, + "grad_norm": 0.054759684950113297, + "kl": 0.0135345458984375, + "learning_rate": 1.15776562146493e-06, + "loss": -0.0146, + "step": 4590 + }, + { + "clip_ratio": 0.0019017515587620437, + "completion_length": 667.9821853637695, + "epoch": 0.17129798050464065, + "grad_norm": 0.058369286358356476, + "kl": 0.012298583984375, + "learning_rate": 1.1570014727687406e-06, + "loss": -0.0053, + "num_tokens": 107352932.0, + "reward": 0.9421542435884476, + "reward_std": 0.24183353781700134, + "rewards/code_reward": 0.7921542450785637, + "rewards/format_reward": 1.5, + "step": 4591 + }, + { + "clip_ratio": 0.00194005377124995, + "epoch": 0.1713352921971923, + "grad_norm": 0.059323232620954514, + "kl": 0.012176513671875, + "learning_rate": 1.1562391463140597e-06, + "loss": -0.0053, + "step": 4592 + }, + { + "clip_ratio": 0.0018575406284071505, + "epoch": 0.17137260388974396, + "grad_norm": 0.05785789340734482, + "kl": 0.012115478515625, + "learning_rate": 1.155478642420745e-06, + "loss": -0.0053, + "step": 4593 + }, + { + "clip_ratio": 0.0041344462078996, + "completion_length": 582.2143173217773, + "epoch": 0.1714099155822956, + "grad_norm": 0.0875110849738121, + "kl": 0.021270751953125, + "learning_rate": 1.15471996140789e-06, + "loss": 0.0133, + "num_tokens": 107417260.0, + "reward": 0.7057821489870548, + "reward_std": 0.13219483150169253, + "rewards/code_reward": 0.5557821141555905, + "rewards/format_reward": 1.5, + "step": 4594 + }, + { + "clip_ratio": 0.0038797457236796618, + "epoch": 0.17144722727484726, + "grad_norm": 0.08921915292739868, + "kl": 0.01885986328125, + "learning_rate": 1.153963103593823e-06, + "loss": 0.0132, + "step": 4595 + }, + { + "clip_ratio": 0.004323358240071684, + "epoch": 0.1714845389673989, + "grad_norm": 0.08568331599235535, + "kl": 0.0200653076171875, + "learning_rate": 1.1532080692961079e-06, + "loss": 0.0136, + "step": 4596 + }, + { + "clip_ratio": 0.0029134260839782655, + "completion_length": 663.0714569091797, + "epoch": 0.17152185065995057, + "grad_norm": 0.08122823387384415, + "kl": 0.0232086181640625, + "learning_rate": 1.1524548588315423e-06, + "loss": 0.0214, + "num_tokens": 107494040.0, + "reward": 0.7150361686944962, + "reward_std": 0.2512827031314373, + "rewards/code_reward": 0.5650361627340317, + "rewards/format_reward": 1.5, + "step": 4597 + }, + { + "clip_ratio": 0.003198797523509711, + "epoch": 0.1715591623525022, + "grad_norm": 0.08058255910873413, + "kl": 0.022613525390625, + "learning_rate": 1.1517034725161599e-06, + "loss": 0.0212, + "step": 4598 + }, + { + "clip_ratio": 0.002971555630210787, + "epoch": 0.17159647404505388, + "grad_norm": 0.07869701832532883, + "kl": 0.0228729248046875, + "learning_rate": 1.150953910665228e-06, + "loss": 0.0212, + "step": 4599 + }, + { + "clip_ratio": 0.003536954987794161, + "completion_length": 538.928596496582, + "epoch": 0.17163378573760552, + "grad_norm": 0.0768325999379158, + "kl": 0.01971435546875, + "learning_rate": 1.15020617359325e-06, + "loss": 0.0057, + "num_tokens": 107550918.0, + "reward": 0.8789977133274078, + "reward_std": 0.22533227689564228, + "rewards/code_reward": 0.728997677564621, + "rewards/format_reward": 1.5, + "step": 4600 + }, + { + "clip_ratio": 0.0033985376940108836, + "epoch": 0.17167109743015718, + "grad_norm": 0.07665298134088516, + "kl": 0.0202789306640625, + "learning_rate": 1.1494602616139612e-06, + "loss": 0.0056, + "step": 4601 + }, + { + "clip_ratio": 0.003052067302633077, + "epoch": 0.17170840912270882, + "grad_norm": 0.07325685024261475, + "kl": 0.020294189453125, + "learning_rate": 1.1487161750403321e-06, + "loss": 0.0054, + "step": 4602 + }, + { + "clip_ratio": 0.0029810118721798062, + "completion_length": 627.8214416503906, + "epoch": 0.1717457208152605, + "grad_norm": 0.07629849761724472, + "kl": 0.02984619140625, + "learning_rate": 1.1479739141845696e-06, + "loss": 0.0033, + "num_tokens": 107616988.0, + "reward": 0.7658963799476624, + "reward_std": 0.3268798142671585, + "rewards/code_reward": 0.6158963516354561, + "rewards/format_reward": 1.5, + "step": 4603 + }, + { + "clip_ratio": 0.0033166371867991984, + "epoch": 0.17178303250781213, + "grad_norm": 0.09144412726163864, + "kl": 0.029876708984375, + "learning_rate": 1.1472334793581105e-06, + "loss": 0.0033, + "step": 4604 + }, + { + "clip_ratio": 0.0030650560511276126, + "epoch": 0.1718203442003638, + "grad_norm": 0.07213156670331955, + "kl": 0.0269775390625, + "learning_rate": 1.1464948708716293e-06, + "loss": 0.0031, + "step": 4605 + }, + { + "clip_ratio": 0.0040228982688859105, + "completion_length": 584.2857513427734, + "epoch": 0.17185765589291543, + "grad_norm": 0.08044511079788208, + "kl": 0.0176544189453125, + "learning_rate": 1.1457580890350305e-06, + "loss": -0.0045, + "num_tokens": 107685912.0, + "reward": 0.645231369882822, + "reward_std": 0.24700343515723944, + "rewards/code_reward": 0.4952313578687608, + "rewards/format_reward": 1.5, + "step": 4606 + }, + { + "clip_ratio": 0.004083295469172299, + "epoch": 0.1718949675854671, + "grad_norm": 0.07916970551013947, + "kl": 0.0178985595703125, + "learning_rate": 1.1450231341574546e-06, + "loss": -0.0045, + "step": 4607 + }, + { + "clip_ratio": 0.00424329488305375, + "epoch": 0.17193227927801874, + "grad_norm": 0.07869011908769608, + "kl": 0.01788330078125, + "learning_rate": 1.1442900065472764e-06, + "loss": -0.0046, + "step": 4608 + }, + { + "clip_ratio": 0.003495948505587876, + "completion_length": 578.1071624755859, + "epoch": 0.1719695909705704, + "grad_norm": 0.041493818163871765, + "kl": 0.0170745849609375, + "learning_rate": 1.1435587065121007e-06, + "loss": 0.0027, + "num_tokens": 107747434.0, + "reward": 0.6716643311083317, + "reward_std": 0.004167804028838873, + "rewards/code_reward": 0.521664310246706, + "rewards/format_reward": 1.5, + "step": 4609 + }, + { + "clip_ratio": 0.003448568400926888, + "epoch": 0.17200690266312205, + "grad_norm": 0.03784569725394249, + "kl": 0.0174102783203125, + "learning_rate": 1.1428292343587691e-06, + "loss": 0.0027, + "step": 4610 + }, + { + "clip_ratio": 0.003483773092739284, + "epoch": 0.1720442143556737, + "grad_norm": 0.041641391813755035, + "kl": 0.01715087890625, + "learning_rate": 1.142101590393354e-06, + "loss": 0.0027, + "step": 4611 + }, + { + "clip_ratio": 0.0021431699278764427, + "completion_length": 594.9285888671875, + "epoch": 0.17208152604822535, + "grad_norm": 0.03036302886903286, + "kl": 0.0153656005859375, + "learning_rate": 1.1413757749211602e-06, + "loss": 0.0075, + "num_tokens": 107805166.0, + "reward": 0.8107142709195614, + "reward_std": 0.12431129068136215, + "rewards/code_reward": 0.6607142835855484, + "rewards/format_reward": 1.5, + "step": 4612 + }, + { + "clip_ratio": 0.0019233835628256202, + "epoch": 0.17211883774077702, + "grad_norm": 0.030119679868221283, + "kl": 0.015838623046875, + "learning_rate": 1.140651788246728e-06, + "loss": 0.0076, + "step": 4613 + }, + { + "clip_ratio": 0.0018963106558658183, + "epoch": 0.17215614943332866, + "grad_norm": 0.03217928111553192, + "kl": 0.01531982421875, + "learning_rate": 1.1399296306738274e-06, + "loss": 0.0075, + "step": 4614 + }, + { + "clip_ratio": 0.002006198570597917, + "completion_length": 481.3393096923828, + "epoch": 0.17219346112588033, + "grad_norm": 0.06595055013895035, + "kl": 0.0167083740234375, + "learning_rate": 1.1392093025054632e-06, + "loss": 0.005, + "num_tokens": 107860163.0, + "reward": 1.1179265081882477, + "reward_std": 0.12000807747244835, + "rewards/code_reward": 0.9679265022277832, + "rewards/format_reward": 1.5, + "step": 4615 + }, + { + "clip_ratio": 0.0019227006705477834, + "epoch": 0.17223077281843197, + "grad_norm": 0.06142893806099892, + "kl": 0.01617431640625, + "learning_rate": 1.1384908040438717e-06, + "loss": 0.005, + "step": 4616 + }, + { + "clip_ratio": 0.0018030215287581086, + "epoch": 0.17226808451098363, + "grad_norm": 0.06468953937292099, + "kl": 0.016265869140625, + "learning_rate": 1.137774135590521e-06, + "loss": 0.0049, + "step": 4617 + }, + { + "clip_ratio": 0.0028554294258356094, + "completion_length": 720.4107437133789, + "epoch": 0.17230539620353527, + "grad_norm": 0.06119692698121071, + "kl": 0.023956298828125, + "learning_rate": 1.1370592974461127e-06, + "loss": 0.0017, + "num_tokens": 107930508.0, + "reward": 0.555613212287426, + "reward_std": 0.16854730248451233, + "rewards/code_reward": 0.40561316697858274, + "rewards/format_reward": 1.5, + "step": 4618 + }, + { + "clip_ratio": 0.002894064411520958, + "epoch": 0.17234270789608694, + "grad_norm": 0.06221595034003258, + "kl": 0.0225982666015625, + "learning_rate": 1.1363462899105785e-06, + "loss": 0.0017, + "step": 4619 + }, + { + "clip_ratio": 0.0030253748991526663, + "epoch": 0.17238001958863858, + "grad_norm": 0.06071529537439346, + "kl": 0.0227203369140625, + "learning_rate": 1.1356351132830843e-06, + "loss": 0.0017, + "step": 4620 + }, + { + "clip_ratio": 0.0023080537212081254, + "completion_length": 613.9107360839844, + "epoch": 0.17241733128119024, + "grad_norm": 0.06203092634677887, + "kl": 0.022247314453125, + "learning_rate": 1.1349257678620253e-06, + "loss": 0.0073, + "num_tokens": 107993041.0, + "reward": 0.7345283813774586, + "reward_std": 0.16467509604990482, + "rewards/code_reward": 0.584528380073607, + "rewards/format_reward": 1.5, + "step": 4621 + }, + { + "clip_ratio": 0.0024467622861266136, + "epoch": 0.17245464297374188, + "grad_norm": 0.061706509441137314, + "kl": 0.02203369140625, + "learning_rate": 1.13421825394503e-06, + "loss": 0.0074, + "step": 4622 + }, + { + "clip_ratio": 0.0023994052316993475, + "epoch": 0.17249195466629355, + "grad_norm": 0.060527998954057693, + "kl": 0.021881103515625, + "learning_rate": 1.1335125718289598e-06, + "loss": 0.0074, + "step": 4623 + }, + { + "clip_ratio": 0.0032064300612546504, + "completion_length": 650.5000305175781, + "epoch": 0.1725292663588452, + "grad_norm": 0.032000333070755005, + "kl": 0.0151824951171875, + "learning_rate": 1.1328087218099032e-06, + "loss": -0.0012, + "num_tokens": 108057945.0, + "reward": 0.4947694167494774, + "reward_std": 0.11689986288547516, + "rewards/code_reward": 0.3447694033384323, + "rewards/format_reward": 1.5, + "step": 4624 + }, + { + "clip_ratio": 0.003005675331223756, + "epoch": 0.17256657805139686, + "grad_norm": 0.03092026151716709, + "kl": 0.01568603515625, + "learning_rate": 1.132106704183184e-06, + "loss": -0.0011, + "step": 4625 + }, + { + "clip_ratio": 0.003208957496099174, + "epoch": 0.1726038897439485, + "grad_norm": 0.030740896239876747, + "kl": 0.015289306640625, + "learning_rate": 1.1314065192433559e-06, + "loss": -0.0011, + "step": 4626 + }, + { + "clip_ratio": 0.003378237539436668, + "completion_length": 754.8214492797852, + "epoch": 0.17264120143650016, + "grad_norm": 0.06980716437101364, + "kl": 0.0212249755859375, + "learning_rate": 1.130708167284203e-06, + "loss": -0.0013, + "num_tokens": 108129979.0, + "reward": 0.6628017649054527, + "reward_std": 0.15608486533164978, + "rewards/code_reward": 0.5128017645329237, + "rewards/format_reward": 1.5, + "step": 4627 + }, + { + "clip_ratio": 0.0028917622403241694, + "epoch": 0.17267851312905183, + "grad_norm": 0.07046365737915039, + "kl": 0.0214996337890625, + "learning_rate": 1.1300116485987416e-06, + "loss": -0.0014, + "step": 4628 + }, + { + "clip_ratio": 0.003085459058638662, + "epoch": 0.17271582482160347, + "grad_norm": 0.06330619752407074, + "kl": 0.02130126953125, + "learning_rate": 1.1293169634792167e-06, + "loss": -0.0016, + "step": 4629 + }, + { + "clip_ratio": 0.0022706740419380367, + "completion_length": 654.4464569091797, + "epoch": 0.17275313651415514, + "grad_norm": 0.05335311219096184, + "kl": 0.0162811279296875, + "learning_rate": 1.1286241122171057e-06, + "loss": -0.0052, + "num_tokens": 108207886.0, + "reward": 0.7074611075222492, + "reward_std": 0.11709216982126236, + "rewards/code_reward": 0.5574610903859138, + "rewards/format_reward": 1.5, + "step": 4630 + }, + { + "clip_ratio": 0.002196363697294146, + "epoch": 0.17279044820670678, + "grad_norm": 0.04917552322149277, + "kl": 0.016998291015625, + "learning_rate": 1.1279330951031173e-06, + "loss": -0.005, + "step": 4631 + }, + { + "clip_ratio": 0.002293766854563728, + "epoch": 0.17282775989925844, + "grad_norm": 0.050910092890262604, + "kl": 0.0160064697265625, + "learning_rate": 1.127243912427188e-06, + "loss": -0.0052, + "step": 4632 + }, + { + "clip_ratio": 0.00247967429459095, + "completion_length": 549.2678833007812, + "epoch": 0.17286507159181008, + "grad_norm": 0.06430565565824509, + "kl": 0.0140228271484375, + "learning_rate": 1.1265565644784867e-06, + "loss": -0.004, + "num_tokens": 108261017.0, + "reward": 0.8434304893016815, + "reward_std": 0.26466479897499084, + "rewards/code_reward": 0.6934304982423782, + "rewards/format_reward": 1.5, + "step": 4633 + }, + { + "clip_ratio": 0.0019209621823392808, + "epoch": 0.17290238328436175, + "grad_norm": 0.06365299969911575, + "kl": 0.014190673828125, + "learning_rate": 1.125871051545411e-06, + "loss": -0.0044, + "step": 4634 + }, + { + "clip_ratio": 0.0019501790520735085, + "epoch": 0.1729396949769134, + "grad_norm": 0.061750926077365875, + "kl": 0.01416015625, + "learning_rate": 1.125187373915591e-06, + "loss": -0.0045, + "step": 4635 + }, + { + "clip_ratio": 0.003606435318943113, + "completion_length": 670.4643173217773, + "epoch": 0.17297700666946506, + "grad_norm": 0.07529428601264954, + "kl": 0.026153564453125, + "learning_rate": 1.1245055318758844e-06, + "loss": -0.0046, + "num_tokens": 108331435.0, + "reward": 0.5643688589334488, + "reward_std": 0.29106811434030533, + "rewards/code_reward": 0.4143688753247261, + "rewards/format_reward": 1.5, + "step": 4636 + }, + { + "clip_ratio": 0.003488742164336145, + "epoch": 0.1730143183620167, + "grad_norm": 0.07457151263952255, + "kl": 0.0258331298828125, + "learning_rate": 1.1238255257123787e-06, + "loss": -0.0045, + "step": 4637 + }, + { + "clip_ratio": 0.0035235225805081427, + "epoch": 0.17305163005456836, + "grad_norm": 0.07470888644456863, + "kl": 0.02618408203125, + "learning_rate": 1.1231473557103928e-06, + "loss": -0.0045, + "step": 4638 + }, + { + "clip_ratio": 0.005056629539467394, + "completion_length": 551.2321548461914, + "epoch": 0.17308894174712, + "grad_norm": 0.07004967331886292, + "kl": 0.0178375244140625, + "learning_rate": 1.1224710221544736e-06, + "loss": -0.0019, + "num_tokens": 108391340.0, + "reward": 0.4628005847334862, + "reward_std": 0.2387058287858963, + "rewards/code_reward": 0.31280056573450565, + "rewards/format_reward": 1.5, + "step": 4639 + }, + { + "clip_ratio": 0.004366845882032067, + "epoch": 0.17312625343967167, + "grad_norm": 0.07652102410793304, + "kl": 0.0175018310546875, + "learning_rate": 1.1217965253283983e-06, + "loss": -0.0017, + "step": 4640 + }, + { + "clip_ratio": 0.004758750437758863, + "epoch": 0.1731635651322233, + "grad_norm": 0.06898526102304459, + "kl": 0.017852783203125, + "learning_rate": 1.1211238655151741e-06, + "loss": -0.002, + "step": 4641 + }, + { + "clip_ratio": 0.003759671642910689, + "completion_length": 813.357177734375, + "epoch": 0.17320087682477497, + "grad_norm": 0.07322220504283905, + "kl": 0.0182952880859375, + "learning_rate": 1.1204530429970347e-06, + "loss": 0.0179, + "num_tokens": 108476144.0, + "reward": 0.6652454473078251, + "reward_std": 0.2035085808020085, + "rewards/code_reward": 0.5152453889604658, + "rewards/format_reward": 1.5, + "step": 4642 + }, + { + "clip_ratio": 0.003890397696522996, + "epoch": 0.1732381885173266, + "grad_norm": 0.07443380355834961, + "kl": 0.0183868408203125, + "learning_rate": 1.1197840580554466e-06, + "loss": 0.0178, + "step": 4643 + }, + { + "clip_ratio": 0.003334593609906733, + "epoch": 0.17327550020987828, + "grad_norm": 0.07751709967851639, + "kl": 0.018157958984375, + "learning_rate": 1.119116910971102e-06, + "loss": 0.0177, + "step": 4644 + }, + { + "clip_ratio": 0.004694767587352544, + "completion_length": 770.9643249511719, + "epoch": 0.17331281190242992, + "grad_norm": 0.07824430614709854, + "kl": 0.0181732177734375, + "learning_rate": 1.1184516020239248e-06, + "loss": 0.0028, + "num_tokens": 108560884.0, + "reward": 0.45841188356280327, + "reward_std": 0.2682241937145591, + "rewards/code_reward": 0.3084118729457259, + "rewards/format_reward": 1.5, + "step": 4645 + }, + { + "clip_ratio": 0.005115268984809518, + "epoch": 0.1733501235949816, + "grad_norm": 0.07516408711671829, + "kl": 0.0178985595703125, + "learning_rate": 1.1177881314930659e-06, + "loss": 0.0027, + "step": 4646 + }, + { + "clip_ratio": 0.004245338961482048, + "epoch": 0.17338743528753323, + "grad_norm": 0.078591488301754, + "kl": 0.01837158203125, + "learning_rate": 1.1171264996569036e-06, + "loss": 0.0024, + "step": 4647 + }, + { + "clip_ratio": 0.003579170675948262, + "completion_length": 760.5536041259766, + "epoch": 0.1734247469800849, + "grad_norm": 0.07177338749170303, + "kl": 0.02801513671875, + "learning_rate": 1.1164667067930492e-06, + "loss": 0.0071, + "num_tokens": 108631741.0, + "reward": 0.5257267840206623, + "reward_std": 0.11127032630611211, + "rewards/code_reward": 0.37572676833951846, + "rewards/format_reward": 1.5, + "step": 4648 + }, + { + "clip_ratio": 0.00364395824726671, + "epoch": 0.17346205867263653, + "grad_norm": 0.07474454492330551, + "kl": 0.032958984375, + "learning_rate": 1.1158087531783371e-06, + "loss": 0.007, + "step": 4649 + }, + { + "clip_ratio": 0.0034211668535135686, + "epoch": 0.1734993703651882, + "grad_norm": 0.06999935954809189, + "kl": 0.0271453857421875, + "learning_rate": 1.1151526390888332e-06, + "loss": 0.0067, + "step": 4650 + }, + { + "clip_ratio": 0.004414349794387817, + "completion_length": 723.357177734375, + "epoch": 0.17353668205773984, + "grad_norm": 0.12519921362400055, + "kl": 0.0318756103515625, + "learning_rate": 1.114498364799831e-06, + "loss": 0.0021, + "num_tokens": 108702097.0, + "reward": 0.5114504434168339, + "reward_std": 0.13872225349768996, + "rewards/code_reward": 0.3614504439756274, + "rewards/format_reward": 1.5, + "step": 4651 + }, + { + "clip_ratio": 0.003909348044544458, + "epoch": 0.1735739937502915, + "grad_norm": 0.08096978068351746, + "kl": 0.0254974365234375, + "learning_rate": 1.1138459305858516e-06, + "loss": 0.0019, + "step": 4652 + }, + { + "clip_ratio": 0.004618531558662653, + "epoch": 0.17361130544284314, + "grad_norm": 0.07833287864923477, + "kl": 0.0246124267578125, + "learning_rate": 1.1131953367206442e-06, + "loss": 0.0021, + "step": 4653 + }, + { + "clip_ratio": 0.0013376366114243865, + "completion_length": 518.2143249511719, + "epoch": 0.1736486171353948, + "grad_norm": 0.05493692308664322, + "kl": 0.021087646484375, + "learning_rate": 1.1125465834771868e-06, + "loss": 0.0032, + "num_tokens": 108760615.0, + "reward": 0.999297246336937, + "reward_std": 0.120208490639925, + "rewards/code_reward": 0.8492972701787949, + "rewards/format_reward": 1.5, + "step": 4654 + }, + { + "clip_ratio": 0.0017916225478984416, + "epoch": 0.17368592882794645, + "grad_norm": 0.05583493411540985, + "kl": 0.0211639404296875, + "learning_rate": 1.1118996711276829e-06, + "loss": 0.0033, + "step": 4655 + }, + { + "clip_ratio": 0.0017335563898086548, + "epoch": 0.17372324052049812, + "grad_norm": 0.0577990859746933, + "kl": 0.0208282470703125, + "learning_rate": 1.1112545999435661e-06, + "loss": 0.0033, + "step": 4656 + }, + { + "clip_ratio": 0.0055189087288454175, + "completion_length": 630.5893096923828, + "epoch": 0.17376055221304976, + "grad_norm": 0.07442331314086914, + "kl": 0.0201263427734375, + "learning_rate": 1.1106113701954954e-06, + "loss": 0.0202, + "num_tokens": 108814422.0, + "reward": 0.5414951331913471, + "reward_std": 0.27437320724129677, + "rewards/code_reward": 0.39149510115385056, + "rewards/format_reward": 1.5, + "step": 4657 + }, + { + "clip_ratio": 0.00458738551242277, + "epoch": 0.17379786390560142, + "grad_norm": 0.07118334621191025, + "kl": 0.0199737548828125, + "learning_rate": 1.109969982153358e-06, + "loss": 0.0198, + "step": 4658 + }, + { + "clip_ratio": 0.004598052590154111, + "epoch": 0.17383517559815306, + "grad_norm": 0.07453103363513947, + "kl": 0.020263671875, + "learning_rate": 1.109330436086269e-06, + "loss": 0.0199, + "step": 4659 + }, + { + "clip_ratio": 0.0030709195998497307, + "completion_length": 582.053596496582, + "epoch": 0.17387248729070473, + "grad_norm": 0.06732434034347534, + "kl": 0.0170440673828125, + "learning_rate": 1.1086927322625694e-06, + "loss": 0.0015, + "num_tokens": 108876683.0, + "reward": 0.7324753254652023, + "reward_std": 0.28300363197922707, + "rewards/code_reward": 0.5824753195047379, + "rewards/format_reward": 1.5, + "step": 4660 + }, + { + "clip_ratio": 0.002845007285941392, + "epoch": 0.17390979898325637, + "grad_norm": 0.9626320004463196, + "kl": 0.032989501953125, + "learning_rate": 1.1080568709498291e-06, + "loss": 0.0018, + "step": 4661 + }, + { + "clip_ratio": 0.002876255370210856, + "epoch": 0.17394711067580804, + "grad_norm": 0.06473027169704437, + "kl": 0.0169830322265625, + "learning_rate": 1.1074228524148427e-06, + "loss": 0.0015, + "step": 4662 + }, + { + "clip_ratio": 0.002783374220598489, + "completion_length": 467.8928756713867, + "epoch": 0.17398442236835968, + "grad_norm": 0.08092832565307617, + "kl": 0.0216827392578125, + "learning_rate": 1.1067906769236325e-06, + "loss": 0.0139, + "num_tokens": 108932397.0, + "reward": 0.8001875653862953, + "reward_std": 0.13321477081626654, + "rewards/code_reward": 0.6501875519752502, + "rewards/format_reward": 1.5, + "step": 4663 + }, + { + "clip_ratio": 0.0027825062279589474, + "epoch": 0.17402173406091134, + "grad_norm": 0.0835123062133789, + "kl": 0.0214385986328125, + "learning_rate": 1.106160344741448e-06, + "loss": 0.0141, + "step": 4664 + }, + { + "clip_ratio": 0.0031100272317416966, + "epoch": 0.17405904575346298, + "grad_norm": 0.08415407687425613, + "kl": 0.020233154296875, + "learning_rate": 1.1055318561327646e-06, + "loss": 0.0141, + "step": 4665 + }, + { + "clip_ratio": 0.004077606368809938, + "completion_length": 819.9464721679688, + "epoch": 0.17409635744601465, + "grad_norm": 0.05976257473230362, + "kl": 0.01513671875, + "learning_rate": 1.104905211361285e-06, + "loss": -0.0035, + "num_tokens": 109013300.0, + "reward": 0.5886730924248695, + "reward_std": 0.24018286913633347, + "rewards/code_reward": 0.4386730380356312, + "rewards/format_reward": 1.5, + "step": 4666 + }, + { + "clip_ratio": 0.004094318370334804, + "epoch": 0.1741336691385663, + "grad_norm": 0.05834727734327316, + "kl": 0.0148162841796875, + "learning_rate": 1.1042804106899365e-06, + "loss": -0.0036, + "step": 4667 + }, + { + "clip_ratio": 0.00402359914733097, + "epoch": 0.17417098083111796, + "grad_norm": 0.056043580174446106, + "kl": 0.014923095703125, + "learning_rate": 1.1036574543808753e-06, + "loss": -0.0038, + "step": 4668 + }, + { + "clip_ratio": 0.003352468484081328, + "completion_length": 710.8928833007812, + "epoch": 0.1742082925236696, + "grad_norm": 0.06761772185564041, + "kl": 0.022613525390625, + "learning_rate": 1.103036342695481e-06, + "loss": -0.006, + "num_tokens": 109086150.0, + "reward": 0.6419254653155804, + "reward_std": 0.165767852216959, + "rewards/code_reward": 0.4919254556298256, + "rewards/format_reward": 1.5, + "step": 4669 + }, + { + "clip_ratio": 0.00366726063657552, + "epoch": 0.17424560421622126, + "grad_norm": 0.06522256135940552, + "kl": 0.02288818359375, + "learning_rate": 1.1024170758943616e-06, + "loss": -0.0061, + "step": 4670 + }, + { + "clip_ratio": 0.003387488191947341, + "epoch": 0.1742829159087729, + "grad_norm": 0.06273630261421204, + "kl": 0.022613525390625, + "learning_rate": 1.1017996542373496e-06, + "loss": -0.0061, + "step": 4671 + }, + { + "clip_ratio": 0.0029612143407575786, + "completion_length": 593.3214569091797, + "epoch": 0.17432022760132457, + "grad_norm": 0.0681232437491417, + "kl": 0.018035888671875, + "learning_rate": 1.1011840779835034e-06, + "loss": -0.0134, + "num_tokens": 109151744.0, + "reward": 0.8872199133038521, + "reward_std": 0.25947612151503563, + "rewards/code_reward": 0.7372198663651943, + "rewards/format_reward": 1.5, + "step": 4672 + }, + { + "clip_ratio": 0.002931646886281669, + "epoch": 0.1743575392938762, + "grad_norm": 0.07099587470293045, + "kl": 0.0177001953125, + "learning_rate": 1.1005703473911073e-06, + "loss": -0.0135, + "step": 4673 + }, + { + "clip_ratio": 0.0028188886935822666, + "epoch": 0.17439485098642787, + "grad_norm": 0.07596860080957413, + "kl": 0.0178375244140625, + "learning_rate": 1.0999584627176727e-06, + "loss": -0.0137, + "step": 4674 + }, + { + "clip_ratio": 0.0042031940538436174, + "completion_length": 660.5714721679688, + "epoch": 0.1744321626789795, + "grad_norm": 0.07688650488853455, + "kl": 0.0225830078125, + "learning_rate": 1.0993484242199326e-06, + "loss": -0.0047, + "num_tokens": 109219214.0, + "reward": 0.6209481172263622, + "reward_std": 0.17826257646083832, + "rewards/code_reward": 0.47094807773828506, + "rewards/format_reward": 1.5, + "step": 4675 + }, + { + "clip_ratio": 0.004022015782538801, + "epoch": 0.17446947437153118, + "grad_norm": 0.0904514342546463, + "kl": 0.0259552001953125, + "learning_rate": 1.0987402321538498e-06, + "loss": -0.0047, + "step": 4676 + }, + { + "clip_ratio": 0.003971964120864868, + "epoch": 0.17450678606408282, + "grad_norm": 0.07785293459892273, + "kl": 0.0253753662109375, + "learning_rate": 1.0981338867746087e-06, + "loss": -0.0047, + "step": 4677 + }, + { + "clip_ratio": 0.0034927353844977915, + "completion_length": 547.7321701049805, + "epoch": 0.1745440977566345, + "grad_norm": 0.06105601787567139, + "kl": 0.021636962890625, + "learning_rate": 1.0975293883366217e-06, + "loss": -0.0029, + "num_tokens": 109279161.0, + "reward": 0.503933247178793, + "reward_std": 0.1437048725783825, + "rewards/code_reward": 0.3539332393556833, + "rewards/format_reward": 1.5, + "step": 4678 + }, + { + "clip_ratio": 0.003987942938692868, + "epoch": 0.17458140944918613, + "grad_norm": 0.060355134308338165, + "kl": 0.022064208984375, + "learning_rate": 1.0969267370935257e-06, + "loss": -0.0028, + "step": 4679 + }, + { + "clip_ratio": 0.003645249526016414, + "epoch": 0.1746187211417378, + "grad_norm": 0.06231670081615448, + "kl": 0.0219573974609375, + "learning_rate": 1.0963259332981805e-06, + "loss": -0.003, + "step": 4680 + }, + { + "clip_ratio": 0.00289878249168396, + "completion_length": 853.6786041259766, + "epoch": 0.17465603283428943, + "grad_norm": 0.10931673645973206, + "kl": 0.0174102783203125, + "learning_rate": 1.0957269772026735e-06, + "loss": 0.0076, + "num_tokens": 109364767.0, + "reward": 0.6211368590593338, + "reward_std": 0.2168826088309288, + "rewards/code_reward": 0.4731011353433132, + "rewards/format_reward": 1.480357140302658, + "step": 4681 + }, + { + "clip_ratio": 0.0029090146417729557, + "epoch": 0.1746933445268411, + "grad_norm": 0.06663733720779419, + "kl": 0.0171661376953125, + "learning_rate": 1.0951298690583143e-06, + "loss": 0.0076, + "step": 4682 + }, + { + "clip_ratio": 0.0028334843227639794, + "epoch": 0.17473065621939277, + "grad_norm": 0.06763603538274765, + "kl": 0.01702880859375, + "learning_rate": 1.0945346091156396e-06, + "loss": 0.0075, + "step": 4683 + }, + { + "clip_ratio": 0.0026171592762693763, + "completion_length": 677.0714797973633, + "epoch": 0.1747679679119444, + "grad_norm": 0.04278041422367096, + "kl": 0.0166778564453125, + "learning_rate": 1.0939411976244086e-06, + "loss": -0.0044, + "num_tokens": 109440833.0, + "reward": 0.7488331012427807, + "reward_std": 0.035717856138944626, + "rewards/code_reward": 0.5988330990076065, + "rewards/format_reward": 1.5, + "step": 4684 + }, + { + "clip_ratio": 0.0030316811753436923, + "epoch": 0.17480527960449607, + "grad_norm": 0.041002318263053894, + "kl": 0.01611328125, + "learning_rate": 1.093349634833606e-06, + "loss": -0.0044, + "step": 4685 + }, + { + "clip_ratio": 0.003118123160675168, + "epoch": 0.1748425912970477, + "grad_norm": 0.04777739942073822, + "kl": 0.016571044921875, + "learning_rate": 1.0927599209914407e-06, + "loss": -0.0044, + "step": 4686 + }, + { + "clip_ratio": 0.004538132227025926, + "completion_length": 757.4107513427734, + "epoch": 0.17487990298959938, + "grad_norm": 0.07947534322738647, + "kl": 0.023040771484375, + "learning_rate": 1.092172056345345e-06, + "loss": 0.0246, + "num_tokens": 109511206.0, + "reward": 0.455290786921978, + "reward_std": 0.2121727615594864, + "rewards/code_reward": 0.30529073998332024, + "rewards/format_reward": 1.5, + "step": 4687 + }, + { + "clip_ratio": 0.004087837703991681, + "epoch": 0.17491721468215102, + "grad_norm": 0.0776732936501503, + "kl": 0.0217132568359375, + "learning_rate": 1.0915860411419768e-06, + "loss": 0.0242, + "step": 4688 + }, + { + "clip_ratio": 0.004080110578797758, + "epoch": 0.17495452637470268, + "grad_norm": 0.07934108376502991, + "kl": 0.021759033203125, + "learning_rate": 1.0910018756272172e-06, + "loss": 0.0243, + "step": 4689 + }, + { + "clip_ratio": 0.0035830072010867298, + "completion_length": 781.9643249511719, + "epoch": 0.17499183806725432, + "grad_norm": 0.06380458176136017, + "kl": 0.014312744140625, + "learning_rate": 1.0904195600461707e-06, + "loss": -0.0045, + "num_tokens": 109581358.0, + "reward": 0.5421933084726334, + "reward_std": 0.18074816837906837, + "rewards/code_reward": 0.39219329319894314, + "rewards/format_reward": 1.5, + "step": 4690 + }, + { + "clip_ratio": 0.002960453159175813, + "epoch": 0.175029149759806, + "grad_norm": 0.06471610814332962, + "kl": 0.0140228271484375, + "learning_rate": 1.0898390946431662e-06, + "loss": -0.0044, + "step": 4691 + }, + { + "clip_ratio": 0.0031403183238580823, + "epoch": 0.17506646145235763, + "grad_norm": 0.06360378861427307, + "kl": 0.0143585205078125, + "learning_rate": 1.0892604796617568e-06, + "loss": -0.0045, + "step": 4692 + }, + { + "clip_ratio": 0.0019268859759904444, + "completion_length": 585.053596496582, + "epoch": 0.1751037731449093, + "grad_norm": 0.06659739464521408, + "kl": 0.0220794677734375, + "learning_rate": 1.0886837153447174e-06, + "loss": 0.0043, + "num_tokens": 109637879.0, + "reward": 0.7539915814995766, + "reward_std": 0.05650217481888831, + "rewards/code_reward": 0.6039915946312249, + "rewards/format_reward": 1.5, + "step": 4693 + }, + { + "clip_ratio": 0.002026072528678924, + "epoch": 0.17514108483746094, + "grad_norm": 0.06623296439647675, + "kl": 0.0217742919921875, + "learning_rate": 1.088108801934049e-06, + "loss": 0.0042, + "step": 4694 + }, + { + "clip_ratio": 0.001531690824776888, + "epoch": 0.1751783965300126, + "grad_norm": 0.07333555072546005, + "kl": 0.0221099853515625, + "learning_rate": 1.087535739670973e-06, + "loss": 0.0042, + "step": 4695 + }, + { + "clip_ratio": 0.004422827041707933, + "completion_length": 557.8393173217773, + "epoch": 0.17521570822256424, + "grad_norm": 0.07251855731010437, + "kl": 0.02471923828125, + "learning_rate": 1.0869645287959372e-06, + "loss": -0.0043, + "num_tokens": 109698896.0, + "reward": 0.8032613471150398, + "reward_std": 0.17519478127360344, + "rewards/code_reward": 0.6532612927258015, + "rewards/format_reward": 1.5, + "step": 4696 + }, + { + "clip_ratio": 0.0046627523843199015, + "epoch": 0.1752530199151159, + "grad_norm": 0.0727694034576416, + "kl": 0.024322509765625, + "learning_rate": 1.0863951695486101e-06, + "loss": -0.0042, + "step": 4697 + }, + { + "clip_ratio": 0.0043434634571895, + "epoch": 0.17529033160766755, + "grad_norm": 0.07148072123527527, + "kl": 0.02490234375, + "learning_rate": 1.085827662167885e-06, + "loss": -0.0043, + "step": 4698 + }, + { + "clip_ratio": 0.004082098603248596, + "completion_length": 658.0714569091797, + "epoch": 0.17532764330021922, + "grad_norm": 0.06389391422271729, + "kl": 0.01654052734375, + "learning_rate": 1.0852620068918772e-06, + "loss": 0.0135, + "num_tokens": 109766198.0, + "reward": 0.5377434380352497, + "reward_std": 0.17636644560843706, + "rewards/code_reward": 0.3877434451133013, + "rewards/format_reward": 1.5, + "step": 4699 + }, + { + "clip_ratio": 0.003636073670350015, + "epoch": 0.17536495499277086, + "grad_norm": 0.06301137059926987, + "kl": 0.017059326171875, + "learning_rate": 1.0846982039579245e-06, + "loss": 0.0134, + "step": 4700 + }, + { + "clip_ratio": 0.003888288338202983, + "epoch": 0.17540226668532252, + "grad_norm": 0.06453907489776611, + "kl": 0.01690673828125, + "learning_rate": 1.0841362536025884e-06, + "loss": 0.0135, + "step": 4701 + }, + { + "clip_ratio": 0.004726887098513544, + "completion_length": 666.3393249511719, + "epoch": 0.17543957837787416, + "grad_norm": 0.08576319366693497, + "kl": 0.0216064453125, + "learning_rate": 1.083576156061654e-06, + "loss": -0.0054, + "num_tokens": 109840599.0, + "reward": 0.5961866304278374, + "reward_std": 0.28980111749842763, + "rewards/code_reward": 0.4461866207420826, + "rewards/format_reward": 1.5, + "step": 4702 + }, + { + "clip_ratio": 0.004936827579513192, + "epoch": 0.17547689007042583, + "grad_norm": 0.0783696249127388, + "kl": 0.02117919921875, + "learning_rate": 1.0830179115701263e-06, + "loss": -0.0053, + "step": 4703 + }, + { + "clip_ratio": 0.00438063230831176, + "epoch": 0.17551420176297747, + "grad_norm": 0.080999456346035, + "kl": 0.0211944580078125, + "learning_rate": 1.0824615203622349e-06, + "loss": -0.0057, + "step": 4704 + }, + { + "clip_ratio": 0.004662185092456639, + "completion_length": 824.1964721679688, + "epoch": 0.17555151345552913, + "grad_norm": 0.07290715724229813, + "kl": 0.0233306884765625, + "learning_rate": 1.0819069826714313e-06, + "loss": 0.0052, + "num_tokens": 109929846.0, + "reward": 0.5967292301356792, + "reward_std": 0.26081532984972, + "rewards/code_reward": 0.4467292055487633, + "rewards/format_reward": 1.5, + "step": 4705 + }, + { + "clip_ratio": 0.004926713299937546, + "epoch": 0.17558882514808077, + "grad_norm": 0.07144549489021301, + "kl": 0.0214385986328125, + "learning_rate": 1.0813542987303893e-06, + "loss": 0.005, + "step": 4706 + }, + { + "clip_ratio": 0.004910602292511612, + "epoch": 0.17562613684063244, + "grad_norm": 0.07120677828788757, + "kl": 0.021759033203125, + "learning_rate": 1.0808034687710047e-06, + "loss": 0.005, + "step": 4707 + }, + { + "clip_ratio": 0.005380555929150432, + "completion_length": 752.9643096923828, + "epoch": 0.17566344853318408, + "grad_norm": 0.07282764464616776, + "kl": 0.023101806640625, + "learning_rate": 1.0802544930243954e-06, + "loss": -0.0052, + "num_tokens": 110008944.0, + "reward": 0.5130387656390667, + "reward_std": 0.23572266474366188, + "rewards/code_reward": 0.3630387410521507, + "rewards/format_reward": 1.5, + "step": 4708 + }, + { + "clip_ratio": 0.005186505324672908, + "epoch": 0.17570076022573575, + "grad_norm": 0.07126310467720032, + "kl": 0.02288818359375, + "learning_rate": 1.0797073717209014e-06, + "loss": -0.0052, + "step": 4709 + }, + { + "clip_ratio": 0.005318060051649809, + "epoch": 0.1757380719182874, + "grad_norm": 0.07680342346429825, + "kl": 0.022552490234375, + "learning_rate": 1.079162105090085e-06, + "loss": -0.0051, + "step": 4710 + }, + { + "clip_ratio": 0.004202237701974809, + "completion_length": 545.5178833007812, + "epoch": 0.17577538361083905, + "grad_norm": 0.09726255387067795, + "kl": 0.026397705078125, + "learning_rate": 1.0786186933607288e-06, + "loss": 0.0046, + "num_tokens": 110067549.0, + "reward": 0.8265052810311317, + "reward_std": 0.21233223844319582, + "rewards/code_reward": 0.6765052601695061, + "rewards/format_reward": 1.5, + "step": 4711 + }, + { + "clip_ratio": 0.0039541206788271666, + "epoch": 0.1758126953033907, + "grad_norm": 0.10781541466712952, + "kl": 0.028472900390625, + "learning_rate": 1.0780771367608397e-06, + "loss": 0.0047, + "step": 4712 + }, + { + "clip_ratio": 0.003831505251582712, + "epoch": 0.17585000699594236, + "grad_norm": 0.08742685616016388, + "kl": 0.026214599609375, + "learning_rate": 1.0775374355176447e-06, + "loss": 0.0042, + "step": 4713 + }, + { + "clip_ratio": 0.0013573620235547423, + "completion_length": 572.2500305175781, + "epoch": 0.175887318688494, + "grad_norm": 0.04355664178729057, + "kl": 0.01045989990234375, + "learning_rate": 1.0769995898575915e-06, + "loss": -0.0018, + "num_tokens": 110133125.0, + "reward": 0.9639930203557014, + "reward_std": 0.13037475012242794, + "rewards/code_reward": 0.813993014395237, + "rewards/format_reward": 1.5, + "step": 4714 + }, + { + "clip_ratio": 0.0016824278282001615, + "epoch": 0.17592463038104567, + "grad_norm": 0.044127628207206726, + "kl": 0.01065826416015625, + "learning_rate": 1.076463600006351e-06, + "loss": -0.0018, + "step": 4715 + }, + { + "clip_ratio": 0.0017242543981410563, + "epoch": 0.1759619420735973, + "grad_norm": 0.04587621986865997, + "kl": 0.010528564453125, + "learning_rate": 1.0759294661888137e-06, + "loss": -0.0017, + "step": 4716 + }, + { + "clip_ratio": 0.0020323681528680027, + "completion_length": 708.7857437133789, + "epoch": 0.17599925376614897, + "grad_norm": 0.05359084904193878, + "kl": 0.019775390625, + "learning_rate": 1.075397188629093e-06, + "loss": 0.0476, + "num_tokens": 110200005.0, + "reward": 0.5987499989569187, + "reward_std": 0.12151261325925589, + "rewards/code_reward": 0.4514285735785961, + "rewards/format_reward": 1.4732142984867096, + "step": 4717 + }, + { + "clip_ratio": 0.002215488289948553, + "epoch": 0.1760365654587006, + "grad_norm": 0.04736717417836189, + "kl": 0.019256591796875, + "learning_rate": 1.0748667675505224e-06, + "loss": 0.0478, + "step": 4718 + }, + { + "clip_ratio": 0.0022223355481401086, + "epoch": 0.17607387715125228, + "grad_norm": 0.046951379626989365, + "kl": 0.0193939208984375, + "learning_rate": 1.0743382031756569e-06, + "loss": 0.0477, + "step": 4719 + }, + { + "clip_ratio": 0.0024443361326120794, + "completion_length": 663.9821853637695, + "epoch": 0.17611118884380392, + "grad_norm": 0.09406314790248871, + "kl": 0.018768310546875, + "learning_rate": 1.0738114957262718e-06, + "loss": 0.0319, + "num_tokens": 110269206.0, + "reward": 0.9483988434076309, + "reward_std": 0.1887863203883171, + "rewards/code_reward": 0.7983988523483276, + "rewards/format_reward": 1.5, + "step": 4720 + }, + { + "clip_ratio": 0.0024390975013375282, + "epoch": 0.17614850053635558, + "grad_norm": 0.11613498628139496, + "kl": 0.018890380859375, + "learning_rate": 1.0732866454233643e-06, + "loss": 0.0315, + "step": 4721 + }, + { + "clip_ratio": 0.002444194338750094, + "epoch": 0.17618581222890722, + "grad_norm": 0.08464488387107849, + "kl": 0.01861572265625, + "learning_rate": 1.0727636524871513e-06, + "loss": 0.0316, + "step": 4722 + }, + { + "clip_ratio": 0.0038865813985466957, + "completion_length": 663.6785888671875, + "epoch": 0.1762231239214589, + "grad_norm": 0.08497145026922226, + "kl": 0.01910400390625, + "learning_rate": 1.0722425171370716e-06, + "loss": 0.0008, + "num_tokens": 110339664.0, + "reward": 0.6671692952513695, + "reward_std": 0.14934443915262818, + "rewards/code_reward": 0.5171692781150341, + "rewards/format_reward": 1.5, + "step": 4723 + }, + { + "clip_ratio": 0.00364309165161103, + "epoch": 0.17626043561401053, + "grad_norm": 0.07878562808036804, + "kl": 0.01800537109375, + "learning_rate": 1.0717232395917831e-06, + "loss": 0.0007, + "step": 4724 + }, + { + "clip_ratio": 0.0032329337554983795, + "epoch": 0.1762977473065622, + "grad_norm": 0.08970313519239426, + "kl": 0.018524169921875, + "learning_rate": 1.0712058200691661e-06, + "loss": 0.0005, + "step": 4725 + }, + { + "clip_ratio": 0.0026789200492203236, + "completion_length": 675.3036041259766, + "epoch": 0.17633505899911384, + "grad_norm": 0.058410562574863434, + "kl": 0.02520751953125, + "learning_rate": 1.0706902587863195e-06, + "loss": 0.0123, + "num_tokens": 110409051.0, + "reward": 0.43585650622844696, + "reward_std": 0.05905279517173767, + "rewards/code_reward": 0.2878207750618458, + "rewards/format_reward": 1.480357140302658, + "step": 4726 + }, + { + "clip_ratio": 0.0027457450050860643, + "epoch": 0.1763723706916655, + "grad_norm": 0.05406040698289871, + "kl": 0.027557373046875, + "learning_rate": 1.0701765559595633e-06, + "loss": 0.0122, + "step": 4727 + }, + { + "clip_ratio": 0.0026541040861047804, + "epoch": 0.17640968238421714, + "grad_norm": 0.075042724609375, + "kl": 0.0260772705078125, + "learning_rate": 1.0696647118044375e-06, + "loss": 0.0121, + "step": 4728 + }, + { + "clip_ratio": 0.004281170142348856, + "completion_length": 851.3393096923828, + "epoch": 0.1764469940767688, + "grad_norm": 0.08581329882144928, + "kl": 0.0615234375, + "learning_rate": 1.0691547265357023e-06, + "loss": 0.0063, + "num_tokens": 110494708.0, + "reward": 0.6213655658066273, + "reward_std": 0.2585800960659981, + "rewards/code_reward": 0.4713655486702919, + "rewards/format_reward": 1.5, + "step": 4729 + }, + { + "clip_ratio": 0.004045183886773884, + "epoch": 0.17648430576932045, + "grad_norm": 0.08227747678756714, + "kl": 0.061676025390625, + "learning_rate": 1.0686466003673388e-06, + "loss": 0.006, + "step": 4730 + }, + { + "clip_ratio": 0.004215537395793945, + "epoch": 0.17652161746187212, + "grad_norm": 0.07637447863817215, + "kl": 0.052032470703125, + "learning_rate": 1.0681403335125465e-06, + "loss": 0.0058, + "step": 4731 + }, + { + "clip_ratio": 0.005067566875368357, + "completion_length": 683.6786041259766, + "epoch": 0.17655892915442375, + "grad_norm": 0.08466284722089767, + "kl": 0.0158538818359375, + "learning_rate": 1.0676359261837463e-06, + "loss": -0.0097, + "num_tokens": 110559566.0, + "reward": 0.3877837508916855, + "reward_std": 0.16416872292757034, + "rewards/code_reward": 0.23778371512889862, + "rewards/format_reward": 1.5, + "step": 4732 + }, + { + "clip_ratio": 0.004505676566623151, + "epoch": 0.17659624084697542, + "grad_norm": 0.05859016627073288, + "kl": 0.016754150390625, + "learning_rate": 1.0671333785925776e-06, + "loss": -0.0097, + "step": 4733 + }, + { + "clip_ratio": 0.004632582422345877, + "epoch": 0.17663355253952706, + "grad_norm": 0.06686241179704666, + "kl": 0.0166473388671875, + "learning_rate": 1.0666326909498999e-06, + "loss": -0.0097, + "step": 4734 + }, + { + "clip_ratio": 0.0037776694516651332, + "completion_length": 594.4107360839844, + "epoch": 0.17667086423207873, + "grad_norm": 0.09476308524608612, + "kl": 0.0179290771484375, + "learning_rate": 1.0661338634657935e-06, + "loss": 0.0038, + "num_tokens": 110618823.0, + "reward": 0.7115849852561951, + "reward_std": 0.12829064566176385, + "rewards/code_reward": 0.5615849141031504, + "rewards/format_reward": 1.5, + "step": 4735 + }, + { + "clip_ratio": 0.0039374890038743615, + "epoch": 0.17670817592463037, + "grad_norm": 0.09096316248178482, + "kl": 0.0177001953125, + "learning_rate": 1.0656368963495553e-06, + "loss": 0.0038, + "step": 4736 + }, + { + "clip_ratio": 0.0037177903577685356, + "epoch": 0.17674548761718203, + "grad_norm": 0.12368657439947128, + "kl": 0.0174407958984375, + "learning_rate": 1.0651417898097057e-06, + "loss": 0.0037, + "step": 4737 + }, + { + "clip_ratio": 0.0033531890949234366, + "completion_length": 634.9107360839844, + "epoch": 0.1767827993097337, + "grad_norm": 0.03549851104617119, + "kl": 0.017913818359375, + "learning_rate": 1.0646485440539802e-06, + "loss": 0.005, + "num_tokens": 110687824.0, + "reward": 0.3200848773121834, + "reward_std": 0.06713198125362396, + "rewards/code_reward": 0.1700848676264286, + "rewards/format_reward": 1.5, + "step": 4738 + }, + { + "clip_ratio": 0.003198424354195595, + "epoch": 0.17682011100228534, + "grad_norm": 0.035339001566171646, + "kl": 0.0181884765625, + "learning_rate": 1.0641571592893365e-06, + "loss": 0.005, + "step": 4739 + }, + { + "clip_ratio": 0.0032121496042236686, + "epoch": 0.176857422694837, + "grad_norm": 0.034999534487724304, + "kl": 0.0181884765625, + "learning_rate": 1.0636676357219508e-06, + "loss": 0.0049, + "step": 4740 + }, + { + "clip_ratio": 0.0032617449178360403, + "completion_length": 601.8214492797852, + "epoch": 0.17689473438738865, + "grad_norm": 0.07903102040290833, + "kl": 0.020721435546875, + "learning_rate": 1.0631799735572174e-06, + "loss": -0.0045, + "num_tokens": 110752928.0, + "reward": 0.6688882187008858, + "reward_std": 0.26369862258434296, + "rewards/code_reward": 0.5188882052898407, + "rewards/format_reward": 1.5, + "step": 4741 + }, + { + "clip_ratio": 0.0033043923904187977, + "epoch": 0.1769320460799403, + "grad_norm": 0.15693603456020355, + "kl": 0.020721435546875, + "learning_rate": 1.062694172999751e-06, + "loss": -0.0048, + "step": 4742 + }, + { + "clip_ratio": 0.003403474227525294, + "epoch": 0.17696935777249195, + "grad_norm": 0.08457998186349869, + "kl": 0.02056884765625, + "learning_rate": 1.0622102342533836e-06, + "loss": -0.0047, + "step": 4743 + }, + { + "clip_ratio": 0.003880980657413602, + "completion_length": 608.535758972168, + "epoch": 0.17700666946504362, + "grad_norm": 0.08608081936836243, + "kl": 0.02203369140625, + "learning_rate": 1.0617281575211674e-06, + "loss": 0.0119, + "num_tokens": 110825156.0, + "reward": 0.4883449114859104, + "reward_std": 0.1406005684984848, + "rewards/code_reward": 0.33834491594461724, + "rewards/format_reward": 1.5, + "step": 4744 + }, + { + "clip_ratio": 0.004227644298225641, + "epoch": 0.17704398115759526, + "grad_norm": 0.08736395090818405, + "kl": 0.022613525390625, + "learning_rate": 1.061247943005373e-06, + "loss": 0.0118, + "step": 4745 + }, + { + "clip_ratio": 0.003858003648929298, + "epoch": 0.17708129285014693, + "grad_norm": 0.08617958426475525, + "kl": 0.02227783203125, + "learning_rate": 1.0607695909074898e-06, + "loss": 0.0116, + "step": 4746 + }, + { + "clip_ratio": 0.004060807055793703, + "completion_length": 671.7143249511719, + "epoch": 0.17711860454269857, + "grad_norm": 0.11408925801515579, + "kl": 0.0165252685546875, + "learning_rate": 1.0602931014282242e-06, + "loss": -0.0016, + "num_tokens": 110896512.0, + "reward": 0.6980839520692825, + "reward_std": 0.27570540830492973, + "rewards/code_reward": 0.5480839610099792, + "rewards/format_reward": 1.5, + "step": 4747 + }, + { + "clip_ratio": 0.004303631401853636, + "epoch": 0.17715591623525023, + "grad_norm": 0.08505352586507797, + "kl": 0.016937255859375, + "learning_rate": 1.0598184747675036e-06, + "loss": -0.0015, + "step": 4748 + }, + { + "clip_ratio": 0.004071580013260245, + "epoch": 0.17719322792780187, + "grad_norm": 0.08826383203268051, + "kl": 0.01610565185546875, + "learning_rate": 1.0593457111244716e-06, + "loss": -0.0018, + "step": 4749 + }, + { + "clip_ratio": 0.0045845769927836955, + "completion_length": 696.2857513427734, + "epoch": 0.17723053962035354, + "grad_norm": 0.05949310213327408, + "kl": 0.0192718505859375, + "learning_rate": 1.0588748106974919e-06, + "loss": 0.0139, + "num_tokens": 110963392.0, + "reward": 0.3177209608256817, + "reward_std": 0.1987769277766347, + "rewards/code_reward": 0.17039951775223017, + "rewards/format_reward": 1.4732142984867096, + "step": 4750 + }, + { + "clip_ratio": 0.004064344451762736, + "epoch": 0.17726785131290518, + "grad_norm": 0.05901026725769043, + "kl": 0.0188751220703125, + "learning_rate": 1.0584057736841453e-06, + "loss": 0.0136, + "step": 4751 + }, + { + "clip_ratio": 0.004341116931755096, + "epoch": 0.17730516300545685, + "grad_norm": 0.05923433601856232, + "kl": 0.0193023681640625, + "learning_rate": 1.0579386002812305e-06, + "loss": 0.0137, + "step": 4752 + }, + { + "clip_ratio": 0.003932253748644143, + "completion_length": 821.8036041259766, + "epoch": 0.17734247469800848, + "grad_norm": 0.06683509796857834, + "kl": 0.0152435302734375, + "learning_rate": 1.0574732906847657e-06, + "loss": -0.0067, + "num_tokens": 111042775.0, + "reward": 0.4587166979908943, + "reward_std": 0.11014351225458086, + "rewards/code_reward": 0.30871665431186557, + "rewards/format_reward": 1.5, + "step": 4753 + }, + { + "clip_ratio": 0.0037127473624423146, + "epoch": 0.17737978639056015, + "grad_norm": 0.06954092532396317, + "kl": 0.0147857666015625, + "learning_rate": 1.0570098450899852e-06, + "loss": -0.0066, + "step": 4754 + }, + { + "clip_ratio": 0.004060858685988933, + "epoch": 0.1774170980831118, + "grad_norm": 0.06481573730707169, + "kl": 0.0150146484375, + "learning_rate": 1.056548263691343e-06, + "loss": -0.0065, + "step": 4755 + }, + { + "clip_ratio": 0.004727064922917634, + "completion_length": 665.7678833007812, + "epoch": 0.17745440977566346, + "grad_norm": 0.07488967478275299, + "kl": 0.020263671875, + "learning_rate": 1.0560885466825092e-06, + "loss": 0.0156, + "num_tokens": 111113210.0, + "reward": 0.6040437929332256, + "reward_std": 0.14057358936406672, + "rewards/code_reward": 0.4540437839459628, + "rewards/format_reward": 1.5, + "step": 4756 + }, + { + "clip_ratio": 0.004536133317742497, + "epoch": 0.1774917214682151, + "grad_norm": 0.08245870471000671, + "kl": 0.0199737548828125, + "learning_rate": 1.0556306942563732e-06, + "loss": 0.0155, + "step": 4757 + }, + { + "clip_ratio": 0.004752356791868806, + "epoch": 0.17752903316076676, + "grad_norm": 0.07638705521821976, + "kl": 0.0208892822265625, + "learning_rate": 1.0551747066050412e-06, + "loss": 0.0154, + "step": 4758 + }, + { + "clip_ratio": 0.003044782963115722, + "completion_length": 679.1607360839844, + "epoch": 0.1775663448533184, + "grad_norm": 0.05479145422577858, + "kl": 0.0201416015625, + "learning_rate": 1.0547205839198368e-06, + "loss": 0.0029, + "num_tokens": 111192347.0, + "reward": 0.5646205358207226, + "reward_std": 0.133870679885149, + "rewards/code_reward": 0.414620541036129, + "rewards/format_reward": 1.5, + "step": 4759 + }, + { + "clip_ratio": 0.0030792821780778468, + "epoch": 0.17760365654587007, + "grad_norm": 0.05473511293530464, + "kl": 0.0201416015625, + "learning_rate": 1.0542683263913022e-06, + "loss": 0.0031, + "step": 4760 + }, + { + "clip_ratio": 0.003250304434914142, + "epoch": 0.1776409682384217, + "grad_norm": 0.052474651485681534, + "kl": 0.020263671875, + "learning_rate": 1.0538179342091956e-06, + "loss": 0.003, + "step": 4761 + }, + { + "clip_ratio": 0.004397369339130819, + "completion_length": 735.5178985595703, + "epoch": 0.17767827993097338, + "grad_norm": 0.06949807703495026, + "kl": 0.024932861328125, + "learning_rate": 1.0533694075624934e-06, + "loss": 0.0039, + "num_tokens": 111266772.0, + "reward": 0.44960592314600945, + "reward_std": 0.24166872259229422, + "rewards/code_reward": 0.30157019942998886, + "rewards/format_reward": 1.480357140302658, + "step": 4762 + }, + { + "clip_ratio": 0.004126197483856231, + "epoch": 0.17771559162352502, + "grad_norm": 0.07014523446559906, + "kl": 0.027191162109375, + "learning_rate": 1.052922746639389e-06, + "loss": 0.0037, + "step": 4763 + }, + { + "clip_ratio": 0.004046353802550584, + "epoch": 0.17775290331607668, + "grad_norm": 0.06924174726009369, + "kl": 0.02349853515625, + "learning_rate": 1.0524779516272934e-06, + "loss": 0.0038, + "step": 4764 + }, + { + "clip_ratio": 0.0019675593939609826, + "completion_length": 466.08929443359375, + "epoch": 0.17779021500862832, + "grad_norm": 0.06105449050664902, + "kl": 0.0184783935546875, + "learning_rate": 1.0520350227128333e-06, + "loss": -0.0052, + "num_tokens": 111314083.0, + "reward": 0.8821428567171097, + "reward_std": 0.15759943425655365, + "rewards/code_reward": 0.732142860069871, + "rewards/format_reward": 1.5, + "step": 4765 + }, + { + "clip_ratio": 0.00229720922652632, + "epoch": 0.17782752670118, + "grad_norm": 0.06489388644695282, + "kl": 0.018096923828125, + "learning_rate": 1.0515939600818549e-06, + "loss": -0.005, + "step": 4766 + }, + { + "clip_ratio": 0.0020172997610643506, + "epoch": 0.17786483839373163, + "grad_norm": 0.06646997481584549, + "kl": 0.01812744140625, + "learning_rate": 1.0511547639194188e-06, + "loss": -0.0051, + "step": 4767 + }, + { + "clip_ratio": 0.0029075953643769026, + "completion_length": 675.0714645385742, + "epoch": 0.1779021500862833, + "grad_norm": 0.08531330525875092, + "kl": 0.015869140625, + "learning_rate": 1.0507174344098044e-06, + "loss": 0.0053, + "num_tokens": 111388665.0, + "reward": 0.7992987185716629, + "reward_std": 0.340716827660799, + "rewards/code_reward": 0.6492986977100372, + "rewards/format_reward": 1.5, + "step": 4768 + }, + { + "clip_ratio": 0.0033934509847313166, + "epoch": 0.17793946177883493, + "grad_norm": 0.08559834212064743, + "kl": 0.015899658203125, + "learning_rate": 1.050281971736506e-06, + "loss": 0.0051, + "step": 4769 + }, + { + "clip_ratio": 0.003440192376729101, + "epoch": 0.1779767734713866, + "grad_norm": 0.08355550467967987, + "kl": 0.0159149169921875, + "learning_rate": 1.0498483760822361e-06, + "loss": 0.0052, + "step": 4770 + }, + { + "clip_ratio": 0.0036541063454933465, + "completion_length": 535.3571624755859, + "epoch": 0.17801408516393824, + "grad_norm": 0.05229857563972473, + "kl": 0.02105712890625, + "learning_rate": 1.0494166476289239e-06, + "loss": 0.0615, + "num_tokens": 111444489.0, + "reward": 0.9533044025301933, + "reward_std": 0.027102170512080193, + "rewards/code_reward": 0.805982980877161, + "rewards/format_reward": 1.4732142984867096, + "step": 4771 + }, + { + "clip_ratio": 0.0034788656048476696, + "epoch": 0.1780513968564899, + "grad_norm": 0.05151180177927017, + "kl": 0.021087646484375, + "learning_rate": 1.0489867865577136e-06, + "loss": 0.0615, + "step": 4772 + }, + { + "clip_ratio": 0.0037325264420360327, + "epoch": 0.17808870854904155, + "grad_norm": 0.05939551070332527, + "kl": 0.02020263671875, + "learning_rate": 1.0485587930489684e-06, + "loss": 0.0615, + "step": 4773 + }, + { + "clip_ratio": 0.001754996133968234, + "completion_length": 600.5357437133789, + "epoch": 0.1781260202415932, + "grad_norm": 0.04614995792508125, + "kl": 0.0159454345703125, + "learning_rate": 1.0481326672822648e-06, + "loss": -0.0048, + "num_tokens": 111504439.0, + "reward": 0.7658536434173584, + "reward_std": 0.15029976516962051, + "rewards/code_reward": 0.6158536598086357, + "rewards/format_reward": 1.5, + "step": 4774 + }, + { + "clip_ratio": 0.0018417078536003828, + "epoch": 0.17816333193414485, + "grad_norm": 0.04668118804693222, + "kl": 0.015594482421875, + "learning_rate": 1.0477084094363982e-06, + "loss": -0.0048, + "step": 4775 + }, + { + "clip_ratio": 0.0017259703599847853, + "epoch": 0.17820064362669652, + "grad_norm": 0.04854033142328262, + "kl": 0.0156707763671875, + "learning_rate": 1.047286019689379e-06, + "loss": -0.0049, + "step": 4776 + }, + { + "clip_ratio": 0.0031343234004452825, + "completion_length": 619.9286041259766, + "epoch": 0.17823795531924816, + "grad_norm": 0.04796977341175079, + "kl": 0.0157623291015625, + "learning_rate": 1.0468654982184344e-06, + "loss": 0.0204, + "num_tokens": 111561947.0, + "reward": 0.736538466066122, + "reward_std": 0.13459057081490755, + "rewards/code_reward": 0.5865384489297867, + "rewards/format_reward": 1.5, + "step": 4777 + }, + { + "clip_ratio": 0.003570408618543297, + "epoch": 0.17827526701179983, + "grad_norm": 0.0484616793692112, + "kl": 0.0153350830078125, + "learning_rate": 1.0464468452000074e-06, + "loss": 0.0203, + "step": 4778 + }, + { + "clip_ratio": 0.0033069102209992707, + "epoch": 0.17831257870435147, + "grad_norm": 0.0474642813205719, + "kl": 0.0158538818359375, + "learning_rate": 1.0460300608097567e-06, + "loss": 0.0203, + "step": 4779 + }, + { + "clip_ratio": 0.0021428545005619526, + "completion_length": 742.7143325805664, + "epoch": 0.17834989039690313, + "grad_norm": 0.04146403819322586, + "kl": 0.0137176513671875, + "learning_rate": 1.045615145222557e-06, + "loss": 0.014, + "num_tokens": 111634619.0, + "reward": 0.8648161254823208, + "reward_std": 0.09183812304399908, + "rewards/code_reward": 0.7148161265649833, + "rewards/format_reward": 1.5, + "step": 4780 + }, + { + "clip_ratio": 0.0019459704053588212, + "epoch": 0.17838720208945477, + "grad_norm": 0.04199595004320145, + "kl": 0.013519287109375, + "learning_rate": 1.0452020986125003e-06, + "loss": 0.0139, + "step": 4781 + }, + { + "clip_ratio": 0.001848478161264211, + "epoch": 0.17842451378200644, + "grad_norm": 0.04123804718255997, + "kl": 0.0135650634765625, + "learning_rate": 1.0447909211528917e-06, + "loss": 0.0139, + "step": 4782 + }, + { + "clip_ratio": 0.002864136011339724, + "completion_length": 610.1607437133789, + "epoch": 0.17846182547455808, + "grad_norm": 0.04918195307254791, + "kl": 0.019317626953125, + "learning_rate": 1.0443816130162552e-06, + "loss": 0.0014, + "num_tokens": 111698244.0, + "reward": 0.6380952522158623, + "reward_std": 0.019533634185791016, + "rewards/code_reward": 0.4880952388048172, + "rewards/format_reward": 1.5, + "step": 4783 + }, + { + "clip_ratio": 0.0029816533788107336, + "epoch": 0.17849913716710974, + "grad_norm": 0.04790894314646721, + "kl": 0.019775390625, + "learning_rate": 1.0439741743743282e-06, + "loss": 0.0014, + "step": 4784 + }, + { + "clip_ratio": 0.003157580504193902, + "epoch": 0.17853644885966138, + "grad_norm": 0.047219425439834595, + "kl": 0.019378662109375, + "learning_rate": 1.0435686053980638e-06, + "loss": 0.0014, + "step": 4785 + }, + { + "clip_ratio": 0.0008292266284115613, + "completion_length": 413.4464416503906, + "epoch": 0.17857376055221305, + "grad_norm": 0.056201450526714325, + "kl": 0.0158233642578125, + "learning_rate": 1.0431649062576323e-06, + "loss": 0.0017, + "num_tokens": 111744623.0, + "reward": 1.0964285582304, + "reward_std": 0.10645382851362228, + "rewards/code_reward": 0.9464285671710968, + "rewards/format_reward": 1.5, + "step": 4786 + }, + { + "clip_ratio": 0.0011822071392089128, + "epoch": 0.1786110722447647, + "grad_norm": 0.05549895390868187, + "kl": 0.0158538818359375, + "learning_rate": 1.0427630771224179e-06, + "loss": 0.0017, + "step": 4787 + }, + { + "clip_ratio": 0.0013236977392807603, + "epoch": 0.17864838393731636, + "grad_norm": 0.05031300708651543, + "kl": 0.0157623291015625, + "learning_rate": 1.0423631181610208e-06, + "loss": 0.0019, + "step": 4788 + }, + { + "clip_ratio": 0.0041958128567785025, + "completion_length": 633.5714569091797, + "epoch": 0.178685695629868, + "grad_norm": 0.07230450212955475, + "kl": 0.019317626953125, + "learning_rate": 1.0419650295412558e-06, + "loss": 0.0012, + "num_tokens": 111819779.0, + "reward": 0.47707415744662285, + "reward_std": 0.10582172241993248, + "rewards/code_reward": 0.32707415107870474, + "rewards/format_reward": 1.5, + "step": 4789 + }, + { + "clip_ratio": 0.004134826478548348, + "epoch": 0.17872300732241966, + "grad_norm": 0.08384477347135544, + "kl": 0.0189666748046875, + "learning_rate": 1.0415688114301536e-06, + "loss": 0.0013, + "step": 4790 + }, + { + "clip_ratio": 0.003866623097565025, + "epoch": 0.1787603190149713, + "grad_norm": 0.07416152209043503, + "kl": 0.019012451171875, + "learning_rate": 1.0411744639939606e-06, + "loss": 0.0012, + "step": 4791 + }, + { + "clip_ratio": 0.0036781300441361964, + "completion_length": 951.0536193847656, + "epoch": 0.17879763070752297, + "grad_norm": 0.07595773041248322, + "kl": 0.018524169921875, + "learning_rate": 1.0407819873981367e-06, + "loss": 0.016, + "num_tokens": 111916760.0, + "reward": 0.7027736082673073, + "reward_std": 0.36788291297852993, + "rewards/code_reward": 0.5554521456360817, + "rewards/format_reward": 1.4732142984867096, + "step": 4792 + }, + { + "clip_ratio": 0.0033736287732608616, + "epoch": 0.17883494240007464, + "grad_norm": 0.07759159803390503, + "kl": 0.018524169921875, + "learning_rate": 1.040391381807359e-06, + "loss": 0.0159, + "step": 4793 + }, + { + "clip_ratio": 0.0036012023920193315, + "epoch": 0.17887225409262628, + "grad_norm": 0.08910400420427322, + "kl": 0.018218994140625, + "learning_rate": 1.0400026473855173e-06, + "loss": 0.0159, + "step": 4794 + }, + { + "clip_ratio": 0.004028127645142376, + "completion_length": 635.5000305175781, + "epoch": 0.17890956578517794, + "grad_norm": 0.08015409111976624, + "kl": 0.0308837890625, + "learning_rate": 1.0396157842957177e-06, + "loss": 0.0016, + "num_tokens": 111986732.0, + "reward": 0.6095588356256485, + "reward_std": 0.22467638924717903, + "rewards/code_reward": 0.4595588259398937, + "rewards/format_reward": 1.5, + "step": 4795 + }, + { + "clip_ratio": 0.0036842882400378585, + "epoch": 0.17894687747772958, + "grad_norm": 0.2809779942035675, + "kl": 0.031219482421875, + "learning_rate": 1.0392307927002807e-06, + "loss": 0.0016, + "step": 4796 + }, + { + "clip_ratio": 0.004151096276473254, + "epoch": 0.17898418917028125, + "grad_norm": 0.07552175968885422, + "kl": 0.031951904296875, + "learning_rate": 1.038847672760742e-06, + "loss": 0.0014, + "step": 4797 + }, + { + "clip_ratio": 0.004086330533027649, + "completion_length": 604.6607360839844, + "epoch": 0.1790215008628329, + "grad_norm": 0.11347341537475586, + "kl": 0.0257568359375, + "learning_rate": 1.0384664246378514e-06, + "loss": 0.0892, + "num_tokens": 112048733.0, + "reward": 0.7051404565572739, + "reward_std": 0.3004727615043521, + "rewards/code_reward": 0.5578190088272095, + "rewards/format_reward": 1.4732142984867096, + "step": 4798 + }, + { + "clip_ratio": 0.0037287137238308787, + "epoch": 0.17905881255538456, + "grad_norm": 0.08306080847978592, + "kl": 0.025390625, + "learning_rate": 1.0380870484915738e-06, + "loss": 0.0891, + "step": 4799 + }, + { + "clip_ratio": 0.004043981316499412, + "epoch": 0.1790961242479362, + "grad_norm": 0.0798492357134819, + "kl": 0.02484130859375, + "learning_rate": 1.0377095444810873e-06, + "loss": 0.0891, + "step": 4800 + }, + { + "clip_ratio": 0.0018376728403382003, + "completion_length": 648.8035888671875, + "epoch": 0.17913343594048786, + "grad_norm": 0.057209983468055725, + "kl": 0.02069091796875, + "learning_rate": 1.0373339127647867e-06, + "loss": 0.0034, + "num_tokens": 112115516.0, + "reward": 0.9458683654665947, + "reward_std": 0.043417025823146105, + "rewards/code_reward": 0.795868344604969, + "rewards/format_reward": 1.5, + "step": 4801 + }, + { + "clip_ratio": 0.0019594258046709, + "epoch": 0.1791707476330395, + "grad_norm": 0.05693288519978523, + "kl": 0.02081298828125, + "learning_rate": 1.0369601535002794e-06, + "loss": 0.0032, + "step": 4802 + }, + { + "clip_ratio": 0.0021220631315372884, + "epoch": 0.17920805932559117, + "grad_norm": 0.05699194595217705, + "kl": 0.021026611328125, + "learning_rate": 1.036588266844388e-06, + "loss": 0.0034, + "step": 4803 + }, + { + "clip_ratio": 0.0047445377567783, + "completion_length": 538.2857513427734, + "epoch": 0.1792453710181428, + "grad_norm": 0.06615351885557175, + "kl": 0.0187530517578125, + "learning_rate": 1.03621825295315e-06, + "loss": -0.0028, + "num_tokens": 112165960.0, + "reward": 0.2949984520673752, + "reward_std": 0.15425672009587288, + "rewards/code_reward": 0.1449984274804592, + "rewards/format_reward": 1.5, + "step": 4804 + }, + { + "clip_ratio": 0.004869483411312103, + "epoch": 0.17928268271069447, + "grad_norm": 0.06616777926683426, + "kl": 0.0193634033203125, + "learning_rate": 1.0358501119818155e-06, + "loss": -0.0031, + "step": 4805 + }, + { + "clip_ratio": 0.004456997266970575, + "epoch": 0.1793199944032461, + "grad_norm": 0.0660829097032547, + "kl": 0.0186920166015625, + "learning_rate": 1.0354838440848502e-06, + "loss": -0.0031, + "step": 4806 + }, + { + "clip_ratio": 0.003031856380403042, + "completion_length": 632.8571701049805, + "epoch": 0.17935730609579778, + "grad_norm": 0.23953324556350708, + "kl": 0.0195159912109375, + "learning_rate": 1.0351194494159325e-06, + "loss": -0.0075, + "num_tokens": 112236554.0, + "reward": 0.624354898929596, + "reward_std": 0.2514793574810028, + "rewards/code_reward": 0.47435490041971207, + "rewards/format_reward": 1.5, + "step": 4807 + }, + { + "clip_ratio": 0.002901640022173524, + "epoch": 0.17939461778834942, + "grad_norm": 0.0641901046037674, + "kl": 0.019622802734375, + "learning_rate": 1.0347569281279558e-06, + "loss": -0.0074, + "step": 4808 + }, + { + "clip_ratio": 0.003154214529786259, + "epoch": 0.1794319294809011, + "grad_norm": 0.06496423482894897, + "kl": 0.01959228515625, + "learning_rate": 1.0343962803730283e-06, + "loss": -0.0076, + "step": 4809 + }, + { + "clip_ratio": 0.005730622971896082, + "completion_length": 659.6964416503906, + "epoch": 0.17946924117345273, + "grad_norm": 0.07360963523387909, + "kl": 0.0179595947265625, + "learning_rate": 1.0340375063024702e-06, + "loss": -0.0114, + "num_tokens": 112296667.0, + "reward": 0.23427236825227737, + "reward_std": 0.12783031910657883, + "rewards/code_reward": 0.0842723399400711, + "rewards/format_reward": 1.5, + "step": 4810 + }, + { + "clip_ratio": 0.005960182985290885, + "epoch": 0.1795065528660044, + "grad_norm": 0.07591160386800766, + "kl": 0.018218994140625, + "learning_rate": 1.0336806060668162e-06, + "loss": -0.0115, + "step": 4811 + }, + { + "clip_ratio": 0.0056930327555164695, + "epoch": 0.17954386455855603, + "grad_norm": 0.06360498815774918, + "kl": 0.0181732177734375, + "learning_rate": 1.033325579815816e-06, + "loss": -0.0114, + "step": 4812 + }, + { + "clip_ratio": 0.001947166514582932, + "completion_length": 608.607177734375, + "epoch": 0.1795811762511077, + "grad_norm": 0.07037253677845001, + "kl": 0.0199127197265625, + "learning_rate": 1.0329724276984311e-06, + "loss": 0.0055, + "num_tokens": 112368199.0, + "reward": 0.8693254441022873, + "reward_std": 0.0917478664778173, + "rewards/code_reward": 0.7193254455924034, + "rewards/format_reward": 1.5, + "step": 4813 + }, + { + "clip_ratio": 0.0020912016625516117, + "epoch": 0.17961848794365934, + "grad_norm": 0.07137355953454971, + "kl": 0.020294189453125, + "learning_rate": 1.032621149862838e-06, + "loss": 0.0054, + "step": 4814 + }, + { + "clip_ratio": 0.002069272508379072, + "epoch": 0.179655799636211, + "grad_norm": 0.10448697954416275, + "kl": 0.0200653076171875, + "learning_rate": 1.0322717464564263e-06, + "loss": 0.0054, + "step": 4815 + }, + { + "clip_ratio": 0.003612557949963957, + "completion_length": 607.5893020629883, + "epoch": 0.17969311132876264, + "grad_norm": 0.14974766969680786, + "kl": 0.0165557861328125, + "learning_rate": 1.0319242176257997e-06, + "loss": -0.0027, + "num_tokens": 112439618.0, + "reward": 0.7920295521616936, + "reward_std": 0.29609325900673866, + "rewards/code_reward": 0.642029520124197, + "rewards/format_reward": 1.5, + "step": 4816 + }, + { + "clip_ratio": 0.003963231458328664, + "epoch": 0.1797304230213143, + "grad_norm": 0.11004608869552612, + "kl": 0.0167236328125, + "learning_rate": 1.0315785635167732e-06, + "loss": -0.0025, + "step": 4817 + }, + { + "clip_ratio": 0.003433912352193147, + "epoch": 0.17976773471386595, + "grad_norm": 0.09237872809171677, + "kl": 0.01739501953125, + "learning_rate": 1.0312347842743787e-06, + "loss": -0.0029, + "step": 4818 + }, + { + "clip_ratio": 0.003061880241148174, + "completion_length": 613.6607666015625, + "epoch": 0.17980504640641762, + "grad_norm": 0.053012847900390625, + "kl": 0.0176849365234375, + "learning_rate": 1.0308928800428583e-06, + "loss": -0.0045, + "num_tokens": 112507009.0, + "reward": 0.7629196308553219, + "reward_std": 0.17430133000016212, + "rewards/code_reward": 0.6129196584224701, + "rewards/format_reward": 1.5, + "step": 4819 + }, + { + "clip_ratio": 0.0034509781980887055, + "epoch": 0.17984235809896926, + "grad_norm": 0.055672287940979004, + "kl": 0.0180511474609375, + "learning_rate": 1.0305528509656696e-06, + "loss": -0.0044, + "step": 4820 + }, + { + "clip_ratio": 0.003096316650044173, + "epoch": 0.17987966979152092, + "grad_norm": 0.052224528044462204, + "kl": 0.0178985595703125, + "learning_rate": 1.0302146971854824e-06, + "loss": -0.0045, + "step": 4821 + }, + { + "clip_ratio": 0.0038652143557555974, + "completion_length": 835.0536041259766, + "epoch": 0.17991698148407256, + "grad_norm": 0.05646035820245743, + "kl": 0.01778411865234375, + "learning_rate": 1.0298784188441788e-06, + "loss": -0.0069, + "num_tokens": 112585594.0, + "reward": 0.4640292376279831, + "reward_std": 0.12786676967516541, + "rewards/code_reward": 0.31402922328561544, + "rewards/format_reward": 1.5, + "step": 4822 + }, + { + "clip_ratio": 0.0031825198384467512, + "epoch": 0.17995429317662423, + "grad_norm": 0.05646822974085808, + "kl": 0.01744842529296875, + "learning_rate": 1.0295440160828555e-06, + "loss": -0.007, + "step": 4823 + }, + { + "clip_ratio": 0.003736739919986576, + "epoch": 0.17999160486917587, + "grad_norm": 0.05697550252079964, + "kl": 0.017120361328125, + "learning_rate": 1.0292114890418226e-06, + "loss": -0.0069, + "step": 4824 + }, + { + "clip_ratio": 0.0015400305856019258, + "completion_length": 535.0357360839844, + "epoch": 0.18002891656172754, + "grad_norm": 0.06964331120252609, + "kl": 0.0178985595703125, + "learning_rate": 1.028880837860601e-06, + "loss": -0.0045, + "num_tokens": 112640210.0, + "reward": 1.096462219953537, + "reward_std": 0.09745091758668423, + "rewards/code_reward": 0.9464622288942337, + "rewards/format_reward": 1.5, + "step": 4825 + }, + { + "clip_ratio": 0.0018270047148689628, + "epoch": 0.18006622825427918, + "grad_norm": 0.06973888725042343, + "kl": 0.017425537109375, + "learning_rate": 1.0285520626779257e-06, + "loss": -0.0046, + "step": 4826 + }, + { + "clip_ratio": 0.001730204385239631, + "epoch": 0.18010353994683084, + "grad_norm": 0.07246018201112747, + "kl": 0.0175933837890625, + "learning_rate": 1.0282251636317458e-06, + "loss": -0.0047, + "step": 4827 + }, + { + "clip_ratio": 0.004393864830490202, + "completion_length": 598.4286041259766, + "epoch": 0.18014085163938248, + "grad_norm": 0.061801742762327194, + "kl": 0.0193328857421875, + "learning_rate": 1.027900140859221e-06, + "loss": -0.0021, + "num_tokens": 112697606.0, + "reward": 0.6079782769083977, + "reward_std": 0.07385576795786619, + "rewards/code_reward": 0.4579782485961914, + "rewards/format_reward": 1.5, + "step": 4828 + }, + { + "clip_ratio": 0.004490230057854205, + "epoch": 0.18017816333193415, + "grad_norm": 0.058412063866853714, + "kl": 0.0193328857421875, + "learning_rate": 1.0275769944967256e-06, + "loss": -0.0021, + "step": 4829 + }, + { + "clip_ratio": 0.004199371731374413, + "epoch": 0.1802154750244858, + "grad_norm": 0.06053660437464714, + "kl": 0.0191650390625, + "learning_rate": 1.027255724679845e-06, + "loss": -0.0022, + "step": 4830 + }, + { + "clip_ratio": 0.002521210233680904, + "completion_length": 602.0893173217773, + "epoch": 0.18025278671703746, + "grad_norm": 0.03824908286333084, + "kl": 0.02301025390625, + "learning_rate": 1.0269363315433793e-06, + "loss": -0.0057, + "num_tokens": 112765147.0, + "reward": 0.6678571403026581, + "reward_std": 0.02661345712840557, + "rewards/code_reward": 0.5178571436554193, + "rewards/format_reward": 1.5, + "step": 4831 + }, + { + "clip_ratio": 0.0021547985961660743, + "epoch": 0.1802900984095891, + "grad_norm": 0.038734760135412216, + "kl": 0.022979736328125, + "learning_rate": 1.0266188152213386e-06, + "loss": -0.0058, + "step": 4832 + }, + { + "clip_ratio": 0.0022082579089328647, + "epoch": 0.18032741010214076, + "grad_norm": 0.0409354567527771, + "kl": 0.0228271484375, + "learning_rate": 1.0263031758469474e-06, + "loss": -0.0058, + "step": 4833 + }, + { + "clip_ratio": 0.0029291691025719047, + "completion_length": 559.0000228881836, + "epoch": 0.1803647217946924, + "grad_norm": 0.027548570185899734, + "kl": 0.0220947265625, + "learning_rate": 1.0259894135526424e-06, + "loss": 0.0004, + "num_tokens": 112826537.0, + "reward": 0.8000000081956387, + "reward_std": 0.0, + "rewards/code_reward": 0.6500000059604645, + "rewards/format_reward": 1.5, + "step": 4834 + }, + { + "clip_ratio": 0.0029689394868910313, + "epoch": 0.18040203348724407, + "grad_norm": 0.03557354956865311, + "kl": 0.0241546630859375, + "learning_rate": 1.0256775284700717e-06, + "loss": 0.0004, + "step": 4835 + }, + { + "clip_ratio": 0.002921758219599724, + "epoch": 0.1804393451797957, + "grad_norm": 0.011304040439426899, + "kl": 0.0199737548828125, + "learning_rate": 1.0253675207300972e-06, + "loss": 0.0004, + "step": 4836 + }, + { + "clip_ratio": 0.004412254900671542, + "completion_length": 888.5000305175781, + "epoch": 0.18047665687234737, + "grad_norm": 0.0921308845281601, + "kl": 0.0145263671875, + "learning_rate": 1.0250593904627923e-06, + "loss": 0.0042, + "num_tokens": 112914807.0, + "reward": 0.24320930242538452, + "reward_std": 0.1789141836343333, + "rewards/code_reward": 0.0932092975708656, + "rewards/format_reward": 1.5, + "step": 4837 + }, + { + "clip_ratio": 0.004074520606081933, + "epoch": 0.180513968564899, + "grad_norm": 0.09620164334774017, + "kl": 0.014373779296875, + "learning_rate": 1.0247531377974424e-06, + "loss": 0.0039, + "step": 4838 + }, + { + "clip_ratio": 0.004019484098535031, + "epoch": 0.18055128025745068, + "grad_norm": 0.09110686182975769, + "kl": 0.0143280029296875, + "learning_rate": 1.0244487628625455e-06, + "loss": 0.0039, + "step": 4839 + }, + { + "clip_ratio": 0.0008494465728290379, + "completion_length": 556.2500152587891, + "epoch": 0.18058859195000232, + "grad_norm": 0.03517924249172211, + "kl": 0.0178985595703125, + "learning_rate": 1.0241462657858116e-06, + "loss": -0.0065, + "num_tokens": 112976071.0, + "reward": 1.1118644028902054, + "reward_std": 0.089778833091259, + "rewards/code_reward": 0.9618643969297409, + "rewards/format_reward": 1.5, + "step": 4840 + }, + { + "clip_ratio": 0.0007752749952487648, + "epoch": 0.180625903642554, + "grad_norm": 0.0353025384247303, + "kl": 0.0178375244140625, + "learning_rate": 1.0238456466941632e-06, + "loss": -0.0065, + "step": 4841 + }, + { + "clip_ratio": 0.0007907483959570527, + "epoch": 0.18066321533510563, + "grad_norm": 0.03414708748459816, + "kl": 0.0186309814453125, + "learning_rate": 1.0235469057137348e-06, + "loss": -0.0066, + "step": 4842 + }, + { + "clip_ratio": 0.004469375184271485, + "completion_length": 709.6071624755859, + "epoch": 0.1807005270276573, + "grad_norm": 0.09618409723043442, + "kl": 0.032989501953125, + "learning_rate": 1.0232500429698712e-06, + "loss": 0.0162, + "num_tokens": 113057319.0, + "reward": 0.5646763816475868, + "reward_std": 0.27242880314588547, + "rewards/code_reward": 0.41467639431357384, + "rewards/format_reward": 1.5, + "step": 4843 + }, + { + "clip_ratio": 0.004111597372684628, + "epoch": 0.18073783872020893, + "grad_norm": 0.08649135380983353, + "kl": 0.03448486328125, + "learning_rate": 1.0229550585871323e-06, + "loss": 0.016, + "step": 4844 + }, + { + "clip_ratio": 0.004575729137286544, + "epoch": 0.1807751504127606, + "grad_norm": 0.09065035730600357, + "kl": 0.033233642578125, + "learning_rate": 1.0226619526892864e-06, + "loss": 0.0159, + "step": 4845 + }, + { + "clip_ratio": 0.003313062246888876, + "completion_length": 658.5357437133789, + "epoch": 0.18081246210531227, + "grad_norm": 0.06716571003198624, + "kl": 0.021331787109375, + "learning_rate": 1.0223707253993158e-06, + "loss": 0.0125, + "num_tokens": 113129197.0, + "reward": 0.7425411492586136, + "reward_std": 0.3697335720062256, + "rewards/code_reward": 0.5925411432981491, + "rewards/format_reward": 1.5, + "step": 4846 + }, + { + "clip_ratio": 0.003351059276610613, + "epoch": 0.1808497737978639, + "grad_norm": 0.06632404029369354, + "kl": 0.0211181640625, + "learning_rate": 1.0220813768394149e-06, + "loss": 0.0124, + "step": 4847 + }, + { + "clip_ratio": 0.0031452946132048965, + "epoch": 0.18088708549041557, + "grad_norm": 0.06876785308122635, + "kl": 0.021209716796875, + "learning_rate": 1.0217939071309883e-06, + "loss": 0.0125, + "step": 4848 + }, + { + "clip_ratio": 0.0050069838762283325, + "completion_length": 604.8035926818848, + "epoch": 0.1809243971829672, + "grad_norm": 0.05782214552164078, + "kl": 0.0204010009765625, + "learning_rate": 1.0215083163946532e-06, + "loss": 0.0031, + "num_tokens": 113192760.0, + "reward": 0.5646877214312553, + "reward_std": 0.04737889318494126, + "rewards/code_reward": 0.41468770802021027, + "rewards/format_reward": 1.5, + "step": 4849 + }, + { + "clip_ratio": 0.005032581044360995, + "epoch": 0.18096170887551888, + "grad_norm": 0.0605592206120491, + "kl": 0.01910400390625, + "learning_rate": 1.0212246047502374e-06, + "loss": 0.0032, + "step": 4850 + }, + { + "clip_ratio": 0.004698261618614197, + "epoch": 0.18099902056807052, + "grad_norm": 0.05807293578982353, + "kl": 0.019927978515625, + "learning_rate": 1.0209427723167816e-06, + "loss": 0.0031, + "step": 4851 + }, + { + "clip_ratio": 0.0026587211759760976, + "completion_length": 850.732177734375, + "epoch": 0.18103633226062218, + "grad_norm": 0.048455264419317245, + "kl": 0.016143798828125, + "learning_rate": 1.020662819212538e-06, + "loss": 0.0057, + "num_tokens": 113277403.0, + "reward": 0.7035188935697079, + "reward_std": 0.19345993548631668, + "rewards/code_reward": 0.5535188764333725, + "rewards/format_reward": 1.5, + "step": 4852 + }, + { + "clip_ratio": 0.002719174197409302, + "epoch": 0.18107364395317382, + "grad_norm": 0.046141840517520905, + "kl": 0.0164337158203125, + "learning_rate": 1.0203847455549686e-06, + "loss": 0.0057, + "step": 4853 + }, + { + "clip_ratio": 0.0026667193742468953, + "epoch": 0.1811109556457255, + "grad_norm": 0.04935196414589882, + "kl": 0.016265869140625, + "learning_rate": 1.0201085514607488e-06, + "loss": 0.0057, + "step": 4854 + }, + { + "clip_ratio": 0.0022921962663531303, + "completion_length": 836.6071624755859, + "epoch": 0.18114826733827713, + "grad_norm": 0.06578797847032547, + "kl": 0.0176849365234375, + "learning_rate": 1.0198342370457643e-06, + "loss": 0.0124, + "num_tokens": 113356789.0, + "reward": 0.6203231289982796, + "reward_std": 0.20445271208882332, + "rewards/code_reward": 0.47300170361995697, + "rewards/format_reward": 1.4732142984867096, + "step": 4855 + }, + { + "clip_ratio": 0.002621664898470044, + "epoch": 0.1811855790308288, + "grad_norm": 0.0653228610754013, + "kl": 0.0178070068359375, + "learning_rate": 1.0195618024251122e-06, + "loss": 0.0124, + "step": 4856 + }, + { + "clip_ratio": 0.002493790932931006, + "epoch": 0.18122289072338044, + "grad_norm": 6.890539169311523, + "kl": 0.0171356201171875, + "learning_rate": 1.019291247713101e-06, + "loss": 0.015, + "step": 4857 + }, + { + "clip_ratio": 0.0035038518253713846, + "completion_length": 625.2143096923828, + "epoch": 0.1812602024159321, + "grad_norm": 0.03790009766817093, + "kl": 0.02679443359375, + "learning_rate": 1.0190225730232508e-06, + "loss": -0.0048, + "num_tokens": 113421103.0, + "reward": 0.5230357274413109, + "reward_std": 0.12787531316280365, + "rewards/code_reward": 0.375, + "rewards/format_reward": 1.480357140302658, + "step": 4858 + }, + { + "clip_ratio": 0.0037875204579904675, + "epoch": 0.18129751410848374, + "grad_norm": 0.03588951379060745, + "kl": 0.026702880859375, + "learning_rate": 1.0187557784682919e-06, + "loss": -0.0048, + "step": 4859 + }, + { + "clip_ratio": 0.0036727177212014794, + "epoch": 0.1813348258010354, + "grad_norm": 0.035046473145484924, + "kl": 0.026397705078125, + "learning_rate": 1.0184908641601667e-06, + "loss": -0.0047, + "step": 4860 + }, + { + "clip_ratio": 0.00489484379068017, + "completion_length": 678.303596496582, + "epoch": 0.18137213749358705, + "grad_norm": 0.035985201597213745, + "kl": 0.020599365234375, + "learning_rate": 1.0182278302100284e-06, + "loss": 0.003, + "num_tokens": 113485436.0, + "reward": 0.3139881193637848, + "reward_std": 0.12730364501476288, + "rewards/code_reward": 0.1666666679084301, + "rewards/format_reward": 1.4732142984867096, + "step": 4861 + }, + { + "clip_ratio": 0.005472870543599129, + "epoch": 0.18140944918613872, + "grad_norm": 0.04653188958764076, + "kl": 0.020477294921875, + "learning_rate": 1.017966676728241e-06, + "loss": 0.003, + "step": 4862 + }, + { + "clip_ratio": 0.005503320833668113, + "epoch": 0.18144676087869036, + "grad_norm": 0.04274260625243187, + "kl": 0.0202178955078125, + "learning_rate": 1.0177074038243798e-06, + "loss": 0.0031, + "step": 4863 + }, + { + "clip_ratio": 0.0028466752264648676, + "completion_length": 688.1964569091797, + "epoch": 0.18148407257124202, + "grad_norm": 0.0712277740240097, + "kl": 0.018310546875, + "learning_rate": 1.0174500116072308e-06, + "loss": 0.0029, + "num_tokens": 113553151.0, + "reward": 0.7322994470596313, + "reward_std": 0.28339770063757896, + "rewards/code_reward": 0.5822994410991669, + "rewards/format_reward": 1.5, + "step": 4864 + }, + { + "clip_ratio": 0.0027813376509584486, + "epoch": 0.18152138426379366, + "grad_norm": 0.06499433517456055, + "kl": 0.0184478759765625, + "learning_rate": 1.0171945001847905e-06, + "loss": 0.0026, + "step": 4865 + }, + { + "clip_ratio": 0.003062174073420465, + "epoch": 0.18155869595634533, + "grad_norm": 0.06722516566514969, + "kl": 0.018646240234375, + "learning_rate": 1.0169408696642679e-06, + "loss": 0.0026, + "step": 4866 + }, + { + "clip_ratio": 0.003864807018544525, + "completion_length": 735.0714569091797, + "epoch": 0.18159600764889697, + "grad_norm": 0.07056848704814911, + "kl": 0.0166778564453125, + "learning_rate": 1.016689120152081e-06, + "loss": 0.0048, + "num_tokens": 113625689.0, + "reward": 0.4861801452934742, + "reward_std": 0.37039512395858765, + "rewards/code_reward": 0.3361801281571388, + "rewards/format_reward": 1.5, + "step": 4867 + }, + { + "clip_ratio": 0.00374506798107177, + "epoch": 0.18163331934144863, + "grad_norm": 0.06871522217988968, + "kl": 0.01641845703125, + "learning_rate": 1.016439251753859e-06, + "loss": 0.0048, + "step": 4868 + }, + { + "clip_ratio": 0.003836250340100378, + "epoch": 0.18167063103400027, + "grad_norm": 0.0702635794878006, + "kl": 0.0167236328125, + "learning_rate": 1.0161912645744418e-06, + "loss": 0.0048, + "step": 4869 + }, + { + "clip_ratio": 0.0014993588556535542, + "completion_length": 595.7143096923828, + "epoch": 0.18170794272655194, + "grad_norm": 0.057110607624053955, + "kl": 0.01361083984375, + "learning_rate": 1.015945158717881e-06, + "loss": 0.0046, + "num_tokens": 113681827.0, + "reward": 0.9812629371881485, + "reward_std": 0.17295972257852554, + "rewards/code_reward": 0.8312629386782646, + "rewards/format_reward": 1.5, + "step": 4870 + }, + { + "clip_ratio": 0.001351366110611707, + "epoch": 0.18174525441910358, + "grad_norm": 0.05600610375404358, + "kl": 0.0132904052734375, + "learning_rate": 1.0157009342874375e-06, + "loss": 0.0047, + "step": 4871 + }, + { + "clip_ratio": 0.0013510348508134484, + "epoch": 0.18178256611165525, + "grad_norm": 0.052877914160490036, + "kl": 0.0133514404296875, + "learning_rate": 1.0154585913855839e-06, + "loss": 0.0045, + "step": 4872 + }, + { + "clip_ratio": 0.0033791546593420208, + "completion_length": 687.1250305175781, + "epoch": 0.1818198778042069, + "grad_norm": 0.053783368319272995, + "kl": 0.02105712890625, + "learning_rate": 1.015218130114002e-06, + "loss": 0.0009, + "num_tokens": 113760486.0, + "reward": 0.4694580212235451, + "reward_std": 0.05900773126631975, + "rewards/code_reward": 0.3194580152630806, + "rewards/format_reward": 1.5, + "step": 4873 + }, + { + "clip_ratio": 0.0031325341551564634, + "epoch": 0.18185718949675855, + "grad_norm": 0.05435797572135925, + "kl": 0.02056884765625, + "learning_rate": 1.0149795505735857e-06, + "loss": 0.001, + "step": 4874 + }, + { + "clip_ratio": 0.003187030553817749, + "epoch": 0.1818945011893102, + "grad_norm": 0.05312034487724304, + "kl": 0.02056884765625, + "learning_rate": 1.014742852864438e-06, + "loss": 0.0008, + "step": 4875 + }, + { + "clip_ratio": 0.004040240892209113, + "completion_length": 845.0714569091797, + "epoch": 0.18193181288186186, + "grad_norm": 0.08144980669021606, + "kl": 0.019989013671875, + "learning_rate": 1.0145080370858726e-06, + "loss": -0.0009, + "num_tokens": 113846114.0, + "reward": 0.6595485471189022, + "reward_std": 0.17386498511768878, + "rewards/code_reward": 0.5122270921710879, + "rewards/format_reward": 1.4732142984867096, + "step": 4876 + }, + { + "clip_ratio": 0.004560014000162482, + "epoch": 0.1819691245744135, + "grad_norm": 0.07908791303634644, + "kl": 0.019989013671875, + "learning_rate": 1.0142751033364148e-06, + "loss": -0.0005, + "step": 4877 + }, + { + "clip_ratio": 0.003813510586041957, + "epoch": 0.18200643626696517, + "grad_norm": 0.07611626386642456, + "kl": 0.0201873779296875, + "learning_rate": 1.0140440517137987e-06, + "loss": -0.0009, + "step": 4878 + }, + { + "clip_ratio": 0.005138642736710608, + "completion_length": 752.1428985595703, + "epoch": 0.1820437479595168, + "grad_norm": 0.08839469403028488, + "kl": 0.0204010009765625, + "learning_rate": 1.0138148823149691e-06, + "loss": -0.0082, + "num_tokens": 113921806.0, + "reward": 0.3542364239692688, + "reward_std": 0.2656909376382828, + "rewards/code_reward": 0.2042364238295704, + "rewards/format_reward": 1.5, + "step": 4879 + }, + { + "clip_ratio": 0.005225700268056244, + "epoch": 0.18208105965206847, + "grad_norm": 0.08265320211648941, + "kl": 0.02020263671875, + "learning_rate": 1.0135875952360813e-06, + "loss": -0.0085, + "step": 4880 + }, + { + "clip_ratio": 0.004616454942151904, + "epoch": 0.1821183713446201, + "grad_norm": 0.08435844630002975, + "kl": 0.0207672119140625, + "learning_rate": 1.0133621905725006e-06, + "loss": -0.0085, + "step": 4881 + }, + { + "clip_ratio": 0.004314380930736661, + "completion_length": 672.3214492797852, + "epoch": 0.18215568303717178, + "grad_norm": 0.097336545586586, + "kl": 0.02386474609375, + "learning_rate": 1.0131386684188035e-06, + "loss": -0.0045, + "num_tokens": 113985916.0, + "reward": 0.650123730301857, + "reward_std": 0.2563900426030159, + "rewards/code_reward": 0.5001237131655216, + "rewards/format_reward": 1.5, + "step": 4882 + }, + { + "clip_ratio": 0.004705982748419046, + "epoch": 0.18219299472972342, + "grad_norm": 0.09797197580337524, + "kl": 0.023834228515625, + "learning_rate": 1.0129170288687748e-06, + "loss": -0.0042, + "step": 4883 + }, + { + "clip_ratio": 0.004302834160625935, + "epoch": 0.18223030642227508, + "grad_norm": 0.09594123065471649, + "kl": 0.02459716796875, + "learning_rate": 1.0126972720154102e-06, + "loss": -0.0044, + "step": 4884 + }, + { + "clip_ratio": 0.0028105967794544995, + "completion_length": 650.4286041259766, + "epoch": 0.18226761811482672, + "grad_norm": 0.06516654789447784, + "kl": 0.0202789306640625, + "learning_rate": 1.0124793979509168e-06, + "loss": 0.0252, + "num_tokens": 114049584.0, + "reward": 0.8557864353060722, + "reward_std": 0.2103920802474022, + "rewards/code_reward": 0.7057864181697369, + "rewards/format_reward": 1.5, + "step": 4885 + }, + { + "clip_ratio": 0.002837697451468557, + "epoch": 0.1823049298073784, + "grad_norm": 0.06156042590737343, + "kl": 0.0210418701171875, + "learning_rate": 1.0122634067667093e-06, + "loss": 0.0253, + "step": 4886 + }, + { + "clip_ratio": 0.002621037943754345, + "epoch": 0.18234224149993003, + "grad_norm": 0.060767438262701035, + "kl": 0.0191192626953125, + "learning_rate": 1.0120492985534144e-06, + "loss": 0.0252, + "step": 4887 + }, + { + "clip_ratio": 0.003931874060072005, + "completion_length": 678.9107513427734, + "epoch": 0.1823795531924817, + "grad_norm": 0.08850991725921631, + "kl": 0.0160980224609375, + "learning_rate": 1.0118370734008667e-06, + "loss": -0.0059, + "num_tokens": 114119845.0, + "reward": 0.8097507059574127, + "reward_std": 0.4308738112449646, + "rewards/code_reward": 0.6597506999969482, + "rewards/format_reward": 1.5, + "step": 4888 + }, + { + "clip_ratio": 0.003909885534085333, + "epoch": 0.18241686488503334, + "grad_norm": 0.08814465254545212, + "kl": 0.01605224609375, + "learning_rate": 1.0116267313981138e-06, + "loss": -0.0061, + "step": 4889 + }, + { + "clip_ratio": 0.004307626863010228, + "epoch": 0.182454176577585, + "grad_norm": 0.08783148974180222, + "kl": 0.0160369873046875, + "learning_rate": 1.0114182726334093e-06, + "loss": -0.0057, + "step": 4890 + }, + { + "clip_ratio": 0.0027736101765185595, + "completion_length": 797.1964569091797, + "epoch": 0.18249148827013664, + "grad_norm": 0.04543232545256615, + "kl": 0.02288818359375, + "learning_rate": 1.0112116971942203e-06, + "loss": 0.003, + "num_tokens": 114201714.0, + "reward": 0.46674758940935135, + "reward_std": 0.01329424511641264, + "rewards/code_reward": 0.3167475759983063, + "rewards/format_reward": 1.5, + "step": 4891 + }, + { + "clip_ratio": 0.0031728100148029625, + "epoch": 0.1825287999626883, + "grad_norm": 0.04877093434333801, + "kl": 0.022918701171875, + "learning_rate": 1.011007005167221e-06, + "loss": 0.0031, + "step": 4892 + }, + { + "clip_ratio": 0.002767407801002264, + "epoch": 0.18256611165523995, + "grad_norm": 0.04345519095659256, + "kl": 0.023834228515625, + "learning_rate": 1.0108041966382968e-06, + "loss": 0.0029, + "step": 4893 + }, + { + "clip_ratio": 0.003333454340463504, + "completion_length": 749.0714721679688, + "epoch": 0.18260342334779162, + "grad_norm": 0.05988834425806999, + "kl": 0.0112152099609375, + "learning_rate": 1.0106032716925424e-06, + "loss": 0.0067, + "num_tokens": 114277682.0, + "reward": 0.6006908230483532, + "reward_std": 0.23847627267241478, + "rewards/code_reward": 0.4506908133625984, + "rewards/format_reward": 1.5, + "step": 4894 + }, + { + "clip_ratio": 0.003438986896071583, + "epoch": 0.18264073504034326, + "grad_norm": 0.05722615495324135, + "kl": 0.011138916015625, + "learning_rate": 1.0104042304142622e-06, + "loss": 0.0067, + "step": 4895 + }, + { + "clip_ratio": 0.003409188997466117, + "epoch": 0.18267804673289492, + "grad_norm": 0.05721330642700195, + "kl": 0.0108642578125, + "learning_rate": 1.01020707288697e-06, + "loss": 0.0067, + "step": 4896 + }, + { + "clip_ratio": 0.0033842776902019978, + "completion_length": 686.4643325805664, + "epoch": 0.18271535842544656, + "grad_norm": 0.06836321949958801, + "kl": 0.021087646484375, + "learning_rate": 1.010011799193389e-06, + "loss": 0.0052, + "num_tokens": 114366482.0, + "reward": 0.6919977888464928, + "reward_std": 0.11434380523860455, + "rewards/code_reward": 0.5419977707788348, + "rewards/format_reward": 1.5, + "step": 4897 + }, + { + "clip_ratio": 0.003290527849458158, + "epoch": 0.18275267011799823, + "grad_norm": 0.06427133083343506, + "kl": 0.021026611328125, + "learning_rate": 1.0098184094154534e-06, + "loss": 0.005, + "step": 4898 + }, + { + "clip_ratio": 0.0034161500516347587, + "epoch": 0.18278998181054987, + "grad_norm": 0.06525655835866928, + "kl": 0.021331787109375, + "learning_rate": 1.0096269036343054e-06, + "loss": 0.005, + "step": 4899 + }, + { + "clip_ratio": 0.0034052904811687768, + "completion_length": 790.357177734375, + "epoch": 0.18282729350310153, + "grad_norm": 0.07122056186199188, + "kl": 0.018951416015625, + "learning_rate": 1.0094372819302978e-06, + "loss": 0.0517, + "num_tokens": 114452392.0, + "reward": 0.66787513718009, + "reward_std": 0.1614639637991786, + "rewards/code_reward": 0.5205536894500256, + "rewards/format_reward": 1.4732142984867096, + "step": 4900 + }, + { + "clip_ratio": 0.0033239948097616434, + "epoch": 0.1828646051956532, + "grad_norm": 0.0699150338768959, + "kl": 0.0191192626953125, + "learning_rate": 1.0092495443829912e-06, + "loss": 0.0515, + "step": 4901 + }, + { + "clip_ratio": 0.002921986044384539, + "epoch": 0.18290191688820484, + "grad_norm": 0.07147300243377686, + "kl": 0.019287109375, + "learning_rate": 1.0090636910711579e-06, + "loss": 0.0515, + "step": 4902 + }, + { + "clip_ratio": 0.003541447629686445, + "completion_length": 710.4107513427734, + "epoch": 0.1829392285807565, + "grad_norm": 0.08014116436243057, + "kl": 0.0172576904296875, + "learning_rate": 1.008879722072778e-06, + "loss": 0.0488, + "num_tokens": 114516325.0, + "reward": 0.6638175025582314, + "reward_std": 0.23446135595440865, + "rewards/code_reward": 0.5163174718618393, + "rewards/format_reward": 1.4749999940395355, + "step": 4903 + }, + { + "clip_ratio": 0.003339960239827633, + "epoch": 0.18297654027330815, + "grad_norm": 0.0720963105559349, + "kl": 0.017913818359375, + "learning_rate": 1.0086976374650414e-06, + "loss": 0.0488, + "step": 4904 + }, + { + "clip_ratio": 0.003670533187687397, + "epoch": 0.18301385196585981, + "grad_norm": 0.07613822817802429, + "kl": 0.0171356201171875, + "learning_rate": 1.0085174373243484e-06, + "loss": 0.0488, + "step": 4905 + }, + { + "clip_ratio": 0.0036561418091878295, + "completion_length": 688.0714492797852, + "epoch": 0.18305116365841145, + "grad_norm": 0.11224811524152756, + "kl": 0.0150909423828125, + "learning_rate": 1.008339121726306e-06, + "loss": 0.0058, + "num_tokens": 114590641.0, + "reward": 0.5248037949204445, + "reward_std": 0.3139909077435732, + "rewards/code_reward": 0.3748037740588188, + "rewards/format_reward": 1.5, + "step": 4906 + }, + { + "clip_ratio": 0.003768674621824175, + "epoch": 0.18308847535096312, + "grad_norm": 0.0816093385219574, + "kl": 0.014617919921875, + "learning_rate": 1.0081626907457328e-06, + "loss": 0.0059, + "step": 4907 + }, + { + "clip_ratio": 0.003957038163207471, + "epoch": 0.18312578704351476, + "grad_norm": 0.08258915692567825, + "kl": 0.0155487060546875, + "learning_rate": 1.0079881444566565e-06, + "loss": 0.0058, + "step": 4908 + }, + { + "clip_ratio": 0.003081594652030617, + "completion_length": 712.8393249511719, + "epoch": 0.18316309873606643, + "grad_norm": 0.06267881393432617, + "kl": 0.019622802734375, + "learning_rate": 1.007815482932313e-06, + "loss": -0.0008, + "num_tokens": 114655844.0, + "reward": 0.5882687270641327, + "reward_std": 0.12109949998557568, + "rewards/code_reward": 0.44094728771597147, + "rewards/format_reward": 1.4732142984867096, + "step": 4909 + }, + { + "clip_ratio": 0.0030279442435130477, + "epoch": 0.18320041042861807, + "grad_norm": 0.06327924132347107, + "kl": 0.01959228515625, + "learning_rate": 1.0076447062451483e-06, + "loss": -0.0008, + "step": 4910 + }, + { + "clip_ratio": 0.003101945505477488, + "epoch": 0.18323772212116973, + "grad_norm": 0.062499918043613434, + "kl": 0.0196380615234375, + "learning_rate": 1.0074758144668162e-06, + "loss": -0.0007, + "step": 4911 + }, + { + "clip_ratio": 0.003911683859769255, + "completion_length": 822.6786346435547, + "epoch": 0.18327503381372137, + "grad_norm": 0.05734298750758171, + "kl": 0.0196533203125, + "learning_rate": 1.0073088076681813e-06, + "loss": 0.0589, + "num_tokens": 114736270.0, + "reward": 0.16295108571648598, + "reward_std": 0.023040080443024635, + "rewards/code_reward": 0.015629642526619136, + "rewards/format_reward": 1.4732142984867096, + "step": 4912 + }, + { + "clip_ratio": 0.004162194381933659, + "epoch": 0.18331234550627304, + "grad_norm": 0.05766749009490013, + "kl": 0.019287109375, + "learning_rate": 1.0071436859193169e-06, + "loss": 0.059, + "step": 4913 + }, + { + "clip_ratio": 0.0043776126112788916, + "epoch": 0.18334965719882468, + "grad_norm": 0.056627023965120316, + "kl": 0.0192413330078125, + "learning_rate": 1.0069804492895035e-06, + "loss": 0.059, + "step": 4914 + }, + { + "clip_ratio": 0.002794035361148417, + "completion_length": 734.107177734375, + "epoch": 0.18338696889137635, + "grad_norm": 0.0571884922683239, + "kl": 0.0157012939453125, + "learning_rate": 1.0068190978472339e-06, + "loss": 0.0054, + "num_tokens": 114815306.0, + "reward": 0.5477534644305706, + "reward_std": 0.129465801990591, + "rewards/code_reward": 0.3977534584701061, + "rewards/format_reward": 1.5, + "step": 4915 + }, + { + "clip_ratio": 0.0031783247250132263, + "epoch": 0.18342428058392798, + "grad_norm": 0.05841774120926857, + "kl": 0.0156097412109375, + "learning_rate": 1.0066596316602065e-06, + "loss": 0.0052, + "step": 4916 + }, + { + "clip_ratio": 0.003422824665904045, + "epoch": 0.18346159227647965, + "grad_norm": 0.058034420013427734, + "kl": 0.0154571533203125, + "learning_rate": 1.0065020507953322e-06, + "loss": 0.0052, + "step": 4917 + }, + { + "clip_ratio": 0.0030681847129017115, + "completion_length": 709.3036041259766, + "epoch": 0.1834989039690313, + "grad_norm": 0.0768556222319603, + "kl": 0.022857666015625, + "learning_rate": 1.006346355318728e-06, + "loss": -0.0052, + "num_tokens": 114889117.0, + "reward": 0.4771868512034416, + "reward_std": 0.16491187922656536, + "rewards/code_reward": 0.32986542815342546, + "rewards/format_reward": 1.4732142984867096, + "step": 4918 + }, + { + "clip_ratio": 0.0037788626505061984, + "epoch": 0.18353621566158296, + "grad_norm": 0.07673569768667221, + "kl": 0.0218963623046875, + "learning_rate": 1.0061925452957203e-06, + "loss": -0.005, + "step": 4919 + }, + { + "clip_ratio": 0.0037363249575719237, + "epoch": 0.1835735273541346, + "grad_norm": 0.0756516307592392, + "kl": 0.022674560546875, + "learning_rate": 1.0060406207908464e-06, + "loss": -0.0053, + "step": 4920 + }, + { + "clip_ratio": 0.004271632875315845, + "completion_length": 639.8928756713867, + "epoch": 0.18361083904668626, + "grad_norm": 0.08059495687484741, + "kl": 0.025970458984375, + "learning_rate": 1.0058905818678496e-06, + "loss": 0.0032, + "num_tokens": 114957179.0, + "reward": 0.7957517728209496, + "reward_std": 0.25458812713623047, + "rewards/code_reward": 0.6457517817616463, + "rewards/format_reward": 1.5, + "step": 4921 + }, + { + "clip_ratio": 0.0036227820673957467, + "epoch": 0.1836481507392379, + "grad_norm": 0.07979083806276321, + "kl": 0.026611328125, + "learning_rate": 1.0057424285896841e-06, + "loss": 0.0027, + "step": 4922 + }, + { + "clip_ratio": 0.0031813065288588405, + "epoch": 0.18368546243178957, + "grad_norm": 0.07822716981172562, + "kl": 0.026397705078125, + "learning_rate": 1.0055961610185122e-06, + "loss": 0.0026, + "step": 4923 + }, + { + "clip_ratio": 0.004431177396327257, + "completion_length": 553.4107437133789, + "epoch": 0.1837227741243412, + "grad_norm": 0.0840197429060936, + "kl": 0.01690673828125, + "learning_rate": 1.0054517792157046e-06, + "loss": 0.0149, + "num_tokens": 115017318.0, + "reward": 0.5841785557568073, + "reward_std": 0.16807065717875957, + "rewards/code_reward": 0.43417854607105255, + "rewards/format_reward": 1.5, + "step": 4924 + }, + { + "clip_ratio": 0.004129382141400129, + "epoch": 0.18376008581689288, + "grad_norm": 0.09216003119945526, + "kl": 0.017333984375, + "learning_rate": 1.0053092832418421e-06, + "loss": 0.0146, + "step": 4925 + }, + { + "clip_ratio": 0.004773347289301455, + "epoch": 0.18379739750944452, + "grad_norm": 0.0809362381696701, + "kl": 0.017364501953125, + "learning_rate": 1.0051686731567122e-06, + "loss": 0.015, + "step": 4926 + }, + { + "clip_ratio": 0.004686727654188871, + "completion_length": 704.8571624755859, + "epoch": 0.18383470920199618, + "grad_norm": 0.0821441039443016, + "kl": 0.0205078125, + "learning_rate": 1.0050299490193134e-06, + "loss": -0.0017, + "num_tokens": 115097558.0, + "reward": 0.458490289747715, + "reward_std": 0.32918864488601685, + "rewards/code_reward": 0.3084902912378311, + "rewards/format_reward": 1.5, + "step": 4927 + }, + { + "clip_ratio": 0.0048555375542491674, + "epoch": 0.18387202089454782, + "grad_norm": 0.08260658383369446, + "kl": 0.020782470703125, + "learning_rate": 1.0048931108878516e-06, + "loss": -0.0019, + "step": 4928 + }, + { + "clip_ratio": 0.005010454449802637, + "epoch": 0.1839093325870995, + "grad_norm": 0.0799916684627533, + "kl": 0.020538330078125, + "learning_rate": 1.004758158819741e-06, + "loss": -0.0018, + "step": 4929 + }, + { + "clip_ratio": 0.0037417663843370974, + "completion_length": 848.0893249511719, + "epoch": 0.18394664427965113, + "grad_norm": 0.06993139535188675, + "kl": 0.020355224609375, + "learning_rate": 1.0046250928716052e-06, + "loss": 0.0344, + "num_tokens": 115177391.0, + "reward": 0.46495118737220764, + "reward_std": 0.24341906420886517, + "rewards/code_reward": 0.31495115254074335, + "rewards/format_reward": 1.5, + "step": 4930 + }, + { + "clip_ratio": 0.004199851304292679, + "epoch": 0.1839839559722028, + "grad_norm": 0.1621919870376587, + "kl": 0.019866943359375, + "learning_rate": 1.0044939130992762e-06, + "loss": 0.0347, + "step": 4931 + }, + { + "clip_ratio": 0.004231809638440609, + "epoch": 0.18402126766475443, + "grad_norm": 0.07420668005943298, + "kl": 0.020111083984375, + "learning_rate": 1.0043646195577949e-06, + "loss": 0.0344, + "step": 4932 + }, + { + "clip_ratio": 0.0019523465889506042, + "completion_length": 759.5714721679688, + "epoch": 0.1840585793573061, + "grad_norm": 0.05471804365515709, + "kl": 0.0150299072265625, + "learning_rate": 1.0042372123014098e-06, + "loss": -0.0149, + "num_tokens": 115247395.0, + "reward": 0.772981833666563, + "reward_std": 0.030557405203580856, + "rewards/code_reward": 0.6229818626306951, + "rewards/format_reward": 1.5, + "step": 4933 + }, + { + "clip_ratio": 0.0017043753759935498, + "epoch": 0.18409589104985774, + "grad_norm": 0.054934605956077576, + "kl": 0.01495361328125, + "learning_rate": 1.0041116913835797e-06, + "loss": -0.0148, + "step": 4934 + }, + { + "clip_ratio": 0.0017706865910440683, + "epoch": 0.1841332027424094, + "grad_norm": 0.05356956273317337, + "kl": 0.014862060546875, + "learning_rate": 1.0039880568569698e-06, + "loss": -0.0149, + "step": 4935 + }, + { + "clip_ratio": 0.004551880003418773, + "completion_length": 985.8393402099609, + "epoch": 0.18417051443496105, + "grad_norm": 0.07479742914438248, + "kl": 0.0227203369140625, + "learning_rate": 1.0038663087734552e-06, + "loss": 0.0094, + "num_tokens": 115336664.0, + "reward": 0.38822659105062485, + "reward_std": 0.07877579377964139, + "rewards/code_reward": 0.23822657321579754, + "rewards/format_reward": 1.5, + "step": 4936 + }, + { + "clip_ratio": 0.004923534987028688, + "epoch": 0.1842078261275127, + "grad_norm": 0.07315382361412048, + "kl": 0.02288818359375, + "learning_rate": 1.0037464471841192e-06, + "loss": 0.0094, + "step": 4937 + }, + { + "clip_ratio": 0.003982328227721155, + "epoch": 0.18424513782006435, + "grad_norm": 0.07364225387573242, + "kl": 0.02313232421875, + "learning_rate": 1.0036284721392534e-06, + "loss": 0.009, + "step": 4938 + }, + { + "clip_ratio": 0.003308701445348561, + "completion_length": 687.4821853637695, + "epoch": 0.18428244951261602, + "grad_norm": 0.07426559925079346, + "kl": 0.02178955078125, + "learning_rate": 1.0035123836883578e-06, + "loss": -0.0042, + "num_tokens": 115411871.0, + "reward": 0.6797149479389191, + "reward_std": 0.05540906242094934, + "rewards/code_reward": 0.529714931268245, + "rewards/format_reward": 1.5, + "step": 4939 + }, + { + "clip_ratio": 0.003263940045144409, + "epoch": 0.18431976120516766, + "grad_norm": 0.07578637450933456, + "kl": 0.021209716796875, + "learning_rate": 1.0033981818801418e-06, + "loss": -0.0044, + "step": 4940 + }, + { + "clip_ratio": 0.0033825082355178893, + "epoch": 0.18435707289771933, + "grad_norm": 0.07498207688331604, + "kl": 0.02166748046875, + "learning_rate": 1.0032858667625208e-06, + "loss": -0.0045, + "step": 4941 + }, + { + "clip_ratio": 0.0030744560644961894, + "completion_length": 749.232177734375, + "epoch": 0.18439438459027097, + "grad_norm": 0.0691540315747261, + "kl": 0.023193359375, + "learning_rate": 1.0031754383826218e-06, + "loss": 0.0069, + "num_tokens": 115481002.0, + "reward": 0.5372497886419296, + "reward_std": 0.13722421135753393, + "rewards/code_reward": 0.38724977895617485, + "rewards/format_reward": 1.5, + "step": 4942 + }, + { + "clip_ratio": 0.003357070905622095, + "epoch": 0.18443169628282263, + "grad_norm": 0.06940356642007828, + "kl": 0.0241851806640625, + "learning_rate": 1.0030668967867773e-06, + "loss": 0.0069, + "step": 4943 + }, + { + "clip_ratio": 0.0031604914111085236, + "epoch": 0.18446900797537427, + "grad_norm": 0.06819117069244385, + "kl": 0.02337646484375, + "learning_rate": 1.0029602420205301e-06, + "loss": 0.0068, + "step": 4944 + }, + { + "clip_ratio": 0.002952014096081257, + "completion_length": 519.6428756713867, + "epoch": 0.18450631966792594, + "grad_norm": 0.095924511551857, + "kl": 0.0321197509765625, + "learning_rate": 1.0028554741286302e-06, + "loss": 0.0143, + "num_tokens": 115534714.0, + "reward": 0.8257771097123623, + "reward_std": 0.09812801552470773, + "rewards/code_reward": 0.6757771087286528, + "rewards/format_reward": 1.5, + "step": 4945 + }, + { + "clip_ratio": 0.002912824274972081, + "epoch": 0.18454363136047758, + "grad_norm": 0.06836595386266708, + "kl": 0.031585693359375, + "learning_rate": 1.0027525931550363e-06, + "loss": 0.0145, + "step": 4946 + }, + { + "clip_ratio": 0.0029186720494180918, + "epoch": 0.18458094305302924, + "grad_norm": 0.06913241744041443, + "kl": 0.0321044921875, + "learning_rate": 1.002651599142915e-06, + "loss": 0.0144, + "step": 4947 + }, + { + "clip_ratio": 0.005691418133210391, + "completion_length": 616.732177734375, + "epoch": 0.18461825474558088, + "grad_norm": 0.26815712451934814, + "kl": 0.1652679443359375, + "learning_rate": 1.0025524921346424e-06, + "loss": 0.0095, + "num_tokens": 115605861.0, + "reward": 0.9088505208492279, + "reward_std": 0.2100351294502616, + "rewards/code_reward": 0.758850485086441, + "rewards/format_reward": 1.5, + "step": 4948 + }, + { + "clip_ratio": 0.005330817133653909, + "epoch": 0.18465556643813255, + "grad_norm": 0.2778048515319824, + "kl": 0.123748779296875, + "learning_rate": 1.0024552721718008e-06, + "loss": 0.009, + "step": 4949 + }, + { + "clip_ratio": 0.00533178640762344, + "epoch": 0.1846928781306842, + "grad_norm": 0.2701621651649475, + "kl": 0.123748779296875, + "learning_rate": 1.002359939295183e-06, + "loss": 0.0088, + "step": 4950 + }, + { + "clip_ratio": 0.0019961066136602312, + "completion_length": 615.7857437133789, + "epoch": 0.18473018982323586, + "grad_norm": 0.018957726657390594, + "kl": 0.01631927490234375, + "learning_rate": 1.0022664935447883e-06, + "loss": 0.0025, + "num_tokens": 115679307.0, + "reward": 0.8540311269462109, + "reward_std": 0.05159993842244148, + "rewards/code_reward": 0.7040311098098755, + "rewards/format_reward": 1.5, + "step": 4951 + }, + { + "clip_ratio": 0.002000346838030964, + "epoch": 0.1847675015157875, + "grad_norm": 0.01643308810889721, + "kl": 0.016082763671875, + "learning_rate": 1.0021749349598245e-06, + "loss": 0.0025, + "step": 4952 + }, + { + "clip_ratio": 0.002322339336387813, + "epoch": 0.18480481320833916, + "grad_norm": 0.01861509308218956, + "kl": 0.0156097412109375, + "learning_rate": 1.0020852635787091e-06, + "loss": 0.0025, + "step": 4953 + }, + { + "clip_ratio": 0.004314250487368554, + "completion_length": 643.7500305175781, + "epoch": 0.1848421249008908, + "grad_norm": 0.08405992388725281, + "kl": 0.020233154296875, + "learning_rate": 1.0019974794390652e-06, + "loss": -0.0023, + "num_tokens": 115742139.0, + "reward": 0.5830778181552887, + "reward_std": 0.20340330433100462, + "rewards/code_reward": 0.4330777742434293, + "rewards/format_reward": 1.5, + "step": 4954 + }, + { + "clip_ratio": 0.004343165084719658, + "epoch": 0.18487943659344247, + "grad_norm": 0.1721213459968567, + "kl": 0.020538330078125, + "learning_rate": 1.001911582577726e-06, + "loss": -0.0024, + "step": 4955 + }, + { + "clip_ratio": 0.004115457704756409, + "epoch": 0.18491674828599414, + "grad_norm": 0.10780323296785355, + "kl": 0.01995849609375, + "learning_rate": 1.0018275730307325e-06, + "loss": -0.0027, + "step": 4956 + }, + { + "clip_ratio": 0.0030218286556191742, + "completion_length": 533.1785888671875, + "epoch": 0.18495405997854578, + "grad_norm": 0.08497894555330276, + "kl": 0.0206146240234375, + "learning_rate": 1.0017454508333326e-06, + "loss": 0.0249, + "num_tokens": 115798765.0, + "reward": 0.7773809656500816, + "reward_std": 0.12030485272407532, + "rewards/code_reward": 0.6273809429258108, + "rewards/format_reward": 1.5, + "step": 4957 + }, + { + "clip_ratio": 0.003142401808872819, + "epoch": 0.18499137167109744, + "grad_norm": 0.08545269817113876, + "kl": 0.0209197998046875, + "learning_rate": 1.0016652160199844e-06, + "loss": 0.025, + "step": 4958 + }, + { + "clip_ratio": 0.002992764988448471, + "epoch": 0.18502868336364908, + "grad_norm": 0.08345384150743484, + "kl": 0.021484375, + "learning_rate": 1.0015868686243523e-06, + "loss": 0.0249, + "step": 4959 + }, + { + "clip_ratio": 0.0032376607414335012, + "completion_length": 721.9821624755859, + "epoch": 0.18506599505620075, + "grad_norm": 0.07233951985836029, + "kl": 0.017974853515625, + "learning_rate": 1.0015104086793093e-06, + "loss": 0.0049, + "num_tokens": 115868302.0, + "reward": 0.8953630924224854, + "reward_std": 0.23180746659636497, + "rewards/code_reward": 0.7453630492091179, + "rewards/format_reward": 1.5, + "step": 4960 + }, + { + "clip_ratio": 0.0032881583319976926, + "epoch": 0.1851033067487524, + "grad_norm": 0.07604499161243439, + "kl": 0.017913818359375, + "learning_rate": 1.0014358362169366e-06, + "loss": 0.0049, + "step": 4961 + }, + { + "clip_ratio": 0.0034919320605695248, + "epoch": 0.18514061844130406, + "grad_norm": 0.07082372158765793, + "kl": 0.017578125, + "learning_rate": 1.0013631512685242e-06, + "loss": 0.0047, + "step": 4962 + }, + { + "clip_ratio": 0.0025317854015156627, + "completion_length": 585.9464569091797, + "epoch": 0.1851779301338557, + "grad_norm": 0.058489762246608734, + "kl": 0.0118865966796875, + "learning_rate": 1.0012923538645684e-06, + "loss": 0.0257, + "num_tokens": 115932699.0, + "reward": 0.4925239570438862, + "reward_std": 0.1584053337574005, + "rewards/code_reward": 0.34252396831288934, + "rewards/format_reward": 1.5, + "step": 4963 + }, + { + "clip_ratio": 0.002567778341472149, + "epoch": 0.18521524182640736, + "grad_norm": 0.057969700545072556, + "kl": 0.01177978515625, + "learning_rate": 1.001223444034775e-06, + "loss": 0.0257, + "step": 4964 + }, + { + "clip_ratio": 0.0024730670265853405, + "epoch": 0.185252553518959, + "grad_norm": 0.0535825751721859, + "kl": 0.0120697021484375, + "learning_rate": 1.001156421808057e-06, + "loss": 0.0256, + "step": 4965 + }, + { + "clip_ratio": 0.0014569707564078271, + "completion_length": 559.8571624755859, + "epoch": 0.18528986521151067, + "grad_norm": 0.050564173609018326, + "kl": 0.014129638671875, + "learning_rate": 1.0010912872125353e-06, + "loss": -0.0146, + "num_tokens": 115998449.0, + "reward": 1.0949013084173203, + "reward_std": 0.10951147205196321, + "rewards/code_reward": 0.9449013322591782, + "rewards/format_reward": 1.5, + "step": 4966 + }, + { + "clip_ratio": 0.0012692722375504673, + "epoch": 0.1853271769040623, + "grad_norm": 0.05205455794930458, + "kl": 0.013427734375, + "learning_rate": 1.0010280402755401e-06, + "loss": -0.0145, + "step": 4967 + }, + { + "clip_ratio": 0.0016115647740662098, + "epoch": 0.18536448859661397, + "grad_norm": 0.052645325660705566, + "kl": 0.01373291015625, + "learning_rate": 1.0009666810236083e-06, + "loss": -0.0146, + "step": 4968 + }, + { + "clip_ratio": 0.0026192901423200965, + "completion_length": 511.7143020629883, + "epoch": 0.1854018002891656, + "grad_norm": 0.06825586408376694, + "kl": 0.015411376953125, + "learning_rate": 1.000907209482485e-06, + "loss": 0.01, + "num_tokens": 116049449.0, + "reward": 0.9080106765031815, + "reward_std": 0.0923470463603735, + "rewards/code_reward": 0.7580106947571039, + "rewards/format_reward": 1.5, + "step": 4969 + }, + { + "clip_ratio": 0.001910280087031424, + "epoch": 0.18543911198171728, + "grad_norm": 0.050389472395181656, + "kl": 0.015228271484375, + "learning_rate": 1.0008496256771236e-06, + "loss": 0.0099, + "step": 4970 + }, + { + "clip_ratio": 0.001722924062050879, + "epoch": 0.18547642367426892, + "grad_norm": 0.04864838719367981, + "kl": 0.01519775390625, + "learning_rate": 1.0007939296316843e-06, + "loss": 0.0097, + "step": 4971 + }, + { + "clip_ratio": 0.004301126697100699, + "completion_length": 696.8393096923828, + "epoch": 0.1855137353668206, + "grad_norm": 0.07877504825592041, + "kl": 0.016876220703125, + "learning_rate": 1.0007401213695374e-06, + "loss": 0.0042, + "num_tokens": 116120042.0, + "reward": 0.7373362295329571, + "reward_std": 0.15679777413606644, + "rewards/code_reward": 0.5873362123966217, + "rewards/format_reward": 1.5, + "step": 4972 + }, + { + "clip_ratio": 0.004592489916831255, + "epoch": 0.18555104705937223, + "grad_norm": 0.07799327373504639, + "kl": 0.016937255859375, + "learning_rate": 1.0006882009132586e-06, + "loss": 0.0043, + "step": 4973 + }, + { + "clip_ratio": 0.004743017314467579, + "epoch": 0.1855883587519239, + "grad_norm": 0.07951816916465759, + "kl": 0.0167236328125, + "learning_rate": 1.000638168284634e-06, + "loss": 0.0043, + "step": 4974 + }, + { + "clip_ratio": 0.004440537479240447, + "completion_length": 556.803596496582, + "epoch": 0.18562567044447553, + "grad_norm": 0.18213476240634918, + "kl": 0.0169830322265625, + "learning_rate": 1.0005900235046552e-06, + "loss": -0.0052, + "num_tokens": 116173063.0, + "reward": 0.7350588850677013, + "reward_std": 0.14585820399224758, + "rewards/code_reward": 0.5850588828325272, + "rewards/format_reward": 1.5, + "step": 4975 + }, + { + "clip_ratio": 0.005361497518606484, + "epoch": 0.1856629821370272, + "grad_norm": 0.09917157143354416, + "kl": 0.0166778564453125, + "learning_rate": 1.0005437665935238e-06, + "loss": -0.0048, + "step": 4976 + }, + { + "clip_ratio": 0.004729320527985692, + "epoch": 0.18570029382957884, + "grad_norm": 0.08542672544717789, + "kl": 0.0165557861328125, + "learning_rate": 1.000499397570648e-06, + "loss": -0.0052, + "step": 4977 + }, + { + "clip_ratio": 0.0022681248374283314, + "completion_length": 848.6250457763672, + "epoch": 0.1857376055221305, + "grad_norm": 0.07983025163412094, + "kl": 0.01654052734375, + "learning_rate": 1.000456916454644e-06, + "loss": 0.0399, + "num_tokens": 116263216.0, + "reward": 0.7949971668422222, + "reward_std": 0.19602136965841055, + "rewards/code_reward": 0.6476757377386093, + "rewards/format_reward": 1.4732142984867096, + "step": 4978 + }, + { + "clip_ratio": 0.0023708015214651823, + "epoch": 0.18577491721468214, + "grad_norm": 0.07230312377214432, + "kl": 0.0163421630859375, + "learning_rate": 1.0004163232633362e-06, + "loss": 0.04, + "step": 4979 + }, + { + "clip_ratio": 0.002053675998467952, + "epoch": 0.1858122289072338, + "grad_norm": 0.06203877553343773, + "kl": 0.0162506103515625, + "learning_rate": 1.0003776180137568e-06, + "loss": 0.0397, + "step": 4980 + }, + { + "clip_ratio": 0.0024821070255711675, + "completion_length": 692.3393249511719, + "epoch": 0.18584954059978545, + "grad_norm": 0.05836297944188118, + "kl": 0.0169830322265625, + "learning_rate": 1.000340800722146e-06, + "loss": 0.0235, + "num_tokens": 116330071.0, + "reward": 0.8121492192149162, + "reward_std": 0.18565012514591217, + "rewards/code_reward": 0.6621492393314838, + "rewards/format_reward": 1.5, + "step": 4981 + }, + { + "clip_ratio": 0.002549616154283285, + "epoch": 0.18588685229233712, + "grad_norm": 0.06253621727228165, + "kl": 0.0160675048828125, + "learning_rate": 1.000305871403951e-06, + "loss": 0.0235, + "step": 4982 + }, + { + "clip_ratio": 0.0027376218931749463, + "epoch": 0.18592416398488876, + "grad_norm": 0.06281345337629318, + "kl": 0.0165252685546875, + "learning_rate": 1.0002728300738281e-06, + "loss": 0.0235, + "step": 4983 + }, + { + "clip_ratio": 0.003339278628118336, + "completion_length": 624.5714569091797, + "epoch": 0.18596147567744042, + "grad_norm": 0.07717486470937729, + "kl": 0.0171966552734375, + "learning_rate": 1.000241676745641e-06, + "loss": -0.0289, + "num_tokens": 116392263.0, + "reward": 0.6330492421984673, + "reward_std": 0.14427792094647884, + "rewards/code_reward": 0.48304921854287386, + "rewards/format_reward": 1.5, + "step": 4984 + }, + { + "clip_ratio": 0.003688159747980535, + "epoch": 0.18599878736999206, + "grad_norm": 0.07578425854444504, + "kl": 0.0169677734375, + "learning_rate": 1.0002124114324607e-06, + "loss": -0.029, + "step": 4985 + }, + { + "clip_ratio": 0.00351711199618876, + "epoch": 0.18603609906254373, + "grad_norm": 0.08318106085062027, + "kl": 0.0172119140625, + "learning_rate": 1.0001850341465658e-06, + "loss": -0.029, + "step": 4986 + }, + { + "clip_ratio": 0.004468280414585024, + "completion_length": 638.2678985595703, + "epoch": 0.18607341075509537, + "grad_norm": 0.09646852314472198, + "kl": 0.024688720703125, + "learning_rate": 1.0001595448994443e-06, + "loss": 0.0163, + "num_tokens": 116476376.0, + "reward": 0.4418463669717312, + "reward_std": 0.18572973366826773, + "rewards/code_reward": 0.2918463461101055, + "rewards/format_reward": 1.5, + "step": 4987 + }, + { + "clip_ratio": 0.004534912528470159, + "epoch": 0.18611072244764704, + "grad_norm": 0.07013077288866043, + "kl": 0.024627685546875, + "learning_rate": 1.0001359437017914e-06, + "loss": 0.0166, + "step": 4988 + }, + { + "clip_ratio": 0.00484630634309724, + "epoch": 0.18614803414019868, + "grad_norm": 0.06935515254735947, + "kl": 0.02459716796875, + "learning_rate": 1.0001142305635079e-06, + "loss": 0.0166, + "step": 4989 + }, + { + "clip_ratio": 0.002930548507720232, + "completion_length": 765.0893249511719, + "epoch": 0.18618534583275034, + "grad_norm": 0.07058601826429367, + "kl": 0.019866943359375, + "learning_rate": 1.0000944054937055e-06, + "loss": 0.0038, + "num_tokens": 116555089.0, + "reward": 0.6718254014849663, + "reward_std": 0.2947990819811821, + "rewards/code_reward": 0.5218253992497921, + "rewards/format_reward": 1.5, + "step": 4990 + }, + { + "clip_ratio": 0.0029079223168082535, + "epoch": 0.18622265752530198, + "grad_norm": 0.07306607067584991, + "kl": 0.0197906494140625, + "learning_rate": 1.0000764685007027e-06, + "loss": 0.0036, + "step": 4991 + }, + { + "clip_ratio": 0.0029600639827549458, + "epoch": 0.18625996921785365, + "grad_norm": 0.07973650097846985, + "kl": 0.019683837890625, + "learning_rate": 1.0000604195920246e-06, + "loss": 0.0036, + "step": 4992 + }, + { + "clip_ratio": 0.0025857315049506724, + "completion_length": 749.1428833007812, + "epoch": 0.1862972809104053, + "grad_norm": 0.04351843148469925, + "kl": 0.017974853515625, + "learning_rate": 1.0000462587744056e-06, + "loss": -0.0023, + "num_tokens": 116631887.0, + "reward": 0.6142857186496258, + "reward_std": 0.21797148883342743, + "rewards/code_reward": 0.4642857313156128, + "rewards/format_reward": 1.5, + "step": 4993 + }, + { + "clip_ratio": 0.002446506347041577, + "epoch": 0.18633459260295696, + "grad_norm": 0.04352651536464691, + "kl": 0.0176544189453125, + "learning_rate": 1.000033986053787e-06, + "loss": -0.0023, + "step": 4994 + }, + { + "clip_ratio": 0.0026354716392233968, + "epoch": 0.1863719042955086, + "grad_norm": 0.04488508403301239, + "kl": 0.0178070068359375, + "learning_rate": 1.0000236014353187e-06, + "loss": -0.0022, + "step": 4995 + }, + { + "clip_ratio": 0.0030114605906419456, + "completion_length": 612.232177734375, + "epoch": 0.18640921598806026, + "grad_norm": 0.05294683948159218, + "kl": 0.0237274169921875, + "learning_rate": 1.0000151049233573e-06, + "loss": 0.0018, + "num_tokens": 116697644.0, + "reward": 0.658928569406271, + "reward_std": 0.2095615118741989, + "rewards/code_reward": 0.5089285746216774, + "rewards/format_reward": 1.5, + "step": 4996 + }, + { + "clip_ratio": 0.0031113256700336933, + "epoch": 0.1864465276806119, + "grad_norm": 0.05228548124432564, + "kl": 0.0236053466796875, + "learning_rate": 1.000008496521468e-06, + "loss": 0.0018, + "step": 4997 + }, + { + "clip_ratio": 0.00333787762792781, + "epoch": 0.18648383937316357, + "grad_norm": 0.052448466420173645, + "kl": 0.0235748291015625, + "learning_rate": 1.0000037762324238e-06, + "loss": 0.0019, + "step": 4998 + }, + { + "clip_ratio": 0.0050649072509258986, + "completion_length": 666.857177734375, + "epoch": 0.1865211510657152, + "grad_norm": 0.05883262678980827, + "kl": 0.020599365234375, + "learning_rate": 1.0000009440582051e-06, + "loss": -0.0157, + "num_tokens": 116763814.0, + "reward": 0.2775028795003891, + "reward_std": 0.02030420838855207, + "rewards/code_reward": 0.12750287086237222, + "rewards/format_reward": 1.5, + "step": 4999 + }, + { + "clip_ratio": 0.005517961108125746, + "epoch": 0.18655846275826687, + "grad_norm": 0.057221852242946625, + "kl": 0.020721435546875, + "learning_rate": 1.0000000000000002e-06, + "loss": -0.0158, + "step": 5000 + }, + { + "epoch": 0.18655846275826687, + "step": 5000, + "total_flos": 0.0, + "train_loss": 1480.463632439661, + "train_runtime": 448707.0421, + "train_samples_per_second": 0.624, + "train_steps_per_second": 0.011 + } + ], + "logging_steps": 1, + "max_steps": 5000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}