{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.875912408759124,
"eval_steps": 100,
"global_step": 51,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 29.870370149612427,
"epoch": 0.058394160583941604,
"grad_norm": 0.5967049598693848,
"kl": 0.0,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0,
"reward": 0.526620376855135,
"reward_std": 0.372432217001915,
"rewards/semantic_entropy": 0.526620376855135,
"step": 1
},
{
"completion_length": 26.355324506759644,
"epoch": 0.11678832116788321,
"grad_norm": 0.27608758211135864,
"kl": 0.0,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0,
"reward": 0.6192129664123058,
"reward_std": 0.3255546223372221,
"rewards/semantic_entropy": 0.6192129664123058,
"step": 2
},
{
"completion_length": 31.85300898551941,
"epoch": 0.17518248175182483,
"grad_norm": 0.3493832051753998,
"kl": 0.0005927085876464844,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 0.49884259700775146,
"reward_std": 0.36885398998856544,
"rewards/semantic_entropy": 0.49884259700775146,
"step": 3
},
{
"completion_length": 27.081018686294556,
"epoch": 0.23357664233576642,
"grad_norm": 0.8119116425514221,
"kl": 0.0008134841918945312,
"learning_rate": 1.3333333333333332e-06,
"loss": 0.0,
"reward": 0.6111111231148243,
"reward_std": 0.3842464517802,
"rewards/semantic_entropy": 0.6111111231148243,
"step": 4
},
{
"completion_length": 30.222222089767456,
"epoch": 0.291970802919708,
"grad_norm": 0.5014259815216064,
"kl": 0.0007073879241943359,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0,
"reward": 0.6006944514811039,
"reward_std": 0.3485470600426197,
"rewards/semantic_entropy": 0.6006944514811039,
"step": 5
},
{
"completion_length": 29.46759271621704,
"epoch": 0.35036496350364965,
"grad_norm": 0.5462167859077454,
"kl": 0.0006594657897949219,
"learning_rate": 2e-06,
"loss": 0.0,
"reward": 0.5115740709006786,
"reward_std": 0.3930557183921337,
"rewards/semantic_entropy": 0.5115740709006786,
"step": 6
},
{
"completion_length": 30.0775465965271,
"epoch": 0.40875912408759124,
"grad_norm": 0.3974973261356354,
"kl": 0.0009469985961914062,
"learning_rate": 1.997564050259824e-06,
"loss": 0.0,
"reward": 0.6053240820765495,
"reward_std": 0.3901033569127321,
"rewards/semantic_entropy": 0.6053240820765495,
"step": 7
},
{
"completion_length": 30.643518686294556,
"epoch": 0.46715328467153283,
"grad_norm": 0.4026181399822235,
"kl": 0.0013263225555419922,
"learning_rate": 1.99026806874157e-06,
"loss": 0.0001,
"reward": 0.5821759402751923,
"reward_std": 0.3471612483263016,
"rewards/semantic_entropy": 0.5821759402751923,
"step": 8
},
{
"completion_length": 36.761574029922485,
"epoch": 0.5255474452554745,
"grad_norm": 0.46980857849121094,
"kl": 0.0016498565673828125,
"learning_rate": 1.9781476007338054e-06,
"loss": 0.0001,
"reward": 0.4861111231148243,
"reward_std": 0.43300675973296165,
"rewards/semantic_entropy": 0.4861111231148243,
"step": 9
},
{
"completion_length": 32.00000071525574,
"epoch": 0.583941605839416,
"grad_norm": 0.3003987669944763,
"kl": 0.0032224655151367188,
"learning_rate": 1.9612616959383188e-06,
"loss": 0.0001,
"reward": 0.5763888917863369,
"reward_std": 0.3751811906695366,
"rewards/semantic_entropy": 0.5763888917863369,
"step": 10
},
{
"completion_length": 26.78703737258911,
"epoch": 0.6423357664233577,
"grad_norm": 0.6452711224555969,
"kl": 0.007064342498779297,
"learning_rate": 1.9396926207859082e-06,
"loss": 0.0003,
"reward": 0.6087963059544563,
"reward_std": 0.3837307542562485,
"rewards/semantic_entropy": 0.6087963059544563,
"step": 11
},
{
"completion_length": 28.517361164093018,
"epoch": 0.7007299270072993,
"grad_norm": 0.5347678065299988,
"kl": 0.011034965515136719,
"learning_rate": 1.9135454576426007e-06,
"loss": 0.0004,
"reward": 0.5949074178934097,
"reward_std": 0.38593913801014423,
"rewards/semantic_entropy": 0.5949074178934097,
"step": 12
},
{
"completion_length": 32.70833349227905,
"epoch": 0.7591240875912408,
"grad_norm": 0.3103489875793457,
"kl": 0.006297111511230469,
"learning_rate": 1.8829475928589268e-06,
"loss": 0.0003,
"reward": 0.5405092723667622,
"reward_std": 0.3787935618311167,
"rewards/semantic_entropy": 0.5405092723667622,
"step": 13
},
{
"completion_length": 28.361111402511597,
"epoch": 0.8175182481751825,
"grad_norm": 0.341845840215683,
"kl": 0.011461257934570312,
"learning_rate": 1.8480480961564257e-06,
"loss": 0.0005,
"reward": 0.5798611156642437,
"reward_std": 0.37119201570749283,
"rewards/semantic_entropy": 0.5798611156642437,
"step": 14
},
{
"completion_length": 27.28703737258911,
"epoch": 0.8759124087591241,
"grad_norm": 0.3116164803504944,
"kl": 0.00528717041015625,
"learning_rate": 1.8090169943749474e-06,
"loss": 0.0002,
"reward": 0.5891203731298447,
"reward_std": 0.39476561546325684,
"rewards/semantic_entropy": 0.5891203731298447,
"step": 15
},
{
"completion_length": 27.549768924713135,
"epoch": 0.9343065693430657,
"grad_norm": 0.4672207236289978,
"kl": 0.012537002563476562,
"learning_rate": 1.766044443118978e-06,
"loss": 0.0005,
"reward": 0.6006944589316845,
"reward_std": 0.3798612989485264,
"rewards/semantic_entropy": 0.6006944589316845,
"step": 16
},
{
"completion_length": 27.44560170173645,
"epoch": 0.9927007299270073,
"grad_norm": 0.4451679587364197,
"kl": 0.026611328125,
"learning_rate": 1.719339800338651e-06,
"loss": 0.0011,
"reward": 0.6435185112059116,
"reward_std": 0.36070936545729637,
"rewards/semantic_entropy": 0.6435185112059116,
"step": 17
},
{
"completion_length": 16.0,
"epoch": 1.0,
"grad_norm": 0.4451679587364197,
"kl": 0.08837890625,
"learning_rate": 1.669130606358858e-06,
"loss": 0.0001,
"reward": 1.0,
"reward_std": 0.0,
"rewards/semantic_entropy": 1.0,
"step": 18
},
{
"completion_length": 24.4120374917984,
"epoch": 1.0583941605839415,
"grad_norm": 0.3066134452819824,
"kl": 0.04150390625,
"learning_rate": 1.615661475325658e-06,
"loss": 0.0017,
"reward": 0.6423611082136631,
"reward_std": 0.3426077160984278,
"rewards/semantic_entropy": 0.6423611082136631,
"step": 19
},
{
"completion_length": 24.349537134170532,
"epoch": 1.1167883211678833,
"grad_norm": 0.4213350713253021,
"kl": 0.0197906494140625,
"learning_rate": 1.5591929034707466e-06,
"loss": 0.0008,
"reward": 0.6168981604278088,
"reward_std": 0.35933491215109825,
"rewards/semantic_entropy": 0.6168981604278088,
"step": 20
},
{
"completion_length": 24.63078737258911,
"epoch": 1.1751824817518248,
"grad_norm": 0.35187050700187683,
"kl": 0.03948211669921875,
"learning_rate": 1.5e-06,
"loss": 0.0016,
"reward": 0.6712963134050369,
"reward_std": 0.3001057803630829,
"rewards/semantic_entropy": 0.6712963134050369,
"step": 21
},
{
"completion_length": 23.54745364189148,
"epoch": 1.2335766423357664,
"grad_norm": 0.30250656604766846,
"kl": 0.02053070068359375,
"learning_rate": 1.4383711467890773e-06,
"loss": 0.0008,
"reward": 0.7118055671453476,
"reward_std": 0.2657315619289875,
"rewards/semantic_entropy": 0.7118055671453476,
"step": 22
},
{
"completion_length": 29.81944465637207,
"epoch": 1.2919708029197081,
"grad_norm": 0.5680757164955139,
"kl": 0.0273590087890625,
"learning_rate": 1.374606593415912e-06,
"loss": 0.0011,
"reward": 0.5763888992369175,
"reward_std": 0.38675259053707123,
"rewards/semantic_entropy": 0.5763888992369175,
"step": 23
},
{
"completion_length": 24.825231790542603,
"epoch": 1.3503649635036497,
"grad_norm": 0.4038240611553192,
"kl": 0.02518463134765625,
"learning_rate": 1.3090169943749473e-06,
"loss": 0.001,
"reward": 0.6053240820765495,
"reward_std": 0.3557727001607418,
"rewards/semantic_entropy": 0.6053240820765495,
"step": 24
},
{
"completion_length": 31.438657999038696,
"epoch": 1.4087591240875912,
"grad_norm": 0.3054307699203491,
"kl": 0.01131439208984375,
"learning_rate": 1.2419218955996676e-06,
"loss": 0.0005,
"reward": 0.5127314850687981,
"reward_std": 0.41082172095775604,
"rewards/semantic_entropy": 0.5127314850687981,
"step": 25
},
{
"completion_length": 27.61805558204651,
"epoch": 1.4671532846715327,
"grad_norm": 0.3926026523113251,
"kl": 0.017902374267578125,
"learning_rate": 1.1736481776669305e-06,
"loss": 0.0007,
"reward": 0.6412037126719952,
"reward_std": 0.3502417653799057,
"rewards/semantic_entropy": 0.6412037126719952,
"step": 26
},
{
"completion_length": 28.688657522201538,
"epoch": 1.5255474452554745,
"grad_norm": 0.31289467215538025,
"kl": 0.018411636352539062,
"learning_rate": 1.1045284632676535e-06,
"loss": 0.0007,
"reward": 0.6319444552063942,
"reward_std": 0.36655068024992943,
"rewards/semantic_entropy": 0.6319444552063942,
"step": 27
},
{
"completion_length": 30.806713581085205,
"epoch": 1.583941605839416,
"grad_norm": 0.2388666570186615,
"kl": 0.015939712524414062,
"learning_rate": 1.034899496702501e-06,
"loss": 0.0006,
"reward": 0.6053240783512592,
"reward_std": 0.3488955218344927,
"rewards/semantic_entropy": 0.6053240783512592,
"step": 28
},
{
"completion_length": 26.402778148651123,
"epoch": 1.6423357664233578,
"grad_norm": 0.6465526819229126,
"kl": 0.016387939453125,
"learning_rate": 9.651005032974993e-07,
"loss": 0.0007,
"reward": 0.6932870522141457,
"reward_std": 0.30794387497007847,
"rewards/semantic_entropy": 0.6932870522141457,
"step": 29
},
{
"completion_length": 30.527778148651123,
"epoch": 1.7007299270072993,
"grad_norm": 0.2623353600502014,
"kl": 0.020229339599609375,
"learning_rate": 8.954715367323466e-07,
"loss": 0.0008,
"reward": 0.634259257465601,
"reward_std": 0.304907551035285,
"rewards/semantic_entropy": 0.634259257465601,
"step": 30
},
{
"completion_length": 28.02430558204651,
"epoch": 1.7591240875912408,
"grad_norm": 0.26452869176864624,
"kl": 0.01360321044921875,
"learning_rate": 8.263518223330696e-07,
"loss": 0.0005,
"reward": 0.6238426044583321,
"reward_std": 0.3165153060108423,
"rewards/semantic_entropy": 0.6238426044583321,
"step": 31
},
{
"completion_length": 29.64814805984497,
"epoch": 1.8175182481751824,
"grad_norm": 0.6390478014945984,
"kl": 0.03284645080566406,
"learning_rate": 7.580781044003324e-07,
"loss": 0.0013,
"reward": 0.6192129626870155,
"reward_std": 0.3468264602124691,
"rewards/semantic_entropy": 0.6192129626870155,
"step": 32
},
{
"completion_length": 31.30439829826355,
"epoch": 1.8759124087591241,
"grad_norm": 0.24422068893909454,
"kl": 0.010921478271484375,
"learning_rate": 6.909830056250526e-07,
"loss": 0.0004,
"reward": 0.6435185223817825,
"reward_std": 0.33870384842157364,
"rewards/semantic_entropy": 0.6435185223817825,
"step": 33
},
{
"completion_length": 29.41319465637207,
"epoch": 1.9343065693430657,
"grad_norm": 0.2262941151857376,
"kl": 0.016880035400390625,
"learning_rate": 6.253934065840879e-07,
"loss": 0.0007,
"reward": 0.6273148208856583,
"reward_std": 0.3323148675262928,
"rewards/semantic_entropy": 0.6273148208856583,
"step": 34
},
{
"completion_length": 28.489583492279053,
"epoch": 1.9927007299270074,
"grad_norm": 0.276526540517807,
"kl": 0.01245880126953125,
"learning_rate": 5.616288532109224e-07,
"loss": 0.0005,
"reward": 0.6238425932824612,
"reward_std": 0.3457994442433119,
"rewards/semantic_entropy": 0.6238425932824612,
"step": 35
},
{
"completion_length": 43.0,
"epoch": 2.0,
"grad_norm": 0.276526540517807,
"kl": 0.00799560546875,
"learning_rate": 5.000000000000002e-07,
"loss": 0.0,
"reward": 0.0,
"reward_std": 0.4608885943889618,
"rewards/semantic_entropy": 0.0,
"step": 36
},
{
"completion_length": 29.586805820465088,
"epoch": 2.0583941605839415,
"grad_norm": 0.2671959400177002,
"kl": 0.018100738525390625,
"learning_rate": 4.408070965292533e-07,
"loss": 0.0007,
"reward": 0.6157407388091087,
"reward_std": 0.36300016567111015,
"rewards/semantic_entropy": 0.6157407388091087,
"step": 37
},
{
"completion_length": 25.49074101448059,
"epoch": 2.116788321167883,
"grad_norm": 0.2453327476978302,
"kl": 0.0186920166015625,
"learning_rate": 3.843385246743417e-07,
"loss": 0.0007,
"reward": 0.6689814887940884,
"reward_std": 0.2830717135220766,
"rewards/semantic_entropy": 0.6689814887940884,
"step": 38
},
{
"completion_length": 30.723379850387573,
"epoch": 2.1751824817518246,
"grad_norm": 0.44519904255867004,
"kl": 0.02108001708984375,
"learning_rate": 3.308693936411421e-07,
"loss": 0.0008,
"reward": 0.6018518693745136,
"reward_std": 0.32297887466847897,
"rewards/semantic_entropy": 0.6018518693745136,
"step": 39
},
{
"completion_length": 24.181713104248047,
"epoch": 2.2335766423357666,
"grad_norm": 0.29531192779541016,
"kl": 0.020915985107421875,
"learning_rate": 2.8066019966134904e-07,
"loss": 0.0008,
"reward": 0.6793981567025185,
"reward_std": 0.3183311792090535,
"rewards/semantic_entropy": 0.6793981567025185,
"step": 40
},
{
"completion_length": 27.152777671813965,
"epoch": 2.291970802919708,
"grad_norm": 0.3067900836467743,
"kl": 0.01100921630859375,
"learning_rate": 2.339555568810221e-07,
"loss": 0.0004,
"reward": 0.6238426119089127,
"reward_std": 0.36597814224660397,
"rewards/semantic_entropy": 0.6238426119089127,
"step": 41
},
{
"completion_length": 28.2928249835968,
"epoch": 2.3503649635036497,
"grad_norm": 0.27559277415275574,
"kl": 0.012683868408203125,
"learning_rate": 1.9098300562505264e-07,
"loss": 0.0005,
"reward": 0.6967592611908913,
"reward_std": 0.29866353049874306,
"rewards/semantic_entropy": 0.6967592611908913,
"step": 42
},
{
"completion_length": 26.355324029922485,
"epoch": 2.408759124087591,
"grad_norm": 0.3306453227996826,
"kl": 0.02254486083984375,
"learning_rate": 1.5195190384357404e-07,
"loss": 0.0009,
"reward": 0.6990740746259689,
"reward_std": 0.30733868665993214,
"rewards/semantic_entropy": 0.6990740746259689,
"step": 43
},
{
"completion_length": 27.85185170173645,
"epoch": 2.4671532846715327,
"grad_norm": 0.37372887134552,
"kl": 0.03179931640625,
"learning_rate": 1.1705240714107301e-07,
"loss": 0.0013,
"reward": 0.6562500074505806,
"reward_std": 0.28820149786770344,
"rewards/semantic_entropy": 0.6562500074505806,
"step": 44
},
{
"completion_length": 24.635416746139526,
"epoch": 2.5255474452554747,
"grad_norm": 0.3697076439857483,
"kl": 0.015590667724609375,
"learning_rate": 8.645454235739902e-08,
"loss": 0.0006,
"reward": 0.7037037089467049,
"reward_std": 0.2801394369453192,
"rewards/semantic_entropy": 0.7037037089467049,
"step": 45
},
{
"completion_length": 26.35185217857361,
"epoch": 2.5839416058394162,
"grad_norm": 0.31710338592529297,
"kl": 0.03106689453125,
"learning_rate": 6.030737921409168e-08,
"loss": 0.0012,
"reward": 0.6574074104428291,
"reward_std": 0.33865234442055225,
"rewards/semantic_entropy": 0.6574074104428291,
"step": 46
},
{
"completion_length": 26.200231552124023,
"epoch": 2.6423357664233578,
"grad_norm": 0.2386447936296463,
"kl": 0.021045684814453125,
"learning_rate": 3.87383040616811e-08,
"loss": 0.0008,
"reward": 0.6574074104428291,
"reward_std": 0.30992276407778263,
"rewards/semantic_entropy": 0.6574074104428291,
"step": 47
},
{
"completion_length": 26.50810217857361,
"epoch": 2.7007299270072993,
"grad_norm": 0.5202280282974243,
"kl": 0.02829742431640625,
"learning_rate": 2.185239926619431e-08,
"loss": 0.0011,
"reward": 0.7141203731298447,
"reward_std": 0.28790368139743805,
"rewards/semantic_entropy": 0.7141203731298447,
"step": 48
},
{
"completion_length": 29.20138907432556,
"epoch": 2.759124087591241,
"grad_norm": 0.30392101407051086,
"kl": 0.036502838134765625,
"learning_rate": 9.731931258429638e-09,
"loss": 0.0015,
"reward": 0.6631944477558136,
"reward_std": 0.31401310954242945,
"rewards/semantic_entropy": 0.6631944477558136,
"step": 49
},
{
"completion_length": 24.504629611968994,
"epoch": 2.8175182481751824,
"grad_norm": 0.2555871307849884,
"kl": 0.017333984375,
"learning_rate": 2.435949740175802e-09,
"loss": 0.0007,
"reward": 0.6504629701375961,
"reward_std": 0.3291088491678238,
"rewards/semantic_entropy": 0.6504629701375961,
"step": 50
},
{
"completion_length": 30.652778148651123,
"epoch": 2.875912408759124,
"grad_norm": 0.3172709345817566,
"kl": 0.014835357666015625,
"learning_rate": 0.0,
"loss": 0.0006,
"reward": 0.5914351791143417,
"reward_std": 0.3534049801528454,
"rewards/semantic_entropy": 0.5914351791143417,
"step": 51
},
{
"epoch": 2.875912408759124,
"step": 51,
"total_flos": 0.0,
"train_loss": 0.0006088586881692043,
"train_runtime": 10241.9671,
"train_samples_per_second": 0.239,
"train_steps_per_second": 0.005
}
],
"logging_steps": 1,
"max_steps": 51,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}