Qwen2.5-3B-Open-R1-GRPO-Self-TQA / trainer_state.json
qingyangzhang's picture
Model save
69d74e2 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.875912408759124,
"eval_steps": 100,
"global_step": 51,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 17.3368057012558,
"epoch": 0.058394160583941604,
"grad_norm": 0.5516418814659119,
"kl": 0.0,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0,
"reward": 0.635416679084301,
"reward_std": 0.33513265289366245,
"rewards/semantic_entropy": 0.635416679084301,
"step": 1
},
{
"completion_length": 17.217013835906982,
"epoch": 0.11678832116788321,
"grad_norm": 0.4640360474586487,
"kl": 0.0,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0,
"reward": 0.725694440305233,
"reward_std": 0.29016363993287086,
"rewards/semantic_entropy": 0.725694440305233,
"step": 2
},
{
"completion_length": 18.128472328186035,
"epoch": 0.17518248175182483,
"grad_norm": 0.540382981300354,
"kl": 0.0012841224670410156,
"learning_rate": 1e-06,
"loss": 0.0001,
"reward": 0.6006944552063942,
"reward_std": 0.3786292914301157,
"rewards/semantic_entropy": 0.6006944552063942,
"step": 3
},
{
"completion_length": 17.270833373069763,
"epoch": 0.23357664233576642,
"grad_norm": 0.5315675139427185,
"kl": 0.0009038448333740234,
"learning_rate": 1.3333333333333332e-06,
"loss": 0.0,
"reward": 0.666666679084301,
"reward_std": 0.2968092504888773,
"rewards/semantic_entropy": 0.666666679084301,
"step": 4
},
{
"completion_length": 18.501736402511597,
"epoch": 0.291970802919708,
"grad_norm": 0.5848979353904724,
"kl": 0.0011067390441894531,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0,
"reward": 0.642361119389534,
"reward_std": 0.37005409598350525,
"rewards/semantic_entropy": 0.642361119389534,
"step": 5
},
{
"completion_length": 19.102431058883667,
"epoch": 0.35036496350364965,
"grad_norm": 0.8071303367614746,
"kl": 0.001129150390625,
"learning_rate": 2e-06,
"loss": 0.0,
"reward": 0.5868055522441864,
"reward_std": 0.40071484073996544,
"rewards/semantic_entropy": 0.5868055522441864,
"step": 6
},
{
"completion_length": 18.661458492279053,
"epoch": 0.40875912408759124,
"grad_norm": 0.4785407781600952,
"kl": 0.0016431808471679688,
"learning_rate": 1.997564050259824e-06,
"loss": 0.0001,
"reward": 0.6614583432674408,
"reward_std": 0.35613277927041054,
"rewards/semantic_entropy": 0.6614583432674408,
"step": 7
},
{
"completion_length": 18.498263835906982,
"epoch": 0.46715328467153283,
"grad_norm": 0.7530333995819092,
"kl": 0.0045261383056640625,
"learning_rate": 1.99026806874157e-06,
"loss": 0.0002,
"reward": 0.6145833469927311,
"reward_std": 0.35138164833188057,
"rewards/semantic_entropy": 0.6145833469927311,
"step": 8
},
{
"completion_length": 19.208333611488342,
"epoch": 0.5255474452554745,
"grad_norm": 0.741835355758667,
"kl": 0.00656890869140625,
"learning_rate": 1.9781476007338054e-06,
"loss": 0.0003,
"reward": 0.5364583395421505,
"reward_std": 0.39881302043795586,
"rewards/semantic_entropy": 0.5364583395421505,
"step": 9
},
{
"completion_length": 18.463541746139526,
"epoch": 0.583941605839416,
"grad_norm": 0.8095004558563232,
"kl": 0.0157012939453125,
"learning_rate": 1.9612616959383188e-06,
"loss": 0.0006,
"reward": 0.6388888880610466,
"reward_std": 0.3762888126075268,
"rewards/semantic_entropy": 0.6388888880610466,
"step": 10
},
{
"completion_length": 16.302083492279053,
"epoch": 0.6423357664233577,
"grad_norm": 1.0500741004943848,
"kl": 0.05213165283203125,
"learning_rate": 1.9396926207859082e-06,
"loss": 0.0021,
"reward": 0.7083333432674408,
"reward_std": 0.3477053064852953,
"rewards/semantic_entropy": 0.7083333432674408,
"step": 11
},
{
"completion_length": 15.480902791023254,
"epoch": 0.7007299270072993,
"grad_norm": 0.6965835690498352,
"kl": 0.107666015625,
"learning_rate": 1.9135454576426007e-06,
"loss": 0.0043,
"reward": 0.6979166641831398,
"reward_std": 0.3180003799498081,
"rewards/semantic_entropy": 0.6979166641831398,
"step": 12
},
{
"completion_length": 16.611111283302307,
"epoch": 0.7591240875912408,
"grad_norm": 0.8703776001930237,
"kl": 0.080535888671875,
"learning_rate": 1.8829475928589268e-06,
"loss": 0.0032,
"reward": 0.6857638955116272,
"reward_std": 0.3688342422246933,
"rewards/semantic_entropy": 0.6857638955116272,
"step": 13
},
{
"completion_length": 14.387152791023254,
"epoch": 0.8175182481751825,
"grad_norm": 0.7894781827926636,
"kl": 0.4075927734375,
"learning_rate": 1.8480480961564257e-06,
"loss": 0.0163,
"reward": 0.6805555745959282,
"reward_std": 0.31897793617099524,
"rewards/semantic_entropy": 0.6805555745959282,
"step": 14
},
{
"completion_length": 14.901041746139526,
"epoch": 0.8759124087591241,
"grad_norm": 0.8611342906951904,
"kl": 0.185882568359375,
"learning_rate": 1.8090169943749474e-06,
"loss": 0.0074,
"reward": 0.7274305671453476,
"reward_std": 0.27765000611543655,
"rewards/semantic_entropy": 0.7274305671453476,
"step": 15
},
{
"completion_length": 13.159722089767456,
"epoch": 0.9343065693430657,
"grad_norm": 0.9914915561676025,
"kl": 0.35858154296875,
"learning_rate": 1.766044443118978e-06,
"loss": 0.0143,
"reward": 0.7239583432674408,
"reward_std": 0.34174920059740543,
"rewards/semantic_entropy": 0.7239583432674408,
"step": 16
},
{
"completion_length": 15.265625238418579,
"epoch": 0.9927007299270073,
"grad_norm": 0.7431650757789612,
"kl": 0.18798828125,
"learning_rate": 1.719339800338651e-06,
"loss": 0.0075,
"reward": 0.7552083358168602,
"reward_std": 0.2897039409726858,
"rewards/semantic_entropy": 0.7552083358168602,
"step": 17
},
{
"completion_length": 2.0,
"epoch": 1.0,
"grad_norm": 0.7431650757789612,
"kl": 1.125,
"learning_rate": 1.669130606358858e-06,
"loss": 0.0012,
"reward": 1.0,
"reward_std": 0.38924944400787354,
"rewards/semantic_entropy": 1.0,
"step": 18
},
{
"completion_length": 15.090277791023254,
"epoch": 1.0583941605839415,
"grad_norm": 0.8040208220481873,
"kl": 0.3365478515625,
"learning_rate": 1.615661475325658e-06,
"loss": 0.0135,
"reward": 0.7135416716337204,
"reward_std": 0.3099258504807949,
"rewards/semantic_entropy": 0.7135416716337204,
"step": 19
},
{
"completion_length": 15.520833373069763,
"epoch": 1.1167883211678833,
"grad_norm": 0.8632144927978516,
"kl": 0.32586669921875,
"learning_rate": 1.5591929034707466e-06,
"loss": 0.0131,
"reward": 0.737847238779068,
"reward_std": 0.28588614612817764,
"rewards/semantic_entropy": 0.737847238779068,
"step": 20
},
{
"completion_length": 16.050347328186035,
"epoch": 1.1751824817518248,
"grad_norm": 0.74057936668396,
"kl": 0.1895751953125,
"learning_rate": 1.5e-06,
"loss": 0.0076,
"reward": 0.75,
"reward_std": 0.31531847827136517,
"rewards/semantic_entropy": 0.75,
"step": 21
},
{
"completion_length": 16.63194465637207,
"epoch": 1.2335766423357664,
"grad_norm": 0.4329465627670288,
"kl": 0.22442626953125,
"learning_rate": 1.4383711467890773e-06,
"loss": 0.009,
"reward": 0.734375,
"reward_std": 0.2730935662984848,
"rewards/semantic_entropy": 0.734375,
"step": 22
},
{
"completion_length": 19.901041984558105,
"epoch": 1.2919708029197081,
"grad_norm": 0.652396023273468,
"kl": 0.12689208984375,
"learning_rate": 1.374606593415912e-06,
"loss": 0.0051,
"reward": 0.7239583507180214,
"reward_std": 0.33322223369032145,
"rewards/semantic_entropy": 0.7239583507180214,
"step": 23
},
{
"completion_length": 17.720486402511597,
"epoch": 1.3503649635036497,
"grad_norm": 0.5013155937194824,
"kl": 0.1468505859375,
"learning_rate": 1.3090169943749473e-06,
"loss": 0.0059,
"reward": 0.75,
"reward_std": 0.29686133936047554,
"rewards/semantic_entropy": 0.75,
"step": 24
},
{
"completion_length": 19.574653148651123,
"epoch": 1.4087591240875912,
"grad_norm": 0.5545840263366699,
"kl": 0.14691162109375,
"learning_rate": 1.2419218955996676e-06,
"loss": 0.0059,
"reward": 0.7378472313284874,
"reward_std": 0.29906335659325123,
"rewards/semantic_entropy": 0.7378472313284874,
"step": 25
},
{
"completion_length": 15.946180701255798,
"epoch": 1.4671532846715327,
"grad_norm": 0.5206867456436157,
"kl": 0.1771240234375,
"learning_rate": 1.1736481776669305e-06,
"loss": 0.0071,
"reward": 0.8107638955116272,
"reward_std": 0.24001463688910007,
"rewards/semantic_entropy": 0.8107638955116272,
"step": 26
},
{
"completion_length": 18.86805558204651,
"epoch": 1.5255474452554745,
"grad_norm": 0.7857072949409485,
"kl": 0.1768798828125,
"learning_rate": 1.1045284632676535e-06,
"loss": 0.0071,
"reward": 0.7552083432674408,
"reward_std": 0.3070409968495369,
"rewards/semantic_entropy": 0.7552083432674408,
"step": 27
},
{
"completion_length": 19.182291865348816,
"epoch": 1.583941605839416,
"grad_norm": 0.6400216221809387,
"kl": 0.2479248046875,
"learning_rate": 1.034899496702501e-06,
"loss": 0.0099,
"reward": 0.7534722238779068,
"reward_std": 0.2666480904445052,
"rewards/semantic_entropy": 0.7534722238779068,
"step": 28
},
{
"completion_length": 16.279513955116272,
"epoch": 1.6423357664233578,
"grad_norm": 0.6639309525489807,
"kl": 0.14581298828125,
"learning_rate": 9.651005032974993e-07,
"loss": 0.0058,
"reward": 0.8368055522441864,
"reward_std": 0.19957617949694395,
"rewards/semantic_entropy": 0.8368055522441864,
"step": 29
},
{
"completion_length": 19.109375,
"epoch": 1.7007299270072993,
"grad_norm": 0.6287054419517517,
"kl": 0.17852783203125,
"learning_rate": 8.954715367323466e-07,
"loss": 0.0071,
"reward": 0.798611119389534,
"reward_std": 0.2921114172786474,
"rewards/semantic_entropy": 0.798611119389534,
"step": 30
},
{
"completion_length": 16.519097566604614,
"epoch": 1.7591240875912408,
"grad_norm": 0.6585462689399719,
"kl": 0.157318115234375,
"learning_rate": 8.263518223330696e-07,
"loss": 0.0063,
"reward": 0.7708333358168602,
"reward_std": 0.2721500750631094,
"rewards/semantic_entropy": 0.7708333358168602,
"step": 31
},
{
"completion_length": 18.239583730697632,
"epoch": 1.8175182481751824,
"grad_norm": 0.6048464775085449,
"kl": 0.147125244140625,
"learning_rate": 7.580781044003324e-07,
"loss": 0.0059,
"reward": 0.7326388955116272,
"reward_std": 0.29634279757738113,
"rewards/semantic_entropy": 0.7326388955116272,
"step": 32
},
{
"completion_length": 19.44270896911621,
"epoch": 1.8759124087591241,
"grad_norm": 0.430084228515625,
"kl": 0.1063232421875,
"learning_rate": 6.909830056250526e-07,
"loss": 0.0043,
"reward": 0.774305559694767,
"reward_std": 0.27460889145731926,
"rewards/semantic_entropy": 0.774305559694767,
"step": 33
},
{
"completion_length": 16.817708730697632,
"epoch": 1.9343065693430657,
"grad_norm": 0.40789568424224854,
"kl": 0.070526123046875,
"learning_rate": 6.253934065840879e-07,
"loss": 0.0028,
"reward": 0.8107639029622078,
"reward_std": 0.2299627624452114,
"rewards/semantic_entropy": 0.8107639029622078,
"step": 34
},
{
"completion_length": 20.5625,
"epoch": 1.9927007299270074,
"grad_norm": 0.4874630868434906,
"kl": 0.13616943359375,
"learning_rate": 5.616288532109224e-07,
"loss": 0.0054,
"reward": 0.7361111044883728,
"reward_std": 0.3129718992859125,
"rewards/semantic_entropy": 0.7361111044883728,
"step": 35
},
{
"completion_length": 34.0,
"epoch": 2.0,
"grad_norm": 0.4874630868434906,
"kl": 0.036865234375,
"learning_rate": 5.000000000000002e-07,
"loss": 0.0023,
"reward": 1.0,
"reward_std": 0.0,
"rewards/semantic_entropy": 1.0,
"step": 36
},
{
"completion_length": 18.72743058204651,
"epoch": 2.0583941605839415,
"grad_norm": 0.5846592783927917,
"kl": 0.12078857421875,
"learning_rate": 4.408070965292533e-07,
"loss": 0.0048,
"reward": 0.7465277835726738,
"reward_std": 0.31662504002451897,
"rewards/semantic_entropy": 0.7465277835726738,
"step": 37
},
{
"completion_length": 17.776041984558105,
"epoch": 2.116788321167883,
"grad_norm": 0.6230023503303528,
"kl": 0.21380615234375,
"learning_rate": 3.843385246743417e-07,
"loss": 0.0085,
"reward": 0.7482638880610466,
"reward_std": 0.28513461723923683,
"rewards/semantic_entropy": 0.7482638880610466,
"step": 38
},
{
"completion_length": 19.468750476837158,
"epoch": 2.1751824817518246,
"grad_norm": 0.6272074580192566,
"kl": 0.096343994140625,
"learning_rate": 3.308693936411421e-07,
"loss": 0.0039,
"reward": 0.7291666716337204,
"reward_std": 0.32780924811959267,
"rewards/semantic_entropy": 0.7291666716337204,
"step": 39
},
{
"completion_length": 17.072916984558105,
"epoch": 2.2335766423357666,
"grad_norm": 0.5045897960662842,
"kl": 0.077362060546875,
"learning_rate": 2.8066019966134904e-07,
"loss": 0.0031,
"reward": 0.8090277835726738,
"reward_std": 0.18912154575809836,
"rewards/semantic_entropy": 0.8090277835726738,
"step": 40
},
{
"completion_length": 18.598958492279053,
"epoch": 2.291970802919708,
"grad_norm": 0.48655831813812256,
"kl": 0.09051513671875,
"learning_rate": 2.339555568810221e-07,
"loss": 0.0036,
"reward": 0.7760416641831398,
"reward_std": 0.26947965286672115,
"rewards/semantic_entropy": 0.7760416641831398,
"step": 41
},
{
"completion_length": 17.996527791023254,
"epoch": 2.3503649635036497,
"grad_norm": 0.5561981797218323,
"kl": 0.0677490234375,
"learning_rate": 1.9098300562505264e-07,
"loss": 0.0027,
"reward": 0.7638888955116272,
"reward_std": 0.2775236200541258,
"rewards/semantic_entropy": 0.7638888955116272,
"step": 42
},
{
"completion_length": 17.697916746139526,
"epoch": 2.408759124087591,
"grad_norm": 0.5540634989738464,
"kl": 0.15765380859375,
"learning_rate": 1.5195190384357404e-07,
"loss": 0.0063,
"reward": 0.774305559694767,
"reward_std": 0.2512203995138407,
"rewards/semantic_entropy": 0.774305559694767,
"step": 43
},
{
"completion_length": 18.682291865348816,
"epoch": 2.4671532846715327,
"grad_norm": 0.4448810815811157,
"kl": 0.11761474609375,
"learning_rate": 1.1705240714107301e-07,
"loss": 0.0047,
"reward": 0.7447916716337204,
"reward_std": 0.2556060552597046,
"rewards/semantic_entropy": 0.7447916716337204,
"step": 44
},
{
"completion_length": 16.9149307012558,
"epoch": 2.5255474452554747,
"grad_norm": 0.5861647725105286,
"kl": 0.09136962890625,
"learning_rate": 8.645454235739902e-08,
"loss": 0.0037,
"reward": 0.798611119389534,
"reward_std": 0.2555408189073205,
"rewards/semantic_entropy": 0.798611119389534,
"step": 45
},
{
"completion_length": 17.83506965637207,
"epoch": 2.5839416058394162,
"grad_norm": 0.4424433708190918,
"kl": 0.1358642578125,
"learning_rate": 6.030737921409168e-08,
"loss": 0.0054,
"reward": 0.7934027835726738,
"reward_std": 0.27376995235681534,
"rewards/semantic_entropy": 0.7934027835726738,
"step": 46
},
{
"completion_length": 18.23263943195343,
"epoch": 2.6423357664233578,
"grad_norm": 0.4687785804271698,
"kl": 0.10284423828125,
"learning_rate": 3.87383040616811e-08,
"loss": 0.0041,
"reward": 0.7934027761220932,
"reward_std": 0.2595429290086031,
"rewards/semantic_entropy": 0.7934027761220932,
"step": 47
},
{
"completion_length": 18.30381965637207,
"epoch": 2.7007299270072993,
"grad_norm": 0.5063730478286743,
"kl": 0.10589599609375,
"learning_rate": 2.185239926619431e-08,
"loss": 0.0042,
"reward": 0.758680559694767,
"reward_std": 0.29030087031424046,
"rewards/semantic_entropy": 0.758680559694767,
"step": 48
},
{
"completion_length": 18.715277910232544,
"epoch": 2.759124087591241,
"grad_norm": 0.5857909321784973,
"kl": 0.117889404296875,
"learning_rate": 9.731931258429638e-09,
"loss": 0.0047,
"reward": 0.7777777835726738,
"reward_std": 0.2577416365966201,
"rewards/semantic_entropy": 0.7777777835726738,
"step": 49
},
{
"completion_length": 17.322916626930237,
"epoch": 2.8175182481751824,
"grad_norm": 0.4949776828289032,
"kl": 0.09075927734375,
"learning_rate": 2.435949740175802e-09,
"loss": 0.0036,
"reward": 0.760416679084301,
"reward_std": 0.2918264754116535,
"rewards/semantic_entropy": 0.760416679084301,
"step": 50
},
{
"completion_length": 19.02256977558136,
"epoch": 2.875912408759124,
"grad_norm": 0.554519772529602,
"kl": 0.09710693359375,
"learning_rate": 0.0,
"loss": 0.0039,
"reward": 0.7395833358168602,
"reward_std": 0.2774972226470709,
"rewards/semantic_entropy": 0.7395833358168602,
"step": 51
},
{
"epoch": 2.875912408759124,
"step": 51,
"total_flos": 0.0,
"train_loss": 0.004883308103913604,
"train_runtime": 5205.8109,
"train_samples_per_second": 0.471,
"train_steps_per_second": 0.01
}
],
"logging_steps": 1,
"max_steps": 51,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}