qingyangzhang committed
Commit 69d74e2 · verified · 1 Parent(s): 6af94cb

Model save

Files changed (4):
1. README.md +2 -4
2. all_results.json +4 -4
3. train_results.json +4 -4
4. trainer_state.json +547 -139
README.md CHANGED
@@ -1,10 +1,8 @@
  ---
- datasets: domenicrosati/TruthfulQA
  library_name: transformers
  model_name: Qwen2.5-3B-Open-R1-GRPO-Self-TQA
  tags:
  - generated_from_trainer
- - open-r1
  - trl
  - grpo
  licence: license
@@ -12,7 +10,7 @@ licence: license
 
  # Model Card for Qwen2.5-3B-Open-R1-GRPO-Self-TQA
 
- This model is a fine-tuned version of [None](https://huggingface.co/None) on the [domenicrosati/TruthfulQA](https://huggingface.co/datasets/domenicrosati/TruthfulQA) dataset.
+ This model is a fine-tuned version of [None](https://huggingface.co/None).
  It has been trained using [TRL](https://github.com/huggingface/trl).
 
  ## Quick start
@@ -28,7 +26,7 @@ print(output["generated_text"])
 
  ## Training procedure
 
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/zqyoung1127-tianjin-university/huggingface/runs/hkmsr4fu)
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/zqyoung1127-tianjin-university/huggingface/runs/tpi134uc)
 
 
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
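The card only names the training method, so for orientation here is a minimal, hypothetical sketch of a GRPO run set up with TRL's `GRPOTrainer`. It is not the script behind this commit: the base model, dataset, and reward function below are placeholders (the run logged in this commit uses a `rewards/semantic_entropy` reward), and only the peak learning rate and epoch count are taken from the updated trainer_state.json.

```python
# Illustrative GRPO sketch with TRL -- placeholders, not this repository's training script.
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

# Placeholder prompt dataset; the updated card no longer lists the training data.
dataset = load_dataset("trl-lib/tldr", split="train")

def toy_reward(completions, **kwargs):
    # Stand-in reward; the actual run logged "rewards/semantic_entropy".
    return [-abs(20 - len(c)) for c in completions]

training_args = GRPOConfig(
    output_dir="Qwen2.5-3B-Open-R1-GRPO-Self-TQA",
    learning_rate=2e-6,   # peak learning rate seen in the new log history
    num_train_epochs=3,   # num_train_epochs in the new trainer_state.json
)
trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-3B-Instruct",  # assumed base model; the card itself lists "None"
    reward_funcs=toy_reward,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
```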
all_results.json CHANGED
@@ -1,8 +1,8 @@
  {
      "total_flos": 0.0,
-     "train_loss": 2.8922741848572622e-08,
-     "train_runtime": 6158.5867,
+     "train_loss": 0.004883308103913604,
+     "train_runtime": 5205.8109,
      "train_samples": 817,
-     "train_samples_per_second": 0.133,
-     "train_steps_per_second": 0.003
+     "train_samples_per_second": 0.471,
+     "train_steps_per_second": 0.01
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
  {
      "total_flos": 0.0,
-     "train_loss": 2.8922741848572622e-08,
-     "train_runtime": 6158.5867,
+     "train_loss": 0.004883308103913604,
+     "train_runtime": 5205.8109,
      "train_samples": 817,
-     "train_samples_per_second": 0.133,
-     "train_steps_per_second": 0.003
+     "train_samples_per_second": 0.471,
+     "train_steps_per_second": 0.01
  }
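For reference, the new throughput figures are internally consistent with the run length recorded in the updated trainer_state.json below (51 steps, 3 epochs over 817 samples); a quick illustrative check:

```python
# Sanity check of the reported throughput, using only values from this commit.
train_runtime = 5205.8109   # seconds, from train_results.json
train_samples = 817
num_train_epochs = 3        # from trainer_state.json
max_steps = 51              # from trainer_state.json

print(round(train_samples * num_train_epochs / train_runtime, 3))  # 0.471 samples/s
print(round(max_steps / train_runtime, 2))                         # 0.01 steps/s
```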
trainer_state.json CHANGED
@@ -1,231 +1,639 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9927007299270073,
5
  "eval_steps": 100,
6
- "global_step": 17,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 31.543402910232544,
13
  "epoch": 0.058394160583941604,
14
- "grad_norm": 0.6463247537612915,
15
  "kl": 0.0,
16
- "learning_rate": 1e-06,
17
  "loss": 0.0,
18
- "reward": 0.4809027835726738,
19
- "reward_std": 0.4057047627866268,
20
- "rewards/semantic_entropy": 0.4809027835726738,
21
  "step": 1
22
  },
23
  {
24
- "completion_length": 27.753472566604614,
25
  "epoch": 0.11678832116788321,
26
- "grad_norm": 0.31535470485687256,
27
  "kl": 0.0,
28
- "learning_rate": 2e-06,
29
  "loss": 0.0,
30
- "reward": 0.5850694514811039,
31
- "reward_std": 0.3451368249952793,
32
- "rewards/semantic_entropy": 0.5850694514811039,
33
  "step": 2
34
  },
35
  {
36
- "completion_length": 32.34895849227905,
37
  "epoch": 0.17518248175182483,
38
- "grad_norm": 0.38025209307670593,
39
- "kl": 0.0008878707885742188,
40
- "learning_rate": 1.9781476007338054e-06,
41
- "loss": 0.0,
42
- "reward": 0.4791666679084301,
43
- "reward_std": 0.38540373742580414,
44
- "rewards/semantic_entropy": 0.4791666679084301,
45
  "step": 3
46
  },
47
  {
48
- "completion_length": 27.75607681274414,
49
  "epoch": 0.23357664233576642,
50
- "grad_norm": 0.7206792235374451,
51
- "kl": 0.0020771026611328125,
52
- "learning_rate": 1.9135454576426007e-06,
53
  "loss": 0.0,
54
- "reward": 0.5998263880610466,
55
- "reward_std": 0.348458593711257,
56
- "rewards/semantic_entropy": 0.5998263880610466,
57
  "step": 4
58
  },
59
  {
60
- "completion_length": 29.860243558883667,
61
  "epoch": 0.291970802919708,
62
- "grad_norm": 0.5087810754776001,
63
- "kl": 0.0018243789672851562,
64
- "learning_rate": 1.8090169943749474e-06,
65
  "loss": 0.0,
66
- "reward": 0.5034722238779068,
67
- "reward_std": 0.39532990381121635,
68
- "rewards/semantic_entropy": 0.5034722238779068,
69
  "step": 5
70
  },
71
  {
72
- "completion_length": 28.811631679534912,
73
  "epoch": 0.35036496350364965,
74
- "grad_norm": 0.4511905610561371,
75
- "kl": 0.0026292800903320312,
76
- "learning_rate": 1.669130606358858e-06,
77
  "loss": 0.0,
78
- "reward": 0.5295138992369175,
79
- "reward_std": 0.3658079691231251,
80
- "rewards/semantic_entropy": 0.5295138992369175,
81
  "step": 6
82
  },
83
  {
84
- "completion_length": 31.047743320465088,
85
  "epoch": 0.40875912408759124,
86
- "grad_norm": 0.46206873655319214,
87
- "kl": 0.003326416015625,
88
- "learning_rate": 1.5e-06,
89
- "loss": 0.0,
90
- "reward": 0.4973958395421505,
91
- "reward_std": 0.39973679929971695,
92
- "rewards/semantic_entropy": 0.4973958395421505,
93
  "step": 7
94
  },
95
  {
96
- "completion_length": 30.878472328186035,
97
  "epoch": 0.46715328467153283,
98
- "grad_norm": 0.4796462655067444,
99
- "kl": 0.004016876220703125,
100
- "learning_rate": 1.3090169943749473e-06,
101
- "loss": 0.0,
102
- "reward": 0.5217013955116272,
103
- "reward_std": 0.36933426558971405,
104
- "rewards/semantic_entropy": 0.5217013955116272,
105
  "step": 8
106
  },
107
  {
108
- "completion_length": 34.5590283870697,
109
  "epoch": 0.5255474452554745,
110
- "grad_norm": 0.43365350365638733,
111
- "kl": 0.0037174224853515625,
112
- "learning_rate": 1.1045284632676535e-06,
113
- "loss": 0.0,
114
- "reward": 0.4444444589316845,
115
- "reward_std": 0.4142540544271469,
116
- "rewards/semantic_entropy": 0.4444444589316845,
117
  "step": 9
118
  },
119
  {
120
- "completion_length": 31.54600763320923,
121
  "epoch": 0.583941605839416,
122
- "grad_norm": 0.3461940586566925,
123
- "kl": 0.0054531097412109375,
124
- "learning_rate": 8.954715367323466e-07,
125
- "loss": 0.0,
126
- "reward": 0.5312499962747097,
127
- "reward_std": 0.33972141705453396,
128
- "rewards/semantic_entropy": 0.5312499962747097,
129
  "step": 10
130
  },
131
  {
132
- "completion_length": 25.934895992279053,
133
  "epoch": 0.6423357664233577,
134
- "grad_norm": 0.7275694012641907,
135
- "kl": 0.0059986114501953125,
136
- "learning_rate": 6.909830056250526e-07,
137
- "loss": 0.0,
138
- "reward": 0.5425347350537777,
139
- "reward_std": 0.39345845952630043,
140
- "rewards/semantic_entropy": 0.5425347350537777,
141
  "step": 11
142
  },
143
  {
144
- "completion_length": 27.58420157432556,
145
  "epoch": 0.7007299270072993,
146
- "grad_norm": 0.578926146030426,
147
- "kl": 0.010272979736328125,
148
- "learning_rate": 5.000000000000002e-07,
149
- "loss": 0.0,
150
- "reward": 0.5564236119389534,
151
- "reward_std": 0.38245424441993237,
152
- "rewards/semantic_entropy": 0.5564236119389534,
153
  "step": 12
154
  },
155
  {
156
- "completion_length": 32.76302146911621,
157
  "epoch": 0.7591240875912408,
158
- "grad_norm": 0.3081968128681183,
159
- "kl": 0.0044574737548828125,
160
- "learning_rate": 3.308693936411421e-07,
161
- "loss": 0.0,
162
- "reward": 0.4583333395421505,
163
- "reward_std": 0.39592672139406204,
164
- "rewards/semantic_entropy": 0.4583333395421505,
165
  "step": 13
166
  },
167
  {
168
- "completion_length": 28.50086808204651,
169
  "epoch": 0.8175182481751825,
170
- "grad_norm": 0.4480704367160797,
171
- "kl": 0.006603240966796875,
172
- "learning_rate": 1.9098300562505264e-07,
173
- "loss": 0.0,
174
- "reward": 0.4782986231148243,
175
- "reward_std": 0.3709658682346344,
176
- "rewards/semantic_entropy": 0.4782986231148243,
177
  "step": 14
178
  },
179
  {
180
- "completion_length": 28.36545181274414,
181
  "epoch": 0.8759124087591241,
182
- "grad_norm": 0.2726985514163971,
183
- "kl": 0.00714111328125,
184
- "learning_rate": 8.645454235739902e-08,
185
- "loss": 0.0,
186
- "reward": 0.572048619389534,
187
- "reward_std": 0.3576664440333843,
188
- "rewards/semantic_entropy": 0.572048619389534,
189
  "step": 15
190
  },
191
  {
192
- "completion_length": 29.555555820465088,
193
  "epoch": 0.9343065693430657,
194
- "grad_norm": 0.5542200207710266,
195
- "kl": 0.011875152587890625,
196
- "learning_rate": 2.185239926619431e-08,
197
- "loss": 0.0,
198
- "reward": 0.5442708358168602,
199
- "reward_std": 0.4052053317427635,
200
- "rewards/semantic_entropy": 0.5442708358168602,
201
  "step": 16
202
  },
203
  {
204
- "completion_length": 28.62326431274414,
205
  "epoch": 0.9927007299270073,
206
- "grad_norm": 0.4658753275871277,
207
- "kl": 0.010679244995117188,
208
- "learning_rate": 0.0,
209
- "loss": 0.0,
210
- "reward": 0.564236119389534,
211
- "reward_std": 0.3762592002749443,
212
- "rewards/semantic_entropy": 0.564236119389534,
213
  "step": 17
214
  },
215
  {
216
- "epoch": 0.9927007299270073,
217
- "step": 17,
218
  "total_flos": 0.0,
219
- "train_loss": 2.8922741848572622e-08,
220
- "train_runtime": 6158.5867,
221
- "train_samples_per_second": 0.133,
222
- "train_steps_per_second": 0.003
223
  }
224
  ],
225
  "logging_steps": 1,
226
- "max_steps": 17,
227
  "num_input_tokens_seen": 0,
228
- "num_train_epochs": 1,
229
  "save_steps": 10,
230
  "stateful_callbacks": {
231
  "TrainerControl": {
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.875912408759124,
5
  "eval_steps": 100,
6
+ "global_step": 51,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 17.3368057012558,
13
  "epoch": 0.058394160583941604,
14
+ "grad_norm": 0.5516418814659119,
15
  "kl": 0.0,
16
+ "learning_rate": 3.333333333333333e-07,
17
  "loss": 0.0,
18
+ "reward": 0.635416679084301,
19
+ "reward_std": 0.33513265289366245,
20
+ "rewards/semantic_entropy": 0.635416679084301,
21
  "step": 1
22
  },
23
  {
24
+ "completion_length": 17.217013835906982,
25
  "epoch": 0.11678832116788321,
26
+ "grad_norm": 0.4640360474586487,
27
  "kl": 0.0,
28
+ "learning_rate": 6.666666666666666e-07,
29
  "loss": 0.0,
30
+ "reward": 0.725694440305233,
31
+ "reward_std": 0.29016363993287086,
32
+ "rewards/semantic_entropy": 0.725694440305233,
33
  "step": 2
34
  },
35
  {
36
+ "completion_length": 18.128472328186035,
37
  "epoch": 0.17518248175182483,
38
+ "grad_norm": 0.540382981300354,
39
+ "kl": 0.0012841224670410156,
40
+ "learning_rate": 1e-06,
41
+ "loss": 0.0001,
42
+ "reward": 0.6006944552063942,
43
+ "reward_std": 0.3786292914301157,
44
+ "rewards/semantic_entropy": 0.6006944552063942,
45
  "step": 3
46
  },
47
  {
48
+ "completion_length": 17.270833373069763,
49
  "epoch": 0.23357664233576642,
50
+ "grad_norm": 0.5315675139427185,
51
+ "kl": 0.0009038448333740234,
52
+ "learning_rate": 1.3333333333333332e-06,
53
  "loss": 0.0,
54
+ "reward": 0.666666679084301,
55
+ "reward_std": 0.2968092504888773,
56
+ "rewards/semantic_entropy": 0.666666679084301,
57
  "step": 4
58
  },
59
  {
60
+ "completion_length": 18.501736402511597,
61
  "epoch": 0.291970802919708,
62
+ "grad_norm": 0.5848979353904724,
63
+ "kl": 0.0011067390441894531,
64
+ "learning_rate": 1.6666666666666667e-06,
65
  "loss": 0.0,
66
+ "reward": 0.642361119389534,
67
+ "reward_std": 0.37005409598350525,
68
+ "rewards/semantic_entropy": 0.642361119389534,
69
  "step": 5
70
  },
71
  {
72
+ "completion_length": 19.102431058883667,
73
  "epoch": 0.35036496350364965,
74
+ "grad_norm": 0.8071303367614746,
75
+ "kl": 0.001129150390625,
76
+ "learning_rate": 2e-06,
77
  "loss": 0.0,
78
+ "reward": 0.5868055522441864,
79
+ "reward_std": 0.40071484073996544,
80
+ "rewards/semantic_entropy": 0.5868055522441864,
81
  "step": 6
82
  },
83
  {
84
+ "completion_length": 18.661458492279053,
85
  "epoch": 0.40875912408759124,
86
+ "grad_norm": 0.4785407781600952,
87
+ "kl": 0.0016431808471679688,
88
+ "learning_rate": 1.997564050259824e-06,
89
+ "loss": 0.0001,
90
+ "reward": 0.6614583432674408,
91
+ "reward_std": 0.35613277927041054,
92
+ "rewards/semantic_entropy": 0.6614583432674408,
93
  "step": 7
94
  },
95
  {
96
+ "completion_length": 18.498263835906982,
97
  "epoch": 0.46715328467153283,
98
+ "grad_norm": 0.7530333995819092,
99
+ "kl": 0.0045261383056640625,
100
+ "learning_rate": 1.99026806874157e-06,
101
+ "loss": 0.0002,
102
+ "reward": 0.6145833469927311,
103
+ "reward_std": 0.35138164833188057,
104
+ "rewards/semantic_entropy": 0.6145833469927311,
105
  "step": 8
106
  },
107
  {
108
+ "completion_length": 19.208333611488342,
109
  "epoch": 0.5255474452554745,
110
+ "grad_norm": 0.741835355758667,
111
+ "kl": 0.00656890869140625,
112
+ "learning_rate": 1.9781476007338054e-06,
113
+ "loss": 0.0003,
114
+ "reward": 0.5364583395421505,
115
+ "reward_std": 0.39881302043795586,
116
+ "rewards/semantic_entropy": 0.5364583395421505,
117
  "step": 9
118
  },
119
  {
120
+ "completion_length": 18.463541746139526,
121
  "epoch": 0.583941605839416,
122
+ "grad_norm": 0.8095004558563232,
123
+ "kl": 0.0157012939453125,
124
+ "learning_rate": 1.9612616959383188e-06,
125
+ "loss": 0.0006,
126
+ "reward": 0.6388888880610466,
127
+ "reward_std": 0.3762888126075268,
128
+ "rewards/semantic_entropy": 0.6388888880610466,
129
  "step": 10
130
  },
131
  {
132
+ "completion_length": 16.302083492279053,
133
  "epoch": 0.6423357664233577,
134
+ "grad_norm": 1.0500741004943848,
135
+ "kl": 0.05213165283203125,
136
+ "learning_rate": 1.9396926207859082e-06,
137
+ "loss": 0.0021,
138
+ "reward": 0.7083333432674408,
139
+ "reward_std": 0.3477053064852953,
140
+ "rewards/semantic_entropy": 0.7083333432674408,
141
  "step": 11
142
  },
143
  {
144
+ "completion_length": 15.480902791023254,
145
  "epoch": 0.7007299270072993,
146
+ "grad_norm": 0.6965835690498352,
147
+ "kl": 0.107666015625,
148
+ "learning_rate": 1.9135454576426007e-06,
149
+ "loss": 0.0043,
150
+ "reward": 0.6979166641831398,
151
+ "reward_std": 0.3180003799498081,
152
+ "rewards/semantic_entropy": 0.6979166641831398,
153
  "step": 12
154
  },
155
  {
156
+ "completion_length": 16.611111283302307,
157
  "epoch": 0.7591240875912408,
158
+ "grad_norm": 0.8703776001930237,
159
+ "kl": 0.080535888671875,
160
+ "learning_rate": 1.8829475928589268e-06,
161
+ "loss": 0.0032,
162
+ "reward": 0.6857638955116272,
163
+ "reward_std": 0.3688342422246933,
164
+ "rewards/semantic_entropy": 0.6857638955116272,
165
  "step": 13
166
  },
167
  {
168
+ "completion_length": 14.387152791023254,
169
  "epoch": 0.8175182481751825,
170
+ "grad_norm": 0.7894781827926636,
171
+ "kl": 0.4075927734375,
172
+ "learning_rate": 1.8480480961564257e-06,
173
+ "loss": 0.0163,
174
+ "reward": 0.6805555745959282,
175
+ "reward_std": 0.31897793617099524,
176
+ "rewards/semantic_entropy": 0.6805555745959282,
177
  "step": 14
178
  },
179
  {
180
+ "completion_length": 14.901041746139526,
181
  "epoch": 0.8759124087591241,
182
+ "grad_norm": 0.8611342906951904,
183
+ "kl": 0.185882568359375,
184
+ "learning_rate": 1.8090169943749474e-06,
185
+ "loss": 0.0074,
186
+ "reward": 0.7274305671453476,
187
+ "reward_std": 0.27765000611543655,
188
+ "rewards/semantic_entropy": 0.7274305671453476,
189
  "step": 15
190
  },
191
  {
192
+ "completion_length": 13.159722089767456,
193
  "epoch": 0.9343065693430657,
194
+ "grad_norm": 0.9914915561676025,
195
+ "kl": 0.35858154296875,
196
+ "learning_rate": 1.766044443118978e-06,
197
+ "loss": 0.0143,
198
+ "reward": 0.7239583432674408,
199
+ "reward_std": 0.34174920059740543,
200
+ "rewards/semantic_entropy": 0.7239583432674408,
201
  "step": 16
202
  },
203
  {
204
+ "completion_length": 15.265625238418579,
205
  "epoch": 0.9927007299270073,
206
+ "grad_norm": 0.7431650757789612,
207
+ "kl": 0.18798828125,
208
+ "learning_rate": 1.719339800338651e-06,
209
+ "loss": 0.0075,
210
+ "reward": 0.7552083358168602,
211
+ "reward_std": 0.2897039409726858,
212
+ "rewards/semantic_entropy": 0.7552083358168602,
213
  "step": 17
214
  },
215
  {
216
+ "completion_length": 2.0,
217
+ "epoch": 1.0,
218
+ "grad_norm": 0.7431650757789612,
219
+ "kl": 1.125,
220
+ "learning_rate": 1.669130606358858e-06,
221
+ "loss": 0.0012,
222
+ "reward": 1.0,
223
+ "reward_std": 0.38924944400787354,
224
+ "rewards/semantic_entropy": 1.0,
225
+ "step": 18
226
+ },
227
+ {
228
+ "completion_length": 15.090277791023254,
229
+ "epoch": 1.0583941605839415,
230
+ "grad_norm": 0.8040208220481873,
231
+ "kl": 0.3365478515625,
232
+ "learning_rate": 1.615661475325658e-06,
233
+ "loss": 0.0135,
234
+ "reward": 0.7135416716337204,
235
+ "reward_std": 0.3099258504807949,
236
+ "rewards/semantic_entropy": 0.7135416716337204,
237
+ "step": 19
238
+ },
239
+ {
240
+ "completion_length": 15.520833373069763,
241
+ "epoch": 1.1167883211678833,
242
+ "grad_norm": 0.8632144927978516,
243
+ "kl": 0.32586669921875,
244
+ "learning_rate": 1.5591929034707466e-06,
245
+ "loss": 0.0131,
246
+ "reward": 0.737847238779068,
247
+ "reward_std": 0.28588614612817764,
248
+ "rewards/semantic_entropy": 0.737847238779068,
249
+ "step": 20
250
+ },
251
+ {
252
+ "completion_length": 16.050347328186035,
253
+ "epoch": 1.1751824817518248,
254
+ "grad_norm": 0.74057936668396,
255
+ "kl": 0.1895751953125,
256
+ "learning_rate": 1.5e-06,
257
+ "loss": 0.0076,
258
+ "reward": 0.75,
259
+ "reward_std": 0.31531847827136517,
260
+ "rewards/semantic_entropy": 0.75,
261
+ "step": 21
262
+ },
263
+ {
264
+ "completion_length": 16.63194465637207,
265
+ "epoch": 1.2335766423357664,
266
+ "grad_norm": 0.4329465627670288,
267
+ "kl": 0.22442626953125,
268
+ "learning_rate": 1.4383711467890773e-06,
269
+ "loss": 0.009,
270
+ "reward": 0.734375,
271
+ "reward_std": 0.2730935662984848,
272
+ "rewards/semantic_entropy": 0.734375,
273
+ "step": 22
274
+ },
275
+ {
276
+ "completion_length": 19.901041984558105,
277
+ "epoch": 1.2919708029197081,
278
+ "grad_norm": 0.652396023273468,
279
+ "kl": 0.12689208984375,
280
+ "learning_rate": 1.374606593415912e-06,
281
+ "loss": 0.0051,
282
+ "reward": 0.7239583507180214,
283
+ "reward_std": 0.33322223369032145,
284
+ "rewards/semantic_entropy": 0.7239583507180214,
285
+ "step": 23
286
+ },
287
+ {
288
+ "completion_length": 17.720486402511597,
289
+ "epoch": 1.3503649635036497,
290
+ "grad_norm": 0.5013155937194824,
291
+ "kl": 0.1468505859375,
292
+ "learning_rate": 1.3090169943749473e-06,
293
+ "loss": 0.0059,
294
+ "reward": 0.75,
295
+ "reward_std": 0.29686133936047554,
296
+ "rewards/semantic_entropy": 0.75,
297
+ "step": 24
298
+ },
299
+ {
300
+ "completion_length": 19.574653148651123,
301
+ "epoch": 1.4087591240875912,
302
+ "grad_norm": 0.5545840263366699,
303
+ "kl": 0.14691162109375,
304
+ "learning_rate": 1.2419218955996676e-06,
305
+ "loss": 0.0059,
306
+ "reward": 0.7378472313284874,
307
+ "reward_std": 0.29906335659325123,
308
+ "rewards/semantic_entropy": 0.7378472313284874,
309
+ "step": 25
310
+ },
311
+ {
312
+ "completion_length": 15.946180701255798,
313
+ "epoch": 1.4671532846715327,
314
+ "grad_norm": 0.5206867456436157,
315
+ "kl": 0.1771240234375,
316
+ "learning_rate": 1.1736481776669305e-06,
317
+ "loss": 0.0071,
318
+ "reward": 0.8107638955116272,
319
+ "reward_std": 0.24001463688910007,
320
+ "rewards/semantic_entropy": 0.8107638955116272,
321
+ "step": 26
322
+ },
323
+ {
324
+ "completion_length": 18.86805558204651,
325
+ "epoch": 1.5255474452554745,
326
+ "grad_norm": 0.7857072949409485,
327
+ "kl": 0.1768798828125,
328
+ "learning_rate": 1.1045284632676535e-06,
329
+ "loss": 0.0071,
330
+ "reward": 0.7552083432674408,
331
+ "reward_std": 0.3070409968495369,
332
+ "rewards/semantic_entropy": 0.7552083432674408,
333
+ "step": 27
334
+ },
335
+ {
336
+ "completion_length": 19.182291865348816,
337
+ "epoch": 1.583941605839416,
338
+ "grad_norm": 0.6400216221809387,
339
+ "kl": 0.2479248046875,
340
+ "learning_rate": 1.034899496702501e-06,
341
+ "loss": 0.0099,
342
+ "reward": 0.7534722238779068,
343
+ "reward_std": 0.2666480904445052,
344
+ "rewards/semantic_entropy": 0.7534722238779068,
345
+ "step": 28
346
+ },
347
+ {
348
+ "completion_length": 16.279513955116272,
349
+ "epoch": 1.6423357664233578,
350
+ "grad_norm": 0.6639309525489807,
351
+ "kl": 0.14581298828125,
352
+ "learning_rate": 9.651005032974993e-07,
353
+ "loss": 0.0058,
354
+ "reward": 0.8368055522441864,
355
+ "reward_std": 0.19957617949694395,
356
+ "rewards/semantic_entropy": 0.8368055522441864,
357
+ "step": 29
358
+ },
359
+ {
360
+ "completion_length": 19.109375,
361
+ "epoch": 1.7007299270072993,
362
+ "grad_norm": 0.6287054419517517,
363
+ "kl": 0.17852783203125,
364
+ "learning_rate": 8.954715367323466e-07,
365
+ "loss": 0.0071,
366
+ "reward": 0.798611119389534,
367
+ "reward_std": 0.2921114172786474,
368
+ "rewards/semantic_entropy": 0.798611119389534,
369
+ "step": 30
370
+ },
371
+ {
372
+ "completion_length": 16.519097566604614,
373
+ "epoch": 1.7591240875912408,
374
+ "grad_norm": 0.6585462689399719,
375
+ "kl": 0.157318115234375,
376
+ "learning_rate": 8.263518223330696e-07,
377
+ "loss": 0.0063,
378
+ "reward": 0.7708333358168602,
379
+ "reward_std": 0.2721500750631094,
380
+ "rewards/semantic_entropy": 0.7708333358168602,
381
+ "step": 31
382
+ },
383
+ {
384
+ "completion_length": 18.239583730697632,
385
+ "epoch": 1.8175182481751824,
386
+ "grad_norm": 0.6048464775085449,
387
+ "kl": 0.147125244140625,
388
+ "learning_rate": 7.580781044003324e-07,
389
+ "loss": 0.0059,
390
+ "reward": 0.7326388955116272,
391
+ "reward_std": 0.29634279757738113,
392
+ "rewards/semantic_entropy": 0.7326388955116272,
393
+ "step": 32
394
+ },
395
+ {
396
+ "completion_length": 19.44270896911621,
397
+ "epoch": 1.8759124087591241,
398
+ "grad_norm": 0.430084228515625,
399
+ "kl": 0.1063232421875,
400
+ "learning_rate": 6.909830056250526e-07,
401
+ "loss": 0.0043,
402
+ "reward": 0.774305559694767,
403
+ "reward_std": 0.27460889145731926,
404
+ "rewards/semantic_entropy": 0.774305559694767,
405
+ "step": 33
406
+ },
407
+ {
408
+ "completion_length": 16.817708730697632,
409
+ "epoch": 1.9343065693430657,
410
+ "grad_norm": 0.40789568424224854,
411
+ "kl": 0.070526123046875,
412
+ "learning_rate": 6.253934065840879e-07,
413
+ "loss": 0.0028,
414
+ "reward": 0.8107639029622078,
415
+ "reward_std": 0.2299627624452114,
416
+ "rewards/semantic_entropy": 0.8107639029622078,
417
+ "step": 34
418
+ },
419
+ {
420
+ "completion_length": 20.5625,
421
+ "epoch": 1.9927007299270074,
422
+ "grad_norm": 0.4874630868434906,
423
+ "kl": 0.13616943359375,
424
+ "learning_rate": 5.616288532109224e-07,
425
+ "loss": 0.0054,
426
+ "reward": 0.7361111044883728,
427
+ "reward_std": 0.3129718992859125,
428
+ "rewards/semantic_entropy": 0.7361111044883728,
429
+ "step": 35
430
+ },
431
+ {
432
+ "completion_length": 34.0,
433
+ "epoch": 2.0,
434
+ "grad_norm": 0.4874630868434906,
435
+ "kl": 0.036865234375,
436
+ "learning_rate": 5.000000000000002e-07,
437
+ "loss": 0.0023,
438
+ "reward": 1.0,
439
+ "reward_std": 0.0,
440
+ "rewards/semantic_entropy": 1.0,
441
+ "step": 36
442
+ },
443
+ {
444
+ "completion_length": 18.72743058204651,
445
+ "epoch": 2.0583941605839415,
446
+ "grad_norm": 0.5846592783927917,
447
+ "kl": 0.12078857421875,
448
+ "learning_rate": 4.408070965292533e-07,
449
+ "loss": 0.0048,
450
+ "reward": 0.7465277835726738,
451
+ "reward_std": 0.31662504002451897,
452
+ "rewards/semantic_entropy": 0.7465277835726738,
453
+ "step": 37
454
+ },
455
+ {
456
+ "completion_length": 17.776041984558105,
457
+ "epoch": 2.116788321167883,
458
+ "grad_norm": 0.6230023503303528,
459
+ "kl": 0.21380615234375,
460
+ "learning_rate": 3.843385246743417e-07,
461
+ "loss": 0.0085,
462
+ "reward": 0.7482638880610466,
463
+ "reward_std": 0.28513461723923683,
464
+ "rewards/semantic_entropy": 0.7482638880610466,
465
+ "step": 38
466
+ },
467
+ {
468
+ "completion_length": 19.468750476837158,
469
+ "epoch": 2.1751824817518246,
470
+ "grad_norm": 0.6272074580192566,
471
+ "kl": 0.096343994140625,
472
+ "learning_rate": 3.308693936411421e-07,
473
+ "loss": 0.0039,
474
+ "reward": 0.7291666716337204,
475
+ "reward_std": 0.32780924811959267,
476
+ "rewards/semantic_entropy": 0.7291666716337204,
477
+ "step": 39
478
+ },
479
+ {
480
+ "completion_length": 17.072916984558105,
481
+ "epoch": 2.2335766423357666,
482
+ "grad_norm": 0.5045897960662842,
483
+ "kl": 0.077362060546875,
484
+ "learning_rate": 2.8066019966134904e-07,
485
+ "loss": 0.0031,
486
+ "reward": 0.8090277835726738,
487
+ "reward_std": 0.18912154575809836,
488
+ "rewards/semantic_entropy": 0.8090277835726738,
489
+ "step": 40
490
+ },
491
+ {
492
+ "completion_length": 18.598958492279053,
493
+ "epoch": 2.291970802919708,
494
+ "grad_norm": 0.48655831813812256,
495
+ "kl": 0.09051513671875,
496
+ "learning_rate": 2.339555568810221e-07,
497
+ "loss": 0.0036,
498
+ "reward": 0.7760416641831398,
499
+ "reward_std": 0.26947965286672115,
500
+ "rewards/semantic_entropy": 0.7760416641831398,
501
+ "step": 41
502
+ },
503
+ {
504
+ "completion_length": 17.996527791023254,
505
+ "epoch": 2.3503649635036497,
506
+ "grad_norm": 0.5561981797218323,
507
+ "kl": 0.0677490234375,
508
+ "learning_rate": 1.9098300562505264e-07,
509
+ "loss": 0.0027,
510
+ "reward": 0.7638888955116272,
511
+ "reward_std": 0.2775236200541258,
512
+ "rewards/semantic_entropy": 0.7638888955116272,
513
+ "step": 42
514
+ },
515
+ {
516
+ "completion_length": 17.697916746139526,
517
+ "epoch": 2.408759124087591,
518
+ "grad_norm": 0.5540634989738464,
519
+ "kl": 0.15765380859375,
520
+ "learning_rate": 1.5195190384357404e-07,
521
+ "loss": 0.0063,
522
+ "reward": 0.774305559694767,
523
+ "reward_std": 0.2512203995138407,
524
+ "rewards/semantic_entropy": 0.774305559694767,
525
+ "step": 43
526
+ },
527
+ {
528
+ "completion_length": 18.682291865348816,
529
+ "epoch": 2.4671532846715327,
530
+ "grad_norm": 0.4448810815811157,
531
+ "kl": 0.11761474609375,
532
+ "learning_rate": 1.1705240714107301e-07,
533
+ "loss": 0.0047,
534
+ "reward": 0.7447916716337204,
535
+ "reward_std": 0.2556060552597046,
536
+ "rewards/semantic_entropy": 0.7447916716337204,
537
+ "step": 44
538
+ },
539
+ {
540
+ "completion_length": 16.9149307012558,
541
+ "epoch": 2.5255474452554747,
542
+ "grad_norm": 0.5861647725105286,
543
+ "kl": 0.09136962890625,
544
+ "learning_rate": 8.645454235739902e-08,
545
+ "loss": 0.0037,
546
+ "reward": 0.798611119389534,
547
+ "reward_std": 0.2555408189073205,
548
+ "rewards/semantic_entropy": 0.798611119389534,
549
+ "step": 45
550
+ },
551
+ {
552
+ "completion_length": 17.83506965637207,
553
+ "epoch": 2.5839416058394162,
554
+ "grad_norm": 0.4424433708190918,
555
+ "kl": 0.1358642578125,
556
+ "learning_rate": 6.030737921409168e-08,
557
+ "loss": 0.0054,
558
+ "reward": 0.7934027835726738,
559
+ "reward_std": 0.27376995235681534,
560
+ "rewards/semantic_entropy": 0.7934027835726738,
561
+ "step": 46
562
+ },
563
+ {
564
+ "completion_length": 18.23263943195343,
565
+ "epoch": 2.6423357664233578,
566
+ "grad_norm": 0.4687785804271698,
567
+ "kl": 0.10284423828125,
568
+ "learning_rate": 3.87383040616811e-08,
569
+ "loss": 0.0041,
570
+ "reward": 0.7934027761220932,
571
+ "reward_std": 0.2595429290086031,
572
+ "rewards/semantic_entropy": 0.7934027761220932,
573
+ "step": 47
574
+ },
575
+ {
576
+ "completion_length": 18.30381965637207,
577
+ "epoch": 2.7007299270072993,
578
+ "grad_norm": 0.5063730478286743,
579
+ "kl": 0.10589599609375,
580
+ "learning_rate": 2.185239926619431e-08,
581
+ "loss": 0.0042,
582
+ "reward": 0.758680559694767,
583
+ "reward_std": 0.29030087031424046,
584
+ "rewards/semantic_entropy": 0.758680559694767,
585
+ "step": 48
586
+ },
587
+ {
588
+ "completion_length": 18.715277910232544,
589
+ "epoch": 2.759124087591241,
590
+ "grad_norm": 0.5857909321784973,
591
+ "kl": 0.117889404296875,
592
+ "learning_rate": 9.731931258429638e-09,
593
+ "loss": 0.0047,
594
+ "reward": 0.7777777835726738,
595
+ "reward_std": 0.2577416365966201,
596
+ "rewards/semantic_entropy": 0.7777777835726738,
597
+ "step": 49
598
+ },
599
+ {
600
+ "completion_length": 17.322916626930237,
601
+ "epoch": 2.8175182481751824,
602
+ "grad_norm": 0.4949776828289032,
603
+ "kl": 0.09075927734375,
604
+ "learning_rate": 2.435949740175802e-09,
605
+ "loss": 0.0036,
606
+ "reward": 0.760416679084301,
607
+ "reward_std": 0.2918264754116535,
608
+ "rewards/semantic_entropy": 0.760416679084301,
609
+ "step": 50
610
+ },
611
+ {
612
+ "completion_length": 19.02256977558136,
613
+ "epoch": 2.875912408759124,
614
+ "grad_norm": 0.554519772529602,
615
+ "kl": 0.09710693359375,
616
+ "learning_rate": 0.0,
617
+ "loss": 0.0039,
618
+ "reward": 0.7395833358168602,
619
+ "reward_std": 0.2774972226470709,
620
+ "rewards/semantic_entropy": 0.7395833358168602,
621
+ "step": 51
622
+ },
623
+ {
624
+ "epoch": 2.875912408759124,
625
+ "step": 51,
626
  "total_flos": 0.0,
627
+ "train_loss": 0.004883308103913604,
628
+ "train_runtime": 5205.8109,
629
+ "train_samples_per_second": 0.471,
630
+ "train_steps_per_second": 0.01
631
  }
632
  ],
633
  "logging_steps": 1,
634
+ "max_steps": 51,
635
  "num_input_tokens_seen": 0,
636
+ "num_train_epochs": 3,
637
  "save_steps": 10,
638
  "stateful_callbacks": {
639
  "TrainerControl": {