qingyangzhang committed on
Commit
3d56725
·
verified ·
1 Parent(s): e21cb51

Model save

Browse files
Files changed (4) hide show
  1. README.md +2 -4
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +137 -545
README.md CHANGED
@@ -1,10 +1,8 @@
1
  ---
2
- datasets: domenicrosati/TruthfulQA
3
  library_name: transformers
4
  model_name: Qwen2.5-3B-Open-R1-GRPO-Self-TQA
5
  tags:
6
  - generated_from_trainer
7
- - open-r1
8
  - trl
9
  - grpo
10
  licence: license
@@ -12,7 +10,7 @@ licence: license
12
 
13
  # Model Card for Qwen2.5-3B-Open-R1-GRPO-Self-TQA
14
 
15
- This model is a fine-tuned version of [None](https://huggingface.co/None) on the [domenicrosati/TruthfulQA](https://huggingface.co/datasets/domenicrosati/TruthfulQA) dataset.
16
  It has been trained using [TRL](https://github.com/huggingface/trl).
17
 
18
  ## Quick start
@@ -28,7 +26,7 @@ print(output["generated_text"])
28
 
29
  ## Training procedure
30
 
31
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/zqyoung1127-tianjin-university/huggingface/runs/lme5anvg)
32
 
33
 
34
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
1
  ---
 
2
  library_name: transformers
3
  model_name: Qwen2.5-3B-Open-R1-GRPO-Self-TQA
4
  tags:
5
  - generated_from_trainer
 
6
  - trl
7
  - grpo
8
  licence: license
 
10
 
11
  # Model Card for Qwen2.5-3B-Open-R1-GRPO-Self-TQA
12
 
13
+ This model is a fine-tuned version of [None](https://huggingface.co/None).
14
  It has been trained using [TRL](https://github.com/huggingface/trl).
15
 
16
  ## Quick start
 
26
 
27
  ## Training procedure
28
 
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/zqyoung1127-tianjin-university/huggingface/runs/hkmsr4fu)
30
 
31
 
32
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0006088586881692043,
4
- "train_runtime": 10241.9671,
5
  "train_samples": 817,
6
- "train_samples_per_second": 0.239,
7
- "train_steps_per_second": 0.005
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 2.8922741848572622e-08,
4
+ "train_runtime": 6158.5867,
5
  "train_samples": 817,
6
+ "train_samples_per_second": 0.133,
7
+ "train_steps_per_second": 0.003
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.0006088586881692043,
4
- "train_runtime": 10241.9671,
5
  "train_samples": 817,
6
- "train_samples_per_second": 0.239,
7
- "train_steps_per_second": 0.005
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 2.8922741848572622e-08,
4
+ "train_runtime": 6158.5867,
5
  "train_samples": 817,
6
+ "train_samples_per_second": 0.133,
7
+ "train_steps_per_second": 0.003
8
  }
trainer_state.json CHANGED
@@ -1,639 +1,231 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 2.875912408759124,
5
  "eval_steps": 100,
6
- "global_step": 51,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 29.870370149612427,
13
  "epoch": 0.058394160583941604,
14
- "grad_norm": 0.5967049598693848,
15
  "kl": 0.0,
16
- "learning_rate": 3.333333333333333e-07,
17
  "loss": 0.0,
18
- "reward": 0.526620376855135,
19
- "reward_std": 0.372432217001915,
20
- "rewards/semantic_entropy": 0.526620376855135,
21
  "step": 1
22
  },
23
  {
24
- "completion_length": 26.355324506759644,
25
  "epoch": 0.11678832116788321,
26
- "grad_norm": 0.27608758211135864,
27
  "kl": 0.0,
28
- "learning_rate": 6.666666666666666e-07,
29
  "loss": 0.0,
30
- "reward": 0.6192129664123058,
31
- "reward_std": 0.3255546223372221,
32
- "rewards/semantic_entropy": 0.6192129664123058,
33
  "step": 2
34
  },
35
  {
36
- "completion_length": 31.85300898551941,
37
  "epoch": 0.17518248175182483,
38
- "grad_norm": 0.3493832051753998,
39
- "kl": 0.0005927085876464844,
40
- "learning_rate": 1e-06,
41
  "loss": 0.0,
42
- "reward": 0.49884259700775146,
43
- "reward_std": 0.36885398998856544,
44
- "rewards/semantic_entropy": 0.49884259700775146,
45
  "step": 3
46
  },
47
  {
48
- "completion_length": 27.081018686294556,
49
  "epoch": 0.23357664233576642,
50
- "grad_norm": 0.8119116425514221,
51
- "kl": 0.0008134841918945312,
52
- "learning_rate": 1.3333333333333332e-06,
53
  "loss": 0.0,
54
- "reward": 0.6111111231148243,
55
- "reward_std": 0.3842464517802,
56
- "rewards/semantic_entropy": 0.6111111231148243,
57
  "step": 4
58
  },
59
  {
60
- "completion_length": 30.222222089767456,
61
  "epoch": 0.291970802919708,
62
- "grad_norm": 0.5014259815216064,
63
- "kl": 0.0007073879241943359,
64
- "learning_rate": 1.6666666666666667e-06,
65
  "loss": 0.0,
66
- "reward": 0.6006944514811039,
67
- "reward_std": 0.3485470600426197,
68
- "rewards/semantic_entropy": 0.6006944514811039,
69
  "step": 5
70
  },
71
  {
72
- "completion_length": 29.46759271621704,
73
  "epoch": 0.35036496350364965,
74
- "grad_norm": 0.5462167859077454,
75
- "kl": 0.0006594657897949219,
76
- "learning_rate": 2e-06,
77
  "loss": 0.0,
78
- "reward": 0.5115740709006786,
79
- "reward_std": 0.3930557183921337,
80
- "rewards/semantic_entropy": 0.5115740709006786,
81
  "step": 6
82
  },
83
  {
84
- "completion_length": 30.0775465965271,
85
  "epoch": 0.40875912408759124,
86
- "grad_norm": 0.3974973261356354,
87
- "kl": 0.0009469985961914062,
88
- "learning_rate": 1.997564050259824e-06,
89
  "loss": 0.0,
90
- "reward": 0.6053240820765495,
91
- "reward_std": 0.3901033569127321,
92
- "rewards/semantic_entropy": 0.6053240820765495,
93
  "step": 7
94
  },
95
  {
96
- "completion_length": 30.643518686294556,
97
  "epoch": 0.46715328467153283,
98
- "grad_norm": 0.4026181399822235,
99
- "kl": 0.0013263225555419922,
100
- "learning_rate": 1.99026806874157e-06,
101
- "loss": 0.0001,
102
- "reward": 0.5821759402751923,
103
- "reward_std": 0.3471612483263016,
104
- "rewards/semantic_entropy": 0.5821759402751923,
105
  "step": 8
106
  },
107
  {
108
- "completion_length": 36.761574029922485,
109
  "epoch": 0.5255474452554745,
110
- "grad_norm": 0.46980857849121094,
111
- "kl": 0.0016498565673828125,
112
- "learning_rate": 1.9781476007338054e-06,
113
- "loss": 0.0001,
114
- "reward": 0.4861111231148243,
115
- "reward_std": 0.43300675973296165,
116
- "rewards/semantic_entropy": 0.4861111231148243,
117
  "step": 9
118
  },
119
  {
120
- "completion_length": 32.00000071525574,
121
  "epoch": 0.583941605839416,
122
- "grad_norm": 0.3003987669944763,
123
- "kl": 0.0032224655151367188,
124
- "learning_rate": 1.9612616959383188e-06,
125
- "loss": 0.0001,
126
- "reward": 0.5763888917863369,
127
- "reward_std": 0.3751811906695366,
128
- "rewards/semantic_entropy": 0.5763888917863369,
129
  "step": 10
130
  },
131
  {
132
- "completion_length": 26.78703737258911,
133
  "epoch": 0.6423357664233577,
134
- "grad_norm": 0.6452711224555969,
135
- "kl": 0.007064342498779297,
136
- "learning_rate": 1.9396926207859082e-06,
137
- "loss": 0.0003,
138
- "reward": 0.6087963059544563,
139
- "reward_std": 0.3837307542562485,
140
- "rewards/semantic_entropy": 0.6087963059544563,
141
  "step": 11
142
  },
143
  {
144
- "completion_length": 28.517361164093018,
145
  "epoch": 0.7007299270072993,
146
- "grad_norm": 0.5347678065299988,
147
- "kl": 0.011034965515136719,
148
- "learning_rate": 1.9135454576426007e-06,
149
- "loss": 0.0004,
150
- "reward": 0.5949074178934097,
151
- "reward_std": 0.38593913801014423,
152
- "rewards/semantic_entropy": 0.5949074178934097,
153
  "step": 12
154
  },
155
  {
156
- "completion_length": 32.70833349227905,
157
  "epoch": 0.7591240875912408,
158
- "grad_norm": 0.3103489875793457,
159
- "kl": 0.006297111511230469,
160
- "learning_rate": 1.8829475928589268e-06,
161
- "loss": 0.0003,
162
- "reward": 0.5405092723667622,
163
- "reward_std": 0.3787935618311167,
164
- "rewards/semantic_entropy": 0.5405092723667622,
165
  "step": 13
166
  },
167
  {
168
- "completion_length": 28.361111402511597,
169
  "epoch": 0.8175182481751825,
170
- "grad_norm": 0.341845840215683,
171
- "kl": 0.011461257934570312,
172
- "learning_rate": 1.8480480961564257e-06,
173
- "loss": 0.0005,
174
- "reward": 0.5798611156642437,
175
- "reward_std": 0.37119201570749283,
176
- "rewards/semantic_entropy": 0.5798611156642437,
177
  "step": 14
178
  },
179
  {
180
- "completion_length": 27.28703737258911,
181
  "epoch": 0.8759124087591241,
182
- "grad_norm": 0.3116164803504944,
183
- "kl": 0.00528717041015625,
184
- "learning_rate": 1.8090169943749474e-06,
185
- "loss": 0.0002,
186
- "reward": 0.5891203731298447,
187
- "reward_std": 0.39476561546325684,
188
- "rewards/semantic_entropy": 0.5891203731298447,
189
  "step": 15
190
  },
191
  {
192
- "completion_length": 27.549768924713135,
193
  "epoch": 0.9343065693430657,
194
- "grad_norm": 0.4672207236289978,
195
- "kl": 0.012537002563476562,
196
- "learning_rate": 1.766044443118978e-06,
197
- "loss": 0.0005,
198
- "reward": 0.6006944589316845,
199
- "reward_std": 0.3798612989485264,
200
- "rewards/semantic_entropy": 0.6006944589316845,
201
  "step": 16
202
  },
203
  {
204
- "completion_length": 27.44560170173645,
205
  "epoch": 0.9927007299270073,
206
- "grad_norm": 0.4451679587364197,
207
- "kl": 0.026611328125,
208
- "learning_rate": 1.719339800338651e-06,
209
- "loss": 0.0011,
210
- "reward": 0.6435185112059116,
211
- "reward_std": 0.36070936545729637,
212
- "rewards/semantic_entropy": 0.6435185112059116,
213
- "step": 17
214
- },
215
- {
216
- "completion_length": 16.0,
217
- "epoch": 1.0,
218
- "grad_norm": 0.4451679587364197,
219
- "kl": 0.08837890625,
220
- "learning_rate": 1.669130606358858e-06,
221
- "loss": 0.0001,
222
- "reward": 1.0,
223
- "reward_std": 0.0,
224
- "rewards/semantic_entropy": 1.0,
225
- "step": 18
226
- },
227
- {
228
- "completion_length": 24.4120374917984,
229
- "epoch": 1.0583941605839415,
230
- "grad_norm": 0.3066134452819824,
231
- "kl": 0.04150390625,
232
- "learning_rate": 1.615661475325658e-06,
233
- "loss": 0.0017,
234
- "reward": 0.6423611082136631,
235
- "reward_std": 0.3426077160984278,
236
- "rewards/semantic_entropy": 0.6423611082136631,
237
- "step": 19
238
- },
239
- {
240
- "completion_length": 24.349537134170532,
241
- "epoch": 1.1167883211678833,
242
- "grad_norm": 0.4213350713253021,
243
- "kl": 0.0197906494140625,
244
- "learning_rate": 1.5591929034707466e-06,
245
- "loss": 0.0008,
246
- "reward": 0.6168981604278088,
247
- "reward_std": 0.35933491215109825,
248
- "rewards/semantic_entropy": 0.6168981604278088,
249
- "step": 20
250
- },
251
- {
252
- "completion_length": 24.63078737258911,
253
- "epoch": 1.1751824817518248,
254
- "grad_norm": 0.35187050700187683,
255
- "kl": 0.03948211669921875,
256
- "learning_rate": 1.5e-06,
257
- "loss": 0.0016,
258
- "reward": 0.6712963134050369,
259
- "reward_std": 0.3001057803630829,
260
- "rewards/semantic_entropy": 0.6712963134050369,
261
- "step": 21
262
- },
263
- {
264
- "completion_length": 23.54745364189148,
265
- "epoch": 1.2335766423357664,
266
- "grad_norm": 0.30250656604766846,
267
- "kl": 0.02053070068359375,
268
- "learning_rate": 1.4383711467890773e-06,
269
- "loss": 0.0008,
270
- "reward": 0.7118055671453476,
271
- "reward_std": 0.2657315619289875,
272
- "rewards/semantic_entropy": 0.7118055671453476,
273
- "step": 22
274
- },
275
- {
276
- "completion_length": 29.81944465637207,
277
- "epoch": 1.2919708029197081,
278
- "grad_norm": 0.5680757164955139,
279
- "kl": 0.0273590087890625,
280
- "learning_rate": 1.374606593415912e-06,
281
- "loss": 0.0011,
282
- "reward": 0.5763888992369175,
283
- "reward_std": 0.38675259053707123,
284
- "rewards/semantic_entropy": 0.5763888992369175,
285
- "step": 23
286
- },
287
- {
288
- "completion_length": 24.825231790542603,
289
- "epoch": 1.3503649635036497,
290
- "grad_norm": 0.4038240611553192,
291
- "kl": 0.02518463134765625,
292
- "learning_rate": 1.3090169943749473e-06,
293
- "loss": 0.001,
294
- "reward": 0.6053240820765495,
295
- "reward_std": 0.3557727001607418,
296
- "rewards/semantic_entropy": 0.6053240820765495,
297
- "step": 24
298
- },
299
- {
300
- "completion_length": 31.438657999038696,
301
- "epoch": 1.4087591240875912,
302
- "grad_norm": 0.3054307699203491,
303
- "kl": 0.01131439208984375,
304
- "learning_rate": 1.2419218955996676e-06,
305
- "loss": 0.0005,
306
- "reward": 0.5127314850687981,
307
- "reward_std": 0.41082172095775604,
308
- "rewards/semantic_entropy": 0.5127314850687981,
309
- "step": 25
310
- },
311
- {
312
- "completion_length": 27.61805558204651,
313
- "epoch": 1.4671532846715327,
314
- "grad_norm": 0.3926026523113251,
315
- "kl": 0.017902374267578125,
316
- "learning_rate": 1.1736481776669305e-06,
317
- "loss": 0.0007,
318
- "reward": 0.6412037126719952,
319
- "reward_std": 0.3502417653799057,
320
- "rewards/semantic_entropy": 0.6412037126719952,
321
- "step": 26
322
- },
323
- {
324
- "completion_length": 28.688657522201538,
325
- "epoch": 1.5255474452554745,
326
- "grad_norm": 0.31289467215538025,
327
- "kl": 0.018411636352539062,
328
- "learning_rate": 1.1045284632676535e-06,
329
- "loss": 0.0007,
330
- "reward": 0.6319444552063942,
331
- "reward_std": 0.36655068024992943,
332
- "rewards/semantic_entropy": 0.6319444552063942,
333
- "step": 27
334
- },
335
- {
336
- "completion_length": 30.806713581085205,
337
- "epoch": 1.583941605839416,
338
- "grad_norm": 0.2388666570186615,
339
- "kl": 0.015939712524414062,
340
- "learning_rate": 1.034899496702501e-06,
341
- "loss": 0.0006,
342
- "reward": 0.6053240783512592,
343
- "reward_std": 0.3488955218344927,
344
- "rewards/semantic_entropy": 0.6053240783512592,
345
- "step": 28
346
- },
347
- {
348
- "completion_length": 26.402778148651123,
349
- "epoch": 1.6423357664233578,
350
- "grad_norm": 0.6465526819229126,
351
- "kl": 0.016387939453125,
352
- "learning_rate": 9.651005032974993e-07,
353
- "loss": 0.0007,
354
- "reward": 0.6932870522141457,
355
- "reward_std": 0.30794387497007847,
356
- "rewards/semantic_entropy": 0.6932870522141457,
357
- "step": 29
358
- },
359
- {
360
- "completion_length": 30.527778148651123,
361
- "epoch": 1.7007299270072993,
362
- "grad_norm": 0.2623353600502014,
363
- "kl": 0.020229339599609375,
364
- "learning_rate": 8.954715367323466e-07,
365
- "loss": 0.0008,
366
- "reward": 0.634259257465601,
367
- "reward_std": 0.304907551035285,
368
- "rewards/semantic_entropy": 0.634259257465601,
369
- "step": 30
370
- },
371
- {
372
- "completion_length": 28.02430558204651,
373
- "epoch": 1.7591240875912408,
374
- "grad_norm": 0.26452869176864624,
375
- "kl": 0.01360321044921875,
376
- "learning_rate": 8.263518223330696e-07,
377
- "loss": 0.0005,
378
- "reward": 0.6238426044583321,
379
- "reward_std": 0.3165153060108423,
380
- "rewards/semantic_entropy": 0.6238426044583321,
381
- "step": 31
382
- },
383
- {
384
- "completion_length": 29.64814805984497,
385
- "epoch": 1.8175182481751824,
386
- "grad_norm": 0.6390478014945984,
387
- "kl": 0.03284645080566406,
388
- "learning_rate": 7.580781044003324e-07,
389
- "loss": 0.0013,
390
- "reward": 0.6192129626870155,
391
- "reward_std": 0.3468264602124691,
392
- "rewards/semantic_entropy": 0.6192129626870155,
393
- "step": 32
394
- },
395
- {
396
- "completion_length": 31.30439829826355,
397
- "epoch": 1.8759124087591241,
398
- "grad_norm": 0.24422068893909454,
399
- "kl": 0.010921478271484375,
400
- "learning_rate": 6.909830056250526e-07,
401
- "loss": 0.0004,
402
- "reward": 0.6435185223817825,
403
- "reward_std": 0.33870384842157364,
404
- "rewards/semantic_entropy": 0.6435185223817825,
405
- "step": 33
406
- },
407
- {
408
- "completion_length": 29.41319465637207,
409
- "epoch": 1.9343065693430657,
410
- "grad_norm": 0.2262941151857376,
411
- "kl": 0.016880035400390625,
412
- "learning_rate": 6.253934065840879e-07,
413
- "loss": 0.0007,
414
- "reward": 0.6273148208856583,
415
- "reward_std": 0.3323148675262928,
416
- "rewards/semantic_entropy": 0.6273148208856583,
417
- "step": 34
418
- },
419
- {
420
- "completion_length": 28.489583492279053,
421
- "epoch": 1.9927007299270074,
422
- "grad_norm": 0.276526540517807,
423
- "kl": 0.01245880126953125,
424
- "learning_rate": 5.616288532109224e-07,
425
- "loss": 0.0005,
426
- "reward": 0.6238425932824612,
427
- "reward_std": 0.3457994442433119,
428
- "rewards/semantic_entropy": 0.6238425932824612,
429
- "step": 35
430
- },
431
- {
432
- "completion_length": 43.0,
433
- "epoch": 2.0,
434
- "grad_norm": 0.276526540517807,
435
- "kl": 0.00799560546875,
436
- "learning_rate": 5.000000000000002e-07,
437
- "loss": 0.0,
438
- "reward": 0.0,
439
- "reward_std": 0.4608885943889618,
440
- "rewards/semantic_entropy": 0.0,
441
- "step": 36
442
- },
443
- {
444
- "completion_length": 29.586805820465088,
445
- "epoch": 2.0583941605839415,
446
- "grad_norm": 0.2671959400177002,
447
- "kl": 0.018100738525390625,
448
- "learning_rate": 4.408070965292533e-07,
449
- "loss": 0.0007,
450
- "reward": 0.6157407388091087,
451
- "reward_std": 0.36300016567111015,
452
- "rewards/semantic_entropy": 0.6157407388091087,
453
- "step": 37
454
- },
455
- {
456
- "completion_length": 25.49074101448059,
457
- "epoch": 2.116788321167883,
458
- "grad_norm": 0.2453327476978302,
459
- "kl": 0.0186920166015625,
460
- "learning_rate": 3.843385246743417e-07,
461
- "loss": 0.0007,
462
- "reward": 0.6689814887940884,
463
- "reward_std": 0.2830717135220766,
464
- "rewards/semantic_entropy": 0.6689814887940884,
465
- "step": 38
466
- },
467
- {
468
- "completion_length": 30.723379850387573,
469
- "epoch": 2.1751824817518246,
470
- "grad_norm": 0.44519904255867004,
471
- "kl": 0.02108001708984375,
472
- "learning_rate": 3.308693936411421e-07,
473
- "loss": 0.0008,
474
- "reward": 0.6018518693745136,
475
- "reward_std": 0.32297887466847897,
476
- "rewards/semantic_entropy": 0.6018518693745136,
477
- "step": 39
478
- },
479
- {
480
- "completion_length": 24.181713104248047,
481
- "epoch": 2.2335766423357666,
482
- "grad_norm": 0.29531192779541016,
483
- "kl": 0.020915985107421875,
484
- "learning_rate": 2.8066019966134904e-07,
485
- "loss": 0.0008,
486
- "reward": 0.6793981567025185,
487
- "reward_std": 0.3183311792090535,
488
- "rewards/semantic_entropy": 0.6793981567025185,
489
- "step": 40
490
- },
491
- {
492
- "completion_length": 27.152777671813965,
493
- "epoch": 2.291970802919708,
494
- "grad_norm": 0.3067900836467743,
495
- "kl": 0.01100921630859375,
496
- "learning_rate": 2.339555568810221e-07,
497
- "loss": 0.0004,
498
- "reward": 0.6238426119089127,
499
- "reward_std": 0.36597814224660397,
500
- "rewards/semantic_entropy": 0.6238426119089127,
501
- "step": 41
502
- },
503
- {
504
- "completion_length": 28.2928249835968,
505
- "epoch": 2.3503649635036497,
506
- "grad_norm": 0.27559277415275574,
507
- "kl": 0.012683868408203125,
508
- "learning_rate": 1.9098300562505264e-07,
509
- "loss": 0.0005,
510
- "reward": 0.6967592611908913,
511
- "reward_std": 0.29866353049874306,
512
- "rewards/semantic_entropy": 0.6967592611908913,
513
- "step": 42
514
- },
515
- {
516
- "completion_length": 26.355324029922485,
517
- "epoch": 2.408759124087591,
518
- "grad_norm": 0.3306453227996826,
519
- "kl": 0.02254486083984375,
520
- "learning_rate": 1.5195190384357404e-07,
521
- "loss": 0.0009,
522
- "reward": 0.6990740746259689,
523
- "reward_std": 0.30733868665993214,
524
- "rewards/semantic_entropy": 0.6990740746259689,
525
- "step": 43
526
- },
527
- {
528
- "completion_length": 27.85185170173645,
529
- "epoch": 2.4671532846715327,
530
- "grad_norm": 0.37372887134552,
531
- "kl": 0.03179931640625,
532
- "learning_rate": 1.1705240714107301e-07,
533
- "loss": 0.0013,
534
- "reward": 0.6562500074505806,
535
- "reward_std": 0.28820149786770344,
536
- "rewards/semantic_entropy": 0.6562500074505806,
537
- "step": 44
538
- },
539
- {
540
- "completion_length": 24.635416746139526,
541
- "epoch": 2.5255474452554747,
542
- "grad_norm": 0.3697076439857483,
543
- "kl": 0.015590667724609375,
544
- "learning_rate": 8.645454235739902e-08,
545
- "loss": 0.0006,
546
- "reward": 0.7037037089467049,
547
- "reward_std": 0.2801394369453192,
548
- "rewards/semantic_entropy": 0.7037037089467049,
549
- "step": 45
550
- },
551
- {
552
- "completion_length": 26.35185217857361,
553
- "epoch": 2.5839416058394162,
554
- "grad_norm": 0.31710338592529297,
555
- "kl": 0.03106689453125,
556
- "learning_rate": 6.030737921409168e-08,
557
- "loss": 0.0012,
558
- "reward": 0.6574074104428291,
559
- "reward_std": 0.33865234442055225,
560
- "rewards/semantic_entropy": 0.6574074104428291,
561
- "step": 46
562
- },
563
- {
564
- "completion_length": 26.200231552124023,
565
- "epoch": 2.6423357664233578,
566
- "grad_norm": 0.2386447936296463,
567
- "kl": 0.021045684814453125,
568
- "learning_rate": 3.87383040616811e-08,
569
- "loss": 0.0008,
570
- "reward": 0.6574074104428291,
571
- "reward_std": 0.30992276407778263,
572
- "rewards/semantic_entropy": 0.6574074104428291,
573
- "step": 47
574
- },
575
- {
576
- "completion_length": 26.50810217857361,
577
- "epoch": 2.7007299270072993,
578
- "grad_norm": 0.5202280282974243,
579
- "kl": 0.02829742431640625,
580
- "learning_rate": 2.185239926619431e-08,
581
- "loss": 0.0011,
582
- "reward": 0.7141203731298447,
583
- "reward_std": 0.28790368139743805,
584
- "rewards/semantic_entropy": 0.7141203731298447,
585
- "step": 48
586
- },
587
- {
588
- "completion_length": 29.20138907432556,
589
- "epoch": 2.759124087591241,
590
- "grad_norm": 0.30392101407051086,
591
- "kl": 0.036502838134765625,
592
- "learning_rate": 9.731931258429638e-09,
593
- "loss": 0.0015,
594
- "reward": 0.6631944477558136,
595
- "reward_std": 0.31401310954242945,
596
- "rewards/semantic_entropy": 0.6631944477558136,
597
- "step": 49
598
- },
599
- {
600
- "completion_length": 24.504629611968994,
601
- "epoch": 2.8175182481751824,
602
- "grad_norm": 0.2555871307849884,
603
- "kl": 0.017333984375,
604
- "learning_rate": 2.435949740175802e-09,
605
- "loss": 0.0007,
606
- "reward": 0.6504629701375961,
607
- "reward_std": 0.3291088491678238,
608
- "rewards/semantic_entropy": 0.6504629701375961,
609
- "step": 50
610
- },
611
- {
612
- "completion_length": 30.652778148651123,
613
- "epoch": 2.875912408759124,
614
- "grad_norm": 0.3172709345817566,
615
- "kl": 0.014835357666015625,
616
  "learning_rate": 0.0,
617
- "loss": 0.0006,
618
- "reward": 0.5914351791143417,
619
- "reward_std": 0.3534049801528454,
620
- "rewards/semantic_entropy": 0.5914351791143417,
621
- "step": 51
622
  },
623
  {
624
- "epoch": 2.875912408759124,
625
- "step": 51,
626
  "total_flos": 0.0,
627
- "train_loss": 0.0006088586881692043,
628
- "train_runtime": 10241.9671,
629
- "train_samples_per_second": 0.239,
630
- "train_steps_per_second": 0.005
631
  }
632
  ],
633
  "logging_steps": 1,
634
- "max_steps": 51,
635
  "num_input_tokens_seen": 0,
636
- "num_train_epochs": 3,
637
  "save_steps": 10,
638
  "stateful_callbacks": {
639
  "TrainerControl": {
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9927007299270073,
5
  "eval_steps": 100,
6
+ "global_step": 17,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 31.543402910232544,
13
  "epoch": 0.058394160583941604,
14
+ "grad_norm": 0.6463247537612915,
15
  "kl": 0.0,
16
+ "learning_rate": 1e-06,
17
  "loss": 0.0,
18
+ "reward": 0.4809027835726738,
19
+ "reward_std": 0.4057047627866268,
20
+ "rewards/semantic_entropy": 0.4809027835726738,
21
  "step": 1
22
  },
23
  {
24
+ "completion_length": 27.753472566604614,
25
  "epoch": 0.11678832116788321,
26
+ "grad_norm": 0.31535470485687256,
27
  "kl": 0.0,
28
+ "learning_rate": 2e-06,
29
  "loss": 0.0,
30
+ "reward": 0.5850694514811039,
31
+ "reward_std": 0.3451368249952793,
32
+ "rewards/semantic_entropy": 0.5850694514811039,
33
  "step": 2
34
  },
35
  {
36
+ "completion_length": 32.34895849227905,
37
  "epoch": 0.17518248175182483,
38
+ "grad_norm": 0.38025209307670593,
39
+ "kl": 0.0008878707885742188,
40
+ "learning_rate": 1.9781476007338054e-06,
41
  "loss": 0.0,
42
+ "reward": 0.4791666679084301,
43
+ "reward_std": 0.38540373742580414,
44
+ "rewards/semantic_entropy": 0.4791666679084301,
45
  "step": 3
46
  },
47
  {
48
+ "completion_length": 27.75607681274414,
49
  "epoch": 0.23357664233576642,
50
+ "grad_norm": 0.7206792235374451,
51
+ "kl": 0.0020771026611328125,
52
+ "learning_rate": 1.9135454576426007e-06,
53
  "loss": 0.0,
54
+ "reward": 0.5998263880610466,
55
+ "reward_std": 0.348458593711257,
56
+ "rewards/semantic_entropy": 0.5998263880610466,
57
  "step": 4
58
  },
59
  {
60
+ "completion_length": 29.860243558883667,
61
  "epoch": 0.291970802919708,
62
+ "grad_norm": 0.5087810754776001,
63
+ "kl": 0.0018243789672851562,
64
+ "learning_rate": 1.8090169943749474e-06,
65
  "loss": 0.0,
66
+ "reward": 0.5034722238779068,
67
+ "reward_std": 0.39532990381121635,
68
+ "rewards/semantic_entropy": 0.5034722238779068,
69
  "step": 5
70
  },
71
  {
72
+ "completion_length": 28.811631679534912,
73
  "epoch": 0.35036496350364965,
74
+ "grad_norm": 0.4511905610561371,
75
+ "kl": 0.0026292800903320312,
76
+ "learning_rate": 1.669130606358858e-06,
77
  "loss": 0.0,
78
+ "reward": 0.5295138992369175,
79
+ "reward_std": 0.3658079691231251,
80
+ "rewards/semantic_entropy": 0.5295138992369175,
81
  "step": 6
82
  },
83
  {
84
+ "completion_length": 31.047743320465088,
85
  "epoch": 0.40875912408759124,
86
+ "grad_norm": 0.46206873655319214,
87
+ "kl": 0.003326416015625,
88
+ "learning_rate": 1.5e-06,
89
  "loss": 0.0,
90
+ "reward": 0.4973958395421505,
91
+ "reward_std": 0.39973679929971695,
92
+ "rewards/semantic_entropy": 0.4973958395421505,
93
  "step": 7
94
  },
95
  {
96
+ "completion_length": 30.878472328186035,
97
  "epoch": 0.46715328467153283,
98
+ "grad_norm": 0.4796462655067444,
99
+ "kl": 0.004016876220703125,
100
+ "learning_rate": 1.3090169943749473e-06,
101
+ "loss": 0.0,
102
+ "reward": 0.5217013955116272,
103
+ "reward_std": 0.36933426558971405,
104
+ "rewards/semantic_entropy": 0.5217013955116272,
105
  "step": 8
106
  },
107
  {
108
+ "completion_length": 34.5590283870697,
109
  "epoch": 0.5255474452554745,
110
+ "grad_norm": 0.43365350365638733,
111
+ "kl": 0.0037174224853515625,
112
+ "learning_rate": 1.1045284632676535e-06,
113
+ "loss": 0.0,
114
+ "reward": 0.4444444589316845,
115
+ "reward_std": 0.4142540544271469,
116
+ "rewards/semantic_entropy": 0.4444444589316845,
117
  "step": 9
118
  },
119
  {
120
+ "completion_length": 31.54600763320923,
121
  "epoch": 0.583941605839416,
122
+ "grad_norm": 0.3461940586566925,
123
+ "kl": 0.0054531097412109375,
124
+ "learning_rate": 8.954715367323466e-07,
125
+ "loss": 0.0,
126
+ "reward": 0.5312499962747097,
127
+ "reward_std": 0.33972141705453396,
128
+ "rewards/semantic_entropy": 0.5312499962747097,
129
  "step": 10
130
  },
131
  {
132
+ "completion_length": 25.934895992279053,
133
  "epoch": 0.6423357664233577,
134
+ "grad_norm": 0.7275694012641907,
135
+ "kl": 0.0059986114501953125,
136
+ "learning_rate": 6.909830056250526e-07,
137
+ "loss": 0.0,
138
+ "reward": 0.5425347350537777,
139
+ "reward_std": 0.39345845952630043,
140
+ "rewards/semantic_entropy": 0.5425347350537777,
141
  "step": 11
142
  },
143
  {
144
+ "completion_length": 27.58420157432556,
145
  "epoch": 0.7007299270072993,
146
+ "grad_norm": 0.578926146030426,
147
+ "kl": 0.010272979736328125,
148
+ "learning_rate": 5.000000000000002e-07,
149
+ "loss": 0.0,
150
+ "reward": 0.5564236119389534,
151
+ "reward_std": 0.38245424441993237,
152
+ "rewards/semantic_entropy": 0.5564236119389534,
153
  "step": 12
154
  },
155
  {
156
+ "completion_length": 32.76302146911621,
157
  "epoch": 0.7591240875912408,
158
+ "grad_norm": 0.3081968128681183,
159
+ "kl": 0.0044574737548828125,
160
+ "learning_rate": 3.308693936411421e-07,
161
+ "loss": 0.0,
162
+ "reward": 0.4583333395421505,
163
+ "reward_std": 0.39592672139406204,
164
+ "rewards/semantic_entropy": 0.4583333395421505,
165
  "step": 13
166
  },
167
  {
168
+ "completion_length": 28.50086808204651,
169
  "epoch": 0.8175182481751825,
170
+ "grad_norm": 0.4480704367160797,
171
+ "kl": 0.006603240966796875,
172
+ "learning_rate": 1.9098300562505264e-07,
173
+ "loss": 0.0,
174
+ "reward": 0.4782986231148243,
175
+ "reward_std": 0.3709658682346344,
176
+ "rewards/semantic_entropy": 0.4782986231148243,
177
  "step": 14
178
  },
179
  {
180
+ "completion_length": 28.36545181274414,
181
  "epoch": 0.8759124087591241,
182
+ "grad_norm": 0.2726985514163971,
183
+ "kl": 0.00714111328125,
184
+ "learning_rate": 8.645454235739902e-08,
185
+ "loss": 0.0,
186
+ "reward": 0.572048619389534,
187
+ "reward_std": 0.3576664440333843,
188
+ "rewards/semantic_entropy": 0.572048619389534,
189
  "step": 15
190
  },
191
  {
192
+ "completion_length": 29.555555820465088,
193
  "epoch": 0.9343065693430657,
194
+ "grad_norm": 0.5542200207710266,
195
+ "kl": 0.011875152587890625,
196
+ "learning_rate": 2.185239926619431e-08,
197
+ "loss": 0.0,
198
+ "reward": 0.5442708358168602,
199
+ "reward_std": 0.4052053317427635,
200
+ "rewards/semantic_entropy": 0.5442708358168602,
201
  "step": 16
202
  },
203
  {
204
+ "completion_length": 28.62326431274414,
205
  "epoch": 0.9927007299270073,
206
+ "grad_norm": 0.4658753275871277,
207
+ "kl": 0.010679244995117188,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  "learning_rate": 0.0,
209
+ "loss": 0.0,
210
+ "reward": 0.564236119389534,
211
+ "reward_std": 0.3762592002749443,
212
+ "rewards/semantic_entropy": 0.564236119389534,
213
+ "step": 17
214
  },
215
  {
216
+ "epoch": 0.9927007299270073,
217
+ "step": 17,
218
  "total_flos": 0.0,
219
+ "train_loss": 2.8922741848572622e-08,
220
+ "train_runtime": 6158.5867,
221
+ "train_samples_per_second": 0.133,
222
+ "train_steps_per_second": 0.003
223
  }
224
  ],
225
  "logging_steps": 1,
226
+ "max_steps": 17,
227
  "num_input_tokens_seen": 0,
228
+ "num_train_epochs": 1,
229
  "save_steps": 10,
230
  "stateful_callbacks": {
231
  "TrainerControl": {