Wenboz commited on
Commit
fd11885
·
verified ·
1 Parent(s): 96aa8c3

Model save

Browse files
Files changed (4) hide show
  1. README.md +9 -9
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +947 -947
README.md CHANGED
@@ -18,15 +18,15 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
- - Loss: 0.9206
22
- - Rewards/chosen: -4.6196
23
- - Rewards/rejected: -6.0458
24
  - Rewards/accuracies: 0.7320
25
- - Rewards/margins: 1.4262
26
- - Logps/rejected: -3.0229
27
- - Logps/chosen: -2.3098
28
- - Logits/rejected: -2.1167
29
- - Logits/chosen: -2.1038
30
 
31
  ## Model description
32
 
@@ -63,7 +63,7 @@ The following hyperparameters were used during training:
63
 
64
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
65
  |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
66
- | 0.9739 | 0.8375 | 400 | 0.9206 | -4.6196 | -6.0458 | 0.7320 | 1.4262 | -3.0229 | -2.3098 | -2.1167 | -2.1038 |
67
 
68
 
69
  ### Framework versions
 
18
 
19
  This model is a fine-tuned version of [alignment-handbook/zephyr-7b-sft-full](https://huggingface.co/alignment-handbook/zephyr-7b-sft-full) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 1.2093
22
+ - Rewards/chosen: -5.8139
23
+ - Rewards/rejected: -7.6630
24
  - Rewards/accuracies: 0.7320
25
+ - Rewards/margins: 1.8490
26
+ - Logps/rejected: -3.8315
27
+ - Logps/chosen: -2.9070
28
+ - Logits/rejected: -1.8947
29
+ - Logits/chosen: -1.8796
30
 
31
  ## Model description
32
 
 
63
 
64
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
65
  |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
66
+ | 1.267 | 0.8375 | 400 | 1.2093 | -5.8139 | -7.6630 | 0.7320 | 1.8490 | -3.8315 | -2.9070 | -1.8947 | -1.8796 |
67
 
68
 
69
  ### Framework versions
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.998691442030882,
3
  "total_flos": 0.0,
4
- "train_loss": 1.0352550020257882,
5
- "train_runtime": 22152.8972,
6
  "train_samples": 61135,
7
- "train_samples_per_second": 2.76,
8
- "train_steps_per_second": 0.022
9
  }
 
1
  {
2
  "epoch": 0.998691442030882,
3
  "total_flos": 0.0,
4
+ "train_loss": 1.3686470238167785,
5
+ "train_runtime": 22217.0339,
6
  "train_samples": 61135,
7
+ "train_samples_per_second": 2.752,
8
+ "train_steps_per_second": 0.021
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.998691442030882,
3
  "total_flos": 0.0,
4
- "train_loss": 1.0352550020257882,
5
- "train_runtime": 22152.8972,
6
  "train_samples": 61135,
7
- "train_samples_per_second": 2.76,
8
- "train_steps_per_second": 0.022
9
  }
 
1
  {
2
  "epoch": 0.998691442030882,
3
  "total_flos": 0.0,
4
+ "train_loss": 1.3686470238167785,
5
+ "train_runtime": 22217.0339,
6
  "train_samples": 61135,
7
+ "train_samples_per_second": 2.752,
8
+ "train_steps_per_second": 0.021
9
  }
trainer_state.json CHANGED
@@ -10,13 +10,13 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.002093692750588851,
13
- "grad_norm": 21.93495506191106,
14
  "learning_rate": 1.0416666666666666e-08,
15
  "logits/chosen": -2.8280014991760254,
16
  "logits/rejected": -2.8466408252716064,
17
  "logps/chosen": -1.1081000566482544,
18
  "logps/rejected": -1.146370530128479,
19
- "loss": 1.383,
20
  "rewards/accuracies": 0.46875,
21
  "rewards/chosen": -2.216200113296509,
22
  "rewards/margins": 0.0765407383441925,
@@ -25,1453 +25,1453 @@
25
  },
26
  {
27
  "epoch": 0.010468463752944255,
28
- "grad_norm": 18.624658350148188,
29
  "learning_rate": 5.208333333333333e-08,
30
- "logits/chosen": -2.881028652191162,
31
- "logits/rejected": -2.847470760345459,
32
- "logps/chosen": -1.0212035179138184,
33
- "logps/rejected": -1.0807099342346191,
34
- "loss": 1.2828,
35
  "rewards/accuracies": 0.5390625,
36
- "rewards/chosen": -2.0424070358276367,
37
- "rewards/margins": 0.11901294440031052,
38
- "rewards/rejected": -2.1614198684692383,
39
  "step": 5
40
  },
41
  {
42
  "epoch": 0.02093692750588851,
43
- "grad_norm": 17.13109379985505,
44
  "learning_rate": 1.0416666666666667e-07,
45
- "logits/chosen": -2.9920997619628906,
46
- "logits/rejected": -2.94093918800354,
47
- "logps/chosen": -1.0227670669555664,
48
- "logps/rejected": -1.165470004081726,
49
- "loss": 1.3069,
50
  "rewards/accuracies": 0.581250011920929,
51
- "rewards/chosen": -2.045534133911133,
52
- "rewards/margins": 0.28540605306625366,
53
- "rewards/rejected": -2.330940008163452,
54
  "step": 10
55
  },
56
  {
57
  "epoch": 0.031405391258832765,
58
- "grad_norm": 20.205927927566222,
59
  "learning_rate": 1.5624999999999999e-07,
60
- "logits/chosen": -2.919098377227783,
61
- "logits/rejected": -2.905874729156494,
62
- "logps/chosen": -1.0498722791671753,
63
- "logps/rejected": -1.1000678539276123,
64
- "loss": 1.2921,
65
  "rewards/accuracies": 0.5375000238418579,
66
- "rewards/chosen": -2.0997445583343506,
67
- "rewards/margins": 0.10039126873016357,
68
- "rewards/rejected": -2.2001357078552246,
69
  "step": 15
70
  },
71
  {
72
  "epoch": 0.04187385501177702,
73
- "grad_norm": 15.5061106280485,
74
  "learning_rate": 2.0833333333333333e-07,
75
- "logits/chosen": -2.941197156906128,
76
- "logits/rejected": -2.8902478218078613,
77
- "logps/chosen": -0.9996241331100464,
78
- "logps/rejected": -1.094834566116333,
79
- "loss": 1.2669,
80
- "rewards/accuracies": 0.5562499761581421,
81
- "rewards/chosen": -1.9992482662200928,
82
- "rewards/margins": 0.190421000123024,
83
- "rewards/rejected": -2.189669132232666,
84
  "step": 20
85
  },
86
  {
87
  "epoch": 0.05234231876472128,
88
- "grad_norm": 15.150417333437225,
89
  "learning_rate": 2.604166666666667e-07,
90
- "logits/chosen": -2.944608449935913,
91
- "logits/rejected": -2.8806254863739014,
92
- "logps/chosen": -0.9165313839912415,
93
- "logps/rejected": -1.103171467781067,
94
- "loss": 1.2359,
95
- "rewards/accuracies": 0.5687500238418579,
96
- "rewards/chosen": -1.833062767982483,
97
- "rewards/margins": 0.3732803463935852,
98
- "rewards/rejected": -2.206342935562134,
99
  "step": 25
100
  },
101
  {
102
  "epoch": 0.06281078251766553,
103
- "grad_norm": 15.053243876819153,
104
  "learning_rate": 3.1249999999999997e-07,
105
- "logits/chosen": -2.916238307952881,
106
- "logits/rejected": -2.8971691131591797,
107
- "logps/chosen": -0.8922262191772461,
108
- "logps/rejected": -0.924017071723938,
109
- "loss": 1.2538,
110
  "rewards/accuracies": 0.550000011920929,
111
- "rewards/chosen": -1.7844524383544922,
112
- "rewards/margins": 0.06358186155557632,
113
- "rewards/rejected": -1.848034143447876,
114
  "step": 30
115
  },
116
  {
117
  "epoch": 0.07327924627060979,
118
- "grad_norm": 15.295985558098911,
119
  "learning_rate": 3.645833333333333e-07,
120
- "logits/chosen": -2.924506664276123,
121
- "logits/rejected": -2.8910322189331055,
122
- "logps/chosen": -0.9058906435966492,
123
- "logps/rejected": -1.0956400632858276,
124
- "loss": 1.2321,
125
- "rewards/accuracies": 0.59375,
126
- "rewards/chosen": -1.8117812871932983,
127
- "rewards/margins": 0.3794988691806793,
128
- "rewards/rejected": -2.1912801265716553,
129
  "step": 35
130
  },
131
  {
132
  "epoch": 0.08374771002355404,
133
- "grad_norm": 15.187740492675982,
134
  "learning_rate": 4.1666666666666667e-07,
135
- "logits/chosen": -2.912510871887207,
136
- "logits/rejected": -2.8711163997650146,
137
- "logps/chosen": -0.9113177061080933,
138
- "logps/rejected": -1.0306214094161987,
139
- "loss": 1.2069,
140
- "rewards/accuracies": 0.574999988079071,
141
- "rewards/chosen": -1.8226354122161865,
142
- "rewards/margins": 0.23860761523246765,
143
- "rewards/rejected": -2.0612428188323975,
144
  "step": 40
145
  },
146
  {
147
  "epoch": 0.0942161737764983,
148
- "grad_norm": 13.694056162547822,
149
  "learning_rate": 4.6874999999999996e-07,
150
- "logits/chosen": -2.8661983013153076,
151
- "logits/rejected": -2.871129274368286,
152
- "logps/chosen": -0.8866590261459351,
153
- "logps/rejected": -1.0527524948120117,
154
- "loss": 1.171,
155
- "rewards/accuracies": 0.6187499761581421,
156
- "rewards/chosen": -1.7733180522918701,
157
- "rewards/margins": 0.33218708634376526,
158
- "rewards/rejected": -2.1055049896240234,
159
  "step": 45
160
  },
161
  {
162
  "epoch": 0.10468463752944256,
163
- "grad_norm": 16.852078598312655,
164
  "learning_rate": 4.999731868769026e-07,
165
- "logits/chosen": -2.8084216117858887,
166
- "logits/rejected": -2.8203370571136475,
167
- "logps/chosen": -0.9374567866325378,
168
- "logps/rejected": -1.173828125,
169
- "loss": 1.2165,
170
- "rewards/accuracies": 0.643750011920929,
171
- "rewards/chosen": -1.8749135732650757,
172
- "rewards/margins": 0.47274255752563477,
173
- "rewards/rejected": -2.34765625,
174
  "step": 50
175
  },
176
  {
177
  "epoch": 0.11515310128238682,
178
- "grad_norm": 21.114541338810998,
179
  "learning_rate": 4.996716052911017e-07,
180
- "logits/chosen": -2.8955435752868652,
181
- "logits/rejected": -2.90877103805542,
182
- "logps/chosen": -1.0479611158370972,
183
- "logps/rejected": -1.1501401662826538,
184
- "loss": 1.2167,
185
- "rewards/accuracies": 0.5687500238418579,
186
- "rewards/chosen": -2.0959222316741943,
187
- "rewards/margins": 0.20435786247253418,
188
- "rewards/rejected": -2.3002803325653076,
189
  "step": 55
190
  },
191
  {
192
  "epoch": 0.12562156503533106,
193
- "grad_norm": 24.574129872487944,
194
  "learning_rate": 4.990353313429303e-07,
195
- "logits/chosen": -2.8288521766662598,
196
- "logits/rejected": -2.821425199508667,
197
- "logps/chosen": -1.0197687149047852,
198
- "logps/rejected": -1.1724817752838135,
199
- "loss": 1.1905,
200
- "rewards/accuracies": 0.5562499761581421,
201
- "rewards/chosen": -2.0395374298095703,
202
- "rewards/margins": 0.30542629957199097,
203
- "rewards/rejected": -2.344963550567627,
204
  "step": 60
205
  },
206
  {
207
  "epoch": 0.1360900287882753,
208
- "grad_norm": 19.926854717028363,
209
  "learning_rate": 4.980652179769217e-07,
210
- "logits/chosen": -2.8469738960266113,
211
- "logits/rejected": -2.8695998191833496,
212
- "logps/chosen": -0.9902658462524414,
213
- "logps/rejected": -1.1480392217636108,
214
- "loss": 1.1935,
215
  "rewards/accuracies": 0.59375,
216
- "rewards/chosen": -1.9805316925048828,
217
- "rewards/margins": 0.3155469298362732,
218
- "rewards/rejected": -2.2960784435272217,
219
  "step": 65
220
  },
221
  {
222
  "epoch": 0.14655849254121958,
223
- "grad_norm": 19.196504743878798,
224
  "learning_rate": 4.967625656594781e-07,
225
- "logits/chosen": -2.8496787548065186,
226
- "logits/rejected": -2.8487606048583984,
227
- "logps/chosen": -1.0576945543289185,
228
- "logps/rejected": -1.2366907596588135,
229
- "loss": 1.1509,
230
- "rewards/accuracies": 0.612500011920929,
231
- "rewards/chosen": -2.115389108657837,
232
- "rewards/margins": 0.35799235105514526,
233
- "rewards/rejected": -2.473381519317627,
234
  "step": 70
235
  },
236
  {
237
  "epoch": 0.15702695629416383,
238
- "grad_norm": 27.309614082392827,
239
  "learning_rate": 4.951291206355559e-07,
240
- "logits/chosen": -2.802968740463257,
241
- "logits/rejected": -2.753392219543457,
242
- "logps/chosen": -1.0896332263946533,
243
- "logps/rejected": -1.3515465259552002,
244
- "loss": 1.1705,
245
- "rewards/accuracies": 0.643750011920929,
246
- "rewards/chosen": -2.1792664527893066,
247
- "rewards/margins": 0.5238265991210938,
248
- "rewards/rejected": -2.7030930519104004,
249
  "step": 75
250
  },
251
  {
252
  "epoch": 0.16749542004710807,
253
- "grad_norm": 19.557847979657012,
254
  "learning_rate": 4.93167072587771e-07,
255
- "logits/chosen": -2.7415664196014404,
256
- "logits/rejected": -2.7370200157165527,
257
- "logps/chosen": -1.0582047700881958,
258
- "logps/rejected": -1.385558843612671,
259
- "loss": 1.1638,
260
  "rewards/accuracies": 0.6875,
261
- "rewards/chosen": -2.1164095401763916,
262
- "rewards/margins": 0.6547082662582397,
263
- "rewards/rejected": -2.771117687225342,
264
  "step": 80
265
  },
266
  {
267
  "epoch": 0.17796388380005235,
268
- "grad_norm": 20.605876232904194,
269
  "learning_rate": 4.908790517010636e-07,
270
- "logits/chosen": -2.7899672985076904,
271
- "logits/rejected": -2.7431349754333496,
272
- "logps/chosen": -1.0243644714355469,
273
- "logps/rejected": -1.3202240467071533,
274
- "loss": 1.154,
275
- "rewards/accuracies": 0.6312500238418579,
276
- "rewards/chosen": -2.0487289428710938,
277
- "rewards/margins": 0.591719388961792,
278
- "rewards/rejected": -2.6404480934143066,
279
  "step": 85
280
  },
281
  {
282
  "epoch": 0.1884323475529966,
283
- "grad_norm": 15.343218366521764,
284
  "learning_rate": 4.882681251368548e-07,
285
- "logits/chosen": -2.7653021812438965,
286
- "logits/rejected": -2.7124242782592773,
287
- "logps/chosen": -1.115639567375183,
288
- "logps/rejected": -1.4925037622451782,
289
- "loss": 1.1234,
290
- "rewards/accuracies": 0.6187499761581421,
291
- "rewards/chosen": -2.231279134750366,
292
- "rewards/margins": 0.7537283897399902,
293
- "rewards/rejected": -2.9850075244903564,
294
  "step": 90
295
  },
296
  {
297
  "epoch": 0.19890081130594087,
298
- "grad_norm": 28.33639713577856,
299
  "learning_rate": 4.853377929214243e-07,
300
- "logits/chosen": -2.6936557292938232,
301
- "logits/rejected": -2.679147243499756,
302
- "logps/chosen": -1.1330680847167969,
303
- "logps/rejected": -1.4060020446777344,
304
- "loss": 1.1476,
305
- "rewards/accuracies": 0.6000000238418579,
306
- "rewards/chosen": -2.2661361694335938,
307
- "rewards/margins": 0.5458678007125854,
308
- "rewards/rejected": -2.8120040893554688,
309
  "step": 95
310
  },
311
  {
312
  "epoch": 0.2093692750588851,
313
- "grad_norm": 26.32383832858066,
314
  "learning_rate": 4.820919832540181e-07,
315
- "logits/chosen": -2.7667784690856934,
316
- "logits/rejected": -2.7402002811431885,
317
- "logps/chosen": -1.1223466396331787,
318
- "logps/rejected": -1.3316584825515747,
319
- "loss": 1.2012,
320
- "rewards/accuracies": 0.543749988079071,
321
- "rewards/chosen": -2.2446932792663574,
322
- "rewards/margins": 0.41862398386001587,
323
- "rewards/rejected": -2.6633169651031494,
324
  "step": 100
325
  },
326
  {
327
  "epoch": 0.21983773881182936,
328
- "grad_norm": 17.267993211929276,
329
  "learning_rate": 4.785350472409791e-07,
330
- "logits/chosen": -2.70428729057312,
331
- "logits/rejected": -2.6778674125671387,
332
- "logps/chosen": -1.146924376487732,
333
- "logps/rejected": -1.4666913747787476,
334
- "loss": 1.075,
335
- "rewards/accuracies": 0.643750011920929,
336
- "rewards/chosen": -2.293848752975464,
337
- "rewards/margins": 0.6395338773727417,
338
- "rewards/rejected": -2.933382749557495,
339
  "step": 105
340
  },
341
  {
342
  "epoch": 0.23030620256477363,
343
- "grad_norm": 35.64550114208054,
344
  "learning_rate": 4.7467175306295647e-07,
345
- "logits/chosen": -2.6520252227783203,
346
- "logits/rejected": -2.6314826011657715,
347
- "logps/chosen": -1.064710259437561,
348
- "logps/rejected": -1.3563224077224731,
349
- "loss": 1.0764,
350
- "rewards/accuracies": 0.637499988079071,
351
- "rewards/chosen": -2.129420518875122,
352
- "rewards/margins": 0.5832241773605347,
353
- "rewards/rejected": -2.7126448154449463,
354
  "step": 110
355
  },
356
  {
357
  "epoch": 0.24077466631771788,
358
- "grad_norm": 22.177575846770726,
359
  "learning_rate": 4.70507279583015e-07,
360
- "logits/chosen": -2.6880195140838623,
361
- "logits/rejected": -2.6790332794189453,
362
- "logps/chosen": -1.1167609691619873,
363
- "logps/rejected": -1.3075841665267944,
364
- "loss": 1.1395,
365
- "rewards/accuracies": 0.637499988079071,
366
- "rewards/chosen": -2.2335219383239746,
367
- "rewards/margins": 0.381646066904068,
368
- "rewards/rejected": -2.615168333053589,
369
  "step": 115
370
  },
371
  {
372
  "epoch": 0.2512431300706621,
373
- "grad_norm": 19.13585701945862,
374
  "learning_rate": 4.6604720940421207e-07,
375
- "logits/chosen": -2.7028918266296387,
376
- "logits/rejected": -2.651949405670166,
377
- "logps/chosen": -1.063138484954834,
378
- "logps/rejected": -1.4494984149932861,
379
- "loss": 1.1271,
380
- "rewards/accuracies": 0.637499988079071,
381
- "rewards/chosen": -2.126276969909668,
382
- "rewards/margins": 0.7727198600769043,
383
- "rewards/rejected": -2.8989968299865723,
384
  "step": 120
385
  },
386
  {
387
  "epoch": 0.26171159382360637,
388
- "grad_norm": 25.557691612030105,
389
  "learning_rate": 4.612975213859487e-07,
390
- "logits/chosen": -2.602529764175415,
391
- "logits/rejected": -2.5662357807159424,
392
- "logps/chosen": -1.115337610244751,
393
- "logps/rejected": -1.4704806804656982,
394
- "loss": 1.129,
395
- "rewards/accuracies": 0.6625000238418579,
396
- "rewards/chosen": -2.230675220489502,
397
- "rewards/margins": 0.710286021232605,
398
- "rewards/rejected": -2.9409613609313965,
399
  "step": 125
400
  },
401
  {
402
  "epoch": 0.2721800575765506,
403
- "grad_norm": 24.75408152143484,
404
  "learning_rate": 4.5626458262912735e-07,
405
- "logits/chosen": -2.6199193000793457,
406
- "logits/rejected": -2.6276497840881348,
407
- "logps/chosen": -1.1212177276611328,
408
- "logps/rejected": -1.4887864589691162,
409
- "loss": 1.0755,
410
- "rewards/accuracies": 0.706250011920929,
411
- "rewards/chosen": -2.2424354553222656,
412
- "rewards/margins": 0.735137403011322,
413
- "rewards/rejected": -2.9775729179382324,
414
  "step": 130
415
  },
416
  {
417
  "epoch": 0.2826485213294949,
418
- "grad_norm": 25.657005868555796,
419
  "learning_rate": 4.5095513994085974e-07,
420
- "logits/chosen": -2.5629477500915527,
421
- "logits/rejected": -2.5350534915924072,
422
- "logps/chosen": -1.0578689575195312,
423
- "logps/rejected": -1.465559482574463,
424
- "loss": 1.1166,
425
- "rewards/accuracies": 0.6812499761581421,
426
- "rewards/chosen": -2.1157379150390625,
427
- "rewards/margins": 0.8153812289237976,
428
- "rewards/rejected": -2.931118965148926,
429
  "step": 135
430
  },
431
  {
432
  "epoch": 0.29311698508243916,
433
- "grad_norm": 33.368623664763554,
434
  "learning_rate": 4.453763107901675e-07,
435
- "logits/chosen": -2.5612621307373047,
436
- "logits/rejected": -2.5635030269622803,
437
- "logps/chosen": -1.290777325630188,
438
- "logps/rejected": -1.5123212337493896,
439
- "loss": 1.1713,
440
- "rewards/accuracies": 0.612500011920929,
441
- "rewards/chosen": -2.581554651260376,
442
- "rewards/margins": 0.44308751821517944,
443
- "rewards/rejected": -3.0246424674987793,
444
  "step": 140
445
  },
446
  {
447
  "epoch": 0.3035854488353834,
448
- "grad_norm": 28.378170027761122,
449
  "learning_rate": 4.395355737667985e-07,
450
- "logits/chosen": -2.5412211418151855,
451
- "logits/rejected": -2.52405047416687,
452
- "logps/chosen": -1.1427204608917236,
453
- "logps/rejected": -1.5294116735458374,
454
- "loss": 1.0713,
455
  "rewards/accuracies": 0.675000011920929,
456
- "rewards/chosen": -2.2854409217834473,
457
- "rewards/margins": 0.7733823657035828,
458
- "rewards/rejected": -3.058823347091675,
459
  "step": 145
460
  },
461
  {
462
  "epoch": 0.31405391258832765,
463
- "grad_norm": 46.6740015037233,
464
  "learning_rate": 4.3344075855595097e-07,
465
- "logits/chosen": -2.585468292236328,
466
- "logits/rejected": -2.542560338973999,
467
- "logps/chosen": -1.1231515407562256,
468
- "logps/rejected": -1.4225043058395386,
469
- "loss": 1.0963,
470
- "rewards/accuracies": 0.5874999761581421,
471
- "rewards/chosen": -2.246303081512451,
472
- "rewards/margins": 0.5987052321434021,
473
- "rewards/rejected": -2.845008611679077,
474
  "step": 150
475
  },
476
  {
477
  "epoch": 0.3245223763412719,
478
- "grad_norm": 56.50289037671195,
479
  "learning_rate": 4.271000354423425e-07,
480
- "logits/chosen": -2.5081095695495605,
481
- "logits/rejected": -2.486605167388916,
482
- "logps/chosen": -1.1743667125701904,
483
- "logps/rejected": -1.6648305654525757,
484
- "loss": 1.0424,
485
  "rewards/accuracies": 0.699999988079071,
486
- "rewards/chosen": -2.348733425140381,
487
- "rewards/margins": 0.9809279441833496,
488
- "rewards/rejected": -3.3296611309051514,
489
  "step": 155
490
  },
491
  {
492
  "epoch": 0.33499084009421615,
493
- "grad_norm": 28.276686642324012,
494
  "learning_rate": 4.2052190435769554e-07,
495
- "logits/chosen": -2.4815526008605957,
496
- "logits/rejected": -2.4718894958496094,
497
- "logps/chosen": -1.218774676322937,
498
- "logps/rejected": -1.7314999103546143,
499
- "loss": 1.0686,
500
- "rewards/accuracies": 0.699999988079071,
501
- "rewards/chosen": -2.437549352645874,
502
- "rewards/margins": 1.0254504680633545,
503
- "rewards/rejected": -3.4629998207092285,
504
  "step": 160
505
  },
506
  {
507
  "epoch": 0.34545930384716045,
508
- "grad_norm": 30.685468209669434,
509
  "learning_rate": 4.137151834863213e-07,
510
- "logits/chosen": -2.5408642292022705,
511
- "logits/rejected": -2.5061981678009033,
512
- "logps/chosen": -1.1855158805847168,
513
- "logps/rejected": -1.5292389392852783,
514
- "loss": 1.0436,
515
- "rewards/accuracies": 0.6812499761581421,
516
- "rewards/chosen": -2.3710317611694336,
517
- "rewards/margins": 0.6874457001686096,
518
- "rewards/rejected": -3.0584778785705566,
519
  "step": 165
520
  },
521
  {
522
  "epoch": 0.3559277676001047,
523
- "grad_norm": 40.531749208814226,
524
  "learning_rate": 4.0668899744407567e-07,
525
- "logits/chosen": -2.533921003341675,
526
- "logits/rejected": -2.497112989425659,
527
- "logps/chosen": -1.1383014917373657,
528
- "logps/rejected": -1.505181074142456,
529
- "loss": 1.0589,
530
- "rewards/accuracies": 0.7124999761581421,
531
- "rewards/chosen": -2.2766029834747314,
532
- "rewards/margins": 0.7337592244148254,
533
- "rewards/rejected": -3.010362148284912,
534
  "step": 170
535
  },
536
  {
537
  "epoch": 0.36639623135304894,
538
- "grad_norm": 23.332727041453218,
539
  "learning_rate": 3.994527650465352e-07,
540
- "logits/chosen": -2.4560179710388184,
541
- "logits/rejected": -2.4505882263183594,
542
- "logps/chosen": -1.1397249698638916,
543
- "logps/rejected": -1.5572229623794556,
544
- "loss": 1.0495,
545
- "rewards/accuracies": 0.7250000238418579,
546
- "rewards/chosen": -2.279449939727783,
547
- "rewards/margins": 0.834996223449707,
548
- "rewards/rejected": -3.114445924758911,
549
  "step": 175
550
  },
551
  {
552
  "epoch": 0.3768646951059932,
553
- "grad_norm": 40.28868470678003,
554
  "learning_rate": 3.920161866827889e-07,
555
- "logits/chosen": -2.4761745929718018,
556
- "logits/rejected": -2.46667218208313,
557
- "logps/chosen": -1.2438228130340576,
558
- "logps/rejected": -1.7211487293243408,
559
- "loss": 1.0691,
560
- "rewards/accuracies": 0.71875,
561
- "rewards/chosen": -2.4876456260681152,
562
- "rewards/margins": 0.9546514749526978,
563
- "rewards/rejected": -3.4422974586486816,
564
  "step": 180
565
  },
566
  {
567
  "epoch": 0.38733315885893743,
568
- "grad_norm": 28.601855368944907,
569
  "learning_rate": 3.8438923131177237e-07,
570
- "logits/chosen": -2.4253547191619873,
571
- "logits/rejected": -2.4443154335021973,
572
- "logps/chosen": -1.2864909172058105,
573
- "logps/rejected": -1.7701524496078491,
574
- "loss": 0.9977,
575
- "rewards/accuracies": 0.6875,
576
- "rewards/chosen": -2.572981834411621,
577
- "rewards/margins": 0.9673231840133667,
578
- "rewards/rejected": -3.5403048992156982,
579
  "step": 185
580
  },
581
  {
582
  "epoch": 0.39780162261188173,
583
- "grad_norm": 24.252753054651354,
584
  "learning_rate": 3.765821230985757e-07,
585
- "logits/chosen": -2.431131362915039,
586
- "logits/rejected": -2.4391722679138184,
587
- "logps/chosen": -1.2784379720687866,
588
- "logps/rejected": -1.7420759201049805,
589
- "loss": 1.0192,
590
- "rewards/accuracies": 0.6937500238418579,
591
- "rewards/chosen": -2.5568759441375732,
592
- "rewards/margins": 0.9272757768630981,
593
- "rewards/rejected": -3.484151840209961,
594
  "step": 190
595
  },
596
  {
597
  "epoch": 0.408270086364826,
598
- "grad_norm": 38.5583254942953,
599
  "learning_rate": 3.6860532770864005e-07,
600
- "logits/chosen": -2.446394681930542,
601
- "logits/rejected": -2.4406747817993164,
602
- "logps/chosen": -1.3122910261154175,
603
- "logps/rejected": -1.8817335367202759,
604
- "loss": 1.0854,
605
- "rewards/accuracies": 0.731249988079071,
606
- "rewards/chosen": -2.624582052230835,
607
- "rewards/margins": 1.1388850212097168,
608
- "rewards/rejected": -3.7634670734405518,
609
  "step": 195
610
  },
611
  {
612
  "epoch": 0.4187385501177702,
613
- "grad_norm": 33.60032889557618,
614
  "learning_rate": 3.604695382782159e-07,
615
- "logits/chosen": -2.3602447509765625,
616
- "logits/rejected": -2.3452978134155273,
617
- "logps/chosen": -1.418891191482544,
618
- "logps/rejected": -1.8115421533584595,
619
- "loss": 1.0362,
620
- "rewards/accuracies": 0.625,
621
- "rewards/chosen": -2.837782382965088,
622
- "rewards/margins": 0.7853015661239624,
623
- "rewards/rejected": -3.623084306716919,
624
  "step": 200
625
  },
626
  {
627
  "epoch": 0.42920701387071447,
628
- "grad_norm": 41.77530109838899,
629
  "learning_rate": 3.5218566107988867e-07,
630
- "logits/chosen": -2.440084934234619,
631
- "logits/rejected": -2.434321641921997,
632
- "logps/chosen": -1.4680618047714233,
633
- "logps/rejected": -2.0431008338928223,
634
- "loss": 1.0237,
635
- "rewards/accuracies": 0.6875,
636
- "rewards/chosen": -2.9361236095428467,
637
- "rewards/margins": 1.1500780582427979,
638
- "rewards/rejected": -4.0862016677856445,
639
  "step": 205
640
  },
641
  {
642
  "epoch": 0.4396754776236587,
643
- "grad_norm": 38.42881832336701,
644
  "learning_rate": 3.4376480090239047e-07,
645
- "logits/chosen": -2.3035776615142822,
646
- "logits/rejected": -2.3068020343780518,
647
- "logps/chosen": -1.538132667541504,
648
- "logps/rejected": -1.9348046779632568,
649
- "loss": 1.0792,
650
  "rewards/accuracies": 0.699999988079071,
651
- "rewards/chosen": -3.076265335083008,
652
- "rewards/margins": 0.793343722820282,
653
- "rewards/rejected": -3.8696093559265137,
654
  "step": 210
655
  },
656
  {
657
  "epoch": 0.45014394137660296,
658
- "grad_norm": 37.864128714760994,
659
  "learning_rate": 3.3521824616429284e-07,
660
- "logits/chosen": -2.3211138248443604,
661
- "logits/rejected": -2.3472557067871094,
662
- "logps/chosen": -1.592795968055725,
663
- "logps/rejected": -2.076181411743164,
664
- "loss": 1.0398,
665
- "rewards/accuracies": 0.737500011920929,
666
- "rewards/chosen": -3.18559193611145,
667
- "rewards/margins": 0.9667709469795227,
668
- "rewards/rejected": -4.152362823486328,
669
  "step": 215
670
  },
671
  {
672
  "epoch": 0.46061240512954726,
673
- "grad_norm": 49.02469658888052,
674
  "learning_rate": 3.265574537815398e-07,
675
- "logits/chosen": -2.3193514347076416,
676
- "logits/rejected": -2.3292932510375977,
677
- "logps/chosen": -1.618334174156189,
678
- "logps/rejected": -2.132713794708252,
679
- "loss": 0.9556,
680
- "rewards/accuracies": 0.731249988079071,
681
- "rewards/chosen": -3.236668348312378,
682
- "rewards/margins": 1.028759241104126,
683
- "rewards/rejected": -4.265427589416504,
684
  "step": 220
685
  },
686
  {
687
  "epoch": 0.4710808688824915,
688
- "grad_norm": 55.910411237501854,
689
  "learning_rate": 3.1779403380910425e-07,
690
- "logits/chosen": -2.300808906555176,
691
- "logits/rejected": -2.271570920944214,
692
- "logps/chosen": -1.7557439804077148,
693
- "logps/rejected": -2.248976230621338,
694
- "loss": 1.0217,
695
- "rewards/accuracies": 0.706250011920929,
696
- "rewards/chosen": -3.5114879608154297,
697
- "rewards/margins": 0.9864643216133118,
698
- "rewards/rejected": -4.497952461242676,
699
  "step": 225
700
  },
701
  {
702
  "epoch": 0.48154933263543576,
703
- "grad_norm": 42.0888069104163,
704
  "learning_rate": 3.0893973387735683e-07,
705
- "logits/chosen": -2.294128894805908,
706
- "logits/rejected": -2.309138774871826,
707
- "logps/chosen": -1.751056432723999,
708
- "logps/rejected": -2.3280422687530518,
709
- "loss": 0.9723,
710
- "rewards/accuracies": 0.699999988079071,
711
- "rewards/chosen": -3.502112865447998,
712
- "rewards/margins": 1.1539720296859741,
713
- "rewards/rejected": -4.6560845375061035,
714
  "step": 230
715
  },
716
  {
717
  "epoch": 0.49201779638838,
718
- "grad_norm": 35.0417945479427,
719
  "learning_rate": 3.000064234440111e-07,
720
- "logits/chosen": -2.2189602851867676,
721
- "logits/rejected": -2.2302825450897217,
722
- "logps/chosen": -1.7158355712890625,
723
- "logps/rejected": -2.1802239418029785,
724
- "loss": 0.9931,
725
  "rewards/accuracies": 0.706250011920929,
726
- "rewards/chosen": -3.431671142578125,
727
- "rewards/margins": 0.9287766218185425,
728
- "rewards/rejected": -4.360447883605957,
729
  "step": 235
730
  },
731
  {
732
  "epoch": 0.5024862601413242,
733
- "grad_norm": 36.090443312871884,
734
  "learning_rate": 2.910060778827554e-07,
735
- "logits/chosen": -2.2169604301452637,
736
- "logits/rejected": -2.2240233421325684,
737
- "logps/chosen": -1.7415260076522827,
738
- "logps/rejected": -2.3678994178771973,
739
- "loss": 1.0067,
740
  "rewards/accuracies": 0.71875,
741
- "rewards/chosen": -3.4830520153045654,
742
- "rewards/margins": 1.25274658203125,
743
- "rewards/rejected": -4.7357988357543945,
744
  "step": 240
745
  },
746
  {
747
  "epoch": 0.5129547238942685,
748
- "grad_norm": 44.496210416378226,
749
  "learning_rate": 2.8195076242990116e-07,
750
- "logits/chosen": -2.2511677742004395,
751
- "logits/rejected": -2.272930860519409,
752
- "logps/chosen": -1.718396782875061,
753
- "logps/rejected": -2.421145439147949,
754
- "loss": 0.9378,
755
- "rewards/accuracies": 0.7437499761581421,
756
- "rewards/chosen": -3.436793565750122,
757
- "rewards/margins": 1.4054975509643555,
758
- "rewards/rejected": -4.842290878295898,
759
  "step": 245
760
  },
761
  {
762
  "epoch": 0.5234231876472127,
763
- "grad_norm": 37.890024882995576,
764
  "learning_rate": 2.7285261601056697e-07,
765
- "logits/chosen": -2.1709089279174805,
766
- "logits/rejected": -2.190825939178467,
767
- "logps/chosen": -1.9037107229232788,
768
- "logps/rejected": -2.5919413566589355,
769
- "loss": 1.059,
770
- "rewards/accuracies": 0.7562500238418579,
771
- "rewards/chosen": -3.8074214458465576,
772
- "rewards/margins": 1.3764612674713135,
773
- "rewards/rejected": -5.183882713317871,
774
  "step": 250
775
  },
776
  {
777
  "epoch": 0.533891651400157,
778
- "grad_norm": 43.16172159761982,
779
  "learning_rate": 2.6372383496608186e-07,
780
- "logits/chosen": -2.2374463081359863,
781
- "logits/rejected": -2.2212741374969482,
782
- "logps/chosen": -1.9738657474517822,
783
- "logps/rejected": -2.5382752418518066,
784
- "loss": 1.0355,
785
- "rewards/accuracies": 0.643750011920929,
786
- "rewards/chosen": -3.9477314949035645,
787
- "rewards/margins": 1.128818392753601,
788
- "rewards/rejected": -5.076550483703613,
789
  "step": 255
790
  },
791
  {
792
  "epoch": 0.5443601151531012,
793
- "grad_norm": 53.257572754578675,
794
  "learning_rate": 2.5457665670441937e-07,
795
- "logits/chosen": -2.1852753162384033,
796
- "logits/rejected": -2.1949639320373535,
797
- "logps/chosen": -1.8928565979003906,
798
- "logps/rejected": -2.5031745433807373,
799
- "loss": 0.9936,
800
- "rewards/accuracies": 0.737500011920929,
801
- "rewards/chosen": -3.7857131958007812,
802
- "rewards/margins": 1.2206356525421143,
803
- "rewards/rejected": -5.006349086761475,
804
  "step": 260
805
  },
806
  {
807
  "epoch": 0.5548285789060455,
808
- "grad_norm": 50.49637822416631,
809
  "learning_rate": 2.454233432955807e-07,
810
- "logits/chosen": -2.1886794567108154,
811
- "logits/rejected": -2.1979708671569824,
812
- "logps/chosen": -1.947239637374878,
813
- "logps/rejected": -2.452030658721924,
814
- "loss": 0.9884,
815
- "rewards/accuracies": 0.731249988079071,
816
- "rewards/chosen": -3.894479274749756,
817
- "rewards/margins": 1.0095816850662231,
818
- "rewards/rejected": -4.904061317443848,
819
  "step": 265
820
  },
821
  {
822
  "epoch": 0.5652970426589898,
823
- "grad_norm": 48.68640959435197,
824
  "learning_rate": 2.3627616503391812e-07,
825
- "logits/chosen": -2.229895830154419,
826
- "logits/rejected": -2.239255428314209,
827
- "logps/chosen": -2.0069568157196045,
828
- "logps/rejected": -2.4461960792541504,
829
- "loss": 0.9656,
830
- "rewards/accuracies": 0.6875,
831
- "rewards/chosen": -4.013913631439209,
832
- "rewards/margins": 0.8784781694412231,
833
- "rewards/rejected": -4.892392158508301,
834
  "step": 270
835
  },
836
  {
837
  "epoch": 0.575765506411934,
838
- "grad_norm": 30.269087239551055,
839
  "learning_rate": 2.2714738398943308e-07,
840
- "logits/chosen": -2.2119524478912354,
841
- "logits/rejected": -2.2117958068847656,
842
- "logps/chosen": -1.8545528650283813,
843
- "logps/rejected": -2.6458373069763184,
844
- "loss": 0.9144,
845
- "rewards/accuracies": 0.71875,
846
- "rewards/chosen": -3.7091057300567627,
847
- "rewards/margins": 1.5825694799423218,
848
- "rewards/rejected": -5.291674613952637,
849
  "step": 275
850
  },
851
  {
852
  "epoch": 0.5862339701648783,
853
- "grad_norm": 37.24400806922662,
854
  "learning_rate": 2.1804923757009882e-07,
855
- "logits/chosen": -2.2228572368621826,
856
- "logits/rejected": -2.2220091819763184,
857
- "logps/chosen": -1.8731377124786377,
858
- "logps/rejected": -2.5260300636291504,
859
- "loss": 1.014,
860
- "rewards/accuracies": 0.7250000238418579,
861
- "rewards/chosen": -3.7462754249572754,
862
- "rewards/margins": 1.3057842254638672,
863
- "rewards/rejected": -5.052060127258301,
864
  "step": 280
865
  },
866
  {
867
  "epoch": 0.5967024339178225,
868
- "grad_norm": 43.31209376980203,
869
  "learning_rate": 2.089939221172446e-07,
870
- "logits/chosen": -2.1984705924987793,
871
- "logits/rejected": -2.1894819736480713,
872
- "logps/chosen": -1.953238844871521,
873
- "logps/rejected": -2.5444319248199463,
874
- "loss": 1.0048,
875
- "rewards/accuracies": 0.71875,
876
- "rewards/chosen": -3.906477689743042,
877
- "rewards/margins": 1.1823861598968506,
878
- "rewards/rejected": -5.088863849639893,
879
  "step": 285
880
  },
881
  {
882
  "epoch": 0.6071708976707668,
883
- "grad_norm": 46.642401554710816,
884
  "learning_rate": 1.9999357655598891e-07,
885
- "logits/chosen": -2.1803629398345947,
886
- "logits/rejected": -2.1622722148895264,
887
- "logps/chosen": -1.891466498374939,
888
- "logps/rejected": -2.350858688354492,
889
- "loss": 0.9927,
890
- "rewards/accuracies": 0.6937500238418579,
891
- "rewards/chosen": -3.782932996749878,
892
- "rewards/margins": 0.9187847971916199,
893
- "rewards/rejected": -4.701717376708984,
894
  "step": 290
895
  },
896
  {
897
  "epoch": 0.6176393614237111,
898
- "grad_norm": 52.58617249560728,
899
  "learning_rate": 1.9106026612264315e-07,
900
- "logits/chosen": -2.2187323570251465,
901
- "logits/rejected": -2.205200672149658,
902
- "logps/chosen": -1.8883600234985352,
903
- "logps/rejected": -2.3388593196868896,
904
- "loss": 0.9493,
905
- "rewards/accuracies": 0.6875,
906
- "rewards/chosen": -3.7767200469970703,
907
- "rewards/margins": 0.9009987115859985,
908
- "rewards/rejected": -4.677718639373779,
909
  "step": 295
910
  },
911
  {
912
  "epoch": 0.6281078251766553,
913
- "grad_norm": 52.91294946354268,
914
  "learning_rate": 1.8220596619089573e-07,
915
- "logits/chosen": -2.092263698577881,
916
- "logits/rejected": -2.0721356868743896,
917
- "logps/chosen": -2.0280632972717285,
918
- "logps/rejected": -2.621304988861084,
919
- "loss": 1.0111,
920
- "rewards/accuracies": 0.731249988079071,
921
- "rewards/chosen": -4.056126594543457,
922
- "rewards/margins": 1.1864832639694214,
923
- "rewards/rejected": -5.242609977722168,
924
  "step": 300
925
  },
926
  {
927
  "epoch": 0.6385762889295996,
928
- "grad_norm": 46.52800472294266,
929
  "learning_rate": 1.7344254621846017e-07,
930
- "logits/chosen": -2.181384563446045,
931
- "logits/rejected": -2.1939797401428223,
932
- "logps/chosen": -1.9072606563568115,
933
- "logps/rejected": -2.3984622955322266,
934
- "loss": 0.9429,
935
- "rewards/accuracies": 0.706250011920929,
936
- "rewards/chosen": -3.814521312713623,
937
- "rewards/margins": 0.9824029803276062,
938
- "rewards/rejected": -4.796924591064453,
939
  "step": 305
940
  },
941
  {
942
  "epoch": 0.6490447526825438,
943
- "grad_norm": 46.97995595685562,
944
  "learning_rate": 1.647817538357072e-07,
945
- "logits/chosen": -2.121640682220459,
946
- "logits/rejected": -2.135894298553467,
947
- "logps/chosen": -2.1177263259887695,
948
- "logps/rejected": -2.7280328273773193,
949
- "loss": 0.9275,
950
- "rewards/accuracies": 0.731249988079071,
951
- "rewards/chosen": -4.235452651977539,
952
- "rewards/margins": 1.2206127643585205,
953
- "rewards/rejected": -5.456065654754639,
954
  "step": 310
955
  },
956
  {
957
  "epoch": 0.6595132164354881,
958
- "grad_norm": 45.80872073055723,
959
  "learning_rate": 1.562351990976095e-07,
960
- "logits/chosen": -2.1575684547424316,
961
- "logits/rejected": -2.1615099906921387,
962
- "logps/chosen": -2.148725986480713,
963
- "logps/rejected": -2.741818904876709,
964
- "loss": 0.9761,
965
  "rewards/accuracies": 0.6937500238418579,
966
- "rewards/chosen": -4.297451972961426,
967
- "rewards/margins": 1.1861859560012817,
968
- "rewards/rejected": -5.483637809753418,
969
  "step": 315
970
  },
971
  {
972
  "epoch": 0.6699816801884323,
973
- "grad_norm": 47.83368873073484,
974
  "learning_rate": 1.478143389201113e-07,
975
- "logits/chosen": -2.1588540077209473,
976
- "logits/rejected": -2.154210329055786,
977
- "logps/chosen": -2.074022054672241,
978
- "logps/rejected": -2.7597062587738037,
979
- "loss": 0.9235,
980
- "rewards/accuracies": 0.7250000238418579,
981
- "rewards/chosen": -4.148044109344482,
982
- "rewards/margins": 1.3713690042495728,
983
- "rewards/rejected": -5.519412517547607,
984
  "step": 320
985
  },
986
  {
987
  "epoch": 0.6804501439413766,
988
- "grad_norm": 55.31825641332028,
989
  "learning_rate": 1.3953046172178413e-07,
990
- "logits/chosen": -2.119729518890381,
991
- "logits/rejected": -2.1389968395233154,
992
- "logps/chosen": -2.3134312629699707,
993
- "logps/rejected": -2.9678821563720703,
994
- "loss": 1.0012,
995
- "rewards/accuracies": 0.7124999761581421,
996
- "rewards/chosen": -4.626862525939941,
997
- "rewards/margins": 1.3089015483856201,
998
- "rewards/rejected": -5.935764312744141,
999
  "step": 325
1000
  },
1001
  {
1002
  "epoch": 0.6909186076943209,
1003
- "grad_norm": 50.371210241190134,
1004
  "learning_rate": 1.3139467229135998e-07,
1005
- "logits/chosen": -2.1119532585144043,
1006
- "logits/rejected": -2.120948314666748,
1007
- "logps/chosen": -2.2528138160705566,
1008
- "logps/rejected": -3.02119779586792,
1009
- "loss": 0.9473,
1010
  "rewards/accuracies": 0.7437499761581421,
1011
- "rewards/chosen": -4.505627632141113,
1012
- "rewards/margins": 1.536767601966858,
1013
- "rewards/rejected": -6.04239559173584,
1014
  "step": 330
1015
  },
1016
  {
1017
  "epoch": 0.7013870714472651,
1018
- "grad_norm": 64.75546200262123,
1019
  "learning_rate": 1.2341787690142435e-07,
1020
- "logits/chosen": -2.0687994956970215,
1021
- "logits/rejected": -2.0951106548309326,
1022
- "logps/chosen": -2.3926031589508057,
1023
- "logps/rejected": -3.0681991577148438,
1024
- "loss": 0.973,
1025
- "rewards/accuracies": 0.7250000238418579,
1026
- "rewards/chosen": -4.785206317901611,
1027
- "rewards/margins": 1.3511921167373657,
1028
- "rewards/rejected": -6.1363983154296875,
1029
  "step": 335
1030
  },
1031
  {
1032
  "epoch": 0.7118555352002094,
1033
- "grad_norm": 62.37625935660813,
1034
  "learning_rate": 1.1561076868822755e-07,
1035
- "logits/chosen": -2.0640933513641357,
1036
- "logits/rejected": -2.1258063316345215,
1037
- "logps/chosen": -2.3601365089416504,
1038
- "logps/rejected": -3.2155094146728516,
1039
- "loss": 0.9726,
1040
- "rewards/accuracies": 0.7562500238418579,
1041
- "rewards/chosen": -4.720273017883301,
1042
- "rewards/margins": 1.710745096206665,
1043
- "rewards/rejected": -6.431018829345703,
1044
  "step": 340
1045
  },
1046
  {
1047
  "epoch": 0.7223239989531536,
1048
- "grad_norm": 42.42351768699483,
1049
  "learning_rate": 1.0798381331721107e-07,
1050
- "logits/chosen": -2.1099228858947754,
1051
- "logits/rejected": -2.1320974826812744,
1052
- "logps/chosen": -2.260925769805908,
1053
- "logps/rejected": -2.945192575454712,
1054
- "loss": 0.9736,
1055
- "rewards/accuracies": 0.7562500238418579,
1056
- "rewards/chosen": -4.521851539611816,
1057
- "rewards/margins": 1.3685338497161865,
1058
- "rewards/rejected": -5.890385150909424,
1059
  "step": 345
1060
  },
1061
  {
1062
  "epoch": 0.7327924627060979,
1063
- "grad_norm": 43.99023051401102,
1064
  "learning_rate": 1.0054723495346482e-07,
1065
- "logits/chosen": -2.1317076683044434,
1066
- "logits/rejected": -2.1474900245666504,
1067
- "logps/chosen": -2.075162887573242,
1068
- "logps/rejected": -2.634793281555176,
1069
- "loss": 0.9165,
1070
- "rewards/accuracies": 0.706250011920929,
1071
- "rewards/chosen": -4.150325775146484,
1072
- "rewards/margins": 1.1192606687545776,
1073
- "rewards/rejected": -5.269586563110352,
1074
  "step": 350
1075
  },
1076
  {
1077
  "epoch": 0.7432609264590422,
1078
- "grad_norm": 41.72182947472904,
1079
  "learning_rate": 9.331100255592436e-08,
1080
- "logits/chosen": -2.1080145835876465,
1081
- "logits/rejected": -2.1329493522644043,
1082
- "logps/chosen": -1.9976682662963867,
1083
- "logps/rejected": -2.5884833335876465,
1084
- "loss": 1.0031,
1085
- "rewards/accuracies": 0.71875,
1086
- "rewards/chosen": -3.9953365325927734,
1087
- "rewards/margins": 1.1816307306289673,
1088
- "rewards/rejected": -5.176966667175293,
1089
  "step": 355
1090
  },
1091
  {
1092
  "epoch": 0.7537293902119864,
1093
- "grad_norm": 44.34047823991735,
1094
  "learning_rate": 8.628481651367875e-08,
1095
- "logits/chosen": -2.111832857131958,
1096
- "logits/rejected": -2.128884792327881,
1097
- "logps/chosen": -1.9469451904296875,
1098
- "logps/rejected": -2.5760998725891113,
1099
- "loss": 0.9578,
1100
- "rewards/accuracies": 0.75,
1101
- "rewards/chosen": -3.893890380859375,
1102
- "rewards/margins": 1.2583085298538208,
1103
- "rewards/rejected": -5.152199745178223,
1104
  "step": 360
1105
  },
1106
  {
1107
  "epoch": 0.7641978539649307,
1108
- "grad_norm": 44.24679829957238,
1109
  "learning_rate": 7.947809564230445e-08,
1110
- "logits/chosen": -2.130434513092041,
1111
- "logits/rejected": -2.136301040649414,
1112
- "logps/chosen": -2.001988172531128,
1113
- "logps/rejected": -2.7044124603271484,
1114
- "loss": 0.9305,
1115
- "rewards/accuracies": 0.7124999761581421,
1116
- "rewards/chosen": -4.003976345062256,
1117
- "rewards/margins": 1.404848337173462,
1118
- "rewards/rejected": -5.408824920654297,
1119
  "step": 365
1120
  },
1121
  {
1122
  "epoch": 0.7746663177178749,
1123
- "grad_norm": 41.688418517292405,
1124
  "learning_rate": 7.289996455765748e-08,
1125
- "logits/chosen": -2.106525421142578,
1126
- "logits/rejected": -2.1191978454589844,
1127
- "logps/chosen": -1.9732780456542969,
1128
- "logps/rejected": -2.7075438499450684,
1129
- "loss": 0.8596,
1130
- "rewards/accuracies": 0.793749988079071,
1131
- "rewards/chosen": -3.9465560913085938,
1132
- "rewards/margins": 1.4685311317443848,
1133
- "rewards/rejected": -5.415087699890137,
1134
  "step": 370
1135
  },
1136
  {
1137
  "epoch": 0.7851347814708192,
1138
- "grad_norm": 49.6620038718469,
1139
  "learning_rate": 6.655924144404906e-08,
1140
- "logits/chosen": -2.0811514854431152,
1141
- "logits/rejected": -2.098820924758911,
1142
- "logps/chosen": -2.00937819480896,
1143
- "logps/rejected": -2.715106248855591,
1144
- "loss": 0.9508,
1145
- "rewards/accuracies": 0.7124999761581421,
1146
- "rewards/chosen": -4.01875638961792,
1147
- "rewards/margins": 1.4114553928375244,
1148
- "rewards/rejected": -5.430212497711182,
1149
  "step": 375
1150
  },
1151
  {
1152
  "epoch": 0.7956032452237635,
1153
- "grad_norm": 49.439098964705394,
1154
  "learning_rate": 6.046442623320145e-08,
1155
- "logits/chosen": -2.1282591819763184,
1156
- "logits/rejected": -2.172515869140625,
1157
- "logps/chosen": -2.048567056655884,
1158
- "logps/rejected": -2.6656880378723145,
1159
- "loss": 0.947,
1160
- "rewards/accuracies": 0.7250000238418579,
1161
- "rewards/chosen": -4.097134113311768,
1162
- "rewards/margins": 1.2342422008514404,
1163
- "rewards/rejected": -5.331376075744629,
1164
  "step": 380
1165
  },
1166
  {
1167
  "epoch": 0.8060717089767077,
1168
- "grad_norm": 42.61496853359177,
1169
  "learning_rate": 5.4623689209832484e-08,
1170
- "logits/chosen": -2.0846481323242188,
1171
- "logits/rejected": -2.10569429397583,
1172
- "logps/chosen": -2.0798821449279785,
1173
- "logps/rejected": -2.7054901123046875,
1174
- "loss": 0.9314,
1175
  "rewards/accuracies": 0.7562500238418579,
1176
- "rewards/chosen": -4.159764289855957,
1177
- "rewards/margins": 1.2512160539627075,
1178
- "rewards/rejected": -5.410980224609375,
1179
  "step": 385
1180
  },
1181
  {
1182
  "epoch": 0.816540172729652,
1183
- "grad_norm": 44.10407381547168,
1184
  "learning_rate": 4.904486005914027e-08,
1185
- "logits/chosen": -2.091681957244873,
1186
- "logits/rejected": -2.098536252975464,
1187
- "logps/chosen": -2.2144787311553955,
1188
- "logps/rejected": -2.8681719303131104,
1189
- "loss": 0.9219,
1190
  "rewards/accuracies": 0.731249988079071,
1191
- "rewards/chosen": -4.428957462310791,
1192
- "rewards/margins": 1.3073859214782715,
1193
- "rewards/rejected": -5.736343860626221,
1194
  "step": 390
1195
  },
1196
  {
1197
  "epoch": 0.8270086364825961,
1198
- "grad_norm": 46.05335739253767,
1199
  "learning_rate": 4.373541737087263e-08,
1200
- "logits/chosen": -2.0259995460510254,
1201
- "logits/rejected": -2.0590269565582275,
1202
- "logps/chosen": -2.194620370864868,
1203
- "logps/rejected": -2.9196109771728516,
1204
- "loss": 0.9562,
1205
- "rewards/accuracies": 0.7250000238418579,
1206
- "rewards/chosen": -4.389240741729736,
1207
- "rewards/margins": 1.449981689453125,
1208
- "rewards/rejected": -5.839221954345703,
1209
  "step": 395
1210
  },
1211
  {
1212
  "epoch": 0.8374771002355405,
1213
- "grad_norm": 47.14141799320461,
1214
  "learning_rate": 3.8702478614051345e-08,
1215
- "logits/chosen": -2.0541300773620605,
1216
- "logits/rejected": -2.0651955604553223,
1217
- "logps/chosen": -2.281395435333252,
1218
- "logps/rejected": -2.873800277709961,
1219
- "loss": 0.9739,
1220
- "rewards/accuracies": 0.7250000238418579,
1221
- "rewards/chosen": -4.562790870666504,
1222
- "rewards/margins": 1.1848098039627075,
1223
- "rewards/rejected": -5.747600555419922,
1224
  "step": 400
1225
  },
1226
  {
1227
  "epoch": 0.8374771002355405,
1228
- "eval_logits/chosen": -2.103816270828247,
1229
- "eval_logits/rejected": -2.1166677474975586,
1230
- "eval_logps/chosen": -2.3097894191741943,
1231
- "eval_logps/rejected": -3.02291202545166,
1232
- "eval_loss": 0.9205789566040039,
1233
  "eval_rewards/accuracies": 0.7319999933242798,
1234
- "eval_rewards/chosen": -4.619578838348389,
1235
- "eval_rewards/margins": 1.4262455701828003,
1236
- "eval_rewards/rejected": -6.04582405090332,
1237
- "eval_runtime": 171.4733,
1238
- "eval_samples_per_second": 11.664,
1239
  "eval_steps_per_second": 0.729,
1240
  "step": 400
1241
  },
1242
  {
1243
  "epoch": 0.8479455639884846,
1244
- "grad_norm": 47.00773953228135,
1245
  "learning_rate": 3.3952790595787986e-08,
1246
- "logits/chosen": -2.060488224029541,
1247
- "logits/rejected": -2.077713966369629,
1248
- "logps/chosen": -2.361910104751587,
1249
- "logps/rejected": -2.9994819164276123,
1250
- "loss": 0.9015,
1251
- "rewards/accuracies": 0.7124999761581421,
1252
- "rewards/chosen": -4.723820209503174,
1253
- "rewards/margins": 1.2751436233520508,
1254
- "rewards/rejected": -5.998963832855225,
1255
  "step": 405
1256
  },
1257
  {
1258
  "epoch": 0.8584140277414289,
1259
- "grad_norm": 41.35340944536794,
1260
  "learning_rate": 2.9492720416985e-08,
1261
- "logits/chosen": -2.0293195247650146,
1262
- "logits/rejected": -2.0557637214660645,
1263
- "logps/chosen": -2.3440773487091064,
1264
- "logps/rejected": -2.9938693046569824,
1265
- "loss": 0.9306,
1266
- "rewards/accuracies": 0.675000011920929,
1267
- "rewards/chosen": -4.688154697418213,
1268
- "rewards/margins": 1.299583911895752,
1269
- "rewards/rejected": -5.987738609313965,
1270
  "step": 410
1271
  },
1272
  {
1273
  "epoch": 0.8688824914943732,
1274
- "grad_norm": 69.32805874095317,
1275
  "learning_rate": 2.5328246937043525e-08,
1276
- "logits/chosen": -2.061347484588623,
1277
- "logits/rejected": -2.070070505142212,
1278
- "logps/chosen": -2.4142396450042725,
1279
- "logps/rejected": -2.9347751140594482,
1280
- "loss": 0.9747,
1281
- "rewards/accuracies": 0.668749988079071,
1282
- "rewards/chosen": -4.828479290008545,
1283
- "rewards/margins": 1.0410706996917725,
1284
- "rewards/rejected": -5.8695502281188965,
1285
  "step": 415
1286
  },
1287
  {
1288
  "epoch": 0.8793509552473174,
1289
- "grad_norm": 51.664685591057165,
1290
  "learning_rate": 2.1464952759020856e-08,
1291
- "logits/chosen": -2.0183963775634766,
1292
- "logits/rejected": -2.0151352882385254,
1293
- "logps/chosen": -2.2587149143218994,
1294
- "logps/rejected": -3.042794704437256,
1295
- "loss": 0.8968,
1296
  "rewards/accuracies": 0.699999988079071,
1297
- "rewards/chosen": -4.517429828643799,
1298
- "rewards/margins": 1.568159818649292,
1299
- "rewards/rejected": -6.085589408874512,
1300
  "step": 420
1301
  },
1302
  {
1303
  "epoch": 0.8898194190002617,
1304
- "grad_norm": 56.49783544348942,
1305
  "learning_rate": 1.7908016745981856e-08,
1306
- "logits/chosen": -2.0497357845306396,
1307
- "logits/rejected": -2.0780141353607178,
1308
- "logps/chosen": -2.142561435699463,
1309
- "logps/rejected": -2.9286532402038574,
1310
- "loss": 0.8707,
1311
- "rewards/accuracies": 0.768750011920929,
1312
- "rewards/chosen": -4.285122871398926,
1313
- "rewards/margins": 1.5721828937530518,
1314
- "rewards/rejected": -5.857306480407715,
1315
  "step": 425
1316
  },
1317
  {
1318
  "epoch": 0.9002878827532059,
1319
- "grad_norm": 55.57606306413903,
1320
  "learning_rate": 1.4662207078575684e-08,
1321
- "logits/chosen": -2.0301456451416016,
1322
- "logits/rejected": -2.055358409881592,
1323
- "logps/chosen": -2.3221189975738525,
1324
- "logps/rejected": -2.9942822456359863,
1325
- "loss": 0.8793,
1326
- "rewards/accuracies": 0.7250000238418579,
1327
- "rewards/chosen": -4.644237995147705,
1328
- "rewards/margins": 1.344326376914978,
1329
- "rewards/rejected": -5.988564491271973,
1330
  "step": 430
1331
  },
1332
  {
1333
  "epoch": 0.9107563465061502,
1334
- "grad_norm": 48.809574680145,
1335
  "learning_rate": 1.1731874863145142e-08,
1336
- "logits/chosen": -2.028263568878174,
1337
- "logits/rejected": -2.0348639488220215,
1338
- "logps/chosen": -2.202627182006836,
1339
- "logps/rejected": -2.7498812675476074,
1340
- "loss": 0.9088,
1341
  "rewards/accuracies": 0.668749988079071,
1342
- "rewards/chosen": -4.405254364013672,
1343
- "rewards/margins": 1.0945093631744385,
1344
- "rewards/rejected": -5.499762535095215,
1345
  "step": 435
1346
  },
1347
  {
1348
  "epoch": 0.9212248102590945,
1349
- "grad_norm": 58.77712348946075,
1350
  "learning_rate": 9.12094829893642e-09,
1351
- "logits/chosen": -2.078192949295044,
1352
- "logits/rejected": -2.0777792930603027,
1353
- "logps/chosen": -2.2879416942596436,
1354
- "logps/rejected": -2.850325584411621,
1355
- "loss": 0.9329,
1356
- "rewards/accuracies": 0.675000011920929,
1357
- "rewards/chosen": -4.575883388519287,
1358
- "rewards/margins": 1.1247674226760864,
1359
- "rewards/rejected": -5.700651168823242,
1360
  "step": 440
1361
  },
1362
  {
1363
  "epoch": 0.9316932740120387,
1364
- "grad_norm": 53.962863944581244,
1365
  "learning_rate": 6.832927412229017e-09,
1366
- "logits/chosen": -1.9925105571746826,
1367
- "logits/rejected": -2.016268014907837,
1368
- "logps/chosen": -2.286975860595703,
1369
- "logps/rejected": -3.0245378017425537,
1370
- "loss": 0.9372,
1371
  "rewards/accuracies": 0.737500011920929,
1372
- "rewards/chosen": -4.573951721191406,
1373
- "rewards/margins": 1.4751240015029907,
1374
- "rewards/rejected": -6.049075603485107,
1375
  "step": 445
1376
  },
1377
  {
1378
  "epoch": 0.942161737764983,
1379
- "grad_norm": 52.6353894854054,
1380
  "learning_rate": 4.8708793644441086e-09,
1381
- "logits/chosen": -2.0495445728302,
1382
- "logits/rejected": -2.0841915607452393,
1383
- "logps/chosen": -2.1221392154693604,
1384
- "logps/rejected": -2.9009857177734375,
1385
- "loss": 0.8934,
1386
- "rewards/accuracies": 0.78125,
1387
- "rewards/chosen": -4.244278430938721,
1388
- "rewards/margins": 1.5576937198638916,
1389
- "rewards/rejected": -5.801971435546875,
1390
  "step": 450
1391
  },
1392
  {
1393
  "epoch": 0.9526302015179272,
1394
- "grad_norm": 48.71626303564986,
1395
  "learning_rate": 3.2374343405217884e-09,
1396
- "logits/chosen": -2.043102979660034,
1397
- "logits/rejected": -2.0290329456329346,
1398
- "logps/chosen": -2.246981143951416,
1399
- "logps/rejected": -2.93644118309021,
1400
- "loss": 0.9693,
1401
  "rewards/accuracies": 0.706250011920929,
1402
- "rewards/chosen": -4.493962287902832,
1403
- "rewards/margins": 1.3789204359054565,
1404
- "rewards/rejected": -5.87288236618042,
1405
  "step": 455
1406
  },
1407
  {
1408
  "epoch": 0.9630986652708715,
1409
- "grad_norm": 63.760211123086705,
1410
  "learning_rate": 1.9347820230782295e-09,
1411
- "logits/chosen": -2.0251128673553467,
1412
- "logits/rejected": -2.0602123737335205,
1413
- "logps/chosen": -2.2238211631774902,
1414
- "logps/rejected": -3.0641932487487793,
1415
- "loss": 0.9167,
1416
- "rewards/accuracies": 0.793749988079071,
1417
- "rewards/chosen": -4.4476423263549805,
1418
- "rewards/margins": 1.6807438135147095,
1419
- "rewards/rejected": -6.128386497497559,
1420
  "step": 460
1421
  },
1422
  {
1423
  "epoch": 0.9735671290238157,
1424
- "grad_norm": 35.043466848608176,
1425
  "learning_rate": 9.64668657069706e-10,
1426
- "logits/chosen": -2.059887409210205,
1427
- "logits/rejected": -2.0487585067749023,
1428
- "logps/chosen": -2.2237155437469482,
1429
- "logps/rejected": -3.084481716156006,
1430
- "loss": 0.8842,
1431
- "rewards/accuracies": 0.768750011920929,
1432
- "rewards/chosen": -4.4474310874938965,
1433
- "rewards/margins": 1.7215325832366943,
1434
- "rewards/rejected": -6.168963432312012,
1435
  "step": 465
1436
  },
1437
  {
1438
  "epoch": 0.98403559277676,
1439
- "grad_norm": 42.50622828359277,
1440
  "learning_rate": 3.2839470889836627e-10,
1441
- "logits/chosen": -1.998681664466858,
1442
- "logits/rejected": -1.9859771728515625,
1443
- "logps/chosen": -2.061518430709839,
1444
- "logps/rejected": -2.9629697799682617,
1445
- "loss": 0.8959,
1446
- "rewards/accuracies": 0.75,
1447
- "rewards/chosen": -4.123036861419678,
1448
- "rewards/margins": 1.8029022216796875,
1449
- "rewards/rejected": -5.925939559936523,
1450
  "step": 470
1451
  },
1452
  {
1453
  "epoch": 0.9945040565297043,
1454
- "grad_norm": 69.11127660659228,
1455
  "learning_rate": 2.6813123097352287e-11,
1456
- "logits/chosen": -2.068897008895874,
1457
- "logits/rejected": -2.0939974784851074,
1458
- "logps/chosen": -2.2420966625213623,
1459
- "logps/rejected": -2.8523502349853516,
1460
- "loss": 0.8943,
1461
- "rewards/accuracies": 0.7250000238418579,
1462
- "rewards/chosen": -4.484193325042725,
1463
- "rewards/margins": 1.2205069065093994,
1464
- "rewards/rejected": -5.704700469970703,
1465
  "step": 475
1466
  },
1467
  {
1468
  "epoch": 0.998691442030882,
1469
  "step": 477,
1470
  "total_flos": 0.0,
1471
- "train_loss": 1.0352550020257882,
1472
- "train_runtime": 22152.8972,
1473
- "train_samples_per_second": 2.76,
1474
- "train_steps_per_second": 0.022
1475
  }
1476
  ],
1477
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.002093692750588851,
13
+ "grad_norm": 23.597004065952554,
14
  "learning_rate": 1.0416666666666666e-08,
15
  "logits/chosen": -2.8280014991760254,
16
  "logits/rejected": -2.8466408252716064,
17
  "logps/chosen": -1.1081000566482544,
18
  "logps/rejected": -1.146370530128479,
19
+ "loss": 1.825,
20
  "rewards/accuracies": 0.46875,
21
  "rewards/chosen": -2.216200113296509,
22
  "rewards/margins": 0.0765407383441925,
 
25
  },
26
  {
27
  "epoch": 0.010468463752944255,
28
+ "grad_norm": 20.151143431686044,
29
  "learning_rate": 5.208333333333333e-08,
30
+ "logits/chosen": -2.88156795501709,
31
+ "logits/rejected": -2.84816837310791,
32
+ "logps/chosen": -1.0210057497024536,
33
+ "logps/rejected": -1.0806357860565186,
34
+ "loss": 1.7101,
35
  "rewards/accuracies": 0.5390625,
36
+ "rewards/chosen": -2.0420114994049072,
37
+ "rewards/margins": 0.11925993114709854,
38
+ "rewards/rejected": -2.161271572113037,
39
  "step": 5
40
  },
41
  {
42
  "epoch": 0.02093692750588851,
43
+ "grad_norm": 19.420093362135834,
44
  "learning_rate": 1.0416666666666667e-07,
45
+ "logits/chosen": -2.992948055267334,
46
+ "logits/rejected": -2.9420628547668457,
47
+ "logps/chosen": -1.0234142541885376,
48
+ "logps/rejected": -1.165908694267273,
49
+ "loss": 1.7409,
50
  "rewards/accuracies": 0.581250011920929,
51
+ "rewards/chosen": -2.046828508377075,
52
+ "rewards/margins": 0.28498905897140503,
53
+ "rewards/rejected": -2.331817388534546,
54
  "step": 10
55
  },
56
  {
57
  "epoch": 0.031405391258832765,
58
+ "grad_norm": 22.360933443209454,
59
  "learning_rate": 1.5624999999999999e-07,
60
+ "logits/chosen": -2.9190802574157715,
61
+ "logits/rejected": -2.9058239459991455,
62
+ "logps/chosen": -1.0527833700180054,
63
+ "logps/rejected": -1.1032918691635132,
64
+ "loss": 1.7228,
65
  "rewards/accuracies": 0.5375000238418579,
66
+ "rewards/chosen": -2.1055667400360107,
67
+ "rewards/margins": 0.10101678222417831,
68
+ "rewards/rejected": -2.2065837383270264,
69
  "step": 15
70
  },
71
  {
72
  "epoch": 0.04187385501177702,
73
+ "grad_norm": 18.11495891073632,
74
  "learning_rate": 2.0833333333333333e-07,
75
+ "logits/chosen": -2.9407191276550293,
76
+ "logits/rejected": -2.8896334171295166,
77
+ "logps/chosen": -1.0061452388763428,
78
+ "logps/rejected": -1.1030395030975342,
79
+ "loss": 1.6957,
80
+ "rewards/accuracies": 0.550000011920929,
81
+ "rewards/chosen": -2.0122904777526855,
82
+ "rewards/margins": 0.19378867745399475,
83
+ "rewards/rejected": -2.2060790061950684,
84
  "step": 20
85
  },
86
  {
87
  "epoch": 0.05234231876472128,
88
+ "grad_norm": 17.73214868239448,
89
  "learning_rate": 2.604166666666667e-07,
90
+ "logits/chosen": -2.945413112640381,
91
+ "logits/rejected": -2.8812930583953857,
92
+ "logps/chosen": -0.9288013577461243,
93
+ "logps/rejected": -1.1164944171905518,
94
+ "loss": 1.6642,
95
+ "rewards/accuracies": 0.574999988079071,
96
+ "rewards/chosen": -1.8576027154922485,
97
+ "rewards/margins": 0.3753865361213684,
98
+ "rewards/rejected": -2.2329888343811035,
99
  "step": 25
100
  },
101
  {
102
  "epoch": 0.06281078251766553,
103
+ "grad_norm": 17.49667039937059,
104
  "learning_rate": 3.1249999999999997e-07,
105
+ "logits/chosen": -2.920177459716797,
106
+ "logits/rejected": -2.9008939266204834,
107
+ "logps/chosen": -0.9087249636650085,
108
+ "logps/rejected": -0.9417479634284973,
109
+ "loss": 1.6764,
110
  "rewards/accuracies": 0.550000011920929,
111
+ "rewards/chosen": -1.817449927330017,
112
+ "rewards/margins": 0.06604615598917007,
113
+ "rewards/rejected": -1.8834959268569946,
114
  "step": 30
115
  },
116
  {
117
  "epoch": 0.07327924627060979,
118
+ "grad_norm": 18.317659111307954,
119
  "learning_rate": 3.645833333333333e-07,
120
+ "logits/chosen": -2.929032564163208,
121
+ "logits/rejected": -2.8953332901000977,
122
+ "logps/chosen": -0.9193178415298462,
123
+ "logps/rejected": -1.1149028539657593,
124
+ "loss": 1.65,
125
+ "rewards/accuracies": 0.606249988079071,
126
+ "rewards/chosen": -1.8386356830596924,
127
+ "rewards/margins": 0.39116981625556946,
128
+ "rewards/rejected": -2.2298057079315186,
129
  "step": 35
130
  },
131
  {
132
  "epoch": 0.08374771002355404,
133
+ "grad_norm": 17.730458935387485,
134
  "learning_rate": 4.1666666666666667e-07,
135
+ "logits/chosen": -2.9188663959503174,
136
+ "logits/rejected": -2.877103328704834,
137
+ "logps/chosen": -0.9255539178848267,
138
+ "logps/rejected": -1.0478177070617676,
139
+ "loss": 1.6224,
140
+ "rewards/accuracies": 0.581250011920929,
141
+ "rewards/chosen": -1.8511078357696533,
142
+ "rewards/margins": 0.24452760815620422,
143
+ "rewards/rejected": -2.095635414123535,
144
  "step": 40
145
  },
146
  {
147
  "epoch": 0.0942161737764983,
148
+ "grad_norm": 15.937073180460068,
149
  "learning_rate": 4.6874999999999996e-07,
150
+ "logits/chosen": -2.875232696533203,
151
+ "logits/rejected": -2.8806424140930176,
152
+ "logps/chosen": -0.9038581848144531,
153
+ "logps/rejected": -1.0820366144180298,
154
+ "loss": 1.5724,
155
+ "rewards/accuracies": 0.625,
156
+ "rewards/chosen": -1.8077163696289062,
157
+ "rewards/margins": 0.35635682940483093,
158
+ "rewards/rejected": -2.1640732288360596,
159
  "step": 45
160
  },
161
  {
162
  "epoch": 0.10468463752944256,
163
+ "grad_norm": 23.986363177533022,
164
  "learning_rate": 4.999731868769026e-07,
165
+ "logits/chosen": -2.820544719696045,
166
+ "logits/rejected": -2.832681179046631,
167
+ "logps/chosen": -0.9590412378311157,
168
+ "logps/rejected": -1.2149635553359985,
169
+ "loss": 1.6276,
170
+ "rewards/accuracies": 0.637499988079071,
171
+ "rewards/chosen": -1.9180824756622314,
172
+ "rewards/margins": 0.5118446946144104,
173
+ "rewards/rejected": -2.429927110671997,
174
  "step": 50
175
  },
176
  {
177
  "epoch": 0.11515310128238682,
178
+ "grad_norm": 24.997319339492417,
179
  "learning_rate": 4.996716052911017e-07,
180
+ "logits/chosen": -2.910067081451416,
181
+ "logits/rejected": -2.9242091178894043,
182
+ "logps/chosen": -1.0754766464233398,
183
+ "logps/rejected": -1.1867624521255493,
184
+ "loss": 1.6255,
185
+ "rewards/accuracies": 0.574999988079071,
186
+ "rewards/chosen": -2.1509532928466797,
187
+ "rewards/margins": 0.22257177531719208,
188
+ "rewards/rejected": -2.3735249042510986,
189
  "step": 55
190
  },
191
  {
192
  "epoch": 0.12562156503533106,
193
+ "grad_norm": 29.673956562215213,
194
  "learning_rate": 4.990353313429303e-07,
195
+ "logits/chosen": -2.846463441848755,
196
+ "logits/rejected": -2.8395891189575195,
197
+ "logps/chosen": -1.0373256206512451,
198
+ "logps/rejected": -1.2098807096481323,
199
+ "loss": 1.5824,
200
+ "rewards/accuracies": 0.5625,
201
+ "rewards/chosen": -2.0746512413024902,
202
+ "rewards/margins": 0.34511035680770874,
203
+ "rewards/rejected": -2.4197614192962646,
204
  "step": 60
205
  },
206
  {
207
  "epoch": 0.1360900287882753,
208
+ "grad_norm": 25.70155018194367,
209
  "learning_rate": 4.980652179769217e-07,
210
+ "logits/chosen": -2.8658077716827393,
211
+ "logits/rejected": -2.889425039291382,
212
+ "logps/chosen": -1.0232688188552856,
213
+ "logps/rejected": -1.1985584497451782,
214
+ "loss": 1.5955,
215
  "rewards/accuracies": 0.59375,
216
+ "rewards/chosen": -2.0465376377105713,
217
+ "rewards/margins": 0.3505793511867523,
218
+ "rewards/rejected": -2.3971168994903564,
219
  "step": 65
220
  },
221
  {
222
  "epoch": 0.14655849254121958,
223
+ "grad_norm": 26.911214626746915,
224
  "learning_rate": 4.967625656594781e-07,
225
+ "logits/chosen": -2.8660895824432373,
226
+ "logits/rejected": -2.8656294345855713,
227
+ "logps/chosen": -1.1060866117477417,
228
+ "logps/rejected": -1.3024221658706665,
229
+ "loss": 1.5443,
230
+ "rewards/accuracies": 0.625,
231
+ "rewards/chosen": -2.2121732234954834,
232
+ "rewards/margins": 0.39267101883888245,
233
+ "rewards/rejected": -2.604844331741333,
234
  "step": 70
235
  },
236
  {
237
  "epoch": 0.15702695629416383,
238
+ "grad_norm": 31.97064281949989,
239
  "learning_rate": 4.951291206355559e-07,
240
+ "logits/chosen": -2.813340425491333,
241
+ "logits/rejected": -2.76302170753479,
242
+ "logps/chosen": -1.1282150745391846,
243
+ "logps/rejected": -1.4221516847610474,
244
+ "loss": 1.5693,
245
+ "rewards/accuracies": 0.6312500238418579,
246
+ "rewards/chosen": -2.256430149078369,
247
+ "rewards/margins": 0.5878733992576599,
248
+ "rewards/rejected": -2.8443033695220947,
249
  "step": 75
250
  },
251
  {
252
  "epoch": 0.16749542004710807,
253
+ "grad_norm": 22.991199981656727,
254
  "learning_rate": 4.93167072587771e-07,
255
+ "logits/chosen": -2.7443230152130127,
256
+ "logits/rejected": -2.7397193908691406,
257
+ "logps/chosen": -1.1170539855957031,
258
+ "logps/rejected": -1.485013723373413,
259
+ "loss": 1.5498,
260
  "rewards/accuracies": 0.6875,
261
+ "rewards/chosen": -2.2341079711914062,
262
+ "rewards/margins": 0.7359195947647095,
263
+ "rewards/rejected": -2.970027446746826,
264
  "step": 80
265
  },
266
  {
267
  "epoch": 0.17796388380005235,
268
+ "grad_norm": 27.36414724828374,
269
  "learning_rate": 4.908790517010636e-07,
270
+ "logits/chosen": -2.798058271408081,
271
+ "logits/rejected": -2.7502901554107666,
272
+ "logps/chosen": -1.1033599376678467,
273
+ "logps/rejected": -1.4456737041473389,
274
+ "loss": 1.5329,
275
+ "rewards/accuracies": 0.637499988079071,
276
+ "rewards/chosen": -2.2067198753356934,
277
+ "rewards/margins": 0.6846276521682739,
278
+ "rewards/rejected": -2.8913474082946777,
279
  "step": 85
280
  },
281
  {
282
  "epoch": 0.1884323475529966,
283
+ "grad_norm": 21.16101861051188,
284
  "learning_rate": 4.882681251368548e-07,
285
+ "logits/chosen": -2.7800021171569824,
286
+ "logits/rejected": -2.7292940616607666,
287
+ "logps/chosen": -1.1692888736724854,
288
+ "logps/rejected": -1.5973972082138062,
289
+ "loss": 1.5052,
290
+ "rewards/accuracies": 0.625,
291
+ "rewards/chosen": -2.3385777473449707,
292
+ "rewards/margins": 0.856216549873352,
293
+ "rewards/rejected": -3.1947944164276123,
294
  "step": 90
295
  },
296
  {
297
  "epoch": 0.19890081130594087,
298
+ "grad_norm": 26.528533058821825,
299
  "learning_rate": 4.853377929214243e-07,
300
+ "logits/chosen": -2.704568386077881,
301
+ "logits/rejected": -2.6892647743225098,
302
+ "logps/chosen": -1.1889941692352295,
303
+ "logps/rejected": -1.5050609111785889,
304
+ "loss": 1.5355,
305
+ "rewards/accuracies": 0.581250011920929,
306
+ "rewards/chosen": -2.377988338470459,
307
+ "rewards/margins": 0.6321329474449158,
308
+ "rewards/rejected": -3.0101218223571777,
309
  "step": 95
310
  },
311
  {
312
  "epoch": 0.2093692750588851,
313
+ "grad_norm": 29.65374977644177,
314
  "learning_rate": 4.820919832540181e-07,
315
+ "logits/chosen": -2.7762622833251953,
316
+ "logits/rejected": -2.750223398208618,
317
+ "logps/chosen": -1.1571177244186401,
318
+ "logps/rejected": -1.3776543140411377,
319
+ "loss": 1.5847,
320
+ "rewards/accuracies": 0.5687500238418579,
321
+ "rewards/chosen": -2.3142354488372803,
322
+ "rewards/margins": 0.4410727620124817,
323
+ "rewards/rejected": -2.7553086280822754,
324
  "step": 100
325
  },
326
  {
327
  "epoch": 0.21983773881182936,
328
+ "grad_norm": 22.838032447560202,
329
  "learning_rate": 4.785350472409791e-07,
330
+ "logits/chosen": -2.712097644805908,
331
+ "logits/rejected": -2.684450626373291,
332
+ "logps/chosen": -1.1789191961288452,
333
+ "logps/rejected": -1.5337121486663818,
334
+ "loss": 1.4327,
335
+ "rewards/accuracies": 0.65625,
336
+ "rewards/chosen": -2.3578383922576904,
337
+ "rewards/margins": 0.7095857858657837,
338
+ "rewards/rejected": -3.0674242973327637,
339
  "step": 105
340
  },
341
  {
342
  "epoch": 0.23030620256477363,
343
+ "grad_norm": 42.989486471273096,
344
  "learning_rate": 4.7467175306295647e-07,
345
+ "logits/chosen": -2.6487460136413574,
346
+ "logits/rejected": -2.626556873321533,
347
+ "logps/chosen": -1.1375197172164917,
348
+ "logps/rejected": -1.4804375171661377,
349
+ "loss": 1.4322,
350
+ "rewards/accuracies": 0.625,
351
+ "rewards/chosen": -2.2750394344329834,
352
+ "rewards/margins": 0.6858354210853577,
353
+ "rewards/rejected": -2.9608750343322754,
354
  "step": 110
355
  },
356
  {
357
  "epoch": 0.24077466631771788,
358
+ "grad_norm": 29.408403128680163,
359
  "learning_rate": 4.70507279583015e-07,
360
+ "logits/chosen": -2.6792874336242676,
361
+ "logits/rejected": -2.669962167739868,
362
+ "logps/chosen": -1.179626226425171,
363
+ "logps/rejected": -1.3990623950958252,
364
+ "loss": 1.5301,
365
+ "rewards/accuracies": 0.6625000238418579,
366
+ "rewards/chosen": -2.359252452850342,
367
+ "rewards/margins": 0.43887215852737427,
368
+ "rewards/rejected": -2.7981247901916504,
369
  "step": 115
370
  },
371
  {
372
  "epoch": 0.2512431300706621,
373
+ "grad_norm": 25.256732287485413,
374
  "learning_rate": 4.6604720940421207e-07,
375
+ "logits/chosen": -2.6912872791290283,
376
+ "logits/rejected": -2.6357336044311523,
377
+ "logps/chosen": -1.1251033544540405,
378
+ "logps/rejected": -1.5543628931045532,
379
+ "loss": 1.5099,
380
+ "rewards/accuracies": 0.625,
381
+ "rewards/chosen": -2.250206708908081,
382
+ "rewards/margins": 0.8585190773010254,
383
+ "rewards/rejected": -3.1087257862091064,
384
  "step": 120
385
  },
386
  {
387
  "epoch": 0.26171159382360637,
388
+ "grad_norm": 33.03440558906921,
389
  "learning_rate": 4.612975213859487e-07,
390
+ "logits/chosen": -2.584962844848633,
391
+ "logits/rejected": -2.54630970954895,
392
+ "logps/chosen": -1.1838756799697876,
393
+ "logps/rejected": -1.5963369607925415,
394
+ "loss": 1.5107,
395
+ "rewards/accuracies": 0.6812499761581421,
396
+ "rewards/chosen": -2.367751359939575,
397
+ "rewards/margins": 0.8249226808547974,
398
+ "rewards/rejected": -3.192673921585083,
399
  "step": 125
400
  },
401
  {
402
  "epoch": 0.2721800575765506,
403
+ "grad_norm": 32.822453962687824,
404
  "learning_rate": 4.5626458262912735e-07,
405
+ "logits/chosen": -2.6030898094177246,
406
+ "logits/rejected": -2.6097538471221924,
407
+ "logps/chosen": -1.1971023082733154,
408
+ "logps/rejected": -1.6219829320907593,
409
+ "loss": 1.4413,
410
+ "rewards/accuracies": 0.7124999761581421,
411
+ "rewards/chosen": -2.394204616546631,
412
+ "rewards/margins": 0.8497610092163086,
413
+ "rewards/rejected": -3.2439658641815186,
414
  "step": 130
415
  },
416
  {
417
  "epoch": 0.2826485213294949,
418
+ "grad_norm": 28.065819409874635,
419
  "learning_rate": 4.5095513994085974e-07,
420
+ "logits/chosen": -2.538353204727173,
421
+ "logits/rejected": -2.5087146759033203,
422
+ "logps/chosen": -1.0895415544509888,
423
+ "logps/rejected": -1.5814229249954224,
424
+ "loss": 1.4933,
425
+ "rewards/accuracies": 0.6937500238418579,
426
+ "rewards/chosen": -2.1790831089019775,
427
+ "rewards/margins": 0.9837629199028015,
428
+ "rewards/rejected": -3.1628458499908447,
429
  "step": 135
430
  },
431
  {
432
  "epoch": 0.29311698508243916,
433
+ "grad_norm": 32.58412935415965,
434
  "learning_rate": 4.453763107901675e-07,
435
+ "logits/chosen": -2.5400352478027344,
436
+ "logits/rejected": -2.5418732166290283,
437
+ "logps/chosen": -1.3108186721801758,
438
+ "logps/rejected": -1.5650501251220703,
439
+ "loss": 1.542,
440
+ "rewards/accuracies": 0.606249988079071,
441
+ "rewards/chosen": -2.6216373443603516,
442
+ "rewards/margins": 0.5084627866744995,
443
+ "rewards/rejected": -3.1301002502441406,
444
  "step": 140
445
  },
446
  {
447
  "epoch": 0.3035854488353834,
448
+ "grad_norm": 30.400278372842216,
449
  "learning_rate": 4.395355737667985e-07,
450
+ "logits/chosen": -2.5238261222839355,
451
+ "logits/rejected": -2.5089011192321777,
452
+ "logps/chosen": -1.1779248714447021,
453
+ "logps/rejected": -1.6300004720687866,
454
+ "loss": 1.4238,
455
  "rewards/accuracies": 0.675000011920929,
456
+ "rewards/chosen": -2.3558497428894043,
457
+ "rewards/margins": 0.9041512608528137,
458
+ "rewards/rejected": -3.2600009441375732,
459
  "step": 145
460
  },
461
  {
462
  "epoch": 0.31405391258832765,
463
+ "grad_norm": 60.958920561437736,
464
  "learning_rate": 4.3344075855595097e-07,
465
+ "logits/chosen": -2.566622734069824,
466
+ "logits/rejected": -2.518723964691162,
467
+ "logps/chosen": -1.3376280069351196,
468
+ "logps/rejected": -1.7097371816635132,
469
+ "loss": 1.4812,
470
+ "rewards/accuracies": 0.59375,
471
+ "rewards/chosen": -2.6752560138702393,
472
+ "rewards/margins": 0.7442184686660767,
473
+ "rewards/rejected": -3.4194743633270264,
474
  "step": 150
475
  },
476
  {
477
  "epoch": 0.3245223763412719,
478
+ "grad_norm": 62.56084518909412,
479
  "learning_rate": 4.271000354423425e-07,
480
+ "logits/chosen": -2.484179735183716,
481
+ "logits/rejected": -2.459049701690674,
482
+ "logps/chosen": -1.3308216333389282,
483
+ "logps/rejected": -1.9237397909164429,
484
+ "loss": 1.387,
485
  "rewards/accuracies": 0.699999988079071,
486
+ "rewards/chosen": -2.6616432666778564,
487
+ "rewards/margins": 1.1858360767364502,
488
+ "rewards/rejected": -3.8474795818328857,
489
  "step": 155
490
  },
491
  {
492
  "epoch": 0.33499084009421615,
493
+ "grad_norm": 38.23483378023031,
494
  "learning_rate": 4.2052190435769554e-07,
495
+ "logits/chosen": -2.454179286956787,
496
+ "logits/rejected": -2.4418118000030518,
497
+ "logps/chosen": -1.250301718711853,
498
+ "logps/rejected": -1.8722400665283203,
499
+ "loss": 1.3988,
500
+ "rewards/accuracies": 0.7124999761581421,
501
+ "rewards/chosen": -2.500603437423706,
502
+ "rewards/margins": 1.2438766956329346,
503
+ "rewards/rejected": -3.7444801330566406,
504
  "step": 160
505
  },
506
  {
507
  "epoch": 0.34545930384716045,
508
+ "grad_norm": 36.680206717248936,
509
  "learning_rate": 4.137151834863213e-07,
510
+ "logits/chosen": -2.516385316848755,
511
+ "logits/rejected": -2.476135730743408,
512
+ "logps/chosen": -1.2074108123779297,
513
+ "logps/rejected": -1.579379916191101,
514
+ "loss": 1.3981,
515
+ "rewards/accuracies": 0.65625,
516
+ "rewards/chosen": -2.4148216247558594,
517
+ "rewards/margins": 0.7439382672309875,
518
+ "rewards/rejected": -3.158759832382202,
519
  "step": 165
520
  },
521
  {
522
  "epoch": 0.3559277676001047,
523
+ "grad_norm": 45.599190265402456,
524
  "learning_rate": 4.0668899744407567e-07,
525
+ "logits/chosen": -2.5008692741394043,
526
+ "logits/rejected": -2.456566572189331,
527
+ "logps/chosen": -1.2382426261901855,
528
+ "logps/rejected": -1.7240244150161743,
529
+ "loss": 1.3766,
530
+ "rewards/accuracies": 0.71875,
531
+ "rewards/chosen": -2.476485252380371,
532
+ "rewards/margins": 0.9715633392333984,
533
+ "rewards/rejected": -3.4480488300323486,
534
  "step": 170
535
  },
536
  {
537
  "epoch": 0.36639623135304894,
538
+ "grad_norm": 44.372524637477866,
539
  "learning_rate": 3.994527650465352e-07,
540
+ "logits/chosen": -2.417241096496582,
541
+ "logits/rejected": -2.406471014022827,
542
+ "logps/chosen": -1.3205429315567017,
543
+ "logps/rejected": -1.8458646535873413,
544
+ "loss": 1.3967,
545
+ "rewards/accuracies": 0.675000011920929,
546
+ "rewards/chosen": -2.6410858631134033,
547
+ "rewards/margins": 1.0506436824798584,
548
+ "rewards/rejected": -3.6917293071746826,
549
  "step": 175
550
  },
551
  {
552
  "epoch": 0.3768646951059932,
553
+ "grad_norm": 59.067677833224565,
554
  "learning_rate": 3.920161866827889e-07,
555
+ "logits/chosen": -2.438202381134033,
556
+ "logits/rejected": -2.4266159534454346,
557
+ "logps/chosen": -1.3483916521072388,
558
+ "logps/rejected": -1.9187591075897217,
559
+ "loss": 1.404,
560
+ "rewards/accuracies": 0.706250011920929,
561
+ "rewards/chosen": -2.6967833042144775,
562
+ "rewards/margins": 1.1407346725463867,
563
+ "rewards/rejected": -3.8375182151794434,
564
  "step": 180
565
  },
566
  {
567
  "epoch": 0.38733315885893743,
568
+ "grad_norm": 47.3796081172543,
569
  "learning_rate": 3.8438923131177237e-07,
570
+ "logits/chosen": -2.385611057281494,
571
+ "logits/rejected": -2.4027440547943115,
572
+ "logps/chosen": -1.4026473760604858,
573
+ "logps/rejected": -1.966017723083496,
574
+ "loss": 1.3488,
575
+ "rewards/accuracies": 0.706250011920929,
576
+ "rewards/chosen": -2.8052947521209717,
577
+ "rewards/margins": 1.1267404556274414,
578
+ "rewards/rejected": -3.932035446166992,
579
  "step": 185
580
  },
581
  {
582
  "epoch": 0.39780162261188173,
583
+ "grad_norm": 35.917242818850596,
584
  "learning_rate": 3.765821230985757e-07,
585
+ "logits/chosen": -2.374563455581665,
586
+ "logits/rejected": -2.382572889328003,
587
+ "logps/chosen": -1.4945417642593384,
588
+ "logps/rejected": -2.0413851737976074,
589
+ "loss": 1.3537,
590
+ "rewards/accuracies": 0.675000011920929,
591
+ "rewards/chosen": -2.9890835285186768,
592
+ "rewards/margins": 1.0936866998672485,
593
+ "rewards/rejected": -4.082770347595215,
594
  "step": 190
595
  },
596
  {
597
  "epoch": 0.408270086364826,
598
+ "grad_norm": 51.34440878644271,
599
  "learning_rate": 3.6860532770864005e-07,
600
+ "logits/chosen": -2.3812708854675293,
601
+ "logits/rejected": -2.3748836517333984,
602
+ "logps/chosen": -1.507237195968628,
603
+ "logps/rejected": -2.213073253631592,
604
+ "loss": 1.4291,
605
+ "rewards/accuracies": 0.7562500238418579,
606
+ "rewards/chosen": -3.014474391937256,
607
+ "rewards/margins": 1.4116714000701904,
608
+ "rewards/rejected": -4.426146507263184,
609
  "step": 195
610
  },
611
  {
612
  "epoch": 0.4187385501177702,
613
+ "grad_norm": 50.386830292932075,
614
  "learning_rate": 3.604695382782159e-07,
615
+ "logits/chosen": -2.2745728492736816,
616
+ "logits/rejected": -2.256195545196533,
617
+ "logps/chosen": -1.647243857383728,
618
+ "logps/rejected": -2.149318218231201,
619
+ "loss": 1.3579,
620
+ "rewards/accuracies": 0.637499988079071,
621
+ "rewards/chosen": -3.294487714767456,
622
+ "rewards/margins": 1.0041488409042358,
623
+ "rewards/rejected": -4.298636436462402,
624
  "step": 200
625
  },
626
  {
627
  "epoch": 0.42920701387071447,
628
+ "grad_norm": 65.73777051804583,
629
  "learning_rate": 3.5218566107988867e-07,
630
+ "logits/chosen": -2.335175037384033,
631
+ "logits/rejected": -2.326826333999634,
632
+ "logps/chosen": -1.8318570852279663,
633
+ "logps/rejected": -2.546074390411377,
634
+ "loss": 1.3635,
635
+ "rewards/accuracies": 0.675000011920929,
636
+ "rewards/chosen": -3.6637141704559326,
637
+ "rewards/margins": 1.4284346103668213,
638
+ "rewards/rejected": -5.092148780822754,
639
  "step": 205
640
  },
641
  {
642
  "epoch": 0.4396754776236587,
643
+ "grad_norm": 57.20248301568887,
644
  "learning_rate": 3.4376480090239047e-07,
645
+ "logits/chosen": -2.184452533721924,
646
+ "logits/rejected": -2.1850171089172363,
647
+ "logps/chosen": -1.9411817789077759,
648
+ "logps/rejected": -2.38714861869812,
649
+ "loss": 1.468,
650
  "rewards/accuracies": 0.699999988079071,
651
+ "rewards/chosen": -3.8823635578155518,
652
+ "rewards/margins": 0.8919339179992676,
653
+ "rewards/rejected": -4.77429723739624,
654
  "step": 210
655
  },
656
  {
657
  "epoch": 0.45014394137660296,
658
+ "grad_norm": 57.18796146027631,
659
  "learning_rate": 3.3521824616429284e-07,
660
+ "logits/chosen": -2.20176100730896,
661
+ "logits/rejected": -2.227836847305298,
662
+ "logps/chosen": -1.8610153198242188,
663
+ "logps/rejected": -2.4027678966522217,
664
+ "loss": 1.3628,
665
+ "rewards/accuracies": 0.7437499761581421,
666
+ "rewards/chosen": -3.7220306396484375,
667
+ "rewards/margins": 1.083505392074585,
668
+ "rewards/rejected": -4.805535793304443,
669
  "step": 215
670
  },
671
  {
672
  "epoch": 0.46061240512954726,
673
+ "grad_norm": 49.48323591731237,
674
  "learning_rate": 3.265574537815398e-07,
675
+ "logits/chosen": -2.202357053756714,
676
+ "logits/rejected": -2.212911367416382,
677
+ "logps/chosen": -1.8679640293121338,
678
+ "logps/rejected": -2.4319264888763428,
679
+ "loss": 1.262,
680
+ "rewards/accuracies": 0.71875,
681
+ "rewards/chosen": -3.7359280586242676,
682
+ "rewards/margins": 1.127925157546997,
683
+ "rewards/rejected": -4.8638529777526855,
684
  "step": 220
685
  },
686
  {
687
  "epoch": 0.4710808688824915,
688
+ "grad_norm": 55.810856216454965,
689
  "learning_rate": 3.1779403380910425e-07,
690
+ "logits/chosen": -2.1685309410095215,
691
+ "logits/rejected": -2.140174388885498,
692
+ "logps/chosen": -2.047653913497925,
693
+ "logps/rejected": -2.60907244682312,
694
+ "loss": 1.3463,
695
+ "rewards/accuracies": 0.71875,
696
+ "rewards/chosen": -4.09530782699585,
697
+ "rewards/margins": 1.122836709022522,
698
+ "rewards/rejected": -5.21814489364624,
699
  "step": 225
700
  },
701
  {
702
  "epoch": 0.48154933263543576,
703
+ "grad_norm": 65.54809342203177,
704
  "learning_rate": 3.0893973387735683e-07,
705
+ "logits/chosen": -2.1521670818328857,
706
+ "logits/rejected": -2.1663339138031006,
707
+ "logps/chosen": -2.1521265506744385,
708
+ "logps/rejected": -2.8482584953308105,
709
+ "loss": 1.273,
710
+ "rewards/accuracies": 0.71875,
711
+ "rewards/chosen": -4.304253101348877,
712
+ "rewards/margins": 1.3922632932662964,
713
+ "rewards/rejected": -5.696516990661621,
714
  "step": 230
715
  },
716
  {
717
  "epoch": 0.49201779638838,
718
+ "grad_norm": 54.795057799713966,
719
  "learning_rate": 3.000064234440111e-07,
720
+ "logits/chosen": -2.0703492164611816,
721
+ "logits/rejected": -2.0829060077667236,
722
+ "logps/chosen": -2.1279358863830566,
723
+ "logps/rejected": -2.6611745357513428,
724
+ "loss": 1.3093,
725
  "rewards/accuracies": 0.706250011920929,
726
+ "rewards/chosen": -4.255871772766113,
727
+ "rewards/margins": 1.0664775371551514,
728
+ "rewards/rejected": -5.3223490715026855,
729
  "step": 235
730
  },
731
  {
732
  "epoch": 0.5024862601413242,
733
+ "grad_norm": 56.25864061883905,
734
  "learning_rate": 2.910060778827554e-07,
735
+ "logits/chosen": -2.075435161590576,
736
+ "logits/rejected": -2.08357310295105,
737
+ "logps/chosen": -2.1075491905212402,
738
+ "logps/rejected": -2.84082293510437,
739
+ "loss": 1.3276,
740
  "rewards/accuracies": 0.71875,
741
+ "rewards/chosen": -4.2150983810424805,
742
+ "rewards/margins": 1.4665473699569702,
743
+ "rewards/rejected": -5.68164587020874,
744
  "step": 240
745
  },
746
  {
747
  "epoch": 0.5129547238942685,
748
+ "grad_norm": 62.1729319764741,
749
  "learning_rate": 2.8195076242990116e-07,
750
+ "logits/chosen": -2.0973756313323975,
751
+ "logits/rejected": -2.1240689754486084,
752
+ "logps/chosen": -2.0223472118377686,
753
+ "logps/rejected": -2.807509660720825,
754
+ "loss": 1.2667,
755
+ "rewards/accuracies": 0.737500011920929,
756
+ "rewards/chosen": -4.044694423675537,
757
+ "rewards/margins": 1.570324182510376,
758
+ "rewards/rejected": -5.61501932144165,
759
  "step": 245
760
  },
761
  {
762
  "epoch": 0.5234231876472127,
763
+ "grad_norm": 50.46640637368305,
764
  "learning_rate": 2.7285261601056697e-07,
765
+ "logits/chosen": -1.9956550598144531,
766
+ "logits/rejected": -2.027296543121338,
767
+ "logps/chosen": -2.24354887008667,
768
+ "logps/rejected": -3.0298171043395996,
769
+ "loss": 1.3873,
770
+ "rewards/accuracies": 0.7437499761581421,
771
+ "rewards/chosen": -4.48709774017334,
772
+ "rewards/margins": 1.5725353956222534,
773
+ "rewards/rejected": -6.059634208679199,
774
  "step": 250
775
  },
776
  {
777
  "epoch": 0.533891651400157,
778
+ "grad_norm": 60.6854410849544,
779
  "learning_rate": 2.6372383496608186e-07,
780
+ "logits/chosen": -2.0544848442077637,
781
+ "logits/rejected": -2.0431137084960938,
782
+ "logps/chosen": -2.3652796745300293,
783
+ "logps/rejected": -3.058457136154175,
784
+ "loss": 1.3487,
785
+ "rewards/accuracies": 0.6875,
786
+ "rewards/chosen": -4.730559349060059,
787
+ "rewards/margins": 1.386354923248291,
788
+ "rewards/rejected": -6.11691427230835,
789
  "step": 255
790
  },
791
  {
792
  "epoch": 0.5443601151531012,
793
+ "grad_norm": 73.46613876186566,
794
  "learning_rate": 2.5457665670441937e-07,
795
+ "logits/chosen": -1.9965794086456299,
796
+ "logits/rejected": -2.0038514137268066,
797
+ "logps/chosen": -2.4099831581115723,
798
+ "logps/rejected": -3.187678337097168,
799
+ "loss": 1.2961,
800
+ "rewards/accuracies": 0.75,
801
+ "rewards/chosen": -4.8199663162231445,
802
+ "rewards/margins": 1.5553903579711914,
803
+ "rewards/rejected": -6.375356674194336,
804
  "step": 260
805
  },
806
  {
807
  "epoch": 0.5548285789060455,
808
+ "grad_norm": 74.04251596633621,
809
  "learning_rate": 2.454233432955807e-07,
810
+ "logits/chosen": -1.9874013662338257,
811
+ "logits/rejected": -1.9983584880828857,
812
+ "logps/chosen": -2.573876142501831,
813
+ "logps/rejected": -3.171705722808838,
814
+ "loss": 1.3148,
815
+ "rewards/accuracies": 0.71875,
816
+ "rewards/chosen": -5.147752285003662,
817
+ "rewards/margins": 1.1956590414047241,
818
+ "rewards/rejected": -6.343411445617676,
819
  "step": 265
820
  },
821
  {
822
  "epoch": 0.5652970426589898,
823
+ "grad_norm": 76.35568912258938,
824
  "learning_rate": 2.3627616503391812e-07,
825
+ "logits/chosen": -2.017387628555298,
826
+ "logits/rejected": -2.024291753768921,
827
+ "logps/chosen": -2.639585018157959,
828
+ "logps/rejected": -3.2247214317321777,
829
+ "loss": 1.2604,
830
+ "rewards/accuracies": 0.7124999761581421,
831
+ "rewards/chosen": -5.279170036315918,
832
+ "rewards/margins": 1.1702723503112793,
833
+ "rewards/rejected": -6.4494428634643555,
834
  "step": 270
835
  },
836
  {
837
  "epoch": 0.575765506411934,
838
+ "grad_norm": 47.964272961306655,
839
  "learning_rate": 2.2714738398943308e-07,
840
+ "logits/chosen": -1.9820992946624756,
841
+ "logits/rejected": -1.986830472946167,
842
+ "logps/chosen": -2.4876163005828857,
843
+ "logps/rejected": -3.4850916862487793,
844
+ "loss": 1.2026,
845
+ "rewards/accuracies": 0.768750011920929,
846
+ "rewards/chosen": -4.9752326011657715,
847
+ "rewards/margins": 1.9949508905410767,
848
+ "rewards/rejected": -6.970183372497559,
849
  "step": 275
850
  },
851
  {
852
  "epoch": 0.5862339701648783,
853
+ "grad_norm": 66.6426445717076,
854
  "learning_rate": 2.1804923757009882e-07,
855
+ "logits/chosen": -1.9717257022857666,
856
+ "logits/rejected": -1.9761455059051514,
857
+ "logps/chosen": -2.604233980178833,
858
+ "logps/rejected": -3.449998140335083,
859
+ "loss": 1.3207,
860
+ "rewards/accuracies": 0.737500011920929,
861
+ "rewards/chosen": -5.208467960357666,
862
+ "rewards/margins": 1.691528081893921,
863
+ "rewards/rejected": -6.899996280670166,
864
  "step": 280
865
  },
866
  {
867
  "epoch": 0.5967024339178225,
868
+ "grad_norm": 78.85704380261409,
869
  "learning_rate": 2.089939221172446e-07,
870
+ "logits/chosen": -1.9817512035369873,
871
+ "logits/rejected": -1.976833701133728,
872
+ "logps/chosen": -2.4398930072784424,
873
+ "logps/rejected": -3.1973671913146973,
874
+ "loss": 1.275,
875
+ "rewards/accuracies": 0.7250000238418579,
876
+ "rewards/chosen": -4.879786014556885,
877
+ "rewards/margins": 1.5149486064910889,
878
+ "rewards/rejected": -6.3947343826293945,
879
  "step": 285
880
  },
881
  {
882
  "epoch": 0.6071708976707668,
883
+ "grad_norm": 70.57880724344702,
884
  "learning_rate": 1.9999357655598891e-07,
885
+ "logits/chosen": -1.9725377559661865,
886
+ "logits/rejected": -1.9514259099960327,
887
+ "logps/chosen": -2.342991828918457,
888
+ "logps/rejected": -2.9306890964508057,
889
+ "loss": 1.3026,
890
+ "rewards/accuracies": 0.71875,
891
+ "rewards/chosen": -4.685983657836914,
892
+ "rewards/margins": 1.1753947734832764,
893
+ "rewards/rejected": -5.861378192901611,
894
  "step": 290
895
  },
896
  {
897
  "epoch": 0.6176393614237111,
898
+ "grad_norm": 64.72926534110535,
899
  "learning_rate": 1.9106026612264315e-07,
900
+ "logits/chosen": -2.001755714416504,
901
+ "logits/rejected": -1.9883407354354858,
902
+ "logps/chosen": -2.397254705429077,
903
+ "logps/rejected": -2.957282781600952,
904
+ "loss": 1.249,
905
+ "rewards/accuracies": 0.71875,
906
+ "rewards/chosen": -4.794509410858154,
907
+ "rewards/margins": 1.1200562715530396,
908
+ "rewards/rejected": -5.914565563201904,
909
  "step": 295
910
  },
911
  {
912
  "epoch": 0.6281078251766553,
913
+ "grad_norm": 91.3728151363829,
914
  "learning_rate": 1.8220596619089573e-07,
915
+ "logits/chosen": -1.8486881256103516,
916
+ "logits/rejected": -1.8306758403778076,
917
+ "logps/chosen": -2.625481128692627,
918
+ "logps/rejected": -3.437105178833008,
919
+ "loss": 1.2989,
920
+ "rewards/accuracies": 0.7437499761581421,
921
+ "rewards/chosen": -5.250962257385254,
922
+ "rewards/margins": 1.6232483386993408,
923
+ "rewards/rejected": -6.874210357666016,
924
  "step": 300
925
  },
926
  {
927
  "epoch": 0.6385762889295996,
928
+ "grad_norm": 72.03502684523782,
929
  "learning_rate": 1.7344254621846017e-07,
930
+ "logits/chosen": -1.9142115116119385,
931
+ "logits/rejected": -1.9298557043075562,
932
+ "logps/chosen": -2.655311107635498,
933
+ "logps/rejected": -3.314296007156372,
934
+ "loss": 1.2273,
935
+ "rewards/accuracies": 0.731249988079071,
936
+ "rewards/chosen": -5.310622215270996,
937
+ "rewards/margins": 1.3179702758789062,
938
+ "rewards/rejected": -6.628592014312744,
939
  "step": 305
940
  },
941
  {
942
  "epoch": 0.6490447526825438,
943
+ "grad_norm": 69.39610632162989,
944
  "learning_rate": 1.647817538357072e-07,
945
+ "logits/chosen": -1.8438358306884766,
946
+ "logits/rejected": -1.8745906352996826,
947
+ "logps/chosen": -2.8499069213867188,
948
+ "logps/rejected": -3.5957629680633545,
949
+ "loss": 1.2436,
950
+ "rewards/accuracies": 0.7562500238418579,
951
+ "rewards/chosen": -5.6998138427734375,
952
+ "rewards/margins": 1.491711974143982,
953
+ "rewards/rejected": -7.191525936126709,
954
  "step": 310
955
  },
956
  {
957
  "epoch": 0.6595132164354881,
958
+ "grad_norm": 60.61281750085851,
959
  "learning_rate": 1.562351990976095e-07,
960
+ "logits/chosen": -1.9220491647720337,
961
+ "logits/rejected": -1.9316775798797607,
962
+ "logps/chosen": -2.5944266319274902,
963
+ "logps/rejected": -3.3770358562469482,
964
+ "loss": 1.2926,
965
  "rewards/accuracies": 0.6937500238418579,
966
+ "rewards/chosen": -5.1888532638549805,
967
+ "rewards/margins": 1.5652191638946533,
968
+ "rewards/rejected": -6.7540717124938965,
969
  "step": 315
970
  },
971
  {
972
  "epoch": 0.6699816801884323,
973
+ "grad_norm": 64.49517967587298,
974
  "learning_rate": 1.478143389201113e-07,
975
+ "logits/chosen": -1.954124093055725,
976
+ "logits/rejected": -1.9501478672027588,
977
+ "logps/chosen": -2.416090250015259,
978
+ "logps/rejected": -3.2859389781951904,
979
+ "loss": 1.1985,
980
+ "rewards/accuracies": 0.706250011920929,
981
+ "rewards/chosen": -4.832180500030518,
982
+ "rewards/margins": 1.7396974563598633,
983
+ "rewards/rejected": -6.571877956390381,
984
  "step": 320
985
  },
986
  {
987
  "epoch": 0.6804501439413766,
988
+ "grad_norm": 77.10656027012405,
989
  "learning_rate": 1.3953046172178413e-07,
990
+ "logits/chosen": -1.9018971920013428,
991
+ "logits/rejected": -1.9205232858657837,
992
+ "logps/chosen": -2.819211483001709,
993
+ "logps/rejected": -3.631371021270752,
994
+ "loss": 1.3417,
995
+ "rewards/accuracies": 0.6812499761581421,
996
+ "rewards/chosen": -5.638422966003418,
997
+ "rewards/margins": 1.6243181228637695,
998
+ "rewards/rejected": -7.262742042541504,
999
  "step": 325
1000
  },
1001
  {
1002
  "epoch": 0.6909186076943209,
1003
+ "grad_norm": 64.1696366253842,
1004
  "learning_rate": 1.3139467229135998e-07,
1005
+ "logits/chosen": -1.8825531005859375,
1006
+ "logits/rejected": -1.8870893716812134,
1007
+ "logps/chosen": -2.785536289215088,
1008
+ "logps/rejected": -3.764813184738159,
1009
+ "loss": 1.2374,
1010
  "rewards/accuracies": 0.7437499761581421,
1011
+ "rewards/chosen": -5.571072578430176,
1012
+ "rewards/margins": 1.9585535526275635,
1013
+ "rewards/rejected": -7.529626369476318,
1014
  "step": 330
1015
  },
1016
  {
1017
  "epoch": 0.7013870714472651,
1018
+ "grad_norm": 80.55442176578629,
1019
  "learning_rate": 1.2341787690142435e-07,
1020
+ "logits/chosen": -1.8318296670913696,
1021
+ "logits/rejected": -1.8602631092071533,
1022
+ "logps/chosen": -3.0103182792663574,
1023
+ "logps/rejected": -3.835423707962036,
1024
+ "loss": 1.2754,
1025
+ "rewards/accuracies": 0.706250011920929,
1026
+ "rewards/chosen": -6.020636558532715,
1027
+ "rewards/margins": 1.6502106189727783,
1028
+ "rewards/rejected": -7.670847415924072,
1029
  "step": 335
1030
  },
1031
  {
1032
  "epoch": 0.7118555352002094,
1033
+ "grad_norm": 87.70174157824101,
1034
  "learning_rate": 1.1561076868822755e-07,
1035
+ "logits/chosen": -1.8182014226913452,
1036
+ "logits/rejected": -1.893776535987854,
1037
+ "logps/chosen": -2.9932782649993896,
1038
+ "logps/rejected": -4.012537002563477,
1039
+ "loss": 1.2641,
1040
+ "rewards/accuracies": 0.762499988079071,
1041
+ "rewards/chosen": -5.986556529998779,
1042
+ "rewards/margins": 2.0385169982910156,
1043
+ "rewards/rejected": -8.025074005126953,
1044
  "step": 340
1045
  },
1046
  {
1047
  "epoch": 0.7223239989531536,
1048
+ "grad_norm": 58.44045230920631,
1049
  "learning_rate": 1.0798381331721107e-07,
1050
+ "logits/chosen": -1.874028205871582,
1051
+ "logits/rejected": -1.8978573083877563,
1052
+ "logps/chosen": -2.8497276306152344,
1053
+ "logps/rejected": -3.7342123985290527,
1054
+ "loss": 1.2615,
1055
+ "rewards/accuracies": 0.762499988079071,
1056
+ "rewards/chosen": -5.699455261230469,
1057
+ "rewards/margins": 1.7689688205718994,
1058
+ "rewards/rejected": -7.4684247970581055,
1059
  "step": 345
1060
  },
1061
  {
1062
  "epoch": 0.7327924627060979,
1063
+ "grad_norm": 72.00876135010044,
1064
  "learning_rate": 1.0054723495346482e-07,
1065
+ "logits/chosen": -1.9043935537338257,
1066
+ "logits/rejected": -1.9324207305908203,
1067
+ "logps/chosen": -2.6209471225738525,
1068
+ "logps/rejected": -3.339960813522339,
1069
+ "loss": 1.2008,
1070
+ "rewards/accuracies": 0.7562500238418579,
1071
+ "rewards/chosen": -5.241894245147705,
1072
+ "rewards/margins": 1.4380273818969727,
1073
+ "rewards/rejected": -6.679921627044678,
1074
  "step": 350
1075
  },
1076
  {
1077
  "epoch": 0.7432609264590422,
1078
+ "grad_norm": 55.618657855003114,
1079
  "learning_rate": 9.331100255592436e-08,
1080
+ "logits/chosen": -1.8889986276626587,
1081
+ "logits/rejected": -1.9200356006622314,
1082
+ "logps/chosen": -2.522282838821411,
1083
+ "logps/rejected": -3.240691661834717,
1084
+ "loss": 1.3353,
1085
+ "rewards/accuracies": 0.699999988079071,
1086
+ "rewards/chosen": -5.044565677642822,
1087
+ "rewards/margins": 1.43681800365448,
1088
+ "rewards/rejected": -6.481383323669434,
1089
  "step": 355
1090
  },
1091
  {
1092
  "epoch": 0.7537293902119864,
1093
+ "grad_norm": 67.73876893903605,
1094
  "learning_rate": 8.628481651367875e-08,
1095
+ "logits/chosen": -1.8986294269561768,
1096
+ "logits/rejected": -1.9174484014511108,
1097
+ "logps/chosen": -2.437084674835205,
1098
+ "logps/rejected": -3.152095317840576,
1099
+ "loss": 1.2784,
1100
+ "rewards/accuracies": 0.731249988079071,
1101
+ "rewards/chosen": -4.87416934967041,
1102
+ "rewards/margins": 1.43002188205719,
1103
+ "rewards/rejected": -6.304190635681152,
1104
  "step": 360
1105
  },
1106
  {
1107
  "epoch": 0.7641978539649307,
1108
+ "grad_norm": 60.67240818670899,
1109
  "learning_rate": 7.947809564230445e-08,
1110
+ "logits/chosen": -1.921534538269043,
1111
+ "logits/rejected": -1.9346189498901367,
1112
+ "logps/chosen": -2.425230026245117,
1113
+ "logps/rejected": -3.2995903491973877,
1114
+ "loss": 1.2435,
1115
+ "rewards/accuracies": 0.7250000238418579,
1116
+ "rewards/chosen": -4.850460052490234,
1117
+ "rewards/margins": 1.7487205266952515,
1118
+ "rewards/rejected": -6.599180698394775,
1119
  "step": 365
1120
  },
1121
  {
1122
  "epoch": 0.7746663177178749,
1123
+ "grad_norm": 59.56252419947236,
1124
  "learning_rate": 7.289996455765748e-08,
1125
+ "logits/chosen": -1.898923635482788,
1126
+ "logits/rejected": -1.9162523746490479,
1127
+ "logps/chosen": -2.415956497192383,
1128
+ "logps/rejected": -3.294524669647217,
1129
+ "loss": 1.1245,
1130
+ "rewards/accuracies": 0.762499988079071,
1131
+ "rewards/chosen": -4.831912994384766,
1132
+ "rewards/margins": 1.757136583328247,
1133
+ "rewards/rejected": -6.589049339294434,
1134
  "step": 370
1135
  },
1136
  {
1137
  "epoch": 0.7851347814708192,
1138
+ "grad_norm": 71.07320248012725,
1139
  "learning_rate": 6.655924144404906e-08,
1140
+ "logits/chosen": -1.866040587425232,
1141
+ "logits/rejected": -1.8857864141464233,
1142
+ "logps/chosen": -2.5201144218444824,
1143
+ "logps/rejected": -3.3979249000549316,
1144
+ "loss": 1.2524,
1145
+ "rewards/accuracies": 0.699999988079071,
1146
+ "rewards/chosen": -5.040228843688965,
1147
+ "rewards/margins": 1.7556209564208984,
1148
+ "rewards/rejected": -6.795849800109863,
1149
  "step": 375
1150
  },
1151
  {
1152
  "epoch": 0.7956032452237635,
1153
+ "grad_norm": 66.56871290886237,
1154
  "learning_rate": 6.046442623320145e-08,
1155
+ "logits/chosen": -1.910027265548706,
1156
+ "logits/rejected": -1.9534069299697876,
1157
+ "logps/chosen": -2.5121512413024902,
1158
+ "logps/rejected": -3.314141035079956,
1159
+ "loss": 1.2226,
1160
+ "rewards/accuracies": 0.731249988079071,
1161
+ "rewards/chosen": -5.0243024826049805,
1162
+ "rewards/margins": 1.6039783954620361,
1163
+ "rewards/rejected": -6.628282070159912,
1164
  "step": 380
1165
  },
1166
  {
1167
  "epoch": 0.8060717089767077,
1168
+ "grad_norm": 62.398640149446855,
1169
  "learning_rate": 5.4623689209832484e-08,
1170
+ "logits/chosen": -1.8576492071151733,
1171
+ "logits/rejected": -1.8747966289520264,
1172
+ "logps/chosen": -2.6333398818969727,
1173
+ "logps/rejected": -3.4325785636901855,
1174
+ "loss": 1.2061,
1175
  "rewards/accuracies": 0.7562500238418579,
1176
+ "rewards/chosen": -5.266679763793945,
1177
+ "rewards/margins": 1.5984779596328735,
1178
+ "rewards/rejected": -6.865157127380371,
1179
  "step": 385
1180
  },
1181
  {
1182
  "epoch": 0.816540172729652,
1183
+ "grad_norm": 70.95289047647472,
1184
  "learning_rate": 4.904486005914027e-08,
1185
+ "logits/chosen": -1.8401978015899658,
1186
+ "logits/rejected": -1.8537371158599854,
1187
+ "logps/chosen": -2.873885154724121,
1188
+ "logps/rejected": -3.697465419769287,
1189
+ "loss": 1.1959,
1190
  "rewards/accuracies": 0.731249988079071,
1191
+ "rewards/chosen": -5.747770309448242,
1192
+ "rewards/margins": 1.6471607685089111,
1193
+ "rewards/rejected": -7.394930839538574,
1194
  "step": 390
1195
  },
1196
  {
1197
  "epoch": 0.8270086364825961,
1198
+ "grad_norm": 67.80264650886268,
1199
  "learning_rate": 4.373541737087263e-08,
1200
+ "logits/chosen": -1.7799686193466187,
1201
+ "logits/rejected": -1.8199493885040283,
1202
+ "logps/chosen": -2.8214058876037598,
1203
+ "logps/rejected": -3.7097229957580566,
1204
+ "loss": 1.2552,
1205
+ "rewards/accuracies": 0.71875,
1206
+ "rewards/chosen": -5.6428117752075195,
1207
+ "rewards/margins": 1.7766335010528564,
1208
+ "rewards/rejected": -7.419445991516113,
1209
  "step": 395
1210
  },
1211
  {
1212
  "epoch": 0.8374771002355405,
1213
+ "grad_norm": 65.87128834814716,
1214
  "learning_rate": 3.8702478614051345e-08,
1215
+ "logits/chosen": -1.8025257587432861,
1216
+ "logits/rejected": -1.8182452917099,
1217
+ "logps/chosen": -2.924140214920044,
1218
+ "logps/rejected": -3.7234292030334473,
1219
+ "loss": 1.267,
1220
+ "rewards/accuracies": 0.7437499761581421,
1221
+ "rewards/chosen": -5.848280429840088,
1222
+ "rewards/margins": 1.5985779762268066,
1223
+ "rewards/rejected": -7.4468584060668945,
1224
  "step": 400
1225
  },
1226
  {
1227
  "epoch": 0.8374771002355405,
1228
+ "eval_logits/chosen": -1.8795839548110962,
1229
+ "eval_logits/rejected": -1.8947079181671143,
1230
+ "eval_logps/chosen": -2.906954526901245,
1231
+ "eval_logps/rejected": -3.8314788341522217,
1232
+ "eval_loss": 1.209315299987793,
1233
  "eval_rewards/accuracies": 0.7319999933242798,
1234
+ "eval_rewards/chosen": -5.81390905380249,
1235
+ "eval_rewards/margins": 1.8490480184555054,
1236
+ "eval_rewards/rejected": -7.662957668304443,
1237
+ "eval_runtime": 171.5509,
1238
+ "eval_samples_per_second": 11.658,
1239
  "eval_steps_per_second": 0.729,
1240
  "step": 400
1241
  },
1242
  {
1243
  "epoch": 0.8479455639884846,
1244
+ "grad_norm": 60.339051405492995,
1245
  "learning_rate": 3.3952790595787986e-08,
1246
+ "logits/chosen": -1.8179550170898438,
1247
+ "logits/rejected": -1.843348741531372,
1248
+ "logps/chosen": -3.005042552947998,
1249
+ "logps/rejected": -3.817539930343628,
1250
+ "loss": 1.1835,
1251
+ "rewards/accuracies": 0.706250011920929,
1252
+ "rewards/chosen": -6.010085105895996,
1253
+ "rewards/margins": 1.6249958276748657,
1254
+ "rewards/rejected": -7.635079860687256,
1255
  "step": 405
1256
  },
1257
  {
1258
  "epoch": 0.8584140277414289,
1259
+ "grad_norm": 59.27380738977581,
1260
  "learning_rate": 2.9492720416985e-08,
1261
+ "logits/chosen": -1.7845852375030518,
1262
+ "logits/rejected": -1.819650650024414,
1263
+ "logps/chosen": -2.9665215015411377,
1264
+ "logps/rejected": -3.7718894481658936,
1265
+ "loss": 1.2356,
1266
+ "rewards/accuracies": 0.668749988079071,
1267
+ "rewards/chosen": -5.933043003082275,
1268
+ "rewards/margins": 1.610735297203064,
1269
+ "rewards/rejected": -7.543778896331787,
1270
  "step": 410
1271
  },
1272
  {
1273
  "epoch": 0.8688824914943732,
1274
+ "grad_norm": 91.6278901379261,
1275
  "learning_rate": 2.5328246937043525e-08,
1276
+ "logits/chosen": -1.8167459964752197,
1277
+ "logits/rejected": -1.827742576599121,
1278
+ "logps/chosen": -3.0838284492492676,
1279
+ "logps/rejected": -3.7930634021759033,
1280
+ "loss": 1.2442,
1281
+ "rewards/accuracies": 0.6625000238418579,
1282
+ "rewards/chosen": -6.167656898498535,
1283
+ "rewards/margins": 1.4184691905975342,
1284
+ "rewards/rejected": -7.586126804351807,
1285
  "step": 415
1286
  },
1287
  {
1288
  "epoch": 0.8793509552473174,
1289
+ "grad_norm": 76.88765480732745,
1290
  "learning_rate": 2.1464952759020856e-08,
1291
+ "logits/chosen": -1.767154335975647,
1292
+ "logits/rejected": -1.7657482624053955,
1293
+ "logps/chosen": -2.938920497894287,
1294
+ "logps/rejected": -3.925022840499878,
1295
+ "loss": 1.1843,
1296
  "rewards/accuracies": 0.699999988079071,
1297
+ "rewards/chosen": -5.877840995788574,
1298
+ "rewards/margins": 1.972204566001892,
1299
+ "rewards/rejected": -7.850045680999756,
1300
  "step": 420
1301
  },
1302
  {
1303
  "epoch": 0.8898194190002617,
1304
+ "grad_norm": 71.47276585551965,
1305
  "learning_rate": 1.7908016745981856e-08,
1306
+ "logits/chosen": -1.7983713150024414,
1307
+ "logits/rejected": -1.8315776586532593,
1308
+ "logps/chosen": -2.782409429550171,
1309
+ "logps/rejected": -3.797370433807373,
1310
+ "loss": 1.1376,
1311
+ "rewards/accuracies": 0.7562500238418579,
1312
+ "rewards/chosen": -5.564818859100342,
1313
+ "rewards/margins": 2.0299227237701416,
1314
+ "rewards/rejected": -7.594740867614746,
1315
  "step": 425
1316
  },
1317
  {
1318
  "epoch": 0.9002878827532059,
1319
+ "grad_norm": 73.92788729675112,
1320
  "learning_rate": 1.4662207078575684e-08,
1321
+ "logits/chosen": -1.7802276611328125,
1322
+ "logits/rejected": -1.8125686645507812,
1323
+ "logps/chosen": -2.976722240447998,
1324
+ "logps/rejected": -3.855865955352783,
1325
+ "loss": 1.138,
1326
+ "rewards/accuracies": 0.706250011920929,
1327
+ "rewards/chosen": -5.953444480895996,
1328
+ "rewards/margins": 1.7582868337631226,
1329
+ "rewards/rejected": -7.711731910705566,
1330
  "step": 430
1331
  },
1332
  {
1333
  "epoch": 0.9107563465061502,
1334
+ "grad_norm": 67.44618318869006,
1335
  "learning_rate": 1.1731874863145142e-08,
1336
+ "logits/chosen": -1.7870285511016846,
1337
+ "logits/rejected": -1.7898231744766235,
1338
+ "logps/chosen": -2.865501880645752,
1339
+ "logps/rejected": -3.5326790809631348,
1340
+ "loss": 1.1911,
1341
  "rewards/accuracies": 0.668749988079071,
1342
+ "rewards/chosen": -5.731003761291504,
1343
+ "rewards/margins": 1.3343536853790283,
1344
+ "rewards/rejected": -7.0653581619262695,
1345
  "step": 435
1346
  },
1347
  {
1348
  "epoch": 0.9212248102590945,
1349
+ "grad_norm": 80.91405517261822,
1350
  "learning_rate": 9.12094829893642e-09,
1351
+ "logits/chosen": -1.8377736806869507,
1352
+ "logits/rejected": -1.8392757177352905,
1353
+ "logps/chosen": -2.909419536590576,
1354
+ "logps/rejected": -3.649876356124878,
1355
+ "loss": 1.2198,
1356
+ "rewards/accuracies": 0.6937500238418579,
1357
+ "rewards/chosen": -5.818839073181152,
1358
+ "rewards/margins": 1.4809141159057617,
1359
+ "rewards/rejected": -7.299752712249756,
1360
  "step": 440
1361
  },
1362
  {
1363
  "epoch": 0.9316932740120387,
1364
+ "grad_norm": 71.48155448577984,
1365
  "learning_rate": 6.832927412229017e-09,
1366
+ "logits/chosen": -1.7438195943832397,
1367
+ "logits/rejected": -1.7705223560333252,
1368
+ "logps/chosen": -2.9339725971221924,
1369
+ "logps/rejected": -3.8835110664367676,
1370
+ "loss": 1.2304,
1371
  "rewards/accuracies": 0.737500011920929,
1372
+ "rewards/chosen": -5.867945194244385,
1373
+ "rewards/margins": 1.899076223373413,
1374
+ "rewards/rejected": -7.767022132873535,
1375
  "step": 445
1376
  },
1377
  {
1378
  "epoch": 0.942161737764983,
1379
+ "grad_norm": 75.54710086328386,
1380
  "learning_rate": 4.8708793644441086e-09,
1381
+ "logits/chosen": -1.8001352548599243,
1382
+ "logits/rejected": -1.8404371738433838,
1383
+ "logps/chosen": -2.698850154876709,
1384
+ "logps/rejected": -3.7303435802459717,
1385
+ "loss": 1.1496,
1386
+ "rewards/accuracies": 0.8062499761581421,
1387
+ "rewards/chosen": -5.397700309753418,
1388
+ "rewards/margins": 2.0629868507385254,
1389
+ "rewards/rejected": -7.460687160491943,
1390
  "step": 450
1391
  },
1392
  {
1393
  "epoch": 0.9526302015179272,
1394
+ "grad_norm": 67.21839705145132,
1395
  "learning_rate": 3.2374343405217884e-09,
1396
+ "logits/chosen": -1.7952674627304077,
1397
+ "logits/rejected": -1.7791658639907837,
1398
+ "logps/chosen": -2.888683795928955,
1399
+ "logps/rejected": -3.7380385398864746,
1400
+ "loss": 1.2744,
1401
  "rewards/accuracies": 0.706250011920929,
1402
+ "rewards/chosen": -5.77736759185791,
1403
+ "rewards/margins": 1.6987104415893555,
1404
+ "rewards/rejected": -7.476077079772949,
1405
  "step": 455
1406
  },
1407
  {
1408
  "epoch": 0.9630986652708715,
1409
+ "grad_norm": 83.08799813263991,
1410
  "learning_rate": 1.9347820230782295e-09,
1411
+ "logits/chosen": -1.7771613597869873,
1412
+ "logits/rejected": -1.8283106088638306,
1413
+ "logps/chosen": -2.847031831741333,
1414
+ "logps/rejected": -3.8970909118652344,
1415
+ "loss": 1.2244,
1416
+ "rewards/accuracies": 0.8125,
1417
+ "rewards/chosen": -5.694063663482666,
1418
+ "rewards/margins": 2.1001174449920654,
1419
+ "rewards/rejected": -7.794181823730469,
1420
  "step": 460
1421
  },
1422
  {
1423
  "epoch": 0.9735671290238157,
1424
+ "grad_norm": 55.2538190833758,
1425
  "learning_rate": 9.64668657069706e-10,
1426
+ "logits/chosen": -1.8154325485229492,
1427
+ "logits/rejected": -1.8061832189559937,
1428
+ "logps/chosen": -2.8277297019958496,
1429
+ "logps/rejected": -3.931652069091797,
1430
+ "loss": 1.1408,
1431
+ "rewards/accuracies": 0.762499988079071,
1432
+ "rewards/chosen": -5.655459403991699,
1433
+ "rewards/margins": 2.2078452110290527,
1434
+ "rewards/rejected": -7.863304138183594,
1435
  "step": 465
1436
  },
1437
  {
1438
  "epoch": 0.98403559277676,
1439
+ "grad_norm": 63.55697491973192,
1440
  "learning_rate": 3.2839470889836627e-10,
1441
+ "logits/chosen": -1.762578010559082,
1442
+ "logits/rejected": -1.7515432834625244,
1443
+ "logps/chosen": -2.639904022216797,
1444
+ "logps/rejected": -3.7671687602996826,
1445
+ "loss": 1.1961,
1446
+ "rewards/accuracies": 0.762499988079071,
1447
+ "rewards/chosen": -5.279808044433594,
1448
+ "rewards/margins": 2.2545294761657715,
1449
+ "rewards/rejected": -7.534337520599365,
1450
  "step": 470
1451
  },
1452
  {
1453
  "epoch": 0.9945040565297043,
1454
+ "grad_norm": 96.99566903301525,
1455
  "learning_rate": 2.6813123097352287e-11,
1456
+ "logits/chosen": -1.8280794620513916,
1457
+ "logits/rejected": -1.8603473901748657,
1458
+ "logps/chosen": -2.8405585289001465,
1459
+ "logps/rejected": -3.6063880920410156,
1460
+ "loss": 1.1728,
1461
+ "rewards/accuracies": 0.6937500238418579,
1462
+ "rewards/chosen": -5.681117057800293,
1463
+ "rewards/margins": 1.5316593647003174,
1464
+ "rewards/rejected": -7.212776184082031,
1465
  "step": 475
1466
  },
1467
  {
1468
  "epoch": 0.998691442030882,
1469
  "step": 477,
1470
  "total_flos": 0.0,
1471
+ "train_loss": 1.3686470238167785,
1472
+ "train_runtime": 22217.0339,
1473
+ "train_samples_per_second": 2.752,
1474
+ "train_steps_per_second": 0.021
1475
  }
1476
  ],
1477
  "logging_steps": 5,