{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 50,
  "global_step": 352,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.028409090909090908,
      "grad_norm": 31.782667280345194,
      "learning_rate": 1.3888888888888888e-07,
      "logits/chosen": -2.8591694831848145,
      "logits/rejected": -2.6428685188293457,
      "logps/chosen": -390.5384216308594,
      "logps/rejected": -607.8155517578125,
      "loss": 0.6897,
      "rewards/accuracies": 0.581250011920929,
      "rewards/chosen": 0.0022137626074254513,
      "rewards/margins": 0.013292843475937843,
      "rewards/rejected": -0.011079080402851105,
      "step": 10
    },
    {
      "epoch": 0.056818181818181816,
      "grad_norm": 18.732725603737485,
      "learning_rate": 2.7777777777777776e-07,
      "logits/chosen": -2.8422160148620605,
      "logits/rejected": -2.694746494293213,
      "logps/chosen": -328.73382568359375,
      "logps/rejected": -775.1841430664062,
      "loss": 0.556,
      "rewards/accuracies": 0.9937499761581421,
      "rewards/chosen": 0.08411312848329544,
      "rewards/margins": 0.6615578532218933,
      "rewards/rejected": -0.5774446725845337,
      "step": 20
    },
    {
      "epoch": 0.08522727272727272,
      "grad_norm": 5.623138797985183,
      "learning_rate": 4.1666666666666667e-07,
      "logits/chosen": -2.8572211265563965,
      "logits/rejected": -2.6727840900421143,
      "logps/chosen": -296.5441589355469,
      "logps/rejected": -1129.43017578125,
      "loss": 0.2375,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.3573785722255707,
      "rewards/margins": 4.872136116027832,
      "rewards/rejected": -4.5147576332092285,
      "step": 30
    },
    {
      "epoch": 0.11363636363636363,
      "grad_norm": 1.8250836888812216,
      "learning_rate": 4.998023493068254e-07,
      "logits/chosen": -2.813652753829956,
      "logits/rejected": -2.606616258621216,
      "logps/chosen": -323.4709167480469,
      "logps/rejected": -2253.32470703125,
      "loss": 0.0646,
      "rewards/accuracies": 1.0,
      "rewards/chosen": 0.23284561932086945,
      "rewards/margins": 15.6871337890625,
      "rewards/rejected": -15.4542875289917,
      "step": 40
    },
    {
      "epoch": 0.14204545454545456,
      "grad_norm": 1.2011523237420525,
      "learning_rate": 4.975823666181255e-07,
      "logits/chosen": -2.769263744354248,
      "logits/rejected": -2.4520652294158936,
      "logps/chosen": -483.3968200683594,
      "logps/rejected": -4912.21728515625,
      "loss": 0.0079,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.2076160907745361,
      "rewards/margins": 41.37514877319336,
      "rewards/rejected": -42.582763671875,
      "step": 50
    },
    {
      "epoch": 0.14204545454545456,
      "eval_logits/chosen": -2.8844106197357178,
      "eval_logits/rejected": -2.4010605812072754,
      "eval_logps/chosen": -508.0701904296875,
      "eval_logps/rejected": -5776.9990234375,
      "eval_loss": 0.005187372677028179,
      "eval_rewards/accuracies": 0.9959677457809448,
      "eval_rewards/chosen": -1.4108844995498657,
      "eval_rewards/margins": 49.95121765136719,
      "eval_rewards/rejected": -51.36210250854492,
      "eval_runtime": 197.1461,
      "eval_samples_per_second": 19.808,
      "eval_steps_per_second": 0.314,
      "step": 50
    },
    {
      "epoch": 0.17045454545454544,
      "grad_norm": 0.511680193897244,
      "learning_rate": 4.929173350101024e-07,
      "logits/chosen": -2.8946661949157715,
      "logits/rejected": -2.239091396331787,
      "logps/chosen": -522.5350341796875,
      "logps/rejected": -6444.09033203125,
      "loss": 0.0077,
      "rewards/accuracies": 0.9937499761581421,
      "rewards/chosen": -1.6264005899429321,
      "rewards/margins": 56.385963439941406,
      "rewards/rejected": -58.012359619140625,
      "step": 60
    },
    {
      "epoch": 0.19886363636363635,
      "grad_norm": 0.2533965055751647,
      "learning_rate": 4.858533249305336e-07,
      "logits/chosen": -2.7772889137268066,
      "logits/rejected": -1.765015959739685,
      "logps/chosen": -555.8414306640625,
      "logps/rejected": -6928.0419921875,
      "loss": 0.0037,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.7668079137802124,
      "rewards/margins": 61.386253356933594,
      "rewards/rejected": -63.1530647277832,
      "step": 70
    },
    {
      "epoch": 0.22727272727272727,
      "grad_norm": 5.78873598153483,
      "learning_rate": 4.764600984163808e-07,
      "logits/chosen": -2.8613221645355225,
      "logits/rejected": -1.6336866617202759,
      "logps/chosen": -575.8856201171875,
      "logps/rejected": -8230.505859375,
      "loss": 0.003,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.6956470012664795,
      "rewards/margins": 73.87208557128906,
      "rewards/rejected": -75.56773376464844,
      "step": 80
    },
    {
      "epoch": 0.2556818181818182,
      "grad_norm": 0.05441796989473776,
      "learning_rate": 4.6483042014491527e-07,
      "logits/chosen": -2.866097927093506,
      "logits/rejected": -1.7091907262802124,
      "logps/chosen": -556.7591552734375,
      "logps/rejected": -7405.0654296875,
      "loss": 0.0112,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.946458101272583,
      "rewards/margins": 65.7303466796875,
      "rewards/rejected": -67.67679595947266,
      "step": 90
    },
    {
      "epoch": 0.2840909090909091,
      "grad_norm": 1.0199863397394129,
      "learning_rate": 4.510791413176912e-07,
      "logits/chosen": -2.6974339485168457,
      "logits/rejected": -0.42245426774024963,
      "logps/chosen": -575.6227416992188,
      "logps/rejected": -9247.611328125,
      "loss": 0.0031,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.0615272521972656,
      "rewards/margins": 83.35676574707031,
      "rewards/rejected": -85.41829681396484,
      "step": 100
    },
    {
      "epoch": 0.2840909090909091,
      "eval_logits/chosen": -2.4172048568725586,
      "eval_logits/rejected": 0.017224134877324104,
      "eval_logps/chosen": -573.0657348632812,
      "eval_logps/rejected": -8932.8349609375,
      "eval_loss": 0.001179259386844933,
      "eval_rewards/accuracies": 1.0,
      "eval_rewards/chosen": -2.060839891433716,
      "eval_rewards/margins": 80.85962677001953,
      "eval_rewards/rejected": -82.92045593261719,
      "eval_runtime": 196.8335,
      "eval_samples_per_second": 19.839,
      "eval_steps_per_second": 0.315,
      "step": 100
    },
    {
      "epoch": 0.3125,
      "grad_norm": 0.0720935751683844,
      "learning_rate": 4.353420654246546e-07,
      "logits/chosen": -2.386807441711426,
      "logits/rejected": -0.0012192248832434416,
      "logps/chosen": -584.6697998046875,
      "logps/rejected": -7824.6484375,
      "loss": 0.0034,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.0924859046936035,
      "rewards/margins": 70.16132354736328,
      "rewards/rejected": -72.2538070678711,
      "step": 110
    },
    {
      "epoch": 0.3409090909090909,
      "grad_norm": 0.13283878655315423,
      "learning_rate": 4.177746070897592e-07,
      "logits/chosen": -2.329523801803589,
      "logits/rejected": 0.7127262949943542,
      "logps/chosen": -563.5079956054688,
      "logps/rejected": -7580.98974609375,
      "loss": 0.007,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.8199115991592407,
      "rewards/margins": 67.61772155761719,
      "rewards/rejected": -69.43761444091797,
      "step": 120
    },
    {
      "epoch": 0.3693181818181818,
      "grad_norm": 0.24292729812234612,
      "learning_rate": 3.9855025724292763e-07,
      "logits/chosen": -2.526569366455078,
      "logits/rejected": 1.3170499801635742,
      "logps/chosen": -577.057861328125,
      "logps/rejected": -8586.48046875,
      "loss": 0.0012,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.0660626888275146,
      "rewards/margins": 77.39335632324219,
      "rewards/rejected": -79.45941925048828,
      "step": 130
    },
    {
      "epoch": 0.3977272727272727,
      "grad_norm": 0.9183173633670716,
      "learning_rate": 3.7785886977585555e-07,
      "logits/chosen": -2.464595317840576,
      "logits/rejected": 1.4278135299682617,
      "logps/chosen": -540.9710083007812,
      "logps/rejected": -10038.2763671875,
      "loss": 0.0016,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.749346375465393,
      "rewards/margins": 92.21351623535156,
      "rewards/rejected": -93.96287536621094,
      "step": 140
    },
    {
      "epoch": 0.42613636363636365,
      "grad_norm": 0.6993679548084264,
      "learning_rate": 3.5590478660213206e-07,
      "logits/chosen": -2.191847562789917,
      "logits/rejected": 1.6170787811279297,
      "logps/chosen": -609.4722900390625,
      "logps/rejected": -9242.5185546875,
      "loss": 0.0016,
      "rewards/accuracies": 0.9937499761581421,
      "rewards/chosen": -2.3140909671783447,
      "rewards/margins": 82.9913101196289,
      "rewards/rejected": -85.30540466308594,
      "step": 150
    },
    {
      "epoch": 0.42613636363636365,
      "eval_logits/chosen": -2.19913911819458,
      "eval_logits/rejected": 1.8917714357376099,
      "eval_logps/chosen": -571.1802368164062,
      "eval_logps/rejected": -8522.7255859375,
      "eval_loss": 0.0007830065442249179,
      "eval_rewards/accuracies": 1.0,
      "eval_rewards/chosen": -2.041984796524048,
      "eval_rewards/margins": 76.77738952636719,
      "eval_rewards/rejected": -78.81936645507812,
      "eval_runtime": 195.9934,
      "eval_samples_per_second": 19.924,
      "eval_steps_per_second": 0.316,
      "step": 150
    },
    {
      "epoch": 0.45454545454545453,
      "grad_norm": 1.8645892720332131,
      "learning_rate": 3.3290481963801696e-07,
      "logits/chosen": -2.4417898654937744,
      "logits/rejected": 1.9294321537017822,
      "logps/chosen": -544.4041748046875,
      "logps/rejected": -8048.359375,
      "loss": 0.0004,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.7426624298095703,
      "rewards/margins": 72.57603454589844,
      "rewards/rejected": -74.31868743896484,
      "step": 160
    },
    {
      "epoch": 0.48295454545454547,
      "grad_norm": 0.026359625456043548,
      "learning_rate": 3.0908610963322626e-07,
      "logits/chosen": -2.6247737407684326,
      "logits/rejected": 1.3285863399505615,
      "logps/chosen": -582.7529907226562,
      "logps/rejected": -8232.6865234375,
      "loss": 0.0027,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.9513471126556396,
      "rewards/margins": 73.0924072265625,
      "rewards/rejected": -75.04375457763672,
      "step": 170
    },
    {
      "epoch": 0.5113636363636364,
      "grad_norm": 0.023938485830486744,
      "learning_rate": 2.846838829972671e-07,
      "logits/chosen": -2.2369513511657715,
      "logits/rejected": 2.0209977626800537,
      "logps/chosen": -574.5399780273438,
      "logps/rejected": -7650.44775390625,
      "loss": 0.0056,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.136356830596924,
      "rewards/margins": 68.23828887939453,
      "rewards/rejected": -70.37464904785156,
      "step": 180
    },
    {
      "epoch": 0.5397727272727273,
      "grad_norm": 0.22173170076093188,
      "learning_rate": 2.5993912877423147e-07,
      "logits/chosen": -1.6255722045898438,
      "logits/rejected": 1.8459784984588623,
      "logps/chosen": -532.0623168945312,
      "logps/rejected": -8512.8662109375,
      "loss": 0.0009,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.864465355873108,
      "rewards/margins": 76.34809875488281,
      "rewards/rejected": -78.21255493164062,
      "step": 190
    },
    {
      "epoch": 0.5681818181818182,
      "grad_norm": 0.7897968463196222,
      "learning_rate": 2.3509621870754504e-07,
      "logits/chosen": -1.029783010482788,
      "logits/rejected": 2.887779474258423,
      "logps/chosen": -551.5975952148438,
      "logps/rejected": -8021.84912109375,
      "loss": 0.0015,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.8646786212921143,
      "rewards/margins": 72.0453872680664,
      "rewards/rejected": -73.91007232666016,
      "step": 200
    },
    {
      "epoch": 0.5681818181818182,
      "eval_logits/chosen": -0.8412286043167114,
      "eval_logits/rejected": 2.5732452869415283,
      "eval_logps/chosen": -564.5499877929688,
      "eval_logps/rejected": -8622.9443359375,
      "eval_loss": 0.0007432692218571901,
      "eval_rewards/accuracies": 1.0,
      "eval_rewards/chosen": -1.9756826162338257,
      "eval_rewards/margins": 77.84588623046875,
      "eval_rewards/rejected": -79.8215560913086,
      "eval_runtime": 196.9649,
      "eval_samples_per_second": 19.826,
      "eval_steps_per_second": 0.315,
      "step": 200
    },
    {
      "epoch": 0.5965909090909091,
      "grad_norm": 0.4603520409149027,
      "learning_rate": 2.1040049389819624e-07,
      "logits/chosen": -0.8681282997131348,
      "logits/rejected": 2.764690399169922,
      "logps/chosen": -578.1253662109375,
      "logps/rejected": -8362.7060546875,
      "loss": 0.0032,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.8748207092285156,
      "rewards/margins": 75.43318939208984,
      "rewards/rejected": -77.30801391601562,
      "step": 210
    },
    {
      "epoch": 0.625,
      "grad_norm": 2.35697501687302,
      "learning_rate": 1.8609584188988133e-07,
      "logits/chosen": -1.0785493850708008,
      "logits/rejected": 2.1844732761383057,
      "logps/chosen": -606.7586059570312,
      "logps/rejected": -7175.0869140625,
      "loss": 0.0036,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.4215800762176514,
      "rewards/margins": 62.83292770385742,
      "rewards/rejected": -65.25450897216797,
      "step": 220
    },
    {
      "epoch": 0.6534090909090909,
      "grad_norm": 22.84103924366115,
      "learning_rate": 1.624222881090439e-07,
      "logits/chosen": -1.365192174911499,
      "logits/rejected": 1.863221526145935,
      "logps/chosen": -607.694091796875,
      "logps/rejected": -8029.53759765625,
      "loss": 0.0058,
      "rewards/accuracies": 0.9937499761581421,
      "rewards/chosen": -2.130295515060425,
      "rewards/margins": 71.30926513671875,
      "rewards/rejected": -73.43955993652344,
      "step": 230
    },
    {
      "epoch": 0.6818181818181818,
      "grad_norm": 0.4015693433898369,
      "learning_rate": 1.3961362544602212e-07,
      "logits/chosen": -1.195718765258789,
      "logits/rejected": 2.2943453788757324,
      "logps/chosen": -563.174560546875,
      "logps/rejected": -6934.9736328125,
      "loss": 0.0015,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.8716214895248413,
      "rewards/margins": 61.537391662597656,
      "rewards/rejected": -63.409019470214844,
      "step": 240
    },
    {
      "epoch": 0.7102272727272727,
      "grad_norm": 0.304169201785893,
      "learning_rate": 1.1789510538684522e-07,
      "logits/chosen": -1.2848708629608154,
      "logits/rejected": 2.0380046367645264,
      "logps/chosen": -532.541015625,
      "logps/rejected": -8912.1201171875,
      "loss": 0.0016,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.8050743341445923,
      "rewards/margins": 80.02030944824219,
      "rewards/rejected": -81.82538604736328,
      "step": 250
    },
    {
      "epoch": 0.7102272727272727,
      "eval_logits/chosen": -1.2004659175872803,
      "eval_logits/rejected": 2.2580416202545166,
      "eval_logps/chosen": -549.9619750976562,
      "eval_logps/rejected": -7654.19775390625,
      "eval_loss": 0.0007973507163114846,
      "eval_rewards/accuracies": 1.0,
      "eval_rewards/chosen": -1.829802393913269,
      "eval_rewards/margins": 68.30428314208984,
      "eval_rewards/rejected": -70.13408660888672,
      "eval_runtime": 195.5911,
      "eval_samples_per_second": 19.965,
      "eval_steps_per_second": 0.317,
      "step": 250
    },
    {
      "epoch": 0.7386363636363636,
      "grad_norm": 0.5568394356105595,
      "learning_rate": 9.748121349736891e-08,
      "logits/chosen": -1.168460488319397,
      "logits/rejected": 2.0924360752105713,
      "logps/chosen": -588.483642578125,
      "logps/rejected": -7967.40478515625,
      "loss": 0.0008,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -2.046555995941162,
      "rewards/margins": 70.84620666503906,
      "rewards/rejected": -72.89276885986328,
      "step": 260
    },
    {
      "epoch": 0.7670454545454546,
      "grad_norm": 0.07097046676874795,
      "learning_rate": 7.857355122839673e-08,
      "logits/chosen": -1.3962290287017822,
      "logits/rejected": 1.8883154392242432,
      "logps/chosen": -566.6204833984375,
      "logps/rejected": -7627.25634765625,
      "loss": 0.0008,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.8577096462249756,
      "rewards/margins": 67.72525024414062,
      "rewards/rejected": -69.58296203613281,
      "step": 270
    },
    {
      "epoch": 0.7954545454545454,
      "grad_norm": 0.05520595932113315,
      "learning_rate": 6.135884496044244e-08,
      "logits/chosen": -1.3116520643234253,
      "logits/rejected": 2.1938819885253906,
      "logps/chosen": -557.5162353515625,
      "logps/rejected": -7626.7578125,
      "loss": 0.0013,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.7834333181381226,
      "rewards/margins": 68.03524017333984,
      "rewards/rejected": -69.81867980957031,
      "step": 280
    },
    {
      "epoch": 0.8238636363636364,
      "grad_norm": 0.05646545972920247,
      "learning_rate": 4.600710195020982e-08,
      "logits/chosen": -1.1199910640716553,
      "logits/rejected": 2.300516128540039,
      "logps/chosen": -577.44482421875,
      "logps/rejected": -7700.6279296875,
      "loss": 0.0024,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.8830159902572632,
      "rewards/margins": 68.71068572998047,
      "rewards/rejected": -70.59370422363281,
      "step": 290
    },
    {
      "epoch": 0.8522727272727273,
      "grad_norm": 0.5065392892121787,
      "learning_rate": 3.2669931390104374e-08,
      "logits/chosen": -1.1116163730621338,
      "logits/rejected": 2.1315221786499023,
      "logps/chosen": -531.3294677734375,
      "logps/rejected": -8432.990234375,
      "loss": 0.0008,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.7507492303848267,
      "rewards/margins": 76.37090301513672,
      "rewards/rejected": -78.12163543701172,
      "step": 300
    },
    {
      "epoch": 0.8522727272727273,
      "eval_logits/chosen": -0.9558252096176147,
      "eval_logits/rejected": 2.2921457290649414,
      "eval_logps/chosen": -557.7684936523438,
      "eval_logps/rejected": -8073.39990234375,
      "eval_loss": 0.0005955722881481051,
      "eval_rewards/accuracies": 1.0,
      "eval_rewards/chosen": -1.907867670059204,
      "eval_rewards/margins": 72.41824340820312,
      "eval_rewards/rejected": -74.32611846923828,
      "eval_runtime": 196.4288,
      "eval_samples_per_second": 19.88,
      "eval_steps_per_second": 0.316,
      "step": 300
    },
    {
      "epoch": 0.8806818181818182,
      "grad_norm": 0.11084443137467888,
      "learning_rate": 2.147904716149135e-08,
      "logits/chosen": -0.904153048992157,
      "logits/rejected": 2.1967928409576416,
      "logps/chosen": -541.1990966796875,
      "logps/rejected": -8187.125,
      "loss": 0.002,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.7913249731063843,
      "rewards/margins": 73.07491302490234,
      "rewards/rejected": -74.86624145507812,
      "step": 310
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.06043015851279257,
      "learning_rate": 1.254496706805433e-08,
      "logits/chosen": -1.1182914972305298,
      "logits/rejected": 2.186739206314087,
      "logps/chosen": -570.1544189453125,
      "logps/rejected": -8264.416015625,
      "loss": 0.0005,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.8748136758804321,
      "rewards/margins": 74.24952697753906,
      "rewards/rejected": -76.12433624267578,
      "step": 320
    },
    {
      "epoch": 0.9375,
      "grad_norm": 0.14908479424865714,
      "learning_rate": 5.955921395237318e-09,
      "logits/chosen": -0.9981291890144348,
      "logits/rejected": 2.14846134185791,
      "logps/chosen": -528.0042114257812,
      "logps/rejected": -8246.408203125,
      "loss": 0.0004,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.727360486984253,
      "rewards/margins": 74.25918579101562,
      "rewards/rejected": -75.98654174804688,
      "step": 330
    },
    {
      "epoch": 0.9659090909090909,
      "grad_norm": 0.18250364966205163,
      "learning_rate": 1.7769815745066474e-09,
      "logits/chosen": -1.248327612876892,
      "logits/rejected": 2.005960702896118,
      "logps/chosen": -543.0258178710938,
      "logps/rejected": -7556.515625,
      "loss": 0.0008,
      "rewards/accuracies": 1.0,
      "rewards/chosen": -1.7404506206512451,
      "rewards/margins": 67.73704528808594,
      "rewards/rejected": -69.47749328613281,
      "step": 340
    },
    {
      "epoch": 0.9943181818181818,
      "grad_norm": 1.0466321976770805,
      "learning_rate": 4.9417557483610875e-11,
      "logits/chosen": -0.9777113199234009,
      "logits/rejected": 2.094377040863037,
      "logps/chosen": -576.5343017578125,
      "logps/rejected": -8160.74853515625,
      "loss": 0.0029,
      "rewards/accuracies": 0.9937499761581421,
      "rewards/chosen": -2.0226657390594482,
      "rewards/margins": 73.10923767089844,
      "rewards/rejected": -75.13190460205078,
      "step": 350
    },
    {
      "epoch": 0.9943181818181818,
      "eval_logits/chosen": -0.908790111541748,
      "eval_logits/rejected": 2.3223326206207275,
      "eval_logps/chosen": -561.191650390625,
      "eval_logps/rejected": -8146.36962890625,
      "eval_loss": 0.0005959240952506661,
      "eval_rewards/accuracies": 1.0,
      "eval_rewards/chosen": -1.9420990943908691,
      "eval_rewards/margins": 73.11370849609375,
      "eval_rewards/rejected": -75.0558090209961,
      "eval_runtime": 196.8349,
      "eval_samples_per_second": 19.839,
      "eval_steps_per_second": 0.315,
      "step": 350
    },
    {
      "epoch": 1.0,
      "step": 352,
      "total_flos": 0.0,
      "train_loss": 0.0465552191166022,
      "train_runtime": 10297.8682,
      "train_samples_per_second": 4.37,
      "train_steps_per_second": 0.034
    }
  ],
  "logging_steps": 10,
  "max_steps": 352,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}