{
  "best_metric": 14.074385643005371,
  "best_model_checkpoint": "./qwen2.5-0.5b/qwen2.5-0.5b-expo-L1EXPO-ES-10/checkpoint-700",
  "epoch": 3.4057628719886632,
  "eval_steps": 50,
  "global_step": 1200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "dpo_loss": 0.6931471824645996,
      "epoch": 0.002834199338686821,
      "grad_norm": 3688.5065763773923,
      "learning_rate": 2.840909090909091e-08,
      "logits": -1.359458565711975,
      "logps": -84.69721221923828,
      "loss": 0.0051,
      "objective": 0.0046141319908201694,
      "ranking_idealized": 0.5833333134651184,
      "ranking_idealized_expo": 0.5833333134651184,
      "ranking_simple": 0.5833333134651184,
      "regularize": 0.0046141319908201694,
      "step": 1,
      "wo_beta": 14.840873718261719
    },
    {
      "dpo_loss": 2.3264636993408203,
      "epoch": 0.14170996693434104,
      "grad_norm": 3322.6545378359765,
      "learning_rate": 1.4204545454545458e-06,
      "logits": -1.454339623451233,
      "logps": -84.50347900390625,
      "loss": 4.2778,
      "objective": 4.120908737182617,
      "ranking_idealized": 0.5225340127944946,
      "ranking_idealized_expo": 0.5216836929321289,
      "ranking_simple": 0.521258533000946,
      "regularize": 4.120908737182617,
      "step": 50,
      "wo_beta": 15.655658721923828
    },
    {
      "epoch": 0.14170996693434104,
      "eval_dpo_loss": 2.8787100315093994,
      "eval_logits": -1.4301204681396484,
      "eval_logps": -91.78133392333984,
      "eval_loss": 5.651101589202881,
      "eval_objective": 5.578580379486084,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5243270993232727,
      "eval_regularize": 5.578580379486084,
      "eval_runtime": 307.7497,
      "eval_samples_per_second": 18.814,
      "eval_steps_per_second": 1.569,
      "eval_wo_beta": 16.107044219970703,
      "step": 50
    },
    {
      "dpo_loss": 8.834875106811523,
      "epoch": 0.2834199338686821,
      "grad_norm": 2883.7985857251942,
      "learning_rate": 2.8409090909090916e-06,
      "logits": -1.3840159177780151,
      "logps": -82.65471649169922,
      "loss": 17.3516,
      "objective": 17.624128341674805,
      "ranking_idealized": 0.5141666531562805,
      "ranking_idealized_expo": 0.5137500166893005,
      "ranking_simple": 0.5179166793823242,
      "regularize": 17.624128341674805,
      "step": 100,
      "wo_beta": 15.28693675994873
    },
    {
      "epoch": 0.2834199338686821,
      "eval_dpo_loss": 7.968704700469971,
      "eval_logits": -1.3171318769454956,
      "eval_logps": -86.66349792480469,
      "eval_loss": 15.683460235595703,
      "eval_objective": 15.754798889160156,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5279502868652344,
      "eval_regularize": 15.754798889160156,
      "eval_runtime": 307.2479,
      "eval_samples_per_second": 18.845,
      "eval_steps_per_second": 1.572,
      "eval_wo_beta": 15.626057624816895,
      "step": 100
    },
    {
      "dpo_loss": 14.751253128051758,
      "epoch": 0.42512990080302315,
      "grad_norm": 2254.7228314416952,
      "learning_rate": 4.2613636363636365e-06,
      "logits": -1.1572282314300537,
      "logps": -80.76160430908203,
      "loss": 28.6009,
      "objective": 28.620296478271484,
      "ranking_idealized": 0.5287500023841858,
      "ranking_idealized_expo": 0.527916669845581,
      "ranking_simple": 0.5266666412353516,
      "regularize": 28.620296478271484,
      "step": 150,
      "wo_beta": 15.1625394821167
    },
    {
      "epoch": 0.42512990080302315,
      "eval_dpo_loss": 15.000219345092773,
      "eval_logits": -1.1259195804595947,
      "eval_logps": -81.49861145019531,
      "eval_loss": 29.075258255004883,
      "eval_objective": 28.90445327758789,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5243270993232727,
      "eval_regularize": 28.90445327758789,
      "eval_runtime": 307.0327,
      "eval_samples_per_second": 18.858,
      "eval_steps_per_second": 1.573,
      "eval_wo_beta": 15.236913681030273,
      "step": 150
    },
    {
      "dpo_loss": 18.452308654785156,
      "epoch": 0.5668398677373642,
      "grad_norm": 2255.157628060128,
      "learning_rate": 4.997168347957521e-06,
      "logits": -0.9300950169563293,
      "logps": -76.25523376464844,
      "loss": 35.0698,
      "objective": 35.79060745239258,
      "ranking_idealized": 0.51583331823349,
      "ranking_idealized_expo": 0.51541668176651,
      "ranking_simple": 0.5104166865348816,
      "regularize": 35.79060745239258,
      "step": 200,
      "wo_beta": 15.353928565979004
    },
    {
      "epoch": 0.5668398677373642,
      "eval_dpo_loss": 21.391788482666016,
      "eval_logits": -0.8775973916053772,
      "eval_logps": -82.15784454345703,
      "eval_loss": 41.12628173828125,
      "eval_objective": 40.45929718017578,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5124223828315735,
      "eval_regularize": 40.45929718017578,
      "eval_runtime": 307.0481,
      "eval_samples_per_second": 18.857,
      "eval_steps_per_second": 1.573,
      "eval_wo_beta": 14.911209106445312,
      "step": 200
    },
    {
      "dpo_loss": 19.97781753540039,
      "epoch": 0.7085498346717053,
      "grad_norm": 1862.9598435340467,
      "learning_rate": 4.973122855144066e-06,
      "logits": -0.7163826823234558,
      "logps": -77.4970932006836,
      "loss": 37.7822,
      "objective": 38.173789978027344,
      "ranking_idealized": 0.5166666507720947,
      "ranking_idealized_expo": 0.5162500143051147,
      "ranking_simple": 0.5112500190734863,
      "regularize": 38.173789978027344,
      "step": 250,
      "wo_beta": 15.578652381896973
    },
    {
      "epoch": 0.7085498346717053,
      "eval_dpo_loss": 21.928752899169922,
      "eval_logits": -0.641853392124176,
      "eval_logps": -83.0038833618164,
      "eval_loss": 44.07463836669922,
      "eval_objective": 43.393341064453125,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5279502868652344,
      "eval_regularize": 43.393341064453125,
      "eval_runtime": 307.199,
      "eval_samples_per_second": 18.848,
      "eval_steps_per_second": 1.572,
      "eval_wo_beta": 14.620430946350098,
      "step": 250
    },
    {
      "dpo_loss": 17.413480758666992,
      "epoch": 0.8502598016060463,
      "grad_norm": 1744.6732754071961,
      "learning_rate": 4.924776641419513e-06,
      "logits": -0.40934881567955017,
      "logps": -79.10726165771484,
      "loss": 35.2811,
      "objective": 35.4559326171875,
      "ranking_idealized": 0.4962500035762787,
      "ranking_idealized_expo": 0.4950000047683716,
      "ranking_simple": 0.502916693687439,
      "regularize": 35.4559326171875,
      "step": 300,
      "wo_beta": 15.202095031738281
    },
    {
      "epoch": 0.8502598016060463,
      "eval_dpo_loss": 21.43065071105957,
      "eval_logits": -0.5315975546836853,
      "eval_logps": -83.84294891357422,
      "eval_loss": 43.6626091003418,
      "eval_objective": 43.46427536010742,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5320910811424255,
      "eval_regularize": 43.46427536010742,
      "eval_runtime": 307.206,
      "eval_samples_per_second": 18.847,
      "eval_steps_per_second": 1.572,
      "eval_wo_beta": 14.544736862182617,
      "step": 300
    },
    {
      "dpo_loss": 17.524351119995117,
      "epoch": 0.9919697685403873,
      "grad_norm": 1787.213275862853,
      "learning_rate": 4.8526047530778175e-06,
      "logits": -0.5016722679138184,
      "logps": -80.09149169921875,
      "loss": 33.8034,
      "objective": 34.494503021240234,
      "ranking_idealized": 0.5262500047683716,
      "ranking_idealized_expo": 0.5254166722297668,
      "ranking_simple": 0.5249999761581421,
      "regularize": 34.494503021240234,
      "step": 350,
      "wo_beta": 15.207830429077148
    },
    {
      "epoch": 0.9919697685403873,
      "eval_dpo_loss": 23.330080032348633,
      "eval_logits": -0.593406081199646,
      "eval_logps": -84.05725860595703,
      "eval_loss": 45.264923095703125,
      "eval_objective": 45.35862731933594,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.523809552192688,
      "eval_regularize": 45.35862731933594,
      "eval_runtime": 307.0631,
      "eval_samples_per_second": 18.856,
      "eval_steps_per_second": 1.573,
      "eval_wo_beta": 14.60231876373291,
      "step": 350
    },
    {
      "dpo_loss": 16.205705642700195,
      "epoch": 1.1336797354747283,
      "grad_norm": 1658.338167111395,
      "learning_rate": 4.757316345716554e-06,
      "logits": -0.5499605536460876,
      "logps": -80.1341552734375,
      "loss": 30.8702,
      "objective": 30.992847442626953,
      "ranking_idealized": 0.5333333611488342,
      "ranking_idealized_expo": 0.5320833325386047,
      "ranking_simple": 0.528333306312561,
      "regularize": 30.992847442626953,
      "step": 400,
      "wo_beta": 15.376312255859375
    },
    {
      "epoch": 1.1336797354747283,
      "eval_dpo_loss": 23.827035903930664,
      "eval_logits": -0.62712162733078,
      "eval_logps": -82.20217895507812,
      "eval_loss": 47.269775390625,
      "eval_objective": 47.26739501953125,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5248447060585022,
      "eval_regularize": 47.26739501953125,
      "eval_runtime": 307.8491,
      "eval_samples_per_second": 18.808,
      "eval_steps_per_second": 1.569,
      "eval_wo_beta": 14.336685180664062,
      "step": 400
    },
    {
      "dpo_loss": 14.983359336853027,
      "epoch": 1.2753897024090695,
      "grad_norm": 1630.7914622079197,
      "learning_rate": 4.639847716126855e-06,
      "logits": -0.5104279518127441,
      "logps": -78.46994018554688,
      "loss": 29.5027,
      "objective": 29.416109085083008,
      "ranking_idealized": 0.5195833444595337,
      "ranking_idealized_expo": 0.5191666483879089,
      "ranking_simple": 0.5170833468437195,
      "regularize": 29.416109085083008,
      "step": 450,
      "wo_beta": 16.006542205810547
    },
    {
      "epoch": 1.2753897024090695,
      "eval_dpo_loss": 25.179445266723633,
      "eval_logits": -0.5507553815841675,
      "eval_logps": -82.72330474853516,
      "eval_loss": 49.341182708740234,
      "eval_objective": 49.47369384765625,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5201863646507263,
      "eval_regularize": 49.47369384765625,
      "eval_runtime": 307.2653,
      "eval_samples_per_second": 18.844,
      "eval_steps_per_second": 1.572,
      "eval_wo_beta": 14.343340873718262,
      "step": 450
    },
    {
      "dpo_loss": 13.962078094482422,
      "epoch": 1.4170996693434104,
      "grad_norm": 1627.1136853969401,
      "learning_rate": 4.501353102310901e-06,
      "logits": -0.4764183461666107,
      "logps": -78.08194732666016,
      "loss": 27.7693,
      "objective": 28.35871696472168,
      "ranking_idealized": 0.49791666865348816,
      "ranking_idealized_expo": 0.4970833361148834,
      "ranking_simple": 0.503333330154419,
      "regularize": 28.35871696472168,
      "step": 500,
      "wo_beta": 15.235273361206055
    },
    {
      "epoch": 1.4170996693434104,
      "eval_dpo_loss": 24.62739372253418,
      "eval_logits": -0.5208410024642944,
      "eval_logps": -83.14039611816406,
      "eval_loss": 48.41379928588867,
      "eval_objective": 48.561553955078125,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5181159377098083,
      "eval_regularize": 48.561553955078125,
      "eval_runtime": 313.5843,
      "eval_samples_per_second": 18.464,
      "eval_steps_per_second": 1.54,
      "eval_wo_beta": 14.325936317443848,
      "step": 500
    },
    {
      "dpo_loss": 14.243717193603516,
      "epoch": 1.5588096362777515,
      "grad_norm": 1567.3979312158642,
      "learning_rate": 4.34319334202531e-06,
      "logits": -0.4176904857158661,
      "logps": -79.26414489746094,
      "loss": 26.3455,
      "objective": 27.205766677856445,
      "ranking_idealized": 0.5112500190734863,
      "ranking_idealized_expo": 0.5104166865348816,
      "ranking_simple": 0.5066666603088379,
      "regularize": 27.205766677856445,
      "step": 550,
      "wo_beta": 15.118928909301758
    },
    {
      "epoch": 1.5588096362777515,
      "eval_dpo_loss": 24.8875732421875,
      "eval_logits": -0.5377052426338196,
      "eval_logps": -81.67108154296875,
      "eval_loss": 49.475399017333984,
      "eval_objective": 49.75130081176758,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5263975262641907,
      "eval_regularize": 49.75130081176758,
      "eval_runtime": 307.1071,
      "eval_samples_per_second": 18.853,
      "eval_steps_per_second": 1.573,
      "eval_wo_beta": 14.233548164367676,
      "step": 550
    },
    {
      "dpo_loss": 13.567865371704102,
      "epoch": 1.7005196032120926,
      "grad_norm": 1510.6295336293697,
      "learning_rate": 4.16692250129073e-06,
      "logits": -0.4348069727420807,
      "logps": -78.36796569824219,
      "loss": 25.3777,
      "objective": 25.583778381347656,
      "ranking_idealized": 0.51541668176651,
      "ranking_idealized_expo": 0.5149999856948853,
      "ranking_simple": 0.5049999952316284,
      "regularize": 25.583778381347656,
      "step": 600,
      "wo_beta": 15.017353057861328
    },
    {
      "epoch": 1.7005196032120926,
      "eval_dpo_loss": 24.62792205810547,
      "eval_logits": -0.5633407235145569,
      "eval_logps": -81.369873046875,
      "eval_loss": 48.80782699584961,
      "eval_objective": 49.26447677612305,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.523809552192688,
      "eval_regularize": 49.26447677612305,
      "eval_runtime": 307.6769,
      "eval_samples_per_second": 18.818,
      "eval_steps_per_second": 1.57,
      "eval_wo_beta": 14.197225570678711,
      "step": 600
    },
    {
      "dpo_loss": 12.823990821838379,
      "epoch": 1.8422295701464337,
      "grad_norm": 1590.0809438470442,
      "learning_rate": 3.974272604254906e-06,
      "logits": -0.45912277698516846,
      "logps": -77.55583190917969,
      "loss": 24.4429,
      "objective": 24.74443817138672,
      "ranking_idealized": 0.5291666388511658,
      "ranking_idealized_expo": 0.527916669845581,
      "ranking_simple": 0.5270833373069763,
      "regularize": 24.74443817138672,
      "step": 650,
      "wo_beta": 15.796711921691895
    },
    {
      "epoch": 1.8422295701464337,
      "eval_dpo_loss": 25.341928482055664,
      "eval_logits": -0.475749671459198,
      "eval_logps": -81.65654754638672,
      "eval_loss": 49.71050262451172,
      "eval_objective": 49.81724548339844,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5191511511802673,
      "eval_regularize": 49.81724548339844,
      "eval_runtime": 318.0633,
      "eval_samples_per_second": 18.204,
      "eval_steps_per_second": 1.519,
      "eval_wo_beta": 14.336784362792969,
      "step": 650
    },
    {
      "dpo_loss": 11.803265571594238,
      "epoch": 1.9839395370807746,
      "grad_norm": 1573.6320557673569,
      "learning_rate": 3.767136614452458e-06,
      "logits": -0.44002941250801086,
      "logps": -77.62532043457031,
      "loss": 22.5358,
      "objective": 22.4056339263916,
      "ranking_idealized": 0.5129166841506958,
      "ranking_idealized_expo": 0.5108333230018616,
      "ranking_simple": 0.5058333277702332,
      "regularize": 22.4056339263916,
      "step": 700,
      "wo_beta": 15.435830116271973
    },
    {
      "epoch": 1.9839395370807746,
      "eval_dpo_loss": 26.279430389404297,
      "eval_logits": -0.5139885544776917,
      "eval_logps": -80.61864471435547,
      "eval_loss": 51.679359436035156,
      "eval_objective": 51.56280517578125,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5248447060585022,
      "eval_regularize": 51.56280517578125,
      "eval_runtime": 307.1755,
      "eval_samples_per_second": 18.849,
      "eval_steps_per_second": 1.572,
      "eval_wo_beta": 14.074385643005371,
      "step": 700
    },
    {
      "dpo_loss": 10.530390739440918,
      "epoch": 2.1256495040151155,
      "grad_norm": 1447.9001618253178,
      "learning_rate": 3.547549834686222e-06,
      "logits": -0.4438280165195465,
      "logps": -79.3443374633789,
      "loss": 20.6864,
      "objective": 20.564796447753906,
      "ranking_idealized": 0.5129166841506958,
      "ranking_idealized_expo": 0.5112500190734863,
      "ranking_simple": 0.512499988079071,
      "regularize": 20.564796447753906,
      "step": 750,
      "wo_beta": 15.44257640838623
    },
    {
      "epoch": 2.1256495040151155,
      "eval_dpo_loss": 25.791982650756836,
      "eval_logits": -0.4510954022407532,
      "eval_logps": -83.94737243652344,
      "eval_loss": 50.90283966064453,
      "eval_objective": 51.139808654785156,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5274327397346497,
      "eval_regularize": 51.139808654785156,
      "eval_runtime": 307.3519,
      "eval_samples_per_second": 18.838,
      "eval_steps_per_second": 1.571,
      "eval_wo_beta": 14.28470230102539,
      "step": 750
    },
    {
      "dpo_loss": 10.331942558288574,
      "epoch": 2.2673594709494567,
      "grad_norm": 1416.622520151804,
      "learning_rate": 3.3176699082935546e-06,
      "logits": -0.4105643630027771,
      "logps": -81.301513671875,
      "loss": 19.5881,
      "objective": 19.708881378173828,
      "ranking_idealized": 0.512499988079071,
      "ranking_idealized_expo": 0.512499988079071,
      "ranking_simple": 0.5162500143051147,
      "regularize": 19.708881378173828,
      "step": 800,
      "wo_beta": 15.041363716125488
    },
    {
      "epoch": 2.2673594709494567,
      "eval_dpo_loss": 26.223230361938477,
      "eval_logits": -0.45186811685562134,
      "eval_logps": -84.14128112792969,
      "eval_loss": 51.44403076171875,
      "eval_objective": 51.835060119628906,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5274327397346497,
      "eval_regularize": 51.835060119628906,
      "eval_runtime": 307.4841,
      "eval_samples_per_second": 18.83,
      "eval_steps_per_second": 1.571,
      "eval_wo_beta": 14.21197509765625,
      "step": 800
    },
    {
      "dpo_loss": 9.117318153381348,
      "epoch": 2.409069437883798,
      "grad_norm": 1511.1151822215572,
      "learning_rate": 3.0797556183036582e-06,
      "logits": -0.4155246615409851,
      "logps": -80.53886413574219,
      "loss": 18.5246,
      "objective": 18.382122039794922,
      "ranking_idealized": 0.5145833492279053,
      "ranking_idealized_expo": 0.5133333206176758,
      "ranking_simple": 0.5141666531562805,
      "regularize": 18.382122039794922,
      "step": 850,
      "wo_beta": 15.248088836669922
    },
    {
      "epoch": 2.409069437883798,
      "eval_dpo_loss": 26.526891708374023,
      "eval_logits": -0.5061497688293457,
      "eval_logps": -82.96385192871094,
      "eval_loss": 52.282501220703125,
      "eval_objective": 52.2313346862793,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5284678936004639,
      "eval_regularize": 52.2313346862793,
      "eval_runtime": 307.2591,
      "eval_samples_per_second": 18.844,
      "eval_steps_per_second": 1.572,
      "eval_wo_beta": 14.120504379272461,
      "step": 850
    },
    {
      "dpo_loss": 8.65651798248291,
      "epoch": 2.550779404818139,
      "grad_norm": 1500.724487309093,
      "learning_rate": 2.8361446928038298e-06,
      "logits": -0.4497624337673187,
      "logps": -79.77722930908203,
      "loss": 17.4115,
      "objective": 17.32391929626465,
      "ranking_idealized": 0.518750011920929,
      "ranking_idealized_expo": 0.5183333158493042,
      "ranking_simple": 0.5179166793823242,
      "regularize": 17.32391929626465,
      "step": 900,
      "wo_beta": 15.50606918334961
    },
    {
      "epoch": 2.550779404818139,
      "eval_dpo_loss": 26.54765510559082,
      "eval_logits": -0.5079280138015747,
      "eval_logps": -83.98892211914062,
      "eval_loss": 52.268577575683594,
      "eval_objective": 52.27949905395508,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5289855003356934,
      "eval_regularize": 52.27949905395508,
      "eval_runtime": 307.3895,
      "eval_samples_per_second": 18.836,
      "eval_steps_per_second": 1.571,
      "eval_wo_beta": 14.197465896606445,
      "step": 900
    },
    {
      "dpo_loss": 8.308319091796875,
      "epoch": 2.69248937175248,
      "grad_norm": 1453.978726592987,
      "learning_rate": 2.5892308345974517e-06,
      "logits": -0.4583713412284851,
      "logps": -80.14180755615234,
      "loss": 16.2052,
      "objective": 16.429227828979492,
      "ranking_idealized": 0.5079166889190674,
      "ranking_idealized_expo": 0.5058333277702332,
      "ranking_simple": 0.5074999928474426,
      "regularize": 16.429227828979492,
      "step": 950,
      "wo_beta": 15.596735000610352
    },
    {
      "epoch": 2.69248937175248,
      "eval_dpo_loss": 26.657089233398438,
      "eval_logits": -0.46912574768066406,
      "eval_logps": -83.12673950195312,
      "eval_loss": 52.40416717529297,
      "eval_objective": 52.389137268066406,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.523809552192688,
      "eval_regularize": 52.389137268066406,
      "eval_runtime": 307.39,
      "eval_samples_per_second": 18.836,
      "eval_steps_per_second": 1.571,
      "eval_wo_beta": 14.298489570617676,
      "step": 950
    },
    {
      "dpo_loss": 7.868130683898926,
      "epoch": 2.838923004251299,
      "grad_norm": 1371.5890318912852,
      "learning_rate": 2.341440200858589e-06,
      "logits": -0.3988785743713379,
      "logps": -78.35469055175781,
      "loss": 15.0384,
      "objective": 15.024641990661621,
      "ranking_idealized": 0.5112500190734863,
      "ranking_idealized_expo": 0.5112500190734863,
      "ranking_simple": 0.5066666603088379,
      "regularize": 15.024641990661621,
      "step": 1000,
      "wo_beta": 15.029138565063477
    },
    {
      "epoch": 2.838923004251299,
      "eval_dpo_loss": 26.16453742980957,
      "eval_logits": -0.4550507366657257,
      "eval_logps": -82.82769012451172,
      "eval_loss": 51.76364517211914,
      "eval_objective": 51.644718170166016,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5263975262641907,
      "eval_regularize": 51.644718170166016,
      "eval_runtime": 307.9823,
      "eval_samples_per_second": 18.8,
      "eval_steps_per_second": 1.568,
      "eval_wo_beta": 14.203557968139648,
      "step": 1000
    },
    {
      "dpo_loss": 7.561364650726318,
      "epoch": 2.9806329711856403,
      "grad_norm": 1438.5247466117469,
      "learning_rate": 2.0952075638923656e-06,
      "logits": -0.39186450839042664,
      "logps": -79.17125701904297,
      "loss": 14.381,
      "objective": 14.444308280944824,
      "ranking_idealized": 0.5183333158493042,
      "ranking_idealized_expo": 0.5174999833106995,
      "ranking_simple": 0.5245833396911621,
      "regularize": 14.444308280944824,
      "step": 1050,
      "wo_beta": 15.485770225524902
    },
    {
      "epoch": 2.9806329711856403,
      "eval_dpo_loss": 26.504281997680664,
      "eval_logits": -0.4121534526348114,
      "eval_logps": -83.05400848388672,
      "eval_loss": 51.82139587402344,
      "eval_objective": 51.90236282348633,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5248447060585022,
      "eval_regularize": 51.90236282348633,
      "eval_runtime": 307.2005,
      "eval_samples_per_second": 18.848,
      "eval_steps_per_second": 1.572,
      "eval_wo_beta": 14.16685962677002,
      "step": 1050
    },
    {
      "dpo_loss": 6.576974868774414,
      "epoch": 3.122342938119981,
      "grad_norm": 1479.1539218663233,
      "learning_rate": 1.852952387243698e-06,
      "logits": -0.37988409399986267,
      "logps": -80.17594146728516,
      "loss": 12.5437,
      "objective": 12.73067855834961,
      "ranking_idealized": 0.5299999713897705,
      "ranking_idealized_expo": 0.528333306312561,
      "ranking_simple": 0.5266666412353516,
      "regularize": 12.73067855834961,
      "step": 1100,
      "wo_beta": 15.62684440612793
    },
    {
      "epoch": 3.122342938119981,
      "eval_dpo_loss": 26.185077667236328,
      "eval_logits": -0.4407959282398224,
      "eval_logps": -83.87307739257812,
      "eval_loss": 51.601688385009766,
      "eval_objective": 51.89978790283203,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5253623127937317,
      "eval_regularize": 51.89978790283203,
      "eval_runtime": 308.2578,
      "eval_samples_per_second": 18.783,
      "eval_steps_per_second": 1.567,
      "eval_wo_beta": 14.176854133605957,
      "step": 1100
    },
    {
      "dpo_loss": 5.700263023376465,
      "epoch": 3.264052905054322,
      "grad_norm": 1402.4578249025758,
      "learning_rate": 1.617055052228768e-06,
      "logits": -0.39078637957572937,
      "logps": -80.27751159667969,
      "loss": 11.3828,
      "objective": 11.245396614074707,
      "ranking_idealized": 0.5091666579246521,
      "ranking_idealized_expo": 0.5083333253860474,
      "ranking_simple": 0.5104166865348816,
      "regularize": 11.245396614074707,
      "step": 1150,
      "wo_beta": 15.349074363708496
    },
    {
      "epoch": 3.264052905054322,
      "eval_dpo_loss": 26.20229148864746,
      "eval_logits": -0.4506087601184845,
      "eval_logps": -84.2103500366211,
      "eval_loss": 51.586910247802734,
      "eval_objective": 51.72679138183594,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5258799195289612,
      "eval_regularize": 51.72679138183594,
      "eval_runtime": 307.5329,
      "eval_samples_per_second": 18.827,
      "eval_steps_per_second": 1.571,
      "eval_wo_beta": 14.176774024963379,
      "step": 1150
    },
    {
      "dpo_loss": 5.425318241119385,
      "epoch": 3.4057628719886632,
      "grad_norm": 1477.9539586967678,
      "learning_rate": 1.3898334684855647e-06,
      "logits": -0.3910551071166992,
      "logps": -81.23528289794922,
      "loss": 10.5152,
      "objective": 10.480737686157227,
      "ranking_idealized": 0.5079166889190674,
      "ranking_idealized_expo": 0.5079166889190674,
      "ranking_simple": 0.5049999952316284,
      "regularize": 10.480737686157227,
      "step": 1200,
      "wo_beta": 15.531842231750488
    },
    {
      "epoch": 3.4057628719886632,
      "eval_dpo_loss": 26.307344436645508,
      "eval_logits": -0.4568469524383545,
      "eval_logps": -84.14852905273438,
      "eval_loss": 51.58594512939453,
      "eval_objective": 51.662628173828125,
      "eval_ranking_idealized": 0.5212215185165405,
      "eval_ranking_idealized_expo": 0.5212215185165405,
      "eval_ranking_simple": 0.5253623127937317,
      "eval_regularize": 51.662628173828125,
      "eval_runtime": 307.0369,
      "eval_samples_per_second": 18.858,
      "eval_steps_per_second": 1.573,
      "eval_wo_beta": 14.14501953125,
      "step": 1200
    },
    {
      "epoch": 3.4057628719886632,
      "step": 1200,
      "total_flos": 0.0,
      "train_loss": 2.660881093343099,
      "train_runtime": 6833.7834,
      "train_samples_per_second": 37.17,
      "train_steps_per_second": 0.258
    }
  ],
  "logging_steps": 50,
  "max_steps": 1760,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}